In [1]:
# cleaning can increase the amount of missing data
# even if missingness is random, it can cause difficulties for analysis
# python implementations of basic statistical methods like ANOVA, t-tests, and correlations will fail

# one way to solve this problem is to drop any rows that contain missing values in our variable of interest
# pandas package has the .dropna() dataframe method

In [2]:
import pandas as pd

# Sample data to play with and clean. Every column has at least one gap, and
# the last row is missing everything.
data = {
    'age': [27, 50, 34, None, None, None],
    'gender': ['f', 'f', 'f', 'm', 'm', None],
    'height': [64, None, 71, 66, 68, None],
    'weight': [140, None, 130, 110, 160, None],
}
df = pd.DataFrame(data)

# Full dataset.
print(df)

# Drop all rows that have any missing values in any column.
print(df.dropna())

# Drop only rows where all values are missing.
print(df.dropna(how='all'))

# Drop only rows where more than two values are missing.
# thresh=2 keeps a row only if it has at least two non-missing values; with
# four columns, that removes rows missing three or four values.
print(df.dropna(thresh=2))

# Drop all rows that have any missing values in the 'gender' or 'height' columns.
print(df.dropna(subset=['gender', 'height']))

# Your turn. Write code below to drop rows where both height and weight
# are missing and print the result.
#
# subset= restricts the check to the listed columns, and how='all' drops a row
# only when every one of those columns is missing. Note that the expression
# 'gender' and 'height' evaluates to just 'height', so `and` cannot be used to
# name two columns; list them separately instead.
height_or_weight_known = df.dropna(subset=['height', 'weight'], how='all')
print(height_or_weight_known)


    age gender  height  weight
0  27.0      f    64.0   140.0
1  50.0      f     NaN     NaN
2  34.0      f    71.0   130.0
3   NaN      m    66.0   110.0
4   NaN      m    68.0   160.0
5   NaN   None     NaN     NaN
    age gender  height  weight
0  27.0      f    64.0   140.0
2  34.0      f    71.0   130.0
    age gender  height  weight
0  27.0      f    64.0   140.0
1  50.0      f     NaN     NaN
2  34.0      f    71.0   130.0
3   NaN      m    66.0   110.0
4   NaN      m    68.0   160.0
    age gender  height  weight
0  27.0      f    64.0   140.0
1  50.0      f     NaN     NaN
2  34.0      f    71.0   130.0
3   NaN      m    66.0   110.0
4   NaN      m    68.0   160.0
    age gender  height  weight
0  27.0      f    64.0   140.0
2  34.0      f    71.0   130.0
3   NaN      m    66.0   110.0
4   NaN      m    68.0   160.0
    age gender  height  weight
0  27.0      f    64.0   140.0
2  34.0      f    71.0   130.0
3   NaN      m    66.0   110.0
4   NaN      m    68.0   160.0


In [3]:
# WHEN DOES MISSINGNESS MATTER?
# sometimes dropping all rows with missing data is fine, but sometimes it creates problems
# missing data matters if we believe the missingness will cause
# 1. loss of statistical power b/c so many rows have to be thrown out, making it harder to detect effects
# 2. bias because certain values are more likely to be missing than others

# To know when to worry about missing data and when to throw out incomplete cases and proceed as planned,
# see where the missingness falls in the following categories:

# Missing Completely at Random (MCAR):
# 1. Example: a catastrophic flood washes away some of the servers and 20% of the data is lost
# 2. Unless so much data is lost that sample sizes are now too small, it is fair to throw out the missing values and proceed

# Missing at Random (MAR):
# 1. Example: women are more likely to skip a question about weight regardless of their actual weight
# 2. b/c we can explain why the data is missing using data we have, we can proceed as long as we include the variable that explains the missingness in our analyses
# 3. there is no way to know that data is MAR, but sometimes we can assume it is:
#     if we find a variable in our dataset that seems to differentiate really well b/w missing and non-missing,
#     it is reasonable to treat the data as MAR

# Missing Not at Random (MNAR)
# 1. Example: LGBT individuals are less likely to answer a survey question about sexual orientation
# 2. systematic missingness: people who would answer in a certain way (LGBT vs. Heterosexual) are less likely to answer at all
# 3. Stop, do not pass Go, do not collect $200.
#    if we throw out MNAR data, we end up with a biased sample and biased conclusions
# 4. note that since by definition we don't know what people would have said for questions they don't answer,
#    MNAR is an assumption based on looking at the data and noticing what isn't there:
#    abnormally low counts of LGBT people, almost no men who say they are depressed, variables with missingness where no one picks the highest values







In [4]:
# what do you do if you have MNAR data you can't drop, or if it is MCAR or MAR but dropping missing values leaves the sample too small?
# IMPUTING DATA
# in cases where we want to keep all the info from all rows, even incomplete ones, we can guess what the missing data would have been and fill in that cell with our guess
# this is called IMPUTATION

# the most straightforward approach involves replacing missing values with the mode, mean, or median of the variable
# this isn't perfect; it keeps central tendency the same but reduces variance and correlations among variables

In [5]:
import pandas as pd

# Sample data to play with.
data = {
    'age': [27, 50, 34, None, None, None],
    'gender': ['f', 'f', 'f', 'm', 'm', None],
    'height': [64, None, 71, 66, 68, None],
    'weight': [140, None, 130, 110, 160, None],
}
df = pd.DataFrame(data)

# For each numeric column, replace the missing values with the mean for that column.
# numeric_only=True restricts the means to the numeric columns; without it,
# pandas 2.0 and later try to average the string 'gender' column and raise a
# TypeError. 'gender' has no mean, so its missing value is left as-is.
# The result gets its own name instead of using inplace=True, so df itself
# still holds the raw data after this step.
df_mean_filled = df.fillna(df.mean(numeric_only=True))
print(df_mean_filled)

# For each column, replace the missing values with the most common value for that
# column. Useful for filling in missing categorical values.
# As written, this command will fill in missing values for both numerical and
# categorical columns.
# value_counts() lists values from most to least frequent, so index[0] is the
# most common one; if several values are tied, whichever one value_counts()
# lists first is used.
df = pd.DataFrame(data)
df = df.apply(lambda col: col.fillna(col.value_counts().index[0]))
print(df)

# Your turn. Try replacing each value with the median, mode, or other statistic
# of your choice.



    age gender  height  weight
0  27.0      f   64.00   140.0
1  50.0      f   67.25   135.0
2  34.0      f   71.00   130.0
3  37.0      m   66.00   110.0
4  37.0      m   68.00   160.0
5  37.0   None   67.25   135.0
    age gender  height  weight
0  27.0      f    64.0   140.0
1  50.0      f    68.0   160.0
2  34.0      f    71.0   130.0
3  34.0      m    66.0   110.0
4  34.0      m    68.0   160.0
5  34.0      f    68.0   160.0
