#  Dealing with missing data

In [21]:
from __future__ import division
import pandas as pd
from io import StringIO

# Small in-memory CSV used as the example data set. The last two records
# omit their final field, so pandas fills column D with NaN for those rows.
#
# The u-prefix makes this a text (unicode) literal on both Python 2 and
# Python 3. io.StringIO requires text, and the `unicode` builtin that was
# previously used for the conversion does not exist on Python 3.
csv_data = u'''A,B,C,D
1.0,2.0,3.0,4.0
5.0,6.0,7.0
10.0,11.0,12.0'''

df = pd.read_csv(StringIO(csv_data))

# Count the missing (NaN) entries in each column.
df.isnull().sum()

A    0
B    0
C    0
D    2
dtype: int64

In [22]:
# Expose the DataFrame's contents as a plain NumPy array.
df.to_numpy()

array([[  1.,   2.,   3.,   4.],
       [  5.,   6.,   7.,  nan],
       [ 10.,  11.,  12.,  nan]])

## Eliminating samples or features with missing values


In [23]:
# 1. Discard every sample (row) that contains at least one NaN.
df.dropna(axis=0, how='any')

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0


In [24]:
# 2. Discard every feature (column) that contains at least one NaN.
df.dropna(axis='columns')

Unnamed: 0,A,B,C
0,1.0,2.0,3.0
1,5.0,6.0,7.0
2,10.0,11.0,12.0


In [17]:
# Remove a row only when every one of its columns is NaN.
df.dropna(axis=0, how='all')

# Keep rows that hold at least 4 non-NaN values; rows with fewer are dropped.
df.dropna(axis=0, thresh=4)

# Consider only column 'C' when deciding which rows contain NaN.
# This is the last expression, so it is the one rendered as the cell output.
df.dropna(axis=0, subset=['C'])

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,7.0,
2,10.0,11.0,12.0,


# Imputing Missing Values

In [20]:
# `sklearn.preprocessing.Imputer` was deprecated in scikit-learn 0.20 and
# removed in 0.22; `sklearn.impute.SimpleImputer` is its replacement.
from sklearn.impute import SimpleImputer

# Replace each NaN with the mean of its column. SimpleImputer always works
# column-wise (the behaviour formerly selected with `axis=0`), and the
# missing-value marker is an actual float NaN rather than the string 'NaN'.
imr = SimpleImputer(missing_values=float('nan'), strategy='mean')
imr.fit(df)  # learn the per-column means from the data
imr.transform(df)  # array with every NaN replaced by its column mean

array([[  1.,   2.,   3.,   4.],
       [  5.,   6.,   7.,   4.],
       [ 10.,  11.,  12.,   4.]])