#  Dealing with missing data

In [70]:
from __future__ import division
import pandas as pd
from io import StringIO

# Small in-memory CSV with deliberately short rows: the second data row has no
# values for C and D, the third has none for D, so pandas reads them as NaN.
# The u-prefix makes this a text (unicode) literal on Python 2 as well as on
# Python 3, which io.StringIO requires; the previous unicode() conversion
# raised NameError on Python 3.
csv_data = u'''A,B,C,D
1.0,2.0,3.0,4.0
5.0,6.0
10.0,11.0,12.0'''

df = pd.read_csv(StringIO(csv_data))

# Number of missing (NaN) values in each column
df.isnull().sum()

A    0
B    0
C    1
D    2
dtype: int64

In [71]:
# Underlying data of the DataFrame as a NumPy array (missing cells show up as nan)
df.values

array([[  1.,   2.,   3.,   4.],
       [  5.,   6.,  nan,  nan],
       [ 10.,  11.,  12.,  nan]])

Eliminating samples or features with missing values


In [72]:
# 1. Drop every sample (row) that contains at least one NaN value
df.dropna(axis=0, how='any')

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0


In [73]:
# 2. Drop every feature (column) that contains at least one NaN value
df.dropna(axis='columns', how='any')

Unnamed: 0,A,B
0,1.0,2.0
1,5.0,6.0
2,10.0,11.0


In [74]:
# Drop only the rows in which every column is NaN
# (no row of df is entirely NaN, so this call removes nothing)
df.dropna(how='all')

# Drop rows that have fewer than 4 non-NaN values
df.dropna(thresh=4)

# Drop only the rows where NaN appears in a specific column (here: 'C');
# as the last expression of the cell, this is the result that gets displayed
df.dropna(subset=['C'])

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
2,10.0,11.0,12.0,


# Imputing Missing Values

In [75]:
# Mean imputation: replace each NaN with the mean of its column.
try:
    # scikit-learn >= 0.20. SimpleImputer always imputes column-wise and uses
    # NaN as the missing-value marker by default, so no axis / missing_values
    # arguments are needed.
    from sklearn.impute import SimpleImputer
    imr = SimpleImputer(strategy='mean')
except ImportError:
    # Older scikit-learn, which only ships the Imputer class
    # (removed in scikit-learn 0.22).
    from sklearn.preprocessing import Imputer
    imr = Imputer(missing_values='NaN', strategy='mean', axis=0)
imr.fit(df)
imr.transform(df)

array([[  1. ,   2. ,   3. ,   4. ],
       [  5. ,   6. ,   7.5,   4. ],
       [ 10. ,  11. ,  12. ,   4. ]])

# Handling Categorical Data

In [76]:
import pandas as pd
# Toy data set: a nominal feature (color), an ordinal feature (size),
# a numeric feature (price) and the class label of each sample.
df = pd.DataFrame(
    data=[
        ['green', 'M', 10.1, 'class1'],
        ['red', 'L', 13.5, 'class2'],
        ['blue', 'XL', 15.3, 'class1'],
        ['Yellow', 'L', 12.5, 'class3'],
    ],
    columns=['color', 'size', 'price', 'classlabel'],
)
df

Unnamed: 0,color,size,price,classlabel
0,green,M,10.1,class1
1,red,L,13.5,class2
2,blue,XL,15.3,class1
3,Yellow,L,12.5,class3


In [77]:
size_maping = {'M':1,'L':2,'XL':3}
df['size'] = df['size'].map(size_maping)
df

Unnamed: 0,color,size,price,classlabel
0,green,1,10.1,class1
1,red,2,13.5,class2
2,blue,3,15.3,class1
3,Yellow,2,12.5,class3


In [78]:
# Reverse lookup (integer code -> size label), used to turn the codes back into labels
inv_size_mapping = dict((code, label) for label, code in size_maping.items())
df['size'] = df['size'].map(inv_size_mapping)
df

Unnamed: 0,color,size,price,classlabel
0,green,M,10.1,class1
1,red,L,13.5,class2
2,blue,XL,15.3,class1
3,Yellow,L,12.5,class3


For a nominal categorical variable (such as the class label), the labels have no inherent order, so the integer assigned to each label is arbitrary.

In [79]:
import numpy as np
# Number the distinct class labels 0..n-1 in the order np.unique returns them
class_labels = np.unique(df['classlabel'])
class_maping = dict(zip(class_labels, range(len(class_labels))))
df['classlabel'] = df['classlabel'].map(class_maping)
df

Unnamed: 0,color,size,price,classlabel
0,green,M,10.1,0
1,red,L,13.5,1
2,blue,XL,15.3,0
3,Yellow,L,12.5,2


In [80]:
# Reverse lookup (integer code -> class label), used to restore the original labels
inv_class_mapping = dict(zip(class_maping.values(), class_maping.keys()))
df['classlabel'] = df['classlabel'].map(inv_class_mapping)
df

Unnamed: 0,color,size,price,classlabel
0,green,M,10.1,class1
1,red,L,13.5,class2
2,blue,XL,15.3,class1
3,Yellow,L,12.5,class3


In [81]:
# Encode the class labels as integers with scikit-learn's LabelEncoder
from sklearn.preprocessing import LabelEncoder
class_le = LabelEncoder().fit(df['classlabel'])
y = class_le.transform(df['classlabel'])
y

array([0, 1, 0, 2])

In [82]:
# Map the integer codes in y back to the original class label strings
class_le.inverse_transform(y)

array(['class1', 'class2', 'class1', 'class3'], dtype=object)

# One-hot Encoding on Nominal features

In [83]:
# Apply the ordinal size encoding, then take every column except the last
# one (the class label) as the feature matrix
df['size'] = df['size'].map(size_maping)
X = df.iloc[:, :-1].values

# Integer-encode the nominal color column (column 0) in place
color_le = LabelEncoder()
X[:, 0] = color_le.fit_transform(X[:, 0])
X

array([[2, 1, 10.1],
       [3, 2, 13.5],
       [1, 3, 15.3],
       [0, 2, 12.5]], dtype=object)

In [85]:
# One-hot encode the color column (column 0 of X); the size and price columns
# (1 and 2) are passed through unchanged.
from sklearn.preprocessing import OneHotEncoder

try:
    from sklearn.compose import ColumnTransformer
except ImportError:
    # scikit-learn < 0.20: no ColumnTransformer yet, so select the column via
    # the legacy categorical_features argument (removed in scikit-learn 0.22).
    ohe = OneHotEncoder(categorical_features=[0])  # sparse=False if we don't want a sparse matrix
    X_onehot = ohe.fit_transform(X).toarray()
else:
    ohe = OneHotEncoder()
    # ColumnTransformer fits its own clone of `ohe`; `ohe` itself stays unfitted.
    column_tf = ColumnTransformer([
        ('onehot', ohe, [0]),
        ('unchanged', 'passthrough', [1, 2]),
    ])
    # X has dtype=object, so cast the combined result to a float array.
    X_onehot = column_tf.fit_transform(X).astype(float)

X_onehot

array([[  0. ,   0. ,   1. ,   0. ,   1. ,  10.1],
       [  0. ,   0. ,   0. ,   1. ,   2. ,  13.5],
       [  0. ,   1. ,   0. ,   0. ,   3. ,  15.3],
       [  1. ,   0. ,   0. ,   0. ,   2. ,  12.5]])

In [88]:
# Alternative: pandas get_dummies works directly on the DataFrame; it one-hot encodes
# only the string-valued columns (here 'color') and leaves the numeric columns as they are
pd.get_dummies(df[['color','size','price']])

Unnamed: 0,size,price,color_Yellow,color_blue,color_green,color_red
0,1,10.1,0.0,0.0,1.0,0.0
1,2,13.5,0.0,0.0,0.0,1.0
2,3,15.3,0.0,1.0,0.0,0.0
3,2,12.5,1.0,0.0,0.0,0.0
