# Data Preprocessing

## Dealing with missing data

In [11]:
import pandas as pd
from io import StringIO

# Small in-memory CSV sample with two missing cells: column C is empty in
# the second data row and the last data row stops before column D.
csv_data = "\n".join([
    "A,B,C,D",
    "1.0,2.0,3.0,4.0",
    "5.0,6.0,,8.0",
    "10.0,11.0,12.0",
])
# StringIO lets read_csv parse the string as if it were a file on disk.
df = pd.read_csv(StringIO(csv_data))
df


Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,10.0,11.0,12.0,


In [10]:
# Count the missing (NaN) entries in each column.
df.isna().sum()

A    0
B    0
C    1
D    1
dtype: int64

In [12]:
# Drop every row that contains at least one NaN value.
df.dropna(axis='index')

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0


In [13]:
# Drop every column that contains at least one NaN value.
df.dropna(axis='columns')

Unnamed: 0,A,B
0,1.0,2.0
1,5.0,6.0
2,10.0,11.0


In [14]:
# Drop a row only if all of its columns are NaN.
df.dropna(axis=0, how='all')

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,10.0,11.0,12.0,


In [16]:
# Keep only the rows with at least n non-NaN values,
# i.e. drop rows that have fewer than 4 real values.
n = 4
df.dropna(axis=0, thresh=n)

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0


In [18]:
# Drop rows only where NaN appears in specific columns (here: 'C').
df.dropna(axis=0, subset=['C'])

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
2,10.0,11.0,12.0,


In [19]:
from sklearn.impute import SimpleImputer
import numpy as np

# Replace each NaN with the mean of its column.
# Mean imputation only makes sense for numeric features; for categorical
# features (e.g. encoded names) use strategy='most_frequent' instead.
imr = SimpleImputer(missing_values=np.nan, strategy='mean')
# fit_transform learns the per-column means and applies them in one call;
# `imr` stays fitted afterwards, so it can still be reused via transform().
imputed_data = imr.fit_transform(df.values)
imputed_data

array([[ 1. ,  2. ,  3. ,  4. ],
       [ 5. ,  6. ,  7.5,  8. ],
       [10. , 11. , 12. ,  6. ]])

In [20]:
# Cleaner approach: do the same mean imputation directly in pandas by
# filling each NaN with the mean of its own column.
df.fillna(value=df.mean())

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,7.5,8.0
2,10.0,11.0,12.0,6.0


In [26]:
# Small example frame: string-valued color, size and classLabel columns
# plus a numeric price column. Column names are set at construction time.
df = pd.DataFrame(
    data=[
        ['green', 'M', 10.1, 'class1'],
        ['red', 'L', 12.3, 'class2'],
        ['blue', 'XL', 14.4, 'class1'],
    ],
    columns=['color', 'size', 'price', 'classLabel'],
)
df

Unnamed: 0,color,size,price,classLabel
0,green,M,10.1,class1
1,red,L,12.3,class2
2,blue,XL,14.4,class1


In [27]:
# An ordered (ordinal) encoding has to be written by hand: no function can
# automatically derive the correct order of the labels, so the mapping is
# spelled out explicitly.
#
# Encoding: M = 1, L = M + 1 = 2, XL = L + 1 = 3
size_map = {
    'XL': 3,
    'L': 2,
    'M': 1,
}
# Overwrites the 'size' column in place with its integer codes.
df['size'] = df['size'].map(size_map)
df

Unnamed: 0,color,size,price,classLabel
0,green,1,10.1,class1
1,red,2,12.3,class2
2,blue,3,14.4,class1


In [30]:
# To undo the encoding, swap the keys and values of size_map to get an
# integer -> label lookup, then map the column back to its labels.
inv_size_map = dict((code, label) for label, code in size_map.items())
df['size'] = df['size'].map(inv_size_map)
df

Unnamed: 0,color,size,price,classLabel
0,green,M,10.1,class1
1,red,L,12.3,class2
2,blue,XL,14.4,class1
