# Building Good Training Sets - Data Preprocessing

### Dealing with missing data

In [176]:
import pandas as pd
from io import StringIO
import sys

csv_data = \
'''A,B,C,D
1.0,2.0,3.0,4.0
5.0,6.0,,8.0
10.0,11.0,12.0,'''

df = pd.read_csv(StringIO(csv_data))
df
df.isnull().sum()
df.values
df[df["C"].isna()]

Unnamed: 0,A,B,C,D
1,5.0,6.0,,8.0


In [177]:
df.isnull().sum()

A    0
B    0
C    1
D    1
dtype: int64

In [178]:
df.values

array([[ 1.,  2.,  3.,  4.],
       [ 5.,  6., nan,  8.],
       [10., 11., 12., nan]])

### Eliminating samples or features with missing data

In [179]:
# remove rows that contain missing values
df.dropna(axis=0)   #행 방향으로 제거 (행 중에 결측치가 있으면 drop)

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0


In [180]:
# remove colomns that contain missing values
df.dropna(axis=1)    #열 방향으로 제거 (열 중에 결측치 있으면 그 열을 drop)

Unnamed: 0,A,B
0,1.0,2.0
1,5.0,6.0
2,10.0,11.0


In [181]:
# drop rows where all colums are NaN
df.dropna(how='all')
df.dropna(how='any')  # 행이나 열에 하나라도 nan이 있을때 그 행을 없앰

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0


In [182]:
# drop rows where NaN appear in specific columns (for example : "C")
df.dropna(subset=["C"])   # 특정 열에 nan값이 있으면 그 행을 삭제 

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
2,10.0,11.0,12.0,


### Imputing missing values

In [183]:
df.values

array([[ 1.,  2.,  3.,  4.],
       [ 5.,  6., nan,  8.],
       [10., 11., 12., nan]])

In [184]:
# Impute missing values via the column mean
from sklearn.preprocessing import Imputer

imr = Imputer(missing_values='NaN',strategy='mean',axis=0)    # Imputer : 결측치를 strategy에서 설정한 것으로 채운다 3과 12를 더한 값의 평균을 nan에 채운다
imr = imr.fit(df.values)



In [185]:
imputed_data = imr.transform(df.values)
imputed_data

array([[ 1. ,  2. ,  3. ,  4. ],
       [ 5. ,  6. ,  7.5,  8. ],
       [10. , 11. , 12. ,  6. ]])

## Handling Categorical data

### Nominal and ordinal features

In [186]:
import pandas as pd    #범주형

df = pd.DataFrame([['green', 'M', 10.1, 'class2'],
                   ['red', 'L', 13.5, 'class1'],
                   ['blue', 'XL', 15.3, 'class2']])

df.columns = ['color', 'size', 'price', 'classlabel']
df

Unnamed: 0,color,size,price,classlabel
0,green,M,10.1,class2
1,red,L,13.5,class1
2,blue,XL,15.3,class2


### Mapping ordinal features

In [187]:
size_mapping = {"XL":3, "L":2, "M":1}
df['size'] = df['size'].map(size_mapping)
df

Unnamed: 0,color,size,price,classlabel
0,green,1,10.1,class2
1,red,2,13.5,class1
2,blue,3,15.3,class2


In [188]:
# List comprehension
li = [ i for i in range(10)]
li

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

In [189]:
size_mapping.items()

dict_items([('XL', 3), ('L', 2), ('M', 1)])

In [190]:
# Dict comprehension
di={v:k for k, v in size_mapping.items()}    # key와 value위치를 바꾸기
di
df

Unnamed: 0,color,size,price,classlabel
0,green,1,10.1,class2
1,red,2,13.5,class1
2,blue,3,15.3,class2


In [191]:
inv_size_mapping = {v: k for k, v in size_mapping.items()}
df['size'] = df['size'].map(inv_size_mapping)            # 원래대로 돌린다 ***
df

Unnamed: 0,color,size,price,classlabel
0,green,M,10.1,class2
1,red,L,13.5,class1
2,blue,XL,15.3,class2


### Encoding class labels

In [140]:
import numpy as np

# create a mapping dict
# to convert class labels from strings to integers
class_mapping = {label:idx for idx, label in enumerate(np.unique(df['classlabel']))}
class_mapping

{'class1': 0, 'class2': 1}

In [90]:
# to convert class labels from strings to integers
df['classlabel'] = df['classlabel'].map(class_mapping)
df

Unnamed: 0,color,size,price,classlabel
0,green,M,10.1,
1,red,L,13.5,
2,blue,XL,15.3,


In [141]:
# reverse the class label mapping
inv_class_mapping = {v:k for k, v in class_mapping.items()}
df['classlabel'] = df['classlabel'].map(inv_class_mapping)
df

Unnamed: 0,color,size,price,classlabel
0,1,M,10.1,
1,2,L,13.5,
2,0,XL,15.3,


In [192]:
from sklearn.preprocessing import LabelEncoder    # 기억하기 *********

# Label encoding with sklearn's LabelEncoder
class_le = LabelEncoder()
y = class_le.fit_transform(df['classlabel'].values)    # classlable을 숫자로 바꾸기
y

array([1, 0, 1])

In [193]:
df

Unnamed: 0,color,size,price,classlabel
0,green,M,10.1,class2
1,red,L,13.5,class1
2,blue,XL,15.3,class2


In [194]:
df['classlabel']

0    class2
1    class1
2    class2
Name: classlabel, dtype: object

In [195]:
# reverse mapping                # 되돌리기 *****
class_le.inverse_transform(y) 

array(['class2', 'class1', 'class2'], dtype=object)

### Performing one-hot encoding on nominal features

In [196]:
df

Unnamed: 0,color,size,price,classlabel
0,green,M,10.1,class2
1,red,L,13.5,class1
2,blue,XL,15.3,class2


In [197]:
from sklearn.preprocessing import LabelEncoder

In [198]:
le_col=LabelEncoder()

In [199]:
df['size'] = df['size'].map(size_mapping)
X = df[['color','size','price']].values

color_le = LabelEncoder()
X[:, 0] = color_le.fit_transform(X[:, 0])
X

array([[1, 1, 10.1],
       [2, 2, 13.5],
       [0, 3, 15.3]], dtype=object)

In [200]:
from sklearn.preprocessing import OneHotEncoder

ohe = OneHotEncoder(categorical_features=[0])
ohe.fit_transform(X).toarray()

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


array([[ 0. ,  1. ,  0. ,  1. , 10.1],
       [ 0. ,  0. ,  1. ,  2. , 13.5],
       [ 1. ,  0. ,  0. ,  3. , 15.3]])

In [201]:
# return dense array so that we can skip
# the toarray step

ohe = OneHotEncoder(categorical_features=[0], sparse=False)
ohe.fit_transform(X)

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


array([[ 0. ,  1. ,  0. ,  1. , 10.1],
       [ 0. ,  0. ,  1. ,  2. , 13.5],
       [ 1. ,  0. ,  0. ,  3. , 15.3]])

In [202]:
# one-hot encoding via pandas
pd.get_dummies(df[['price', 'color', 'size']])

Unnamed: 0,price,size,color_blue,color_green,color_red
0,10.1,1,0,1,0
1,13.5,2,0,0,1
2,15.3,3,1,0,0


In [203]:
# multicollinearity guard in get_dummies
pd.get_dummies(df[['price', 'color', 'size']], drop_first=True)

Unnamed: 0,price,size,color_green,color_red
0,10.1,1,1,0
1,13.5,2,0,1
2,15.3,3,0,0


In [206]:
# multicollinearity guard for the OneHotEncoder
ohe = OneHotEncoder(categorical_features=[0])
ohe.fit_transform(X).toarray()[:, 1:]

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


array([[ 1. ,  0. ,  1. , 10.1],
       [ 0. ,  1. ,  2. , 13.5],
       [ 0. ,  0. ,  3. , 15.3]])