# Building Good Training Sets --- Data Preprocessing

# 1. Dealing with missing data

In [1]:
import pandas as pd
from io import StringIO

# Tiny in-memory CSV with two deliberately empty fields:
# column C in the second data row and column D in the third.
csv_data = '''A,B,C,D
1.0,2.0,3.0,4.0
5.0,6.0,,8.0
10.0,11.0,12.0,'''

# read_csv expects a path or file-like object, so wrap the string in a
# StringIO buffer; the empty fields are parsed as NaN.
csv_buffer = StringIO(csv_data)
df = pd.read_csv(csv_buffer)
df

Unnamed: 0,A,B,C,D
0,1,2,3.0,4.0
1,5,6,,8.0
2,10,11,12.0,


In [2]:
# Boolean mask with the same shape as df: True wherever a value is missing (NaN)
df.isnull()

Unnamed: 0,A,B,C,D
0,False,False,False,False
1,False,False,True,False
2,False,False,False,True


In [3]:
# Number of missing (NaN) values in each column:
# summing the boolean mask down the rows counts the True entries.
df.isnull().sum(axis=0)

A    0
B    0
C    1
D    1
dtype: int64

## Eliminating samples or features with missing values

In [4]:
# Drop every row that contains at least one NaN (axis=0 is the default)
df.dropna(axis=0)

Unnamed: 0,A,B,C,D
0,1,2,3,4


In [5]:
# Drop every column that contains at least one NaN
df.dropna(axis='columns')

Unnamed: 0,A,B
0,1,2
1,5,6
2,10,11


In [6]:
# Drop only the rows whose value in column 'C' is NaN;
# NaNs in other columns (e.g. 'D') are left alone.
df.dropna(axis=0, subset=['C'])

Unnamed: 0,A,B,C,D
0,1,2,3,4.0
2,10,11,12,


## Imputing missing values

In [7]:
import numpy as np
from sklearn.impute import SimpleImputer

# Mean imputation: every NaN is replaced by the mean of its column.
# SimpleImputer supersedes sklearn.preprocessing.Imputer (removed in
# scikit-learn 0.22). It always works column-wise, so the old `axis=0`
# argument is gone, and the missing-value marker is np.nan rather than
# the string 'NaN'.
imr = SimpleImputer(missing_values=np.nan, strategy='mean')
imr = imr.fit(df)                 # learn the per-column means
imputed_data = imr.transform(df)  # NumPy array with the NaNs filled in
imputed_data

array([[  1. ,   2. ,   3. ,   4. ],
       [  5. ,   6. ,   7.5,   8. ],
       [ 10. ,  11. ,  12. ,   6. ]])

# 2. Handling categorical data

In [22]:
import pandas as pd

# Toy dataset with one nominal feature (color), one ordinal feature (size),
# one numeric feature (price) and a string class label.
# Note: this rebinds `df`, replacing the frame from the missing-data section.
df = pd.DataFrame(
    [
        ['green', 'M', 10.1, 'class1'],
        ['red', 'L', 13.5, 'class2'],
        ['blue', 'XL', 15.3, 'class1'],
    ],
    columns=['color', 'size', 'price', 'classlabel'],
)

df

Unnamed: 0,color,size,price,classlabel
0,green,M,10.1,class1
1,red,L,13.5,class2
2,blue,XL,15.3,class1


## Mapping ordinal features

In [23]:
# Ordinal -> numerical: encode the sizes as integers that keep their
# natural order (M < L < XL).
size_mapping = {'M': 1, 'L': 2, 'XL': 3}

# Run this cell once: Series.map yields NaN for values missing from the
# mapping, so re-running it on the already-numeric column would wipe it out.
df['size'] = df['size'].map(size_mapping)
df

Unnamed: 0,color,size,price,classlabel
0,green,1,10.1,class1
1,red,2,13.5,class2
2,blue,3,15.3,class1


In [24]:
# Numerical -> ordinal: flip size_mapping (code -> label) to recover the
# original size strings. The result is only displayed; df is not modified.
inverse_mapping = dict(zip(size_mapping.values(), size_mapping.keys()))
df['size'].map(inverse_mapping)

0     M
1     L
2    XL
Name: size, dtype: object

## Encoding class labels

In [26]:
import numpy as np

# Keep class_mapping around: the values a model predicts later have to be
# converted back to the original labels with its inverse.
unique_labels = np.unique(df['classlabel'])   # sorted distinct labels
class_mapping = {label: code for code, label in enumerate(unique_labels)}

# Replace the string labels with their integer codes
df['classlabel'] = df['classlabel'].map(class_mapping)

df

Unnamed: 0,color,size,price,classlabel
0,green,1,10.1,0
1,red,2,13.5,1
2,blue,3,15.3,0


In [28]:
# Restore the original string labels by inverting class_mapping (code -> label)
inverse_class_mapping = dict(zip(class_mapping.values(), class_mapping.keys()))

df['classlabel'] = df['classlabel'].map(inverse_class_mapping)

df

Unnamed: 0,color,size,price,classlabel
0,green,1,10.1,class1
1,red,2,13.5,class2
2,blue,3,15.3,class1


In [30]:
# Alternatively, let scikit-learn do the encoding
# (this is really handy when processing labels).

from sklearn.preprocessing import LabelEncoder

class_lab = LabelEncoder()
label_values = df['classlabel'].values
class_lab.fit(label_values)          # learn the distinct labels
y = class_lab.transform(label_values)  # integer code for each label
y

array([0, 1, 0])

In [31]:
# Map the integer codes in y back to the original class-label strings
class_lab.inverse_transform(y)

array(['class1', 'class2', 'class1'], dtype=object)

## Performing one-hot encoding on nominal features

In [42]:
# Method 1: first convert the strings to integer labels with LabelEncoder,
# then apply OneHotEncoder.
from sklearn.preprocessing import OneHotEncoder

feature_cols = ['color', 'size', 'price']
X = df[feature_cols].values

# Integer-encode the color column (column 0) in place
color_lab = LabelEncoder()
color_codes = color_lab.fit_transform(X[:, 0])
X[:, 0] = color_codes
X

array([[1, 1, 10.1],
       [2, 2, 13.5],
       [0, 3, 15.3]], dtype=object)

In [46]:
from sklearn.compose import ColumnTransformer

# OneHotEncoder's `categorical_features` argument was removed in
# scikit-learn 0.22; choosing which columns to encode is now the job of
# ColumnTransformer.
ohe = ColumnTransformer(
    [('color', OneHotEncoder(), [0])],  # one-hot encode column 0 (color) only
    remainder='passthrough',            # keep size and price, appended after the dummies
    sparse_threshold=0,                 # always return a dense array
)
# X has dtype=object, so cast the stacked result to a plain float array
ohe.fit_transform(X).astype(float)

array([[  0. ,   1. ,   0. ,   1. ,  10.1],
       [  0. ,   0. ,   1. ,   2. ,  13.5],
       [  1. ,   0. ,   0. ,   3. ,  15.3]])

In [48]:
# Method 2: simpler -- apply pandas' get_dummies directly to the DataFrame.
# It only converts string columns and leaves all other columns unchanged.
dummy_source = df[['color', 'size', 'price']]
pd.get_dummies(dummy_source)

Unnamed: 0,size,price,color_blue,color_green,color_red
0,1,10.1,0,1,0
1,2,13.5,0,0,1
2,3,15.3,1,0,0


# 3. Selecting relevant features for model construction

## Partitioning a dataset into training and test sets

In [51]:
# header=None: treat the first line of the file as data, so the columns
# get integer labels until proper names are assigned.
df_wine = pd.read_csv(
    './wine.data',
    header=None,
)
df_wine.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,1,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,1,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735


In [52]:
# Human-readable names for the 14 columns: the class label followed by
# the 13 measured features.
wine_column_names = [
    'Class label',
    'Alcohol',
    'Malic acid',
    'Ash',
    'Alcalinity of ash',
    'Magnesium',
    'Total phenols',
    'Flavanoids',
    'Nonflavanoid phenols',
    'Proanthocyanins',
    'Color intensity',
    'Hue',
    'OD280/OD315 of diluted wines',
    'Proline',
]
df_wine.columns = wine_column_names

df_wine.head()

Unnamed: 0,Class label,Alcohol,Malic acid,Ash,Alcalinity of ash,Magnesium,Total phenols,Flavanoids,Nonflavanoid phenols,Proanthocyanins,Color intensity,Hue,OD280/OD315 of diluted wines,Proline
0,1,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,1,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735


In [69]:
# train_test_split lives in sklearn.model_selection; the former
# sklearn.cross_validation module was removed in scikit-learn 0.20.
from sklearn.model_selection import train_test_split

# Column 0 holds the class label; all remaining columns are the features.
X, y = df_wine.iloc[:, 1:].values, df_wine.iloc[:, 0].values

# Hold out 30% of the samples for testing; random_state pins the shuffle
# so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0)

## Bringing features onto the same scale

In [70]:
# Method 1 (less commonly used): normalization -- min-max scaling.
from sklearn.preprocessing import MinMaxScaler

mms = MinMaxScaler()
# Fit on the training data only, then reuse the same fitted scaler on the
# test data so no information from the test set leaks into the scaling.
X_train_norm = mms.fit_transform(X_train)
X_test_norm = mms.transform(X_test)

# Legacy misspelled alias, kept in case later cells still reference it.
X_train_nrom = X_train_norm

In [71]:
# Method 2 (more commonly used): standardization
# (see the annotations in the book).
from sklearn.preprocessing import StandardScaler

stds = StandardScaler()
stds.fit(X_train)                      # fit on the training data only
X_train_std = stds.transform(X_train)
X_test_std = stds.transform(X_test)    # reuse the fitted scaler on the test data

## Selecting meaningful features