<img src="https://dsiag.ch/images/dsi_rgb.png" alt="dsi logo" width="100" style="position: absolute; right: 0px;"/>

# Example - Preprocessing

In [72]:
import pandas as pd
import numpy as np 

from sklearn import preprocessing, model_selection

def show(matrix):
    """Render `matrix` in the notebook output as a pandas DataFrame.

    The DataFrame wrapper exists only to get a nicer tabular display of
    the matrix; nothing is returned.
    """
    as_table = pd.DataFrame(matrix)
    display(as_table)

# Titanic Dataset 

In [49]:
# Read the passenger table; the path is relative to the notebook's directory.
titanic_csv_path = '../data/titanic.csv'
titanic = pd.read_csv(titanic_csv_path)
titanic

Unnamed: 0,Survived,Pclass,Name,Sex,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare
0,0,3,Mr. Owen Harris Braund,male,22.0,1,0,7.2500
1,1,1,Mrs. John Bradley (Florence Briggs Thayer) Cum...,female,38.0,1,0,71.2833
2,1,3,Miss. Laina Heikkinen,female,26.0,0,0,7.9250
3,1,1,Mrs. Jacques Heath (Lily May Peel) Futrelle,female,35.0,1,0,53.1000
4,0,3,Mr. William Henry Allen,male,35.0,0,0,8.0500
...,...,...,...,...,...,...,...,...
882,0,2,Rev. Juozas Montvila,male,27.0,0,0,13.0000
883,1,1,Miss. Margaret Edith Graham,female,19.0,0,0,30.0000
884,0,3,Miss. Catherine Helen Johnston,female,7.0,1,2,23.4500
885,1,1,Mr. Karl Howell Behr,male,26.0,0,0,30.0000


## Model target as vector

In [51]:
# Name of the column the model should predict.
target = 'Survived'

# Pull the target column out of the frame as a plain NumPy vector.
target_column = titanic[target]
titanic_target = target_column.values

# Peek at the first ten labels only.
titanic_target[:10]

array([0, 1, 1, 1, 0, 0, 0, 0, 1, 1])

# Preprocessing 

## One-Hot-Encoding Passenger Class

`Pclass` contains the ordinal values 1, 2 and 3, which are transformed here with a one-hot encoding (one indicator column per class).

In [40]:
# Columns that get one indicator column per distinct value.
onehot_features = ['Pclass']

one_hot_enc = preprocessing.OneHotEncoder(categories='auto')

# Fit and encode in one step, then convert the encoder output to a dense
# array with .toarray() so it can be concatenated with the other features.
onehot_encoded = one_hot_enc.fit_transform(titanic[onehot_features])
onehot_data = onehot_encoded.toarray()

show(onehot_data)

Unnamed: 0,0,1,2
0,0.0,0.0,1.0
1,1.0,0.0,0.0
2,0.0,0.0,1.0
3,1.0,0.0,0.0
4,0.0,0.0,1.0
...,...,...,...
882,0.0,1.0,0.0
883,1.0,0.0,0.0
884,0.0,0.0,1.0
885,1.0,0.0,0.0


## Ordinal Encoding Sex 

The feature `Sex` contains the two values `female` and `male`, which are converted to the numeric codes 0 (`female`) and 1 (`male`).

In [52]:
# Columns whose string categories are mapped to numeric codes.
ordinal_features = ['Sex']

ordinal_enc = preprocessing.OrdinalEncoder()

# The encoder expects a 2-D input, hence the list-based column selection.
categorical_frame = titanic[ordinal_features]
ordinal_data = ordinal_enc.fit_transform(categorical_frame)

# Show only the first ten encoded rows.
ordinal_data[:10]

array([[1.],
       [0.],
       [0.],
       [0.],
       [1.],
       [1.],
       [1.],
       [1.],
       [0.],
       [0.]])

## Construct Feature Vector 

In [43]:
# Columns that must not be copied verbatim into the feature matrix: the
# target itself, the free-text name, and every column that has already been
# replaced by an encoded representation above.
excluded_columns = [target, 'Name', *onehot_features, *ordinal_features]

# Keep the remaining columns in their original order. Filtering with a
# comprehension avoids calling list.remove() inside throwaway list
# comprehensions, which only built and discarded lists of None.
features = [column for column in titanic.columns if column not in excluded_columns]

# Column layout of the result: the untouched raw features first, then the
# one-hot columns, then the ordinal column. Later cells address the raw
# features by these leading positions.
titanic_data = np.concatenate(
    (titanic[features].values, onehot_data, ordinal_data),
    axis=1,
)
show(titanic_data)

Unnamed: 0,0,1,2,3,4,5,6,7
0,22.0,1.0,0.0,7.2500,0.0,0.0,1.0,1.0
1,38.0,1.0,0.0,71.2833,1.0,0.0,0.0,0.0
2,26.0,0.0,0.0,7.9250,0.0,0.0,1.0,0.0
3,35.0,1.0,0.0,53.1000,1.0,0.0,0.0,0.0
4,35.0,0.0,0.0,8.0500,0.0,0.0,1.0,1.0
...,...,...,...,...,...,...,...,...
882,27.0,0.0,0.0,13.0000,0.0,1.0,0.0,1.0
883,19.0,0.0,0.0,30.0000,1.0,0.0,0.0,0.0
884,7.0,1.0,2.0,23.4500,0.0,0.0,1.0,0.0
885,26.0,0.0,0.0,30.0000,1.0,0.0,0.0,1.0


## Feature Scaling

The features `Age`, `Siblings/Spouses Aboard`, `Parents/Children Aboard` and `Fare` are not yet scaled.

In order to fit the scaler we have to split the data into test and training data first. 

The parameters of the scaler are then estimated on the training data.

In [None]:
# Positions of the columns in titanic_data that are standardised.
features_to_scale = [0, 1, 2, 3]

# Hold out a third of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    titanic_data,
    titanic_target,
    test_size=0.33,
    random_state=0,
)

# Estimate the scaling parameters on the training rows only ...
scaler = preprocessing.StandardScaler()
scaler.fit(X_train[:, features_to_scale])

# ... and apply the very same parameters to both splits, in place.
for split_matrix in (X_train, X_test):
    split_matrix[:, features_to_scale] = scaler.transform(split_matrix[:, features_to_scale])

## Resulting Input Features 

In [71]:
# Display the scaled training matrix first, then the test matrix.
for feature_matrix in (X_train, X_test):
    show(feature_matrix)

Unnamed: 0,0,1,2,3,4,5,6,7
0,-0.049846,-0.487489,-0.451128,-0.501145,0.0,0.0,1.0,1.0
1,-1.860029,3.438862,0.729357,-0.091208,0.0,0.0,1.0,1.0
2,0.746634,0.494099,0.729357,-0.354308,0.0,0.0,1.0,1.0
3,-1.860029,3.438862,1.909842,-0.049691,0.0,0.0,1.0,1.0
4,-1.135956,0.494099,1.909842,1.663596,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...
589,-0.049846,-0.487489,-0.451128,-0.080104,1.0,0.0,0.0,1.0
590,-1.932436,0.494099,0.729357,-0.151552,0.0,1.0,0.0,1.0
591,0.167375,-0.487489,-0.451128,-0.064656,1.0,0.0,0.0,1.0
592,-0.122254,-0.487489,-0.451128,-0.392928,0.0,1.0,0.0,1.0


Unnamed: 0,0,1,2,3,4,5,6,7
0,-1.135956,-0.487489,-0.451128,-0.501949,0.0,0.0,1.0,0.0
1,-1.787621,7.365212,1.909842,0.689403,0.0,0.0,1.0,1.0
2,1.181077,-0.487489,-0.451128,0.111066,1.0,0.0,0.0,1.0
3,0.094968,-0.487489,-0.451128,-0.500581,0.0,0.0,1.0,1.0
4,-1.570399,2.457274,0.729357,-0.161850,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...
288,-0.556697,-0.487489,-0.451128,-0.463651,0.0,0.0,1.0,0.0
289,-0.194661,-0.487489,-0.451128,-0.501145,0.0,0.0,1.0,1.0
290,-0.846327,-0.487489,-0.451128,-0.514017,0.0,0.0,1.0,1.0
291,-0.629105,-0.487489,-0.451128,-0.517478,0.0,0.0,1.0,1.0


# Store dataset 

In [None]:
import pickle

# Bundle both splits under descriptive keys so they can be restored together.
dataset = {'X_train': X_train, 'y_train': y_train, 'X_test': X_test, 'y_test': y_test}

# Use a context manager so the file handle is flushed and closed even if
# pickle.dump raises; the bare open(...) call previously leaked the handle.
with open('../data/titanic.pickle', 'wb') as pickle_file:
    pickle.dump(dataset, pickle_file)