# Session 2 - Data Preprocessing 
# Demo Implementation

## Import libraries

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import sklearn 

: 

## Dataset 

<center> <img src="http://www.titanicuniverse.com/wp-content/uploads/2009/10/titanic-sinking.jpg" width=400/> </center>  

We will revisit our Titanic dataset, but this time we will work with the raw dataset and find our way to crafting a dataset that is ready to be fed to a Machine Learning model.  

Here is the description of the variables:


<table>
<tbody>
<tr><th><b>Variable</b></th><th><b>Definition</b></th><th><b>Key</b></th></tr>
<tr>
<td>survival</td>
<td>Survival</td>
<td>0 = No, 1 = Yes</td>
</tr>
<tr>
<td>pclass</td>
<td>Ticket class</td>
<td>1 = 1st, 2 = 2nd, 3 = 3rd</td>
</tr>
<tr>
<td>sex</td>
<td>Sex</td>
<td></td>
</tr>
<tr>
<td>Age</td>
<td>Age in years</td>
<td></td>
</tr>
<tr>
<td>sibsp</td>
<td># of siblings / spouses aboard the Titanic</td>
<td></td>
</tr>
<tr>
<td>parch</td>
<td># of parents / children aboard the Titanic</td>
<td></td>
</tr>
<tr>
<td>ticket</td>
<td>Ticket number</td>
<td></td>
</tr>
<tr>
<td>fare</td>
<td>Passenger fare</td>
<td></td>
</tr>
<tr>
<td>cabin</td>
<td>Cabin number</td>
<td></td>
</tr>
<tr>
<td>embarked</td>
<td>Port of Embarkation</td>
<td>C = Cherbourg, Q = Queenstown, S = Southampton</td>
</tr>
</tbody>
</table>


In [None]:
# Load the raw Titanic table from a CSV file located in the working directory.
dataset_raw = pd.read_csv('Titanic.csv')

In [None]:
# Peek at the first two rows to see which columns the raw file provides.
dataset_raw.head(2)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title,Surname,Fsize,Family,FsizeD,Deck,Child,Mother
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,Mr,Braund,2,Braund_2,small,,Adult,Not Mother
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Mrs,Cumings,2,Cumings_2,small,C,Adult,Not Mother


## Remove irrelevant data

Removing irrelevant columns and the ones we will not use as predictors:

In [None]:
# Columns that will not be used as predictors.
unused_columns = [
    'PassengerId', 'Name', 'SibSp', 'Parch', 'Ticket', 'Cabin', 'Title',
    'Surname', 'Fsize', 'Family', 'FsizeD', 'Child', 'Mother',
]

# Keep the raw frame untouched and hold the shortened dataset in its own variable.
dataset = dataset_raw.drop(columns=unused_columns)

# Fixed random_state so the same five rows are shown on every run.
dataset.sample(n=5, random_state=20)

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Embarked,Deck
347,1,3,female,24.0,16.1,S,
674,0,2,male,26.0,0.0,S,
791,0,2,male,16.0,26.0,S,
836,0,3,male,21.0,8.6625,S,
56,1,2,female,21.0,10.5,S,


Removing duplicate rows:

In [None]:
# Count the rows that exactly repeat an earlier row across all remaining columns.
dataset.duplicated().sum()

72

In [None]:
# Drop the repeated rows. The original row labels are kept,
# so the index is no longer contiguous after this step.
dataset = dataset.drop_duplicates()
# Sanity check: no duplicated rows should remain.
dataset.duplicated().sum()

0

## Fill missing data

In [None]:
# Per-column non-null counts and dtypes: the quickest way to spot missing values.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 819 entries, 0 to 890
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  819 non-null    int64  
 1   Pclass    819 non-null    int64  
 2   Sex       819 non-null    object 
 3   Age       819 non-null    float64
 4   Fare      819 non-null    float64
 5   Embarked  819 non-null    object 
 6   Deck      203 non-null    object 
dtypes: float64(2), int64(2), object(3)
memory usage: 51.2+ KB


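We are missing a lot of Deck values. However, the deck is very informative about the survival probability, so rather than dropping the column we will mark the missing values with a category of their own.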
<center> <img src="https://i.pinimg.com/originals/e0/84/8d/e0848dbb614b5c7a4c832828fed7b768.jpg" width=600/> </center>  

In [None]:
# Frequency of each known deck letter (missing values are not counted by default).
dataset['Deck'].value_counts()

C    59
B    46
D    33
E    32
A    15
F    13
G     4
T     1
Name: Deck, dtype: int64

In [None]:
# Give passengers without a recorded deck their own category, 'M',
# instead of leaving the value missing. Only the 'Deck' column is filled.
dataset = dataset.fillna({'Deck': 'M'})

In [None]:
# Re-check the deck frequencies: the formerly missing rows now appear under 'M'.
dataset['Deck'].value_counts()

M    616
C     59
B     46
D     33
E     32
A     15
F     13
G      4
T      1
Name: Deck, dtype: int64

In [None]:
# Inspect the passengers whose recorded fare is zero or negative.
no_fare_mask = dataset['Fare'].le(0)
dataset.loc[no_fare_mask]

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Embarked,Deck
179,0,3,male,36.0,0.0,S,M
263,0,1,male,40.0,0.0,S,B
271,1,3,male,25.0,0.0,S,M
277,0,2,male,54.0,0.0,S,M
302,0,3,male,19.0,0.0,S,M
413,0,2,male,47.0,0.0,S,M
466,0,2,male,30.0,0.0,S,M
481,0,2,male,35.0,0.0,S,M
597,0,3,male,49.0,0.0,S,M
633,0,1,male,62.0,0.0,S,M


## Separate features and target variable

Target variable

In [None]:
# The target is the survival flag; show its first four entries.
y = dataset.loc[:, 'Survived']
y.head(4)

0    0
1    1
2    1
3    1
Name: Survived, dtype: int64

Features:

In [None]:
# Everything except the target column is a feature.
X = dataset.drop(columns='Survived')
X.head(4)

Unnamed: 0,Pclass,Sex,Age,Fare,Embarked,Deck
0,3,male,22.0,7.25,S,M
1,1,female,38.0,71.2833,C,C
2,3,female,26.0,7.925,S,M
3,1,female,35.0,53.1,S,C


## Variable encoding

In [None]:
# Notice the keyword argument include='all': it adds the non-numeric columns
# (count / unique / top / freq) to the summary next to the numeric statistics.
dataset.describe(include='all')

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Embarked,Deck
count,819.0,819.0,819,819.0,819.0,819,819
unique,,,2,,,3,9
top,,,male,,,S,M
freq,,,514,,,583,616
mean,0.405372,2.273504,,30.027985,33.86108,,
std,0.491264,0.849806,,14.71578,51.338093,,
min,0.0,1.0,,0.42,0.0,,
25%,0.0,1.0,,21.0,7.925,,
50%,0.0,3.0,,28.0,15.5,,
75%,1.0,3.0,,39.0,32.75,,


Different data types entail different treatments:

- Numeric: scaled and standardized
- Binary: encoded as 0 or 1
- Categorical: one-hot encoded

### Binary variables  
[Label encoding](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode the binary 'Sex' column as 0/1.
# NOTE: scikit-learn documents LabelEncoder as a target (y) encoder; it is used here
# only because a single two-valued column makes the mapping easy to inspect.
label_encoder = LabelEncoder()

# Fit on the untouched column in `dataset`, not on X['Sex'], which this cell overwrites.
# Re-running the cell therefore still learns the original string labels instead of 0/1.
assert X.index.equals(dataset.index), "X and dataset must describe the same rows"
X['Sex'] = label_encoder.fit_transform(dataset['Sex'])
X.head(4)

Unnamed: 0,Pclass,Sex,Age,Fare,Embarked,Deck
0,3,1,22.0,7.25,S,M
1,1,0,38.0,71.2833,C,C
2,3,0,26.0,7.925,S,M
3,1,0,35.0,53.1,S,C


In [None]:
# The position of each label in classes_ is the integer code it was mapped to.
label_encoder.classes_

array(['female', 'male'], dtype=object)

### Categorical variables  
[One hot encoding](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html)

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# One-hot encode only 'Embarked'. remainder='passthrough' keeps every other column
# as it is; the encoded columns come first in the resulting array.
embarked_step = ('encoder', OneHotEncoder(), ['Embarked'])
oh_encoder = ColumnTransformer(
    transformers=[embarked_step],
    remainder='passthrough',
)

# The result is an array rather than a DataFrame, so print a slice of it.
X_onehot = oh_encoder.fit_transform(X)
print(X_onehot[:6])


[[0.0 0.0 1.0 3 1 22.0 7.25 'M']
 [1.0 0.0 0.0 1 0 38.0 71.2833 'C']
 [0.0 0.0 1.0 3 0 26.0 7.925 'M']
 [0.0 0.0 1.0 1 0 35.0 53.1 'C']
 [0.0 0.0 1.0 3 1 35.0 8.05 'M']
 [0.0 1.0 0.0 3 1 21.0 8.4583 'M']]


In [None]:
# Categories learned by the fitted OneHotEncoder, in the order of its output columns.
oh_encoder.named_transformers_['encoder'].categories_

[array(['C', 'Q', 'S'], dtype=object)]

[Dummy variables](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.get_dummies.html)

In [None]:
# Replace both categorical columns with 0/1 indicator columns in one pass.
# dtype=int is pinned because recent pandas versions default to boolean dummies;
# this keeps the 0/1 values on every pandas version.
# The 'Deck_' prefix plus the default '_' separator yields names such as 'Deck__A'.
X = pd.get_dummies(
    X,
    columns=['Embarked', 'Deck'],
    prefix={'Embarked': 'Embarked_from', 'Deck': 'Deck_'},
    dtype=int,
)

In [None]:
# Check the new indicator columns that replaced 'Embarked' and 'Deck'.
X.head()

Unnamed: 0,Pclass,Sex,Age,Fare,Embarked_from_C,Embarked_from_Q,Embarked_from_S,Deck__A,Deck__B,Deck__C,Deck__D,Deck__E,Deck__F,Deck__G,Deck__M,Deck__T
0,3,1,22.0,7.25,0,0,1,0,0,0,0,0,0,0,1,0
1,1,0,38.0,71.2833,1,0,0,0,0,1,0,0,0,0,0,0
2,3,0,26.0,7.925,0,0,1,0,0,0,0,0,0,0,1,0
3,1,0,35.0,53.1,0,0,1,0,0,1,0,0,0,0,0,0
4,3,1,35.0,8.05,0,0,1,0,0,0,0,0,0,0,1,0


## Data set split

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing; the fixed random_state makes the split repeatable.
split_parts = train_test_split(X, y, test_size=0.25, random_state=1)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Report the shape of each piece of the split: features first, then targets.
for split_part in (X_train, X_test, y_train, y_test):
    print(split_part.shape)

(614, 16)
(205, 16)
(614,)
(205,)


In [None]:
# The training rows keep their original index labels, now in shuffled order.
X_train.head()

Unnamed: 0,Pclass,Sex,Age,Fare,Embarked_from_C,Embarked_from_Q,Embarked_from_S,Deck__A,Deck__B,Deck__C,Deck__D,Deck__E,Deck__F,Deck__G,Deck__M,Deck__T
117,2,1,29.0,21.0,0,0,1,0,0,0,0,0,0,0,1,0
775,3,1,18.0,7.75,0,0,1,0,0,0,0,0,0,0,1,0
516,2,0,34.0,10.5,0,0,1,0,0,0,0,0,1,0,0,0
618,2,0,4.0,39.0,0,0,1,0,0,0,0,0,1,0,0,0
849,1,0,35.0,89.1042,1,0,0,0,0,1,0,0,0,0,0,0


## Scaling

In [None]:
# The two continuous features before scaling.
X_train.loc[:, ['Age', 'Fare']].head(5)

Unnamed: 0,Age,Fare
117,29.0,21.0
775,18.0,7.75
516,34.0,10.5
618,4.0,39.0
849,35.0,89.1042


In [None]:
from sklearn.preprocessing import StandardScaler

# The split frames are derived from X, and pandas may warn when they are written to.
# Take explicit copies instead of silencing that warning for the whole session.
X_train = X_train.copy()
X_test = X_test.copy()

# Only the continuous features are standardized; the 0/1 indicator columns are left alone.
numeric_columns = ['Age', 'Fare']

scaler = StandardScaler()
# Fit on the training rows only, then reuse the same statistics for the test rows
# so that no information from the test set leaks into the preprocessing.
X_train[numeric_columns] = scaler.fit_transform(X_train[numeric_columns])
X_test[numeric_columns] = scaler.transform(X_test[numeric_columns])

In [None]:
# The same two features after scaling: values are now standardized.
X_train[['Age', 'Fare']][:5]

Unnamed: 0,Age,Fare
117,-0.089183,-0.240499
775,-0.823366,-0.527561
516,0.244537,-0.467982
618,-1.757781,0.149472
849,0.311281,1.234982


In [None]:
# Final training features: scaled numeric columns next to the unchanged encoded columns.
X_train.head()

Unnamed: 0,Pclass,Sex,Age,Fare,Embarked_from_C,Embarked_from_Q,Embarked_from_S,Deck__A,Deck__B,Deck__C,Deck__D,Deck__E,Deck__F,Deck__G,Deck__M,Deck__T
117,2,1,-0.089183,-0.240499,0,0,1,0,0,0,0,0,0,0,1,0
775,3,1,-0.823366,-0.527561,0,0,1,0,0,0,0,0,0,0,1,0
516,2,0,0.244537,-0.467982,0,0,1,0,0,0,0,0,1,0,0,0
618,2,0,-1.757781,0.149472,0,0,1,0,0,0,0,0,1,0,0,0
849,1,0,0.311281,1.234982,1,0,0,0,0,1,0,0,0,0,0,0
