## 1. Library Import

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing

## 2. Load Data

In [2]:
# Load the train/test splits written by the iteration-1 baseline notebook.
# The first CSV column holds the saved row labels, so it is used as the index.
df_train = pd.read_csv('../Iteration_1_Baseline/train_iter_1.csv', index_col=0)
df_test = pd.read_csv('../Iteration_1_Baseline/test_iter_1.csv', index_col=0)

In [3]:
# Preview the first 10 rows of the training frame (includes the Survived target).
df_train.head(10)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Cabin_Letter
0,0.0,3,0,22.0,1,0,7.25,0.0,7
1,1.0,1,1,38.0,1,0,71.2833,1.0,2
2,1.0,3,1,26.0,0,0,7.925,0.0,7
3,1.0,1,1,35.0,1,0,53.1,0.0,2
4,0.0,3,0,35.0,0,0,8.05,0.0,7
5,0.0,3,0,25.0,0,0,8.4583,2.0,7
6,0.0,1,0,54.0,0,0,51.8625,0.0,4
7,0.0,3,0,2.0,3,1,21.075,0.0,7
8,1.0,3,1,27.0,0,2,11.1333,0.0,7
9,1.0,2,1,14.0,1,0,30.0708,1.0,7


In [4]:
# Preview the first 10 rows of the test frame (no Survived column here).
df_test.head(10)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Cabin_Letter
0,3,0,34.5,0,0,7.8292,2.0,7
1,3,1,47.0,1,0,7.0,0.0,7
2,2,0,62.0,0,0,9.6875,2.0,7
3,3,0,27.0,0,0,8.6625,0.0,7
4,3,1,22.0,1,1,12.2875,0.0,7
5,3,0,14.0,0,0,9.225,0.0,7
6,3,1,30.0,0,0,7.6292,2.0,7
7,2,0,26.0,1,1,29.0,0.0,7
8,3,1,18.0,0,0,7.2292,1.0,7
9,3,0,21.0,2,0,24.15,0.0,7


In [5]:
# Keep each frame's original row labels before the frames are stacked.
# NOTE(review): both indexes appear to start at 0 (see the head() previews), so
# after the renumbering below these labels no longer identify rows of df_all
# uniquely -- only the lengths / original labels remain meaningful.
idx_train = df_train.index
idx_test = df_test.index

# Stack train on top of test and renumber the rows 0..N-1 in one call.
# Columns present in only one frame (Survived) are filled with NaN for the other.
df_all = pd.concat([df_train, df_test], axis=0, sort=False, ignore_index=True)
df_all

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Cabin_Letter
0,0.0,3,0,22.0,1,0,7.2500,0.0,7
1,1.0,1,1,38.0,1,0,71.2833,1.0,2
2,1.0,3,1,26.0,0,0,7.9250,0.0,7
3,1.0,1,1,35.0,1,0,53.1000,0.0,2
4,0.0,3,0,35.0,0,0,8.0500,0.0,7
...,...,...,...,...,...,...,...,...,...
1304,,3,0,25.0,0,0,8.0500,0.0,7
1305,,1,1,39.0,0,0,108.9000,1.0,2
1306,,3,0,38.5,0,0,7.2500,0.0,7
1307,,3,0,25.0,0,0,8.0500,0.0,7


## 3. Feature Engineering

In this second iteration I normalize the features with min-max scaling (each feature is rescaled to the range [0, 1]) so that the resulting accuracy can be compared with the accuracy obtained using standardization.

In [10]:
# Separate the feature columns from the target so the target is not rescaled.
df_all_not_survived = df_all.drop(columns=['Survived'])

# Min-max scaling maps every feature column onto [0, 1].
# NOTE(review): the scaler is fitted on the train and test rows together, so the
# test set's min/max influence the training features -- confirm this is intended.
scaler = preprocessing.MinMaxScaler()
scaler.fit(df_all_not_survived)

# transform() returns a plain NumPy array; column names are restored in the next cell.
df_all_scaled = scaler.transform(df_all_not_survived)

In [11]:
# Wrap the scaled values in a DataFrame again, reusing the feature column names.
# The default 0..N-1 row index is kept.
feature_names = df_all_not_survived.columns
df_all_scaled = pd.DataFrame(data=df_all_scaled, columns=feature_names)

In [14]:
# Display the scaled feature frame (the rendered output is truncated by pandas).
df_all_scaled

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Cabin_Letter
0,1.0,0.0,0.273456,0.125,0.000000,0.014151,0.0,1.000000
1,0.0,1.0,0.473882,0.125,0.000000,0.139136,0.5,0.285714
2,1.0,1.0,0.323563,0.000,0.000000,0.015469,0.0,1.000000
3,0.0,1.0,0.436302,0.125,0.000000,0.103644,0.0,0.285714
4,1.0,0.0,0.436302,0.000,0.000000,0.015713,0.0,1.000000
...,...,...,...,...,...,...,...,...
1304,1.0,0.0,0.311036,0.000,0.000000,0.015713,0.0,1.000000
1305,0.0,1.0,0.486409,0.000,0.000000,0.212559,0.5,0.285714
1306,1.0,0.0,0.480145,0.000,0.000000,0.014151,0.0,1.000000
1307,1.0,0.0,0.311036,0.000,0.000000,0.015713,0.0,1.000000


In [15]:
# Summary statistics as a sanity check: after min-max scaling every column
# should report min 0.0 and max 1.0.
df_all_scaled.describe()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Cabin_Letter
count,1309.0,1309.0,1309.0,1309.0,1309.0,1309.0,1309.0,1309.0
mean,0.647441,0.355997,0.36369,0.062357,0.042781,0.064951,0.197097,0.850704
std,0.418918,0.478997,0.166665,0.130207,0.096173,0.100997,0.32675,0.293801
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.5,0.0,0.273456,0.0,0.0,0.015412,0.0,1.0
50%,1.0,0.0,0.323563,0.0,0.0,0.028213,0.0,1.0
75%,1.0,1.0,0.448829,0.125,0.0,0.061045,0.5,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [16]:
# Put the (unscaled) Survived target back as the first column, aligned on the
# shared 0..N-1 row index.
# Any Survived column already present is dropped first so that re-running this
# cell does not keep prepending duplicate Survived columns (the assignment is
# self-referential). On a first run the drop is a no-op.
df_all_scaled = pd.concat(
    [df_all['Survived'],
     df_all_scaled.drop(columns='Survived', errors='ignore')],
    axis=1,
)

In [17]:
# Split the combined frame back into train and test BY POSITION.
#
# df_all was built as concat([train, test]) and then renumbered 0..N-1, so the
# first len(idx_train) rows are the training rows and the remainder are the
# test rows. Selecting with index.isin(idx_test) is wrong here: idx_test holds
# the test file's own labels (starting at 0), which match the *first* rows of
# the combined frame, i.e. training rows, not test rows.
n_train = len(idx_train)
df_train = df_all_scaled.iloc[:n_train].copy()
df_test = df_all_scaled.iloc[n_train:].copy()

# Sanity checks: the tail must be exactly the test rows, which carry no label.
assert len(df_test) == len(idx_test), "train/test row counts do not add up"
assert df_test['Survived'].isna().all(), "test split contains labelled rows"

# Restore each split's original row labels so the saved files keep the same
# index as the input files. The Survived column is kept in df_test (all NaN)
# to leave the output schema unchanged.
df_train.index = idx_train
df_test.index = idx_test

In [18]:
# Persist both splits for the next notebook. The row index is written as the
# first CSV column (the default, spelled out here), matching how the input
# files are read with index_col=0.
df_train.to_csv('train_iter_3.csv', index=True)
df_test.to_csv('test_iter_3.csv', index=True)