## 1. Library Import

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing

## 2. Load Data

In [2]:
# Load the train/test CSVs from the Iteration_1_Baseline folder.
# index_col=0 turns the first CSV column into the row index instead of a feature.
train_csv = '../Iteration_1_Baseline/train_iter_1.csv'
test_csv = '../Iteration_1_Baseline/test_iter_1.csv'
df_train = pd.read_csv(train_csv, index_col=0)
df_test = pd.read_csv(test_csv, index_col=0)

In [3]:
# Preview the first 10 training rows as a quick check of the loaded columns.
df_train.head(10)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Cabin_Letter
0,0.0,3,0,22.0,1,0,7.25,0.0,7
1,1.0,1,1,38.0,1,0,71.2833,1.0,2
2,1.0,3,1,26.0,0,0,7.925,0.0,7
3,1.0,1,1,35.0,1,0,53.1,0.0,2
4,0.0,3,0,35.0,0,0,8.05,0.0,7
5,0.0,3,0,25.0,0,0,8.4583,2.0,7
6,0.0,1,0,54.0,0,0,51.8625,0.0,4
7,0.0,3,0,2.0,3,1,21.075,0.0,7
8,1.0,3,1,27.0,0,2,11.1333,0.0,7
9,1.0,2,1,14.0,1,0,30.0708,1.0,7


In [4]:
# Preview the first 10 test rows as a quick check of the loaded columns.
df_test.head(10)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Cabin_Letter
0,3,0,34.5,0,0,7.8292,2.0,7
1,3,1,47.0,1,0,7.0,0.0,7
2,2,0,62.0,0,0,9.6875,2.0,7
3,3,0,27.0,0,0,8.6625,0.0,7
4,3,1,22.0,1,1,12.2875,0.0,7
5,3,0,14.0,0,0,9.225,0.0,7
6,3,1,30.0,0,0,7.6292,2.0,7
7,2,0,26.0,1,1,29.0,0.0,7
8,3,1,18.0,0,0,7.2292,1.0,7
9,3,0,21.0,2,0,24.15,0.0,7


In [5]:
# Remember each file's own row labels before the two frames are stacked.
idx_train, idx_test = df_train.index, df_test.index

# Stack train on top of test and renumber the rows 0..n-1 in a single step.
# sort=False keeps the existing column order; a column present in only one
# frame (Survived, absent from the test file) is filled with NaN in the other.
df_all = pd.concat([df_train, df_test], axis=0, sort=False, ignore_index=True)
df_all

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Cabin_Letter
0,0.0,3,0,22.0,1,0,7.2500,0.0,7
1,1.0,1,1,38.0,1,0,71.2833,1.0,2
2,1.0,3,1,26.0,0,0,7.9250,0.0,7
3,1.0,1,1,35.0,1,0,53.1000,0.0,2
4,0.0,3,0,35.0,0,0,8.0500,0.0,7
...,...,...,...,...,...,...,...,...,...
1304,,3,0,25.0,0,0,8.0500,0.0,7
1305,,1,1,39.0,0,0,108.9000,1.0,2
1306,,3,0,38.5,0,0,7.2500,0.0,7
1307,,3,0,25.0,0,0,8.0500,0.0,7


## 3. Feature Engineering

In this iteration I convert the categorical variables (Pclass, Embarked and Cabin_Letter) into dummy (one-hot) variables.

Afterwards I standardize the numeric variables again; in this case only Age, SibSp, Parch and Fare.

In [23]:
# One-hot encode the three categorical columns: each is replaced by one
# indicator column per distinct value, named '<column>_<value>'.
# NOTE: this overwrites df_all, so the cell cannot be re-run on its own output
# (the source columns no longer exist after the first run).
categorical_cols = ['Pclass', 'Embarked', 'Cabin_Letter']
df_all = pd.get_dummies(df_all, columns=categorical_cols, prefix=categorical_cols)
df_all

Unnamed: 0,Survived,Sex,Age,SibSp,Parch,Fare,Pclass_1,Pclass_2,Pclass_3,Embarked_0.0,Embarked_1.0,Embarked_2.0,Cabin_Letter_0,Cabin_Letter_1,Cabin_Letter_2,Cabin_Letter_3,Cabin_Letter_4,Cabin_Letter_5,Cabin_Letter_6,Cabin_Letter_7
0,0.0,0,22.0,1,0,7.2500,0,0,1,1,0,0,0,0,0,0,0,0,0,1
1,1.0,1,38.0,1,0,71.2833,1,0,0,0,1,0,0,0,1,0,0,0,0,0
2,1.0,1,26.0,0,0,7.9250,0,0,1,1,0,0,0,0,0,0,0,0,0,1
3,1.0,1,35.0,1,0,53.1000,1,0,0,1,0,0,0,0,1,0,0,0,0,0
4,0.0,0,35.0,0,0,8.0500,0,0,1,1,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1304,,0,25.0,0,0,8.0500,0,0,1,1,0,0,0,0,0,0,0,0,0,1
1305,,1,39.0,0,0,108.9000,1,0,0,0,1,0,0,0,1,0,0,0,0,0
1306,,0,38.5,0,0,7.2500,0,0,1,1,0,0,0,0,0,0,0,0,0,1
1307,,0,25.0,0,0,8.0500,0,0,1,1,0,0,0,0,0,0,0,0,0,1


In [24]:
# Columns that must not be standardized: the target, the 0/1 Sex flag and
# every one-hot indicator column.
unscaled_cols = [
    'Survived', 'Sex',
    'Pclass_1', 'Pclass_2', 'Pclass_3',
    'Embarked_0.0', 'Embarked_1.0', 'Embarked_2.0',
    'Cabin_Letter_0', 'Cabin_Letter_1', 'Cabin_Letter_2', 'Cabin_Letter_3',
    'Cabin_Letter_4', 'Cabin_Letter_5', 'Cabin_Letter_6', 'Cabin_Letter_7',
]
df_all_not_survived = df_all.drop(columns=unscaled_cols)

# Standardize the remaining columns (zero mean, unit variance per column).
# The statistics are computed over the train and test rows together, and
# preprocessing.scale hands back a bare array, so the labels are re-attached
# in a later cell.
df_all_scaled = preprocessing.scale(df_all_not_survived)

In [25]:
# Turn the scaled array back into a DataFrame, reusing the column labels of
# the frame that was scaled. Built from an array, the rows get the default
# 0..n-1 index.
df_all_scaled = pd.DataFrame(data=df_all_scaled, columns=df_all_not_survived.columns)

In [26]:
# Display the standardized columns (pandas abbreviates long frames in the output).
df_all_scaled

Unnamed: 0,Age,SibSp,Parch,Fare
0,-0.541613,0.481288,-0.445000,-0.503176
1,0.661414,0.481288,-0.445000,0.734809
2,-0.240856,-0.479087,-0.445000,-0.490126
3,0.435846,0.481288,-0.445000,0.383263
4,0.435846,-0.479087,-0.445000,-0.487709
...,...,...,...,...
1304,-0.316045,-0.479087,-0.445000,-0.487709
1305,0.736603,-0.479087,-0.445000,1.462069
1306,0.699008,-0.479087,-0.445000,-0.503176
1307,-0.316045,-0.479087,-0.445000,-0.487709


In [27]:
# Sanity check of the scaling: each column's mean should be ~0 and its std ~1.
df_all_scaled.describe()

Unnamed: 0,Age,SibSp,Parch,Fare
count,1309.0,1309.0,1309.0,1309.0
mean,5.589282e-17,-6.632925e-16,-8.549311e-17,-6.473474e-17
std,1.000382,1.000382,1.000382,1.000382
min,-2.182992,-0.4790868,-0.4449995,-0.6433437
25%,-0.5416129,-0.4790868,-0.4449995,-0.4906907
50%,-0.2408563,-0.4790868,-0.4449995,-0.3638941
75%,0.5110352,0.4812878,-0.4449995,-0.03868998
max,3.819358,7.203909,9.956864,9.261749


In [36]:
# Columns that were excluded from scaling: the target, the 0/1 Sex flag and
# the one-hot indicator columns.
unscaled_cols = [
    'Survived', 'Sex',
    'Pclass_1', 'Pclass_2', 'Pclass_3',
    'Embarked_0.0', 'Embarked_1.0', 'Embarked_2.0',
    'Cabin_Letter_0', 'Cabin_Letter_1', 'Cabin_Letter_2', 'Cabin_Letter_3',
    'Cabin_Letter_4', 'Cabin_Letter_5', 'Cabin_Letter_6', 'Cabin_Letter_7',
]
df_all_dummy = df_all.loc[:, unscaled_cols]

# Put the unscaled and the standardized columns side by side; concat with
# axis=1 aligns the two frames on their row index.
# Only the standardized columns are taken from df_all_scaled. Because this
# cell reassigns df_all_scaled, taking the whole frame would append a second
# copy of the unscaled columns every time the cell is re-run.
scaled_cols = df_all_not_survived.columns
df_all_scaled = pd.concat(
    [df_all_dummy, df_all_scaled.loc[:, scaled_cols]],
    axis=1,
)

In [38]:
# Split the combined frame back into its train and test parts BY POSITION.
# The combined frame was built as [train rows, test rows] and then renumbered
# 0..n-1, so the labels stored in idx_test (which restart at 0 in the test
# file) no longer identify the test rows. Matching on them with
# index.isin(idx_test) selected the first len(idx_test) TRAINING rows instead.
n_train = len(idx_train)
assert len(df_all_scaled) == n_train + len(idx_test), \
    'row count changed since train and test were concatenated'

df_train = df_all_scaled.iloc[:n_train].copy()
df_test = df_all_scaled.iloc[n_train:].copy()

# Give each split its original row labels back so the saved CSVs are indexed
# like the input files.
df_train.index = idx_train
df_test.index = idx_test

# The Survived column is kept in df_test for column compatibility; it is NaN
# there because the test file carried no labels.

In [40]:
# Write both splits to CSV, relative to the notebook's working directory.
# The row index is written as the first column (to_csv default).
df_train.to_csv(path_or_buf='train_iter_5.csv')
df_test.to_csv(path_or_buf='test_iter_5.csv')