## 1. Library Import

In [29]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing

## 2. Load Data

In [5]:
df_train = pd.read_csv('../Iteration_1_Baseline/train_iter_1.csv',
                      index_col = 0)
df_test = pd.read_csv('../Iteration_1_Baseline/test_iter_1.csv',
                     index_col = 0)

In [6]:
df_train.head(10)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Cabin_Letter
0,0.0,3,0,22.0,1,0,7.25,0.0,7
1,1.0,1,1,38.0,1,0,71.2833,1.0,2
2,1.0,3,1,26.0,0,0,7.925,0.0,7
3,1.0,1,1,35.0,1,0,53.1,0.0,2
4,0.0,3,0,35.0,0,0,8.05,0.0,7
5,0.0,3,0,25.0,0,0,8.4583,2.0,7
6,0.0,1,0,54.0,0,0,51.8625,0.0,4
7,0.0,3,0,2.0,3,1,21.075,0.0,7
8,1.0,3,1,27.0,0,2,11.1333,0.0,7
9,1.0,2,1,14.0,1,0,30.0708,1.0,7


In [7]:
df_test.head(10)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Cabin_Letter
0,3,0,34.5,0,0,7.8292,2.0,7
1,3,1,47.0,1,0,7.0,0.0,7
2,2,0,62.0,0,0,9.6875,2.0,7
3,3,0,27.0,0,0,8.6625,0.0,7
4,3,1,22.0,1,1,12.2875,0.0,7
5,3,0,14.0,0,0,9.225,0.0,7
6,3,1,30.0,0,0,7.6292,2.0,7
7,2,0,26.0,1,1,29.0,0.0,7
8,3,1,18.0,0,0,7.2292,1.0,7
9,3,0,21.0,2,0,24.15,0.0,7


In [10]:
idx_train = df_train.index
idx_test = df_test.index
df_all = pd.concat([df_train, df_test],
                  axis = 0, sort = False)
df_all.reset_index(drop = True,
                inplace = True)
df_all

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Cabin_Letter
0,0.0,3,0,22.0,1,0,7.2500,0.0,7
1,1.0,1,1,38.0,1,0,71.2833,1.0,2
2,1.0,3,1,26.0,0,0,7.9250,0.0,7
3,1.0,1,1,35.0,1,0,53.1000,0.0,2
4,0.0,3,0,35.0,0,0,8.0500,0.0,7
...,...,...,...,...,...,...,...,...,...
1304,,3,0,25.0,0,0,8.0500,0.0,7
1305,,1,1,39.0,0,0,108.9000,1.0,2
1306,,3,0,38.5,0,0,7.2500,0.0,7
1307,,3,0,25.0,0,0,8.0500,0.0,7


## 3. Feature Engineering

In this second iteration I will remove the outliers and standardize the dataset to observe the effect in the accuracy.

### 3.1. Standardization

In [40]:
df_all_scaled = preprocessing.scale(df_all)

In [41]:
df_all_scaled = pd.DataFrame(df_all_scaled,
                             columns = df_all.columns)

In [42]:
df_all_scaled

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Cabin_Letter
0,-0.789272,0.841916,-0.743497,-0.541613,0.481288,-0.445000,-0.503176,-0.603436,0.508349
1,1.266990,-1.546098,1.344995,0.661414,0.481288,-0.445000,0.734809,0.927373,-1.923773
2,1.266990,0.841916,1.344995,-0.240856,-0.479087,-0.445000,-0.490126,-0.603436,0.508349
3,1.266990,-1.546098,1.344995,0.435846,0.481288,-0.445000,0.383263,-0.603436,-1.923773
4,-0.789272,0.841916,-0.743497,0.435846,-0.479087,-0.445000,-0.487709,-0.603436,0.508349
...,...,...,...,...,...,...,...,...,...
1304,,0.841916,-0.743497,-0.316045,-0.479087,-0.445000,-0.487709,-0.603436,0.508349
1305,,-1.546098,1.344995,0.736603,-0.479087,-0.445000,1.462069,0.927373,-1.923773
1306,,0.841916,-0.743497,0.699008,-0.479087,-0.445000,-0.503176,-0.603436,0.508349
1307,,0.841916,-0.743497,-0.316045,-0.479087,-0.445000,-0.487709,-0.603436,0.508349


In [43]:
df_all_scaled.describe()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Cabin_Letter
count,891.0,1309.0,1309.0,1309.0,1309.0,1309.0,1309.0,1309.0,1309.0
mean,-2.287732e-16,-1.399441e-16,-1.284093e-16,5.589282e-17,-6.632925e-16,-8.549311e-17,-6.473474e-17,-1.1195530000000001e-17,2.005017e-16
std,1.000562,1.000382,1.000382,1.000382,1.000382,1.000382,1.000382,1.000382,1.000382
min,-0.7892723,-1.546098,-0.7434969,-2.182992,-0.4790868,-0.4449995,-0.6433437,-0.6034357,-2.896622
25%,-0.7892723,-0.3520907,-0.7434969,-0.5416129,-0.4790868,-0.4449995,-0.4906907,-0.6034357,0.5083488
50%,-0.7892723,0.8419164,-0.7434969,-0.2408563,-0.4790868,-0.4449995,-0.3638941,-0.6034357,0.5083488
75%,1.26699,0.8419164,1.344995,0.5110352,0.4812878,-0.4449995,-0.03868998,0.9273731,0.5083488
max,1.26699,0.8419164,1.344995,3.819358,7.203909,9.956864,9.261749,2.458182,0.5083488
