## 1. Library Import

In [113]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_curve, auc
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import cross_validate, cross_val_predict

## 2. Load Data

In [114]:
df_train = pd.read_csv('../../data/train_clean.csv',
                      index_col = 0)
df_test = pd.read_csv('../../data/test_clean.csv',
                     index_col = 0)

In [115]:
df_train.head(10)

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Cabin_Letter
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,0.0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.25,Other,0.0,7
2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,C85,1.0,2
3,1.0,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.925,Other,0.0,7
4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,113803,53.1,C123,0.0,2
5,0.0,3,"Allen, Mr. William Henry",0,35.0,0,0,373450,8.05,Other,0.0,7
6,0.0,3,"Moran, Mr. James",0,25.0,0,0,330877,8.4583,Other,2.0,7
7,0.0,1,"McCarthy, Mr. Timothy J",0,54.0,0,0,17463,51.8625,E46,0.0,4
8,0.0,3,"Palsson, Master. Gosta Leonard",0,2.0,3,1,349909,21.075,Other,0.0,7
9,1.0,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",1,27.0,0,2,347742,11.1333,Other,0.0,7
10,1.0,2,"Nasser, Mrs. Nicholas (Adele Achem)",1,14.0,1,0,237736,30.0708,Other,1.0,7


In [116]:
df_test.head(10)

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Cabin_Letter
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
892,,3,"Kelly, Mr. James",0,34.5,0,0,330911,7.8292,Other,2.0,7
893,,3,"Wilkes, Mrs. James (Ellen Needs)",1,47.0,1,0,363272,7.0,Other,0.0,7
894,,2,"Myles, Mr. Thomas Francis",0,62.0,0,0,240276,9.6875,Other,2.0,7
895,,3,"Wirz, Mr. Albert",0,27.0,0,0,315154,8.6625,Other,0.0,7
896,,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",1,22.0,1,1,3101298,12.2875,Other,0.0,7
897,,3,"Svensson, Mr. Johan Cervin",0,14.0,0,0,7538,9.225,Other,0.0,7
898,,3,"Connolly, Miss. Kate",1,30.0,0,0,330972,7.6292,Other,2.0,7
899,,2,"Caldwell, Mr. Albert Francis",0,26.0,1,1,248738,29.0,Other,0.0,7
900,,3,"Abrahim, Mrs. Joseph (Sophie Halaut Easu)",1,18.0,0,0,2657,7.2292,Other,1.0,7
901,,3,"Davies, Mr. John Samuel",0,21.0,2,0,A/4 48871,24.15,Other,0.0,7


In [117]:
idx_train = df_train.index
idx_test = df_test.index
df_all = pd.concat([df_train, df_test],
                  axis = 0, sort = False)
df_all.reset_index(drop = True,
                inplace = True)
index_col = df_all.index
df_all

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Cabin_Letter
0,0.0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.2500,Other,0.0,7
1,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,C85,1.0,2
2,1.0,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.9250,Other,0.0,7
3,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,113803,53.1000,C123,0.0,2
4,0.0,3,"Allen, Mr. William Henry",0,35.0,0,0,373450,8.0500,Other,0.0,7
...,...,...,...,...,...,...,...,...,...,...,...,...
1304,,3,"Spector, Mr. Woolf",0,25.0,0,0,A.5. 3236,8.0500,Other,0.0,7
1305,,1,"Oliva y Ocana, Dona. Fermina",1,39.0,0,0,PC 17758,108.9000,C105,1.0,2
1306,,3,"Saether, Mr. Simon Sivertsen",0,38.5,0,0,SOTON/O.Q. 3101262,7.2500,Other,0.0,7
1307,,3,"Ware, Mr. Frederick",0,25.0,0,0,359309,8.0500,Other,0.0,7


## 3. Feature Engineering

In this iteration I will create a feature to identify families with at least one male that survived.

At the I will perform the same actions as in iteration 12.

In [118]:
df_all['Ticket_number'] = df_all['Ticket'].str.split(' ')
df_all['Ticket_number'] = df_all['Ticket_number'].apply(lambda x: x[-1])
map_ticket_survival = pd.DataFrame(df_all['Ticket_number'].value_counts())
df_all['Ticket_count'] = df_all['Ticket_number'].apply(lambda x: (map_ticket_survival[map_ticket_survival['Ticket_number'].index == x].iloc[0].iloc[0]) if x in map_ticket_survival.index else 0)
df_all['Ticket_survived'] = df_all.apply(lambda x: 1 if (df_all[df_all['Ticket_number'] == x['Ticket_number']].Survived.sum() > 0) & (x['Ticket_count'] > 1) else 0, axis = 1)
df_all['Ticket_type'] = df_all['Ticket'].str.split(' ')
df_all['Ticket_type'] = df_all['Ticket_type'].apply(lambda x: x[0][0] if len(x) > 1 else 'Other')
dict_ticket_type = {'A':0,'C':1,'F':2,'L':3,'P':4,'S':5,'W':6,'Other':7}
df_all['Ticket_type'] = df_all['Ticket_type'].map(dict_ticket_type)

In [119]:
df_all['Woman_with_child'] = df_all.apply(lambda x: 1 if (x['Sex'] == 1) & (x['Age'] > 18) & (x['Parch'] > 0) else 0,
                                         axis = 1)
df_all

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Cabin_Letter,Ticket_number,Ticket_count,Ticket_survived,Ticket_type,Woman_with_child
0,0.0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.2500,Other,0.0,7,21171,1,0,0,0
1,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,C85,1.0,2,17599,2,1,4,0
2,1.0,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.9250,Other,0.0,7,3101282,1,0,5,0
3,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,113803,53.1000,C123,0.0,2,113803,2,1,7,0
4,0.0,3,"Allen, Mr. William Henry",0,35.0,0,0,373450,8.0500,Other,0.0,7,373450,1,0,7,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1304,,3,"Spector, Mr. Woolf",0,25.0,0,0,A.5. 3236,8.0500,Other,0.0,7,3236,1,0,0,0
1305,,1,"Oliva y Ocana, Dona. Fermina",1,39.0,0,0,PC 17758,108.9000,C105,1.0,2,17758,3,1,4,0
1306,,3,"Saether, Mr. Simon Sivertsen",0,38.5,0,0,SOTON/O.Q. 3101262,7.2500,Other,0.0,7,3101262,1,0,5,0
1307,,3,"Ware, Mr. Frederick",0,25.0,0,0,359309,8.0500,Other,0.0,7,359309,1,0,7,0


In [120]:
df_all['Title'] = df_all['Name'].str.extract(', ([A-Za-z]+).')
df_all.groupby('Title')['Pclass'].count()

Title
Capt          1
Col           4
Don           1
Dona          1
Dr            8
Jonkheer      1
Lady          1
Major         2
Master       61
Miss        260
Mlle          2
Mme           1
Mr          757
Mrs         197
Ms            2
Rev           8
Sir           1
the           1
Name: Pclass, dtype: int64

We can see that most of the people are under the common titles (Master, Miss, Mr, Mrs). I will try to pack the uncommon ones in less categories.

- Capt, Col and Majo --> Military Rank
- Don --> Mr
- Dona --> Mrs
- Dr, Jonkheer, Lady, Sir, rev --> High class
- Mlle --> Miss
- Mme --> Mrs
- Ms --> Mrs
- the --> Mr

In [121]:
map_dict = {'Capt':'Soldier','Col':'Soldier','Don':'Mr','Dona':'Mrs','Dr':'Upper','Jonkheer':'Upper',
           'Lady':'Upper','Major':'Soldier','Mlle':'Miss','Mme':'Mrs','Ms':'Mrs','Rev':'Upper',
           'Sir':'Upper','the':'Mr','Master':'Master','Miss':'Miss','Mr':'Mr','Mrs':'Mrs'}
df_all['Title'] = df_all['Title'].map(map_dict)
df_all.groupby('Title')['Pclass'].count()

Title
Master      61
Miss       262
Mr         759
Mrs        201
Soldier      7
Upper       19
Name: Pclass, dtype: int64

In [122]:
df_all['Surname'] = df_all['Name'].str.split(',')
df_all['Surname'] = df_all['Surname'].apply(lambda x: x[0])
df_all

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Cabin_Letter,Ticket_number,Ticket_count,Ticket_survived,Ticket_type,Woman_with_child,Title,Surname
0,0.0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.2500,Other,0.0,7,21171,1,0,0,0,Mr,Braund
1,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,C85,1.0,2,17599,2,1,4,0,Mrs,Cumings
2,1.0,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.9250,Other,0.0,7,3101282,1,0,5,0,Miss,Heikkinen
3,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,113803,53.1000,C123,0.0,2,113803,2,1,7,0,Mrs,Futrelle
4,0.0,3,"Allen, Mr. William Henry",0,35.0,0,0,373450,8.0500,Other,0.0,7,373450,1,0,7,0,Mr,Allen
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1304,,3,"Spector, Mr. Woolf",0,25.0,0,0,A.5. 3236,8.0500,Other,0.0,7,3236,1,0,0,0,Mr,Spector
1305,,1,"Oliva y Ocana, Dona. Fermina",1,39.0,0,0,PC 17758,108.9000,C105,1.0,2,17758,3,1,4,0,Mrs,Oliva y Ocana
1306,,3,"Saether, Mr. Simon Sivertsen",0,38.5,0,0,SOTON/O.Q. 3101262,7.2500,Other,0.0,7,3101262,1,0,5,0,Mr,Saether
1307,,3,"Ware, Mr. Frederick",0,25.0,0,0,359309,8.0500,Other,0.0,7,359309,1,0,7,0,Mr,Ware


In [123]:
def find_male_survived(surname):
    data = df_all[(df_all['Surname'] == surname) & (df_all['Sex'] == 0) & (df_all['Age'] > 14)]
    if data.shape[0] > 1:
        if data['Survived'].sum() > 0:
            return 1
        else:
            return 0
    else:
        return 0

df_all['Family_male_survived'] = df_all['Surname'].apply(lambda x: find_male_survived(x))

In [124]:
def find_female_child_died(surname):
    data = df_all[((df_all['Surname'] == surname) & (df_all['Sex'] == 1) & (df_all['Age'] > 14)) | ((df_all['Surname'] == surname) & (df_all['Age'] < 14))]
    woman_child_survive = 0
    i = 0
    if data.shape[0] > 1:
        while ((i <= data.shape[0]-1) & (woman_child_survive == 0)):
            if data['Survived'].iloc[i] == np.nan:
                continue
            elif data['Survived'].iloc[i] == 0:
                woman_child_survive = 1
            i += 1
    else:
        return 0
    return woman_child_survive

df_all['Family_woman_child_died'] = df_all['Surname'].apply(lambda x: find_female_child_died(x))

In [125]:
df_all['Family_size'] = df_all.apply(lambda x: x['SibSp']+x['Parch']+1,
                                    axis = 1)
df_all['Alone'] = df_all['Family_size'].apply(lambda x: 1 if x == 1 else 0)
df_all

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,...,Ticket_count,Ticket_survived,Ticket_type,Woman_with_child,Title,Surname,Family_male_survived,Family_woman_child_died,Family_size,Alone
0,0.0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.2500,Other,...,1,0,0,0,Mr,Braund,0,0,2,0
1,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,C85,...,2,1,4,0,Mrs,Cumings,0,0,2,0
2,1.0,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.9250,Other,...,1,0,5,0,Miss,Heikkinen,0,0,1,1
3,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,113803,53.1000,C123,...,2,1,7,0,Mrs,Futrelle,0,0,2,0
4,0.0,3,"Allen, Mr. William Henry",0,35.0,0,0,373450,8.0500,Other,...,1,0,7,0,Mr,Allen,0,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1304,,3,"Spector, Mr. Woolf",0,25.0,0,0,A.5. 3236,8.0500,Other,...,1,0,0,0,Mr,Spector,0,0,1,1
1305,,1,"Oliva y Ocana, Dona. Fermina",1,39.0,0,0,PC 17758,108.9000,C105,...,3,1,4,0,Mrs,Oliva y Ocana,0,0,1,1
1306,,3,"Saether, Mr. Simon Sivertsen",0,38.5,0,0,SOTON/O.Q. 3101262,7.2500,Other,...,1,0,5,0,Mr,Saether,0,0,1,1
1307,,3,"Ware, Mr. Frederick",0,25.0,0,0,359309,8.0500,Other,...,1,0,7,0,Mr,Ware,0,0,1,1


In [126]:
df_all.drop(['Name','Ticket','Cabin','Ticket_number','Ticket_count','Surname'],
           axis = 1,
           inplace = True)

In [127]:
df_all = pd.get_dummies(df_all, 
                        columns = ['Pclass','Embarked','Title','Ticket_type'], 
                        prefix = ['Pclass','Embarked','Title','Ticket_type'])
df_all

Unnamed: 0,Survived,Sex,Age,SibSp,Parch,Fare,Cabin_Letter,Ticket_survived,Woman_with_child,Family_male_survived,...,Title_Soldier,Title_Upper,Ticket_type_0,Ticket_type_1,Ticket_type_2,Ticket_type_3,Ticket_type_4,Ticket_type_5,Ticket_type_6,Ticket_type_7
0,0.0,0,22.0,1,0,7.2500,7,0,0,0,...,0,0,1,0,0,0,0,0,0,0
1,1.0,1,38.0,1,0,71.2833,2,1,0,0,...,0,0,0,0,0,0,1,0,0,0
2,1.0,1,26.0,0,0,7.9250,7,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,1.0,1,35.0,1,0,53.1000,2,1,0,0,...,0,0,0,0,0,0,0,0,0,1
4,0.0,0,35.0,0,0,8.0500,7,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1304,,0,25.0,0,0,8.0500,7,0,0,0,...,0,0,1,0,0,0,0,0,0,0
1305,,1,39.0,0,0,108.9000,2,1,0,0,...,0,0,0,0,0,0,1,0,0,0
1306,,0,38.5,0,0,7.2500,7,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1307,,0,25.0,0,0,8.0500,7,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [128]:
df_all_not_survived = df_all.drop(['Survived','Sex','Pclass_1','Pclass_2','Pclass_3','Embarked_0.0',
                                  'Embarked_1.0','Embarked_2.0','Cabin_Letter','Alone','Title_Master','Title_Miss',
                                  'Title_Mr','Title_Mrs','Title_Soldier','Title_Upper','Ticket_survived',
                                  'Woman_with_child','Family_male_survived','Family_woman_child_died',
                                  'Ticket_type_0','Ticket_type_1','Ticket_type_2','Ticket_type_3',
                                  'Ticket_type_4','Ticket_type_5','Ticket_type_6','Ticket_type_7'],
                                 axis = 1)
df_all_scaled = preprocessing.scale(df_all_not_survived)

In [129]:
df_all_scaled = pd.DataFrame(df_all_scaled,
                             columns = df_all_not_survived.columns)

In [130]:
df_all_scaled

Unnamed: 0,Age,SibSp,Parch,Fare,Family_size
0,-0.541613,0.481288,-0.445000,-0.503176,0.073352
1,0.661414,0.481288,-0.445000,0.734809,0.073352
2,-0.240856,-0.479087,-0.445000,-0.490126,-0.558346
3,0.435846,0.481288,-0.445000,0.383263,0.073352
4,0.435846,-0.479087,-0.445000,-0.487709,-0.558346
...,...,...,...,...,...
1304,-0.316045,-0.479087,-0.445000,-0.487709,-0.558346
1305,0.736603,-0.479087,-0.445000,1.462069,-0.558346
1306,0.699008,-0.479087,-0.445000,-0.503176,-0.558346
1307,-0.316045,-0.479087,-0.445000,-0.487709,-0.558346


In [131]:
df_all_scaled.describe()

Unnamed: 0,Age,SibSp,Parch,Fare,Family_size
count,1309.0,1309.0,1309.0,1309.0,1309.0
mean,5.589282e-17,-6.632925e-16,-8.549311e-17,-6.473474e-17,2.59363e-16
std,1.000382,1.000382,1.000382,1.000382,1.000382
min,-2.182992,-0.4790868,-0.4449995,-0.6433437,-0.5583461
25%,-0.5416129,-0.4790868,-0.4449995,-0.4906907,-0.5583461
50%,-0.2408563,-0.4790868,-0.4449995,-0.3638941,-0.5583461
75%,0.5110352,0.4812878,-0.4449995,-0.03868998,0.07335229
max,3.819358,7.203909,9.956864,9.261749,5.758637


In [132]:
df_all_dummy = df_all.loc[:,['Survived','Sex','Pclass_1','Pclass_2','Pclass_3','Embarked_0.0',
                                  'Embarked_1.0','Embarked_2.0','Cabin_Letter','Alone','Title_Master','Title_Miss',
                                  'Title_Mr','Title_Mrs','Title_Soldier','Title_Upper','Ticket_survived',
                                  'Woman_with_child','Family_male_survived','Family_woman_child_died',
                                  'Ticket_type_0','Ticket_type_1','Ticket_type_2','Ticket_type_3',
                                  'Ticket_type_4','Ticket_type_5','Ticket_type_6','Ticket_type_7']]
df_all_scaled = pd.concat([df_all_dummy,df_all_scaled], axis = 1)

In [134]:
#Fit the model using the passengers with identified cabin
df_all_scaled_pred = df_all_scaled[df_all_scaled['Cabin_Letter'] != 7]
X_cabin_ml = df_all_scaled_pred.drop(['Cabin_Letter','Survived'],
                             axis = 1)
Y_cabin_ml = df_all_scaled_pred['Cabin_Letter']

In [76]:
#define the model
model = DecisionTreeClassifier()

#define the parameters to search
criterion = ['gini', 'entropy']
splitter = ['best','random']
#max_depth = list(range(1,100,10))
min_samples_split = list(range(2,40,2))
min_samples_leaf = list(range(1,20,2))
max_features = ['auto','sqrt','log2', None]

#define grid search
grid = dict(criterion = criterion,
           splitter = splitter,
           min_samples_split = min_samples_split,
           min_samples_leaf = min_samples_leaf,
           max_features = max_features)
cv = RepeatedStratifiedKFold(n_splits = 10, 
                             n_repeats = 3, 
                             random_state = 1)
grid_search = GridSearchCV(estimator = model,
                          param_grid = grid,
                          n_jobs = -1,
                          cv = cv,
                          scoring = 'accuracy',
                          error_score = 0,
                          verbose = 2)
grid_result = grid_search.fit(X_cabin_ml, Y_cabin_ml)

#summarize results
print('Best: %f using %s' % (grid_result.best_score_,
                            grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
results_dict = dict(means = means,
                  stds = stds,
                  params = params)

Fitting 30 folds for each of 3040 candidates, totalling 91200 fits




Best: 0.638046 using {'criterion': 'entropy', 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'splitter': 'best'}


The result is not great but it might help with the overall performance.

In [135]:
X_cabin_ml_pred = df_all_scaled[df_all_scaled['Cabin_Letter'] == 7].drop(['Survived','Cabin_Letter'],
                                                                        axis = 1)

In [136]:
model = DecisionTreeClassifier(criterion = 'entropy',
                       splitter = 'best',
                       min_samples_split = 2,
                       min_samples_leaf = 1,
                       max_features = None).fit(X_cabin_ml, Y_cabin_ml)
Y_cabin_ml_pred = model.predict(X_cabin_ml_pred)

In [137]:
df_Y_cabin_ml_pred = pd.DataFrame(data = Y_cabin_ml_pred,
                                 columns = ['Cabin_Letter'],
                                 index = X_cabin_ml_pred.index)
df_cabin_ml_pred = pd.concat([X_cabin_ml_pred,df_Y_cabin_ml_pred], axis = 1)
df_cabin_ml_pred

Unnamed: 0,Sex,Pclass_1,Pclass_2,Pclass_3,Embarked_0.0,Embarked_1.0,Embarked_2.0,Alone,Title_Master,Title_Miss,...,Ticket_type_4,Ticket_type_5,Ticket_type_6,Ticket_type_7,Age,SibSp,Parch,Fare,Family_size,Cabin_Letter
0,0,0,0,1,1,0,0,0,0,0,...,0,0,0,0,-0.541613,0.481288,-0.445000,-0.503176,0.073352,5
2,1,0,0,1,1,0,0,1,0,1,...,0,1,0,0,-0.240856,-0.479087,-0.445000,-0.490126,-0.558346,5
4,0,0,0,1,1,0,0,1,0,0,...,0,0,0,1,0.435846,-0.479087,-0.445000,-0.487709,-0.558346,4
5,0,0,0,1,0,0,1,1,0,0,...,0,0,0,1,-0.316045,-0.479087,-0.445000,-0.479816,-0.558346,5
7,0,0,0,1,1,0,0,0,1,0,...,0,0,0,1,-2.045396,2.402037,0.710763,-0.235891,1.968447,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1303,1,0,0,1,1,0,0,1,0,1,...,0,0,0,1,-0.090478,-0.479087,-0.445000,-0.493026,-0.558346,5
1304,0,0,0,1,1,0,0,1,0,0,...,0,0,0,0,-0.316045,-0.479087,-0.445000,-0.487709,-0.558346,5
1306,0,0,0,1,1,0,0,1,0,0,...,0,1,0,0,0.699008,-0.479087,-0.445000,-0.503176,-0.558346,5
1307,0,0,0,1,1,0,0,1,0,0,...,0,0,0,1,-0.316045,-0.479087,-0.445000,-0.487709,-0.558346,5


In [138]:
df_all_pred = pd.concat([df_all_scaled_pred,df_cabin_ml_pred], axis = 0).reset_index()
df_all_pred.sort_values('index',
                       inplace = True)
df_cabin_pred = df_all_pred[['index','Cabin_Letter']].set_index('index',
                                               drop = True)

In [139]:
df_all_scaled_final = df_all_scaled.drop('Cabin_Letter', axis = 1)
df_all_scaled_final = pd.concat([df_all_scaled_final,df_cabin_pred],
                               axis = 1)
df_all_scaled_final

Unnamed: 0,Survived,Sex,Pclass_1,Pclass_2,Pclass_3,Embarked_0.0,Embarked_1.0,Embarked_2.0,Alone,Title_Master,...,Ticket_type_4,Ticket_type_5,Ticket_type_6,Ticket_type_7,Age,SibSp,Parch,Fare,Family_size,Cabin_Letter
0,0.0,0,0,0,1,1,0,0,0,0,...,0,0,0,0,-0.541613,0.481288,-0.445000,-0.503176,0.073352,5
1,1.0,1,1,0,0,0,1,0,0,0,...,1,0,0,0,0.661414,0.481288,-0.445000,0.734809,0.073352,2
2,1.0,1,0,0,1,1,0,0,1,0,...,0,1,0,0,-0.240856,-0.479087,-0.445000,-0.490126,-0.558346,5
3,1.0,1,1,0,0,1,0,0,0,0,...,0,0,0,1,0.435846,0.481288,-0.445000,0.383263,0.073352,2
4,0.0,0,0,0,1,1,0,0,1,0,...,0,0,0,1,0.435846,-0.479087,-0.445000,-0.487709,-0.558346,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1304,,0,0,0,1,1,0,0,1,0,...,0,0,0,0,-0.316045,-0.479087,-0.445000,-0.487709,-0.558346,5
1305,,1,1,0,0,0,1,0,1,0,...,1,0,0,0,0.736603,-0.479087,-0.445000,1.462069,-0.558346,2
1306,,0,0,0,1,1,0,0,1,0,...,0,1,0,0,0.699008,-0.479087,-0.445000,-0.503176,-0.558346,5
1307,,0,0,0,1,1,0,0,1,0,...,0,0,0,1,-0.316045,-0.479087,-0.445000,-0.487709,-0.558346,5


In [140]:
df_train = df_all_scaled_final[df_all_scaled.index.isin(idx_train-1)]
df_test = df_all_scaled_final[df_all_scaled.index.isin(idx_test-1)]

In [142]:
df_train

Unnamed: 0,Survived,Sex,Pclass_1,Pclass_2,Pclass_3,Embarked_0.0,Embarked_1.0,Embarked_2.0,Alone,Title_Master,...,Ticket_type_4,Ticket_type_5,Ticket_type_6,Ticket_type_7,Age,SibSp,Parch,Fare,Family_size,Cabin_Letter
0,0.0,0,0,0,1,1,0,0,0,0,...,0,0,0,0,-0.541613,0.481288,-0.445000,-0.503176,0.073352,5
1,1.0,1,1,0,0,0,1,0,0,0,...,1,0,0,0,0.661414,0.481288,-0.445000,0.734809,0.073352,2
2,1.0,1,0,0,1,1,0,0,1,0,...,0,1,0,0,-0.240856,-0.479087,-0.445000,-0.490126,-0.558346,5
3,1.0,1,1,0,0,1,0,0,0,0,...,0,0,0,1,0.435846,0.481288,-0.445000,0.383263,0.073352,2
4,0.0,0,0,0,1,1,0,0,1,0,...,0,0,0,1,0.435846,-0.479087,-0.445000,-0.487709,-0.558346,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0.0,0,0,1,0,1,0,0,1,0,...,0,0,0,1,-0.165667,-0.479087,-0.445000,-0.392009,-0.558346,5
887,1.0,1,1,0,0,1,0,0,1,0,...,0,0,0,1,-0.767180,-0.479087,-0.445000,-0.063340,-0.558346,1
888,0.0,1,0,0,1,1,0,0,0,0,...,0,0,1,0,-1.180721,0.481288,1.866526,-0.189974,1.336749,5
889,1.0,0,1,0,0,0,1,0,1,0,...,0,0,0,1,-0.240856,-0.479087,-0.445000,-0.063340,-0.558346,2


In [143]:
df_train.to_csv('train_iter_15.csv')
df_test.to_csv('test_iter_15.csv')