In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

%matplotlib inline

df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')


Check how the data look like:

In [2]:
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


Remove 'PassengerId' and the label 'Survived':

Remove 'PassengerId'

In [3]:
df_test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [4]:
def concat_df(train_data, test_data):
    # Returns a concatenated df of training and test set on axis 0
    return pd.concat([train_data, test_data], sort=True).reset_index(drop=True)

def divide_df(all_data):
    # Returns divided dfs of training and test set
    return all_data.loc[:890], all_data.loc[891:]
    

In [5]:
df_all = concat_df(df_train, df_test)
# Drop PassengerId
df_all.drop('PassengerId', axis = 1, inplace = True)

In [6]:
print('Missing data count on training set: ', df_train.isnull().sum())
print('Missing data count on test set: ', df_test.isnull().sum())

Missing data count on training set:  PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64
Missing data count on test set:  PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64


In [7]:
df_all['Title'] = df_all['Name'].str.extract('([A-Za-z]+)\.', expand = False)
df_all[df_all['Title'].isnull()]
df_all['Title'] = df_all['Title'].replace(['Miss', 'Mrs','Ms', 'Mlle', 'Lady', 'Mme', 'the Countess', 'Dona'], 'Miss/Mrs/Ms')
df_all['Title'] = df_all['Title'].replace(['Dr', 'Col', 'Major', 'Jonkheer', 'Capt', 'Sir', 'Don', 'Rev'], 'Dr/Military/Noble/Clergy')
df_all['Title'].value_counts()
#df_all['Title'].fillna('N', inplace = True)

Mr                          757
Miss/Mrs/Ms                 464
Master                       61
Dr/Military/Noble/Clergy     26
Countess                      1
Name: Title, dtype: int64

In [8]:
Sex_mapping = {'male': 0, 'female': 1}
df_all['Sex'] = df_all['Sex'].map(Sex_mapping)

In [9]:
# Scaling
#df_all['Pclass'] = StandardScaler().fit_transform(df_all['Pclass'].values.reshape(-1, 1))

In [10]:
# Fill in missing Age values
#med_age = df_train['Age'].median()
#print('median age ', med_age)
#df_train['Age'].fillna(med_age, inplace=True)
df_all['Age'] = df_all.groupby(['Sex', 'Pclass'])['Age'].apply(lambda x: x.fillna(x.median()))

# Scaling - determine not to do feature scaling for Age because of no performance improvement
#df_all['Age'] = StandardScaler().fit_transform(df_all['Age'].values.reshape(-1, 1))


In [11]:
df_all['Family_size'] = df_all['SibSp'] + df_all['Parch'] + 1
#df_all['Family_size'].astype(str)
family_map = {1: 'Alone', 2: 'Small', 3: 'Small', 4: 'Small', 5: 'Medium', 6: 'Medium', 7: 'Large', 8: 'Large', 11: 'Large'}
df_all['Family_group'] = df_all['Family_size'].map(family_map)
# Scalling
#X_train['Family_size'] = StandardScaler().fit_transform(X_train['Family_size'].values.reshape(-1, 1))

In [12]:
# Find in the missing value for Fare
df_all[df_all['Fare'].isnull()]

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,Pclass,Sex,SibSp,Survived,Ticket,Title,Family_size,Family_group
1043,60.5,,S,,"Storey, Mr. Thomas",0,3,0,0,,3701,Mr,1,Alone


In [13]:
# Fill in the missing value for Fare with class 3 median fare
med_fare = df_all.groupby(['Pclass', 'Parch', 'SibSp']).Fare.median()[3][0][0]
# Filling the missing value in Fare with the median Fare of 3rd class alone passenger
df_all['Fare'] = df_all['Fare'].fillna(med_fare)

In [14]:
df_all['Fare_per_person'] = df_all['Fare']/df_all['Family_size']
# Scalling - No impact on tree related algorithms such as Random Forest and Gradient Boosting. Has positive impact on Logistic Regression.
df_all['Fare'] = StandardScaler().fit_transform(df_all['Fare'].values.reshape(-1, 1))
df_all['Fare_per_person'] = StandardScaler().fit_transform(df_all['Fare_per_person'].values.reshape(-1, 1))

In [15]:
df_all['Cabin'] = df_all['Cabin'].apply(lambda s: s[0] if pd.notnull(s) else 'N')

#df_all['Cabin'] = df_all['Cabin'].replace(['A', 'B', 'C'], 'ABC')
#df_all['Cabin'] = df_all['Cabin'].replace(['D', 'E'], 'DE')
#df_all['Cabin'] = df_all['Cabin'].replace(['F', 'G'], 'FG')

In [16]:
# Find the missing data in 'Embarded'
df_all[df_all['Embarked'].isnull()]

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,Pclass,Sex,SibSp,Survived,Ticket,Title,Family_size,Family_group,Fare_per_person
61,38.0,B,,0.903334,"Icard, Miss. Amelie",0,1,1,0,1.0,113572,Miss/Mrs/Ms,1,Alone,1.664157
829,62.0,B,,0.903334,"Stone, Mrs. George Nelson (Martha Evelyn)",0,1,1,0,1.0,113572,Miss/Mrs/Ms,1,Alone,1.664157


In [17]:
# Fill out the missing values in 'Embarked'
df_all['Embarked'].fillna('S', inplace = True)
#df_all['Embarked'].isnull().sum()


#Embarked_mapping = {'C': 0 , 'Q': 1, 'S':2}
#df_all['Embarked'] = df_all['Embarked'].map(Embarked_mapping)

One-hot encoding

In [18]:
df_all[['Sex','Pclass', 'Title', 'Cabin', 'Embarked']] = df_all[['Sex', 'Pclass', 'Title', 'Cabin', 'Embarked']].astype(str)
one_hot = pd.get_dummies(df_all[['Sex', 'Pclass', 'Title', 'Cabin', 'Embarked']])
print('one hot columns ', one_hot.columns)
#df_all.drop(['Sex'], axis = 1, inplace = True)
#df_all.drop(['Pclass'], axis = 1, inplace = True)
#X_train.drop('Embarked', axis = 1, inplace = True)

one hot columns  Index(['Sex_0', 'Sex_1', 'Pclass_1', 'Pclass_2', 'Pclass_3', 'Title_Countess',
       'Title_Dr/Military/Noble/Clergy', 'Title_Master', 'Title_Miss/Mrs/Ms',
       'Title_Mr', 'Cabin_A', 'Cabin_B', 'Cabin_C', 'Cabin_D', 'Cabin_E',
       'Cabin_F', 'Cabin_G', 'Cabin_N', 'Cabin_T', 'Embarked_C', 'Embarked_Q',
       'Embarked_S'],
      dtype='object')


In [19]:
df_train, df_test = divide_df(df_all)
one_hot_train, one_hot_test = divide_df(one_hot)

print('Size of training set" ', df_train.shape)
print('Columns of training set: ', df_train.columns)
print('Size of test set: ', df_test.shape)
print('Columns of test set: ', df_test.columns)

print('Columns of training set after one-hot: ', one_hot_train.columns)
print('Columns of set set after one-hot: ', one_hot_test.columns)

Size of training set"  (891, 15)
Columns of training set:  Index(['Age', 'Cabin', 'Embarked', 'Fare', 'Name', 'Parch', 'Pclass', 'Sex',
       'SibSp', 'Survived', 'Ticket', 'Title', 'Family_size', 'Family_group',
       'Fare_per_person'],
      dtype='object')
Size of test set:  (418, 15)
Columns of test set:  Index(['Age', 'Cabin', 'Embarked', 'Fare', 'Name', 'Parch', 'Pclass', 'Sex',
       'SibSp', 'Survived', 'Ticket', 'Title', 'Family_size', 'Family_group',
       'Fare_per_person'],
      dtype='object')
Columns of training set after one-hot:  Index(['Sex_0', 'Sex_1', 'Pclass_1', 'Pclass_2', 'Pclass_3', 'Title_Countess',
       'Title_Dr/Military/Noble/Clergy', 'Title_Master', 'Title_Miss/Mrs/Ms',
       'Title_Mr', 'Cabin_A', 'Cabin_B', 'Cabin_C', 'Cabin_D', 'Cabin_E',
       'Cabin_F', 'Cabin_G', 'Cabin_N', 'Cabin_T', 'Embarked_C', 'Embarked_Q',
       'Embarked_S'],
      dtype='object')
Columns of set set after one-hot:  Index(['Sex_0', 'Sex_1', 'Pclass_1', 'Pclass_2', 'Pcl

In [20]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

survived = df_train['Survived']
#X_train.drop('Survived', axis = 1, inplace = True)

# Apply Wrap method - Forward selection
# Better performance for Fare instead of Fare_per_person on decision tree related algorithms
X_train = df_train[['Age', 'Fare', 'Family_size']]
X_test = df_test[['Age', 'Fare', 'Family_size']]
X_train = pd.concat([X_train, one_hot_train], axis = 1)
print('All X_train columns ', X_train.columns)
print(X_train.isnull().sum())
print(survived.shape)
print(X_train.shape)


print(survived)
print(X_train)

from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(degree=5, include_bias=False)
poly.fit(X_train)
X_poly = poly.transform(X_train)


All X_train columns  Index(['Age', 'Fare', 'Family_size', 'Sex_0', 'Sex_1', 'Pclass_1', 'Pclass_2',
       'Pclass_3', 'Title_Countess', 'Title_Dr/Military/Noble/Clergy',
       'Title_Master', 'Title_Miss/Mrs/Ms', 'Title_Mr', 'Cabin_A', 'Cabin_B',
       'Cabin_C', 'Cabin_D', 'Cabin_E', 'Cabin_F', 'Cabin_G', 'Cabin_N',
       'Cabin_T', 'Embarked_C', 'Embarked_Q', 'Embarked_S'],
      dtype='object')
Age                               0
Fare                              0
Family_size                       0
Sex_0                             0
Sex_1                             0
Pclass_1                          0
Pclass_2                          0
Pclass_3                          0
Title_Countess                    0
Title_Dr/Military/Noble/Clergy    0
Title_Master                      0
Title_Miss/Mrs/Ms                 0
Title_Mr                          0
Cabin_A                           0
Cabin_B                           0
Cabin_C                           0
Cabin_D            

In [21]:
from sklearn.linear_model import LogisticRegression

X_train_t, X_test_t, y_train_t, y_test_t = train_test_split(X_poly, survived, test_size = 0.3, random_state = 42)

clf = LogisticRegression(C = 0.05)
#LogReg = clf.fit(X_train, y_train)
LogReg = clf.fit(X_train_t, y_train_t)
#score = LogReg.score(X_train, y_train)
score = LogReg.score(X_train_t, y_train_t)
print('Logistic Regression score on training set ', score)

score = LogReg.score(X_test_t, y_test_t)
print('Logistic Regression score on CV ', score)



Logistic Regression score on training set  0.8218298555377207
Logistic Regression score on CV  0.7649253731343284


In [22]:
k_fold = KFold(n_splits=5, shuffle=True, random_state=0)
score = cross_val_score(clf, X_poly, survived, cv=k_fold)
print("Logistic Regression k-fold cross validation scores:\n", score)
print('Logistic Regression k-fold average score: ', round(np.mean(score)*100, 2))



Logistic Regression k-fold cross validation scores:
 [0.7877095  0.76404494 0.78089888 0.78651685 0.73595506]
Logistic Regression k-fold average score:  77.1




In [23]:
# Algorithms which do not need polynomial features
X_train_t, X_test_t, y_train_t, y_test_t = train_test_split(X_train, survived, test_size = 0.3, random_state = 42)

In [24]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(criterion='gini', 
                                n_estimators=800,
                                           max_depth=3,
                                           min_samples_split=6,
                                           min_samples_leaf=6,
                                           max_features= 'auto',
                                           oob_score=True,
                                           random_state=42,
                                           n_jobs=-1,
                                           verbose=1)
#clf = RandomForestClassifier(random_state=42)
rf = clf.fit(X_train_t, y_train_t)
score = rf.score(X_train_t, y_train_t)

print('Random Forest score on training set ', score)

score = rf.score(X_test_t, y_test_t)
print('Random Forest score on test set ', score)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done 800 out of 800 | elapsed:    0.4s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 784 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 800 out of 800 | elapsed:    0.0s finished


Random Forest score on training set  0.8234349919743178


[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 784 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 800 out of 800 | elapsed:    0.0s finished


Random Forest score on test set  0.8246268656716418


In [25]:
k_fold = KFold(n_splits=5, shuffle=True, random_state=0)
score = cross_val_score(clf, X_train, survived, cv=k_fold)
print("Random Forest k-fold cross validation scores:\n", score)
print('Random Forest k-fold average score: ', round(np.mean(score)*100, 2))

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   14.7s
[Parallel(n_jobs=-1)]: Done 800 out of 800 | elapsed:   14.9s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 784 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 800 out of 800 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done 800 out of 800 | elapsed:    0.1s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 434 tasks      | elap

Random Forest k-fold cross validation scores:
 [0.82681564 0.79213483 0.80337079 0.8258427  0.83707865]
Random Forest k-fold average score:  81.7


[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 784 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 800 out of 800 | elapsed:    0.0s finished


In [26]:
from sklearn.ensemble import GradientBoostingClassifier
clf = GradientBoostingClassifier(random_state = 42, max_depth = 1, learning_rate = 0.15, max_features = 14, min_samples_leaf = 12)

gb = clf.fit(X_train_t, y_train_t)
score = gb.score(X_train_t, y_train_t)

print('Gradient Boost score on training set ', score)#accuracy = round(np.mean(score)*100, 2)
#print('The accuracy of Gradient Boosting on training set is ', accuracy)

score = gb.score(X_test_t, y_test_t)
print('Gradient Boost score on CV ', score)
#accuracy = round(np.mean(score)*100, 2)
#print('The accuracy of Gradient Boosting on test set is ', accuracy)

Gradient Boost score on training set  0.8346709470304976
Gradient Boost score on CV  0.835820895522388


In [27]:
k_fold = KFold(n_splits=5, shuffle=True, random_state=0)
score = cross_val_score(clf, X_train, survived, cv=k_fold)
print("Gradient Boost cross validation scores:\n", score)
print('Gradient Boost k-fold average score: ', round(np.mean(score)*100, 2))

Gradient Boost cross validation scores:
 [0.81005587 0.81460674 0.83146067 0.8258427  0.84831461]
Gradient Boost k-fold average score:  82.61


In [28]:
from sklearn.neural_network import MLPClassifier
clf = MLPClassifier(solver='lbfgs', random_state = 42, hidden_layer_sizes=[2,1], alpha = 1, max_iter=1000)
nn = clf.fit(X_train_t, y_train_t)
score = nn.score(X_train_t, y_train_t)
print('NN MLP score on training set ', score)
score = nn.score(X_test_t, y_test_t)
print('NN MLP score on CV ', score)

NN MLP score on training set  0.8539325842696629
NN MLP score on CV  0.8022388059701493


In [29]:
k_fold = KFold(n_splits=5, shuffle=True, random_state=0)
score=cross_val_score(clf, X_train, survived, cv=k_fold)
print("NN MLP cross validation scores:\n", score)
print('NN MLP k-fold average score: ', round(np.mean(score)*100, 2))

NN MLP cross validation scores:
 [0.82122905 0.82022472 0.83146067 0.80898876 0.80898876]
NN MLP k-fold average score:  81.82


In [30]:



X_test = pd.concat([X_test, one_hot_test], axis = 1)

print(X_test.shape)
print(X_test.columns)
print(X_test.head())

pred = gb.predict(X_test)
np.savetxt('code submission.csv', pred)


(418, 25)
Index(['Age', 'Fare', 'Family_size', 'Sex_0', 'Sex_1', 'Pclass_1', 'Pclass_2',
       'Pclass_3', 'Title_Countess', 'Title_Dr/Military/Noble/Clergy',
       'Title_Master', 'Title_Miss/Mrs/Ms', 'Title_Mr', 'Cabin_A', 'Cabin_B',
       'Cabin_C', 'Cabin_D', 'Cabin_E', 'Cabin_F', 'Cabin_G', 'Cabin_N',
       'Cabin_T', 'Embarked_C', 'Embarked_Q', 'Embarked_S'],
      dtype='object')
      Age      Fare  Family_size  Sex_0  Sex_1  Pclass_1  Pclass_2  Pclass_3  \
891  34.5 -0.491975            1      1      0         0         0         1   
892  47.0 -0.508006            2      0      1         0         0         1   
893  62.0 -0.456047            1      1      0         0         1         0   
894  27.0 -0.475864            1      1      0         0         0         1   
895  22.0 -0.405780            3      0      1         0         0         1   

     Title_Countess  Title_Dr/Military/Noble/Clergy  ...  Cabin_C  Cabin_D  \
891               0                            