## **titanic kaggle solution**

### **import libraries**

In [268]:
import pandas as pd
import numpy as np

### **load data**

In [269]:
# read the Kaggle train/test CSVs from the current working directory
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

### **look at the data**

In [270]:
# preview the first 10 rows of the training frame
train.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [271]:
# (rows, columns) of the training frame
train.shape

(891, 12)

In [272]:
# (rows, columns) of the test frame
test.shape

(418, 11)

In [273]:
# count missing values per column in the training frame
train.isnull().sum()

Unnamed: 0,0
PassengerId,0
Survived,0
Pclass,0
Name,0
Sex,0
Age,177
SibSp,0
Parch,0
Ticket,0
Fare,0


In [274]:
# count missing values per column in the test frame
test.isnull().sum()

Unnamed: 0,0
PassengerId,0
Pclass,0
Name,0
Sex,0
Age,86
SibSp,0
Parch,0
Ticket,0
Fare,1
Cabin,327


### **feature engineering**

In [275]:
# extract the title from the name
# Both frames are kept in one list so later cells can apply the same
# transformation to train and test in a single loop.
train_test_data = [train, test]

# The pattern captures the run of letters that follows a space and precedes a
# period (e.g. "Mr" in "Braund, Mr. Owen Harris").
# It is written as a raw string: in a normal string literal `\.` is an invalid
# escape sequence, which newer Python versions warn about.
for dataset in train_test_data:
    dataset['Title'] = dataset['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)

In [276]:
# title mapping: Mr, Miss and Mrs each get their own code; every other listed
# title shares code 3. Titles absent from the mapping become NaN after .map().
other_titles = ("Master", "Dr", "Rev", "Col", "Major", "Mlle", "Countess",
                "Ms", "Lady", "Jonkheer", "Don", "Dona", "Mme", "Capt", "Sir")
title_mapping = {"Mr": 0, "Miss": 1, "Mrs": 2, **dict.fromkeys(other_titles, 3)}
for frame in train_test_data:
    frame['Title'] = frame['Title'].map(title_mapping)

In [277]:
# fill missing age with median age for each title
# Each frame is imputed with the per-title medians computed from that same frame.
# The filled column is assigned back rather than calling
# `frame["Age"].fillna(..., inplace=True)`: the in-place form mutates a Series
# obtained by column selection (chained assignment), which pandas 2.x warns
# about and which no longer updates the parent frame under Copy-on-Write.
train["Age"] = train["Age"].fillna(train.groupby("Title")["Age"].transform("median"))
test["Age"] = test["Age"].fillna(test.groupby("Title")["Age"].transform("median"))

In [278]:
# binning age into ordinal codes 0-4 using right-closed intervals:
# (-inf, 16] -> 0, (16, 26] -> 1, (26, 36] -> 2, (36, 62] -> 3, (62, inf) -> 4
age_edges = [-np.inf, 16, 26, 36, 62, np.inf]
for frame in train_test_data:
    # labels=False yields the integer bin index; cast back to float so the
    # column keeps the dtype it had before binning
    frame['Age'] = pd.cut(frame['Age'], bins=age_edges, labels=False).astype(float)

In [279]:
# sex mapping: encode male as 1 and female as 0 in both frames
sex_codes = {"male": 1, "female": 0}
for frame in (train, test):
    frame["Sex"] = frame["Sex"].map(sex_codes)

In [280]:
# fill missing embarked with S
# Applied to both frames: the embarked mapping that follows is run on train and
# test alike, so a missing port left unfilled in test would come out of the
# mapping as NaN and flow into the model features.
for dataset in train_test_data:
    dataset['Embarked'] = dataset['Embarked'].fillna('S')

In [281]:
# embarked mapping: replace the port letter with an integer code in both frames
embarked_mapping = dict(S=0, C=1, Q=2)
for frame in train_test_data:
    frame['Embarked'] = frame['Embarked'].map(embarked_mapping)

In [282]:
# fill missing fare with median fare for each class
# Each frame is imputed with the per-Pclass medians computed from that same frame.
# The result is assigned back instead of using `fillna(..., inplace=True)` on the
# selected column: that chained in-place call triggers a pandas 2.x warning and
# does not modify the frame once Copy-on-Write is enabled.
train["Fare"] = train["Fare"].fillna(train.groupby("Pclass")["Fare"].transform("median"))
test["Fare"] = test["Fare"].fillna(test.groupby("Pclass")["Fare"].transform("median"))

In [283]:
# apply a log to reduce skewness
def _log_or_zero(fare):
    """Return the natural log of a positive fare, and 0 for anything else."""
    return np.log(fare) if fare > 0 else 0


for frame in (train, test):
    frame['Fare_log'] = frame['Fare'].map(_log_or_zero)

In [284]:
# binning fare: replace Fare with the labelled interval its log-fare falls into
bins = (-1, 2, 2.68, 3.44, 10)
group_names = [1, 2, 3, 4]
for frame in (train, test):
    frame['Fare'] = pd.cut(frame['Fare_log'], bins, labels=group_names)

In [285]:
# create a family size column: siblings/spouses + parents/children + the passenger
for frame in (train, test):
    frame["FamilySize"] = 1 + frame["SibSp"] + frame["Parch"]

In [286]:
# adding more features
for frame in train_test_data:
    # number of characters in the full name string
    frame['NameLength'] = frame['Name'].map(len)
    # interaction between the (already binned) age and the passenger class
    frame['Age_Pclass'] = frame['Pclass'] * frame['Age']
    # 1 when the passenger travels without family, else 0; must run before
    # FamilySize is remapped to fractional values
    frame['IsAlone'] = frame['FamilySize'].eq(1).astype('int64')

In [287]:
# family mapping: rescale family sizes 1..11 onto 0..4 in steps of 0.4
family_mapping = dict(zip(
    range(1, 12),
    (0, 0.4, 0.8, 1.2, 1.6, 2, 2.4, 2.8, 3.2, 3.6, 4),
))
for frame in train_test_data:
    frame['FamilySize'] = frame['FamilySize'].map(family_mapping)

In [288]:
# scaling features
from sklearn.preprocessing import StandardScaler

scaled_columns = ['Age', 'Fare']
# fit on the training frame only, then apply the same scaling to both frames
scaler = StandardScaler().fit(train[scaled_columns])
train[scaled_columns] = scaler.transform(train[scaled_columns])
test[scaled_columns] = scaler.transform(test[scaled_columns])

In [289]:
# delete unnecessary features
unused_columns = ['Name', 'Ticket', 'Cabin', 'Fare_log', 'SibSp', 'Parch']
for frame in (train, test):
    frame.drop(columns=unused_columns, inplace=True)
# PassengerId is removed from train only; test keeps it because the submission
# file is built from test["PassengerId"]
train = train.drop(columns=['PassengerId'])

In [290]:
# check the engineered training frame
train.head(5)

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Embarked,Title,FamilySize,NameLength,Age_Pclass,IsAlone
0,0,3,1,-0.734272,-1.714945,0,0,0.4,23,3.0,0
1,1,1,0,1.323075,1.418376,1,2,0.4,51,3.0,0
2,1,3,0,-0.734272,-0.670505,0,1,0.0,22,3.0,1
3,1,1,0,0.294402,1.418376,0,2,0.4,44,2.0,0
4,0,3,1,0.294402,-0.670505,0,0,0.0,24,6.0,1


In [291]:
# check the engineered test frame (still carries PassengerId)
test.head(5)

Unnamed: 0,PassengerId,Pclass,Sex,Age,Fare,Embarked,Title,FamilySize,NameLength,Age_Pclass,IsAlone
0,892,3,1,0.294402,-0.670505,2,0,0.0,16,6.0,1
1,893,3,0,1.323075,-1.714945,0,2,0.4,32,9.0,0
2,894,2,1,1.323075,-0.670505,2,0,0.0,25,6.0,1
3,895,3,1,0.294402,-0.670505,0,0,0.0,16,6.0,1
4,896,3,0,-0.734272,-0.670505,0,2,0.8,44,3.0,0


In [292]:
# separate the label from the training features; the test features are a copy
# of test without the identifier column
target = train['Survived']
train_data = train.drop(columns='Survived')
test_data = test.drop(columns="PassengerId").copy()

### **modelling**

In [293]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [294]:
# define the parameter grid (3 * 4 * 3 * 3 * 3 = 324 combinations)
param_grid = dict(
    n_estimators=[13, 22, 24],
    max_depth=[None, 1, 2, 4],
    min_samples_split=[2, 3, 4],
    min_samples_leaf=[2, 3, 1],
    max_features=['sqrt', 'log2', None],
)

# instantiate a Random Forest classifier (fixed seed) and a 5-fold grid search
# over the grid above, using all available cores
rf = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(rf, param_grid, cv=5, n_jobs=-1, verbose=2)

In [295]:
# fit to the training data
# runs the cross-validated search over every combination in param_grid
grid_search.fit(train_data, target)

Fitting 5 folds for each of 324 candidates, totalling 1620 fits


In [296]:
# get the best score found by the grid search
best_score = grid_search.best_score_
print('Best score: {}'.format(best_score))

Best score: 0.8417362375243236


In [297]:
# get the parameter combination that achieved the best score
best_params = grid_search.best_params_
print('Best parameters: {}'.format(best_params))

Best parameters: {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 24}


In [298]:
# get the best estimator
# keep the estimator selected by the grid search for the prediction step
best_model = grid_search.best_estimator_

In [299]:
# predict
# test_data holds the same feature columns as train_data (test without PassengerId)
prediction = best_model.predict(test_data)

In [300]:
# create the csv: one row per test passenger with its predicted label
submission = test[["PassengerId"]].assign(Survived=prediction)
submission.to_csv('submission.csv', index=False)

In [301]:
# read the file just written back in and preview its first rows
submission = pd.read_csv('submission.csv')
submission.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1


### **references:**
- https://github.com/minsuk-heo/kaggle-titanic/blob/master/titanic-solution.ipynb
- https://www.kaggle.com/competitions/titanic/overview