In [237]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [292]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
titanic = pd.read_csv('Data/train.csv', index_col='PassengerId')
y = titanic['Survived']
X = titanic.drop('Survived', axis=1).copy()

# The individual cabins probably won't tell us much, but the letter block might
# We'll use categorical dummies here so the missing values shouldn't be terribly important
cabin_ohe = OneHotEncoder(handle_unknown='ignore', sparse=False)
X['Cabin block'] = X['Cabin'].str[0].fillna('unknown')
cabin_dummies = pd.DataFrame(cabin_ohe.fit_transform(X[['Cabin block']]),
                             columns=['block_' + c for c in cabin_ohe.categories_[0]],
                             index=X.index)
X = pd.concat([X, cabin_dummies], axis=1)
X = X.drop(['Cabin', 'Cabin block'], axis=1)


# There are only two missing embarcation locations, so it should be fine with dummies
embark_ohe = OneHotEncoder(handle_unknown='ignore', sparse=False)
embark_dummies = pd.DataFrame(embark_ohe.fit_transform(X[['Embarked']].fillna('S')),
                              columns=['embark_' + c for c in embark_ohe.categories_[0]],
                              index=X.index)
X = pd.concat([X, embark_dummies], axis=1)
X = X.drop('Embarked', axis=1)


# The tickets always have number components except for 4 "LINE" tickets - let's try filling those with 0
X['Ticket #'] = X['Ticket'].str.split(' ').apply(lambda x: [t for t in x if t.isdigit()]).str[0].fillna(0)
X['Ticket #'] = pd.to_numeric(X['Ticket #'])
X = X.drop('Ticket', axis=1)

# Some feature engineering to get the number of like-surnames in the ship - who knows
X['Surname'] = X['Name'].str.split(',').str[0]
surname_counts = X['Surname'].value_counts()
X['Surname count'] = X['Surname'].apply(lambda x: surname_counts.loc[x])
X = X.drop(['Name', 'Surname'], axis=1)

# Sex should be a simple matter of categorical encoding
sex_le = LabelEncoder()
X['Sex'] = sex_le.fit_transform(X['Sex'])

# Age is tricky - there are a lot of missing values and this could prove critical.
# Let's try regressing against it from the other variables
# First, let's round it since the exact value probably doesn't matter much and the .5s are rounded or infants
# But let's keep the information for approximate age since people whose exact age we don't know probably died.
# (Side note: including this information is probably leaking target data)
X['Approx age'] = X['Age'] % 1 == 0.5
X['Age'] = X['Age'].round()

In [278]:
known_age = X[X['Age'].notnull()]
age_X = known_age.drop('Age', axis=1)
age_y = known_age['Age']

In [279]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

rfr = RandomForestRegressor()
rf_params = [{'max_features': [0.2, 0.4, 0.5, 0.6, 0.8],
             'n_estimators': [50, 100, 200, 300]}]
rf_grid = GridSearchCV(rfr, rf_params, cv=5, scoring='neg_mean_squared_error', verbose=1, n_jobs=-1)

ela = ElasticNet()
ela_params = [{'alpha': [0.001, 0.01, 0.1, 1, 100, 1000]},
             {'l1_ratio': [0, 0.2, 0.4, 0.6, 0.8, 1.]}]
ela_grid = GridSearchCV(ela, ela_params, cv=5, scoring='neg_mean_squared_error', verbose=1, n_jobs=-1)

In [280]:
%%time
rf_grid.fit(age_X, age_y)
print(rf_grid.best_score_)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   12.2s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   23.1s finished


-165.48807639669027
Wall time: 23.9 s


In [281]:
%%time
ela_grid.fit(age_X, age_y)
print(ela_grid.best_score_)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


-166.13317360179266
Wall time: 440 ms


[Parallel(n_jobs=-1)]: Done  53 out of  60 | elapsed:    0.3s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    0.3s finished


In [282]:
# The RF performed better, so let's use that to make our predictions on age
age_estimator = rf_grid.best_estimator_
age_predictions = age_estimator.predict(X[X['Age'].isnull()].drop('Age', axis=1))
X.loc[X['Age'].isnull(), 'Age'] = age_predictions.round()

In [283]:
%%time
# Now to train the actual model
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier()
rfc_params = [{'max_features': [0.2, 0.4, 0.5, 0.6, 0.8],
             'n_estimators': [50, 100, 200, 300]}]
rfc_grid = GridSearchCV(rfc, rfc_params, verbose=1, cv=5, n_jobs=-1)

rfc_grid.fit(X, y)
print(rfc_grid.best_score_)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    6.9s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   19.0s finished


0.8451321323206328
Wall time: 19.8 s


In [284]:
%%time
# Now to train the actual model
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr_params = [{'penalty': ['l1', 'l2', 'elasticnet', 'none'],
               'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}]
lr_grid = GridSearchCV(lr, lr_params, verbose=1, cv=5, n_jobs=-1)

# Standardizing our numeric features should improve our results in a linear model like Logistic Regression
from sklearn.preprocessing import StandardScaler
std = StandardScaler()
X_std = std.fit_transform(X)

lr_grid.fit(X_std, y)
print(lr_grid.best_score_)

Fitting 5 folds for each of 28 candidates, totalling 140 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


0.8024606113866047
Wall time: 780 ms


[Parallel(n_jobs=-1)]: Done 123 tasks      | elapsed:    0.6s
[Parallel(n_jobs=-1)]: Done 140 out of 140 | elapsed:    0.7s finished


In [286]:
import xgboost as xgb

xgbc = xgb.XGBClassifier()
xgbc_params = [{'n_estimators': [50, 100, 200, 300],
                'gamma': [0, 0.2, 0.4, 0.6, 0.8, 0.9, 1]}]
xgb_grid = GridSearchCV(xgbc, xgbc_params, cv=5, verbose=1, n_jobs=-1)
xgb_grid.fit(X, y)

print(xgb_grid.best_score_)

Fitting 5 folds for each of 28 candidates, totalling 140 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    5.0s


0.8473793233318687


[Parallel(n_jobs=-1)]: Done 140 out of 140 | elapsed:   12.1s finished


In [298]:
test = pd.read_csv('Data/test.csv', index_col='PassengerId')
X = test.copy()

X['Cabin block'] = X['Cabin'].str[0].fillna('unknown')
cabin_dummies = pd.DataFrame(cabin_ohe.transform(X[['Cabin block']]),
                             columns=['block_' + c for c in cabin_ohe.categories_[0]],
                             index=X.index)
X = pd.concat([X, cabin_dummies], axis=1)
X = X.drop(['Cabin', 'Cabin block'], axis=1)


# There are only two missing embarcation locations, so it should be fine with dummies
embark_dummies = pd.DataFrame(embark_ohe.transform(X[['Embarked']].fillna('S')),
                              columns=['embark_' + c for c in embark_ohe.categories_[0]],
                              index=X.index)
X = pd.concat([X, embark_dummies], axis=1)
X = X.drop('Embarked', axis=1)


# The tickets always have number components except for 4 "LINE" tickets - let's try filling those with 0
X['Ticket #'] = X['Ticket'].str.split(' ').apply(lambda x: [t for t in x if t.isdigit()]).str[0].fillna(0)
X['Ticket #'] = pd.to_numeric(X['Ticket #'])
X = X.drop('Ticket', axis=1)

# Some feature engineering to get the number of like-surnames in the ship - who knows
X['Surname'] = X['Name'].str.split(',').str[0]
X['Surname count'] = X['Surname'].apply(lambda x: surname_counts.get(x)).fillna(1)
X = X.drop(['Name', 'Surname'], axis=1)

# Sex should be a simple matter of categorical encoding
X['Sex'] = sex_le.transform(X['Sex'])

# Age is tricky - there are a lot of missing values and this could prove critical.
# Let's try regressing against it from the other variables
# First, let's round it since the exact value probably doesn't matter much and the .5s are rounded or infants
# But let's keep the information for approximate age since people whose exact age we don't know probably died.
# (Side note: including this information is probably leaking target data)
X['Approx age'] = X['Age'] % 1 == 0.5
X['Age'] = X['Age'].round()
age_predictions = age_estimator.predict(X[X['Age'].isnull()].drop('Age', axis=1))
X.loc[X['Age'].isnull(), 'Age'] = age_predictions.round()

In [306]:
test['Survived'] = xgb_grid.predict(X)
test['Survived'].to_csv('submission.csv')