## Kaggle competition
### Titanic: Machine Learning from Disaster
Jens Hahn

### 2. Model

In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import PowerTransformer, KBinsDiscretizer, OneHotEncoder, FunctionTransformer
from sklearn.impute import SimpleImputer

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression

Load data

In [None]:
# Read the training set; the first CSV column becomes the row index.
train_csv = 'data/train.csv'
df = pd.read_csv(train_csv, index_col=0)

In [None]:
# Separate the target column from the feature columns.
y = df['Survived']
X = df.drop(columns='Survived')

Define helper functions for `FunctionTransformer`

In [None]:
def cabin_code(frame):
    """Return the one-letter code of each cabin, using 'U' for unknown.

    Parameters
    ----------
    frame : pandas.DataFrame
        Only the first column is read; it is expected to hold strings
        (or missing values).

    Returns
    -------
    pandas.DataFrame
        Single-column frame holding the first character of every value;
        missing values become 'U'. The column keeps the input column's name.
    """
    # Use the non-inplace fillna so the caller's frame is never modified.
    filled = frame.iloc[:, 0].fillna('U')
    return filled.str[0].to_frame()

In [None]:
def family(frame):
    """Add family-size features to a frame of relative counts.

    Parameters
    ----------
    frame : pandas.DataFrame
        Numeric columns counting relatives on board (the pipeline passes
        'SibSp' and 'Parch').

    Returns
    -------
    pandas.DataFrame
        Copy of ``frame`` with two extra columns:
        'family' - row-wise sum of the input columns,
        'alone'  - 1 if that sum is 0 (no relatives on board), else 0.
    """
    # Work on a copy so the caller's frame does not gain columns.
    result = frame.copy()
    result['family'] = frame.sum(axis=1)
    # A passenger is alone exactly when no relatives are counted.
    result['alone'] = (result['family'] == 0).astype(int)
    return result

In [None]:
def title_len(frame):
    """Return the character length of each entry in the first column.

    The result is a single-column DataFrame that keeps the name of the
    input column.
    """
    lengths = frame.iloc[:, 0].str.len()
    return lengths.to_frame()

In [None]:
def titles(frame):
    """Extract the title from each name and merge rare variants.

    The title is the first run of word characters followed by a dot found
    in the first column of ``frame``. Selected titles are then mapped onto
    a common label; every other value is returned unchanged.
    """
    title_groups = {
        # spelling variants of common titles
        'Mlle.': 'Miss.', 'Ms.': 'Miss.', 'Mme.': 'Mrs.',
        # military ranks
        'Col.': 'Officer', 'Capt.': 'Officer', 'Major.': 'Officer',
        # titles grouped under 'Nobles'
        'Jonkheer.': 'Nobles', 'Master.': 'Nobles', 'Don.': 'Nobles',
        'Sir.': 'Nobles', 'Lady.': 'Nobles', 'Countess.': 'Nobles',
    }
    extracted = frame.iloc[:, 0].str.extract(r'(\w+\.)')
    return extracted.replace(title_groups)

In [None]:
def add_bias(frame):
    """Shift values so they are all > 0, as the Box-Cox transform requires.

    Missing values are replaced by 0 and 0.001 is then added to every entry.
    A new frame is returned; the input frame is left unmodified.
    """
    # Use the non-inplace fillna to avoid writing into the caller's data.
    return frame.fillna(0) + 0.001

Split data

In [None]:
# Hold out a stratified test split. The fixed seed makes the split, and the
# score later computed on it, reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=42
)

Define pipelines

In [None]:
def _one_hot():
    """Create a fresh one-hot encoder with the settings every pipeline uses."""
    return OneHotEncoder(handle_unknown='ignore', drop='first')


# Age: mean-impute, bin, then one-hot encode the bin index.
age_pipe = Pipeline(steps=[
    ('age_imp', SimpleImputer(strategy='mean')),
    ('age_bin', KBinsDiscretizer(n_bins=3, encode='ordinal', strategy='quantile')),
    ('age_ohe', _one_hot()),
])

# Title: extract the title from the name, then one-hot encode it.
title_pipe = Pipeline(steps=[
    ('title_get', FunctionTransformer(titles)),
    ('titles_ohe', _one_hot()),
])

# Family: add the derived family columns, then one-hot encode all columns.
family_pipe = Pipeline(steps=[
    ('fam_get', FunctionTransformer(family)),
    ('fam_ohe', _one_hot()),
])

# Cabin: reduce to the one-letter cabin code, then one-hot encode it.
cabin_pipe = Pipeline(steps=[
    ('cab_letter', FunctionTransformer(cabin_code)),
    ('cab_ohe', _one_hot()),
])

# Embarked: fill gaps with the most frequent value, then one-hot encode.
embarked_pipe = Pipeline(steps=[
    ('emb_imp', SimpleImputer(strategy='most_frequent')),
    ('emb_ohe', _one_hot()),
])

# Fare: make strictly positive, Box-Cox transform, bin, then one-hot encode.
fare_pipe = Pipeline(steps=[
    ('fare_add', FunctionTransformer(add_bias)),
    ('fare_trans', PowerTransformer(method='box-cox')),
    ('fare_bin', KBinsDiscretizer(n_bins=8, encode='ordinal', strategy='uniform')),
    ('fare_ohe', _one_hot()),
])

In [None]:
# (name, transformer, input columns) for every engineered feature group.
# The order of the entries fixes the order of the output columns.
feature_steps = [
    ('cabin', cabin_pipe, ['Cabin']),
    ('family', family_pipe, ['SibSp', 'Parch']),
    ('name_len', FunctionTransformer(title_len), ['Name']),
    ('title', title_pipe, ['Name']),
    ('fare', fare_pipe, ['Fare']),
    ('age', age_pipe, ['Age']),
    ('class', OneHotEncoder(), ['Pclass']),
    ('sex', OneHotEncoder(), ['Sex']),
    ('embark', embarked_pipe, ['Embarked']),
]

# Columns not listed above are dropped.
ct = ColumnTransformer(transformers=feature_steps, remainder='drop')

In [None]:
# Full model: column-wise feature engineering followed by the classifier.
# random_state pins the forest's randomness so repeated runs, and the
# comparison between grid-search candidates, give the same result.
model_pipe = Pipeline([
    ('ct', ct),
    ('model', RandomForestClassifier(max_depth=6, random_state=42))
    # Alternative estimators to swap in:
    #('model', GradientBoostingClassifier())
    #('model', LogisticRegression())
])

Use `GridSearchCV` to find the best hyperparameter setting

In [None]:
# Look up the pipeline's parameter names (the keys usable in a search grid).
# The trailing semicolon suppresses the notebook output of this cell.
model_pipe.get_params();

In [None]:
# Search space for the grid search. Keys use the nested
# `<step>__<sub-step>__<parameter>` names of the model pipeline.
param_grid = {
    'model__max_depth': list(range(3, 8)),
    'ct__age__age_bin__strategy': ['quantile', 'uniform'],
    'ct__age__age_bin__n_bins': list(range(3, 8)),
    'ct__fare__fare_bin__n_bins': list(range(3, 9)),
    'ct__fare__fare_bin__strategy': ['quantile', 'uniform'],
}

In [None]:
# Exhaustive search over param_grid, with all other settings left at default.
gscv = GridSearchCV(estimator=model_pipe, param_grid=param_grid)

In [None]:
# Run the grid search on the training split only; the held-out split stays
# untouched for the final evaluation.
gscv.fit(X_train, y_train)

In [None]:
# Show the parameter combination selected by the grid search.
gscv.best_params_

In [None]:
# Evaluate the best pipeline found by the search on the held-out split.
gscv.best_estimator_.score(X_test, y_test)

## Kaggle prediction

In [None]:
# Read the Kaggle test set; the first CSV column becomes the row index.
kaggle_csv = 'data/test.csv'
df_kaggle = pd.read_csv(kaggle_csv, index_col=0)

In [None]:
# Predict on an independent (deep) copy so the loaded frame stays untouched.
X_kaggle = df_kaggle.copy(deep=True)

In [None]:
# Predict the Kaggle rows with the best pipeline from the grid search.
best_model = gscv.best_estimator_
y_kaggle = best_model.predict(X_kaggle)

In [None]:
# Submission table: one 'Survived' column, indexed like the Kaggle test set.
sol = pd.DataFrame({'Survived': y_kaggle}, index=df_kaggle.index)

In [None]:
# Preview the first rows of the submission table.
sol.head()

In [None]:
# Write the submission file; the index is written as the first column.
sol.to_csv('solution.csv')