## Kaggle competition
### Titanic: Machine Learning From Desaster
Jens Hahn

### 2. Model

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import PowerTransformer, KBinsDiscretizer, OneHotEncoder, FunctionTransformer
from sklearn.impute import SimpleImputer

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression

Load data

In [5]:
df = pd.read_csv('../data/raw/train.csv', index_col=0)

In [6]:
X = df.drop('Survived', axis=1)
y = df['Survived']

Set `FunctionTransformer`

In [7]:
def cabin_code(frame):
    """get one letter code for cabin, add 'U' for unknown"""
    column = frame.iloc[:,0]
    column.fillna('U', inplace=True)
    return column.str[0].to_frame()

In [8]:
def family(frame):
    """add a column with sum of family members"""
    frame['family'] = frame.sum(axis=1)
    frame['alone'] = (frame['family'] > 1).astype(int)
    return frame

In [9]:
def title_len(frame):
    """add column with length of name"""
    column = frame.iloc[:,0]
    return column.str.len().to_frame()

In [10]:
def titles(frame):
    """extract titles from names"""
    output = frame.copy()
    col_name = output.columns[0]
    for i in output.index:
        name = str(output.loc[i, col_name])
        name = name.replace(',', '')
        name = name.replace('(', '')
        name = name.replace(')', '')
        name = name.replace('"', '')
        name = name.split(' ')
        if 'Mr.' in name or 'Mr ' in name:
            output.loc[i] = 'Mr'
        elif 'Miss' in name:
            output.loc[i] = 'Miss'
        elif 'Mrs.' in name or 'Mrs ' in name:
            output.loc[i] = 'Mrs'
        elif 'Master' in name:
            output.loc[i] = 'Master'
        elif 'Dr.' in name:
            output.loc[i] = 'Dr'
        elif 'Jr' in name or 'Jr.' in name:
            output.loc[i] = 'Jr'
        else:
            output.loc[i] = 'other'
    return output

In [11]:
def add_bias(frame):
    """add bias for box-cox transformation, all > 0"""
    frame.fillna(0, inplace=True)
    return frame + 0.001

Split data

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)

Define pipelines

In [13]:
age_pipe = Pipeline([
    ('age_imp', SimpleImputer(strategy='mean')),
    ('age_bin', KBinsDiscretizer(encode='ordinal', strategy='quantile', n_bins=3)),
    ('age_ohe', OneHotEncoder(handle_unknown='ignore', drop='first'))
])

title_pipe = Pipeline([
    ('title_get', FunctionTransformer(titles)),
    ('titles_ohe', OneHotEncoder(handle_unknown='ignore', drop='first'))
])

family_pipe = Pipeline([
    ('fam_get', FunctionTransformer(family)),
    ('fam_ohe', OneHotEncoder(handle_unknown='ignore', drop='first'))
])

cabin_pipe = Pipeline([
    ('cab_letter', FunctionTransformer(cabin_code)),
    ('cab_ohe', OneHotEncoder(handle_unknown='ignore', drop='first'))
])

embarked_pipe = Pipeline([
    ('emb_imp', SimpleImputer(strategy='most_frequent')),
    ('emb_ohe', OneHotEncoder(handle_unknown='ignore', drop='first'))
])

fare_pipe = Pipeline([
    ('fare_add', FunctionTransformer(add_bias)),
    ('fare_trans', PowerTransformer(method='box-cox')),
    ('fare_bin', KBinsDiscretizer(encode='ordinal')),
    ('fare_ohe', OneHotEncoder(handle_unknown='ignore', drop='first'))
])

In [14]:
ct = ColumnTransformer([
    ('cabin', cabin_pipe, ['Cabin']),
    ('family', family_pipe, ['SibSp', 'Parch']),
    ('name_len', FunctionTransformer(title_len), ['Name']),
    ('title', title_pipe, ['Name']),
    ('fare', fare_pipe, ['Fare']),
    ('age', age_pipe, ['Age']),
    ('class', OneHotEncoder(), ['Pclass']),
    ('sex', OneHotEncoder(), ['Sex']),
    ('embark', embarked_pipe, ['Embarked'])
], remainder='drop')

In [15]:
model_pipe = Pipeline([
    ('ct', ct),
    ('model', RandomForestClassifier(max_depth=6))
    # ('model', GradientBoostingClassifier())
    #('model', LogisticRegression())
])

User `GridSearchCV` to find best hyperparameter setting

In [16]:
model_pipe.get_params();

In [21]:
param_grid = {'model__max_depth': [3,4,5,6,7], 
              'ct__age__age_bin__strategy': ['quantile', 'uniform'],
              'ct__age__age_bin__n_bins': [3,4,5,6,7]
             }

In [22]:
gscv = GridSearchCV(model_pipe, param_grid=param_grid)

In [23]:
gscv.fit(X_train, y_train)



In [25]:
gscv.best_params_

{'ct__age__age_bin__n_bins': 7,
 'ct__age__age_bin__strategy': 'quantile',
 'model__max_depth': 6}

In [26]:
gscv.best_estimator_.score(X_test, y_test)

0.820627802690583

## Kaggle prediction

In [28]:
df_kaggle = pd.read_csv('../data/raw/test.csv', index_col=0)

In [29]:
X_kaggle = df_kaggle.copy()

In [30]:
y_kaggle = gscv.best_estimator_.predict(X_kaggle)



In [31]:
sol = pd.DataFrame(y_kaggle, columns=['Survived'], index=df_kaggle.index)

In [32]:
sol.head()

Unnamed: 0_level_0,Survived
PassengerId,Unnamed: 1_level_1
892,0
893,1
894,0
895,0
896,1


In [34]:
sol.to_csv('../data/prediction/solution.csv')