## Kaggle competition
### Titanic: Machine Learning From Desaster
Jens Hahn

### 2. Model

In [84]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import PowerTransformer, KBinsDiscretizer, OneHotEncoder, FunctionTransformer
from sklearn.impute import SimpleImputer

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

Load data

In [166]:
df = pd.read_csv('data/train.csv', index_col=0)

In [167]:
X = df.drop('Survived', axis=1)
y = df['Survived']

Set `FunctionTransformer`

In [168]:
def cabin_code(frame):
    """get one letter code for cabin, add 'U' for unknown"""
    column = frame.iloc[:,0]
    column.fillna('U', inplace=True)
    return column.str[0].to_frame()

In [183]:
def family(frame):
    """add a column with sum of family members"""
    frame['family'] = frame.sum(axis=1)
    return frame

In [185]:
def title_len(frame):
    """add column with length of name"""
    column = frame.iloc[:,0]
    return column.str.len().to_frame()

In [186]:
def titles(frame):
    """extract titles from names"""
    output = frame.copy()
    col_name = output.columns[0]
    for i in output.index:
        name = str(output.loc[i, col_name])
        name = name.replace(',', '')
        name = name.replace('(', '')
        name = name.replace(')', '')
        name = name.replace('"', '')
        name = name.split(' ')
        if 'Mr.' in name or 'Mr ' in name:
            output.loc[i] = 'Mr'
        elif 'Miss' in name:
            output.loc[i] = 'Miss'
        elif 'Mrs.' in name or 'Mrs ' in name:
            output.loc[i] = 'Mrs'
        elif 'Master' in name:
            output.loc[i] = 'Master'
        elif 'Dr.' in name:
            output.loc[i] = 'Dr'
        elif 'Jr' in name or 'Jr.' in name:
            output.loc[i] = 'Jr'
        else:
            output.loc[i] = 'other'
    return output

In [201]:
def add_bias(frame):
    """add bias for box-cox transformation, all > 0"""
    frame.fillna(0, inplace=True)
    return frame + 0.001

Split data

In [202]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)

Define pipelines

In [225]:
age_pipe = Pipeline([
    ('age_imp', SimpleImputer(strategy='mean')),
    ('age_bin', KBinsDiscretizer())
])

title_pipe = Pipeline([
    ('title_get', FunctionTransformer(titles)),
    ('titles_ohe', OneHotEncoder(handle_unknown='ignore'))
])

family_pipe = Pipeline([
    ('fam_get', FunctionTransformer(family)),
    ('fam_ohe', OneHotEncoder(handle_unknown='ignore'))
])

cabin_pipe = Pipeline([
    ('cab_letter', FunctionTransformer(cabin_code)),
    ('cab_ohe', OneHotEncoder(handle_unknown='ignore'))
])

embarked_pipe = Pipeline([
    ('emb_imp', SimpleImputer(strategy='most_frequent')),
    ('emb_ohe', OneHotEncoder(handle_unknown='ignore'))
])

fare_pipe = Pipeline([
    ('fare_add', FunctionTransformer(add_bias)),
    ('fare_trans', PowerTransformer(method='box-cox')),
    ('fare_bin', KBinsDiscretizer())
])

In [226]:
ct = ColumnTransformer([
    ('cabin', cabin_pipe, ['Cabin']),
    ('family', family_pipe, ['SibSp', 'Parch']),
    ('name_len', FunctionTransformer(title_len), ['Name']),
    ('title', title_pipe, ['Name']),
    ('fare', fare_pipe, ['Fare']),
    ('age', age_pipe, ['Age']),
    ('class', OneHotEncoder(), ['Pclass']),
    ('sex', OneHotEncoder(), ['Sex']),
    ('embark', embarked_pipe, ['Embarked'])
], remainder='drop')

In [239]:
model_pipe = Pipeline([
    ('ct', ct),
    # ('model', RandomForestClassifier(max_depth=6))
    ('model', GradientBoostingClassifier())
])

In [240]:
model_pipe.fit(X_train, y_train)

In [241]:
model_pipe.score(X_train, y_train)

0.9041916167664671

In [242]:
model_pipe.score(X_test, y_test)

0.8116591928251121

In [243]:
scores = cross_val_score(model_pipe, X_train, y_train, cv=5)

In [244]:
scores.mean()

0.8308158455841095

## Kaggle prediction

In [245]:
df_kaggle = pd.read_csv('data/test.csv', index_col=0)

In [246]:
X_kaggle = df_kaggle.copy()

In [247]:
y_kaggle = model_pipe.predict(X_kaggle)

In [248]:
sol = pd.DataFrame(y_kaggle, columns=['Survived'], index=df_kaggle.index)

In [249]:
sol.head()

Unnamed: 0_level_0,Survived
PassengerId,Unnamed: 1_level_1
892,0
893,1
894,0
895,0
896,0


In [250]:
sol.to_csv('solution.csv')