In [310]:
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn import linear_model
from sklearn import ensemble
from sklearn.feature_extraction.text import CountVectorizer
from scipy.sparse import hstack, csr_matrix

from keras import layers, models

In [311]:
df_train = pd.read_csv('data/train.csv')
df_test = pd.read_csv('data/test.csv')

## Exploratory Data Analysis

In [312]:
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [313]:
pd.concat([pd.DataFrame(df_train.nunique()),
           pd.DataFrame(df_train.isnull().sum()),
           pd.DataFrame(df_train.mean())],
          axis=1,
          ignore_index=True)

Unnamed: 0,0,1,2
Age,88,177,29.699118
Cabin,147,687,
Embarked,3,2,
Fare,248,0,32.204208
Name,891,0,
Parch,7,0,0.381594
PassengerId,891,0,446.0
Pclass,3,0,2.308642
Sex,2,0,
SibSp,7,0,0.523008


In [314]:
def make_score(clf, X, y):
    return np.mean(cross_val_score(clf, X, y, scoring='accuracy'))

## Models

In [318]:
def parse_0(df):
    x = df[['Fare', 'SibSp', 'Parch']].values
    y = df['Survived'].values
    
    return x, y

X, y = parse_0(df_train.copy())
clf = linear_model.LogisticRegression()
make_score(clf, X, y)

0.6734006734006734

In [319]:
def parse_1(df):
    df = pd.concat([df_train,
                    pd.get_dummies(df['Sex'])],
                   axis=1)
    
    x = df[['Fare', 'SibSp', 'Parch', 'male', 'female']].values
    y = df['Survived'].values
    
    return x, y

X, y = parse_1(df_train.copy())
clf = linear_model.LogisticRegression()
make_score(clf, X, y)

0.7890011223344556

In [320]:
def parse_2(df):
    df = pd.concat([df,
                    pd.get_dummies(df['Sex']),
                    pd.get_dummies(df['Embarked'])],
                   axis=1)
    
    df['Age'] = df['Age'].fillna(df['Age'].mean())
    df['is_child'] = df['Age'].apply(lambda r: 1 if r < 8 else 0)
    
    x = df[['Fare', 'SibSp', 'Parch', 'Age',
            'male', 'female', 'is_child']].values
    y = df['Survived'].values
    
    return x, y

X, y = parse_2(df_train.copy())
clf = linear_model.LogisticRegression()
make_score(clf, X, y)

0.8013468013468014

In [321]:
def parse_3(df):
    df['Bridge'] = df['Cabin'].apply(lambda r: r[0] if not pd.isnull(r) else -1)
    
    df = pd.concat([df,
                    pd.get_dummies(df['Sex']),
                    pd.get_dummies(df['Embarked'], prefix='embarked'),
                    pd.get_dummies(df['Bridge'], prefix='bridge'),
                    pd.get_dummies(df['Name'].str.extract('[A-Za-z]*,\s([A-Z][a-z]*).', expand=False).str.strip())],
                   axis=1)
    
    df['Age'] = df['Age'].fillna(df['Age'].mean())
    df['is_child'] = df['Age'].apply(lambda r: 1 if r < 8 else 0)
    df['family_size'] = df['Parch'] + df['SibSp']
    df['is_alone'] = df['family_size'].apply(lambda r: 0 if r > 0 else 1)
    
    x = df.drop(['PassengerId', 'Survived', 'Embarked',
                 'Sex', 'Name', 'Cabin', 'Ticket', 'Bridge'],
                axis=1).values
    
    return x, y

X, y = parse_3(df_train.copy())
clf = linear_model.LogisticRegression()
make_score(clf, X, y)

0.8204264870931537

In [322]:
X, y = parse_3(df_train.copy())
clf = ensemble.GradientBoostingClassifier(learning_rate=0.01,
                                          n_estimators=100,
                                          max_depth=3)
make_score(clf, X, y)

0.8338945005611672

In [323]:
def parse_4(df):
    df['Bridge'] = df['Cabin'].apply(lambda r: r[0] if not pd.isnull(r) else -1)
    df['Title'] = df['Name'].str.extract('[A-Za-z]*,\s([A-Z][a-z]*).', expand=False).str.strip()          
    
    df['Age'] = df['Age'].fillna(df['Age'].mean())
    df['family_size'] = df['Parch'] + df['SibSp']
    df['is_child'] = df['Age'].apply(lambda r: 1 if r < 8 else 0)
    df['is_alone'] = df['family_size'].apply(lambda r: 0 if r > 0 else 1)
    
    x = hstack([
        CountVectorizer(token_pattern='[A-Z]*').fit_transform(df['Bridge'].astype(str)),
        CountVectorizer().fit_transform(df['Title'].astype(str)),
        CountVectorizer().fit_transform(df['Sex']),
        CountVectorizer(token_pattern='[A-Z]*').fit_transform(df['Embarked'].fillna('')),
        df['Age'].values.reshape(-1, 1),
        df['family_size'].values.reshape(-1, 1),
        df['is_child'].values.reshape(-1, 1),
        df['is_alone'].values.reshape(-1, 1)
    ])
    
    y = df['Survived']
    
    return x, y

X, y = parse_4(df_train.copy())

clf = ensemble.GradientBoostingClassifier(learning_rate=0.01,
                                          n_estimators=100,
                                          max_depth=3)
make_score(clf, X, y)

0.8316498316498318

## Final Model and submission

In [336]:
def preprocess_df(df):
    df['Bridge'] = df['Cabin'].apply(lambda r: r[0] if not pd.isnull(r) else -1)
    df['Title'] = df['Name'].str.extract('[A-Za-z]*,\s([A-Z][a-z]*).', expand=False).str.strip()          

    df['Age'] = df['Age'].fillna(df['Age'].mean())
    df['family_size'] = df['Parch'] + df['SibSp']
    df['is_child'] = df['Age'].apply(lambda r: 1 if r < 8 else 0)
    df['is_alone'] = df['family_size'].apply(lambda r: 0 if r > 0 else 1)
    
    return df

df_train = preprocess_df(df_train)

bridge_cv = CountVectorizer(token_pattern='[A-Z]*')
title_cv = CountVectorizer()
sex_cv = CountVectorizer()
embarked_cv = CountVectorizer(token_pattern='[A-Z]*')

x_train = hstack([
    bridge_cv.fit_transform(df_train['Bridge'].astype(str)),
    title_cv.fit_transform(df_train['Title'].astype(str)),
    sex_cv.fit_transform(df_train['Sex']),
    embarked_cv.fit_transform(df_train['Embarked'].fillna('')),
    df_train['Age'].values.reshape(-1, 1),
    df_train['family_size'].values.reshape(-1, 1),
    df_train['is_child'].values.reshape(-1, 1),
    df_train['is_alone'].values.reshape(-1, 1)
])

y_train = df_train['Survived']

clf = ensemble.GradientBoostingClassifier(learning_rate=0.01,
                                          n_estimators=100,
                                          max_depth=3)

clf.fit(x_train, y_train)

df_test = preprocess_df(df_test)

x_test = hstack([
    bridge_cv.transform(df_test['Bridge'].astype(str)),
    title_cv.transform(df_test['Title'].astype(str)),
    sex_cv.transform(df_test['Sex']),
    embarked_cv.transform(df_test['Embarked'].fillna('')),
    df_test['Age'].values.reshape(-1, 1),
    df_test['family_size'].values.reshape(-1, 1),
    df_test['is_child'].values.reshape(-1, 1),
    df_test['is_alone'].values.reshape(-1, 1)
])

df_test['Survived'] = clf.predict(x_test)

In [338]:
df_test[['PassengerId', 'Survived']].to_csv('./data/submission.csv', index=False)