In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [75]:
def make_title_col(df):
    import re 
    idx=df.columns.tolist().index('Name')
    find=re.compile(r'(\w+)\.')
    titles=[find.search(i[idx]).groups()[0] for i in np.array(df) if find.search(i[idx])]
    titles=[i if i in ['Mr', 'Mrs', 'Miss', 'Master']
            else 'Miss' if i in ['Ms']
            else 'FancyM' if i in ['Don', 'Rev', 'Dr', 'Major', 'Sir', 'Col', 'Capt', 'Jonkheer'] 
            else 'FancyF' for i in titles]    
    ret=df.copy()
    ret['Title'] = np.array(titles)
    return ret

def impute_col(df, columns):
    from sklearn.impute import SimpleImputer
    imp = SimpleImputer(strategy='mean')
    tmp=df.copy()    
    tmp.loc[:,columns] = imp.fit_transform(tmp.loc[:, columns].values.reshape(-1,1)).tolist() 
    return tmp

def clean(df):
    tmp=df.copy()
    tmp=make_title_col(tmp)
    #display(tmp.groupby('Title').describe())
    #tmp=tmp.groupby('Title').apply(lambda g: impute_col(g, 'Age'))
    #display(tmp.describe())
    #tmp=tmp.reset_index(drop=True)
    tmp['Boy'] = [1 if i[tmp.columns.get_loc('Age')]<10
             and i[tmp.columns.get_loc('Sex')] == 'male'
             or i[tmp.columns.get_loc('Title') == 'Master']
             else 0
             for i in tmp.values]
    tmp.drop(columns=['Name', 'PassengerId', 'Sex', 'Ticket', 'Fare', 'Cabin', 'Embarked', 'Pclass', 'Age'], inplace=True)
    tmp=pd.get_dummies(tmp, columns=['Title'])
    return tmp

def clean_train(df):
    return clean(df.drop(columns=['Survived']))

def clean_test(df):
    return clean(df)

In [76]:
def make_model(X, y, models):
    
    from statistics import mean
    from sklearn.model_selection import cross_val_score as cv
    
    results=[]
        
    for model, kwargs in models:
        M = model(**kwargs)
        results.append((M, mean(cv(M, X, y, cv=5))))
    
    results = sorted(results, key=lambda i: i[1], reverse=True)
    
    return results

In [77]:
def submit(model, filename):
    test=pd.read_csv(filename)
    ids=test[['PassengerId']]
    test=clean_test(test) 
    predictions=pd.DataFrame(model.predict(test))
    predictions.columns=['Survived']
    df=pd.concat([ids, predictions], axis=1)
    df.to_csv('submission.csv', index=False)

In [79]:
from sklearn.model_selection import train_test_split as tts
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.neural_network import MLPClassifier as MLP
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier as RFC

from itertools import combinations_with_replacement

train=pd.read_csv('data/train.csv')
train_y=train['Survived'].values

train=clean_train(train)

X, tX, y, ty = tts(train, train_y)

tests=[
    #*[(KNN, {'n_neighbors': i, 'weights': j}) for j in ['uniform', 'distance'] for i in range(4,12)],
    #*[(MLP, {'hidden_layer_sizes': i, 'max_iter':1000}) for j in [2,3] for i in combinations_with_replacement(range(4,8), j)],
    *[(SVC, {'gamma': 'auto'})],
    *[(RFC, {'n_estimators': i, 'max_depth': j}) for j in [2,3,4] for i in range(80, 80+10*25, 25)]
    #*[()],
    #*[()]
]

make_model(X, y, tests)



[(SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
      decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
      max_iter=-1, probability=False, random_state=None, shrinking=True,
      tol=0.001, verbose=False), 0.8324287652645862),
 (RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                         max_depth=4, max_features='auto', max_leaf_nodes=None,
                         min_impurity_decrease=0.0, min_impurity_split=None,
                         min_samples_leaf=1, min_samples_split=2,
                         min_weight_fraction_leaf=0.0, n_estimators=80,
                         n_jobs=None, oob_score=False, random_state=None,
                         verbose=0, warm_start=False), 0.8309136137494346),
 (RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                         max_depth=4, max_features='auto', max_leaf_nodes=None,
                         min_impurity_decrease=0.0, min_im

# Cleaning the Data

## Importing the data

In [15]:
titanic = pd.read_csv('data/train.csv')
titanic.describe(include='all').drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'])

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
count,891.0,891.0,891,714.0,891.0,891.0,891.0,889
unique,,,2,,,,,3
top,,,male,,,,,S
freq,,,577,,,,,644
mean,0.383838,2.308642,,29.699118,0.523008,0.381594,32.204208,
std,0.486592,0.836071,,14.526497,1.102743,0.806057,49.693429,
min,0.0,1.0,,0.42,0.0,0.0,0.0,
25%,0.0,2.0,,20.125,0.0,0.0,7.9104,
50%,0.0,3.0,,28.0,0.0,0.0,14.4542,
75%,1.0,3.0,,38.0,1.0,0.0,31.0,


### Missing data?

In [None]:
def nullstat(dataset):
    total=dataset.isnull().sum().sort_values(ascending=False)
    percent=(dataset.isnull().sum()/dataset.isnull().count()) \
        .sort_values(ascending=False)
    missing_data=pd.concat([total, percent], axis=1, keys=['Total','Percent'])
    missing_data=missing_data[missing_data.Percent != 0]
    f, ax = plt.subplots(figsize=(11, 6))
    sns.barplot(y=missing_data.index, x=missing_data['Percent'], orient='h')
    plt.ylabel('Features', fontsize=18)
    plt.xlabel('Percent of missing values', fontsize=18)
    plt.title('Percent missing data by feature', fontsize=18)
    plt.xlim([0.0, 1.0])

nullstat(titanic)
titanic.drop(columns=['Ticket', 'Cabin'], inplace=True)
titanic.head()

Stone, Mrs. George Nelson (Martha Evelyn) and her Maid Icard Miss. Amelie, embarked in Southampton<sup>1</sup>

<sup>1</sup>(2019) Martha Evelyn Stone Encyclopedia Titanica (ref: #287, updated 18th June 2019 06:25:53 AM)
URL : https://www.encyclopedia-titanica.org/titanic-survivor/martha-evelyn-stone.html

In [None]:
titanic.at[829, 'Embarked'] = 'S'
titanic.at[61, 'Embarked'] = 'S'

### Name Column

In [None]:
import random as rand
print(max(titanic['Name'], key=len))
sname_lengths=[len(i) for i in titanic[titanic.Survived==1]['Name']]
dname_lengths=[len(i) for i in titanic[titanic.Survived==0]['Name']]
display(sorted(map(len, titanic['Name'].to_list()), reverse=True)[0:5])
sns.distplot(rand.sample(sname_lengths, 3*len(sname_lengths)//4), bins=15, hist=False);
sns.distplot(rand.sample(dname_lengths, 3*len(dname_lengths)//4), bins=15, hist=False);

In [None]:
titanic=make_title_col(titanic)

In [None]:
title_stats=[]
for title in titles:
    title_match=re.compile(title + r'\.')
    matches=np.array([i for i in np.array(titanic) if title_match.search(i[3])])
    survived=sum(matches.T[1])
    count=len(matches)    
    title_stats.append({'Title':title, 'Count':count, 'Survival Rate':survived/count})

title_stats=pd.DataFrame(sorted(title_stats, key=lambda i: i['Count'], reverse=True)).T
title_stats.columns=title_stats.T['Title']
title_stats=title_stats.T.drop(columns='Title')[['Survival Rate', 'Count']].T
title_stats

In [None]:
for name, group in titanic.groupby('Title'):
        with open("title_stats.txt", "w") as f:
            f.write(name + '\n' + str(group.describe(include='all')) + '\n')