In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [3]:
def make_title_col(df):
    import re 
    idx=df.columns.tolist().index('Name')
    find=re.compile(r'(\w+)\.')
    titles=[find.search(i[idx]).groups()[0] for i in np.array(df) if find.search(i[idx])]
    titles=[i if i in ['Mr', 'Mrs', 'Miss', 'Master']
            else 'Miss' if i in ['Ms']
            else 'FancyM' if i in ['Don', 'Rev', 'Dr', 'Major', 'Sir', 'Col', 'Capt', 'Jonkheer'] 
            else 'FancyF' for i in titles]
    ret=df.copy()
    ret['Title'] = np.array(titles)
    return ret

def impute_col(df, columns):
    from sklearn.impute import SimpleImputer
    imp = SimpleImputer(strategy='mean')
    tmp=df.copy()    
    tmp.loc[:,columns] = imp.fit_transform(tmp.loc[:, columns].values.reshape(-1,1)).tolist() 
    return tmp

def clean_train(df):
    df.at[829, 'Embarked'] = 'S'
    df.at[61, 'Embarked'] = 'S'
    return clean(df.drop(columns=['Survived']))

def clean_test(df):
    return clean(df)

In [4]:
from itertools import combinations_with_replacement

def make_model(X, y, models):
    
    from statistics import mean
    from sklearn.model_selection import cross_val_score as cv
    
    results=[]
    
    display(sum([len(list(combinations_with_replacement(X.columns, i))) for i in range(8, 8)]))
    
    for i in range(3, 25):
        for j in combinations_with_replacement(X.columns, i):
            for model, kwargs in models:
                M = model(**kwargs)
                results.append((M, mean(cv(M, X, y, cv=5, n_jobs=4))))
    
    results = sorted(results, key=lambda i: i[1], reverse=True)
    
    return results

In [5]:
def submit(model, filename):
    test=pd.read_csv(filename)
    ids=test[['PassengerId']]
    test=clean_test(test) 
    predictions=pd.DataFrame(model.predict(test))
    predictions.columns=['Survived']
    df=pd.concat([ids, predictions], axis=1)
    
    import time
    
    time.time()
    
    df.to_csv('submission.csv' + str(time.time()), index=False)

In [6]:
from sklearn.model_selection import train_test_split as tts
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.neural_network import MLPClassifier as MLP
from sklearn.svm import SVC as SVC
#from sklearn.ensemble import RandomForestClassifier as RFC
from xgboost import XGBClassifier as RFC 
from sklearn.linear_model import LogisticRegression as LR

from itertools import combinations_with_replacement


def clean(df):
    tmp=df.copy()
    tmp=make_title_col(tmp)
    tmp['Age']=tmp.groupby('Title')['Age'].transform(lambda x: x.fillna(x.mode()[0]))# impute_col(tmp, 'Age')
    #tmp=tmp.reset_index(drop=True)
    tmp['Boy'] = [1 if i[tmp.columns.get_loc('Age')]<10
             and i[tmp.columns.get_loc('Sex')] == 'male'
             or i[tmp.columns.get_loc('Title') == 'Master']
             else 0
             for i in tmp.values]
    tmp['Age'] = [i/tmp['Age'].values.max() for i in tmp['Age']]
    tmp['SibSp'] = [3 if i>=3 else i for i in tmp['SibSp']]
    tmp['Parch'] = [3 if i>=3 else i for i in tmp['Parch']]
    tmp['Pclass'] = [0 if i[tmp.columns.get_loc('Sex')] == 'male' else i[tmp.columns.get_loc('Pclass')] for i in tmp.values]
    tmp['Pclass'] = [4 if i[tmp.columns.get_loc('Fare')] >= 200 else i[tmp.columns.get_loc('Pclass')] for i in tmp.values]
    tmp.drop(columns=['Name', 'PassengerId', 'Fare', 'Cabin', 'Ticket'], inplace=True)
    
    #display(tmp[['Survived', 'SibSp']].groupby('SibSp').describe())
    
    tmp=pd.get_dummies(tmp, columns=['Title', 'SibSp', 'Pclass', 'Parch', 'Embarked', 'Sex'])
    
    tmp.drop(columns=['Embarked_Q'], inplace=True)
    
    display(tmp.describe())
    
    return tmp

X=pd.read_csv('data/train.csv')
y=X['Survived'].values

X=clean_train(X)

tests=[
    *[(KNN, {'n_neighbors': i, 'weights': j}) for j in ['uniform', 'distance'] for i in range(4,15)],
]

m=make_model(X, y, tests)
m=m[0][0].fit(X, y)

submit(m, 'data/test.csv')



Unnamed: 0,Age,Boy,Title_FancyF,Title_FancyM,Title_Master,Title_Miss,Title_Mr,Title_Mrs,SibSp_0,SibSp_1,...,Pclass_3,Pclass_4,Parch_0,Parch_1,Parch_2,Parch_3,Embarked_C,Embarked_S,Sex_female,Sex_male
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,...,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,0.34477,0.040404,0.005612,0.023569,0.044893,0.205387,0.580247,0.140292,0.682379,0.234568,...,0.161616,0.022447,0.760943,0.132435,0.089787,0.016835,0.188552,0.725028,0.352413,0.647587
std,0.172367,0.197016,0.074743,0.151787,0.207186,0.404211,0.493796,0.347485,0.465813,0.423966,...,0.368305,0.148214,0.426747,0.339154,0.286037,0.128725,0.391372,0.446751,0.47799,0.47799
min,0.00525,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.2375,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.3,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
75%,0.4375,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


0

KeyboardInterrupt: 

# Cleaning the Data

## Importing the data

In [58]:
titanic = pd.read_csv('data/train.csv')
titanic = make_title_col(titanic)
titanic['Boy'] = [1 if i[titanic.columns.get_loc('Age')]<10
             and i[titanic.columns.get_loc('Sex')] == 'male'
             or i[titanic.columns.get_loc('Title') == 'Master']
             else 0
             for i in titanic.values]
print(titanic[['Survived', 'Boy']].groupby('Boy').describe().T[0:3].T.to_latex())

\begin{tabular}{lrrr}
\toprule
{} & \multicolumn{3}{l}{Survived} \\
{} &    count &      mean &       std \\
Boy &          &           &           \\
\midrule
0   &    859.0 &  0.376019 &  0.484667 \\
1   &     32.0 &  0.593750 &  0.498991 \\
\bottomrule
\end{tabular}



  import sys


In [53]:
import re

titanic = pd.read_csv('data/train.csv')
find=re.compile(r'([A-Z])')
cabins=[find.search(i).groups()[0] if not isinstance(i, float) and find.search(i) else '' for i in titanic['Cabin'].values]
titanic['CabinLetter'] = cabins
print(titanic[['Pclass', 'CabinLetter']].groupby('CabinLetter').describe().T)

CabinLetter                  A     B     C          D          E          F  \
Pclass count  687.000000  15.0  47.0  59.0  33.000000  32.000000  13.000000   
       mean     2.639010   1.0   1.0   1.0   1.121212   1.312500   2.384615   
       std      0.589602   0.0   0.0   0.0   0.331434   0.644455   0.506370   
       min      1.000000   1.0   1.0   1.0   1.000000   1.000000   2.000000   
       25%      2.000000   1.0   1.0   1.0   1.000000   1.000000   2.000000   
       50%      3.000000   1.0   1.0   1.0   1.000000   1.000000   2.000000   
       75%      3.000000   1.0   1.0   1.0   1.000000   1.000000   3.000000   
       max      3.000000   1.0   1.0   1.0   2.000000   3.000000   3.000000   

CabinLetter     G    T  
Pclass count  4.0  1.0  
       mean   3.0  1.0  
       std    0.0  NaN  
       min    3.0  1.0  
       25%    3.0  1.0  
       50%    3.0  1.0  
       75%    3.0  1.0  
       max    3.0  1.0  


### Missing data?

In [None]:
def nullstat(dataset):
    total=dataset.isnull().sum().sort_values(ascending=False)
    percent=(dataset.isnull().sum()/dataset.isnull().count()) \
        .sort_values(ascending=False)
    missing_data=pd.concat([total, percent], axis=1, keys=['Total','Percent'])
    missing_data=missing_data[missing_data.Percent != 0]
    f, ax = plt.subplots(figsize=(11, 6))
    sns.barplot(y=missing_data.index, x=missing_data['Percent'], orient='h')
    plt.ylabel('Features', fontsize=18)
    plt.xlabel('Percent of missing values', fontsize=18)
    plt.title('Percent missing data by feature', fontsize=18)
    plt.xlim([0.0, 1.0])

nullstat(titanic)
titanic.drop(columns=['Ticket', 'Cabin'], inplace=True)
titanic.head()

Stone, Mrs. George Nelson (Martha Evelyn) and her Maid Icard Miss. Amelie, embarked in Southampton<sup>1</sup>

<sup>1</sup>(2019) Martha Evelyn Stone Encyclopedia Titanica (ref: #287, updated 18th June 2019 06:25:53 AM)
URL : https://www.encyclopedia-titanica.org/titanic-survivor/martha-evelyn-stone.html

### Name Column

In [None]:
import random as rand
print(max(titanic['Name'], key=len))
sname_lengths=[len(i) for i in titanic[titanic.Survived==1]['Name']]
dname_lengths=[len(i) for i in titanic[titanic.Survived==0]['Name']]
display(sorted(map(len, titanic['Name'].to_list()), reverse=True)[0:5])
sns.distplot(rand.sample(sname_lengths, 3*len(sname_lengths)//4), bins=15, hist=False);
sns.distplot(rand.sample(dname_lengths, 3*len(dname_lengths)//4), bins=15, hist=False);

In [None]:
titanic=make_title_col(titanic)

In [None]:
title_stats=[]
for title in titles:
    title_match=re.compile(title + r'\.')
    matches=np.array([i for i in np.array(titanic) if title_match.search(i[3])])
    survived=sum(matches.T[1])
    count=len(matches)    
    title_stats.append({'Title':title, 'Count':count, 'Survival Rate':survived/count})

title_stats=pd.DataFrame(sorted(title_stats, key=lambda i: i['Count'], reverse=True)).T
title_stats.columns=title_stats.T['Title']
title_stats=title_stats.T.drop(columns='Title')[['Survival Rate', 'Count']].T
title_stats

In [None]:
for name, group in titanic.groupby('Title'):
        with open("title_stats.txt", "w") as f:
            f.write(name + '\n' + str(group.describe(include='all')) + '\n')