In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
# Cleaning the data

In [3]:
def make_title_col(df):
    """Return a copy of ``df`` with a ``Title`` column derived from ``Name``.

    The title is the first run of word characters in the name that is
    directly followed by a period (``"Braund, Mr. Owen"`` -> ``"Mr"``).
    It is then collapsed into one of six buckets:

    * ``Mr``, ``Mrs``, ``Miss``, ``Master`` are kept as they are;
    * ``Don``, ``Rev``, ``Dr``, ``Major``, ``Sir``, ``Col``, ``Capt`` and
      ``Jonkheer`` become ``FancyM``;
    * every other title becomes ``FancyF``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``Name`` column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the extra ``Title`` column. Rows whose name
        contains no ``word.`` pattern (or is not a string) get a missing
        title, so the column always lines up with the rows of ``df``.
    """
    import re

    plain_titles = {'Mr', 'Mrs', 'Miss', 'Master'}
    fancy_male_titles = {'Don', 'Rev', 'Dr', 'Major', 'Sir', 'Col', 'Capt', 'Jonkheer'}
    find = re.compile(r'(\w+)\.')

    def bucket(name):
        # Non-string names (e.g. NaN) cannot be searched; treat them as untitled.
        match = find.search(name) if isinstance(name, str) else None
        if match is None:
            return None
        raw_title = match.group(1)
        if raw_title in plain_titles:
            return raw_title
        if raw_title in fancy_male_titles:
            return 'FancyM'
        return 'FancyF'

    ret = df.copy()
    # One entry per row, in row order: skipping unmatched names here would make
    # the list shorter than the frame and the assignment would fail.
    ret['Title'] = [bucket(name) for name in df['Name']]
    return ret

def impute_col(df, columns):
    """Return a copy of ``df`` with missing values replaced by the column mean.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified.
    columns : label or list of labels
        Numeric column(s) to fill. Each column is filled with the mean of
        its own non-missing values.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the requested columns filled. A column that is
        entirely missing has no mean and is left as it is.
    """
    targets = list(columns) if pd.api.types.is_list_like(columns) else [columns]
    tmp = df.copy()
    for col in targets:
        # Series.mean() skips missing values, so this is the observed mean.
        tmp[col] = tmp[col].fillna(tmp[col].mean())
    return tmp

def clean(df):
    """Turn a raw passenger frame into the feature matrix used by the model.

    Steps:

    1. add the bucketed ``Title`` column (see ``make_title_col``) and reset
       the index to 0..n-1;
    2. add ``Child``: 1 when ``Age`` is below 10, otherwise 0. Ages are not
       imputed, so a missing age counts as "not a child";
    3. drop ``Name``, ``PassengerId``, ``Sex``, ``Ticket``, ``Fare``,
       ``Cabin``, ``Embarked``, ``Pclass`` and ``Age``;
    4. one-hot encode ``Title``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame containing at least ``Name``, ``Age`` and the columns
        listed in step 3. It is not modified.

    Returns
    -------
    pandas.DataFrame
        The remaining columns followed by one ``Title_<bucket>`` indicator
        per title bucket. All six indicators are always present, in the same
        order, even when a bucket does not occur in ``df``.
    """
    # Alphabetical, which is the order get_dummies produces for plain strings.
    title_levels = ['FancyF', 'FancyM', 'Master', 'Miss', 'Mr', 'Mrs']
    unused = ['Name', 'PassengerId', 'Sex', 'Ticket', 'Fare', 'Cabin',
              'Embarked', 'Pclass', 'Age']

    tmp = make_title_col(df).reset_index(drop=True)
    # NaN < 10 is False, so passengers without an age get Child == 0.
    tmp['Child'] = (tmp['Age'] < 10).astype(int)
    tmp = tmp.drop(columns=unused)
    # A fixed category list makes the dummy columns independent of which
    # titles happen to be in this particular frame, so frames cleaned
    # separately (train / test) end up with identical columns.
    tmp['Title'] = pd.Categorical(tmp['Title'], categories=title_levels)
    return pd.get_dummies(tmp, columns=['Title'])

def clean_train(df):
    """Clean a training frame: remove the ``Survived`` label, then run ``clean``."""
    features_only = df.drop('Survived', axis=1)
    return clean(features_only)

def clean_test(df):
    """Clean a test frame; it carries no label column, so ``clean`` is applied directly."""
    cleaned = clean(df)
    return cleaned

# Load the training CSV and reduce it straight to the cleaned feature columns.
train = clean_train(pd.read_csv('data/train.csv'))

In [4]:
def model(X, y):
    """Pick the best k-nearest-neighbours setting by cross-validation and fit it.

    Every combination of ``weights`` in ('uniform', 'distance') and
    ``n_neighbors`` in 5..9 is scored with 15-fold cross-validation on
    ``(X, y)``. The candidate with the highest average score (the earliest
    one on ties) is fitted on all of ``(X, y)`` and returned.
    """
    from sklearn.model_selection import cross_val_score
    from sklearn.neighbors import KNeighborsClassifier

    folds = 15
    scored = []

    for scheme in ('uniform', 'distance'):
        for n_neighbors in range(5, 10):
            candidate = KNeighborsClassifier(n_neighbors=n_neighbors, weights=scheme)
            fold_scores = cross_val_score(candidate, X, y, cv=folds)
            scored.append((candidate, sum(fold_scores) / folds))

    # Stable descending sort: among equal averages the first candidate tried wins.
    ranked = sorted(scored, key=lambda pair: pair[1], reverse=True)
    winner, _ = ranked[0]

    return winner.fit(X, y)

In [5]:
def submit(model, filename):
    """Predict survival for the passengers in ``filename`` and write ``submission.csv``.

    The CSV at ``filename`` is read, its ``PassengerId`` column is kept
    aside, the rest is passed through ``clean_test`` and fed to
    ``model.predict``. The ids and the predictions (as ``Survived``) are
    written side by side to ``submission.csv`` in the working directory,
    without the index.
    """
    raw = pd.read_csv(filename)
    passenger_ids = raw[['PassengerId']]
    features = clean_test(raw)
    survived = pd.DataFrame(model.predict(features), columns=['Survived'])
    pd.concat([passenger_ids, survived], axis=1).to_csv('submission.csv', index=False)

In [14]:
# Train and evaluate: repeat a random train / hold-out split 30 times, tune a
# KNN on each training part, and rank the fitted models by hold-out accuracy.

from sklearn.model_selection import train_test_split as tts

train=pd.read_csv('data/train.csv')
train_y=train['Survived'].values

train=clean_train(train)

results = []

for i in range(0, 30):

    # Seed each split with its iteration number so that re-running the cell
    # reproduces the same splits and therefore the same ranking.
    X, tX, y, ty = tts(train, train_y, random_state=i)

    knn=model(X, y)

    results.append((knn, knn.score(tX, ty)))

results = sorted(results, key=lambda pair: pair[1], reverse=True)

# Write the submission once, from the best-scoring model. Submitting inside
# the loop rewrites submission.csv on every iteration, which leaves the file
# holding the predictions of the last model instead of the best one.
submit(results[0][0], 'data/test.csv')

results[0]

(KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                      metric_params=None, n_jobs=None, n_neighbors=9, p=2,
                      weights='uniform'), 0.8609865470852018)

# Cleaning the Data

## Importing the data

In [None]:
# Load the raw training data and summarise every column, leaving
# PassengerId, Name, Ticket and Cabin out of the summary table.
titanic = pd.read_csv('data/train.csv')
titanic.describe(include='all').drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis='columns')

### Missing data?

In [None]:
def nullstat(dataset):
    """Draw a horizontal bar chart of the share of missing values per column.

    Only columns with at least one missing value are shown. The plotted
    value is the fraction of missing entries (between 0 and 1), although the
    axis text calls it a percent. Nothing is returned; the chart is drawn on
    a new 11x6 figure.
    """
    null_mask = dataset.isnull()
    null_counts = null_mask.sum()

    total = null_counts.sort_values(ascending=False)
    percent = (null_counts / null_mask.count()).sort_values(ascending=False)

    missing = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
    # Columns without any missing value would only add empty bars.
    missing = missing.loc[missing['Percent'] != 0]

    plt.subplots(figsize=(11, 6))
    sns.barplot(y=missing.index, x=missing['Percent'], orient='h')

    plt.title('Percent missing data by feature', fontsize=18)
    plt.xlabel('Percent of missing values', fontsize=18)
    plt.ylabel('Features', fontsize=18)
    plt.xlim([0.0, 1.0])

nullstat(titanic)
# Drop the Ticket and Cabin columns. Re-binding the name with errors='ignore'
# (instead of dropping in place) keeps this cell safe to run again after the
# columns are already gone.
titanic = titanic.drop(columns=['Ticket', 'Cabin'], errors='ignore')
titanic.head()

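The two passengers whose `Embarked` value is filled in below — Stone, Mrs. George Nelson (Martha Evelyn) and her maid, Icard, Miss. Amelie — both embarked at Southampton<sup>1</sup>, so their port is set to `S`.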

<sup>1</sup>(2019) Martha Evelyn Stone Encyclopedia Titanica (ref: #287, updated 18th June 2019 06:25:53 AM)
URL : https://www.encyclopedia-titanica.org/titanic-survivor/martha-evelyn-stone.html

In [None]:
# Fill in the port of embarkation for rows 61 and 829 with Southampton ('S').
titanic.loc[61, 'Embarked'] = 'S'
titanic.loc[829, 'Embarked'] = 'S'

### Name Column

In [None]:
import random as rand

# Longest passenger name, followed by the five largest name lengths.
print(max(titanic['Name'], key=len))
sname_lengths=[len(i) for i in titanic[titanic.Survived==1]['Name']]
dname_lengths=[len(i) for i in titanic[titanic.Survived==0]['Name']]
display(sorted(map(len, titanic['Name'].to_list()), reverse=True)[0:5])

# Density of name length for a random three-quarter sample of each outcome.
# A dedicated, seeded generator makes the sample (and the figure) identical
# on every run; kdeplot replaces the deprecated distplot(hist=False).
name_rng = rand.Random(0)
sns.kdeplot(name_rng.sample(sname_lengths, 3*len(sname_lengths)//4), label='Survived')
sns.kdeplot(name_rng.sample(dname_lengths, 3*len(dname_lengths)//4), label='Did not survive')
plt.xlabel('Name length (characters)')
plt.title('Name length by outcome')
plt.legend();

In [None]:
# Add the bucketed Title column derived from each passenger's Name.
titanic = titanic.pipe(make_title_col)

In [None]:
# Survival rate and passenger count for each value of the Title column, most
# common title first. The result has one column per title and the two
# statistics as rows.
title_stats = (
    titanic.groupby('Title')['Survived']
    .agg(['mean', 'size'])
    .rename(columns={'mean': 'Survival Rate', 'size': 'Count'})
    .sort_values('Count', ascending=False)
    .astype(object)  # keeps Count as whole numbers once the frame is transposed
    .T
)
title_stats

In [None]:
# Write a describe() summary for every title to title_stats.txt. The file is
# opened once, outside the loop: opening it in "w" mode for each group
# truncates it every time, so only the last title's summary would remain.
with open("title_stats.txt", "w") as f:
    for name, group in titanic.groupby('Title'):
        f.write(name + '\n' + str(group.describe(include='all')) + '\n')