# Titanic Survivors (NN)

In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
from fastai import *
from fastai.tabular import *

In [3]:
# Directory holding the CSV files, relative to the notebook's working directory.
path = './data'

## DataFrames and Feature Engineering

Basic feature engineering, taken from the provided Kaggle examples.

In [4]:
def substrings_in_string(big_string, substrings):
    """Return the first entry of ``substrings`` that occurs inside ``big_string``.

    Candidates are tried in the order given, so earlier entries take
    precedence over later ones. When none of them is found (or the
    collection is empty) ``np.nan`` is returned instead.
    """
    # Lazily yields matching candidates; only the first one is ever consumed.
    hits = (candidate for candidate in substrings
            if str.find(big_string, candidate) >= 0)
    return next(hits, np.nan)


def replace_titles(x):
    """Collapse a passenger's raw title into a small set of common titles.

    Parameters
    ----------
    x : mapping-like row (e.g. a DataFrame row)
        Must provide ``x['Title']``; ``x['Sex']`` is read only when the
        title is ``'Dr'``.

    Returns
    -------
    ``'Mr'``, ``'Mrs'`` or ``'Miss'`` for the titles remapped below;
    any other title (including a missing one) is returned unchanged.
    """
    title = x['Title']
    if title in ('Don', 'Major', 'Capt', 'Jonkheer', 'Rev', 'Col'):
        return 'Mr'
    if title in ('Countess', 'Mme'):
        return 'Mrs'
    if title in ('Mlle', 'Ms'):
        return 'Miss'
    if title == 'Dr':
        # 'Dr' carries no gender, so fall back on the Sex column. The data
        # spells it in lowercase ('male' / 'female'); comparing against the
        # capitalised 'Male' never matched and sent every doctor to 'Mrs'.
        # Lower-casing keeps a capitalised 'Male' working as well.
        return 'Mr' if str(x['Sex']).lower() == 'male' else 'Mrs'
    return title
    

# Titles searched for inside each passenger name. Order matters because
# substrings_in_string returns the first entry it finds: 'Mrs' has to come
# before 'Mr', which is a substring of it.
title_list = [
    'Mrs', 'Mr', 'Master', 'Miss',
    'Major', 'Rev', 'Dr', 'Ms',
    'Mlle', 'Col', 'Capt', 'Mme',
    'Countess', 'Don', 'Jonkheer',
]

In [5]:
df = pd.read_csv(f'{path}/train.csv')

# Derive a coarse title per passenger: first pick whichever entry of
# title_list occurs in the Name, then fold the rarer titles into the
# common ones via replace_titles.
df['Title'] = df['Name'].map(lambda name: substrings_in_string(name, title_list))
df['Title'] = df.apply(replace_titles, axis=1)

# Reduce the cabin number to its deck letter. Each Cabin value goes through
# str() first, so a missing value (NaN) becomes the text 'nan', which matches
# none of the entries below and therefore yields NaN for Deck.
# NOTE(review): the 'Unknown' entry can only match if Cabin were filled with
# that text beforehand; nothing visible here does so -- confirm the intent.
cabin_list = ['A', 'B', 'C', 'D', 'E', 'F', 'T', 'G', 'Unknown']
df['Deck'] = df['Cabin'].map(lambda cabin: substrings_in_string(str(cabin), cabin_list))

In [6]:
# Load the test-set CSV from the same data directory as the training file.
# NOTE(review): no Title / Deck columns are derived for test_df in the visible
# cells, although cat_names lists both -- presumably required before this frame
# can be attached as a test set; confirm.
test_df = pd.read_csv(f'{path}/test.csv')

## Data Bunch

In [7]:
# Column holding the label to predict.
dep_var = 'Survived'

# Feature columns, split by how they are passed to TabularList.from_df:
# categorical ones first, continuous ones second.
cat_names = [
    'Pclass',
    'Embarked',
    'Sex',
    'Title',
    'Deck',
]
cont_names = [
    'Age',
    'SibSp',
    'Parch',
    'Fare',
]

# Preprocessing steps handed to TabularList.from_df, in this order.
procs = [FillMissing, Categorify, Normalize]

In [8]:
# TODO: still working through fixing NaN values in the test set
#test = TabularList.from_df(test_df, path=path, cat_names=cat_names, cont_names=cont_names, procs=[FillMissing])

In [9]:
# Build the DataBunch from the engineered training frame.
# Rows from position 800 to the end of the frame are passed to split_by_idx as
# the validation indices. The upper bound comes from len(df) instead of a
# hard-coded 891, so the split still covers the whole tail if the number of
# rows in train.csv ever differs.
data = (TabularList.from_df(df, path=path, cat_names=cat_names, cont_names=cont_names, procs=procs)
                   .split_by_idx(list(range(800, len(df))))
                   .label_from_df(cols=dep_var)
                   #.add_test(test)
                   .databunch())

In [10]:
# Preview ten rows of a batch to check how the processed inputs look
# (category values, scaled continuous columns, and the label).
data.show_batch(rows=10)

Pclass,Embarked,Sex,Title,Deck,Age_na,Age,SibSp,Parch,Fare,target
3,S,male,Mr,#na#,False,-0.8070,-0.4878,-0.4663,-0.4829,0
3,Q,male,Mr,#na#,True,-0.1146,0.4525,-0.4663,-0.3404,0
3,S,female,Mrs,#na#,False,0.8855,-0.4878,5.7722,0.1290,0
3,Q,male,Mr,#na#,True,-0.1146,-0.4878,-0.4663,-0.4911,0
2,S,female,Miss,#na#,False,-1.9610,0.4525,0.7814,-0.1948,1
2,C,male,Mr,#na#,False,-0.4993,-0.4878,-0.4663,-0.3492,0
2,S,male,Mr,#na#,False,-0.3454,0.4525,-0.4663,-0.1366,0
3,S,male,Mr,#na#,True,-0.1146,-0.4878,-0.4663,-0.4880,0
1,S,male,Mr,E,False,1.3471,-0.4878,-0.4663,-0.1446,0
2,S,female,Mrs,#na#,False,1.1933,-0.4878,-0.4663,-0.3792,1


## Training and Model Fit

In [11]:
# Create a tabular learner on the DataBunch and track accuracy as the metric.
# layers=[200,100] is presumably the sizes of the hidden layers -- confirm
# against the fastai tabular_learner documentation.
learn = tabular_learner(data, layers=[200,100], metrics=accuracy)
# Train for 4 epochs with the one-cycle policy; 1e-2 is presumably the peak
# learning rate -- confirm against the fit_one_cycle signature.
# NOTE(review): no random seed is set in the visible cells, so the reported
# losses and accuracy will likely vary between runs.
learn.fit_one_cycle(4, 1e-2)

epoch,train_loss,valid_loss,accuracy
1,0.640780,0.646061,0.626374
2,0.553121,0.477035,0.802198
3,0.500513,0.422233,0.846154
4,0.468170,0.381163,0.857143
