In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

train = pd.read_csv('./train.csv', index_col='PassengerId')
test = pd.read_csv('./test.csv', index_col='PassengerId')

df_full = train.append(test)
df_full['Survived'] = df_full['Survived'].astype(pd.Int64Dtype())

print(df_full.dtypes)
df_full.head()

Survived      Int64
Pclass        int64
Name         object
Sex          object
Age         float64
SibSp         int64
Parch         int64
Ticket       object
Fare        float64
Cabin        object
Embarked     object
dtype: object


  df_full = train.append(test)


Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


### Exploring the data

In [2]:
pd.crosstab(df_full['Pclass'], df_full['Survived'], normalize='index').round(3)

Survived,0,1
Pclass,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.37,0.63
2,0.527,0.473
3,0.758,0.242


In [3]:
pd.crosstab(df_full['Sex'], df_full['Survived'], normalize='index').round(3)

Survived,0,1
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1
female,0.258,0.742
male,0.811,0.189


In [4]:
pd.crosstab(df_full['Embarked'], df_full['Survived'], normalize='index').round(3)

Survived,0,1
Embarked,Unnamed: 1_level_1,Unnamed: 2_level_1
C,0.446,0.554
Q,0.61,0.39
S,0.663,0.337


### Adding More Features

#### Age groups

In [5]:
#removing for now, breaks pipeline
#bins = [0, 2, 12, 20, 60, np.inf]
#labels = ['baby', 'child', 'teenager', 'adult', 'elderly']
#df_full['AgeGroup'] = pd.cut(train.Age, bins, labels = labels)

#### Title

In [6]:
df_full['Title'] = df_full['Name'].str.extract('([A-Za-z]+)\.', expand=False)

In [7]:
pd.crosstab(df_full['Title'], df_full['Survived'], margins=True, dropna= False).sort_values('All', ascending=False)

Survived,0,1,All
Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
All,549,342,1309
Mr,436,81,757
Miss,55,127,260
Mrs,26,99,197
Master,17,23,61
Dr,4,3,8
Rev,6,0,8
Col,1,1,4
Major,1,1,2
Mlle,0,2,2


By comparing 'All' to what the totals should be, we can see how many of each was nan and belongs to the testing group. We can pick The top groups knowing those categories are represented in both.

In [8]:
df_full['Title'] = df_full['Title'].replace(
    ['Lady', 'Countess','Capt', 'Col', 'Don', 'Major', 'Sir', 'Jonkheer', 'Dona'],
    'Other')
df_full['Title'] = df_full['Title'].replace(['Mlle', 'Ms'], 'Miss')
df_full['Title'] = df_full['Title'].replace('Mme', 'Mrs')

#### Deck and room from cabin

In [9]:
df_full[['Deck', 'Room']] = df_full['Cabin'].str.extract(r'([A-Z])([0-9]+)')
df_full['Room'] = df_full['Room'].astype(pd.StringDtype()).astype('float64')

In [10]:
df_full.groupby(['Survived'])[['Room']].mean().round(1)

Unnamed: 0_level_0,Room
Survived,Unnamed: 1_level_1
0,53.7
1,48.9


In [11]:
df_full.groupby(['Survived', 'Deck'])[['Room']].mean().round(1)

Unnamed: 0_level_0,Unnamed: 1_level_0,Room
Survived,Deck,Unnamed: 2_level_1
0,A,18.4
0,B,57.5
0,C,74.6
0,D,33.1
0,E,53.0
0,F,20.0
0,G,44.2
1,A,22.3
1,B,50.1
1,C,76.2


Room appears to only have an effect on survival when combined with deck.

In [12]:
pd.crosstab(df_full['Deck'], df_full['Survived'], normalize='index').round(3)

Survived,0,1
Deck,Unnamed: 1_level_1,Unnamed: 2_level_1
A,0.533,0.467
B,0.255,0.745
C,0.407,0.593
D,0.233,0.767
E,0.242,0.758
F,0.222,0.778
G,0.714,0.286


Deck has an effect by itself.

#### Ticket

In [13]:
reversed_ticket = pd.DataFrame()
reversed_ticket[['Ticket_num', 'Ticket_ext1', 'Ticket_ext2']] = df_full['Ticket'].str[::-1].str.split(' ', expand=True)

df_full['Ticket_num'] = pd.to_numeric(reversed_ticket['Ticket_num'].str[::-1], 'coerce')
df_full['Ticket_ext'] = reversed_ticket['Ticket_ext1'].str[::-1] #+ reversed_ticket['Ticket_ext2'].str[::-1]

#### Family size and survival

In [14]:
df_full['LastName'] = df_full['Name'].str.extract('^([a-zA-Z]+)')

In [15]:
df_full['FamSize'] = df_full['SibSp'] + df_full['Parch'] + 1
pd.crosstab(df_full['FamSize'], df_full['Survived'], normalize='index').round(3)

Survived,0,1
FamSize,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.696,0.304
2,0.447,0.553
3,0.422,0.578
4,0.276,0.724
5,0.8,0.2
6,0.864,0.136
7,0.667,0.333
8,1.0,0.0
11,1.0,0.0


The following lines calculate the percentage of family that is known to survive. This is not including the observed person. This should make the validation tests more accurate.

In [16]:
df_full['FamSurvived'] = (df_full.groupby(['LastName', 'FamSize', 'Cabin'], dropna = False)['Survived'].transform(np.sum) - df_full['Survived'])/(df_full['FamSize'] - 1)
df_full.loc[df_full['Survived'].isna(), 'FamSurvived'] = df_full.groupby(['LastName', 'FamSize', 'Cabin'], dropna = False)['Survived'].transform(np.nanmean)

df_full.loc[df_full['FamSize'] == 1, 'FamSurvived'] = pd.NA

df_full['FamSurvived'] = df_full['FamSurvived'].astype('float64')

#### Reviewing additions

In [17]:
pd.DataFrame(df_full.dtypes, columns = ['Type'])

Unnamed: 0,Type
Survived,Int64
Pclass,int64
Name,object
Sex,object
Age,float64
SibSp,int64
Parch,int64
Ticket,object
Fare,float64
Cabin,object


In [18]:
df_full.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title,Deck,Room,Ticket_num,Ticket_ext,LastName,FamSize,FamSurvived
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,Mr,,,21171.0,A/5,Braund,2,0.0
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Mrs,C,85.0,17599.0,PC,Cumings,2,0.0
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,Miss,,,3101282.0,STON/O2.,Heikkinen,1,
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,Mrs,C,123.0,113803.0,,Futrelle,2,0.0
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,Mr,,,373450.0,,Allen,1,


### Creating train, validation, and test sets

In [19]:
from sklearn.model_selection import train_test_split

test_x = df_full[df_full['Survived'].isna()]
train = df_full[~df_full['Survived'].isna()]

train, valid = train_test_split(train)
train_survived = train['Survived'].astype('int64')
valid_survived = valid['Survived'].astype('int64')

train_x = train.drop(['Survived'], axis=1)
valid_x = valid.drop(['Survived'], axis=1)
test_x = test_x.drop(['Survived'], axis=1)

### Creating a model

In [22]:
import tensorflow as tf
from tensorflow import keras

In [39]:
def create_nn():
    mod = keras.Sequential([
        keras.layers.Dense(units=2**8, activation=tf.nn.relu),
        keras.layers.Dense(units=2**9, activation=tf.nn.relu),
        keras.layers.Dense(units=2**10, activation=tf.nn.relu),
        keras.layers.Dense(units=2**11, activation=tf.nn.relu),
        keras.layers.Dense(units=2**10, activation=tf.nn.relu),
        keras.layers.Dense(units=2**9, activation=tf.nn.relu),
        keras.layers.Dense(units=2**8, activation=tf.nn.relu),
        keras.layers.Dense(2, activation=tf.nn.softmax)
    ])
    
    mod.compile(
        optimizer=tf.optimizers.Adam(learning_rate=0.001),
        loss=tf.losses.sparse_categorical_crossentropy,
        metrics=['accuracy']
    )
    return mod
    

### Creating a pipeline

In [40]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_extraction import text


categorical_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

pass_transformer = ColumnTransformer(
    remainder='passthrough',
    transformers=[])

transfrom_cols = ColumnTransformer(
    remainder='drop',
    verbose=False,
    sparse_threshold=0,
    transformers=[
        ('categorical transform', categorical_transformer, [
            'Pclass', 'Sex', 'Embarked', 'Deck', 'Title'
            ]),
        ('passthrough columns', pass_transformer, [
            'Age', 'SibSp', 'Parch', 'Fare', 'Room', 'Ticket_num', 'FamSize', 'FamSurvived'
            ])
    ])

it_imputer = IterativeImputer()

preprocessor = Pipeline(
    verbose=False,
    steps = [
        ('transform columns', transfrom_cols),
        ('multivatiate imputer', it_imputer)
    ])

model = create_nn()

Test preprocessor for array shape

In [41]:
preprocessor.fit_transform(train_x).shape

(668, 30)

In [663]:
preprocessor.fit_transform(test_x).shape

(418, 30)

In [664]:
preprocessor.fit_transform(valid_x).shape

(223, 30)

In [43]:
model.fit(preprocessor.fit_transform(train_x), train_survived, batch_size = 2, epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
 57/334 [====>.........................] - ETA: 0s - loss: 0.6630 - accuracy: 0.6140

KeyboardInterrupt: 