# Titanic: Machine Learning from Disaster

In [315]:
import pandas as pd

from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import KFold

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict

from keras.utils.np_utils import to_categorical
# from keras.utils.np_utils import normalize

import regex as re

from utils import *

In [316]:
train = pd.read_csv('kg-data/train.csv')
train.shape

(891, 12)

In [317]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [318]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [319]:
def print_feat_to_surv(feature):
    print(train[[feature, 'Survived']].groupby([feature], as_index=False).mean())

In [334]:
def get_title(name):
    title_search = re.search(' ([A-Za-z]+)\.', name)
    if title_search:
        return title_search.group(1)
    return ''

def preprocess_dataset(dataset):
    dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1
    
    dataset['IsAlone'] = 0
    dataset.loc[dataset['FamilySize'] == 1, 'IsAlone'] = 1
    
    dataset['Embarked'] = dataset['Embarked'].fillna('S')

    dataset['Fare'] = dataset['Fare'].fillna(dataset['Fare'].median())
    dataset['CategoricalFare'] = pd.qcut(dataset['Fare'], 4, labels=[str(i) for i in range(4)])
    dataset['CategoricalFare'] = dataset['CategoricalFare'].astype(int)

    age_avg = dataset['Age'].mean()
    age_std = dataset['Age'].std()
    age_null_count = dataset['Age'].isnull().sum()
#     print('age_avg: %f, age_std: %f, age_null_count: %d' % (age_avg, age_std, age_null_count))
    age_null_random_list = np.random.randint(age_avg - age_std, age_avg + age_std)
    # print(age_null_random_list)
    dataset.loc[np.isnan(dataset['Age']), 'Age'] = age_null_random_list
    dataset['Age'] = dataset['Age'].astype(int)
    dataset['CategoricalAge'] = pd.cut(train['Age'], 5, labels=[str(i) for i in range(5)])
    dataset['CategoricalAge'] = dataset['CategoricalAge'].astype(int)


    dataset['Title'] = dataset['Name'].apply(get_title)
#     print(pd.crosstab(dataset['Title'], dataset['Sex']))
    dataset['Title'] = dataset['Title'].replace(['Lady', 'Countess', 'Capt', 'Col', \
                                                 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
    dataset['Title'] = dataset['Title'].replace('Mlle', 'Miss')
    dataset['Title'] = dataset['Title'].replace('Ms', 'Miss')
    dataset['Title'] = dataset['Title'].replace('Mme', 'Mrs')
    dataset['Title'] = dataset['Title'].fillna(0)
    dataset['Title'] = dataset['Title'].map({"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5})

    dataset['Sex'] = dataset['Sex'].map({'female': 0, 'male': 1}).astype(int)

    dataset['Embarked'] = dataset['Embarked'].map({'S': 0, 'C': 1, 'Q': 2}).astype(int)
    
def extract_features_(dataset):
    dataset = dataset[['Survived', 'Pclass', 'Sex', 'CategoricalAge', 'CategoricalFare', \
                       'Embarked', 'IsAlone', 'Title']]
    # print(dataset.head())
    X = dataset.drop(['Survived'], axis=1).values
    Y = dataset[['Survived']].values.reshape((-1,))
    return X, Y

def extract_features_pred(dataset):
    dataset = dataset[['Pclass', 'Sex', 'CategoricalAge', 'CategoricalFare', \
                       'Embarked', 'IsAlone', 'Title']]
    # print(dataset.head())
    X = dataset.values
    return X

In [321]:
preprocess_dataset(train)

In [296]:
print_feat_to_surv('Pclass')

   Pclass  Survived
0       1  0.629630
1       2  0.472826
2       3  0.242363


In [297]:
print_feat_to_surv('Sex')

   Sex  Survived
0    0  0.742038
1    1  0.188908


In [298]:
print_feat_to_surv('FamilySize')

   FamilySize  Survived
0           1  0.303538
1           2  0.552795
2           3  0.578431
3           4  0.724138
4           5  0.200000
5           6  0.136364
6           7  0.333333
7           8  0.000000
8          11  0.000000


In [299]:
print_feat_to_surv('IsAlone')

   IsAlone  Survived
0        0  0.505650
1        1  0.303538


In [300]:
print_feat_to_surv('Embarked')

   Embarked  Survived
0         0  0.339009
1         1  0.553571
2         2  0.389610


In [301]:
print_feat_to_surv('CategoricalFare')

   CategoricalFare  Survived
0                0  0.197309
1                1  0.303571
2                2  0.454955
3                3  0.581081


In [302]:
print_feat_to_surv('CategoricalAge')

   CategoricalAge  Survived
0               0  0.550000
1               1  0.344762
2               2  0.403226
3               3  0.434783
4               4  0.090909


In [303]:
print_feat_to_surv('Title')

   Title  Survived
0      1  0.156673
1      2  0.702703
2      3  0.793651
3      4  0.575000
4      5  0.347826


In [305]:
X, Y = extract_features(train)

In [307]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.1, shuffle=False)
print('train/test: %d/%d' % (len(X_train), len(X_test)))

X_fet_len = X_train.shape[1]

train/test: 801/90


## Shallow model

In [None]:
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [340]:
# model = SVC(kernel='linear', gamma='auto', C=0.1)
# model = SVC(kernel='poly', degree=8, gamma='auto')
model = SVC(kernel='rbf', gamma='auto')
# model = SVC(kernel='sigmoid', gamma='auto')

# scores = cross_val_score(model, X, Y, cv=10)
# print(np.mean(scores))

Y_pred = cross_val_predict(model, X, Y, cv=10)
print(classification_report(Y, Y_pred))

model.fit(X, Y)

0.8170619112473044
              precision    recall  f1-score   support

           0       0.82      0.91      0.86       549
           1       0.82      0.67      0.74       342

    accuracy                           0.82       891
   macro avg       0.82      0.79      0.80       891
weighted avg       0.82      0.82      0.81       891



SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

## Deep model

In [None]:
from keras.models import Model
from keras.layers import Input, Dense, Flatten, Conv2D, AveragePooling2D, Dropout
from keras.models import load_model

In [None]:
def MyModel_v1():
    features = Input(shape=(X_fet_len, ), dtype='float32')
    X = Dense(20, activation='relu')(features)
    X = Dense(20, activation='relu')(X)
    X = Dense(1, activation='sigmoid', kernel_initializer='random_normal')(X)
    model = Model(inputs=features, outputs=X)
    return model

In [None]:
del model

In [None]:
model = MyModel_v1()
model.summary()

In [None]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
history = model.fit(X_train, Y_train, epochs=20, batch_size=8, validation_split=0.1)

In [None]:
plot_model_accuracy(history)
plot_model_loss(history)

In [None]:
loss, acc = model.evaluate(X_test, Y_test)
print("Test loss = ", loss)
print("Test accuracy = ", acc)

## Submission

In [330]:
sub_test = pd.read_csv('kg-data/test.csv', sep=',')
sub_test.shape

(418, 11)

In [331]:
sub_test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [335]:
# preprocess_dataset(sub_test)
X_sub_test = extract_features_pred(sub_test)

In [336]:
Y_sub_pred = model.predict(X_sub_test)

In [None]:
# keras
Y_sub_pred = model.predict(X_sub_test).flatten()
Y_sub_pred = (Y_sub_pred >= 0.5).astype(int)

In [337]:
ids = sub_test['PassengerId']
labels = pd.Series(Y_sub_pred, name="Survived")
submission = pd.concat([ids, labels], axis=1)
submission.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0


In [338]:
submission.to_csv('kg-data/submission.csv', sep=',', index=False)

## Kaggle

In [None]:
!kaggle competitions download -c titanic -p 'kg-data'

In [339]:
!kaggle competitions submit -c titanic -f 'kg-data/submission.csv' -m 'Sina preprocessing'

100%|██████████████████████████████████████| 2.77k/2.77k [00:00<00:00, 23.8kB/s]
Successfully submitted to Titanic: Machine Learning from Disaster