# Titanic: Machine Learning from Disaster

In [473]:
import pandas as pd

from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

In [474]:
train = pd.read_csv('kg-data/train.csv')
train.shape

(891, 12)

In [475]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [623]:
train['Cabin'].unique()
len(train['Cabin'].unique())
labels, uniques = train['Embarked'].factorize()
labels
uniques

Index(['S', 'C', 'Q'], dtype='object')

In [682]:
def normalize(df):
#     return (df-df.mean())/df.std()
    return (df - df.min()) / (df.max() - df.min())

def extract_features(df):
    X = df[['Pclass', 'Sex', 'Age', 'Fare', 'Cabin', 'Embarked']]

    X.loc[:, 'Sex'] = X['Sex'].factorize()[0]
    X.loc[:, 'Age'] = normalize(X['Age'])
    X.loc[:, 'Fare'] = normalize(X['Fare'])
    X.loc[:, 'Cabin'] = X['Cabin'].factorize()[0]
    X.loc[:, 'Embarked'] = X['Embarked'].factorize()[0]
    X.loc[:, 'Fsize'] = df[['SibSp', 'Parch']].sum(axis=1)    
    X = X.fillna(-1)

    if 'Survived' in df.columns:
        Y = df['Survived']
        return X, Y
    return X, None

X, Y = extract_features(train)
print(X.head())

X = X.values
Y = Y.values

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


   Pclass  Sex       Age      Fare  Cabin  Embarked  Fsize
0       3    0  0.271174  0.014151     -1         0      1
1       1    1  0.472229  0.139136      0         1      1
2       3    1  0.321438  0.015469     -1         0      0
3       1    1  0.434531  0.103644      1         0      1
4       3    0  0.434531  0.015713     -1         0      0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)


In [681]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, shuffle=False)
print('train/test: %d/%d' % (len(X_train), len(X_test)))

X_fet_len = X_train.shape[1]

train/test: 712/179


## Shallow model

In [130]:
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [654]:
# model = SVC(kernel='linear', gamma='auto')
# model = SVC(kernel='poly', degree=6, gamma='auto')
model = SVC(kernel='rbf', gamma='auto')
# model = SVC(kernel='sigmoid', gamma='auto')
model.fit(X_train, Y_train) 

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [655]:
Y_pred = model.predict(X_test)

In [656]:
print(confusion_matrix(Y_test, Y_pred))  
print(classification_report(Y_test, Y_pred))

[[96 19]
 [13 51]]
              precision    recall  f1-score   support

           0       0.88      0.83      0.86       115
           1       0.73      0.80      0.76        64

    accuracy                           0.82       179
   macro avg       0.80      0.82      0.81       179
weighted avg       0.83      0.82      0.82       179



## Deep model

In [502]:
from keras.models import Model
from keras.layers import Input, Dense, Flatten, Conv2D, AveragePooling2D, Dropout
from keras.models import load_model

Using TensorFlow backend.


In [666]:
def MyModel_v1():
    features = Input(shape=(X_fet_len, ), dtype='float32')
    X = Dense(20, activation='relu')(features)
    X = Dense(20, activation='relu')(X)
    X = Dense(20, activation='relu')(X)
    X = Dense(1, activation='sigmoid', kernel_initializer='random_normal')(X)
    model = Model(inputs=features, outputs=X)
    return model

In [667]:
del model

In [668]:
model = MyModel_v1()
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_10 (InputLayer)        (None, 7)                 0         
_________________________________________________________________
dense_31 (Dense)             (None, 20)                160       
_________________________________________________________________
dense_32 (Dense)             (None, 20)                420       
_________________________________________________________________
dense_33 (Dense)             (None, 20)                420       
_________________________________________________________________
dense_34 (Dense)             (None, 1)                 21        
Total params: 1,021
Trainable params: 1,021
Non-trainable params: 0
_________________________________________________________________


In [669]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [679]:
model.fit(X_train, Y_train, epochs=20, batch_size=16, shuffle=True)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x138d952b0>

In [680]:
loss, acc = model.evaluate(X_test, Y_test)
print("Test accuracy = ", acc)

Test accuracy =  0.8268156407931664


## Submission

In [527]:
sub_test = pd.read_csv('kg-data/test.csv', sep=',')
sub_test.shape

(418, 11)

In [561]:
sub_test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [605]:
X_sub_test, _ = extract_features(sub_test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [570]:
Y_sub_pred = model.predict(X_sub_test)

In [563]:
# keras
Y_sub_pred = [0 if i <=0.5 else 1 for i in Y_sub_pred]

In [571]:
sub_pred_df = pd.DataFrame(columns=['PassengerId', 'Survived'])
for i in range(len(sub_test)):
    pidx = sub_test['PassengerId'].loc[i]
    sub_pred_df.loc[i] = (pidx, Y_sub_pred[i])

In [572]:
sub_pred_df.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


In [551]:
sub_pred_df.to_csv('kg-data/submission.csv', sep=',', index=False)

## Kaggle

In [None]:
!kaggle competitions download -c titanic -p 'kg-data'

In [None]:
!kaggle competitions submit -c titanic -f 'kg-data/submission.csv' -m 'Message'