In [6]:
import pandas as pd
import numpy as np
import matplotlib
import seaborn as sns
import csv
import re

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import PolynomialFeatures

import keras
from keras import backend as K
from keras.models import Sequential
from keras.layers import Activation, Dropout
from keras.layers.core import Dense
from keras.optimizers import Adam
from keras.metrics import categorical_crossentropy
from keras.wrappers.scikit_learn import KerasClassifier

In [18]:
# Directory that holds train.csv / test.csv (also used later for the submission file).
root = '/Users/schwalmdaniel/github/kaggle/titanic'
#root = 'd:/dev/python/kaggle/titanic'

train=pd.read_csv(root + "/train.csv")
test=pd.read_csv(root + "/test.csv")

# Categorical columns that are one-hot encoded; '__' separates column name and value.
_DUMMY_COLUMNS = ['Pclass', 'Sex', 'Embarked','deck','ticket_prefix','name_prefix']
# One-hot groups whose categories may exist in only one of the two files.
_ALIGNED_PREFIXES = ('deck__', 'ticket_prefix__', 'name_prefix__')


def _ticket_prefix(ticket):
    """Return the first space-separated token of `ticket`, stripped of
    non-word characters and lower-cased, or None when that token is numeric."""
    head = ticket.split(' ')[0]
    if head.isnumeric():
        return None
    return re.sub(r'\W','', head).lower()


def _name_prefix(name):
    """Return the first word that follows the first comma in `name`,
    lower-cased (for "Doe, Mr. John" this yields "mr.")."""
    return name.lower().strip().split(',')[1].strip().split(' ')[0]


def _deck(cabin):
    """Return the first cabin entry of `cabin` with digits and spaces removed."""
    return re.sub(r'[\d ]','', cabin.split(' ')[0])


def _engineer_features(raw):
    """Build the model features for one raw passenger frame.

    Works on a copy, so `raw` is left untouched. Adds `ticket_prefix`,
    `name_prefix`, `deck` and `multicabin`, fills missing `Cabin` with
    'Unknown' and missing `Age` with the mean age of this same frame, drops
    `Name`, `Ticket` and `Cabin`, one-hot encodes `_DUMMY_COLUMNS`, and drops
    `Sex__male` to avoid the dummy-variable trap.
    """
    frame = raw.copy()
    frame['ticket_prefix'] = frame['Ticket'].apply(_ticket_prefix)
    frame['name_prefix'] = frame['Name'].apply(_name_prefix)
    frame = frame.drop(['Name','Ticket'],axis=1)

    # Assign the filled column back instead of `frame[col].fillna(..., inplace=True)`:
    # the chained in-place form is deprecated and does nothing under Copy-on-Write.
    frame['Cabin'] = frame['Cabin'].fillna('Unknown')
    frame['Age'] = frame['Age'].fillna(frame['Age'].mean())

    frame['deck'] = frame['Cabin'].apply(_deck)
    # A space in the cabin string means more than one cabin entry is listed.
    frame['multicabin'] = frame['Cabin'].apply(lambda cabin: 1 if ' ' in cabin else 0)
    frame = frame.drop(['Cabin'],axis=1)

    frame = pd.get_dummies(frame,
                   columns = _DUMMY_COLUMNS,  # which columns to dummify
                   prefix_sep='__')  # the separator between the prefix (column name) and cell value
    return frame.drop(['Sex__male'],axis=1) # drop because of dummy trap


# Only the test file gets its missing fares replaced (with 0).
test['Fare'] = test['Fare'].fillna(0)

train = _engineer_features(train)
test = _engineer_features(test)

# Keep only the deck / ticket-prefix / name-prefix dummies that exist in BOTH
# frames so that train and test end up with the same feature columns
# (this also removes train-only columns such as deck__T).
_unshared = [col for col in set(train.columns) ^ set(test.columns)
             if col.startswith(_ALIGNED_PREFIXES)]
train = train.drop(_unshared, axis=1, errors='ignore')
test = test.drop(_unshared, axis=1, errors='ignore')


# create our feature matrix by removing the id and the response variable
X = train.drop(['PassengerId','Survived'], axis=1)
print ("learning from {} rows".format(X.shape[0]))
y = train['Survived']


# NOTE(review): y_test is taken from `train`, so it holds the TRAINING labels,
# not labels for x_test — the name is misleading; kept for compatibility.
y_test = train['Survived']
x_test = test.drop(['PassengerId'],axis=1)


learning from 891 rows


In [19]:
# Scale every feature into [0, 1] using the min/max observed in the training data.
scaler = MinMaxScaler(feature_range=(0,1))

# Series.as_matrix() was removed in pandas 1.0; .values returns the same ndarray.
y_labels = y.values

x_samples = scaler.fit_transform(X.values)

# Apply the scaling learned on the training set. Calling fit_transform here
# would refit the scaler on the test set, so the same raw value would map to
# different scaled values in train and test.
x_test_samples = scaler.transform(x_test.values)


x_test_samples.shape


(418, 48)

In [20]:
# Seed NumPy's global random generator.
# NOTE(review): this does not by itself seed the Keras backend — confirm if
# fully reproducible training runs are required.
seed = 7
np.random.seed(seed)

def create_model(optimizer='Adam', input_dim=48):
    """Build and compile the fully connected survival classifier.

    Parameters
    ----------
    optimizer : str or keras optimizer
        Optimizer handed to ``model.compile``.
    input_dim : int
        Number of input features per sample. Defaults to 48, the value that
        was previously hard-coded in the first layer.

    Returns
    -------
    A compiled ``Sequential`` model: Dense layers of 64, 32, 32 and 32 ReLU
    units followed by a 2-unit softmax output, trained with
    sparse categorical cross-entropy and reporting accuracy.
    """
    model = Sequential([
        Dense(64, input_shape=(input_dim,), activation='relu'),
        Dense(32, activation='relu'),
        Dense(32, activation='relu'),
        Dense(32, activation='relu'),
        Dense(2, activation='softmax'),
    ])
    model.compile(optimizer=optimizer,loss='sparse_categorical_crossentropy',metrics=['accuracy'])
    return model

In [5]:

# Wrap the Keras network in a scikit-learn compatible estimator so that
# GridSearchCV can cross-validate it.
model = KerasClassifier(
    build_fn=create_model,
    epochs=50,
    batch_size=20,
    verbose=2,
)

# Candidate optimizers; each name is passed to create_model(optimizer=...).
optimizer = ['SGD', 'RMSprop', 'Adagrad', 'Adadelta', 'Adam', 'Adamax', 'Nadam']
param_grid = {'optimizer': optimizer}

grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1)
grid_result = grid.fit(x_samples, y_labels)

# Report the winning configuration, then the score of every candidate.
print("Best: {:f} using {}".format(grid_result.best_score_, grid_result.best_params_))

means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("{:f} ({:f}) with: {!r}".format(mean, stdev, param))

Epoch 1/50
Epoch 1/50
Epoch 1/50
Epoch 1/50
 - 1s - loss: 0.6946 - acc: 0.6027
Epoch 2/50
 - 1s - loss: 0.6795 - acc: 0.6397
Epoch 2/50
 - 1s - loss: 0.6938 - acc: 0.6061
Epoch 2/50
 - 0s - loss: 0.6535 - acc: 0.6397
Epoch 3/50
 - 0s - loss: 0.6661 - acc: 0.6027
Epoch 3/50
 - 0s - loss: 0.6687 - acc: 0.6061
Epoch 3/50
 - 0s - loss: 0.6456 - acc: 0.6027
Epoch 4/50
 - 0s - loss: 0.6337 - acc: 0.6397
Epoch 4/50
 - 1s - loss: 0.5857 - acc: 0.6987
Epoch 2/50
 - 0s - loss: 0.6513 - acc: 0.6061
Epoch 4/50
 - 0s - loss: 0.6167 - acc: 0.6397
Epoch 5/50
 - 0s - loss: 0.6262 - acc: 0.6027
Epoch 5/50
 - 0s - loss: 0.4423 - acc: 0.8199
Epoch 3/50
 - 0s - loss: 0.6357 - acc: 0.6061
Epoch 5/50
 - 0s - loss: 0.6004 - acc: 0.6397
Epoch 6/50
 - 0s - loss: 0.6080 - acc: 0.6178
Epoch 6/50
 - 0s - loss: 0.4045 - acc: 0.8350
Epoch 4/50
 - 0s - loss: 0.5834 - acc: 0.6397
Epoch 7/50
 - 0s - loss: 0.5881 - acc: 0.6633
Epoch 7/50
 - 0s - loss: 0.6206 - acc: 0.6094
Epoch 6/50
 - 0s - loss: 0.3760 - acc: 0.8401
 

Epoch 46/50
 - 0s - loss: 0.4108 - acc: 0.8333
 - 0s - loss: 0.3900 - acc: 0.8535
Epoch 45/50
Epoch 47/50
 - 0s - loss: 0.2636 - acc: 0.8872
Epoch 44/50
 - 0s - loss: 0.3547 - acc: 0.8620
Epoch 47/50
 - 0s - loss: 0.2551 - acc: 0.8906
Epoch 45/50
 - 0s - loss: 0.3884 - acc: 0.8519
 - 0s - loss: 0.4129 - acc: 0.8316
Epoch 48/50
Epoch 46/50
 - 0s - loss: 0.3526 - acc: 0.8636
Epoch 48/50
 - 0s - loss: 0.2531 - acc: 0.8906
 - 0s - loss: 0.3878 - acc: 0.8451
 - 0s - loss: 0.4104 - acc: 0.8333
Epoch 47/50
Epoch 49/50
Epoch 46/50
 - 0s - loss: 0.3518 - acc: 0.8586
Epoch 49/50
 - 0s - loss: 0.3853 - acc: 0.8468
 - 0s - loss: 0.2499 - acc: 0.8990
Epoch 47/50
 - 0s - loss: 0.4087 - acc: 0.8367
Epoch 48/50
Epoch 50/50
 - 0s - loss: 0.3508 - acc: 0.8603
Epoch 50/50
 - 0s - loss: 0.3840 - acc: 0.8552
 - 0s - loss: 0.2549 - acc: 0.9024
Epoch 48/50
 - 0s - loss: 0.4096 - acc: 0.8367
Epoch 49/50
 - 0s - loss: 0.3505 - acc: 0.8620
 - 0s - loss: 0.2536 - acc: 0.8939
Epoch 49/50
 - 0s - loss: 0.4070 - ac

 - 0s - loss: 0.2664 - acc: 0.8923
Epoch 39/50
 - 0s - loss: 0.3019 - acc: 0.8855
Epoch 37/50
 - 0s - loss: 0.3272 - acc: 0.8670
Epoch 41/50
 - 0s - loss: 0.2980 - acc: 0.8822
Epoch 41/50
 - 0s - loss: 0.2653 - acc: 0.8872
Epoch 40/50
 - 0s - loss: 0.3018 - acc: 0.8838
Epoch 38/50
 - 0s - loss: 0.3236 - acc: 0.8653
Epoch 42/50
 - 0s - loss: 0.2983 - acc: 0.8737
 - 0s - loss: 0.2624 - acc: 0.8872
Epoch 42/50
Epoch 41/50
 - 0s - loss: 0.3053 - acc: 0.8838
Epoch 39/50
 - 0s - loss: 0.3348 - acc: 0.8636
Epoch 43/50
 - 0s - loss: 0.3046 - acc: 0.8788
Epoch 43/50
 - 0s - loss: 0.2651 - acc: 0.8906
Epoch 42/50
 - 0s - loss: 0.3007 - acc: 0.8822
Epoch 40/50
 - 0s - loss: 0.3224 - acc: 0.8788
Epoch 44/50
 - 0s - loss: 0.2621 - acc: 0.8889
Epoch 43/50
 - 0s - loss: 0.2962 - acc: 0.8838
Epoch 44/50
 - 0s - loss: 0.3251 - acc: 0.8653
 - 0s - loss: 0.2990 - acc: 0.8754
Epoch 41/50
Epoch 45/50
 - 0s - loss: 0.2611 - acc: 0.8923
 - 0s - loss: 0.3001 - acc: 0.8805
Epoch 45/50
 - 0s - loss: 0.2991 - ac

Epoch 28/50
 - 0s - loss: 0.3197 - acc: 0.8636
Epoch 41/50
 - 0s - loss: 0.3052 - acc: 0.8704
 - 0s - loss: 0.2795 - acc: 0.8872
Epoch 32/50
Epoch 32/50
 - 0s - loss: 0.3199 - acc: 0.8704
Epoch 42/50
 - 0s - loss: 0.3458 - acc: 0.8535
Epoch 29/50
 - 0s - loss: 0.2771 - acc: 0.8872
 - 0s - loss: 0.3009 - acc: 0.8838
Epoch 33/50
Epoch 33/50
 - 0s - loss: 0.3470 - acc: 0.8603
Epoch 30/50
 - 0s - loss: 0.3173 - acc: 0.8754
 - 0s - loss: 0.2800 - acc: 0.8805
Epoch 43/50
Epoch 34/50
 - 0s - loss: 0.3035 - acc: 0.8872
Epoch 34/50
 - 0s - loss: 0.3458 - acc: 0.8586
Epoch 31/50
 - 0s - loss: 0.3157 - acc: 0.8754
Epoch 44/50
 - 0s - loss: 0.2832 - acc: 0.8771
Epoch 35/50
 - 0s - loss: 0.3011 - acc: 0.8805
Epoch 35/50
 - 0s - loss: 0.3140 - acc: 0.8653
Epoch 45/50
 - 0s - loss: 0.2766 - acc: 0.8822
Epoch 36/50
 - 0s - loss: 0.3435 - acc: 0.8653
 - 0s - loss: 0.3037 - acc: 0.8771
Epoch 36/50
Epoch 32/50
 - 0s - loss: 0.3120 - acc: 0.8704
Epoch 46/50
 - 0s - loss: 0.2797 - acc: 0.8855
Epoch 37/50
 

 - 0s - loss: 0.3399 - acc: 0.8535
 - 0s - loss: 0.2493 - acc: 0.8939
Epoch 24/50
Epoch 34/50
 - 0s - loss: 0.2894 - acc: 0.8889
Epoch 26/50
 - 0s - loss: 0.3113 - acc: 0.8805
 - 0s - loss: 0.3297 - acc: 0.8636
Epoch 25/50
Epoch 25/50
 - 0s - loss: 0.2570 - acc: 0.8923
Epoch 35/50
 - 0s - loss: 0.2885 - acc: 0.8838
Epoch 27/50
 - 0s - loss: 0.2951 - acc: 0.8788
Epoch 26/50
 - 0s - loss: 0.3194 - acc: 0.8704
Epoch 26/50
 - 0s - loss: 0.2882 - acc: 0.8838
Epoch 28/50
 - 0s - loss: 0.2569 - acc: 0.8906
Epoch 36/50
 - 0s - loss: 0.2980 - acc: 0.8855
Epoch 27/50
 - 0s - loss: 0.2831 - acc: 0.8889
Epoch 29/50
 - 0s - loss: 0.3238 - acc: 0.8620
Epoch 27/50
 - 0s - loss: 0.2499 - acc: 0.8990
Epoch 37/50
 - 0s - loss: 0.2968 - acc: 0.8838
Epoch 28/50
 - 0s - loss: 0.2753 - acc: 0.8906
Epoch 30/50
 - 0s - loss: 0.3281 - acc: 0.8687
Epoch 28/50
 - 0s - loss: 0.2420 - acc: 0.8906
Epoch 38/50
 - 0s - loss: 0.2904 - acc: 0.8855
Epoch 29/50
 - 0s - loss: 0.2743 - acc: 0.8822
Epoch 31/50
 - 0s - loss:

Epoch 16/50
 - 0s - loss: 0.3218 - acc: 0.8805
Epoch 28/50
 - 0s - loss: 0.3161 - acc: 0.8838
Epoch 15/50
 - 0s - loss: 0.3444 - acc: 0.8552
Epoch 25/50
 - 0s - loss: 0.2722 - acc: 0.8805
Epoch 17/50
 - 0s - loss: 0.3237 - acc: 0.8805
Epoch 29/50
 - 0s - loss: 0.3093 - acc: 0.8737
Epoch 16/50
 - 0s - loss: 0.3451 - acc: 0.8636
Epoch 26/50
 - 0s - loss: 0.2656 - acc: 0.8956
Epoch 18/50
 - 0s - loss: 0.3158 - acc: 0.8771
Epoch 30/50
 - 0s - loss: 0.3067 - acc: 0.8788
Epoch 17/50
 - 0s - loss: 0.3461 - acc: 0.8586
Epoch 27/50
 - 0s - loss: 0.2714 - acc: 0.8872
Epoch 19/50
 - 0s - loss: 0.3076 - acc: 0.8754
Epoch 18/50
 - 0s - loss: 0.3150 - acc: 0.8855
Epoch 31/50
 - 0s - loss: 0.3379 - acc: 0.8620
Epoch 28/50
 - 0s - loss: 0.2715 - acc: 0.8855
Epoch 20/50
 - 0s - loss: 0.3096 - acc: 0.8855
Epoch 32/50
 - 0s - loss: 0.3058 - acc: 0.8721
Epoch 19/50
 - 0s - loss: 0.3441 - acc: 0.8552
Epoch 29/50
 - 0s - loss: 0.2640 - acc: 0.8923
 - 0s - loss: 0.3172 - acc: 0.8822
Epoch 21/50
Epoch 33/50
 

Epoch 4/50
 - 0s - loss: 0.4111 - acc: 0.8272
Epoch 5/50
 - 0s - loss: 0.4035 - acc: 0.8350
Epoch 6/50
 - 0s - loss: 0.3921 - acc: 0.8384
Epoch 7/50
 - 0s - loss: 0.3849 - acc: 0.8462
Epoch 8/50
 - 0s - loss: 0.3776 - acc: 0.8485
Epoch 9/50
 - 0s - loss: 0.3745 - acc: 0.8586
Epoch 10/50
 - 0s - loss: 0.3764 - acc: 0.8530
Epoch 11/50
 - 0s - loss: 0.3677 - acc: 0.8552
Epoch 12/50
 - 0s - loss: 0.3642 - acc: 0.8597
Epoch 13/50
 - 0s - loss: 0.3613 - acc: 0.8586
Epoch 14/50
 - 0s - loss: 0.3598 - acc: 0.8620
Epoch 15/50
 - 0s - loss: 0.3557 - acc: 0.8530
Epoch 16/50
 - 0s - loss: 0.3524 - acc: 0.8563
Epoch 17/50
 - 0s - loss: 0.3515 - acc: 0.8653
Epoch 18/50
 - 0s - loss: 0.3476 - acc: 0.8563
Epoch 19/50
 - 0s - loss: 0.3468 - acc: 0.8653
Epoch 20/50
 - 0s - loss: 0.3426 - acc: 0.8653
Epoch 21/50
 - 0s - loss: 0.3423 - acc: 0.8608
Epoch 22/50
 - 0s - loss: 0.3381 - acc: 0.8608
Epoch 23/50
 - 0s - loss: 0.3355 - acc: 0.8642
Epoch 24/50
 - 0s - loss: 0.3346 - acc: 0.8653
Epoch 25/50
 - 0s -

In [23]:
# Train the final network with the Adamax optimizer, holding out the last 20%
# of the training samples for validation.
model = create_model(optimizer='Adamax')
model.fit(x_samples,y_labels, batch_size=10,epochs=20, verbose=2, validation_split=0.2)

# Sequential.predict_classes is deprecated and absent from newer Keras
# releases; for a softmax output it is equivalent to taking the arg-max over
# the class-probability axis of predict().
class_probabilities = model.predict(x_test_samples,batch_size=10,verbose=0)
preds = np.argmax(class_probabilities, axis=-1)

Train on 712 samples, validate on 179 samples
Epoch 1/20
 - 1s - loss: 0.6262 - acc: 0.6952 - val_loss: 0.5006 - val_acc: 0.8045
Epoch 2/20
 - 0s - loss: 0.4756 - acc: 0.7907 - val_loss: 0.4030 - val_acc: 0.8045
Epoch 3/20
 - 0s - loss: 0.4387 - acc: 0.8048 - val_loss: 0.3836 - val_acc: 0.8268
Epoch 4/20
 - 0s - loss: 0.4217 - acc: 0.8188 - val_loss: 0.3649 - val_acc: 0.8492
Epoch 5/20
 - 0s - loss: 0.4133 - acc: 0.8329 - val_loss: 0.3645 - val_acc: 0.8603
Epoch 6/20
 - 0s - loss: 0.4071 - acc: 0.8329 - val_loss: 0.3572 - val_acc: 0.8547
Epoch 7/20
 - 0s - loss: 0.4019 - acc: 0.8385 - val_loss: 0.3557 - val_acc: 0.8603
Epoch 8/20
 - 0s - loss: 0.3979 - acc: 0.8385 - val_loss: 0.3539 - val_acc: 0.8659
Epoch 9/20
 - 0s - loss: 0.3948 - acc: 0.8399 - val_loss: 0.3517 - val_acc: 0.8715
Epoch 10/20
 - 0s - loss: 0.3916 - acc: 0.8385 - val_loss: 0.3526 - val_acc: 0.8771
Epoch 11/20
 - 0s - loss: 0.3859 - acc: 0.8399 - val_loss: 0.3486 - val_acc: 0.8659
Epoch 12/20
 - 0s - loss: 0.3840 - acc:

In [24]:
# Assemble the submission table: one row per test passenger with the
# predicted class, both columns stored as integers.
predicted = pd.DataFrame({
    'PassengerId': test['PassengerId'],
    'Survived': preds,
}).astype(int)

# Write the submission file next to the input data, without the index column.
submission_path = root + '/submission_keras_modelnew.csv'
predicted.to_csv(submission_path, index=False, quoting=csv.QUOTE_NONNUMERIC)

predicted.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0
