In [120]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.feature_selection import RFECV, SelectFromModel
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, MinMaxScaler, Normalizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
#import xgboost as XGBClassifier
import seaborn as sns
import numpy as np

import tensorflow as tf

In [111]:
# Load the pickled train and test frames from ../output/.
# NOTE(review): unpickling can execute arbitrary code, so only load pickle
# files that come from a trusted source.
train_raw = pd.read_pickle('../output/preprocessed_train.pkl')
test = pd.read_pickle('../output/preprocessed_test.pkl')

# Keep a hold-out validation set

In [112]:
# Hold out 20% of the labelled rows. The fixed seed keeps the split (and every
# score derived from it) identical across kernel restarts.
validation = train_raw.sample(frac=0.2, random_state=42)

# Every row that was not sampled into the hold-out set is used for training.
train = train_raw.loc[~train_raw.index.isin(validation.index), :]

# The two splits must partition the raw frame exactly; this fails if
# train_raw carries duplicate index labels.
assert train.shape[0] + validation.shape[0] == train_raw.shape[0]

print(train_raw.shape[0], train.shape[0], validation.shape[0])

891 713 178


In [113]:
# Column dtypes and non-null counts of the training split.
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 713 entries, 0 to 890
Data columns (total 18 columns):
PassengerId    713 non-null int64
Survived       713 non-null int64
Pclass         713 non-null int64
Name           713 non-null object
Sex            713 non-null object
Age            713 non-null float64
SibSp          713 non-null int64
Parch          713 non-null int64
Ticket         713 non-null object
Fare           713 non-null float64
Cabin          713 non-null object
Embarked       713 non-null object
cabin_cat      713 non-null object
family_size    713 non-null int64
family_cat     713 non-null object
Initial        713 non-null object
fare_range     713 non-null category
fare_cat       713 non-null category
dtypes: category(2), float64(2), int64(6), object(8)
memory usage: 96.3+ KB


In [114]:
# Column dtypes and non-null counts of the Kaggle test frame.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 17 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            418 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null object
Fare           418 non-null float64
Cabin          418 non-null object
Embarked       418 non-null object
cabin_cat      418 non-null object
family_size    418 non-null int64
family_cat     418 non-null object
Initial        418 non-null object
fare_range     418 non-null category
fare_cat       418 non-null category
dtypes: category(2), float64(2), int64(5), object(8)
memory usage: 50.0+ KB


# Conventional Models

## Feature Engineering and Selection

In [115]:
# Columns fed to the models; every other column is dropped (remainder='drop').
cat_cols = ['Pclass', 'Sex', 'Embarked', 'cabin_cat', 'family_cat', 'Initial', 'fare_cat']
num_cols = ['Age', 'Fare', 'family_size']

# Min-max scale the numeric columns and one-hot encode the categoricals.
# handle_unknown='ignore' encodes a category that was absent during fit as an
# all-zero group instead of raising, so a CV fold, the hold-out rows or the
# Kaggle test frame can be transformed even if they contain an unseen level.
onehot_transformer = ColumnTransformer(
    transformers=[
        ('num', MinMaxScaler(), num_cols),
        ('cat', OneHotEncoder(sparse=True, handle_unknown='ignore'), cat_cols),
    ],
    remainder='drop')

# Same numeric scaling, but categoricals are mapped to integer codes.
ordinal_transformer = ColumnTransformer(
    transformers=[
        ('num', MinMaxScaler(), num_cols),
        ('cat', OrdinalEncoder(), cat_cols),
    ],
    remainder='drop')

# Two alternative feature selectors tried by the grid search:
# recursive feature elimination with 3-fold CV on an extra-tree classifier ...
rfe = RFECV(ExtraTreeClassifier(), cv=3, verbose=True, n_jobs=2)
# ... and selection driven by a logistic-regression model.
coef = SelectFromModel(LogisticRegression())

## Pipeline

In [116]:
# Three-step pipeline: the 'transform' and 'feature' steps start as
# 'passthrough' placeholders and are filled in by the parameter grid below.
pipeline = Pipeline(steps=[('transform', 'passthrough'), ('feature', 'passthrough'), ('clf', SVC())])
# Active grid: one-hot transformer x {rfe, coef} selectors x two SVC kernels
# (4 candidates in total).
params = [{'transform':[onehot_transformer],
          'feature': [rfe, coef],
          'clf__C': [10],
          'clf__gamma': ['scale'],
          'clf__kernel': ['sigmoid', 'rbf']}]
# Disabled second grid (tree ensembles); kept for reference only.
#          {'transform':[onehot_transformer, ordinal_transformer],
#           'feature': [rfe, coef],
#           'clf': [GradientBoostingClassifier(), RandomForestClassifier()],
#           'clf__max_depth': [None, 2, 6, 10],
#           'clf__criterion': ['gini', 'entropy'],
#           'clf__min_samples_split': [2, 0.1, 0.2],
#           'clf__min_samples_leaf': [1, 10, 0.1]}]
# NOTE(review): the recorded cv_results_ show NaN scores for both RFECV
# candidates, i.e. those fits appear to have failed — consider passing
# error_score='raise' while debugging to surface the underlying exception.
grid_cv = GridSearchCV(pipeline, param_grid=params, cv=3, refit=True, return_train_score=True, verbose=True, n_jobs=2)
# The raw frame (minus the target) goes in; the pipeline's transform step picks
# the columns it needs.
grid_cv.fit(train.drop('Survived', axis=1), train.Survived.values)

Fitting 3 folds for each of 4 candidates, totalling 12 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  12 out of  12 | elapsed:    7.0s finished


GridSearchCV(cv=3, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('transform', 'passthrough'),
                                       ('feature', 'passthrough'),
                                       ('clf',
                                        SVC(C=1.0, break_ties=False,
                                            cache_size=200, class_weight=None,
                                            coef0=0.0,
                                            decision_function_shape='ovr',
                                            degree=3, gamma='scale',
                                            kernel='rbf', max_iter=-1,
                                            probability=False,
                                            random_state=None, shrinking=True,
                                            tol=0.001, verbose=...
                                                                         MinMaxScaler(copy=True,
            

In [117]:
# Parameter combination of the best-ranked candidate.
grid_cv.best_params_

{'clf__C': 10,
 'clf__gamma': 'scale',
 'clf__kernel': 'rbf',
 'feature': SelectFromModel(estimator=LogisticRegression(C=1.0, class_weight=None,
                                              dual=False, fit_intercept=True,
                                              intercept_scaling=1, l1_ratio=None,
                                              max_iter=100, multi_class='auto',
                                              n_jobs=None, penalty='l2',
                                              random_state=None, solver='lbfgs',
                                              tol=0.0001, verbose=0,
                                              warm_start=False),
                 max_features=None, norm_order=1, prefit=False, threshold=None),
 'transform': ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
                   transformer_weights=None,
                   transformers=[('num',
                                  MinMaxScaler(copy=True, feature_range=(0,

In [119]:
# Per-candidate timings and train/test scores for every CV split.
pd.DataFrame(grid_cv.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_clf__C,param_clf__gamma,param_clf__kernel,param_feature,param_transform,params,...,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,mean_train_score,std_train_score
0,0.258678,0.010819,0.0,0.0,10,scale,sigmoid,"RFECV(cv=3,\n estimator=ExtraTreeClassifi...","ColumnTransformer(n_jobs=None, remainder='drop...","{'clf__C': 10, 'clf__gamma': 'scale', 'clf__ke...",...,,,,,3,,,,,
1,0.037874,0.006353,0.014282,0.004651,10,scale,sigmoid,SelectFromModel(estimator=LogisticRegression(C...,"ColumnTransformer(n_jobs=None, remainder='drop...","{'clf__C': 10, 'clf__gamma': 'scale', 'clf__ke...",...,0.647059,0.691983,0.671838,0.018631,2,0.690526,0.665263,0.707983,0.687924,0.017537
2,0.212473,0.068276,0.0,0.0,10,scale,rbf,"RFECV(cv=3,\n estimator=ExtraTreeClassifi...","ColumnTransformer(n_jobs=None, remainder='drop...","{'clf__C': 10, 'clf__gamma': 'scale', 'clf__ke...",...,,,,,4,,,,,
3,0.043241,0.00966,0.01287,0.001989,10,scale,rbf,SelectFromModel(estimator=LogisticRegression(C...,"ColumnTransformer(n_jobs=None, remainder='drop...","{'clf__C': 10, 'clf__gamma': 'scale', 'clf__ke...",...,0.810924,0.831224,0.813489,0.013555,1,0.844211,0.84,0.829832,0.838014,0.006036


In [90]:
# Mean cross-validated score of the best candidate.
# NOTE(review): the value recorded below does not match the best
# mean_test_score in the cv_results_ table above; the execution counts show the
# cells ran out of order, so re-run the notebook top to bottom before trusting it.
grid_cv.best_score_

0.8247172286636174

In [92]:
# Score the refitted best pipeline on the hold-out rows it never saw in the search.
grid_cv.score(validation.drop('Survived', axis=1), validation.Survived)

0.848314606741573

## Deep Learning

In [126]:
# Fit the one-hot / min-max transformer on the training split and encode it;
# the target column is kept out of the feature matrix.
X = onehot_transformer.fit_transform(train.drop(columns=['Survived']))
y = train['Survived'].values

In [129]:
# Split the encoded training rows for the neural network. The fixed seed makes
# the split, and therefore the reported loss/accuracy, reproducible on re-run.
x_train, x_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [194]:
# (rows, encoded feature columns) of the network's training matrix.
x_train.shape

(534, 24)

In [239]:
# Fully connected binary classifier with a sigmoid output.
# The input width is read from the encoded training matrix rather than
# hard-coded, so the network stays in sync with the transformer output if the
# feature columns or their categories change.
dl_model = tf.keras.Sequential(
    [tf.keras.layers.Dense(50, activation='relu', input_shape=(x_train.shape[1],)),
#      tf.keras.layers.Dropout(0.2),
#      tf.keras.layers.Dense(30, activation='relu'),
#      tf.keras.layers.Dropout(0.2),
     tf.keras.layers.Dense(30, activation='relu'),
     tf.keras.layers.Dropout(0.5),
     tf.keras.layers.Dense(10, activation='relu'),
     tf.keras.layers.Dropout(0.5),
     tf.keras.layers.Dense(5, activation='relu'),
     tf.keras.layers.Dense(1, activation='sigmoid')])
# Binary cross-entropy matches the single sigmoid output; accuracy is tracked
# as the reporting metric.
dl_model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'])


In [240]:
# Train for 50 epochs in batches of 32; (x_test, y_test) is only used to report
# validation metrics after each epoch.
dl_model.fit(x_train, y_train, validation_data=(x_test, y_test), batch_size=32, epochs=50)

Train on 534 samples, validate on 179 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x1f24071b7c8>

In [241]:
# Encode the hold-out rows with the transformer that was already fitted on the
# training split. Calling fit_transform here would re-learn the scaling ranges
# and the one-hot column layout from the validation rows, so the encoded
# columns would no longer line up with the ones dl_model was trained on.
x_val = onehot_transformer.transform(validation.drop('Survived', axis=1))
y_val = validation.Survived.values

In [242]:
# Reports [loss, accuracy] on the x_test split, which was also passed to fit()
# as validation_data.
# NOTE(review): x_val / y_val (the hold-out rows) are built in the previous
# cell but never evaluated — confirm whether this call was meant to use them.
dl_model.evaluate(x_test, y_test, batch_size=32)



[0.40359946685796344, 0.82122904]

# Submit to Kaggle

In [233]:
def kaggle(grid_cv, test, dl=False):
    """Predict survival for the test frame and write the Kaggle submission CSV.

    Parameters
    ----------
    grid_cv : object with a ``predict`` method
        Fitted model. With ``dl=False`` it receives the raw ``test`` frame;
        with ``dl=True`` it receives the encoded feature matrix and its
        predictions are rounded to 0/1 labels.
    test : pandas.DataFrame
        Test rows; must contain ``PassengerId``. A ``Survived`` column is
        added to this frame in place.
    dl : bool, default False
        Set to True when ``grid_cv`` needs pre-encoded features.

    Notes
    -----
    With ``dl=True`` the module-level ``onehot_transformer`` must already be
    fitted on the training data. It is only applied here, never refitted:
    refitting on the test rows would re-learn the scaling ranges and the
    one-hot column layout, so the features would not match what the model
    was trained on.
    """
    if dl:
        test_transformed = onehot_transformer.transform(test)
        # Flatten (n, 1) model output to one value per row before rounding.
        scores = np.asarray(grid_cv.predict(test_transformed)).ravel()
        test['Survived'] = scores.round(0).astype(int)
    else:
        test['Survived'] = grid_cv.predict(test)
    submission_df = test[['PassengerId', 'Survived']]
    print(submission_df)
    submission_df.to_csv('../output/titanic_submission.csv', index=False)

In [234]:
# Write the submission using the neural network's predictions.
kaggle(dl_model, test, dl=True)

     PassengerId  Survived
0            892         0
1            893         0
2            894         0
3            895         0
4            896         0
..           ...       ...
413         1305         0
414         1306         1
415         1307         0
416         1308         0
417         1309         1

[418 rows x 2 columns]
