In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.feature_selection import RFECV, SelectFromModel
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, MinMaxScaler, Normalizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
import xgboost as XGB
import lightgbm
import seaborn as sns
import numpy as np

import tensorflow as tf

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
train_raw = pd.read_pickle('../output/preprocessed_train.pkl')
test = pd.read_pickle('../output/preprocessed_test.pkl')

# Keep a hold out test set

In [3]:
validation = train_raw.sample(frac=0.2)

train = train_raw.loc[~train_raw.index.isin(validation.index), :]

print(train_raw.shape[0], train.shape[0], validation.shape[0])

891 713 178


In [4]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 713 entries, 1 to 890
Data columns (total 18 columns):
PassengerId    713 non-null int64
Survived       713 non-null int64
Pclass         713 non-null int64
Name           713 non-null object
Sex            713 non-null object
Age            713 non-null float64
SibSp          713 non-null int64
Parch          713 non-null int64
Ticket         713 non-null object
Fare           713 non-null float64
Cabin          713 non-null object
Embarked       713 non-null object
cabin_cat      713 non-null object
family_size    713 non-null int64
family_cat     713 non-null object
Initial        713 non-null object
fare_range     713 non-null category
fare_cat       713 non-null category
dtypes: category(2), float64(2), int64(6), object(8)
memory usage: 96.3+ KB


In [5]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 17 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            418 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null object
Fare           418 non-null float64
Cabin          418 non-null object
Embarked       418 non-null object
cabin_cat      418 non-null object
family_size    418 non-null int64
family_cat     418 non-null object
Initial        418 non-null object
fare_range     418 non-null category
fare_cat       418 non-null category
dtypes: category(2), float64(2), int64(5), object(8)
memory usage: 50.0+ KB


# Convntional Models

## Feature Engineering and Selection

In [13]:
cat_cols = ['Pclass', 'Sex', 'Embarked', 'cabin_cat', 'family_cat', 'Initial', 'fare_cat']
num_cols = ['Age', 'Fare', 'family_size']

onehot_transformer = ColumnTransformer(transformers=[
                                                      ('num', MinMaxScaler(), num_cols),
                                                      ('cat', OneHotEncoder(sparse=True), cat_cols)
                                                      ],
                                        remainder='drop')
ordinal_transformer = ColumnTransformer(transformers=[
                                                      ('num', MinMaxScaler(), num_cols),
                                                      ('cat', OrdinalEncoder(), cat_cols)
                                                      ],
                                        remainder='drop')
rfe = RFECV(ExtraTreeClassifier(), cv=3, verbose=True, n_jobs=1)
coef = SelectFromModel(LogisticRegression())

In [12]:
x_train = ordinal_transformer.fit_transform(train)
y_train = train.Survived.values
x_val = ordinal_transformer.fit_transform(validation)
y_val = validation.Survived.values

In [15]:
# params = {'objective': 'binary'}
# lgbm = lightgbm.sklearn.LGBMClassifier()
# lgbm.fit(x_train, y_train)
# lgbm.score(x_val, y_val)
# x_test = ordinal_transformer.fit_transform(test)
# test['Survived'] = lgbm.predict(x_test)
# test[['PassengerId', 'Survived']].to_csv('../output/titanic_submission.csv', index=False)

## Pipeline

In [55]:
pipeline = Pipeline(steps=[('transform', 'passthrough'), ('feature', 'passthrough'), ('clf', SVC())])
params = [
#          {'transform':[onehot_transformer],
#           'feature': [rfe, coef],
#           'clf__C': [10],
#           'clf__gamma': ['scale'],
#           'clf__kernel': ['sigmoid', 'rbf']},
          {'transform': [ordinal_transformer],
           'feature': [rfe, coef],
           'clf': [XGB.XGBClassifier()],
           'clf__booster': ['dart', 'gblinear'],
           'clf__learning_rate': [0.03, 0.1, 1],
#            'clf__colsample_bytree': [0.3, 0.7],
           'clf__max_depth': [2, 3, 5],
#            'clf__subsample': [0.4, 0.6],
           'clf__max_delta_step': [0, 1, 5]}]
#          {'transform':[onehot_transformer, ordinal_transformer],
#           'feature': [rfe, coef],
#           'clf': [GradientBoostingClassifier(), RandomForestClassifier()],
#           'clf__max_depth': [None, 2, 6, 10],
#           'clf__criterion': ['gini', 'entropy'],
#           'clf__min_samples_split': [2, 0.1, 0.2],
#           'clf__min_samples_leaf': [1, 10, 0.1]}]
grid_cv = GridSearchCV(pipeline, param_grid=params, cv=3, refit=True, return_train_score=True, verbose=True, n_jobs=2)
grid_cv.fit(train.drop('Survived', axis=1), train.Survived.values)

Fitting 3 folds for each of 108 candidates, totalling 324 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    5.4s
[Parallel(n_jobs=2)]: Done 206 tasks      | elapsed:   20.2s
[Parallel(n_jobs=2)]: Done 324 out of 324 | elapsed:   29.9s finished


GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('transform', 'passthrough'),
                                       ('feature', 'passthrough'),
                                       ('clf',
                                        SVC(C=1.0, cache_size=200,
                                            class_weight=None, coef0=0.0,
                                            decision_function_shape='ovr',
                                            degree=3, gamma='auto_deprecated',
                                            kernel='rbf', max_iter=-1,
                                            probability=False,
                                            random_state=None, shrinking=True,
                                            tol=0.001...
                                                          transformer_weights=None,
                                                          transformers=

In [56]:
grid_cv.best_params_

{'clf': XGBClassifier(base_score=0.5, booster='dart', colsample_bylevel=1,
               colsample_bynode=1, colsample_bytree=1, gamma=0,
               learning_rate=0.03, max_delta_step=0, max_depth=2,
               min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
               nthread=None, objective='binary:logistic', random_state=0,
               reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
               silent=None, subsample=1, verbosity=1),
 'clf__booster': 'dart',
 'clf__learning_rate': 0.03,
 'clf__max_delta_step': 0,
 'clf__max_depth': 2,
 'feature': SelectFromModel(estimator=LogisticRegression(C=1.0, class_weight=None,
                                              dual=False, fit_intercept=True,
                                              intercept_scaling=1, l1_ratio=None,
                                              max_iter=100, multi_class='warn',
                                              n_jobs=None, penalty='l2',
                 

In [57]:
pd.DataFrame(grid_cv.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_clf,param_clf__booster,param_clf__learning_rate,param_clf__max_delta_step,param_clf__max_depth,param_feature,...,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,mean_train_score,std_train_score
0,0.262289,0.078859,0.010239,0.001296,"XGBClassifier(base_score=0.5, booster='dart', ...",dart,0.03,0,2,"RFECV(cv=3,\n estimator=ExtraTreeClassifi...",...,0.819328,0.797468,0.817672,0.015823,33,0.850526,0.835789,0.813025,0.833114,0.015426
1,0.105795,0.019028,0.012084,0.003389,"XGBClassifier(base_score=0.5, booster='dart', ...",dart,0.03,0,2,SelectFromModel(estimator=LogisticRegression(C...,...,0.848739,0.843882,0.840112,0.008985,1,0.840000,0.837895,0.842437,0.840111,0.001856
2,0.364722,0.074895,0.012093,0.001845,"XGBClassifier(base_score=0.5, booster='dart', ...",dart,0.03,0,3,"RFECV(cv=3,\n estimator=ExtraTreeClassifi...",...,0.815126,0.839662,0.830295,0.010833,14,0.869474,0.865263,0.850840,0.861859,0.007979
3,0.123749,0.007642,0.014017,0.001154,"XGBClassifier(base_score=0.5, booster='dart', ...",dart,0.03,0,3,SelectFromModel(estimator=LogisticRegression(C...,...,0.848739,0.843882,0.840112,0.008985,1,0.840000,0.842105,0.842437,0.841514,0.001079
4,0.405107,0.091628,0.010538,0.002196,"XGBClassifier(base_score=0.5, booster='dart', ...",dart,0.03,0,5,"RFECV(cv=3,\n estimator=ExtraTreeClassifi...",...,0.823529,0.848101,0.827489,0.015429,17,0.907368,0.890526,0.899160,0.899018,0.006876
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
103,0.058923,0.017342,0.014170,0.006144,"XGBClassifier(base_score=0.5, booster='dart', ...",gblinear,1,5,2,SelectFromModel(estimator=LogisticRegression(C...,...,0.617647,0.616034,0.617111,0.000760,80,0.616842,0.616842,0.617647,0.617110,0.000379
104,0.161414,0.018194,0.014159,0.004197,"XGBClassifier(base_score=0.5, booster='dart', ...",gblinear,1,5,3,"RFECV(cv=3,\n estimator=ExtraTreeClassifi...",...,0.647059,0.616034,0.628331,0.013467,69,0.625263,0.646316,0.617647,0.629742,0.012125
105,0.059702,0.020582,0.014083,0.006109,"XGBClassifier(base_score=0.5, booster='dart', ...",gblinear,1,5,3,SelectFromModel(estimator=LogisticRegression(C...,...,0.617647,0.616034,0.617111,0.000760,80,0.616842,0.616842,0.617647,0.617110,0.000379
106,0.121450,0.003288,0.010102,0.001017,"XGBClassifier(base_score=0.5, booster='dart', ...",gblinear,1,5,5,"RFECV(cv=3,\n estimator=ExtraTreeClassifi...",...,0.617647,0.628692,0.642356,0.027494,55,0.680000,0.616842,0.621849,0.639564,0.028666


In [58]:
grid_cv.best_score_

0.8401122019635343

In [59]:
grid_cv.score(validation.drop('Survived', axis=1), validation.Survived)

0.797752808988764

## Deep Learning

In [126]:
X = onehot_transformer.fit_transform(train.drop('Survived', axis=1))
y = train.Survived.values

In [129]:
x_train, x_test, y_train, y_test = train_test_split(X, y)

In [194]:
x_train.shape

(534, 24)

In [239]:
dl_model = tf.keras.Sequential(
    [tf.keras.layers.Dense(50, activation='relu', input_shape=(24,)),
#      tf.keras.layers.Dropout(0.2),
#      tf.keras.layers.Dense(30, activation='relu'),
#      tf.keras.layers.Dropout(0.2),
     tf.keras.layers.Dense(30, activation='relu'),
     tf.keras.layers.Dropout(0.5),
     tf.keras.layers.Dense(10, activation='relu'),
     tf.keras.layers.Dropout(0.5),
     tf.keras.layers.Dense(5, activation='relu'),
     tf.keras.layers.Dense(1, activation='sigmoid')])
dl_model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'])


In [240]:
dl_model.fit(x_train, y_train, validation_data=(x_test, y_test), batch_size=32, epochs=50)

Train on 534 samples, validate on 179 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x1f24071b7c8>

In [241]:
x_val = onehot_transformer.fit_transform(validation.drop('Survived', axis=1))
y_val = validation.Survived.values

In [242]:
dl_model.evaluate(x_test, y_test, batch_size=32)



[0.40359946685796344, 0.82122904]

# Submit to Kaggle

In [11]:
def kaggle(grid_cv, test, dl=False):
    if dl:
        test_transformed = onehot_transformer.fit_transform(test)
        test['Survived'] = grid_cv.predict(test_transformed).round(0).astype(int)
    else:
        test['Survived'] = grid_cv.predict(test)
    print(test[['PassengerId', 'Survived']])
    submission_df = test[['PassengerId', 'Survived']]
    submission_df.to_csv('../output/titanic_submission.csv', index=False)

In [60]:
kaggle(lgbm, test)

     PassengerId  Survived
0            892         0
1            893         1
2            894         0
3            895         0
4            896         1
..           ...       ...
413         1305         0
414         1306         1
415         1307         0
416         1308         0
417         1309         1

[418 rows x 2 columns]


In [234]:
kaggle(dl_model, test, dl=True)

     PassengerId  Survived
0            892         0
1            893         0
2            894         0
3            895         0
4            896         0
..           ...       ...
413         1305         0
414         1306         1
415         1307         0
416         1308         0
417         1309         1

[418 rows x 2 columns]
