In [59]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.feature_selection import RFECV
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, MinMaxScaler, Normalizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
import xgboost as XGBClassifier
import seaborn as sns
import numpy as np

In [17]:
train = pd.read_pickle('../output/preprocessed_train.pkl')
test = pd.read_pickle('../output/preprocessed_test.pkl')

In [18]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 18 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            891 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          891 non-null object
Embarked       891 non-null object
cabin_cat      891 non-null object
family_size    891 non-null int64
family_cat     891 non-null object
Initial        891 non-null object
fare_range     891 non-null category
fare_cat       891 non-null category
dtypes: category(2), float64(2), int64(6), object(8)
memory usage: 113.3+ KB


In [19]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 17 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            418 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null object
Fare           418 non-null float64
Cabin          418 non-null object
Embarked       418 non-null object
cabin_cat      418 non-null object
family_size    418 non-null int64
family_cat     418 non-null object
Initial        418 non-null object
fare_range     418 non-null category
fare_cat       418 non-null category
dtypes: category(2), float64(2), int64(5), object(8)
memory usage: 50.0+ KB


In [21]:
test.columns

Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked', 'cabin_cat', 'family_size',
       'family_cat', 'Initial', 'fare_range', 'fare_cat'],
      dtype='object')

In [39]:
cat_cols = ['Pclass', 'Sex', 'Embarked', 'cabin_cat', 'family_cat', 'Initial', 'fare_cat']
num_cols = ['Age', 'Fare', 'family_size']

onehot_transformer = ColumnTransformer(transformers=[
                                                      ('num', MinMaxScaler(), num_cols),
                                                      ('cat', OneHotEncoder(sparse=True), cat_cols)
                                                      ],
                                        remainder='drop')
ordinal_transformer = ColumnTransformer(transformers=[
                                                      ('num', MinMaxScaler(), num_cols),
                                                      ('cat', OrdinalEncoder(), cat_cols)
                                                      ],
                                        remainder='drop')

# Feature Selection

In [63]:
rfe_pipe = Pipeline(steps=[('trasnform', ordinal_transformer), ('tree', DecisionTreeClassifier())])
rfe = RFECV(rfe_pipe, cv=3)
rfe.fit(train.drop('Survived', axis=1), train.Survived.values)

ValueError: could not convert string to float: 'Braund, Mr. Owen Harris'

# SVM

## One hot coding

In [25]:
Onehot_transformer = OneHotEncoder(sparse=True)
svm = SVC()
clf = Pipeline(steps=[('preprocessor', 'passthrough'), 
                      ('scaler', 'passthrough'), 
                      ('clf', svm)])#the last step has to be an estimator with 'score' function. 
                                    #'SVM' object will be replaced by other estimators later in the params_grid

In [55]:
params = [{'preprocessor': [onehot_transformer, ordinal_transformer],
          'scaler': [None],
          'clf__C': [0.1, 1, 10],
          'clf__gamma': ['auto', 'scale'],
          'clf__kernel': ['linear', 'sigmoid', 'rbf']},
          {'preprocessor': [ordinal_transformer],
          'scaler': [None],
          'clf': [RandomForestClassifier()],
          #'clf__':[0.2],
           #'clf__loss':['deviance', 'exponential'],
          'clf__max_depth': [None, 2, 6, 10],
          'clf__criterion': ['gini', 'entropy'],
          'clf__oob_score': [True]}]
grid = GridSearchCV(clf, params, cv=3, verbose=True, return_train_score=True)

In [56]:
grid.fit(train.drop('Survived', axis=1), train.Survived.values)

Fitting 3 folds for each of 44 candidates, totalling 132 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("S

  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
[Parallel(n_jobs=1)]: Done 132 out of 132 | elapsed:    7.4s finished


GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('preprocessor', 'passthrough'),
                                       ('scaler', 'passthrough'),
                                       ('clf',
                                        SVC(C=1.0, cache_size=200,
                                            class_weight=None, coef0=0.0,
                                            decision_function_shape='ovr',
                                            degree=3, gamma='auto_deprecated',
                                            kernel='rbf', max_iter=-1,
                                            probability=False,
                                            random_state=None, shrinking=True,
                                            tol=0.0...
                                                             transformers=[('num',
                                                                        

In [57]:
grid.best_params_

{'clf__C': 0.1,
 'clf__gamma': 'scale',
 'clf__kernel': 'rbf',
 'preprocessor': ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
                   transformer_weights=None,
                   transformers=[('num',
                                  MinMaxScaler(copy=True, feature_range=(0, 1)),
                                  ['Age', 'Fare', 'family_size']),
                                 ('cat',
                                  OneHotEncoder(categorical_features=None,
                                                categories=None, drop=None,
                                                dtype=<class 'numpy.float64'>,
                                                handle_unknown='error',
                                                n_values=None, sparse=True),
                                  ['Pclass', 'Sex', 'Embarked', 'cabin_cat',
                                   'family_cat', 'Initial', 'fare_cat'])],
                   verbose=False),
 'scaler

In [58]:
grid.best_score_

0.8327721661054994

In [117]:
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
params = [{'preprocessor':[OneHotEncoder()],
           'scaler':[None],
           'clf':[SVC()],
           'clf__C':[0.1, 1, 10], 
           'clf__gamma':['auto', 'scale'],
           'clf__kernel': ['linear', 'sigmoid', 'rbf']},
          {'preprocessor':[OrdinalEncoder()],
           'scaler':[None, MinMaxScaler()],
           'clf':[RandomForestClassifier(), GradientBoostingClassifier()],
           'clf__n_estimators':[100, 300],
           'clf__max_features': max_features,
           'clf__max_depth': [None, 3, 5],
           'clf__min_samples_split': min_samples_split,
           'clf__min_samples_leaf': min_samples_leaf}]
grid_search = GridSearchCV(estimator=clf, param_grid=params, cv=3, return_train_score=True, n_jobs=-1, verbose=True, error_score=0)
grid_search.fit(train.drop(columns=['Survived']), train.Survived)

Fitting 3 folds for each of 210 candidates, totalling 630 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  43 tasks      | elapsed:    4.3s
[Parallel(n_jobs=-1)]: Done 213 tasks      | elapsed:   36.7s
[Parallel(n_jobs=-1)]: Done 463 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 630 out of 630 | elapsed:  2.1min finished


GridSearchCV(cv=3, error_score=0,
             estimator=Pipeline(memory=None,
                                steps=[('preprocessor', 'passthrough'),
                                       ('scaler', 'passthrough'),
                                       ('clf',
                                        SVC(C=1.0, cache_size=200,
                                            class_weight=None, coef0=0.0,
                                            decision_function_shape='ovr',
                                            degree=3, gamma='auto_deprecated',
                                            kernel='rbf', max_iter=-1,
                                            probability=False,
                                            random_state=None, shrinking=True,
                                            tol=0.001, verbose=False))...
                          'clf__max_features': ['auto', 'sqrt'],
                          'clf__min_samples_leaf': [2, 4],
                          'clf

In [119]:
grid_search.best_score_

0.8305274971941639

In [121]:
pd.DataFrame(grid_search.cv_results_)[['params', 'mean_test_score', 'mean_train_score']]

Unnamed: 0,params,mean_test_score,mean_train_score
0,"{'clf': SVC(C=10, cache_size=200, class_weight...",0.828283,0.828283
1,"{'clf': SVC(C=10, cache_size=200, class_weight...",0.734007,0.746914
2,"{'clf': SVC(C=10, cache_size=200, class_weight...",0.773288,0.790123
3,"{'clf': SVC(C=10, cache_size=200, class_weight...",0.828283,0.828283
4,"{'clf': SVC(C=10, cache_size=200, class_weight...",0.799102,0.806958
...,...,...,...
205,{'clf': GradientBoostingClassifier(criterion='...,0.814815,0.868126
206,{'clf': GradientBoostingClassifier(criterion='...,0.814815,0.863636
207,{'clf': GradientBoostingClassifier(criterion='...,0.817059,0.865320
208,{'clf': GradientBoostingClassifier(criterion='...,0.812570,0.868126


In [156]:
test_new = preprocess(test)

In [160]:
test['Survived'] = grid_search.predict(test_new)

In [161]:
submission_df = test[['PassengerId', 'Survived']]

In [164]:
submission_df.to_csv('../output/titanic_submission.csv', index=False)