# **Predict Survival - Modelling & Evaluation**

## Objectives

* Fit and evaluate a classification model to predict if a passenger will survive.

## Inputs

* outputs/datasets/collection/titanic_passengers.csv
* Instructions on which variables to use for data cleaning and feature engineering, as described in previous notebooks.

## Outputs

* Train set (features and target)
* Test set (features and target)
* Data cleaning and Feature Engineering pipeline
* Modelling pipeline
* Feature importance plot



---

## Set up the Working Directory

In [None]:
import os

# Step one level up from the folder the notebook starts in, so that relative
# paths such as "outputs/..." resolve from the parent directory.
# Re-running this cell moves up one more level each time.
os.chdir(os.path.dirname(os.getcwd()))
current_dir = os.getcwd()
current_dir

## Load Collected Data

In [None]:
import pandas as pd
# Path is relative to the working directory set earlier in the notebook.
df_raw_path = "outputs/datasets/collection/titanic_passengers.csv"
df = pd.read_csv(df_raw_path)
# Preview the first rows of the loaded data.
df.head()

---

## Pipeline

ML Pipeline with Data Cleaning and Feature Engineering stages from previous notebooks. As the distribution of the target variable ('Survived') is imbalanced, the pipeline will be split into two stages.

In [None]:
from sklearn.pipeline import Pipeline
from feature_engine.selection import DropFeatures
from feature_engine.encoding import OneHotEncoder
from feature_engine.imputation import CategoricalImputer, MeanMedianImputer

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier

def pipeline_dc_fe():
  """Build the (unfitted) data cleaning and feature engineering pipeline.

  Steps, in order: drop 'Cabin', 'PassengerId', 'Ticket' and 'Name';
  impute 'Age' with the median; fill missing 'Embarked' values with the
  label 'Missing'; one-hot encode 'Sex' and 'Embarked', keeping every
  category (drop_last=False).
  """
  drop_step = DropFeatures(features_to_drop=['Cabin', 'PassengerId', 'Ticket','Name'])
  age_imputer = MeanMedianImputer(variables=['Age'], imputation_method='median')
  embarked_imputer = CategoricalImputer(
      imputation_method='missing', fill_value='Missing', variables=['Embarked'])
  encoder = OneHotEncoder(variables=['Sex', 'Embarked'], drop_last=False)

  return Pipeline([
      ('drop', drop_step),
      ('median', age_imputer),
      ('categorical_imputer', embarked_imputer),
      ('categorical_encoder', encoder),
  ])

pipeline_dc_fe()

ML Pipeline for Modelling and Hyperparameter Optimisation

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectFromModel

def pipeline_clf(model):
  """Wrap `model` in a scale -> select features -> model pipeline.

  The same estimator object is handed to SelectFromModel (to choose the
  features) and used as the final 'model' step.
  """
  steps = [
      ("scaler", StandardScaler()),
      ("feat_selection", SelectFromModel(model)),
      ("model", model),
  ]
  return Pipeline(steps)

 

## Hyperparameter Optimization

The code below is a custom class, used with permission from the Code Institute 'Churnometer' walkthrough project.

In [None]:
from sklearn.model_selection import GridSearchCV
import numpy as np



class HyperparameterOptimizationSearch:
    """Run one GridSearchCV per candidate model and tabulate the CV scores.

    `models` maps a model name to an estimator; `params` maps the same names
    to the parameter grid for that estimator. Each estimator is wrapped with
    `pipeline_clf` before it is searched, so grid keys must address pipeline
    steps (e.g. 'model__n_estimators').
    """

    def __init__(self, models, params):
        self.models = models
        self.params = params
        self.keys = models.keys()
        # Fitted GridSearchCV objects, keyed by model name (filled by fit()).
        self.grid_searches = {}

    def fit(self, X, y, cv, n_jobs, verbose=1, scoring=None, refit=False):
        """Fit a GridSearchCV for every model and store it in grid_searches.

        `cv`, `n_jobs`, `verbose` and `scoring` are forwarded to GridSearchCV.
        `refit` is accepted for backward compatibility but is NOT forwarded:
        GridSearchCV keeps its own default (refit=True in scikit-learn), so
        `best_estimator_` stays available on the stored searches.

        Raises KeyError if `params` has no entry for one of the models.
        """
        for key in self.keys:
            print(f"\nRunning GridSearchCV for {key} \n")

            model = pipeline_clf(self.models[key])
            params = self.params[key]
            gs = GridSearchCV(model, params, cv=cv, n_jobs=n_jobs,
                              verbose=verbose, scoring=scoring)
            gs.fit(X, y)
            self.grid_searches[key] = gs

    def score_summary(self, sort_by='mean_score'):
        """Return (summary table, grid_searches) for all fitted searches.

        The table has one row per (model, parameter combination) with the
        min / mean / max / std of the per-fold test scores, followed by one
        column per searched parameter. Rows are sorted by `sort_by`,
        best first.

        Raises ValueError if no search has been fitted yet.
        """
        rows = []
        for name, gs in self.grid_searches.items():
            results = gs.cv_results_
            candidates = results['params']
            # Use the fitted n_splits_ rather than the `cv` argument: `cv`
            # may be None, a splitter object or an iterable, not only an int.
            fold_scores = np.column_stack([
                results[f"split{i}_test_score"] for i in range(gs.n_splits_)
            ])
            # fold_scores has one row per candidate, one column per fold.
            for candidate, scores in zip(candidates, fold_scores):
                stats = {
                    'estimator': name,
                    'min_score': min(scores),
                    'max_score': max(scores),
                    'mean_score': np.mean(scores),
                    'std_score': np.std(scores),
                }
                rows.append(pd.Series({**candidate, **stats}))

        if not rows:
            raise ValueError(
                "No grid search results available; call fit() first.")

        df = pd.concat(rows, axis=1).T.sort_values([sort_by], ascending=False)
        columns = ['estimator', 'min_score',
                   'mean_score', 'max_score', 'std_score']
        columns = columns + [c for c in df.columns if c not in columns]
        return df[columns], self.grid_searches


## Split Train and Test Set

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows as the test set; random_state=0 fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=['Survived']),
    df['Survived'],
    test_size=0.2,
    random_state=0,
)

print(*(part.shape for part in (X_train, y_train, X_test, y_test)))

## Handle Target Imbalance

First of all, the data cleaning and feature engineering steps are applied to the X_train and X_test sets.

In [None]:
# Fit the cleaning / feature-engineering pipeline on the train set only,
# then apply the already-fitted pipeline to the test set.
pipeline_data_cleaning_feat_eng = pipeline_dc_fe()
X_train = pipeline_data_cleaning_feat_eng.fit_transform(X_train)
X_test = pipeline_data_cleaning_feat_eng.transform(X_test)
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

## Check Distribution

In [None]:
import matplotlib.pyplot as plt
# Bar chart of how many train rows fall into each 'Survived' class.
y_train.value_counts().plot(kind='bar', title='Train Set Target Distribution')
plt.show()

The plot above shows that the distribution of the target variable (Survived) is a little imbalanced.

Use the Synthetic Minority Oversampling Technique (SMOTE) to balance the train set.

In [None]:
from imblearn.over_sampling import SMOTE
# Oversample the minority class of the train set (seeded with random_state=0).
# X_test / y_test are not resampled.
oversample = SMOTE(sampling_strategy='minority', random_state=0)
X_train, y_train = oversample.fit_resample(X_train, y_train)
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

In [None]:
# Same bar chart as before, now on the resampled train target.
y_train.value_counts().plot(kind='bar', title='Train Set Target Distribution After Oversampling')
plt.show()

## Grid Search

The following steps will identify the best Model and Hyperparameters for training the model.

In [None]:
# One candidate per algorithm, each seeded with random_state=0 and otherwise
# left at its default settings.
models_quick_search = {
    "LogisticRegression": LogisticRegression(random_state=0),
    "XGBClassifier": XGBClassifier(random_state=0),
    "DecisionTreeClassifier": DecisionTreeClassifier(random_state=0),
    "RandomForestClassifier": RandomForestClassifier(random_state=0),
    "GradientBoostingClassifier": GradientBoostingClassifier(random_state=0),
    "ExtraTreesClassifier": ExtraTreesClassifier(random_state=0),
    "AdaBoostClassifier": AdaBoostClassifier(random_state=0),
}

# An empty grid for every model: only the settings above are evaluated.
params_quick_search = {model_name: {} for model_name in models_quick_search}


In [None]:
from sklearn.metrics import make_scorer, accuracy_score
# Cross-validate every candidate pipeline with 5 folds, scored on accuracy.
search = HyperparameterOptimizationSearch(models_quick_search, params_quick_search)
search.fit(X_train, y_train, cv=5, n_jobs=-1,
           scoring=make_scorer(accuracy_score))

In [None]:

# score_summary returns the score table (sorted by `sort_by`, best first)
# and the dict of fitted GridSearchCV objects keyed by model name.
grid_search_summary, grid_search_pipelines = search.score_summary(sort_by='mean_score')
grid_search_summary 

This quick search reveals that the three best models, all with mean scores above 0.8, are RandomForestClassifier, GradientBoostingClassifier and ExtraTreesClassifier.

---

## Extensive Search


The following parameters will be examined to find the best set for the model.

In [None]:
# Candidates kept for the extensive search, each seeded with random_state=0.
models_search = {
    "GradientBoostingClassifier": GradientBoostingClassifier(random_state=0),
    "ExtraTreesClassifier": ExtraTreesClassifier(random_state=0),
    "RandomForestClassifier": RandomForestClassifier(random_state=0),
}

# Grid keys carry the 'model__' prefix because each estimator is searched as
# the 'model' step of the pipeline built by pipeline_clf.
# Grid sizes: GradientBoosting 3*4*3*4*3*3 = 1296 combinations;
# ExtraTrees and RandomForest 5*2*5*3*3*3 = 1350 combinations each.
# Each combination is fitted once per CV fold, so this search is slow.
params_search = {
    "GradientBoostingClassifier": {
        'model__learning_rate': [0.1, 0.01, 0.001],
        'model__n_estimators': [100, 200, 300, 400],
        'model__subsample': [1.0, 0.8, 0.6],
        'model__max_depth': [None, 1, 3, 5],
        'model__min_samples_split': [2, 5, 10],
        'model__min_samples_leaf': [1, 2, 4],
    },
    "ExtraTreesClassifier": {
        'model__n_estimators': [100, 200, 300, 400, 500],
        'model__criterion': ['gini', 'entropy'],
        'model__max_depth': [None, 10, 20, 30, 40],
        'model__min_samples_split': [2, 5, 10],
        'model__min_samples_leaf': [1, 2, 4],
        'model__max_features': [None, 'sqrt', 'log2'],
    },
    "RandomForestClassifier": {
        'model__n_estimators': [100, 200, 300, 400, 500],
        'model__criterion': ['gini', 'entropy'],
        'model__max_depth': [None, 10, 20, 30, 40],
        'model__min_samples_split': [2, 5, 10],
        'model__min_samples_leaf': [1, 2, 4],
        'model__max_features': [None, 'sqrt', 'log2'],
    }
}

In [None]:
from sklearn.metrics import make_scorer, accuracy_score
# Run the full grids with 5-fold cross-validation, scored on accuracy.
search = HyperparameterOptimizationSearch(models_search, params_search)
search.fit(X_train, y_train, cv=5, n_jobs=-1,
           scoring=make_scorer(accuracy_score))

In [None]:
# One row per (model, parameter combination), best mean CV score first.
grid_search_summary, grid_search_pipelines = search.score_summary(sort_by='mean_score')
grid_search_summary 

Get the best model programmatically:


In [None]:
# The summary is sorted by mean_score (descending) and 'estimator' is its
# first column, so cell [0, 0] holds the name of the top-ranked model.
best_model = grid_search_summary.iloc[0,0]
best_model

Get the best parameters for this model programmatically:

In [None]:
# Parameter combination that GridSearchCV selected for that model.
best_parameters = grid_search_pipelines[best_model].best_params_
best_parameters

Therefore, the best pipeline is:


In [None]:
# NOTE: this rebinds the name `pipeline_clf` from the pipeline-building
# function to the best fitted pipeline. HyperparameterOptimizationSearch.fit
# calls pipeline_clf(...), so the function must be defined again before
# another search is run (a later cell in this notebook redefines it).
pipeline_clf = grid_search_pipelines[best_model].best_estimator_
pipeline_clf

Assess feature importance

In [None]:
X_train.head(3)

In [None]:

# Pair the columns kept by the feature-selection step with the importances
# reported by the final model, then rank them from most to least important.
df_feature_importance = pd.DataFrame(data={
    'Feature': X_train.columns[pipeline_clf['feat_selection'].get_support()],
    'Importance': pipeline_clf['model'].feature_importances_,
})
df_feature_importance = df_feature_importance.sort_values(
    by='Importance', ascending=False)

best_features = df_feature_importance['Feature'].to_list()

print(f"* These are the {len(best_features)} most important features in descending order. "
      f"The model was trained on them: \n{df_feature_importance['Feature'].to_list()}")

df_feature_importance.plot(kind='bar', x='Feature', y='Importance')
plt.show()

## Evaluate Pipeline

In [None]:
from sklearn.metrics import classification_report, confusion_matrix


def confusion_matrix_and_report(X, y, pipeline, label_map):
    """Print a labelled confusion matrix and a classification report.

    X, y: features and true target to evaluate on.
    pipeline: fitted object exposing predict(X).
    label_map: class display names, in class order.
    """
    prediction = pipeline.predict(X)

    print('---  Confusion Matrix  ---')
    # The predictions are deliberately passed as `y_true` so that they land
    # on the rows, matching the "Prediction" row / "Actual" column labels.
    matrix = confusion_matrix(y_true=prediction, y_pred=y)
    actual_labels = ["Actual " + label for label in label_map]
    prediction_labels = ["Prediction " + label for label in label_map]
    matrix_df = pd.DataFrame(
        matrix, columns=[actual_labels], index=[prediction_labels])
    print(matrix_df)
    print("\n")

    print('---  Classification Report  ---')
    report = classification_report(y, prediction, target_names=label_map)
    print(report, "\n")


def clf_performance(X_train, y_train, X_test, y_test, pipeline, label_map):
    """Print the confusion matrix and report for the train set, then the test set."""
    sections = (
        ("#### Train Set #### \n", X_train, y_train),
        ("#### Test Set ####\n", X_test, y_test),
    )
    for heading, features, target in sections:
        print(heading)
        confusion_matrix_and_report(features, target, pipeline, label_map)

In [None]:
# Report performance of the best pipeline on both sets. X_train / y_train
# are the oversampled train set at this point; the test set is not resampled.
clf_performance(X_train=X_train, y_train=y_train,
                X_test=X_test, y_test=y_test,
                pipeline=pipeline_clf,
                label_map= ['Did Not Survive', 'Survived'] 
                )

This model and these parameters produce results that meet the stated success metric (80% precision for both Survived and Did Not Survive).

It may be possible to produce similar results using only the most important features. This is explored below.

## Refit pipeline with best features

In [None]:
best_features

As `'Sex'` has been encoded, it needs restating below.

In [None]:
# Column names as they appear in the collected data (before one-hot encoding).
# NOTE(review): hard-coded by hand — confirm it still matches the feature
# importance output if the search is re-run.
best_features = ['Sex','Fare','Age','Pclass']

## Redefine cleaning and feature engineering

In [None]:
def pipeline_dc_fe():
  """Build the reduced cleaning / encoding pipeline for the best features.

  Imputes 'Age' with the median and one-hot encodes 'Sex', keeping every
  category (drop_last=False). No columns are dropped here.
  """
  age_imputer = MeanMedianImputer(variables=['Age'], imputation_method='median')
  sex_encoder = OneHotEncoder(variables=['Sex'], drop_last=False)

  return Pipeline([
      ('median', age_imputer),
      ('categorical_encoder', sex_encoder),
  ])

In [None]:
def pipeline_clf(model):
  """Return a pipeline that standardises the inputs and then applies `model`.

  This version has no feature-selection step.
  """
  steps = [
      ("scaler", StandardScaler()),
      ("model", model),
  ]
  return Pipeline(steps)

## Split Train and Test, considering only best features

In [None]:
from sklearn.model_selection import train_test_split
# Redo the split from the raw data with the same test_size and random_state
# as the first split.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=['Survived']),
    df['Survived'],
    test_size=0.2,
    random_state=0,
)

print(*(part.shape for part in (X_train, y_train, X_test, y_test)))

Filter only the best features

In [None]:
# Keep only the best-feature columns in both sets.
X_train = X_train.filter(best_features)
X_test = X_test.filter(best_features)

print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)
X_train.head(3)

## Handle Target Imbalance

In [None]:
# Uses the redefined pipeline_dc_fe: fit on the train set only, then apply
# the fitted pipeline to the test set.
pipeline_data_cleaning_feat_eng = pipeline_dc_fe()
X_train = pipeline_data_cleaning_feat_eng.fit_transform(X_train)
X_test = pipeline_data_cleaning_feat_eng.transform(X_test)

In [None]:
from imblearn.over_sampling import SMOTE
# Oversample the minority class of the reduced train set; the test set is
# not resampled.
oversample = SMOTE(sampling_strategy='minority', random_state=0)
X_train, y_train = oversample.fit_resample(X_train, y_train)
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

## Re-examine the performance of the pipeline


Redefine the model and parameters

In [None]:
models_search = {
    "GradientBoostingClassifier": GradientBoostingClassifier(random_state=0),
}

# Every hyperparameter has a single value, so the grid holds exactly one
# candidate: the search simply cross-validates this fixed configuration.
# NOTE(review): presumably these are the best parameters reported by the
# extensive search — confirm against its output.
params_search = {
    "GradientBoostingClassifier": {'model__learning_rate': [0.1],
                                   'model__max_depth': [3],
                                   'model__min_samples_leaf': [4],
                                   'model__min_samples_split': [10],
                                   'model__n_estimators': [300],
                                   'model__subsample': [0.6]}
}

In [None]:
from sklearn.metrics import make_scorer, accuracy_score
# Cross-validate the single fixed configuration (5 folds, accuracy).
quick_search = HyperparameterOptimizationSearch(models_search, params_search)
quick_search.fit(X_train, y_train, cv=5, n_jobs=-1,
                 scoring=make_scorer(accuracy_score))

In [None]:
# Score table and fitted searches for the best-features run.
grid_search_summary, grid_search_pipelines = quick_search.score_summary(sort_by='mean_score')
grid_search_summary 

Define the best pipeline:

In [None]:
# Top row of the sorted summary names the best model.
best_model = grid_search_summary.iloc[0, 0]
# NOTE: this rebinds `pipeline_clf` from the pipeline-building function to
# the best fitted pipeline of the best-features search.
pipeline_clf = grid_search_pipelines[best_model].best_estimator_
pipeline_clf

## Evaluate Pipeline using most important features

In [None]:
# Same report as before, now for the pipeline trained on the best features
# only (train set oversampled, test set not resampled).
clf_performance(X_train=X_train, y_train=y_train,
                X_test=X_test, y_test=y_test,
                pipeline=pipeline_clf,
                label_map= ['Did Not Survive', 'Survived'] 
                )

Using the best features `['Sex','Fare','Age','Pclass']` results in an even stronger performance.

# Push files to Repo

In [None]:
import os

# Outputs are written to a versioned folder; bump `version` to keep the
# files from earlier runs.
version = 'v6'
file_path = f'outputs/ml_pipeline/predict-survivor/{version}'

# Create the folder. Only filesystem errors (OSError, e.g. the folder already
# exists) are reported and tolerated; any other exception signals a
# programming error and is allowed to propagate instead of being hidden.
try:
  os.makedirs(name=file_path)
except OSError as e:
  print(e)

Save datasets

In [None]:
# X_train at this point is the cleaned, best-features, oversampled train set.
X_train.to_csv(f"{file_path}/X_train.csv", index=False)

In [None]:
# Target of the oversampled train set (row order matches X_train).
y_train.to_csv(f"{file_path}/y_train.csv", index=False)

In [None]:
# Cleaned, best-features test set (not resampled).
X_test.to_csv(f"{file_path}/X_test.csv", index=False)

In [None]:
# Target of the test set.
y_test.to_csv(f"{file_path}/y_test.csv", index=False)

Save pipelines

In [None]:
# Display a freshly built (unfitted) instance of the redefined cleaning
# pipeline; the fitted instance is what gets saved in the next cell.
pipeline_dc_fe()

In [None]:
import joblib

# Persist the instance that was fitted on the best-features train set.
joblib.dump(value=pipeline_data_cleaning_feat_eng ,
            filename=f"{file_path}/pipeline_dc_fe.pkl")

In [None]:
# `pipeline_clf` is the fitted best estimator from the best-features search.
joblib.dump(value=pipeline_clf,
            filename=f"{file_path}/pipeline_clf.pkl")

Save plot showing feature importance

In [None]:
# NOTE: df_feature_importance was computed from the earlier pipeline that
# included feature selection; it is not recomputed for the best-features model.
df_feature_importance.plot(kind='bar',x='Feature',y='Importance')
plt.show()

In [None]:
# Draw the plot again and save the figure into the versioned output folder.
df_feature_importance.plot(kind='bar', x='Feature', y='Importance')
plt.savefig(f'{file_path}/features_importance.png', bbox_inches='tight')