# Titanic - Predictive Modeling!
In our last session, we focused on performing feature engineering on the raw Titanic dataset. At the end of that session, we exported our cleaned dataset into another CSV file. We are now ready to start our predictive modeling process!

## Notebook Setup

In [1]:
# Importing the necessary Python libraries
import cloudpickle
import pandas as pd
from category_encoders.one_hot import OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, accuracy_score, confusion_matrix
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Loading in the cleaned datasets
X = pd.read_csv('../data/clean/X.csv')
y = pd.read_csv('../data/clean/y.csv')

In [3]:
# Viewing the first few rows of the X dataset
X.head()

Unnamed: 0,Pclass,SibSp,Parch,Fare,Sex_male,Sex_female,Embarked_S,Embarked_C,Embarked_Q,Embarked_nan,Age_child,Age_teen,Age_young_adult,Age_adult,Age_elder
0,3,1,0,7.25,1,0,1,0,0,0,0,0,1,0,0
1,1,1,0,71.2833,0,1,0,1,0,0,0,0,0,1,0
2,3,0,0,7.925,0,1,1,0,0,0,0,0,1,0,0
3,1,1,0,53.1,0,1,1,0,0,0,0,0,0,1,0
4,3,0,0,8.05,1,0,1,0,0,0,0,0,0,1,0


In [4]:
# Viewing the first few rows of the y dataset
y.head()

Unnamed: 0,Survived
0,0
1,1
2,1
3,1
4,0


## Data Separation
When working with a training dataset, it is a good idea to hold out a portion of the data so that we have something we can validate the model against. In the cell below, we will use Scikit-Learn's `train_test_split` functionality to split the data into respective training and validation sets.

In [5]:
# Splitting the datasets between training and validation
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [6]:
# Viewing the first few rows of the X_train dataset
X_train.head()

Unnamed: 0,Pclass,SibSp,Parch,Fare,Sex_male,Sex_female,Embarked_S,Embarked_C,Embarked_Q,Embarked_nan,Age_child,Age_teen,Age_young_adult,Age_adult,Age_elder
331,1,0,0,28.5,1,0,1,0,0,0,0,0,0,1,0
733,2,0,0,13.0,1,0,1,0,0,0,0,0,1,0,0
382,3,0,0,7.925,1,0,1,0,0,0,0,0,0,1,0
704,3,1,0,7.8542,1,0,1,0,0,0,0,0,1,0,0
813,3,4,2,31.275,0,1,1,0,0,0,1,0,0,0,0


In [7]:
# Viewing the first few rows of the y_train dataset
y_train.head()

Unnamed: 0,Survived
331,0
733,0
382,0
704,0
813,0


In [8]:
# Checking to see that the 20% split worked properly
len(X_val) / len(X)

0.20089786756453423

## Hyperparameter Tuning

In [9]:
# Instantiating a Random Forest Classifier object
rfc_gridsearch = RandomForestClassifier()

In [10]:
# Defining the parameter grid for hyperparameter tuning
params = {'n_estimators': [10, 50, 100],
          'min_samples_split': [2, 5, 10],
          'min_samples_leaf': [1, 2, 5],
          'max_depth': [10, 20, 50]
         }

In [11]:
# Instantiating the GridSearchCV object
hyperparameter_tuner = GridSearchCV(estimator = rfc_gridsearch,
                                    param_grid = params)

In [12]:
# Running the hyperparameter tuning job
hyperparameter_tuner.fit(X_train, y_train)

GridSearchCV(estimator=RandomForestClassifier(),
             param_grid={'max_depth': [10, 20, 50],
                         'min_samples_leaf': [1, 2, 5],
                         'min_samples_split': [2, 5, 10],
                         'n_estimators': [10, 50, 100]})

In [13]:
# Viewing the best parameters from the hyperparameter tuning job
hyperparameter_tuner.best_params_

{'max_depth': 50,
 'min_samples_leaf': 5,
 'min_samples_split': 10,
 'n_estimators': 10}

## Model Training

In [14]:
# Instantiating a new Random Forest Classifier object
rfc_model = RandomForestClassifier(n_estimators = 50,
                                   max_depth = 20,
                                   min_samples_split = 10,
                                   min_samples_leaf = 2)

In [15]:
# Performing the model training
rfc_model.fit(X_train, y_train)

RandomForestClassifier(max_depth=20, min_samples_leaf=2, min_samples_split=10,
                       n_estimators=50)

## Model Validation

In [16]:
# Getting predictions on the X_val dataset using the trained RFC model
val_preds = rfc_model.predict(X_val)

In [17]:
# Getting the metrics with the validation dataset
val_accuracy = accuracy_score(y_val, val_preds)
val_roc_auc = roc_auc_score(y_val, val_preds)
val_confusion_matrix = confusion_matrix(y_val, val_preds)

In [18]:
# Printing out the validation metrics
print(f'Accuracy Score: {val_accuracy}')
print(f'ROC-AUC Score: {val_roc_auc}')
print(f'Confusion Matrix: \n{val_confusion_matrix}')

Accuracy Score: 0.8268156424581006
ROC-AUC Score: 0.8124839124839125
Confusion Matrix: 
[[94 11]
 [20 54]]


## Saving out a Simple Model

In [19]:
# Saving the RFC model to a pickle
with open('../models/rfc_model.pkl', 'wb') as f:
    cloudpickle.dump(rfc_model, f)

## Loading our Trained Model

In [20]:
# Loading in the RFC model from serialized file
with open('../models/rfc_model.pkl', 'rb') as f:
    rfc_loaded_model = cloudpickle.load(f)

In [21]:
# Getting predictions with the loaded model
loaded_preds = rfc_loaded_model.predict(X_val)

In [22]:
# Showing the metrics with the loaded preds
val_accuracy = accuracy_score(y_val, loaded_preds)
val_roc_auc = roc_auc_score(y_val, loaded_preds)
val_confusion_matrix = confusion_matrix(y_val, loaded_preds)

print(f'Accuracy Score: {val_accuracy}')
print(f'ROC-AUC Score: {val_roc_auc}')
print(f'Confusion Matrix: \n{val_confusion_matrix}')

Accuracy Score: 0.8268156424581006
ROC-AUC Score: 0.8124839124839125
Confusion Matrix: 
[[94 11]
 [20 54]]


## Creating a Full Pipeline
In our livestream on Sept. 9, we focused on training a model and just saving that model alone. It is possible to save a serialized pickle file that not only contains the ability to perform inference on cleaned data but can also do that data cleansing itself. In this section, we'll be taking everything we've done so far to create a full ML inference pipeline.

### Loading Raw Data
Since our pipeline is going to take data in pretty much it's purest, rawest form, we are going to load in that original raw dataset we downloaded from Kaggle instead of working with our already cleaned data.

In [41]:
# Loading in the raw Titanic training data
df_raw = pd.read_csv('../data/raw/train.csv')

In [42]:
# Separating predictor value from the remainder of the dataset
X = df_raw.drop(columns = ['Survived'])
y = df_raw[['Survived']]

In [43]:
# Performing training / validation dataset split
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state = 42)

### Helper Functions
As part of our feature engineering, you might recall that we did some special things to create certain features. Before we jump into creating our pipeline, we'll need to package that feature engineering as their own respective Python functions. You'll notice this code is pretty much copy/pasted as is from our feature engineering notebook.

In [27]:
# Creating a function to appropriately engineer the 'Age' column
def create_age_bins(df):
    '''Engineers age bin variables for the pipeline'''
    
    # Filling any null values with the median age of 28.0
    median_age = 28.0
    df['Age'].fillna(median_age, inplace = True)
    
    # Establishing our bins values and names
    bin_labels = ['child', 'teen', 'young_adult', 'adult', 'elder']
    bin_values = [-1, 12, 19, 30, 60, 100]
    
    # Applying "Age" binning with Pandas cut
    age_bins = pd.cut(df['Age'], bins = bin_values, labels = bin_labels)
    df_age_bins = pd.DataFrame(age_bins)
    
    # Dropping the original "Age" column
    df.drop(columns = ['Age'], inplace = True)
    
    # Concatenating the new "Age" column to the original DataFrame
    df = pd.concat([df, df_age_bins], axis = 1)
    
    return df

### Pipeline Creation
Now that we have created our helper functions to perform the feature engineering, we are ready to begin packaging everything as a single, unified pipeline.

In [28]:
# Creating the "Age" binning function transformer as the first step into our modeling pipeline
age_binner = FunctionTransformer(create_age_bins, validate = False)

In [29]:
# Creating the data preprocessor that will perform our feature engineering
data_preprocessor = ColumnTransformer(transformers = [
    ('ohe_engineering', OneHotEncoder(use_cat_names = True, handle_unknown = 'ignore'), ['Age', 'Sex', 'Embarked']),
    ('columns_to_drop', 'drop', ['PassengerId', 'Name', 'Ticket', 'Cabin'])],
                                      remainder = 'passthrough'
)

In [30]:
# Creating the full inference pipeline
rfc_pipeline = Pipeline(steps = [
    ('age_binning', age_binner),
    ('feature_engineering', data_preprocessor),
    ('predictive_modeling', RandomForestClassifier(n_estimators = 50,
                                                   max_depth = 20,
                                                   min_samples_split = 10,
                                                   min_samples_leaf = 2))
])

In [31]:
# Training the inference pipeline with the training data
rfc_pipeline.fit(X_train, y_train)

Pipeline(steps=[('age_binning',
                 FunctionTransformer(func=<function create_age_bins at 0x7f9db92c8ca0>)),
                ('feature_engineering',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('ohe_engineering',
                                                  OneHotEncoder(handle_unknown='ignore',
                                                                use_cat_names=True),
                                                  ['Age', 'Sex', 'Embarked']),
                                                 ('columns_to_drop', 'drop',
                                                  ['PassengerId', 'Name',
                                                   'Ticket', 'Cabin'])])),
                ('predictive_modeling',
                 RandomForestClassifier(max_depth=20, min_samples_leaf=2,
                                        min_samples_split=10,
                                        n_estimators=5

In [32]:
# Generating inferences on the validation dataset with the trained RFC pipeline
val_preds = pd.DataFrame(rfc_pipeline.predict(X_val))

In [33]:
# Getting the metrics with the validation dataset
val_accuracy = accuracy_score(y_val, val_preds)
val_roc_auc = roc_auc_score(y_val, val_preds)
val_confusion_matrix = confusion_matrix(y_val, val_preds)

In [34]:
# Printing out the validation metrics
print(f'Accuracy Score: {val_accuracy}')
print(f'ROC-AUC Score: {val_roc_auc}')
print(f'Confusion Matrix: \n{val_confusion_matrix}')

Accuracy Score: 0.8340807174887892
ROC-AUC Score: 0.8166610766392757
Confusion Matrix: 
[[121  13]
 [ 24  65]]


In [36]:
# Saving the pipeline to a serialized pickle file
with open('../models/rfc_pipeline.pkl', 'wb') as f:
    cloudpickle.dump(rfc_pipeline, f)

### Loading Our Trained Pipeline

In [38]:
# Loading in the trained RFC pipeline from the serialized pickle file
with open('../models/rfc_pipeline.pkl', 'rb') as f:
    loaded_rfc_pipeline = cloudpickle.load(f)

In [45]:
# Getting inferences with the loaded RFC pipeline
val_preds = pd.DataFrame(loaded_rfc_pipeline.predict(X_val))

In [46]:
# Getting the metrics with the validation dataset
val_accuracy = accuracy_score(y_val, val_preds)
val_roc_auc = roc_auc_score(y_val, val_preds)
val_confusion_matrix = confusion_matrix(y_val, val_preds)

# Printing out the validation metrics
print(f'Accuracy Score: {val_accuracy}')
print(f'ROC-AUC Score: {val_roc_auc}')
print(f'Confusion Matrix: \n{val_confusion_matrix}')

Accuracy Score: 0.8340807174887892
ROC-AUC Score: 0.8166610766392757
Confusion Matrix: 
[[121  13]
 [ 24  65]]
