# Baseline Models

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
import pickle
import os
import numpy as np

In [6]:
import plotly
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import chart_studio.plotly as py
import plotly.graph_objects as go
import plotly.express as px
import plotly.io as pio
# Make 'notebook' the default plotly renderer used by `fig.show()`.
pio.renderers.default='notebook'

In [2]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

In [3]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, recall_score, f1_score, precision_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import plot_roc_curve
from sklearn.metrics import roc_curve, auc

In [4]:
from imblearn.over_sampling import SMOTE

In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

from sklearn.ensemble import BaggingClassifier, RandomForestClassifier 
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier

from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier

# Seasonal vaccine data

In [31]:
# Labels for the seasonal-vaccine task: the id and the H1N1 label columns are dropped.
y = pd.read_csv('data/training_set_labels.csv').drop(columns=['respondent_id', 'h1n1_vaccine'])

# Feature matrix without the respondent identifier.
X = pd.read_csv('data/training_set_features.csv').drop(columns='respondent_id')

# A fixed seed keeps the train/test partition identical between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

We'll separate the non-numerical and numerical columns of the dataset, because each group needs different preprocessing.

In [8]:
# Split the feature names by dtype: float64 columns are treated as numerical,
# object columns as non-numerical. Columns of any other dtype end up in neither list.
numericals = [name for name, kind in X_train.dtypes.items() if kind == 'float64']
non_numericals = [name for name, kind in X_train.dtypes.items() if kind == 'object']

To make the data cleaning process quicker in the future, we'll set up this pipeline, which will handle our **missing data**, the **scaling** of the numerical columns and the **one-hot encoding** of the non-numerical columns.

In [9]:
# Numerical branch: median imputation (with missing-value indicator columns),
# followed by standardisation.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median', add_indicator=True)),
    ('scaler', StandardScaler()),
])

# Non-numerical branch: most-frequent imputation (with missing-value indicator
# columns), followed by one-hot encoding that ignores categories unseen at fit time.
categorical_transformer = Pipeline(steps=[
    ('cat_imputer', SimpleImputer(strategy='most_frequent', add_indicator=True)),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its own branch.
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numericals),
    ('cat', categorical_transformer, non_numericals),
])

We'll also create a dictionary that has our various models of interest.

In [26]:
# Seed shared by every stochastic estimator so that re-running the notebook
# reproduces the same fitted models and scores.
RANDOM_STATE = 42

# One entry per baseline model. Each entry holds the estimator (stored under the
# key 'regressor', although every estimator here is a classifier), the
# preprocessor to apply to the raw features, and empty slots that the
# evaluation helpers fill in.
# The key 'Logistic_regresion' keeps its original spelling because it is used as
# the model's label in the summary tables.
models = {
    name: {
        'regressor': estimator,
        'preprocessor': preprocessor,
        'train_accuracy_score': None,
        'test_accuracy_score': None,
        'auc_score': None,
        'fpr': None,
        'tpr': None,
        'thresholds': None,
    }
    for name, estimator in {
        # C=1e12 makes the regularisation penalty negligible.
        'Logistic_regresion': LogisticRegression(fit_intercept=False, C=1e12, solver='liblinear',
                                                 random_state=RANDOM_STATE),
        'Naive_Bayes': GaussianNB(),
        'Random_Forest': RandomForestClassifier(n_jobs=2, max_depth=5, random_state=RANDOM_STATE),
        'Gradient_boost': GradientBoostingClassifier(random_state=RANDOM_STATE),
        'Hist_Gradient_boost': HistGradientBoostingClassifier(max_depth=5, random_state=RANDOM_STATE),
    }.items()
}

Now we'll create a helper function which first takes in the train/test split data, transforms it with the preprocessing pipeline, and then fits and evaluates each model from the model dictionary. <br>
<br>
Ultimately, the function will return a summary dictionary that contains, for each model:<br>
> the **AUC score** <br> 
> the **recall**, **precision** and **F1** scores <br> 
> the **fpr** and **tpr** values of the ROC curve <br>
> the **test** and **train accuracy scores** <br>

In [11]:
def model_evaluation(X_train, X_test, y_train, y_test,
                         baseline_models, 
                         preprocessor,
                         folder_name = None, 
                         ):
    """Fit and score every model in ``baseline_models`` on one train/test split.

    For each entry the raw features are transformed with the entry's
    preprocessor, the estimator is cross-validated (5-fold accuracy) on the
    transformed training data, refitted on all of it and scored on the
    transformed test data.

    Parameters
    ----------
    X_train, X_test : raw (unprocessed) feature tables.
    y_train, y_test : binary targets; ``y_train`` is flattened to 1-D before fitting.
    baseline_models : dict mapping a model name to a dict that has at least a
        ``'regressor'`` key (an estimator exposing ``fit``, ``predict`` and
        ``predict_proba``) and normally a ``'preprocessor'`` key.
        Each inner dict is updated in place with the computed metrics.
    preprocessor : transformer used for entries that have no ``'preprocessor'``
        of their own.
    folder_name : if given, each fitted estimator is pickled to
        ``models/<name>/<folder_name>/baseline_model.pickl``. Only the
        estimator is saved, not the fitted preprocessor.

    Returns
    -------
    dict
        ``{name: {'train_score', 'test_score', 'recall', 'precision', 'f1',
        'auc', 'tpr', 'fpr'}}``.

    Notes
    -----
    The preprocessor is fitted on the whole training split before
    cross-validation, so the validation folds are not completely unseen by the
    imputer/scaler.
    """
    summary_dict = {}

    # Flatten once; works for Series, single-column DataFrames and arrays alike.
    y_train_flat = np.asarray(y_train).ravel()

    for name, model in baseline_models.items():

        # Prefer the model's own preprocessor, fall back to the shared one.
        processor = model.get('preprocessor')
        if processor is None:
            processor = preprocessor
        X_train_processed = processor.fit_transform(X_train)
        X_test_processed = processor.transform(X_test)

        estimator = model['regressor']

        # Cross-validated accuracy on the training data
        train_accuracy_score = np.mean(cross_val_score(estimator,
                                                       X_train_processed, y_train_flat,
                                                       scoring="accuracy", cv=5))

        # Refit on the full training data and predict the held-out set
        estimator.fit(X_train_processed, y_train_flat)
        preds = estimator.predict(X_test_processed)
        positive_scores = estimator.predict_proba(X_test_processed)[:, 1]

        # Each metric is computed exactly once and stored under its own name
        test_accuracy_score = accuracy_score(y_test, preds)
        auc_score = roc_auc_score(y_test, positive_scores)
        recall = recall_score(y_test, preds)
        precision = precision_score(y_test, preds)
        f1 = f1_score(y_test, preds)
        fpr, tpr, thresholds = roc_curve(y_test, positive_scores)

        model.update({
            'train_accuracy_score': train_accuracy_score,
            'test_accuracy_score': test_accuracy_score,
            'auc_score': auc_score,
            'recall_score': recall,
            'precision_score': precision,
            'f1_score': f1,
            'fpr': fpr,
            'tpr': tpr,
            'thresholds': thresholds,
        })

        # Saving the model
        if folder_name is not None:
            # exist_ok lets the notebook be re-run without failing on an existing folder
            os.makedirs(f'models/{name}/{folder_name}', exist_ok=True)
            filepath = f'models/{name}/{folder_name}/baseline_model.pickl'
            with open(filepath, 'wb') as model_file:
                pickle.dump(estimator, model_file)

        summary_dict[name] = {
            'train_score': train_accuracy_score, 'test_score': test_accuracy_score,
            'recall': recall, 'precision': precision, 'f1': f1,
            'auc': auc_score, 'tpr': tpr, 'fpr': fpr
        }

    return summary_dict

In [32]:
# Evaluate every baseline model on the seasonal-vaccine target.
seasonal_summary_table = model_evaluation(
    X_train,
    X_test,
    y_train['seasonal_vaccine'],
    y_test['seasonal_vaccine'],
    baseline_models=models,
    preprocessor=preprocessor,
)

In [33]:
# One column per model, one row per metric.
seasonal_df = pd.DataFrame.from_dict(seasonal_summary_table)
seasonal_df

Unnamed: 0,Logistic_regresion,Naive_Bayes,Random_Forest,Gradient_boost,Hist_Gradient_boost
train_score,0.776435,0.636445,0.760509,0.784324,0.785022
test_score,0.788528,0.629325,0.771604,0.792422,0.791673
recall,0.75419,0.744988,0.688137,0.764706,0.766678
precision,0.7756,0.571609,0.78427,0.776443,0.774054
f1,0.764745,0.646883,0.733065,0.77053,0.770348
auc,0.859189,0.720609,0.84328,0.863926,0.866036
tpr,"[0.0, 0.00032862306933946765, 0.02793296089385...","[0.0, 0.025303976339139007, 0.0266184686164968...","[0.0, 0.00032862306933946765, 0.00230036148537...","[0.0, 0.00032862306933946765, 0.01708839960565...","[0.0, 0.00032862306933946765, 0.00394347683207..."
fpr,"[0.0, 0.0, 0.0, 0.000275178866263071, 0.000275...","[0.0, 0.006329113924050633, 0.0063291139240506...","[0.0, 0.0, 0.0, 0.000275178866263071, 0.000275...","[0.0, 0.0, 0.0, 0.000275178866263071, 0.000275...","[0.0, 0.0, 0.0, 0.000275178866263071, 0.000275..."


In [36]:
from plotly import graph_objects as go

data = seasonal_summary_table

# ROC curves of two baseline models against the diagonal benchmark.
# `go.Scatter(mode="lines")` replaces the deprecated `go.Line` class.
fig = go.Figure(
    data=[
        go.Scatter(
            name="Benchmark",
            x = np.linspace(0,1,100),
            y = np.linspace(0,1,100),
            mode="lines",
        ),
        go.Scatter(
            name="Naive Bayes",
            x = data['Naive_Bayes']['fpr'],
            y = data['Naive_Bayes']['tpr'],
            mode="lines",
        ),
        go.Scatter(
            name="Hist_gradient_boost",
            x = data['Hist_Gradient_boost']['fpr'],
            y = data['Hist_Gradient_boost']['tpr'],
            mode="lines",
        )
    ],
    layout=go.Layout(
        title = "Seasonal Vaccine Baseline models ROC Curve",
        yaxis_title = "True positive rate",
        xaxis_title = 'False positive rate'
    )
)

fig.show()

# H1N1 vaccine data

In [12]:
# Labels for the H1N1 task: the id and the seasonal-vaccine label columns are dropped.
# Note that this rebinds X, y and the four split variables used by the cells above.
y = pd.read_csv('data/training_set_labels.csv').drop(columns=['respondent_id', 'seasonal_vaccine'])

# Feature matrix without the respondent identifier.
X = pd.read_csv('data/training_set_features.csv').drop(columns='respondent_id')

# A fixed seed keeps the train/test partition identical between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [16]:
# Fit the shared preprocessor on the training split, then apply the fitted
# transformation to the held-out split.
# `X_test_processed` is read as a notebook-level variable inside
# `model_evaluation_smoted`, so this cell has to run before that function is called.
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

In [17]:
# Oversample the processed training data with SMOTE. The fixed seed makes the
# synthetic samples, and therefore every downstream score, reproducible.
sm = SMOTE(random_state=42)
X_train_smoted, y_train_smoted = sm.fit_resample(X_train_processed, y_train)

In [23]:
def model_evaluation_smoted(X_train_smoted, X_test, y_train_smoted, y_test,
                         baseline_models,
                         folder_name = None, 
                         ):
    """Fit and score every model in ``baseline_models`` on resampled training data.

    Each estimator is cross-validated (5-fold accuracy) on the already
    processed and resampled training data, refitted on all of it and scored on
    the processed test features.

    Parameters
    ----------
    X_train_smoted : processed, resampled training features.
    X_test : accepted for interface compatibility but NOT used.
        NOTE(review): predictions are made on the notebook-level variable
        ``X_test_processed``, which must exist before this function is called.
        The current call site passes the raw test features here, so switching
        to this argument requires changing the caller at the same time.
    y_train_smoted, y_test : binary targets; ``y_train_smoted`` is flattened to
        1-D before fitting.
    baseline_models : dict mapping a model name to a dict with at least a
        ``'regressor'`` key (an estimator exposing ``fit``, ``predict`` and
        ``predict_proba``). Each inner dict is updated in place with the
        computed metrics.
    folder_name : if given, each fitted estimator is pickled to
        ``models/<name>/<folder_name>/baseline_model.pickl``.

    Returns
    -------
    dict
        ``{name: {'train_score', 'test_score', 'recall', 'precision', 'f1',
        'auc', 'tpr', 'fpr'}}``.

    Notes
    -----
    Cross-validation runs on data that was resampled before being split into
    folds, so ``train_score`` is measured partly on synthetic samples and is
    likely optimistic compared with ``test_score``.
    """
    summary_dict = {}

    # Flatten once; works for Series, single-column DataFrames and arrays alike.
    y_train_flat = np.asarray(y_train_smoted).ravel()

    for name, model in baseline_models.items():

        estimator = model['regressor']

        # Cross-validated accuracy on the resampled training data
        train_accuracy_score = np.mean(cross_val_score(estimator,
                                                       X_train_smoted, y_train_flat,
                                                       scoring="accuracy", cv=5))

        # Refit on the full resampled training data and predict the held-out set
        estimator.fit(X_train_smoted, y_train_flat)
        preds = estimator.predict(X_test_processed)
        positive_scores = estimator.predict_proba(X_test_processed)[:, 1]

        # Each metric is computed exactly once and stored under its own name
        test_accuracy_score = accuracy_score(y_test, preds)
        auc_score = roc_auc_score(y_test, positive_scores)
        recall = recall_score(y_test, preds)
        precision = precision_score(y_test, preds)
        f1 = f1_score(y_test, preds)
        fpr, tpr, thresholds = roc_curve(y_test, positive_scores)

        model.update({
            'train_accuracy_score': train_accuracy_score,
            'test_accuracy_score': test_accuracy_score,
            'auc_score': auc_score,
            'recall_score': recall,
            'precision_score': precision,
            'f1_score': f1,
            'fpr': fpr,
            'tpr': tpr,
            'thresholds': thresholds,
        })

        # Saving the model
        if folder_name is not None:
            # Create the target folder if needed; exist_ok keeps re-runs from failing.
            os.makedirs(f'models/{name}/{folder_name}', exist_ok=True)
            filepath = f'models/{name}/{folder_name}/baseline_model.pickl'
            with open(filepath, 'wb') as model_file:
                pickle.dump(estimator, model_file)

        summary_dict[name] = {
            'train_score': train_accuracy_score, 'test_score': test_accuracy_score,
            'recall': recall, 'precision': precision, 'f1': f1,
            'auc': auc_score, 'tpr': tpr, 'fpr': fpr
        }

    return summary_dict

In [27]:
# Evaluate every baseline model on the resampled H1N1 training data and
# pickle each fitted estimator under the 'h1n1' folder.
h1n1_summary_table = model_evaluation_smoted(
    X_train_smoted,
    X_test,
    y_train_smoted['h1n1_vaccine'],
    y_test['h1n1_vaccine'],
    baseline_models=models,
    folder_name='h1n1',
)

In [28]:
# One column per model, one row per metric.
h1n1_df = pd.DataFrame.from_dict(h1n1_summary_table)
h1n1_df

Unnamed: 0,Logistic_regresion,Naive_Bayes,Random_Forest,Gradient_boost,Hist_Gradient_boost
train_score,0.802702,0.656059,0.851681,0.884778,0.886491
test_score,0.80021,0.583795,0.831062,0.852479,0.858469
recall,0.760056,0.688779,0.605505,0.532816,0.526464
precision,0.520039,0.294508,0.601261,0.700371,0.731373
f1,0.617546,0.412598,0.603376,0.60521,0.612228
auc,0.861583,0.693531,0.845036,0.868287,0.871147
tpr,"[0.0, 0.0007057163020465773, 0.006351446718419...","[0.0, 0.07339449541284404, 0.07410021171489062...","[0.0, 0.0, 0.0056457304163726185, 0.0056457304...","[0.0, 0.0007057163020465773, 0.000705716302046...","[0.0, 0.0007057163020465773, 0.011291460832745..."
fpr,"[0.0, 0.0, 0.0, 0.00019011406844106463, 0.0001...","[0.0, 0.006083650190114068, 0.0060836501901140...","[0.0, 0.00019011406844106463, 0.00019011406844...","[0.0, 0.0, 0.00019011406844106463, 0.000190114...","[0.0, 0.0, 0.0, 0.00019011406844106463, 0.0001..."


In [35]:
from plotly import graph_objects as go

data = h1n1_summary_table

# ROC curves of two baseline models against the diagonal benchmark.
# `go.Scatter(mode="lines")` replaces the deprecated `go.Line` class.
fig = go.Figure(
    data=[
        go.Scatter(
            name="Benchmark",
            x = np.linspace(0,1,100),
            y = np.linspace(0,1,100),
            mode="lines",
        ),
        go.Scatter(
            name="Naive Bayes",
            x = data['Naive_Bayes']['fpr'],
            y = data['Naive_Bayes']['tpr'],
            mode="lines",
        ),
        go.Scatter(
            name="Hist_gradient_boost",
            x = data['Hist_Gradient_boost']['fpr'],
            y = data['Hist_Gradient_boost']['tpr'],
            mode="lines",
        )
    ],
    layout=go.Layout(
        title = "H1N1 Vaccine Baseline models ROC Curve",
        yaxis_title = "True positive rate",
        xaxis_title = 'False positive rate'
    )
)

fig.show()