# Baseline Models

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
import pickle
import os
import numpy as np

In [2]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

In [3]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, recall_score, f1_score, precision_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import plot_roc_curve
from sklearn.metrics import roc_curve, auc

In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

from sklearn.ensemble import BaggingClassifier, RandomForestClassifier 
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier

from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier

## Data Load

In [7]:
# Read the feature and label tables. `respondent_id` is dropped from both,
# so it is used neither as a predictor nor as a target.
X = pd.read_csv('data/training_set_features.csv').drop(columns='respondent_id')
y = pd.read_csv('data/training_set_labels.csv').drop(columns='respondent_id')

# Hold out a test split; the fixed seed makes the partition repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

## Pipelines

In [8]:
# Split the feature columns by dtype so each group can get its own
# preprocessing branch. Only float64 and object columns are collected;
# a column of any other dtype ends up in neither list.
numericals = [
    column for column in X_train.columns
    if X_train[column].dtype == 'float64'
]
non_numericals = [
    column for column in X_train.columns
    if X_train[column].dtype == 'object'
]

In [9]:
# Numeric branch: median-impute (adding missing-value indicator columns),
# then standardise.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median', add_indicator=True)),
    ('scaler', StandardScaler()),
]
numeric_transformer = Pipeline(numeric_steps)

# Categorical branch: impute with the most frequent level (adding indicator
# columns), then one-hot encode. Categories unseen during fit are ignored
# at transform time instead of raising.
categorical_steps = [
    ('cat_imputer', SimpleImputer(strategy='most_frequent', add_indicator=True)),
    ('encoder', OneHotEncoder(handle_unknown="ignore")),
]
categorical_transformer = Pipeline(categorical_steps)

# Route each column group through its own branch.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numericals),
        ("cat", categorical_transformer, non_numericals),
    ]
)

## Baseline Modeling

In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import CategoricalNB

from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier

from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier

from sklearn.svm import SVC
from sklearn.linear_model import PassiveAggressiveClassifier

In [11]:
# Seed for every estimator that accepts one, so that repeated runs of the
# notebook give the same scores (same value as the train/test split seed).
RANDOM_STATE = 42

# Baseline estimators keyed by the name used in the summary tables.
baseline_estimators = {
    # The very large C makes the regularisation penalty negligible.
    'Logistic_regresion': LogisticRegression(fit_intercept=False, C=1e12,
                                             solver='liblinear',
                                             random_state=RANDOM_STATE),
    'Naive_Bayes': GaussianNB(),
    'Random_Forest': RandomForestClassifier(n_jobs=2, max_depth=5,
                                            random_state=RANDOM_STATE),
    'Ada_boost': AdaBoostClassifier(random_state=RANDOM_STATE),
    'Gradient_boost': GradientBoostingClassifier(random_state=RANDOM_STATE),
    'Hist_Gradient_boost': HistGradientBoostingClassifier(max_depth=5,
                                                          random_state=RANDOM_STATE),
}

# One record per model: the estimator, the (shared) preprocessor, and
# placeholders that `model_evaluation` fills in when it runs.
models = {
    name: {
        'regressor': estimator,
        'preprocessor': preprocessor,
        'train_accuracy_score': None,
        'test_accuracy_score': None,
        'auc_score': None,
        'fpr': None,
        'tpr': None,
        'thresholds': None,
    }
    for name, estimator in baseline_estimators.items()
}

In [12]:
def model_evaluation(X_train, X_test, y_train, y_test,
                     baseline_models,
                     preprocessor,
                     folder_name=None,
                     ):
    """Cross-validate, fit and score every model in ``baseline_models``.

    For each entry the features are transformed with the entry's
    preprocessor, the estimator is scored with 5-fold cross-validated
    accuracy on the training split, refit on the full training split and
    then evaluated on the test split.

    Parameters
    ----------
    X_train, X_test
        Feature tables handed to the preprocessor's ``fit_transform`` /
        ``transform``.
    y_train, y_test
        Binary targets. ``y_train`` must expose ``.values`` (e.g. a pandas
        Series or single-column DataFrame).
    baseline_models : dict
        Maps a model name to a dict holding the estimator under
        ``'regressor'`` (it must provide ``fit``, ``predict`` and
        ``predict_proba``) and, optionally, its own ``'preprocessor'``.
        Each inner dict is updated in place with the computed metrics and
        ROC arrays, and the estimator it holds is fitted in place.
    preprocessor
        Fallback transformer used for entries that carry no
        ``'preprocessor'`` of their own.
    folder_name : str, optional
        When given, each fitted estimator is pickled to
        ``models/<name>/<folder_name>/baseline_model.pickl``. Only the
        estimator is saved, not the preprocessor.

    Returns
    -------
    dict
        ``{name: {'train_score', 'test_score', 'recall', 'precision',
        'f1', 'auc', 'tpr', 'fpr'}}``.
    """
    summary_dict = {}

    # Flatten the target once; works for a Series and a one-column frame.
    y_train_flat = y_train.values.ravel()

    for name, model in baseline_models.items():

        # Prefer the model's own preprocessor, fall back to the argument.
        processor = model.get('preprocessor')
        if processor is None:
            processor = preprocessor
        X_train_processed = processor.fit_transform(X_train)
        X_test_processed = processor.transform(X_test)

        estimator = model['regressor']

        # NOTE(review): the preprocessor was fitted on the whole training
        # split above, so the CV folds share its fitted statistics.
        train_accuracy_score = np.mean(cross_val_score(estimator,
                                                       X_train_processed, y_train_flat,
                                                       scoring="accuracy", cv=5))

        # Refit on the full training split and predict the held-out split.
        estimator.fit(X_train_processed, y_train_flat)
        preds = estimator.predict(X_test_processed)
        positive_scores = estimator.predict_proba(X_test_processed)[:, 1]

        # Scoring metrics, each computed exactly once.
        test_accuracy_score = accuracy_score(y_test, preds)
        auc_score = roc_auc_score(y_test, positive_scores)
        recall = recall_score(y_test, preds)
        precision = precision_score(y_test, preds)
        f1 = f1_score(y_test, preds)
        fpr, tpr, thresholds = roc_curve(y_test, positive_scores)

        # Record the results on the model entry itself. Precision and F1
        # are stored under their own keys.
        model['train_accuracy_score'] = train_accuracy_score
        model['test_accuracy_score'] = test_accuracy_score
        model['auc_score'] = auc_score
        model['recall_score'] = recall
        model['precision_score'] = precision
        model['f1_score'] = f1
        model['fpr'] = fpr
        model['tpr'] = tpr
        model['thresholds'] = thresholds

        # Optionally persist the fitted estimator.
        if folder_name is not None:
            # exist_ok keeps a re-run from failing on an existing folder.
            os.makedirs(f'models/{name}/{folder_name}', exist_ok=True)
            filepath = f'models/{name}/{folder_name}/baseline_model.pickl'
            with open(filepath, 'wb') as model_file:
                pickle.dump(estimator, model_file)

        summary_dict[name] = {
            'train_score': train_accuracy_score, 'test_score': test_accuracy_score,
            'recall': recall, 'precision': precision, 'f1': f1,
            'auc': auc_score, 'tpr': tpr, 'fpr': fpr,
        }

    return summary_dict

### Evaluation Metrics

In [14]:
# Evaluate every baseline model against the seasonal-vaccine target.
seasonal_summary_table = model_evaluation(
    X_train, X_test,
    y_train['seasonal_vaccine'], y_test['seasonal_vaccine'],
    baseline_models=models,
    preprocessor=preprocessor,
)

In [16]:
# Evaluate every baseline model against the H1N1-vaccine target.
# The same `models` entries are reused, so their stored metrics and fitted
# estimators now reflect this target rather than the seasonal one.
h1n1_summary_table = model_evaluation(
    X_train, X_test,
    y_train['h1n1_vaccine'], y_test['h1n1_vaccine'],
    baseline_models=models,
    preprocessor=preprocessor,
)

In [49]:
# Tabulate the seasonal-vaccine results: one column per model, one row per
# metric (the tpr/fpr rows hold the full ROC arrays).
seasonal_df = pd.DataFrame.from_dict(seasonal_summary_table, orient='columns')
seasonal_df

Unnamed: 0,Logistic_regresion,Naive_Bayes,Random_Forest,Ada_boost,Gradient_boost,Hist_Gradient_boost
train_score,0.776435,0.636445,0.761757,0.779031,0.784423,0.784723
test_score,0.788528,0.629325,0.770256,0.784784,0.792422,0.791673
recall,0.75419,0.744988,0.690437,0.742688,0.764706,0.763391
precision,0.7756,0.571609,0.780171,0.775566,0.776443,0.775885
f1,0.764745,0.646883,0.732566,0.758771,0.77053,0.769588
auc,0.859189,0.720609,0.842872,0.85933,0.863933,0.866966
tpr,"[0.0, 0.00032862306933946765, 0.02793296089385...","[0.0, 0.025303976339139007, 0.0266184686164968...","[0.0, 0.00032862306933946765, 0.00262898455471...","[0.0, 0.00032862306933946765, 0.00788695366414...","[0.0, 0.00032862306933946765, 0.01708839960565...","[0.0, 0.00032862306933946765, 0.00558659217877..."
fpr,"[0.0, 0.0, 0.0, 0.000275178866263071, 0.000275...","[0.0, 0.006329113924050633, 0.0063291139240506...","[0.0, 0.0, 0.0, 0.000275178866263071, 0.000275...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.000275178866263071, 0.000275...","[0.0, 0.0, 0.0, 0.000275178866263071, 0.000275..."


In [51]:
# Tabulate the H1N1-vaccine results: one column per model, one row per
# metric (the tpr/fpr rows hold the full ROC arrays).
h1n1_df = pd.DataFrame.from_dict(h1n1_summary_table, orient='columns')
h1n1_df

Unnamed: 0,Logistic_regresion,Naive_Bayes,Random_Forest,Ada_boost,Gradient_boost,Hist_Gradient_boost
train_score,0.850424,0.604993,0.811033,0.847978,0.852871,0.85362
test_score,0.852778,0.614797,0.810843,0.854126,0.856672,0.858769
recall,0.490473,0.709951,0.122089,0.493296,0.485533,0.516584
precision,0.726987,0.317651,0.901042,0.731937,0.751092,0.739394
f1,0.585756,0.438918,0.21504,0.589376,0.589799,0.608226
auc,0.864429,0.715227,0.84312,0.866807,0.873122,0.874436
tpr,"[0.0, 0.0007057163020465773, 0.011997177134791...","[0.0, 0.043754410726887794, 0.0458715596330275...","[0.0, 0.0007057163020465773, 0.002822865208186...","[0.0, 0.0007057163020465773, 0.019760056457304...","[0.0, 0.0007057163020465773, 0.017642907551164...","[0.0, 0.0007057163020465773, 0.017642907551164..."
fpr,"[0.0, 0.0, 0.0, 0.00019011406844106463, 0.0001...","[0.0, 0.008555133079847909, 0.0085551330798479...","[0.0, 0.0, 0.0, 0.00019011406844106463, 0.0001...","[0.0, 0.0, 0.0, 0.00019011406844106463, 0.0001...","[0.0, 0.0, 0.0, 0.00019011406844106463, 0.0001...","[0.0, 0.0, 0.0, 0.00019011406844106463, 0.0001..."


In [119]:
from plotly import graph_objects as go

# ROC curves are drawn from the seasonal-vaccine evaluation results.
data = seasonal_summary_table

# `go.Line` is a deprecated plotly class; line traces are drawn with
# `go.Scatter(mode="lines")` instead.
fig = go.Figure(
    data=[
        # Diagonal reference line (TPR == FPR).
        go.Scatter(
            name="Benchmark",
            x=np.linspace(0, 1, 100),
            y=np.linspace(0, 1, 100),
            mode="lines",
        ),
        go.Scatter(
            name="Naive Bayes",
            x=data['Naive_Bayes']['fpr'],
            y=data['Naive_Bayes']['tpr'],
            mode="lines",
        ),
        go.Scatter(
            name="Hist_gradient_boost",
            x=data['Hist_Gradient_boost']['fpr'],
            y=data['Hist_Gradient_boost']['tpr'],
            mode="lines",
        ),
    ],
    layout=go.Layout(
        title="ROC Curve",
        yaxis_title="True positive rate",
        xaxis_title='False positive rate',
    ),
)

fig.show()