In [1]:
# import necessary libraries
import pandas as pd
import numpy as np

# import libraries
import data_preparation as dp
import visualizations as viz

In [2]:
# Load the feature table and the label table; in both files the first CSV
# column becomes the row index.
data = pd.read_csv("../data/training_set_features.csv", index_col=0)
vac = pd.read_csv("../data/training_set_labels.csv", index_col=0)

# Attach the H1N1 label to the features by matching on the index, then remove
# the four seasonal-vaccine columns listed below from the merged frame.
all_data = (
    data
    .merge(vac['h1n1_vaccine'], left_index=True, right_index=True)
    .drop(columns=[
        'doctor_recc_seasonal',
        'opinion_seas_vacc_effective',
        'opinion_seas_risk',
        'opinion_seas_sick_from_vacc',
    ])
)

In [3]:
# Run the project helper that handles missing values on the merged frame.
# NOTE(review): dp.missing is defined outside this notebook; whether it returns
# a copy or the very same object as `all_data` cannot be seen from here.
prepared = dp.missing(all_data)

# Separate the target from the predictors without mutating `prepared`.
# Selecting and dropping (instead of popping) keeps this cell re-runnable even
# if the helper hands back `all_data` itself: a pop would remove the label
# column from that frame and a second run would then fail on the missing key.
y = prepared['h1n1_vaccine']
X = prepared.drop(columns='h1n1_vaccine')

# Pass every original predictor column through dp.dummy (presumably a one-hot
# style encoder — confirm in data_preparation). The column list is snapshotted
# up front because X is rebound to the helper's result on every iteration.
for column in list(X.columns):
    X = dp.dummy(X, column)

In [4]:
from sklearn.model_selection import train_test_split

In [5]:
# Split predictors and target into train and evaluation parts. The 25% test
# share is scikit-learn's default, written out here for the reader; the seed
# is fixed so the same rows land in each part on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=1,
)

In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier

import warnings
warnings.filterwarnings(action='ignore')

### Baseline Models
#### Simple models

In [7]:
# Baseline 1: logistic regression with scikit-learn's default settings.
lr = LogisticRegression()

# Hand the estimator and both splits to the project's scoring helper. Judging
# by the output recorded for this cell, it reports a CV score, the test score,
# an RMSE and classification reports for the train and test parts.
dp.scores(
    X_train, y_train,
    X_test, y_test,
    lr,
)

CV score:     85.15%
X-test score: 85.17%
RMSE:         0.3851

Train score
              precision    recall  f1-score   support

           0       0.87      0.95      0.91     15770
           1       0.73      0.49      0.59      4260

    accuracy                           0.85     20030
   macro avg       0.80      0.72      0.75     20030
weighted avg       0.84      0.85      0.84     20030



X-test score

              precision    recall  f1-score   support

           0       0.87      0.95      0.91      5263
           1       0.73      0.48      0.58      1414

    accuracy                           0.85      6677
   macro avg       0.80      0.72      0.74      6677
weighted avg       0.84      0.85      0.84      6677



In [8]:
# Baseline 2: a single decision tree with default hyperparameters.
# The seed is pinned (same value as the train/test split) because tree
# construction is randomised; without it the reported metrics change from run
# to run. Metrics recorded before the seed was added may differ slightly.
dtc = DecisionTreeClassifier(random_state=1)

# Score with the project helper; per the recorded output it reports CV score,
# test score, RMSE and train/test classification reports.
dp.scores(X_train, y_train, X_test, y_test, dtc)

CV score:     77.84%
X-test score: 78.61%
RMSE:         0.4625

Train score
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     15770
           1       1.00      1.00      1.00      4260

    accuracy                           1.00     20030
   macro avg       1.00      1.00      1.00     20030
weighted avg       1.00      1.00      1.00     20030



X-test score

              precision    recall  f1-score   support

           0       0.86      0.87      0.86      5263
           1       0.50      0.49      0.49      1414

    accuracy                           0.79      6677
   macro avg       0.68      0.68      0.68      6677
weighted avg       0.79      0.79      0.79      6677



#### More complex models

In [9]:
# Random forest with default hyperparameters.
# The seed is pinned (same value as the train/test split) because the forest
# relies on random resampling and feature selection; without it the reported
# metrics are not reproducible. Metrics recorded before the seed was added may
# differ slightly.
rfc = RandomForestClassifier(random_state=1)

# Score with the project helper; per the recorded output it reports CV score,
# test score, RMSE and train/test classification reports.
dp.scores(X_train, y_train, X_test, y_test, rfc)

CV score:     84.98%
X-test score: 84.90%
RMSE:         0.3885

Train score
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     15770
           1       1.00      1.00      1.00      4260

    accuracy                           1.00     20030
   macro avg       1.00      1.00      1.00     20030
weighted avg       1.00      1.00      1.00     20030



X-test score

              precision    recall  f1-score   support

           0       0.86      0.96      0.91      5263
           1       0.75      0.43      0.55      1414

    accuracy                           0.85      6677
   macro avg       0.81      0.70      0.73      6677
weighted avg       0.84      0.85      0.83      6677



In [10]:
# AdaBoost with default hyperparameters.
# The seed is pinned (same value as the train/test split) so that repeated
# runs give the same metrics. Metrics recorded before the seed was added may
# differ slightly.
# NOTE(review): `abc` is also the name of a standard-library module; the
# variable name is kept unchanged because other cells may refer to it.
abc = AdaBoostClassifier(random_state=1)

# Score with the project helper; per the recorded output it reports CV score,
# test score, RMSE and train/test classification reports.
dp.scores(X_train, y_train, X_test, y_test, abc)

CV score:     85.16%
X-test score: 85.08%
RMSE:         0.3862

Train score
              precision    recall  f1-score   support

           0       0.87      0.95      0.91     15770
           1       0.72      0.49      0.58      4260

    accuracy                           0.85     20030
   macro avg       0.80      0.72      0.75     20030
weighted avg       0.84      0.85      0.84     20030



X-test score

              precision    recall  f1-score   support

           0       0.87      0.95      0.91      5263
           1       0.72      0.49      0.58      1414

    accuracy                           0.85      6677
   macro avg       0.80      0.72      0.74      6677
weighted avg       0.84      0.85      0.84      6677



In [11]:
# Gradient boosting with default hyperparameters.
# The seed is pinned (same value as the train/test split) so that repeated
# runs give the same metrics. Metrics recorded before the seed was added may
# differ slightly.
gbc = GradientBoostingClassifier(random_state=1)

# Score with the project helper; per the recorded output it reports CV score,
# test score, RMSE and train/test classification reports.
dp.scores(X_train, y_train, X_test, y_test, gbc)

CV score:     85.29%
X-test score: 85.41%
RMSE:         0.3819

Train score
              precision    recall  f1-score   support

           0       0.88      0.96      0.91     15770
           1       0.75      0.50      0.60      4260

    accuracy                           0.86     20030
   macro avg       0.81      0.73      0.76     20030
weighted avg       0.85      0.86      0.85     20030



X-test score

              precision    recall  f1-score   support

           0       0.87      0.95      0.91      5263
           1       0.74      0.48      0.58      1414

    accuracy                           0.85      6677
   macro avg       0.81      0.72      0.75      6677
weighted avg       0.84      0.85      0.84      6677



The random forest model is selected for hyperparameter tuning because it has the highest recall for class 0 (not vaccinated) on the test set (0.96). The decision tree is kept as one of the baseline models because the random forest method is built on decision trees.