# Predicting Vaccination Likelyhood

## Business Understanding

In light of their new vaccination initiative, the CDC has conducted surveys on random individuals throughout the country. Deliver a predictive binary classifier model to stakeholder (CDC) that determines if someone will take the Seasonal Flu vaccine based on responses to a phone survey. Predictions on future surveys can help assess public health risk by determining the percent of the population likely to get vaccinated.

In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score,GridSearchCV
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier
from sklearn.metrics import confusion_matrix,plot_confusion_matrix, roc_auc_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_selection import RFE
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

## Data Cleaning & Exploration

In [2]:
df_var = pd.read_csv('data/training_set_features.csv')
df_tar = pd.read_csv('data/training_set_labels.csv')['seasonal_vaccine']
df_var = df_var.drop(['respondent_id','h1n1_concern','h1n1_knowledge','opinion_h1n1_vacc_effective','opinion_h1n1_risk','opinion_h1n1_sick_from_vacc','doctor_recc_h1n1','hhs_geo_region'],axis=1)
df_var = df_var.drop(['health_insurance','income_poverty','employment_industry','employment_occupation'],axis=1)
X_train,X_test,y_train,y_test = train_test_split(df_var,df_tar,random_state=42)
X_train = X_train.copy(deep=True)

In [5]:
col_imputer = ColumnTransformer(transformers=[
    ("sim", SimpleImputer(strategy='most_frequent'), frequent_columns),

    ("sib", SimpleImputer(strategy='median'), median_columns)

    ],
    remainder="passthrough")

col_ohe = ColumnTransformer(transformers=[
    ("ohe", OneHotEncoder(categories="auto", drop='first'), ohe_cols)
    ], 
    remainder='passthrough')
# Create a pipeline containing the single column transformer

pipe1 = Pipeline(steps=[
    ('col_imputer', col_imputer)
])

imputed = pipe1.fit_transform(X_train)
X_train_pipe_impute = pd.DataFrame(imputed, columns=X_train.columns)

pipe2 = Pipeline(steps=[
    ('col_ohe', col_ohe)
])

# Use the pipeline to fit and transform the data
transformed_data = pipe2.fit_transform(X_train)

encoder = col_ohe.namedtransformers['ohe']
category_labels = encoder.get_feature_names(ohe_columns)

# Make a dataframe with the relevant columns
X_train_pipe_processed = pd.DataFrame(transformed_data, columns=list(X_train_pipe_impute.drop(ohe_columns, axis=1).columns) + list(category_labels))
X_train_pipe_processed.columns

NameError: name 'frequent_columns' is not defined

## Feature Selection

## Results

### Baseline Model - Most Frequent

In [None]:
# Dummy Classifier
dc = DummyClassifier(strategy='most_frequent',random_state=42)
dc.fit(X_train_ohe_scaled,y_train)
cv_scores = cross_val_score(dc,X_train_ohe_scaled,y_train,cv=5)
cv_scores

### Simple Model - Logistic Regression

In [None]:
# Gridsearch on Logreg Lasso Penalty

logreg_l1 = LogisticRegression()

param_grid = {
    'C': np.linspace(1e-5,1,50),
    'solver': ['liblinear','saga'],
    'penalty': ['l1']
}

gs_logreg_l1 = GridSearchCV(logreg_l1, param_grid, cv=3)
gs_logreg_l1.fit(X_train_ohe_scaled, y_train)

gs_logreg_l1.best_params_

In [None]:
# Recursive Feature Selection

cv_rfe = []
keep_lists = []
max_features = 20
for n in range(1,max_features+1):
    num_features_to_select = n
    lr_rfe = LogisticRegression(penalty='l1',random_state=42,solver='liblinear')
    select = RFE(lr_rfe, n_features_to_select=num_features_to_select)
    select.fit(X=X_train_ohe_scaled, y=y_train)
    feature_list = [(k,v) for k,v in zip(X_train_ohe_scaled.columns,select.support_)]
    current_keep_list = []
    for k,v in feature_list:
        if v:
            current_keep_list.append(k)
    
    current_cv = cross_val_score(lr_rfe,X_train_ohe[current_keep_list],y_train,cv=3,scoring='roc_auc').mean()

    cv_rfe.append(current_cv)
    keep_lists.append(current_keep_list)

In [None]:
#Plotting Mean Cross Val AUC ROC for RFE Models

fig,ax = plt.subplots(figsize=(10,10))
ax.plot(range(1,max_features+1),cv_rfe)
ax.set_xlabel('Number of Features')
ax.set_ylabel('Mean Cross Val ROC AUC Score')
plt.show()

In [None]:
# Confusion Matrix of Final Logreg Model

plot_confusion_matrix(logreg_final,X_train_ohe_scaled[keep_lists[-1]],y_train)

In [None]:
# Final Logreg Model Mean Cross Val AUC ROC Score

logreg_final = LogisticRegression(penalty='l1',random_state=42,solver='saga',C = 0.08164183673469387)
logreg_final.fit(X_train_ohe_scaled[keep_lists[-1]],y_train)
cross_val_score(lr_rfe,X_train_ohe[keep_lists[-1]],y_train,cv=5,scoring='roc_auc').mean()

### Final Model - Random Forest

## Conlclusion