# Pipeline

In [23]:
#import packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, FunctionTransformer
from sklearn.impute import SimpleImputer

In [24]:
!ls data

submission_format.csv  test_data.csv	      training_set_features.csv
test_data	       test_set_features.csv  training_set_labels.csv
test_data,csv	       training_data.csv


Load in training data for initial ETL:

In [25]:
#import training data
train = pd.read_csv("./data/training_data.csv")
train

Unnamed: 0.1,Unnamed: 0,respondent_id,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,...,rent_or_own,employment_status,hhs_geo_region,census_msa,household_adults,household_children,employment_industry,employment_occupation,h1n1_vaccine,seasonal_vaccine
0,25194,25194,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,Own,Not in Labor Force,oxchjgsf,Non-MSA,1.0,1.0,,,0,0
1,14006,14006,2.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,...,,Employed,lzgpxyit,"MSA, Not Principle City",2.0,1.0,fcxhlnwr,oijqvulv,0,1
2,11285,11285,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,Own,Employed,kbazzjca,"MSA, Principle City",0.0,1.0,wlfvacwt,hfxkjkmi,0,1
3,2900,2900,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,Own,Employed,mlyzmhmf,"MSA, Not Principle City",0.0,0.0,mcubkhph,ukymxvdu,0,0
4,19083,19083,2.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,...,,,bhuqouqj,"MSA, Not Principle City",,,,,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20025,21575,21575,2.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,Own,Not in Labor Force,qufhixun,"MSA, Principle City",0.0,0.0,,,0,1
20026,5390,5390,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,Own,Unemployed,mlyzmhmf,"MSA, Principle City",0.0,0.0,,,0,0
20027,860,860,2.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,Own,Employed,qufhixun,Non-MSA,1.0,0.0,atmlpfrs,xqwwgdyp,0,0
20028,15795,15795,2.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,...,Own,Employed,kbazzjca,"MSA, Principle City",1.0,0.0,fcxhlnwr,cmhcxjea,0,0


In [26]:
#remove unnamed:0 becuase its a duplicate of respondent_id
train = train.drop(labels="Unnamed: 0", axis=1)

In [27]:
train

Unnamed: 0,respondent_id,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,...,rent_or_own,employment_status,hhs_geo_region,census_msa,household_adults,household_children,employment_industry,employment_occupation,h1n1_vaccine,seasonal_vaccine
0,25194,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,Own,Not in Labor Force,oxchjgsf,Non-MSA,1.0,1.0,,,0,0
1,14006,2.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,,Employed,lzgpxyit,"MSA, Not Principle City",2.0,1.0,fcxhlnwr,oijqvulv,0,1
2,11285,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,Own,Employed,kbazzjca,"MSA, Principle City",0.0,1.0,wlfvacwt,hfxkjkmi,0,1
3,2900,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,Own,Employed,mlyzmhmf,"MSA, Not Principle City",0.0,0.0,mcubkhph,ukymxvdu,0,0
4,19083,2.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,...,,,bhuqouqj,"MSA, Not Principle City",,,,,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20025,21575,2.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,Own,Not in Labor Force,qufhixun,"MSA, Principle City",0.0,0.0,,,0,1
20026,5390,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,Own,Unemployed,mlyzmhmf,"MSA, Principle City",0.0,0.0,,,0,0
20027,860,2.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,Own,Employed,qufhixun,Non-MSA,1.0,0.0,atmlpfrs,xqwwgdyp,0,0
20028,15795,2.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,Own,Employed,kbazzjca,"MSA, Principle City",1.0,0.0,fcxhlnwr,cmhcxjea,0,0


In [28]:
x_train = train.drop(labels=["respondent_id","h1n1_vaccine","seasonal_vaccine"], axis=1)
y_train = train["seasonal_vaccine"]

In [29]:
x_train

Unnamed: 0,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,doctor_recc_h1n1,...,income_poverty,marital_status,rent_or_own,employment_status,hhs_geo_region,census_msa,household_adults,household_children,employment_industry,employment_occupation
0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,...,,Not Married,Own,Not in Labor Force,oxchjgsf,Non-MSA,1.0,1.0,,
1,2.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,,Married,,Employed,lzgpxyit,"MSA, Not Principle City",2.0,1.0,fcxhlnwr,oijqvulv
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,"<= $75,000, Above Poverty",Not Married,Own,Employed,kbazzjca,"MSA, Principle City",0.0,1.0,wlfvacwt,hfxkjkmi
3,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,Below Poverty,Not Married,Own,Employed,mlyzmhmf,"MSA, Not Principle City",0.0,0.0,mcubkhph,ukymxvdu
4,2.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,...,,,,,bhuqouqj,"MSA, Not Principle City",,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20025,2.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,"> $75,000",Not Married,Own,Not in Labor Force,qufhixun,"MSA, Principle City",0.0,0.0,,
20026,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,,...,"<= $75,000, Above Poverty",Not Married,Own,Unemployed,mlyzmhmf,"MSA, Principle City",0.0,0.0,,
20027,2.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,"<= $75,000, Above Poverty",Married,Own,Employed,qufhixun,Non-MSA,1.0,0.0,atmlpfrs,xqwwgdyp
20028,2.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,...,"> $75,000",Married,Own,Employed,kbazzjca,"MSA, Principle City",1.0,0.0,fcxhlnwr,cmhcxjea


In [30]:
y_train

0        0
1        1
2        1
3        0
4        1
        ..
20025    1
20026    0
20027    0
20028    0
20029    0
Name: seasonal_vaccine, Length: 20030, dtype: int64

Begin Pipeline Construction

In [31]:
x_train.columns

Index(['h1n1_concern', 'h1n1_knowledge', 'behavioral_antiviral_meds',
       'behavioral_avoidance', 'behavioral_face_mask', 'behavioral_wash_hands',
       'behavioral_large_gatherings', 'behavioral_outside_home',
       'behavioral_touch_face', 'doctor_recc_h1n1', 'doctor_recc_seasonal',
       'chronic_med_condition', 'child_under_6_months', 'health_worker',
       'health_insurance', 'opinion_h1n1_vacc_effective', 'opinion_h1n1_risk',
       'opinion_h1n1_sick_from_vacc', 'opinion_seas_vacc_effective',
       'opinion_seas_risk', 'opinion_seas_sick_from_vacc', 'age_group',
       'education', 'race', 'sex', 'income_poverty', 'marital_status',
       'rent_or_own', 'employment_status', 'hhs_geo_region', 'census_msa',
       'household_adults', 'household_children', 'employment_industry',
       'employment_occupation'],
      dtype='object')

In [32]:
x_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20030 entries, 0 to 20029
Data columns (total 35 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   h1n1_concern                 19963 non-null  float64
 1   h1n1_knowledge               19943 non-null  float64
 2   behavioral_antiviral_meds    19974 non-null  float64
 3   behavioral_avoidance         19873 non-null  float64
 4   behavioral_face_mask         20016 non-null  float64
 5   behavioral_wash_hands        19994 non-null  float64
 6   behavioral_large_gatherings  19960 non-null  float64
 7   behavioral_outside_home      19972 non-null  float64
 8   behavioral_touch_face        19932 non-null  float64
 9   doctor_recc_h1n1             18395 non-null  float64
 10  doctor_recc_seasonal         18395 non-null  float64
 11  chronic_med_condition        19313 non-null  float64
 12  child_under_6_months         19425 non-null  float64
 13  health_worker   

In [33]:
#labels

#already encoded in survey responses ordinally, values generally range from 0 to 5
ordinal_feat = ["h1n1_concern","h1n1_knowledge",'opinion_h1n1_vacc_effective','opinion_h1n1_risk',
                'opinion_h1n1_sick_from_vacc','opinion_seas_vacc_effective','opinion_seas_risk',
                'opinion_seas_sick_from_vacc','household_adults','household_children']
#features that are ordinal in nature, but are recorded as strings
ordinal_e_feat = ["age_group", "education"]
#categorical and binary features recorded in the dataset. Nulls represent "prefer not to answer"- 
#-type responses in compliance with human subjects research ethics requirements. Thus, all binary 
#features are now categorical to account for trends in refusal behavior in the model.
cat_feat = ['behavioral_antiviral_meds', 'behavioral_avoidance',
           'behavioral_face_mask', 'behavioral_wash_hands',
           'behavioral_large_gatherings', 'behavioral_outside_home',
           'behavioral_touch_face', 'doctor_recc_h1n1', 'doctor_recc_seasonal',
           'chronic_med_condition', 'child_under_6_months', 'health_worker',
           'health_insurance', 'race', 'sex', 'income_poverty', 'marital_status',
           'rent_or_own', 'employment_status', 'hhs_geo_region', 'census_msa', 'employment_industry',
           'employment_occupation']

In [34]:
#subpipelines
ordinal_impute = Pipeline(steps=[('impute', SimpleImputer(strategy="most_frequent"))])
ordinal_encoding = Pipeline(steps=[('impute', SimpleImputer(strategy="most_frequent")),
                                  ("ord_encode", OrdinalEncoder())
                                  ])
#mapping binary values to yes/no to resolve errors with one_hot_encoder
def mapper(data):
    data = data.applymap(lambda x: "yes" if (x==1) else x or "no" if (x==0) else x)
    data.astype(str)
    return data
mappy = FunctionTransformer(func=mapper)    
#map_binary = Pipeline(steps=[("map_bin", mappy)])

one_hot_encoder = Pipeline(steps=[("map_bin", mappy),
                                  ('impute', SimpleImputer(strategy="constant",fill_value="no_response")),
                                  ("ohe", OneHotEncoder(sparse=False, handle_unknown="ignore"))
                                 ])


In [35]:
#columntransformer
col_tr = ColumnTransformer(transformers=[#("map", map_binary, cat_feat),
                                         ("ord_imp", ordinal_impute, ordinal_feat), 
                                         ("ord_enc", ordinal_encoding, ordinal_e_feat),
                                         ("imp_ohe", one_hot_encoder, cat_feat)],
                          sparse_threshold=0)

In [36]:
col_tr.fit_transform(x_train)

array([[1., 1., 4., ..., 0., 0., 0.],
       [2., 1., 3., ..., 0., 0., 0.],
       [0., 0., 4., ..., 0., 0., 0.],
       ...,
       [2., 1., 4., ..., 1., 0., 0.],
       [2., 1., 1., ..., 0., 0., 0.],
       [3., 1., 4., ..., 0., 0., 0.]])

### Construct Initial Model

Initial model will be a voting classifier composed of a random forest estimator and a naive bayes estimator. A grid search will then be performed to find the best hyperparameters for each base estimator as well as the overall weights of the voting classifier. 

In [37]:
#import estimators
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import CategoricalNB
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import roc_auc_score, plot_confusion_matrix, plot_roc_curve, classification_report

In [38]:
#instantiate base estimators
rfc = RandomForestClassifier()
cat_nb = CategoricalNB()

In [39]:
#instantiate meta classifier
vc = VotingClassifier(estimators=[("rfc", rfc),
                                 ("cat_nb", cat_nb)])

In [40]:
initial_model = Pipeline(steps=[("col_tr", col_tr),
                   ("vc", vc)])
initial_model.fit(x_train,y_train)

Pipeline(steps=[('col_tr',
                 ColumnTransformer(sparse_threshold=0,
                                   transformers=[('ord_imp',
                                                  Pipeline(steps=[('impute',
                                                                   SimpleImputer(strategy='most_frequent'))]),
                                                  ['h1n1_concern',
                                                   'h1n1_knowledge',
                                                   'opinion_h1n1_vacc_effective',
                                                   'opinion_h1n1_risk',
                                                   'opinion_h1n1_sick_from_vacc',
                                                   'opinion_seas_vacc_effective',
                                                   'opinion_seas_risk',
                                                   'opinion_seas_sick_from_vacc',...
                                                   'doctor

In [45]:
cross_val_score(estimator=initial_model, X=x_train,y=y_train)

array([0.76410384, 0.74962556, 0.76210684, 0.74987519, 0.76135796])

In [46]:
grid = {"vc__rfc__n_estimators": [10,100,500], 
        "vc__rfc__max_depth": [10, 50, 100],
        "vc__cat_nb__alpha": [1,5,25,125],
        "vc__cat_nb__fit_prior": [True,False],
        "vc__voting": ['hard', 'soft']}
        #"vc__wieghts": [[.25, .75], [.75, .25], [.5, .5]]}

In [47]:
gs= GridSearchCV(initial_model, grid, verbose=3, n_jobs=-4)

In [48]:
gs.fit(x_train, y_train)

Fitting 5 folds for each of 144 candidates, totalling 720 fits


[Parallel(n_jobs=-4)]: Using backend LokyBackend with 13 concurrent workers.
[Parallel(n_jobs=-4)]: Done   6 tasks      | elapsed:    3.0s
[Parallel(n_jobs=-4)]: Done 102 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-4)]: Done 262 tasks      | elapsed:  3.1min
[Parallel(n_jobs=-4)]: Done 486 tasks      | elapsed:  5.9min
[Parallel(n_jobs=-4)]: Done 720 out of 720 | elapsed:  9.0min finished


GridSearchCV(estimator=Pipeline(steps=[('col_tr',
                                        ColumnTransformer(sparse_threshold=0,
                                                          transformers=[('ord_imp',
                                                                         Pipeline(steps=[('impute',
                                                                                          SimpleImputer(strategy='most_frequent'))]),
                                                                         ['h1n1_concern',
                                                                          'h1n1_knowledge',
                                                                          'opinion_h1n1_vacc_effective',
                                                                          'opinion_h1n1_risk',
                                                                          'opinion_h1n1_sick_from_vacc',
                                                                   

In [70]:
gs.best_estimator_

AttributeError: 'GridSearchCV' object has no attribute 'best_estimator_'

In [71]:
gs.best

AttributeError: 'GridSearchCV' object has no attribute 'best'

In [64]:
grid_2 = {"vc__rfc__n_estimators": [500, 600, 700], 
        "vc__rfc__max_depth": [40, 50, 60],
        "vc__cat_nb__alpha": [1,2,3],
        "vc__cat_nb__fit_prior": [True, False],
        "vc__weights": [[.25, .75], [.75, .25], [.5, .5]]}

In [65]:
gs_2= GridSearchCV(initial_model, grid_2, verbose=3, n_jobs=-4)

In [66]:
initial_model.get_params().keys()

dict_keys(['memory', 'steps', 'verbose', 'col_tr', 'vc', 'col_tr__n_jobs', 'col_tr__remainder', 'col_tr__sparse_threshold', 'col_tr__transformer_weights', 'col_tr__transformers', 'col_tr__verbose', 'col_tr__ord_imp', 'col_tr__ord_enc', 'col_tr__imp_ohe', 'col_tr__ord_imp__memory', 'col_tr__ord_imp__steps', 'col_tr__ord_imp__verbose', 'col_tr__ord_imp__impute', 'col_tr__ord_imp__impute__add_indicator', 'col_tr__ord_imp__impute__copy', 'col_tr__ord_imp__impute__fill_value', 'col_tr__ord_imp__impute__missing_values', 'col_tr__ord_imp__impute__strategy', 'col_tr__ord_imp__impute__verbose', 'col_tr__ord_enc__memory', 'col_tr__ord_enc__steps', 'col_tr__ord_enc__verbose', 'col_tr__ord_enc__impute', 'col_tr__ord_enc__ord_encode', 'col_tr__ord_enc__impute__add_indicator', 'col_tr__ord_enc__impute__copy', 'col_tr__ord_enc__impute__fill_value', 'col_tr__ord_enc__impute__missing_values', 'col_tr__ord_enc__impute__strategy', 'col_tr__ord_enc__impute__verbose', 'col_tr__ord_enc__ord_encode__categori

In [67]:
gs_2.fit(x_train, y_train)

Fitting 5 folds for each of 162 candidates, totalling 810 fits


[Parallel(n_jobs=-4)]: Using backend LokyBackend with 13 concurrent workers.
[Parallel(n_jobs=-4)]: Done   6 tasks      | elapsed:   23.2s
[Parallel(n_jobs=-4)]: Done 102 tasks      | elapsed:  3.7min
[Parallel(n_jobs=-4)]: Done 262 tasks      | elapsed:  9.7min
[Parallel(n_jobs=-4)]: Done 486 tasks      | elapsed: 17.9min
[Parallel(n_jobs=-4)]: Done 774 tasks      | elapsed: 28.6min
[Parallel(n_jobs=-4)]: Done 810 out of 810 | elapsed: 29.8min finished


GridSearchCV(estimator=Pipeline(steps=[('col_tr',
                                        ColumnTransformer(sparse_threshold=0,
                                                          transformers=[('ord_imp',
                                                                         Pipeline(steps=[('impute',
                                                                                          SimpleImputer(strategy='most_frequent'))]),
                                                                         ['h1n1_concern',
                                                                          'h1n1_knowledge',
                                                                          'opinion_h1n1_vacc_effective',
                                                                          'opinion_h1n1_risk',
                                                                          'opinion_h1n1_sick_from_vacc',
                                                                   

In [68]:
gs_2.best_estimator_

Pipeline(steps=[('col_tr',
                 ColumnTransformer(sparse_threshold=0,
                                   transformers=[('ord_imp',
                                                  Pipeline(steps=[('impute',
                                                                   SimpleImputer(strategy='most_frequent'))]),
                                                  ['h1n1_concern',
                                                   'h1n1_knowledge',
                                                   'opinion_h1n1_vacc_effective',
                                                   'opinion_h1n1_risk',
                                                   'opinion_h1n1_sick_from_vacc',
                                                   'opinion_seas_vacc_effective',
                                                   'opinion_seas_risk',
                                                   'opinion_seas_sick_from_vacc',...
                                                   'health

In [69]:
gs_2.best_params_

{'vc__cat_nb__alpha': 2,
 'vc__cat_nb__fit_prior': True,
 'vc__rfc__max_depth': 60,
 'vc__rfc__n_estimators': 600,
 'vc__weights': [0.75, 0.25]}

In [74]:
gs_2.score_

AttributeError: 'GridSearchCV' object has no attribute 'score_'