# Pipeline

In [1]:
#import packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer

In [2]:
# Shell escape: list the contents of the local `data` directory.
!ls data

submission_format.csv
test_data.csv
test_set_features.csv
training_data.csv
training_set_features.csv
training_set_labels.csv


Load in training data for initial ETL:

In [3]:
# Read the training CSV from the local data directory, then display the frame.
train = pd.read_csv(filepath_or_buffer="./data/training_data.csv")
train

Unnamed: 0.1,Unnamed: 0,respondent_id,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,...,rent_or_own,employment_status,hhs_geo_region,census_msa,household_adults,household_children,employment_industry,employment_occupation,h1n1_vaccine,seasonal_vaccine
0,25194,25194,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,Own,Not in Labor Force,oxchjgsf,Non-MSA,1.0,1.0,,,0,0
1,14006,14006,2.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,...,,Employed,lzgpxyit,"MSA, Not Principle City",2.0,1.0,fcxhlnwr,oijqvulv,0,1
2,11285,11285,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,Own,Employed,kbazzjca,"MSA, Principle City",0.0,1.0,wlfvacwt,hfxkjkmi,0,1
3,2900,2900,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,Own,Employed,mlyzmhmf,"MSA, Not Principle City",0.0,0.0,mcubkhph,ukymxvdu,0,0
4,19083,19083,2.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,...,,,bhuqouqj,"MSA, Not Principle City",,,,,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20025,21575,21575,2.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,Own,Not in Labor Force,qufhixun,"MSA, Principle City",0.0,0.0,,,0,1
20026,5390,5390,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,Own,Unemployed,mlyzmhmf,"MSA, Principle City",0.0,0.0,,,0,0
20027,860,860,2.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,Own,Employed,qufhixun,Non-MSA,1.0,0.0,atmlpfrs,xqwwgdyp,0,0
20028,15795,15795,2.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,...,Own,Employed,kbazzjca,"MSA, Principle City",1.0,0.0,fcxhlnwr,cmhcxjea,0,0


In [5]:
# Remove "Unnamed: 0" because it is a duplicate of respondent_id.
# errors="ignore" keeps this cell idempotent: re-running it after the column is
# already gone is a no-op instead of raising KeyError.
train = train.drop(columns="Unnamed: 0", errors="ignore")

In [6]:
# Display the frame again after dropping the duplicate column.
train

Unnamed: 0,respondent_id,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,...,rent_or_own,employment_status,hhs_geo_region,census_msa,household_adults,household_children,employment_industry,employment_occupation,h1n1_vaccine,seasonal_vaccine
0,25194,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,Own,Not in Labor Force,oxchjgsf,Non-MSA,1.0,1.0,,,0,0
1,14006,2.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,,Employed,lzgpxyit,"MSA, Not Principle City",2.0,1.0,fcxhlnwr,oijqvulv,0,1
2,11285,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,Own,Employed,kbazzjca,"MSA, Principle City",0.0,1.0,wlfvacwt,hfxkjkmi,0,1
3,2900,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,Own,Employed,mlyzmhmf,"MSA, Not Principle City",0.0,0.0,mcubkhph,ukymxvdu,0,0
4,19083,2.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,...,,,bhuqouqj,"MSA, Not Principle City",,,,,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20025,21575,2.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,Own,Not in Labor Force,qufhixun,"MSA, Principle City",0.0,0.0,,,0,1
20026,5390,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,Own,Unemployed,mlyzmhmf,"MSA, Principle City",0.0,0.0,,,0,0
20027,860,2.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,Own,Employed,qufhixun,Non-MSA,1.0,0.0,atmlpfrs,xqwwgdyp,0,0
20028,15795,2.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,Own,Employed,kbazzjca,"MSA, Principle City",1.0,0.0,fcxhlnwr,cmhcxjea,0,0


In [7]:
# Split the frame: every column except the two vaccine targets becomes the
# feature matrix; the two target columns become the label frame.
x_train = train.drop(columns=["h1n1_vaccine", "seasonal_vaccine"])
y_train = train.loc[:, ["h1n1_vaccine", "seasonal_vaccine"]]

In [8]:
# Display the feature frame (target columns removed).
x_train

Unnamed: 0,respondent_id,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,...,income_poverty,marital_status,rent_or_own,employment_status,hhs_geo_region,census_msa,household_adults,household_children,employment_industry,employment_occupation
0,25194,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,Not Married,Own,Not in Labor Force,oxchjgsf,Non-MSA,1.0,1.0,,
1,14006,2.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,,Married,,Employed,lzgpxyit,"MSA, Not Principle City",2.0,1.0,fcxhlnwr,oijqvulv
2,11285,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,"<= $75,000, Above Poverty",Not Married,Own,Employed,kbazzjca,"MSA, Principle City",0.0,1.0,wlfvacwt,hfxkjkmi
3,2900,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,Below Poverty,Not Married,Own,Employed,mlyzmhmf,"MSA, Not Principle City",0.0,0.0,mcubkhph,ukymxvdu
4,19083,2.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,...,,,,,bhuqouqj,"MSA, Not Principle City",,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20025,21575,2.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,"> $75,000",Not Married,Own,Not in Labor Force,qufhixun,"MSA, Principle City",0.0,0.0,,
20026,5390,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,"<= $75,000, Above Poverty",Not Married,Own,Unemployed,mlyzmhmf,"MSA, Principle City",0.0,0.0,,
20027,860,2.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,"<= $75,000, Above Poverty",Married,Own,Employed,qufhixun,Non-MSA,1.0,0.0,atmlpfrs,xqwwgdyp
20028,15795,2.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,"> $75,000",Married,Own,Employed,kbazzjca,"MSA, Principle City",1.0,0.0,fcxhlnwr,cmhcxjea


In [9]:
# Display the label frame (h1n1_vaccine and seasonal_vaccine only).
y_train

Unnamed: 0,h1n1_vaccine,seasonal_vaccine
0,0,0
1,0,1
2,0,1
3,0,0
4,0,1
...,...,...
20025,0,1
20026,0,0
20027,0,0
20028,0,0


Begin Pipeline Construction

In [13]:
# List the feature column names so they can be grouped by preprocessing type.
x_train.columns

Index(['respondent_id', 'h1n1_concern', 'h1n1_knowledge',
       'behavioral_antiviral_meds', 'behavioral_avoidance',
       'behavioral_face_mask', 'behavioral_wash_hands',
       'behavioral_large_gatherings', 'behavioral_outside_home',
       'behavioral_touch_face', 'doctor_recc_h1n1', 'doctor_recc_seasonal',
       'chronic_med_condition', 'child_under_6_months', 'health_worker',
       'health_insurance', 'opinion_h1n1_vacc_effective', 'opinion_h1n1_risk',
       'opinion_h1n1_sick_from_vacc', 'opinion_seas_vacc_effective',
       'opinion_seas_risk', 'opinion_seas_sick_from_vacc', 'age_group',
       'education', 'race', 'sex', 'income_poverty', 'marital_status',
       'rent_or_own', 'employment_status', 'hhs_geo_region', 'census_msa',
       'household_adults', 'household_children', 'employment_industry',
       'employment_occupation'],
      dtype='object')

In [14]:
# Feature labels, grouped by the preprocessing each group needs.

# Survey items already recorded as ordinal numbers (values generally range from 0 to 5).
ordinal_feat = [
    "h1n1_concern",
    "h1n1_knowledge",
    "opinion_h1n1_vacc_effective",
    "opinion_h1n1_risk",
    "opinion_h1n1_sick_from_vacc",
    "opinion_seas_vacc_effective",
    "opinion_seas_risk",
    "opinion_seas_sick_from_vacc",
    "household_adults",
    "household_children",
]

# Ordinal by nature, but stored as strings.
ordinal_e_feat = [
    "age_group",
    "education",
]

# Categorical and binary features. Nulls stand for "prefer not to answer"-type
# responses (in compliance with human subjects research ethics requirements),
# so binary features are treated as categorical to let the model account for
# trends in refusal behavior.
cat_feat = [
    "behavioral_antiviral_meds",
    "behavioral_avoidance",
    "behavioral_face_mask",
    "behavioral_wash_hands",
    "behavioral_large_gatherings",
    "behavioral_outside_home",
    "behavioral_touch_face",
    "doctor_recc_h1n1",
    "doctor_recc_seasonal",
    "chronic_med_condition",
    "child_under_6_months",
    "health_worker",
    "health_insurance",
    "race",
    "sex",
    "income_poverty",
    "marital_status",
    "rent_or_own",
    "employment_status",
    "hhs_geo_region",
    "census_msa",
    "employment_industry",
    "employment_occupation",
]

In [20]:
# Sub-pipelines: one preprocessing recipe per feature group.

# Numeric ordinal features: fill missing values with the column median.
ordinal_impute = Pipeline(steps=[("impute", SimpleImputer(strategy="median"))])

# String-valued ordinal features: the "median" strategy is only valid for
# numeric data, so missing values are filled with the most frequent category
# before encoding.
# NOTE(review): OrdinalEncoder() with its default categories="auto" numbers the
# categories in sorted order of the observed values, which may not match the
# intended ranking -- TODO pass an explicit `categories=` ordering for
# age_group and education.
ordinal_encoding = Pipeline(steps=[
    ("impute", SimpleImputer(strategy="most_frequent")),
    ("ord_encode", OrdinalEncoder()),
])

# Categorical/binary features: a missing answer becomes its own "no_response"
# level, then every level is one-hot encoded.
# NOTE(review): if this pipeline receives numeric 0/1 columns, the string fill
# value leaves those columns with mixed float/str values, which OneHotEncoder
# may reject -- TODO confirm on fit, and cast to str or split the numeric
# columns into their own pipeline if so.
one_hot_encoder = Pipeline(steps=[
    ("impute", SimpleImputer(strategy="constant", fill_value="no_response")),
    ("ohe", OneHotEncoder()),
])



In [21]:
# ColumnTransformer: route each feature group through its sub-pipeline.
# Every entry in `transformers` must be a (name, transformer, columns) 3-tuple.
# Columns that appear in none of the groups (e.g. respondent_id) are dropped by
# the default remainder="drop".
# sparse_threshold=0 makes the combined output always a dense array.
col_tr = ColumnTransformer(
    transformers=[
        ("ord_imp", ordinal_impute, ordinal_feat),
        ("ord_enc", ordinal_encoding, ordinal_e_feat),
        ("imp_ohe", one_hot_encoder, cat_feat),
    ],
    sparse_threshold=0,
)