<a href="https://colab.research.google.com/github/dln277/portfolio/blob/main/Flu_Shot_Learning_Predict_H1N1_and_Seasonal_Flu_Vaccines.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Objective**: To predict how likely individuals are to receive their H1N1 and seasonal flu vaccines. Specifically, you'll be predicting two probabilities: one for h1n1_vaccine and one for seasonal_vaccine.



**1. Gather Data**

In [36]:
#Load data and join dataframes 

import numpy as np
import pandas as pd 

X_train = pd.read_csv('training_set_features.csv')
y_train = pd.read_csv('training_set_labels.csv')
X_test = pd.read_csv('test_set_features.csv')

In [37]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26707 entries, 0 to 26706
Data columns (total 36 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   respondent_id                26707 non-null  int64  
 1   h1n1_concern                 26615 non-null  float64
 2   h1n1_knowledge               26591 non-null  float64
 3   behavioral_antiviral_meds    26636 non-null  float64
 4   behavioral_avoidance         26499 non-null  float64
 5   behavioral_face_mask         26688 non-null  float64
 6   behavioral_wash_hands        26665 non-null  float64
 7   behavioral_large_gatherings  26620 non-null  float64
 8   behavioral_outside_home      26625 non-null  float64
 9   behavioral_touch_face        26579 non-null  float64
 10  doctor_recc_h1n1             24547 non-null  float64
 11  doctor_recc_seasonal         24547 non-null  float64
 12  chronic_med_condition        25736 non-null  float64
 13  child_under_6_mo

**2. Preprocessing, Imputing Missing Values and Dropping Unncessary Variables**

In [38]:
#Dropping geo region and employment info

X_train = X_train.drop(columns=['hhs_geo_region','employment_industry','employment_occupation'])

In [39]:
#Default title text
#Dropping rows with more than 4 missing values 

X_train = X_train.dropna(thresh=X_train.shape[1]-4,axis=0)
X_train = X_train.reset_index(drop=True)

In [40]:
merged_train = X_train.merge(y_train, how='inner',on='respondent_id')

X_train = merged_train.iloc[:,:-2]
y_train = merged_train.iloc[:,-2:]

In [41]:
#Imputing missing values for numerical variables
#Impute with median value 
import numpy as np

X_train[['h1n1_concern','h1n1_knowledge','opinion_h1n1_vacc_effective','opinion_h1n1_risk','opinion_h1n1_sick_from_vacc','opinion_seas_vacc_effective',
         'opinion_seas_risk','opinion_seas_sick_from_vacc','household_adults','household_children']]= X_train[['h1n1_concern','h1n1_knowledge','opinion_h1n1_vacc_effective','opinion_h1n1_risk',
         'opinion_h1n1_sick_from_vacc','opinion_seas_vacc_effective','opinion_seas_risk','opinion_seas_sick_from_vacc','household_adults','household_children']].replace(np.nan,X_train[['h1n1_concern','h1n1_knowledge','opinion_h1n1_vacc_effective','opinion_h1n1_risk','opinion_h1n1_sick_from_vacc','opinion_seas_vacc_effective','opinion_seas_risk','opinion_seas_sick_from_vacc','household_adults','household_children']].median())
X_test[['h1n1_concern','h1n1_knowledge','opinion_h1n1_vacc_effective','opinion_h1n1_risk','opinion_h1n1_sick_from_vacc','opinion_seas_vacc_effective','opinion_seas_risk','opinion_seas_sick_from_vacc','household_adults','household_children']]= X_test[['h1n1_concern','h1n1_knowledge','opinion_h1n1_vacc_effective','opinion_h1n1_risk','opinion_h1n1_sick_from_vacc','opinion_seas_vacc_effective','opinion_seas_risk','opinion_seas_sick_from_vacc','household_adults','household_children']].replace(np.nan,X_test[['h1n1_concern','h1n1_knowledge','opinion_h1n1_vacc_effective','opinion_h1n1_risk','opinion_h1n1_sick_from_vacc','opinion_seas_vacc_effective','opinion_seas_risk','opinion_seas_sick_from_vacc','household_adults','household_children']].median())

In [42]:
#Impute binary variables with 0 with assumption that no response = 0 

X_train[['behavioral_antiviral_meds','behavioral_avoidance','behavioral_face_mask','behavioral_wash_hands','behavioral_large_gatherings','behavioral_outside_home',
         'behavioral_touch_face','doctor_recc_h1n1','doctor_recc_seasonal','chronic_med_condition','child_under_6_months','health_worker','health_insurance']]=X_train[['behavioral_antiviral_meds','behavioral_avoidance','behavioral_face_mask','behavioral_wash_hands','behavioral_large_gatherings','behavioral_outside_home',
         'behavioral_touch_face','doctor_recc_h1n1','doctor_recc_seasonal','chronic_med_condition','child_under_6_months','health_worker','health_insurance']].replace(np.nan,0)
X_test[['behavioral_antiviral_meds','behavioral_avoidance','behavioral_face_mask','behavioral_wash_hands','behavioral_large_gatherings','behavioral_outside_home',
         'behavioral_touch_face','doctor_recc_h1n1','doctor_recc_seasonal','chronic_med_condition','child_under_6_months','health_worker','health_insurance']]=X_test[['behavioral_antiviral_meds','behavioral_avoidance','behavioral_face_mask','behavioral_wash_hands','behavioral_large_gatherings','behavioral_outside_home',
         'behavioral_touch_face','doctor_recc_h1n1','doctor_recc_seasonal','chronic_med_condition','child_under_6_months','health_worker','health_insurance']].replace(np.nan,0)

In [43]:
#Imputing categorical binary variables with OneHotEncoder
#Place NaN as new variable

from sklearn.preprocessing import OneHotEncoder
onehot = OneHotEncoder(dtype=np.int,sparse=True)

X_train['marital_status'] = X_train['marital_status'].replace(np.nan,'Other Marial Status')
X_train['rent_or_own'] = X_train['rent_or_own'].replace(np.nan,'Other Home')
X_train['census_msa'] = X_train['census_msa'].replace(np.nan,'Other MSA Status')


bin_cat = pd.DataFrame(
    onehot.fit_transform(X_train[['race','sex','marital_status','rent_or_own','census_msa']]).toarray(),
    columns=['Black','Hispanic','Other or Multiple','White','Female','Male',
             'Married','Not Married','Other Marital Status','Other Home','Own','Rent','MSA','Other MSA Census','Non-MSA'])

X_train=X_train.drop(['race','sex','marital_status','rent_or_own','census_msa'],axis=1).join(bin_cat,how='outer')

In [44]:
#Impute ordinal categorical values  
#Place NaN as new values 

X_train['education'] = X_train['education'].fillna('Higher Education')
X_train['income_poverty'] = X_train['income_poverty'].fillna('Below Poverty')
X_train['employment_status'] = X_train['employment_status'].fillna('Not in Labor Force')


cat_age = pd.Categorical(X_train.age_group,
                     categories=['18 - 34 Years','35 - 44 Years','45 - 54 Years','55 - 64 Years','65+ Years'],
                                 ordered=True)
cat_edu = pd.Categorical(X_train.education,
                         categories=['<12 Years','12 Years','Some College','College Graduate','Higher Education'],
                         ordered=True)
cat_inc = pd.Categorical(X_train.income_poverty,
                         categories=['Below Poverty','<= $75,000, Above Poverty','> $75,000'],
                         ordered=True)
cat_emp = pd.Categorical(X_train.employment_status,
                         categories=['Not in Labor Force','Unemployed','Employed'],
                         ordered=True)

age_labels, unique = pd.factorize(cat_age, sort=True)
edu_labels, unique = pd.factorize(cat_edu, sort=True)
inc_labels, unique = pd.factorize(cat_inc, sort=True)
emp_labels, unique = pd.factorize(cat_emp, sort=True)

X_train.age_group = age_labels
X_train.education = edu_labels
X_train.income_poverty = inc_labels
X_train.employment_status = emp_labels


In [45]:
X_train = X_train.drop(columns=['respondent_id'])
X_test = X_test.drop(columns=['respondent_id'])

**4.Train & Fit Models**

In [46]:
#KNN Classifier 
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()
knn.fit(X_train,y_train)
knn_acc = round(knn.score(X_train,y_train)*100,2)

#Decision Tree
from sklearn.tree import DecisionTreeClassifier
tree = DecisionTreeClassifier(random_state=0)
tree.fit(X_train,y_train)
tree_acc = round(tree.score(X_train,y_train)*100,2)

#Random Forest 
from sklearn.ensemble import RandomForestClassifier
forest = RandomForestClassifier(n_estimators = 100)
forest.fit(X_train,y_train)
forest_acc = round(forest.score(X_train,y_train)*100,2)

**4. Evaluation**

In [47]:
#Evaluate with Accuracy

results = pd.DataFrame({
    'Model': ['KNN Classifier','Decision Tree','Random Forest'],
    'Score': [knn_acc,tree_acc,forest_acc]})
result_df = results.sort_values(by='Score', ascending=False)
result_df = result_df.set_index('Score')
result_df.head

<bound method NDFrame.head of                 Model
Score                
99.99   Decision Tree
99.99   Random Forest
71.72  KNN Classifier>

Decision Tree and Random Forest perform best for both predictions, likely due to overfitting --> Examine further with cross validation

In [48]:
#Cross Validation

from sklearn.model_selection import cross_val_score 

scores_knn = cross_val_score(knn,X_train,y_train,cv=10,scoring='accuracy')
scores_tree = cross_val_score(tree,X_train,y_train,cv=10,scoring='accuracy')
scores_forest = cross_val_score(forest,X_train,y_train,cv=10,scoring='accuracy')

cross_val = pd.DataFrame({
        'Model': ['KNN Classifier','Decision Tree','Random Forest'],
        'Score': [scores_knn,scores_tree,scores_forest],
        'Mean': [scores_knn.mean(),scores_tree.mean(),scores_forest.mean()],
        'Standard Deviation': [scores_knn.std(),scores_tree.std(),scores_forest.std()]})

cross_val_df = cross_val.sort_values(by='Mean',ascending=False)
cross_val_df = cross_val_df.set_index('Mean')

cross_val_df.head()

Unnamed: 0_level_0,Model,Score,Standard Deviation
Mean,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.677072,Random Forest,"[0.670481452249408, 0.6748224151539068, 0.6696...",0.007
0.618982,KNN Classifier,"[0.6148382004735596, 0.6136543014996053, 0.616...",0.006068
0.55221,Decision Tree,"[0.5473559589581689, 0.5303867403314917, 0.544...",0.011457


In [49]:
#Understand which features are the most important (remove those that are not to improve performance)

importances = pd.DataFrame({'feature':X_train.columns,'importance':np.round(forest.feature_importances_,3)})
importances = importances.sort_values('importance',ascending=False).set_index('feature')
importances.head

<bound method NDFrame.head of                              importance
feature                                
opinion_seas_risk                 0.079
opinion_seas_vacc_effective       0.067
age_group                         0.055
doctor_recc_seasonal              0.054
opinion_h1n1_risk                 0.053
doctor_recc_h1n1                  0.052
opinion_h1n1_vacc_effective       0.046
health_insurance                  0.036
education                         0.036
opinion_seas_sick_from_vacc       0.035
opinion_h1n1_sick_from_vacc       0.035
h1n1_concern                      0.034
income_poverty                    0.028
household_adults                  0.027
h1n1_knowledge                    0.026
household_children                0.024
employment_status                 0.021
chronic_med_condition             0.018
health_worker                     0.018
MSA                               0.017
behavioral_touch_face             0.016
behavioral_outside_home           0.016
behavioral

In [55]:
#Drop unimportant features and retrain the models 

X_train = X_train.filter(['opinion_seas_risk','opinion_seas_vacc_effective','age_group','opinion_h1n1_risk',
                          'doctor_recc_seasonal','doctor_recc_h1n1','opinion_h1n1_vacc_effective'],
                       axis=1)
X_test = X_test.filter(['opinion_seas_risk','opinion_seas_vacc_effective','age_group','opinion_h1n1_risk',
                          'doctor_recc_seasonal','doctor_recc_h1n1','opinion_h1n1_vacc_effective'],
                       axis=1)

#Transform relevant columns in X_test accordingly
cat_age_test = pd.Categorical(X_test.age_group,
                     categories=['18 - 34 Years','35 - 44 Years','45 - 54 Years','55 - 64 Years','65+ Years'],
                                 ordered=True)
age_labels_test, unique = pd.factorize(cat_age_test, sort=True)
X_test.age_group = age_labels_test

In [51]:
#KNN Classifier 
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()
knn.fit(X_train,y_train)
knn_acc = round(knn.score(X_train,y_train)*100,2)

#Decision Tree
from sklearn.tree import DecisionTreeClassifier
tree = DecisionTreeClassifier(random_state=0)
tree.fit(X_train,y_train)
tree_acc = round(tree.score(X_train,y_train)*100,2)

#Random Forest 
from sklearn.ensemble import RandomForestClassifier
forest = RandomForestClassifier(n_estimators = 100)
forest.fit(X_train,y_train)
forest_acc = round(forest.score(X_train,y_train)*100,2)

#Evaluate with Accuracy

results = pd.DataFrame({
    'Model': ['KNN Classifier','Decision Tree','Random Forest'],
    'Score': [knn_acc,tree_acc,forest_acc]})
result_df = results.sort_values(by='Score', ascending=False)
result_df = result_df.set_index('Score')
result_df.head

<bound method NDFrame.head of                 Model
Score                
70.30   Random Forest
70.18   Decision Tree
65.96  KNN Classifier>

In [52]:
#Cross Validation

from sklearn.model_selection import cross_val_score 

scores_knn = cross_val_score(knn,X_train,y_train,cv=10,scoring='accuracy')
scores_tree = cross_val_score(tree,X_train,y_train,cv=10,scoring='accuracy')
scores_forest = cross_val_score(forest,X_train,y_train,cv=10,scoring='accuracy')

cross_val = pd.DataFrame({
        'Model': ['KNN Classifier','Decision Tree','Random Forest'],
        'Score': [scores_knn,scores_tree,scores_forest],
        'Mean': [scores_knn.mean(),scores_tree.mean(),scores_forest.mean()],
        'Standard Deviation': [scores_knn.std(),scores_tree.std(),scores_forest.std()]})

cross_val_df = cross_val.sort_values(by='Mean',ascending=False)
cross_val_df = cross_val_df.set_index('Mean')

cross_val_df.head()

Unnamed: 0_level_0,Model,Score,Standard Deviation
Mean,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.634688,Random Forest,"[0.6132596685082873, 0.6306235201262825, 0.643...",0.011086
0.631018,Decision Tree,"[0.6108918705603789, 0.6183898973954223, 0.634...",0.011347
0.61764,KNN Classifier,"[0.6215469613259669, 0.6207576953433307, 0.627...",0.009958


In [None]:
#Hyperparameter tuning to improve random forest's performance

from sklearn.model_selection import GridSearchCV

param_grid = { "criterion" : ["gini", "entropy"],
              "min_samples_leaf" : [1, 5, 10, 25, 50, 70],
              "min_samples_split" : [2, 4, 10, 12, 16, 18, 25, 35],
              "n_estimators": [100, 400, 700, 1000, 1500]}

rf = RandomForestClassifier(n_estimators=100, max_features='auto', oob_score=True, random_state=1, n_jobs=-1)

clf = GridSearchCV(estimator=rf, param_grid=param_grid, n_jobs=-1)
clf.fit(X_train, y_train)

clf.best_params_

In [56]:
# Test hyperparameter
random_forest = RandomForestClassifier(criterion = "gini", 
                                       min_samples_leaf = 25, 
                                       min_samples_split = 2,   
                                       n_estimators=100, 
                                       max_features='auto', 
                                       oob_score=True, 
                                       random_state=0, 
                                       n_jobs=-1)

random_forest.fit(X_train, y_train)
y_prediction = random_forest.predict(X_test)

random_forest.score(X_train, y_train)

print("oob score:", round(random_forest.oob_score_, 4)*100, "%")

oob score: 79.56 %


In [57]:
print(y_prediction)

[[0 0]
 [0 0]
 [1 1]
 ...
 [0 0]
 [0 0]
 [0 1]]


In [58]:
y_pred = random_forest.predict_proba(X_test)
print(y_pred)

[array([[0.88784059, 0.11215941],
       [0.95165074, 0.04834926],
       [0.45083343, 0.54916657],
       ...,
       [0.88575735, 0.11424265],
       [0.90852755, 0.09147245],
       [0.5508245 , 0.4491755 ]]), array([[0.84735157, 0.15264843],
       [0.92507111, 0.07492889],
       [0.21823964, 0.78176036],
       ...,
       [0.77015074, 0.22984926],
       [0.63501358, 0.36498642],
       [0.34751439, 0.65248561]])]


In [60]:
print(np.array(y_pred).shape)

(2, 26708, 2)


In [61]:
y_pred_2d = np.array(y_pred).transpose(2,0,1).reshape(-1,np.array(y_pred).shape[1])

print(y_pred_2d)

[[0.88784059 0.95165074 0.45083343 ... 0.88575735 0.90852755 0.5508245 ]
 [0.84735157 0.92507111 0.21823964 ... 0.77015074 0.63501358 0.34751439]
 [0.11215941 0.04834926 0.54916657 ... 0.11424265 0.09147245 0.4491755 ]
 [0.15264843 0.07492889 0.78176036 ... 0.22984926 0.36498642 0.65248561]]


In [62]:
y_pred_2d = y_pred_2d.T

In [63]:
print(y_pred_2d.shape)

(26708, 4)


In [64]:
np.savetxt("pred.csv",  
           y_prediction, 
           delimiter =", ",  
           fmt ='% s') 

np.savetxt("pred_proba.csv",  
           y_pred_2d, 
           delimiter =", ",  
           fmt ='% s') 