In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import confusion_matrix, plot_confusion_matrix,\
    precision_score, recall_score, accuracy_score, f1_score, log_loss,\
    roc_curve, roc_auc_score, classification_report
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder, MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import seaborn as sns
from matplotlib import pyplot as plt

In [2]:
X_init = pd.read_csv('Data/training_set_features.csv', index_col=0)
y_init = pd.read_csv('Data/training_set_labels.csv', index_col=0)

In [3]:
X_init.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 26707 entries, 0 to 26706
Data columns (total 35 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   h1n1_concern                 26615 non-null  float64
 1   h1n1_knowledge               26591 non-null  float64
 2   behavioral_antiviral_meds    26636 non-null  float64
 3   behavioral_avoidance         26499 non-null  float64
 4   behavioral_face_mask         26688 non-null  float64
 5   behavioral_wash_hands        26665 non-null  float64
 6   behavioral_large_gatherings  26620 non-null  float64
 7   behavioral_outside_home      26625 non-null  float64
 8   behavioral_touch_face        26579 non-null  float64
 9   doctor_recc_h1n1             24547 non-null  float64
 10  doctor_recc_seasonal         24547 non-null  float64
 11  chronic_med_condition        25736 non-null  float64
 12  child_under_6_months         25887 non-null  float64
 13  health_worker   

In [4]:
(X_init.isna().sum()/len(X_init)).sort_values(ascending=False)

employment_occupation          0.504362
employment_industry            0.499120
health_insurance               0.459580
income_poverty                 0.165612
doctor_recc_h1n1               0.080878
doctor_recc_seasonal           0.080878
rent_or_own                    0.076459
employment_status              0.054780
marital_status                 0.052720
education                      0.052683
chronic_med_condition          0.036358
child_under_6_months           0.030704
health_worker                  0.030104
opinion_seas_sick_from_vacc    0.020107
opinion_seas_risk              0.019246
opinion_seas_vacc_effective    0.017299
opinion_h1n1_sick_from_vacc    0.014790
opinion_h1n1_vacc_effective    0.014640
opinion_h1n1_risk              0.014528
household_children             0.009323
household_adults               0.009323
behavioral_avoidance           0.007788
behavioral_touch_face          0.004793
h1n1_knowledge                 0.004343
h1n1_concern                   0.003445


In [5]:
X_drop = X_init.drop(['employment_occupation', 'employment_industry', 'health_insurance'], axis=1)

In [6]:
X_drop.corrwith(y_init['h1n1_vaccine']).sort_values(ascending=False)

doctor_recc_h1n1               0.393890
opinion_h1n1_risk              0.323265
opinion_h1n1_vacc_effective    0.269347
opinion_seas_risk              0.258571
doctor_recc_seasonal           0.209864
opinion_seas_vacc_effective    0.179272
health_worker                  0.169768
h1n1_concern                   0.121929
h1n1_knowledge                 0.117951
chronic_med_condition          0.095207
opinion_h1n1_sick_from_vacc    0.075091
behavioral_wash_hands          0.074712
behavioral_touch_face          0.071648
behavioral_face_mask           0.070498
child_under_6_months           0.066962
behavioral_avoidance           0.047690
behavioral_antiviral_meds      0.040608
behavioral_outside_home        0.021768
behavioral_large_gatherings    0.017822
opinion_seas_sick_from_vacc    0.008360
household_adults               0.007545
household_children            -0.003320
dtype: float64

In [7]:
X_drop.corrwith(y_init['seasonal_vaccine']).sort_values(ascending=False)

opinion_seas_risk              0.390106
doctor_recc_seasonal           0.369190
opinion_seas_vacc_effective    0.361875
opinion_h1n1_risk              0.216625
opinion_h1n1_vacc_effective    0.205072
doctor_recc_h1n1               0.198607
chronic_med_condition          0.170174
h1n1_concern                   0.154828
health_worker                  0.127311
behavioral_touch_face          0.120228
h1n1_knowledge                 0.120152
behavioral_wash_hands          0.112414
behavioral_avoidance           0.076395
behavioral_large_gatherings    0.064025
behavioral_outside_home        0.053509
behavioral_face_mask           0.050083
opinion_h1n1_sick_from_vacc    0.027404
child_under_6_months           0.012097
behavioral_antiviral_meds      0.006277
opinion_seas_sick_from_vacc   -0.061510
household_adults              -0.064840
household_children            -0.114614
dtype: float64

In [8]:
y_init.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 26707 entries, 0 to 26706
Data columns (total 2 columns):
 #   Column            Non-Null Count  Dtype
---  ------            --------------  -----
 0   h1n1_vaccine      26707 non-null  int64
 1   seasonal_vaccine  26707 non-null  int64
dtypes: int64(2)
memory usage: 625.9 KB


In [9]:
y_init['h1n1_vaccine'].value_counts(normalize=True)

0    0.787546
1    0.212454
Name: h1n1_vaccine, dtype: float64

In [10]:
y_init['seasonal_vaccine'].value_counts(normalize=True)

0    0.534392
1    0.465608
Name: seasonal_vaccine, dtype: float64

In [11]:
y_drop = y_init.drop('h1n1_vaccine', axis=1)

In [12]:
y_drop

Unnamed: 0_level_0,seasonal_vaccine
respondent_id,Unnamed: 1_level_1
0,0
1,1
2,0
3,1
4,0
...,...
26702,0
26703,0
26704,1
26705,0


In [13]:
X_drop.corrwith(y_drop['seasonal_vaccine']).sort_values(ascending=False)

opinion_seas_risk              0.390106
doctor_recc_seasonal           0.369190
opinion_seas_vacc_effective    0.361875
opinion_h1n1_risk              0.216625
opinion_h1n1_vacc_effective    0.205072
doctor_recc_h1n1               0.198607
chronic_med_condition          0.170174
h1n1_concern                   0.154828
health_worker                  0.127311
behavioral_touch_face          0.120228
h1n1_knowledge                 0.120152
behavioral_wash_hands          0.112414
behavioral_avoidance           0.076395
behavioral_large_gatherings    0.064025
behavioral_outside_home        0.053509
behavioral_face_mask           0.050083
opinion_h1n1_sick_from_vacc    0.027404
child_under_6_months           0.012097
behavioral_antiviral_meds      0.006277
opinion_seas_sick_from_vacc   -0.061510
household_adults              -0.064840
household_children            -0.114614
dtype: float64

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X_drop, y_drop, random_state=50)

In [15]:
X_train

Unnamed: 0_level_0,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,doctor_recc_h1n1,...,race,sex,income_poverty,marital_status,rent_or_own,employment_status,hhs_geo_region,census_msa,household_adults,household_children
respondent_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
16087,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,...,White,Female,Below Poverty,Not Married,Rent,Not in Labor Force,oxchjgsf,"MSA, Principle City",2.0,0.0
9261,2.0,2.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,...,White,Male,"> $75,000",Married,Own,Employed,bhuqouqj,Non-MSA,1.0,0.0
16193,2.0,2.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,...,White,Female,"<= $75,000, Above Poverty",Not Married,Rent,Employed,bhuqouqj,"MSA, Not Principle City",0.0,2.0
18216,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,White,Female,"<= $75,000, Above Poverty",Not Married,Rent,Employed,dqpwygqj,"MSA, Principle City",0.0,0.0
9161,2.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,...,White,Male,"<= $75,000, Above Poverty",Married,Own,Employed,qufhixun,"MSA, Not Principle City",1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15649,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,,...,White,Male,"<= $75,000, Above Poverty",Married,Own,Employed,kbazzjca,Non-MSA,1.0,2.0
22637,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,White,Male,,,,,kbazzjca,Non-MSA,2.0,3.0
10123,3.0,2.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,...,White,Female,"> $75,000",Married,Own,Not in Labor Force,oxchjgsf,"MSA, Not Principle City",1.0,0.0
5600,2.0,2.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,...,White,Male,"> $75,000",Married,Own,Employed,lzgpxyit,Non-MSA,1.0,0.0


In [16]:
X_train_nums = X_train.select_dtypes(include=pd.Float64Dtype)
X_train_cat = X_train.select_dtypes(include=object) 

In [17]:
X_train_nums.isna().sum().sort_values(ascending=False)

doctor_recc_h1n1               1578
doctor_recc_seasonal           1578
chronic_med_condition           733
child_under_6_months            618
health_worker                   606
opinion_seas_sick_from_vacc     419
opinion_seas_risk               394
opinion_seas_vacc_effective     352
opinion_h1n1_sick_from_vacc     303
opinion_h1n1_vacc_effective     302
opinion_h1n1_risk               296
household_adults                185
household_children              185
behavioral_avoidance            160
behavioral_touch_face            95
h1n1_knowledge                   90
h1n1_concern                     72
behavioral_outside_home          66
behavioral_large_gatherings      64
behavioral_antiviral_meds        58
behavioral_wash_hands            31
behavioral_face_mask             17
dtype: int64

In [18]:
numerical_pipeline = Pipeline(steps=[
    ('impute_nums', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])
#describe why median was chose
categorical_pipeline = Pipeline(steps=[
    ('impute_cat', SimpleImputer(strategy='most_frequent')),
    ('encode', OrdinalEncoder())])

trans = ColumnTransformer(transformers=[
    ('numerical', numerical_pipeline, X_train_nums.columns),
    ('categorical', categorical_pipeline, X_train_cat.columns)
])

In [19]:
simple_model_pipe = Pipeline(steps=[
    ('trans', trans),
    ('tree', DecisionTreeClassifier(max_depth=2, random_state=50))
    ])

In [20]:
simple_model_pipe.fit(X_train, y_train)

Pipeline(steps=[('trans',
                 ColumnTransformer(transformers=[('numerical',
                                                  Pipeline(steps=[('impute_nums',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  Index(['h1n1_concern', 'h1n1_knowledge', 'behavioral_antiviral_meds',
       'behavioral_avoidance', 'behavioral_face_mask', 'behavioral_wash_hands',
       'behavioral_large_gatherings', 'behavioral_outsid...
      dtype='object')),
                                                 ('categorical',
                                                  Pipeline(steps=[('impute_cat',
                                                                   SimpleImputer(strategy='most_frequent')),
                       

In [21]:
simple_model_pipe.score(X_train, y_train)

0.7220169745381927

In [22]:
simple_model_pipe.score(X_test, y_test)

0.7158903699266137

In [23]:
simple_preds = simple_model_pipe.predict(X_test)
print(f'This is our f1 score: {f1_score(y_test, simple_preds)}')
print(f'This is our roc-auc score: {roc_auc_score(y_test, simple_preds)}')
print(f'This is our precision score: {precision_score(y_test, simple_preds)}')
print(f'This is our recall score: {recall_score(y_test, simple_preds)}')

This is our f1 score: 0.7136603773584906
This is our roc-auc score: 0.7176303623769451
This is our precision score: 0.6812680115273775
This is our recall score: 0.7492868462757528


In [24]:
simple_importance = simple_model_pipe.named_steps['tree'].feature_importances_
pd.Series(simple_importance, index=X_train.columns).sort_values(ascending=False)

opinion_seas_vacc_effective    0.591166
doctor_recc_seasonal           0.408834
h1n1_concern                   0.000000
household_adults               0.000000
census_msa                     0.000000
hhs_geo_region                 0.000000
employment_status              0.000000
rent_or_own                    0.000000
marital_status                 0.000000
income_poverty                 0.000000
sex                            0.000000
race                           0.000000
education                      0.000000
age_group                      0.000000
opinion_seas_sick_from_vacc    0.000000
opinion_seas_risk              0.000000
opinion_h1n1_sick_from_vacc    0.000000
h1n1_knowledge                 0.000000
opinion_h1n1_risk              0.000000
opinion_h1n1_vacc_effective    0.000000
health_worker                  0.000000
child_under_6_months           0.000000
chronic_med_condition          0.000000
doctor_recc_h1n1               0.000000
behavioral_touch_face          0.000000


In [25]:
trans2 = ColumnTransformer(transformers=[
    ('numerical', numerical_pipeline, X_train[['opinion_seas_vacc_effective', 'doctor_recc_seasonal']].columns),
    ])
log_model_pipe = Pipeline(steps=[
    ('trans', trans2), ## need to drop out unused columns from the pipeline
    ('log', LogisticRegression(random_state=50))
    ])

In [27]:
log_model_pipe.fit(X_train[['opinion_seas_vacc_effective', 'doctor_recc_seasonal']], y_train)

  y = column_or_1d(y, warn=True)


Pipeline(steps=[('trans',
                 ColumnTransformer(transformers=[('numerical',
                                                  Pipeline(steps=[('impute_nums',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  Index(['opinion_seas_vacc_effective', 'doctor_recc_seasonal'], dtype='object'))])),
                ('log', LogisticRegression(random_state=50))])

In [28]:
log_model_pipe.score(X_test[['opinion_seas_vacc_effective', 'doctor_recc_seasonal']], y_test)

0.7259248165343717

In [29]:
simplog_preds = log_model_pipe.predict(X_test[['opinion_seas_vacc_effective', 'doctor_recc_seasonal']])

In [30]:
print(f'This is our f1 score: {f1_score(y_test, simplog_preds)}')
print(f'This is our roc-auc score: {roc_auc_score(y_test, simplog_preds)}')
print(f'This is our precision score: {precision_score(y_test, simplog_preds)}')
print(f'This is our recall score: {recall_score(y_test, simplog_preds)}')

This is our f1 score: 0.7166305357695881
This is our roc-auc score: 0.7263163128571055
This is our precision score: 0.7005752346351801
This is our recall score: 0.7334389857369256


In [31]:
log_model_pipe.named_steps['log'].coef_


array([[0.85995859, 0.73711464]])

In [32]:
log_model_pipe.named_steps['log'].intercept_

array([-0.20514959])

In [40]:
a = log_model_pipe.named_steps['log'].coef_
a

array([[0.85995859, 0.73711464]])

In [41]:
np.exp(0.85995859) 

2.3630628372476004

In [44]:
def logOddsConverter(data):
    odds = []
    for item in data:
        odds.append(np.exp(item))
    return odds

In [45]:
logOddsConverter(a)

[array([2.36306284, 2.08989671])]