# EDA notebook - Cleaning

In [1]:
# import necessary libraries
import pandas as pd
import numpy as np

In [2]:
# Load the raw survey tables; column 0 of each CSV becomes the row index.
# target
vac = pd.read_csv("../data/training_set_labels.csv", index_col=0)
# predictors
data = pd.read_csv("../data/training_set_features.csv", index_col=0)

In [3]:
# Attach the H1N1 label to the predictors by matching on the row index.
# Only index values present in both tables are kept (inner join).
all_data = pd.merge(
    data,
    vac['h1n1_vaccine'],
    how='inner',
    left_index=True,
    right_index=True,
)

In [4]:
# Preview the first five rows of the merged frame.
all_data.head(n=5)

Unnamed: 0_level_0,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,doctor_recc_h1n1,...,marital_status,rent_or_own,employment_status,hhs_geo_region,census_msa,household_adults,household_children,employment_industry,employment_occupation,h1n1_vaccine
respondent_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,Not Married,Own,Not in Labor Force,oxchjgsf,Non-MSA,0.0,0.0,,,0
1,3.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,...,Not Married,Rent,Employed,bhuqouqj,"MSA, Not Principle City",0.0,0.0,pxcmvdjn,xgwztkwe,0
2,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,,...,Not Married,Own,Employed,qufhixun,"MSA, Not Principle City",2.0,0.0,rucpziij,xtkaffoo,0
3,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,...,Not Married,Rent,Not in Labor Force,lrircsnp,"MSA, Principle City",0.0,0.0,,,0
4,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,...,Married,Own,Employed,qufhixun,"MSA, Not Principle City",1.0,0.0,wxleyezf,emcorrxb,0


In [5]:
# Print each column's dtype and non-null count for the merged frame.
all_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 26707 entries, 0 to 26706
Data columns (total 36 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   h1n1_concern                 26615 non-null  float64
 1   h1n1_knowledge               26591 non-null  float64
 2   behavioral_antiviral_meds    26636 non-null  float64
 3   behavioral_avoidance         26499 non-null  float64
 4   behavioral_face_mask         26688 non-null  float64
 5   behavioral_wash_hands        26665 non-null  float64
 6   behavioral_large_gatherings  26620 non-null  float64
 7   behavioral_outside_home      26625 non-null  float64
 8   behavioral_touch_face        26579 non-null  float64
 9   doctor_recc_h1n1             24547 non-null  float64
 10  doctor_recc_seasonal         24547 non-null  float64
 11  chronic_med_condition        25736 non-null  float64
 12  child_under_6_months         25887 non-null  float64
 13  health_worker   

In [6]:
# remove columns related to seasonal flu
# The drop is reassigned instead of done in place, and errors="ignore" is set,
# so the cell can be re-run: a second execution no longer raises KeyError for
# columns that were already removed.
# NOTE: errors="ignore" also skips misspelled names silently, so keep this
# list in sync with the feature file.
seasonal_columns = [
    'doctor_recc_seasonal',
    'opinion_seas_vacc_effective',
    'opinion_seas_risk',
    'opinion_seas_sick_from_vacc',
]
all_data = all_data.drop(columns=seasonal_columns, errors="ignore")

## Handling Missing Values
The output below lists every column together with its number of missing values.

In [7]:
# Count the missing entries in each column.
all_data.isnull().sum()

h1n1_concern                      92
h1n1_knowledge                   116
behavioral_antiviral_meds         71
behavioral_avoidance             208
behavioral_face_mask              19
behavioral_wash_hands             42
behavioral_large_gatherings       87
behavioral_outside_home           82
behavioral_touch_face            128
doctor_recc_h1n1                2160
chronic_med_condition            971
child_under_6_months             820
health_worker                    804
health_insurance               12274
opinion_h1n1_vacc_effective      391
opinion_h1n1_risk                388
opinion_h1n1_sick_from_vacc      395
age_group                          0
education                       1407
race                               0
sex                                0
income_poverty                  4423
marital_status                  1408
rent_or_own                     2042
employment_status               1463
hhs_geo_region                     0
census_msa                         0
h

Data quality was checked for every variable; only a few of them are displayed below.

In [8]:
# Frequency of each age group.
all_data['age_group'].value_counts()

65+ Years        6843
55 - 64 Years    5563
45 - 54 Years    5238
18 - 34 Years    5215
35 - 44 Years    3848
Name: age_group, dtype: int64

In [9]:
# Frequency of each sex category.
all_data['sex'].value_counts()

Female    15858
Male      10849
Name: sex, dtype: int64

In [10]:
# Frequency of each race category.
all_data['race'].value_counts()

White                21222
Black                 2118
Hispanic              1755
Other or Multiple     1612
Name: race, dtype: int64

In [11]:
# Work on an independent (deep) copy so that `all_data` keeps its original
# missing values while the gaps are replaced in the copy.
all_data_no_missing = all_data.copy(deep=True)

Every column with missing values is assigned a new, dedicated value. The value that replaces a missing entry usually means "does not know" or "prefer not to answer". For example, it may not be known whether the respondent has health insurance. For the question about household size, a household might have 4 or more people while the available options only go up to 3.

In [12]:
# Replace each missing answer with an explicit "missing" category instead of
# guessing a real answer.
#
# The fill is one DataFrame-level call on purpose. The earlier form,
# `frame.column.fillna(value, inplace=True)`, modifies a column reached through
# the frame (chained assignment). pandas warns about that pattern, and with
# Copy-on-Write enabled it leaves the frame unchanged.
# Re-running this cell is harmless: a second fill finds nothing to replace.
missing_value_codes = {
    # Numeric survey codes. NOTE(review): the sentinel appears to be one above
    # the highest code observed in each column -- confirm against the codebook.
    'h1n1_concern': 4,
    'h1n1_knowledge': 3,
    'behavioral_antiviral_meds': 2,
    'behavioral_avoidance': 2,
    'behavioral_face_mask': 2,
    'behavioral_wash_hands': 2,
    'behavioral_large_gatherings': 2,
    'behavioral_outside_home': 2,
    'behavioral_touch_face': 2,
    'doctor_recc_h1n1': 2,
    'chronic_med_condition': 2,
    'child_under_6_months': 2,
    'health_worker': 2,
    'health_insurance': 2,
    # Text-valued columns get a literal 'N/A' category.
    'education': 'N/A',
    'income_poverty': 'N/A',
    'marital_status': 'N/A',
    'rent_or_own': 'N/A',
    'employment_status': 'N/A',
    # Household counts use 4 as the sentinel.
    'household_adults': 4,
    'household_children': 4,
    'employment_industry': 'N/A',
    'employment_occupation': 'N/A',
}
all_data_no_missing = all_data_no_missing.fillna(value=missing_value_codes)

The three questions asking for opinions about the vaccine are a special case. The risk of guessing the missing value does not seem high because the scale already has a "don't know" option. The code below replaces the missing values in these three questions with 3, "don't know".

In [13]:
# Missing opinion answers are filled with 3.
# The fill is a single DataFrame-level call because
# `frame.column.fillna(value, inplace=True)` is chained assignment: pandas
# warns about it, and with Copy-on-Write enabled it leaves the frame unchanged.
opinion_columns = [
    'opinion_h1n1_vacc_effective',
    'opinion_h1n1_risk',
    'opinion_h1n1_sick_from_vacc',
]
all_data_no_missing = all_data_no_missing.fillna(
    value=dict.fromkeys(opinion_columns, 3)
)

In [14]:
# Check how many missing entries remain in each column after the fills.
all_data_no_missing.isnull().sum()

h1n1_concern                   0
h1n1_knowledge                 0
behavioral_antiviral_meds      0
behavioral_avoidance           0
behavioral_face_mask           0
behavioral_wash_hands          0
behavioral_large_gatherings    0
behavioral_outside_home        0
behavioral_touch_face          0
doctor_recc_h1n1               0
chronic_med_condition          0
child_under_6_months           0
health_worker                  0
health_insurance               0
opinion_h1n1_vacc_effective    0
opinion_h1n1_risk              0
opinion_h1n1_sick_from_vacc    0
age_group                      0
education                      0
race                           0
sex                            0
income_poverty                 0
marital_status                 0
rent_or_own                    0
employment_status              0
hhs_geo_region                 0
census_msa                     0
household_adults               0
household_children             0
employment_industry            0
employment

### Creating Dummies
This dataset contains only categorical data.

In [15]:
# Split off the target: keep the label column as `y`, then delete it from the
# predictor frame (this changes `all_data_no_missing` in place).
y = all_data_no_missing['h1n1_vaccine']
del all_data_no_missing['h1n1_vaccine']

In [16]:
# Predictors start as an independent (deep) copy of the cleaned frame.
X = all_data_no_missing.copy(deep=True)

In [17]:
def dummy(data, col):
    """Replace one column of ``data`` with its dummy (indicator) columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame that contains the column to encode. It is not modified.
    col : str
        Name of the column to encode. It also becomes the prefix of the new
        columns, which are named ``<col>_<value>``.

    Returns
    -------
    pandas.DataFrame
        A new frame with ``col`` removed and its indicator columns appended
        after the remaining columns.
    """
    indicators = pd.get_dummies(data[col], prefix=col)
    remaining = data.drop(columns=col)
    return pd.concat([remaining, indicators], axis=1)

In [18]:
# One-hot encode every predictor column in a single call.
# With `columns=` given, pd.get_dummies prefixes each new column with the name
# of the column it came from, so the result has the same names and the same
# column order as encoding the columns one at a time.
# X is rebuilt from `all_data_no_missing` (the frame X was copied from) rather
# than from X itself, so re-running this cell does not encode the dummy
# columns a second time.
X = pd.get_dummies(
    all_data_no_missing,
    columns=list(all_data_no_missing.columns),
)

In [19]:
# List every generated dummy column name, one per line.
for c in X.columns.tolist():
    print(c)

h1n1_concern_0.0
h1n1_concern_1.0
h1n1_concern_2.0
h1n1_concern_3.0
h1n1_concern_4.0
h1n1_knowledge_0.0
h1n1_knowledge_1.0
h1n1_knowledge_2.0
h1n1_knowledge_3.0
behavioral_antiviral_meds_0.0
behavioral_antiviral_meds_1.0
behavioral_antiviral_meds_2.0
behavioral_avoidance_0.0
behavioral_avoidance_1.0
behavioral_avoidance_2.0
behavioral_face_mask_0.0
behavioral_face_mask_1.0
behavioral_face_mask_2.0
behavioral_wash_hands_0.0
behavioral_wash_hands_1.0
behavioral_wash_hands_2.0
behavioral_large_gatherings_0.0
behavioral_large_gatherings_1.0
behavioral_large_gatherings_2.0
behavioral_outside_home_0.0
behavioral_outside_home_1.0
behavioral_outside_home_2.0
behavioral_touch_face_0.0
behavioral_touch_face_1.0
behavioral_touch_face_2.0
doctor_recc_h1n1_0.0
doctor_recc_h1n1_1.0
doctor_recc_h1n1_2.0
chronic_med_condition_0.0
chronic_med_condition_1.0
chronic_med_condition_2.0
child_under_6_months_0.0
child_under_6_months_1.0
child_under_6_months_2.0
health_worker_0.0
health_worker_1.0
health_work