In [None]:
# Improve Data Quality: 
#   - Address inaccuracies, inconsistencies, and missing values.
# Prepare Data for Analysis: 
#   - Transform data into a suitable format for modeling and visualization.
# Reduce Noise: 
#   - Minimize the impact of irrelevant or redundant information.
# Enhance Model Performance: 
#   - Improve the accuracy, efficiency, and generalization ability of machine learning models.

In [3]:
## Data Inspection & Understanding

In [17]:
import pandas as pd
import numpy as np
import os

from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

In [45]:
# Import the raw data of training features and labels
data_features = pd.read_csv("/Users/jackie/Documents/data_science/projects/predict_h1n1_and_seasonal_flu_vaccines/data/raw/Flu_Shot_Learning_Predict_H1N1_and_Seasonal_Flu_Vaccines_-_Training_Features.csv")
data_labels = pd.read_csv("/Users/jackie/Documents/data_science/projects/predict_h1n1_and_seasonal_flu_vaccines/data/raw/Flu_Shot_Learning_Predict_H1N1_and_Seasonal_Flu_Vaccines_-_Training_Labels.csv")

In [13]:
# According to the foundings based on EDA
# For columns having siginifiant count of missing values, a new category of NA will be imputed
columns_with_new_category = ['health_insurance', 'income_poverty', 'employment_industry', 'employment_occupation']

In [46]:
# Before fillna()
data_features[columns_with_new_category].isnull().sum()

health_insurance         12274
income_poverty            4423
employment_industry      13330
employment_occupation    13470
dtype: int64

In [47]:
# After fillna()
data_features[columns_with_new_category] = data_features[columns_with_new_category].fillna('NA')

data_features[columns_with_new_category].isnull().sum()

health_insurance         0
income_poverty           0
employment_industry      0
employment_occupation    0
dtype: int64

In [48]:
# Each categorical column, find the most frequent category, do fillna() with that category
categorical_col = data_features.select_dtypes(include='object').columns
data_features_cat_gp = data_features[categorical_col].describe().reset_index()

for col in categorical_col:
    data_features[col] = data_features[col].fillna(data_features_cat_gp.loc[2, col])

In [49]:
# After handling the missing values of categorical columns
data_features[categorical_col].isnull().sum()

health_insurance         0
age_group                0
education                0
race                     0
sex                      0
income_poverty           0
marital_status           0
rent_or_own              0
employment_status        0
hhs_geo_region           0
census_msa               0
employment_industry      0
employment_occupation    0
dtype: int64

In [50]:
data_features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26707 entries, 0 to 26706
Data columns (total 36 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   respondent_id                26707 non-null  int64  
 1   h1n1_concern                 26615 non-null  float64
 2   h1n1_knowledge               26591 non-null  float64
 3   behavioral_antiviral_meds    26636 non-null  float64
 4   behavioral_avoidance         26499 non-null  float64
 5   behavioral_face_mask         26688 non-null  float64
 6   behavioral_wash_hands        26665 non-null  float64
 7   behavioral_large_gatherings  26620 non-null  float64
 8   behavioral_outside_home      26625 non-null  float64
 9   behavioral_touch_face        26579 non-null  float64
 10  doctor_recc_h1n1             24547 non-null  float64
 11  doctor_recc_seasonal         24547 non-null  float64
 12  chronic_med_condition        25736 non-null  float64
 13  child_under_6_mo

In [35]:
# Encoding categorical data
# Encoding the Independent Variable
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [0])], remainder='passthrough')
X = np.array(ct.fit_transform(X))

health_insurance         0
age_group                0
education                0
race                     0
sex                      0
income_poverty           0
marital_status           0
rent_or_own              0
employment_status        0
hhs_geo_region           0
census_msa               0
employment_industry      0
employment_occupation    0
dtype: int64

In [43]:
# Taking care of missing data
imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
imputer.fit(data_features)
data_features = imputer.transform(data_features)

In [44]:
data_features.isnull().sum()

AttributeError: 'numpy.ndarray' object has no attribute 'isnull'

In [37]:
# Convert pandas dataframe into numpy array and exclude the column 'respondent_id'
X = data_features.iloc[:, 1:].values
y = data_labels.iloc[:, 1:].values

In [38]:
X.shape

(26707, 35)

In [41]:
print(X)

[[1.0 0.0 0.0 ... 0.0 'NA' 'NA']
 [3.0 2.0 0.0 ... 0.0 'pxcmvdjn' 'xgwztkwe']
 [1.0 1.0 0.0 ... 0.0 'rucpziij' 'xtkaffoo']
 ...
 [2.0 2.0 0.0 ... 0.0 'NA' 'NA']
 [1.0 1.0 0.0 ... 0.0 'fcxhlnwr' 'haliazsg']
 [0.0 0.0 0.0 ... 0.0 'NA' 'NA']]


In [42]:
X.shape

(26707, 35)