## Pre-Processing Functions ##

In [27]:
def data_cleaning(df):
    #deletes columns with more than 40% null values
    #uses the most frequent imputation strategy for numeric and
    #string data
    i = 0
    while (i<len(df.columns)):
        current_col = df.iloc[:, i]
        if ((current_col.isnull().sum()/26708) > 0.4): 
            df.drop(df.columns[[i]], axis=1, inplace=True)
        if (current_col.dtypes == 'float64'):
            mode = current_col.mode()[0]
            current_col.fillna(mode, inplace=True)
        if (current_col.dtypes == 'object'):
            mode = current_col.mode()[0]
            current_col.fillna(mode, inplace=True)
        i = i + 1
    return df

In [28]:
def label_encoding(df):
        age_map = {
        '18 - 34 Years':1,
        '35 - 44 Years':2,
        '45 - 54 Years':3,
        '55 - 64 Years':4,
        '65+ Years':5
        }

        education_map = {
            '< 12 Years':1,
            '12 Years':2,
            'Some College':3,
            'College Graduate':4
        }

        income_map = {
            'Below Poverty':1,
            '<= $75,000, Above Poverty':2,
            '> $75,000':3
        }

        df['age_group_numerical'] = df.age_group.map(age_map)
        df['education_numerical'] = df.education.map(education_map)
        df['income_poverty_numerical'] = df.income_poverty.map(income_map)
        return df.drop(columns=['age_group','education','income_poverty'])

In [None]:
#This function takes in a dataframe (df),
#then calls the label_encoding function on it,
#then calls the get_dummies function on it,
#and returns an updated dataframe (df2).
def data_preprocessing(df):
    df2 = pd.get_dummies(label_encoding(df), prefix_sep='_', drop_first=True)
    return df2

### Importing and Preprocessing the Data ###

In [17]:
import pandas as pd

#Training features
X_train = pd.read_csv(
    "training_set_features.csv"
)
#Testing features
X_test = pd.read_csv(
    "test_set_features.csv"
)
#Training labels
y_train = pd.read_csv(
    "training_set_labels.csv"
)
#Submission format
submission_format = pd.read_csv(
    "submission_format.csv"
)

In [None]:
preprocessed_X_train = data_preprocessing(X_train)
preprocessed_X_test = data_preprocessing(X_test)

In [None]:
### XGBoost Regression ###

In [18]:
import xgboost as xgb
from xgboost import XGBRegressor
model = XGBRegressor(max_depth = 3,verbosity = 0)

In [19]:
model.fit(preprocessed_X_train, y_train[['h1n1_vaccine']])

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.300000012, max_delta_step=0, max_depth=3,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=4, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=0)

In [20]:
h1n1_pred = model.predict(preprocessed_X_test)

In [21]:
submission_format["h1n1_vaccine"] = h1n1_pred

In [22]:
model.fit(preprocessed_X_train, y_train[['seasonal_vaccine']])

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.300000012, max_delta_step=0, max_depth=3,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=4, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=0)

In [23]:
seas_pred = model.predict(preprocessed_X_test)

In [24]:
submission_format["seasonal_vaccine"] = seas_pred

### Final Submission ###

In [25]:
final_submission = submission_format.set_index('respondent_id')
final_submission

Unnamed: 0_level_0,h1n1_vaccine,seasonal_vaccine
respondent_id,Unnamed: 1_level_1,Unnamed: 2_level_1
26707,-0.030461,0.359172
26708,-0.057395,-0.321879
26709,0.113549,0.653209
26710,0.629822,0.447161
26711,0.252050,0.540102
...,...,...
53410,0.072542,0.499591
53411,0.088375,0.423394
53412,0.093270,0.066458
53413,-0.151954,-0.011762


In [26]:
final_submission.to_csv('final_submission.csv', index=True)