# THIS NOTEBOOK CONTAINS THE PROJECT CODE: LOGISTIC REGRESSION MODELS FOR THE XYZ AND SEASONAL VACCINE LABELS

## GENERAL IMPORTS

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# Plot styling: seaborn's 'darkgrid' first, then matplotlib's 'dark_background' style sheet.
# NOTE(review): the second call is applied after the first and may override overlapping settings.
sns.set_style('darkgrid')
plt.style.use('dark_background')

In [4]:
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

# READING DATA

In [6]:
# Columns removed from both feature tables before any encoding takes place.
to_drop = ['census_msa', 'employment_industry', 'employment_occupation', 'rent_or_own', 'behavioral_outside_home']

# `df` holds the features the models are fitted on; `target` holds the rows that are scored at the end.
df = pd.read_csv('Dataset/training_set_features.csv').drop(columns=to_drop)
target = pd.read_csv('Dataset/test_set_features.csv').drop(columns=to_drop)

### FEATURE ENGINEERING

In [8]:
# INTEGER CODES FOR THE STRING-VALUED CATEGORY COLUMNS
# Every map sends a category label to a small positive integer.
# Labels are written in ascending code order, so the first label gets 1.
def _codes_from_one(labels):
    """Return a dict mapping each label to its 1-based position in `labels`."""
    return {label: position for position, label in enumerate(labels, start=1)}


edu_map = _codes_from_one([
    "< 12 Years",
    "12 Years",
    "Some College",
    "College Graduate",
])

age_map = _codes_from_one([
    "18 - 34 Years",
    "35 - 44 Years",
    "45 - 54 Years",
    "55 - 64 Years",
    "65+ Years",
])

race_map = _codes_from_one([
    "Other or Multiple",
    "Hispanic",
    "Black",
    "White",
])

# Note the code order here: "> $75,000" is 2 and "<= $75,000, Above Poverty" is 3.
inc_map = _codes_from_one([
    "Below Poverty",
    "> $75,000",
    "<= $75,000, Above Poverty",
])

emp_map = _codes_from_one([
    "Unemployed",
    "Not in Labor Force",
    "Employed",
])

# Region identifiers are opaque strings; the codes simply number them 1..10.
hhs_map = _codes_from_one([
    "dqpwygqj",
    "atmpeygn",
    "lrircsnp",
    "mlyzmhmf",
    "bhuqouqj",
    "kbazzjca",
    "oxchjgsf",
    "qufhixun",
    "fpwskwrf",
    "lzgpxyit",
])

In [9]:
# Swap category labels for their integer codes, column by column, in both feature tables.
label_codes_by_column = {
    'education': edu_map,
    'age_group': age_map,
    'race': race_map,
    'income_poverty': inc_map,
    'employment_status': emp_map,
    'hhs_geo_region': hhs_map,
}
for frame in (df, target):
    for column, codes in label_codes_by_column.items():
        frame[column] = frame[column].replace(codes)

In [10]:
# ONE-HOT ENCODING
# Each listed text column is replaced by an int32 dummy column under a new name.
# drop_first=True discards the first dummy level; assigning the result to a single
# column presumably relies on exactly one dummy column remaining — verify against the data.
indicator_name_by_column = {'sex': 'Male', 'marital_status': 'Marital_Status'}

for frame in (df, target):
    for column, indicator_name in indicator_name_by_column.items():
        frame[indicator_name] = pd.get_dummies(frame[column], drop_first=True).astype('int32')
        frame.drop(columns=column, inplace=True)

In [11]:
# FILL VALUES FOR MISSING ENTRIES, KEYED BY COLUMN NAME
# (presumably each column's mode, as the original heading stated — verify against the data)
null_fill = {
    # behavioural flags
    "behavioral_outside_home": 0,
    "behavioral_antiviral_meds": 0,
    "behavioral_face_mask": 0,
    "behavioral_wash_hands": 1,
    "behavioral_large_gatherings": 0,
    "behavioral_avoidance": 1,
    "behavioral_touch_face": 1,
    # opinion scales
    "opinion_xyz_vacc_effective": 4,
    "opinion_xyz_risk": 2,
    "opinion_xyz_sick_from_vacc": 2,
    "opinion_seas_vacc_effective": 4,
    "opinion_seas_risk": 2,
    "opinion_seas_sick_from_vacc": 1,
    # concern / knowledge / doctor recommendation
    "xyz_concern": 2,
    "xyz_knowledge": 1,
    "doctor_recc_xyz": 0,
    "doctor_recc_seasonal": 0,
    # health background
    "chronic_med_condition": 0,
    "child_under_6_months": 0,
    "health_insurance": 1,
    "health_worker": 0,
    # household and integer-coded demographics
    "household_adults": 1,
    "household_children": 0,
    "employment_status": 3,
    "income_poverty": 3,
    "education": 4,
}

# Apply the per-column fill values to both feature tables in place.
for frame in (df, target):
    frame.fillna(null_fill, inplace=True)

In [12]:
# Move respondent_id out of the columns and into the index of both tables.
for frame in (df, target):
    frame.set_index('respondent_id', inplace=True)
# Columns that belong to one vaccine only; each list is dropped when building the other vaccine's table.
uniqueFeature_seas = [
    "opinion_seas_risk",
    "opinion_seas_sick_from_vacc",
    "opinion_seas_vacc_effective",
    "doctor_recc_seasonal",
]
uniqueFeature_xyz = [
    "opinion_xyz_vacc_effective",
    "opinion_xyz_sick_from_vacc",
    "doctor_recc_xyz",
    "opinion_xyz_risk",
    "xyz_concern",
    "xyz_knowledge",
]

In [13]:
# PER-VACCINE FEATURE TABLES
# Each table is the full table minus the columns that belong only to the other vaccine.
df_xyz = df.drop(columns=uniqueFeature_seas)
df_seas = df.drop(columns=uniqueFeature_xyz)

target_xyz = target.drop(columns=uniqueFeature_seas)
target_seas = target.drop(columns=uniqueFeature_xyz)

## READ TRAINING LABELS

In [15]:
# Load the label file and pull out one label column per vaccine.
# NOTE(review): the labels are paired with the feature rows by row order only —
# confirm that both files list respondents in the same order.
y_label = pd.read_csv('Dataset/training_set_labels.csv')
y_label_xyz, y_label_seas = y_label['xyz_vaccine'], y_label['seasonal_vaccine']

# TESTING AUC SCORE ON A HOLD-OUT SPLIT

In [17]:
# Scores written down by hand (presumably copied from an earlier run — nothing here recomputes them).
XYZ_ACCURACY, XYZ_AUC_SCORE = 0.8361697299750397, 0.8259381655272489
SEAS_ACCURACY, SEAS_AUC_SCORE = 0.7720671658724756, 0.8452752554393703

# Headline figure: the mean of the two per-vaccine AUC values.
FINAL_SCORE = 0.5 * (XYZ_AUC_SCORE + SEAS_AUC_SCORE)
FINAL_SCORE

0.8356067104833096

In [18]:
# Hold out a third of the seasonal rows for validation.
# random_state pins the shuffle, so the accuracy and AUC computed from this split are
# the same after every Restart & Run All (an unseeded split gives different numbers each run).
X_train, X_test, y_train, y_test = train_test_split(
    df_seas, y_label_seas, test_size=0.33, random_state=42)

# The scaler is fitted on the training part only; the scaled matrix gets its own name
# so X_train keeps referring to the unscaled split.
scaler_test_seas = StandardScaler()
X_train_scaled = scaler_test_seas.fit_transform(X_train)

# L2-penalised logistic regression fitted on the scaled training rows.
log_mod_seas = LogisticRegression(penalty='l2')
log_mod_seas.fit(X_train_scaled, y_train)

In [19]:
# Scale the hold-out features with the scaler that was fitted on the training split.
# NOTE(review): X_test is overwritten here, so running this cell twice scales it twice —
# re-run the split cell before re-running this one.
X_test = scaler_test_seas.transform(X_test)

# Score of the fitted model on the scaled hold-out rows (shown as the cell output).
log_mod_seas.score(X_test, y_test)

0.770024960290447

In [20]:
# Keep column 1 of predict_proba for every hold-out row and score it with ROC AUC.
predicted = log_mod_seas.predict_proba(X_test)[:, 1]

roc_auc_score(y_true=y_test, y_score=predicted)

0.8470186250031004

# CREATING LOGISTIC MODELS

### XYZ VACCINE

In [23]:
# Standardise the xyz feature table and fit the xyz model on the result.
# The scaled matrix gets its own name instead of overwriting df_xyz: with the overwrite,
# a second run of this cell would re-fit the scaler on already-scaled data.
scaler_xyz = StandardScaler()
df_xyz_scaled = scaler_xyz.fit_transform(df_xyz)

# L2-penalised logistic regression on all rows of the scaled table.
logisticModel_xyz = LogisticRegression(penalty='l2')
logisticModel_xyz.fit(df_xyz_scaled, y_label_xyz)

In [24]:
# Scale the rows to be scored with the already-fitted scaler, then keep column 1 of predict_proba.
# target_xyz itself is left untouched, so re-running this cell cannot scale the rows twice.
target_xyz_scaled = scaler_xyz.transform(target_xyz)
xyz_vaccine = logisticModel_xyz.predict_proba(target_xyz_scaled)[:, 1]

### SEASONAL VACCINE

In [26]:
# Standardise the seasonal feature table and fit the seasonal model on the result.
# The scaled matrix gets its own name instead of overwriting df_seas: with the overwrite,
# a second run of this cell would re-fit the scaler on already-scaled data, and any other
# cell reading df_seas would silently receive the scaled array.
scaler_seas = StandardScaler()
df_seas_scaled = scaler_seas.fit_transform(df_seas)

# L2-penalised logistic regression on all rows of the scaled table.
logisticModel_seas = LogisticRegression(penalty='l2')
logisticModel_seas.fit(df_seas_scaled, y_label_seas)

In [27]:
# Scale the rows to be scored with the already-fitted scaler, then keep column 1 of predict_proba.
# target_seas itself is left untouched, so re-running this cell cannot scale the rows twice.
target_seas_scaled = scaler_seas.transform(target_seas)
seasonal_vaccine = logisticModel_seas.predict_proba(target_seas_scaled)[:, 1]

## CREATING OUTPUT DATAFRAME

In [29]:
# Re-read only the id column of the raw file; the other columns are not needed here.
respondent = pd.read_csv('Dataset/test_set_features.csv', usecols=['respondent_id'])
respondent = respondent.respondent_id

# One row per respondent: the id plus the two predicted probabilities.
# The mapping is deliberately not called `dict`, which would shadow the builtin
# for every cell executed afterwards.
result_columns = {
    'respondent_id': respondent,
    'xyz_vaccine': xyz_vaccine,
    'seasonal_vaccine': seasonal_vaccine,
}
result = pd.DataFrame(result_columns)

In [30]:
# Write the predictions to result.csv without the frame's index column.
result.to_csv(path_or_buf='result.csv', index=False)