### Data Prep/Feature Eng
* use cross val for tuning and selecting hyperparameters
* use test set at very end on best model
* find optimal complexity to balance bias and variance

### TRY ALL MODELS - but have rationale on why you are trying models
* document the iterative process

### Deliverables
* Contract by Monday 2pm Mountain Time - communication frequency and tangible deadlines
* Model completed by Monday EOD
* proof of concept that your target and predictors are fit for machine learning classification
* decide as a team if target can be used as is or needs to be transformed
* FSM

### GROUP: Evan, Drew, Mustafa

In [153]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.impute import SimpleImputer

In [95]:
def education_to_num(x):
    """Map an education-level label to its ordinal rank.

    Parameters
    ----------
    x : object
        One of the four known education labels, or any non-string value
        (e.g. NaN for a missing answer).

    Returns
    -------
    int or object
        0 for '< 12 Years', 1 for '12 Years', 2 for 'Some College',
        3 for 'College Graduate'. Non-string inputs are returned unchanged
        so missing values survive the mapping.

    Raises
    ------
    KeyError
        If ``x`` is a string that is not one of the four known labels.
    """
    ed_dict = {'College Graduate':3,'Some College':2,'12 Years':1,'< 12 Years':0}
    # isinstance rather than `type(x) == str`, so str subclasses
    # (for example numpy.str_) are mapped too instead of passing through.
    if isinstance(x, str):
        return ed_dict[x]
    return x

In [96]:
# Read the feature table and the label table from the data directory.
df_var, df_tar = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/training_set_features.csv',
        '../data/training_set_labels.csv',
    )
)

In [97]:
# Drop Based On Relevance
# Columns judged not useful as predictors (presumably because the target used
# in this notebook is the seasonal vaccine, not H1N1 - confirm with the team).
irrelevant_columns = [
    'respondent_id',
    'h1n1_concern',
    'h1n1_knowledge',
    'opinion_h1n1_vacc_effective',
    'opinion_h1n1_risk',
    'opinion_h1n1_sick_from_vacc',
    'doctor_recc_h1n1',
    'hhs_geo_region',
]
df_var = df_var.drop(columns=irrelevant_columns)

In [98]:
# Share of missing values per column, as a percentage of all rows.
percent_nan = df_var.isna().sum() / len(df_var) * 100
# Display only the columns with more than 10% missing, rounded to whole percents.
percent_nan[percent_nan > 10].map(round)

health_insurance         46
income_poverty           17
employment_industry      50
employment_occupation    50
dtype: int64

In [99]:
# Drop based on NaN
# These are the columns listed above as having more than 10% missing values.
high_nan_columns = [
    'health_insurance',
    'income_poverty',
    'employment_industry',
    'employment_occupation',
]
df_var = df_var.drop(columns=high_nan_columns)

In [100]:
# Features and the seasonal-vaccine label side by side; the label column is
# named 'target' before concatenating so no column relabelling is needed after.
target_series = df_tar['seasonal_vaccine'].rename('target')
df_comb = pd.concat([df_var, target_series], axis=1)

### Train Test Split

In [101]:
# Split features and labels with a fixed seed so the partition is reproducible.
# NOTE(review): y is the whole labels frame (df_tar), not only the
# 'seasonal_vaccine' column used for df_comb - confirm that later cells
# select the intended target column before fitting.
X_train, X_test, y_train, y_test = train_test_split(
    df_var,
    df_tar,
    random_state=42,
)
# Independent copy (DataFrame.copy is deep by default) so the column
# assignments in later cells do not write through to df_var.
X_train = X_train.copy()

In [107]:
# List the predictor columns that remain after the two drop steps.
X_train.columns

Index(['behavioral_antiviral_meds', 'behavioral_avoidance',
       'behavioral_face_mask', 'behavioral_wash_hands',
       'behavioral_large_gatherings', 'behavioral_outside_home',
       'behavioral_touch_face', 'doctor_recc_seasonal',
       'chronic_med_condition', 'child_under_6_months', 'health_worker',
       'opinion_seas_vacc_effective', 'opinion_seas_risk',
       'opinion_seas_sick_from_vacc', 'age_group', 'education', 'race', 'sex',
       'marital_status', 'rent_or_own', 'employment_status', 'census_msa',
       'household_adults', 'household_children'],
      dtype='object')

In [113]:
# Imputing Majority Columns
# Missing entries in these columns are replaced by each column's most
# frequent value, learned from the training split only.
majority_columns = [
    'behavioral_antiviral_meds',
    'behavioral_avoidance',
    'behavioral_face_mask',
    'behavioral_wash_hands',
    'behavioral_large_gatherings',
    'behavioral_outside_home',
    'behavioral_touch_face',
    'doctor_recc_seasonal',
    'chronic_med_condition',
    'child_under_6_months',
    'health_worker',
    'education',
    'rent_or_own',
    'marital_status',
    'employment_status',
    'sex',
]
imputer_majority = SimpleImputer(strategy='most_frequent')

majority_filled = imputer_majority.fit_transform(X_train[majority_columns])
X_train[majority_columns] = majority_filled

In [110]:
# Imputing Opinion Columns
# Missing entries in the opinion-scale and household-count columns are
# replaced by each column's median, learned from the training split only.
imputer_opinion = SimpleImputer(strategy='median')
opinion_columns = ['opinion_seas_vacc_effective', 'opinion_seas_risk',
                   'opinion_seas_sick_from_vacc','household_adults', 'household_children'
                   ]

# Fit the median imputer created in this cell. Referring to any other imputer
# name here would either fail on a fresh kernel or silently apply a different
# strategy left over from an earlier session.
X_train[opinion_columns] = imputer_opinion.fit_transform(X_train[opinion_columns])

In [138]:
# One Hot Encoding
# Expand each multi-level column into indicator columns; drop='first' omits
# one level per column so the indicators are not redundant with each other.
ohe = OneHotEncoder(drop='first',categories='auto')
ohe_columns = ['opinion_seas_vacc_effective', 'opinion_seas_risk',
                   'opinion_seas_sick_from_vacc','age_group','education','race',
                   'employment_status', 'census_msa'
                   ]
ohe_array = ohe.fit_transform(X_train[ohe_columns])

# scikit-learn 1.0 added get_feature_names_out and 1.2 removed
# get_feature_names, so use whichever this installation provides.
if hasattr(ohe, 'get_feature_names_out'):
    ohe_feature_names = ohe.get_feature_names_out(ohe_columns)
else:
    ohe_feature_names = ohe.get_feature_names(ohe_columns)

# toarray() returns a plain ndarray (todense() returns the legacy np.matrix).
# The new frame gets a default 0..n-1 index.
ohe_df = pd.DataFrame(ohe_array.toarray(),columns=ohe_feature_names)

# X_train still carries the shuffled row labels from the split; give it the
# same 0..n-1 index so the column-wise concat pairs rows by position.
X_train = X_train.reset_index(drop=True)

# Untouched columns first, then the indicator columns; names are kept as-is.
X_train_ohe = pd.concat([X_train.drop(columns=ohe_columns),ohe_df],axis=1)

In [147]:
# Ordinal Encoding
# Two-level columns are replaced by a single 0/1 code each. Expected codes:
#   sex            : Female -> 0, Male -> 1
#   marital_status : Married -> 0, Not Married -> 1
#   rent_or_own    : Own -> 0, Rent -> 1
ordinal_columns = ['sex','marital_status','rent_or_own']

oe = OrdinalEncoder(categories='auto')
# Codes are learned from X_train and written into the one-hot-encoded frame.
X_train_ohe[ordinal_columns] = oe.fit_transform(X_train[ordinal_columns])

In [156]:
# Scaling Numerical Columns
# Every column of X_train_ohe is standardized, including the 0/1 indicators.
ss = StandardScaler().fit(X_train_ohe)
scaled_array = ss.transform(X_train_ohe)
# Wrap the scaled array in a DataFrame so the column names are kept.
X_train_ohe_scaled = pd.DataFrame(
    data=scaled_array,
    columns=X_train_ohe.columns,
)

In [159]:
# Print column dtypes and non-null counts for the fully preprocessed frame.
X_train_ohe_scaled.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20030 entries, 0 to 20029
Data columns (total 42 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   behavioral_antiviral_meds             20030 non-null  float64
 1   behavioral_avoidance                  20030 non-null  float64
 2   behavioral_face_mask                  20030 non-null  float64
 3   behavioral_wash_hands                 20030 non-null  float64
 4   behavioral_large_gatherings           20030 non-null  float64
 5   behavioral_outside_home               20030 non-null  float64
 6   behavioral_touch_face                 20030 non-null  float64
 7   doctor_recc_seasonal                  20030 non-null  float64
 8   chronic_med_condition                 20030 non-null  float64
 9   child_under_6_months                  20030 non-null  float64
 10  health_worker                         20030 non-null  float64
 11  sex            