# Import Libraries

In [None]:
!pip install optuna
!pip install category_encoders
!pip install skorch

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting optuna
  Downloading optuna-3.0.3-py3-none-any.whl (348 kB)
[K     |████████████████████████████████| 348 kB 5.2 MB/s 
Collecting cliff
  Downloading cliff-3.10.1-py3-none-any.whl (81 kB)
[K     |████████████████████████████████| 81 kB 8.4 MB/s 
Collecting alembic>=1.5.0
  Downloading alembic-1.8.1-py3-none-any.whl (209 kB)
[K     |████████████████████████████████| 209 kB 28.7 MB/s 
Collecting colorlog
  Downloading colorlog-6.7.0-py2.py3-none-any.whl (11 kB)
Collecting cmaes>=0.8.2
  Downloading cmaes-0.9.0-py3-none-any.whl (23 kB)
Collecting Mako
  Downloading Mako-1.2.4-py3-none-any.whl (78 kB)
[K     |████████████████████████████████| 78 kB 6.1 MB/s 
Collecting cmd2>=1.0.0
  Downloading cmd2-2.4.2-py3-none-any.whl (147 kB)
[K     |████████████████████████████████| 147 kB 43.1 MB/s 
Collecting pbr!=2.1.0,>=2.0.0
  Downloading pbr-5.11.0-py2.py3-none-any.whl (112 kB)
[K

In [None]:
# -------------------------------------
# IMPORT LIBRARIES

import pandas as pd
import numpy as np
import os
from datetime import datetime
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.model_selection import cross_val_score
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder
from sklearn.feature_selection import RFECV
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split

import optuna
from optuna.integration import SkorchPruningCallback
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader, TensorDataset
import torch.nn.functional as F
import torch.optim as optim
import skorch
from skorch import NeuralNetClassifier

import patsy
import re
from category_encoders import TargetEncoder

# Read in Files

In [None]:
# Mount Google Drive so the CSV files stored there can be read (Colab only)
from google.colab import drive
drive.mount('/drive')

# Single place to edit when the data lives in a different folder
DATA_DIR = '/drive/My Drive/Colab Notebooks/MMA 869 - Machine Learning & AI'

train = pd.read_csv(os.path.join(DATA_DIR, 'training_set_features.csv'))
train_labels = pd.read_csv(os.path.join(DATA_DIR, 'training_set_labels.csv'))
test = pd.read_csv(os.path.join(DATA_DIR, 'test_set_features.csv'))
sub_format = pd.read_csv(os.path.join(DATA_DIR, 'submission_format.csv'))


# Target columns, one per vaccine
y_train_h1n1 = train_labels['h1n1_vaccine']
y_train_seas = train_labels['seasonal_vaccine']


Mounted at /drive


# Imputation Strategies

In [None]:
def negOne_noneTag_imputation(train_set,test_set):
    """Impute with sentinel values: -1 for numeric columns, 'None' for object columns.

    Parameters
    ----------
    train_set, test_set : pd.DataFrame
        Feature frames. They are not modified.

    Returns
    -------
    (pd.DataFrame, pd.DataFrame)
        Imputed train and test frames. Numeric columns come first, followed by
        the object columns; columns of any other dtype are not carried over.
    """

    def _fill_with_sentinels(frame):
        # Missing numerics become -1, missing categoricals become the literal string 'None'
        numeric_part = frame.select_dtypes('number').fillna(value=-1)
        object_part = frame.select_dtypes('object').fillna(value='None')
        return pd.concat([numeric_part, object_part], axis=1)

    return _fill_with_sentinels(train_set), _fill_with_sentinels(test_set)


def knn_noneTag_imputation(train_set,test_set):
    """Impute numerics with a 5-nearest-neighbour imputer and categoricals with 'None'.

    The numeric columns (as detected on `train_set`) are standardised, passed
    through a KNNImputer fitted on the training data, and then mapped back to
    their original scale. Object columns get the literal string 'None' for
    missing values.

    Parameters
    ----------
    train_set, test_set : pd.DataFrame
        Feature frames; `test_set` must contain the numeric columns of `train_set`.

    Returns
    -------
    (pd.DataFrame, pd.DataFrame)
        Imputed train and test frames, numeric columns first, then object columns.
    """
    num_cols = list(train_set.select_dtypes('number').columns.values)

    def _relabel(values, source):
        # sklearn may hand back bare arrays; restore the column labels and row index of `source`
        return pd.DataFrame(values, columns=source[num_cols].columns, index=source.index)

    # Standardise first: the scaler is fitted on train only and reused for test
    standardizer = StandardScaler()
    train_scaled = _relabel(standardizer.fit_transform(train_set[num_cols]), train_set)
    test_scaled = _relabel(standardizer.transform(test_set[num_cols]), test_set)

    # Neighbours are looked up among the (scaled) training rows for both sets
    neighbour_imputer = KNNImputer(n_neighbors=5).fit(train_scaled)
    train_filled = _relabel(neighbour_imputer.transform(train_scaled), train_set)
    test_filled = _relabel(neighbour_imputer.transform(test_scaled), test_set)

    # Undo the standardisation so the imputed values are back on the original scale
    train_num = _relabel(standardizer.inverse_transform(train_filled), train_set)
    test_num = _relabel(standardizer.inverse_transform(test_filled), test_set)

    # Categorical gaps simply get their own 'None' level
    train_cats = train_set.select_dtypes('object').fillna(value='None')
    test_cats = test_set.select_dtypes('object').fillna(value='None')

    return (
        pd.concat([train_num, train_cats], axis=1),
        pd.concat([test_num, test_cats], axis=1),
    )


def mean_none_imputation(train_set,test_set):
    """Impute numerics with the training-set column mean and categoricals with 'None'.

    Parameters
    ----------
    train_set, test_set : pd.DataFrame
        Feature frames; `test_set` must contain the numeric columns of `train_set`.

    Returns
    -------
    (pd.DataFrame, pd.DataFrame)
        Imputed train and test frames, numeric columns first, then object columns.
    """
    num_cols = list(train_set.select_dtypes('number').columns.values)

    # Means are learned from the training data only and applied to both sets
    mean_imputer = SimpleImputer(strategy='mean')
    mean_imputer.fit(train_set[num_cols])

    def _impute_numeric(frame):
        subset = frame[num_cols]
        return pd.DataFrame(mean_imputer.transform(subset), columns=subset.columns, index=frame.index)

    # The imputed means are deliberately left unrounded (rounding seemed to decrease AUC)
    train_nums = _impute_numeric(train_set)
    test_nums = _impute_numeric(test_set)

    # Categorical gaps simply get their own 'None' level
    train_cats = train_set.select_dtypes('object').fillna(value='None')
    test_cats = test_set.select_dtypes('object').fillna(value='None')

    return (
        pd.concat([train_nums, train_cats], axis=1),
        pd.concat([test_nums, test_cats], axis=1),
    )

def meanGroup_none_imputation(train_set,test_set):
    """Impute numerics with group means learned on the training set; categoricals with 'None'.

    For every numeric column that has missing values (in either set) each gap is
    filled with the first available of:

    1. the training mean of the row's (hhs_geo_region, age_group) group,
    2. the training mean of the row's age_group,
    3. the overall training mean of the column.

    All means are computed from the training data before anything is filled.
    Object columns get the literal string 'None' for missing values.

    Parameters
    ----------
    train_set, test_set : pd.DataFrame
        Feature frames containing 'hhs_geo_region' and 'age_group'. They are
        copied, so the caller's frames are left untouched.

    Returns
    -------
    (pd.DataFrame, pd.DataFrame)
        Imputed train and test frames, numeric columns first, then object columns.
    """
    group_cols = ['hhs_geo_region', 'age_group']

    # Work on copies: imputing in place would silently change the caller's data
    train_set = train_set.copy()
    test_set = test_set.copy()

    numeric_cols = list(train_set.select_dtypes('number').columns.values)
    cols_to_impute = [
        col for col in numeric_cols
        if train_set[col].isna().any()
        or (col in test_set.columns and test_set[col].isna().any())
    ]

    for col in cols_to_impute:
        # Lookups are taken from the training data *before* this column is filled
        geo_age_means = train_set.groupby(group_cols)[col].mean().rename('_fill').reset_index()
        age_means = train_set.groupby('age_group')[col].mean()
        overall_mean = train_set[col].mean()

        for frame in (train_set, test_set):
            if col not in frame.columns or not frame[col].isna().any():
                continue

            # Candidate fill values per row, most specific first. A left merge keeps
            # the row order of `frame`, so positions line up with `values` below.
            by_geo_age = (
                frame[group_cols]
                .merge(geo_age_means, on=group_cols, how='left')['_fill']
                .to_numpy(dtype=float)
            )
            by_age = frame['age_group'].map(age_means).to_numpy(dtype=float)

            values = frame[col].to_numpy(dtype=float, copy=True)
            for candidate in (by_geo_age, by_age):
                still_missing = np.isnan(values)
                values[still_missing] = candidate[still_missing]
            # Last resort for groups/ages never seen (or entirely missing) in training
            values[np.isnan(values)] = overall_mean

            frame[col] = values

    # Separate numerics from categories so they can be recombined after filling the categoricals
    train_nums = train_set.select_dtypes('number')
    test_nums = test_set.select_dtypes('number')

    train_cats = train_set.select_dtypes('object').fillna(value='None')
    test_cats = test_set.select_dtypes('object').fillna(value='None')

    train_df = pd.concat([train_nums, train_cats], axis=1)
    test_df = pd.concat([test_nums, test_cats], axis=1)

    return train_df, test_df



# Feature Engineering

In [None]:
# Categorical columns to group by and numeric features to average within each group.
# Presumably these are the `bygroupCols` / `calcCols` arguments for add_catMeans — TODO confirm at the call site.
catFeat_to_group = [
    'age_group',
    'race',
    'hhs_geo_region',
    'employment_status',
    'income_poverty',
]
feat_to_calc = [
    'doctor_recc_h1n1',
    'doctor_recc_seasonal',
    'h1n1_concern',
    'h1n1_knowledge',
    'health_insurance',
    'chronic_med_condition',
    'opinion_h1n1_vacc_effective',
    'opinion_h1n1_risk',
    'opinion_h1n1_sick_from_vacc',
]

def add_catMeans(training,testing,bygroupCols,calcCols):
    """Add per-category training means of selected features to both data sets.

    For every grouping column `g` in `bygroupCols` and every feature `f` in
    `calcCols`, a column named ``f + g + '_Mean'`` is added that holds the mean
    of `f` over the training rows sharing the row's value of `g`. Categories
    that only occur in `testing` get NaN.

    Parameters
    ----------
    training, testing : pd.DataFrame
        Feature frames; they are not modified.
    bygroupCols : iterable of str
        Categorical columns to group by.
    calcCols : str or iterable of str
        Numeric columns to average within each group.

    Returns
    -------
    (pd.DataFrame, pd.DataFrame)
        New train and test frames with the mean columns appended. Row order
        and index are preserved.
    """
    # Accept a single column name as well as a list of names
    calc_cols = [calcCols] if isinstance(calcCols, str) else list(calcCols)

    for col in bygroupCols:
        # One row per category of `col`; means always come from the training data
        means = training.groupby(col)[calc_cols].mean().add_suffix(col+'_Mean')

        # join(on=col) looks each row's category up in `means` and keeps the left index
        training = training.join(means, on=col)
        testing = testing.join(means, on=col)

    return training, testing


def add_opinionMean(training,testing):
    """Flag, per opinion question, whether a response is above the training average.

    For each of the six opinion columns a 0/1 column named
    ``<column>_ovrAvg`` is added: 1 when the value is strictly greater than
    the mean of that column in `training`, otherwise 0 (this includes NaN).

    Note: both frames are modified in place and the same objects are returned.

    Parameters
    ----------
    training, testing : pd.DataFrame
        Feature frames containing the six opinion columns.

    Returns
    -------
    (pd.DataFrame, pd.DataFrame)
        The same `training` and `testing` objects, with the flag columns added.
    """
    opinion_cols = (
        'opinion_h1n1_vacc_effective',
        'opinion_h1n1_risk',
        'opinion_h1n1_sick_from_vacc',
        'opinion_seas_vacc_effective',
        'opinion_seas_risk',
        'opinion_seas_sick_from_vacc',
    )

    for name in opinion_cols:
        flag_name = name + '_ovrAvg'
        # The threshold always comes from the training data, also for the test flags
        train_mean = training[name].mean()

        training[flag_name] = np.where(training[name] > train_mean, 1, 0)
        testing[flag_name] = np.where(testing[name] > train_mean, 1, 0)

    return training, testing


formula = "~ hhs_geo_region*education + age_group*education*sex + income_poverty*census_msa"
def create_matrix(training,testing,matrix_formula):
    """Append patsy design-matrix (interaction) columns to the train and test features.

    Both sets are stacked before the matrix is built so that every category
    level found in either set gets a column, then split apart again.

    Parameters
    ----------
    training, testing : pd.DataFrame
        Feature frames containing the variables named in `matrix_formula`.
        They are not modified.
    matrix_formula : str
        A patsy formula, e.g. the module-level `formula`.

    Returns
    -------
    (pd.DataFrame, pd.DataFrame)
        The original columns plus the design-matrix columns, for train and
        test respectively, each carrying its original row index.
    """
    n_train = len(training)

    # Stack with a fresh RangeIndex: train and test usually share row labels, and
    # duplicate labels make the column-wise concat below ambiguous.
    input_data = pd.concat([training, testing], axis=0, ignore_index=True)

    # Create interaction matrix
    interact_matrix = patsy.dmatrix(matrix_formula, input_data, return_type="dataframe")

    # Remove the intercept and any matrix column that merely repeats a raw variable.
    # Categorical terms are expanded by patsy under other names, so these labels may
    # be absent; errors='ignore' keeps that from raising.
    # NOTE(review): the original comment suggests the raw variables were meant to be
    # dropped from the data itself because their dummies are in the matrix — confirm intent.
    raw_terms = ['Intercept','hhs_geo_region','education','age_group','sex','income_poverty','census_msa']
    interact_matrix = interact_matrix.drop(columns=raw_terms, errors='ignore')

    # Combine matrix with original data (aligned on the unique RangeIndex)
    input_data = pd.concat([input_data, interact_matrix], axis=1)

    # A helper flag left behind by other steps must never reach the output
    input_data = input_data.drop(columns=['trainTest'], errors='ignore')

    # Split by position and hand each part its original row labels back
    x_trainOutput = input_data.iloc[:n_train].copy()
    x_trainOutput.index = training.index

    x_testOutput = input_data.iloc[n_train:].copy()
    x_testOutput.index = testing.index

    return x_trainOutput, x_testOutput

# Categorical Encoding

In [None]:
def ordinal_encoder(x_train,x_test):
    """Replace the four ordered categorical columns with integer ranks.

    The columns 'age_group', 'education', 'employment_status' and
    'income_poverty' are mapped through fixed dictionaries ('None' -> 0, then
    ascending). Values not present in a dictionary become NaN.

    Parameters
    ----------
    x_train, x_test : pd.DataFrame
        Feature frames containing the four columns. They are not modified.

    Returns
    -------
    (pd.DataFrame, pd.DataFrame)
        Encoded copies of the train and test frames.
    """
    ordinal_maps = {
        'age_group': {"None":0, "18 - 34 Years":1, "35 - 44 Years":2, "45 - 54 Years":3, "55 - 64 Years":4, "65+ Years":5},
        'education': {"None":0, "< 12 Years":1, "12 Years":2, "Some College":3, "College Graduate":4},
        'employment_status': {"None":0, "Unemployed":1, "Not in Labor Force":2, "Employed":3},
        'income_poverty': {"None":0, "Below Poverty":1, "<= $75,000, Above Poverty":2, "> $75,000":3},
    }

    def _encode(frame):
        # The mapping is static, so each set can be encoded on its own copy;
        # no train/test flag column or concat is needed.
        encoded = frame.copy()
        for col, mapping in ordinal_maps.items():
            encoded[col] = encoded[col].map(mapping)
        # A helper flag left behind by other steps must never reach the output
        return encoded.drop(columns=['trainTest'], errors='ignore')

    return _encode(x_train), _encode(x_test)


def dummy_encoder(x_trainData, x_testData):

    # Create flag before concat
    x_trainData['trainTest'] = 'train'
    x_testData['trainTest'] = 'test'

    # Concat into one
    input_data = pd.concat([x_trainData,x_testData],axis=0)
    
    # Loop through object columns and transform to dummy variable
    collector = pd.DataFrame()
    for col in input_data.select_dtypes('object'):
        if col != 'trainTest':
            col_dummies = pd.get_dummies(input_data[col], drop_first=True, prefix=col, prefix_sep='_')
            collector = pd.concat([collector, col_dummies], axis=1)

    # Combine encoded object data with numeric data
    output_data = pd.concat([input_data.select_dtypes(['number']),collector,input_data['trainTest']],axis=1)

    # Split data up and drop flag column created earlier
    x_trainOutput = output_data[output_data['trainTest']=='train']
    x_trainOutput.drop(['trainTest'],axis=1,inplace=True)
    
    x_testOutput = output_data[output_data['trainTest']=='test']
    x_testOutput.drop(['trainTest'],axis=1,inplace=True)

    return x_trainOutput, x_testOutput



def target_encoder(x_trainCV, x_valCV, y_trainCV):
    """Target-encode every object column, fitting on the training fold only.

    Each object column of `x_trainCV` gets its own TargetEncoder, fitted on
    that column and `y_trainCV`; the fitted encoder is then applied to the
    same column of `x_valCV`.

    Note: both frames are modified in place and the same objects are returned.

    Parameters
    ----------
    x_trainCV, x_valCV : pd.DataFrame
        Training-fold and validation-fold features.
    y_trainCV : array-like
        Targets for `x_trainCV`.

    Returns
    -------
    (pd.DataFrame, pd.DataFrame)
        The same `x_trainCV` and `x_valCV` objects with encoded columns.
    """
    # Snapshot the object columns up front; they are overwritten as the loop runs
    categorical_names = list(x_trainCV.select_dtypes('object').columns)

    for name in categorical_names:
        encoder = TargetEncoder()

        x_trainCV[name] = encoder.fit_transform(x_trainCV[name], y_trainCV)
        x_valCV[name] = encoder.transform(x_valCV[name])

    return x_trainCV, x_valCV


def woe_encoder(x_trainCV,x_valCV,y_trainCV):
    """Replace each object column by a per-category log-odds ("weight of evidence") value.

    For every category the value is ``log(share of 1s / share of 0s)`` among
    the training rows of that category, which equals ``log(n1 / n0)``.

    Edge cases:

    * A category with no 0s or no 1s in training would give an infinite or
      undefined value; for those categories only, 0.5 is added to both counts.
    * A validation category that never occurs in training gets the mean of
      the training categories' values for that feature.
    * Missing values stay missing.

    Parameters
    ----------
    x_trainCV, x_valCV : pd.DataFrame
        Training-fold and validation-fold features. Neither is modified.
    y_trainCV : pd.Series
        Named binary (0/1) target aligned with `x_trainCV`.

    Returns
    -------
    (pd.DataFrame, pd.DataFrame)
        Encoded copies of the training and validation features; every former
        object column is float.
    """
    target = y_trainCV.name

    # The values are always learned from the training fold
    xy = pd.concat([x_trainCV, y_trainCV], axis=1)
    x_valOutput = x_valCV.copy()

    for col in xy.select_dtypes('object'):
        if col == target:
            continue

        # Rows: categories, columns: counts of target == 0 and target == 1
        counts = (
            xy.groupby([col, target]).size()
            .unstack(fill_value=0)
            .reindex(columns=[0, 1], fill_value=0)
            .astype(float)
        )
        n0, n1 = counts[0], counts[1]

        # Continuity correction, applied only where a cell is empty so that
        # all other categories keep their plain log-odds value
        correction = ((n0 == 0) | (n1 == 0)).astype(float) * 0.5
        n0 = n0 + correction
        n1 = n1 + correction

        total = n0 + n1
        woe = np.log((n1 / total) / (n0 / total))
        unseen_value = woe.mean()

        xy[col] = xy[col].map(woe).astype(float)

        val_raw = x_valOutput[col]
        val_woe = val_raw.map(woe)
        # Present in validation but never seen in training -> feature-level mean
        unseen = val_raw.notna() & val_woe.isna()
        x_valOutput[col] = val_woe.mask(unseen, unseen_value).astype(float)

    # Drop y variable
    x_fitOutput = xy.drop([target], axis=1)

    return x_fitOutput, x_valOutput

In [None]:
def cleanup(x_training, x_testing):
    """Drop identifier-like columns and strip awkward characters from column names.

    Removes the columns 'respondent_id', 'index' and 'level_0' when present,
    then deletes the characters ``< > [ ] + . : ,`` from every column name
    (LightGBM doesn't like them in feature names).

    Parameters
    ----------
    x_training, x_testing : pd.DataFrame
        Feature frames with string column names. They are not modified.

    Returns
    -------
    (pd.DataFrame, pd.DataFrame)
        Cleaned copies of the train and test frames.
    """
    id_like_cols = ['respondent_id','index','level_0']
    banned_chars = ['<', '>', '[' ,']' ,'+', '.', ':', ',']

    def _tidy(frame):
        # drop() returns a new frame even when nothing matches, so renaming
        # below never touches the caller's column labels
        frame = frame.drop(columns=[col for col in id_like_cols if col in frame.columns])

        names = frame.columns
        for char in banned_chars:
            # regex=False: '[', '+' and '.' must be removed literally, whatever
            # the default of `regex` is in the installed pandas version
            names = names.str.replace(char, '', regex=False)
        frame.columns = names
        return frame

    return _tidy(x_training), _tidy(x_testing)

# Pipelines

In [None]:
def pipe_baseline(training_data,testing_data,y_labels):
    """Baseline preprocessing pipeline.

    Steps, in order:
      1. negOne_noneTag_imputation - -1 / 'None' as missing-value indicators
      2. ordinal_encoder           - ordered categoricals to integer ranks
      3. dummy_encoder             - remaining categoricals to dummy columns
      4. cleanup                   - drop id columns, sanitise column names

    `y_labels` is accepted but not used by this pipeline.

    Returns the processed (train, test) feature frames.
    """
    imputed = negOne_noneTag_imputation(training_data, testing_data)
    ranked = ordinal_encoder(*imputed)
    dummied = dummy_encoder(*ranked)
    return cleanup(*dummied)

def pipe_meanGroupNone_ord_dummy(training_data,testing_data,y_labels):
    """Group-mean imputation followed by ordinal and dummy encoding.

    Steps, in order:
      1. meanGroup_none_imputation - group means for numerics, 'None' for categoricals
      2. ordinal_encoder           - ordered categoricals to integer ranks
      3. dummy_encoder             - remaining categoricals to dummy columns
      4. cleanup                   - drop id columns, sanitise column names

    `y_labels` is accepted but not used by this pipeline.

    Returns the processed (train, test) feature frames.
    """
    imputed = meanGroup_none_imputation(training_data, testing_data)
    ranked = ordinal_encoder(*imputed)
    dummied = dummy_encoder(*ranked)
    return cleanup(*dummied)

def pipe_meanNone_targEncode(training_data,testing_data,y_labels):
    """Group-mean imputation followed by target encoding of all categoricals.

    Steps, in order:
      1. meanGroup_none_imputation - group means for numerics, 'None' for categoricals
      2. target_encoder            - categoricals encoded against `y_labels`
      3. cleanup                   - drop id columns, sanitise column names

    NOTE(review): the function name says "meanNone" but the imputer called is
    meanGroup_none_imputation, not mean_none_imputation — confirm which was intended.

    Returns the processed (train, test) feature frames.
    """
    imputed_train, imputed_test = meanGroup_none_imputation(training_data, testing_data)
    encoded = target_encoder(imputed_train, imputed_test, y_labels)
    return cleanup(*encoded)

# Tuning with Optuna

In [None]:
# Label column the tuning below is run against: y_train_h1n1 or y_train_seas.
target_vaccine = y_train_h1n1 # <- change this to seasonal flu or h1n1 as needed

In [None]:
# RUN DATA THROUGH PIPELINE
# RUN DATA THROUGH PIPELINE
X_train, X_test = pipe_baseline(train,test,target_vaccine)

# SCALE DATA
# NOTE(review): the scaler is fitted on all of X_train before the fit/validation
# split, and X_test is not scaled in this cell.
scaler = StandardScaler().fit(X_train)
X_train = pd.DataFrame(
    scaler.transform(X_train),
    columns=X_train.columns,
    index=X_train.index,
)

# SPLIT DATA (75% to fit on, 25% held out for the Optuna objective)
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train,
    target_vaccine,
    test_size=0.25,
    random_state=42,
)

# TRANSFORM DATA TO NUMPY (float32 features, int64 class labels)
from skorch.callbacks import BatchScoring, EarlyStopping
X_fit, X_val = (part.to_numpy().astype(np.float32) for part in (X_fit, X_val))
y_fit, y_val = (part.to_numpy().astype(np.int64) for part in (y_fit, y_val))

# Train on the GPU when one is available
device = "cpu" if not torch.cuda.is_available() else "cuda"

class ClassifierModule(nn.Module):
    """Feed-forward binary classifier whose architecture is sampled from an Optuna trial.

    The trial chooses the number of hidden layers ("n_layers", 1-10), one
    shared dropout rate ("dropout", 0.2-0.5) and the width of every hidden
    layer ("n_units_layer_<i>", 4-128, sampled on a log scale). Each hidden
    block is Linear -> Dropout -> ReLU; the final Linear layer has 2 outputs.

    Parameters
    ----------
    trial : optuna.Trial
        Trial used to sample the hyper-parameters above.
    input_dim : int, optional
        Number of input features. When omitted, the number of columns of the
        module-level `X_train` is used.
    """
    def __init__(self, trial: optuna.Trial, input_dim=None) -> None:
        super().__init__()

        # Suggest a number of layers and the dropout rate shared by all of them
        n_layers = trial.suggest_int("n_layers", 1, 10)
        dropout = trial.suggest_float("dropout", 0.2, 0.5)

        # Fall back to the notebook-level training frame so ClassifierModule(trial) keeps working
        if input_dim is None:
            input_dim = len(X_train.columns)

        layers = []
        for i in range(n_layers):

            # Suggest output nodes for each layer
            output_dim = trial.suggest_int("n_units_layer_{}".format(i), 4, 128, log=True)

            layers.append(nn.Linear(input_dim, output_dim))
            layers.append(nn.Dropout(dropout))
            layers.append(nn.ReLU())

            # The next layer (or the output layer) consumes this layer's output
            input_dim = output_dim

        # Final output layer with 2 (binary) output dimensions
        layers.append(nn.Linear(input_dim, 2))

        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Return class probabilities (softmax over the last dimension)."""
        return F.softmax(self.model(x), dim=-1)

# Define optuna objective function that is based on the auc score
def objective(trial: optuna.Trial) -> float:
    """Optuna objective: train one sampled network and score it by validation ROC AUC.

    The network is fitted on the module-level `X_fit` / `y_fit` and scored on
    `X_val` / `y_val` using the predicted probability of class 1.

    NOTE(review): the pruning callback monitors "valid_acc" while the value
    returned to Optuna is ROC AUC — confirm this mismatch is intended.
    """
    module = ClassifierModule(trial)
    pruning = SkorchPruningCallback(trial, "valid_acc")

    net = skorch.NeuralNetClassifier(
        module,
        max_epochs=100,
        lr=0.1,
        device=device,
        callbacks=[pruning],
    )
    net.fit(X_fit, y_fit)

    positive_scores = net.predict_proba(X_val)[:,1]
    return roc_auc_score(y_val, positive_scores)

In [None]:
# Maximise the validation AUC returned by `objective`; no explicit pruner is passed
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=2000)

print(f"Number of finished trials: {len(study.trials)}")

# Report the best trial: its score and the hyper-parameters that produced it
print("Best trial:")
trial = study.best_trial

print(f"  Value: {trial.value}")

print("  Params: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")