## Hyperparameter Tuning (Cross Validation)

#### Goal:
To determine optimized hyperparameters for SVM, random forest, and gradient boosted trees classifiers

#### Input(s):

CSV file with preprocessed narrative text ({my_directory}/processed_narr_{batch_date}.csv)
File with case/control flags ('E:/Data Science Demonstration Project/SUDORSdataTruth.xlsx')
#### Output(s):

CSV file with tuned hyperparameters for each model ({my_directory}/tuned_params_{batch_date}.pickle)
#### To run, set 2 variables and make sure correct input files are specified in first cell:

my_directory = where you want outputs to save (e.g., 'C:/Users/dc20b49/Documents/TDH_DS_Demo/')
batch_date = batch_date used in 1_preprocessing_pipeline.ipynb (e.g, '8-4-22')
## Setup

In [4]:
import pandas as pd

my_directory = 'C:/Users/dc20b46/Desktop/tndh_ds_demo/'
batch_date = '8-10-22'

print(f'''my_directory = {my_directory},
batch_date = {batch_date}''')

# load data
narr_df = pd.read_csv(f'{my_directory}/processed_narr_{batch_date}.csv')
print(narr_df.shape)

# assign case-control flags
cases = pd.read_csv(f'{my_directory}/')
cases = pd.read_excel('T:/Data Science Demonstration Project/SUDORSdataTruth.xlsx') 
print(cases.shape)

print('Data loaded')

my_directory = C:/Users/dc20b46/Desktop/tndh_ds_demo/,
batch_date = 8-10-22


my_directory = C:/Users/dc20b46/Documents/tndh_ds_ddemo/,
batch_date = 8-10-22

### Prepare Test and Train Datasets

In [None]:
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
import re
import numpy as np

def transform_data(count_matrix, vectorizer, idx):
    count_array = count_matrix.toarray()
    df_transformed = pd.DataFrame(data=count_array, columns = vectorizer.get_feature_names_out())
    df_transformed = df_transformed[[c for c in df_transformed.columns if not re.match(r'^\d+$', c) and len(c) > 3]]
    df_transformed.set_index(idx, inplace = True)
    return df_transformed

# assign case/control flags
narr_df['DID'] = narr_df['DID'].astype(str)
cases['DID'] = cases['DID'].astype(str)
narr_df['case'] = narr_df['DID'].isin(cases.DID.unique()).astype(int)

# remove autopsies < 100 characters
print(f'''Autopsies removed: {np.sum(narr_df.full_narr_lemma_text_len < 100)} ''')
narr_df = narr_df.loc[narr_df.full_narr_lemma_text_len >= 100]

# set test/train data
narr_df['train'] = narr_df['year'].apply(lambda x: x < 2021).astype(int)
narr_df.set_index('DID', inplace = True)

# shuffle data
shuffled_df = shuffle(narr_df, random_state = 0)

# DIDs for train, calibration, and test sets
all_train_DID = shuffled_df.loc[shuffled_df.train==1].index
train_DID, calib_DID = train_test_split(all_train_DID, test_size = 0.05, random_state = 0)
test_DID = shuffled_df.loc[shuffled_df.train==0].index

# get labels
ytrain, _, ytest = shuffled_df['case'].loc[train_DID], shuffled_df['case'].loc[calib_DID], shuffled_df['case'].loc[test_DID]

print(f'''Total: {shuffled_df.shape[0]}
Train: {len(train_DID)}
[Calibration: {len(calib_DID)}]
Test: {len(test_DID)}
''')

Autopsies removed: 146 
Total: 17375
Train: 10240
Calibration: 539
Test: 6596

In [None]:
### term frequency
# # create vocabulary based on training set
# coun_vect = CountVectorizer(min_df = 20)
# count_matrix = coun_vect.fit_transform(shuffled_df.loc[train_DID]['full_narr_lemma_text'])
# word_count_train = transform_data(count_matrix, coun_vect, train_DID)

# # calibration
# count_matrix_calib = coun_vect.transform(shuffled_df.loc[calib_DID]['full_narr_lemma_text'])
# word_count_calib = transform_data(count_matrix_calib, coun_vect, calib_DID)

# # test
# count_matrix_test = coun_vect.transform(shuffled_df.loc[test_DID]['full_narr_lemma_text'])
# word_count_test = transform_data(count_matrix_test, coun_vect, test_DID)

### TF-IDF
# create vocabulary based on training set
tfidf_vect = TfidfVectorizer(min_df=20)
tfidf_matrix = tfidf_vect.fit_transform(shuffled_df.loc[train_DID]['full_narr_lemma_text'])#.values.astype('U'))
tfidf_train = transform_data(tfidf_matrix, tfidf_vect, train_DID)

# calibration
# tfidf_matrix_calib = tfidf_vect.transform(shuffled_df.loc[calib_DID]['full_narr_lemma_text'])
# tfidf_calib = transform_data(tfidf_matrix_calib, tfidf_vect, calib_DID)

# test
tfidf_matrix_test = tfidf_vect.transform(shuffled_df.loc[test_DID]['full_narr_lemma_text'])
tfidf_test = transform_data(tfidf_matrix_test, tfidf_vect, test_DID)

# datasets
datasets = {
    #'Freq': [word_count_train, word_count_calib, word_count_test],
            'TFIDF': [tfidf_train, tfidf_calib, tfidf_test]}

xtrain, _, xtest = datasets['TFIDF']

print(f'''
Vocabulary: {tfidf_train.shape}
Total: {shuffled_df.shape[0]}
Train: {xtrain.shape[0]}
Test: {xtest.shape[0]}
''')


Vocabulary: (10240, 4006)
Total: 17375
Train: 10240
Test: 6596

## Support Vector Machine

In [None]:
%%time

from sklearn import svm
from sklearn.model_selection import GridSearchCV

params = {
    'C': [1, 10, 100, 1000],
    'gamma': [0.1, 0.01, 0.001, 0.0001],
    'kernel': ['rbf']
}
clf = GridSearchCV(
    estimator=svm.SVC(),
    param_grid=params,
    cv=5,
    n_jobs=5,
    verbose=1
)
clf.fit(xtrain, ytrain)
print(clf.best_params_)

Fitting 5 folds for each of 16 candidates, totalling 80 fits
{'C': 10, 'gamma': 0.1, 'kernel': 'rbf'}
Wall time: 37min 59s

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

# Create the random grid of hyperparameters
parameter_grid = {'n_estimators': [250, 275, 300, 325, 350, 375, 400, 425, 450],
                  'max_depth': [35, 37, 39, 41, 43, 45, 47],
                  'min_samples_split': [2, 3, 4, 5, 6, 7, 8],
                  'min_samples_leaf': list(np.arange(1, 3)),
                  'oob_score':  [True]}

def randomforest_cv(X_train, y_train, parameter_grid = parameter_grid):
    
    clf = RandomForestClassifier()

    rf_cf = RandomizedSearchCV(clf, parameter_grid, 
                                   n_iter=50, cv=5, n_jobs=6)

    rf_cf.fit(X_train, y_train)
    
    return rf_cf

In [None]:
%%time

# train models
rfc_cv = randomforest_cv(xtrain, ytrain)

print(rfc_cv.best_estimator_)

RandomForestClassifier(max_depth=43, min_samples_split=6, n_estimators=450,
                       oob_score=True)
Wall time: 48min 20s

## XGBoost

In [None]:
import xgboost as xgb

def xgbc_CV(xgb_params, param_grid, xtrain, ytrain):
    
    min_error = np.inf
    best_params = None
    
    dtrain = xgb.DMatrix(xtrain, label=ytrain)

    for max_depth, min_child_weight, eta, alpha, lambda_val in param_grid:

        # Update our parameters
        xgb_params['max_depth'] = max_depth
        xgb_params['min_child_weight'] = min_child_weight
        xgb_params['eta'] = eta
        xgb_params['reg_alpha'] = alpha
        xgb_params['reg_lambda'] = lambda_val

        # Run CV
        cv_results = xgb.cv(
            xgb_params,
            dtrain,
            num_boost_round=50,
            nfold=5,
            metrics={'error'},
            early_stopping_rounds=3
        )
        
        # Update best score
        mean_error = cv_results['test-error-mean'].min()
        boost_rounds = cv_results['test-error-mean'].argmin()
        #print("\tError {} for {} rounds".format(mean_error, boost_rounds))
        if mean_error < min_error:
            min_error = mean_error
            best_params = (max_depth, min_child_weight, eta, alpha, lambda_val)

    return best_params


Wall time: 617 ms

In [None]:
%%time
import itertools

# random grid
max_depths = [3, 4, 5, 6, 7]
min_child_wghts = [2, 3, 4, 5]
etas = [0.1, 0.25, 0.5]
alphas = [0, 0.001, 0.01, 0.1, 0.2, 0.25, 0.3, 0.4]
lambdas = [0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5]

hyp_params = [max_depths,
              min_child_wghts,
              etas,
              alphas,
              lambdas]

param_grid = list(itertools.product(*hyp_params))

# parameters
xgb_params = {
        'max_depth': 5,
        'eta': 0.25,
        'objective': 'binary:hinge',
        'eval_metric': 'error',
        'colsample_bytree': 0.5,
        'reg_alpha' : 0,
        'reg_lambda' : 0
}

print('Performing cross validation...')

# hyperparameters of best model
best_params = xgbc_CV(xgb_params, param_grid, xtrain, ytrain)

# best params
max_depth, min_child_weight, eta, alpha, lambda_val = best_params

xgb_params['max_depth'] = max_depth
xgb_params['min_child_weight'] = min_child_weight
xgb_params['eta'] = eta
xgb_params['reg_alpha'] = alpha
xgb_params['reg_lambda'] = lambda_val

xgb_params

{'max_depth': 4,
 'eta': 0.25,
 'objective': 'binary:hinge',
 'eval_metric': 'error',
 'colsample_bytree': 0.5,
 'reg_alpha': 0,
 'reg_lambda': 0.2,
 'min_child_weight': 5}

## Save Tuned Parameters

In [None]:
import pickle

tuned_params = {'SVM': clf.best_params_,
                'RF': rfc_cv.best_params_,
                'XGB': xgb_params}

with open(f'{my_directory}/tuned_params_{batch_date}.pickle', 'wb') as handle:
    pickle.dump(tuned_params, handle)
    
print(f'Tuned parameters saved to {my_directory}/tuned_params_{batch_date}.pickle')