In [29]:
import pandas as pd
import numpy as np
from typing import Mapping, Sequence
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import xgboost
from scipy.stats import randint

In [2]:
train = pd.read_csv('data/processed/cases_train.csv', parse_dates=['date_confirmation'])
train['date_confirmation'] = train['date_confirmation'].apply(lambda r: r.value)

# Label Encoding

In [3]:
le = LabelEncoder()
for col in train.columns:
    if train[col].dtype == 'object' and col != 'outcome_group':
        train[col] = le.fit_transform(train[col])

train['outcome_group'] = pd.Categorical(train['outcome_group']).codes
X = train.iloc[:, 0:-1]
y = train.iloc[:, -1]

# Feature Selection

In [4]:
X_train_fs, X_valid_fs, y_train_fs, y_valid_fs = train_test_split(X, y, stratify=y, train_size=0.8)

In [6]:
classifiers = [(LogisticRegression(),'Logistic Regression'),
                   (GaussianNB(),'GaussianNB'),
                   (KNeighborsClassifier(),'KNN'),
                   (DecisionTreeClassifier(),'Decision Tree'),
                   (SVC(),'Support Vector')]
    
best_score = 0
best_classifier = None
best_m:str = None
for (c,m) in classifiers:
    c.fit(X_train_fs,y_train_fs)
    valid_score = c.score(X_valid_fs,y_valid_fs)
    if valid_score > best_score:
        best_score = valid_score
        best_classifier = c
        best_m = m

print('proceeding with classifier {0} with valid score {1}'.format(best_m,round(best_score,6)))

cols: Sequence[str] = X_train_fs.columns.to_list()
desired_n_features = round(len(cols)*0.75)

while (len(cols) != desired_n_features):
    print('working with {0}/{1} features'.format(len(cols),X_train_fs.shape[1]))
    least_reduction = 1
    for f in cols:
        X_train_fs_no_f = X_train_fs[cols]
        X_valid_fs_no_f = X_valid_fs[cols]
        best_classifier.fit(X_train_fs_no_f,y_train_fs)
        valid_score_no_f = best_classifier.score(X_valid_fs_no_f, y_valid_fs)
        reduction = best_score-valid_score_no_f
        if reduction < least_reduction:
            print('reduction \\f {0}: {1}'.format(f,round(reduction,6)))
            least_reduction = reduction
            least_reduction_feature = f
    print('removing feature', least_reduction_feature)
    cols.pop(cols.index(least_reduction_feature))
print('done!')

proceeding with classifier KNN with valid score 0.937264
working with 14/14 features
reduction \f age: 0.0
removing feature age
working with 13/14 features
reduction \f sex: 0.002904
removing feature sex
working with 12/14 features
reduction \f province: 0.001452
removing feature province
working with 11/14 features
reduction \f country: 0.001452
removing feature country
done!


# Balancing Dataset

In [35]:
from sklearn.model_selection import StratifiedKFold

X = X[cols]
k = len(classifiers)
k_fold = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)
X_train, X_valid, y_train, y_valid = train_test_split(X,y, train_size=0.8)

# Building Models and Hyperparameter Tuning

In [34]:
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingRandomSearchCV

import warnings
warnings.filterwarnings('ignore') 

### Logistic Regression

In [25]:
lr = LogisticRegression()
params = {
    'penalty': [None, 'l2', 'l1'],
    'C': np.linspace(0.001,1,20),
    'solver': ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga']
}
best_lr = HalvingRandomSearchCV(lr, params, cv=k_fold).fit(X_train,y_train)
print('best training score: ',best_lr.best_score_)
print('validation score:', best_lr.score(X_valid, y_valid))

best training score:  0.9371220530375567
validation score: 0.9282602381643915


### GaussianNB

In [26]:
nb = GaussianNB()
params = {
    'var_smoothing': np.logspace(0,-9, num=100)
}
best_nb = HalvingRandomSearchCV(nb, params, cv=k_fold).fit(X_train,y_train)
print('best training score: ',best_nb.best_score_)
print('validation score:', best_nb.score(X_valid, y_valid))

best training score:  0.8006858710562414
validation score: 0.7905896020911996


### KNeighbors

In [27]:
kn = KNeighborsClassifier()
params = {
    'n_neighbors': np.arange(1,11,1),
    'weights': ['uniform','distance'],
    'algorithm': ['ball_tree','kd_tree','brute'],
    'leaf_size': np.arange(10,201,1),
    'p': [1,2],
    'metric': ['cityblock','cosine','euclidean','haversine','l1','l2','manhattan']
}
best_kn = HalvingRandomSearchCV(kn, params, cv=k_fold).fit(X_train,y_train)
print('best training score: ',best_kn.best_score_)
print('validation score:', best_kn.score(X_valid, y_valid))

best training score:  0.929690057207703
validation score: 0.9294220156839965


### RandomForest

In [31]:
rf = RandomForestClassifier()
params = {
    "max_depth": np.arange(2,1001,1),
    "max_features": np.arange(1,len(cols)+1,1),
    "min_samples_split": np.arange(2,101,1),
    "bootstrap": [True, False],
    "criterion": ["gini", "entropy"]
}
best_rf = HalvingRandomSearchCV(rf, params, cv=k_fold).fit(X_train,y_train)
print('best training score: ',best_rf.best_score_)
print('validation score:', best_rf.score(X_valid, y_valid))

best training score:  0.9492391868214843
validation score: 0.9433633459192564


### SVC

In [None]:
svc = SVC()
params = {
    'C': np.linspace(),
    'kernel': ['linear','poly','rbf','sigmoid','precomputed'],
    'degree': np.linspace(),
    'gamma': ['scale','auto'],
    'coef0': np.arange(),
    'shrinking': [True,False],
    'tol': np.arange(),
    'class_weight': [None, 'balanced'],
    'decision_function_shape': ['ovo', 'ovr']
}
best_svc = HalvingRandomSearchCV(svc, params, cv=k_fold).fit(X_train,y_train)
print('best training score: ',best_svc.best_score_)
print('validation score:', best_svc.score(X_valid, y_valid))

### XGBoost

In [None]:
xgb = xgboost()
params = {
}
best_xgb = HalvingRandomSearchCV(xgb, params, cv=k_fold).fit(X_train,y_train)
print('best training score: ',best_xgb.best_score_)
print('validation score:', best_xgb.score(X_valid, y_valid))

# TODO:
- randomsearch strength is selecting from distributions. Might want to swap to scipy.stats instead of np.linspace or np.arange
- class balancing