In [41]:
import pandas as pd
import numpy as np
from typing import Mapping, Sequence
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

import xgboost as xgb

In [13]:
train = pd.read_csv('data/processed/cases_train.csv', parse_dates=['date_confirmation'])
train['date_confirmation'] = train['date_confirmation'].apply(lambda r: r.value)

# Label Encoding

In [17]:
le = LabelEncoder()
for col in train.columns:
    if train[col].dtype == 'object' and col != 'outcome_group':
        train[col] = le.fit_transform(train[col])

train['outcome_group'] = pd.Categorical(train['outcome_group']).codes
X = train.iloc[:, 0:-1]
y = train.iloc[:, -1]

# Feature Selection

In [18]:
X_train_fr, X_valid_fr, y_train_fr, y_valid_fr = train_test_split(X, y, stratify=y, train_size=0.8)

In [20]:
classifiers = [(LogisticRegression(),'Logistic Regression'),
                   (GaussianNB(),'GaussianNB'),
                   (KNeighborsClassifier(),'KNN'),
                   (DecisionTreeClassifier(),'Decision Tree'),
                   (SVC(),'Support Vector')]
    
best_score = 0
best_classifier = None
best_m:str = None
for (c,m) in classifiers:
    c.fit(X_train_fr,y_train_fr)
    valid_score = c.score(X_valid_fr,y_valid_fr)
    if valid_score > best_score:
        best_score = valid_score
        best_classifier = c
        best_m = m

print('proceeding with classifier {0} with valid score {1}'.format(best_m,round(best_score,6)))

cols: Sequence[str] = X_train_fr.columns.to_list()
desired_n_features = round(len(cols)*0.75)

while (len(cols) != desired_n_features):
    print('working with {0}/{1} features'.format(len(cols),X_train_fr.shape[1]))
    least_reduction = 1
    for f in cols:
        X_train_fr_no_f = X_train_fr[cols]
        X_valid_fr_no_f = X_valid_fr[cols]
        best_classifier.fit(X_train_fr_no_f,y_train_fr)
        valid_score_no_f = best_classifier.score(X_valid_fr_no_f, y_valid_fr)
        reduction = best_score-valid_score_no_f
        if reduction < least_reduction:
            print('reduction \\f {0}: {1}'.format(f,round(reduction,6)))
            least_reduction = reduction
            least_reduction_feature = f
    print('removing feature', least_reduction_feature)
    cols.pop(cols.index(least_reduction_feature))

proceeding with classifier KNN with valid score 0.932617
working with 14/14 features
reduction \f age: 0.0
removing feature age
working with 13/14 features
reduction \f sex: 0.000871
removing feature sex
working with 12/14 features
reduction \f province: 0.003485
removing feature province
working with 11/14 features
reduction \f country: 0.003485
removing feature country


# Building Models and Hyperparameter Tuning

## K-Fold Cross-Validation

In [44]:
classifiers = [(LogisticRegression(),'Logistic Regression'),
               (GaussianNB(),'GaussianNB'),
               (KNeighborsClassifier(),'KNN'),
               (DecisionTreeClassifier(),'Decision Tree'),
               (SVC(),'Support Vector'),
               (xgb.XGBClassifier(),'XGBoost')]

In [47]:
k = 6
skf = StratifiedKFold(n_splits=k)
for i, (train_index, valid_index) in enumerate(skf.split(X, y)):
    x_train_fold, x_valid_fold = X.iloc[train_index], X.iloc[valid_index]
    y_train_fold, y_valid_fold = y[train_index], y[valid_index]
    
    # lst_accu_stratified.append(lr.score(x_test_fold, y_test_fold))