In [None]:
import numpy as np
import pandas as pd
import xgboost
from sklearn.preprocessing import LabelBinarizer, LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
import pickle as pkl
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.metrics import balanced_accuracy_score, roc_auc_score, make_scorer, classification_report, ConfusionMatrixDisplay, confusion_matrix, plot_confusion_matrix
import itertools
from datetime import datetime
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_hastie_10_2
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score, recall_score, f1_score
import warnings
warnings.filterwarnings('ignore')

## Preprocessing

In [None]:
def preprocessing(raw_data, drop_missing=True):
    """Turn a raw cases frame into the numeric feature table used for modelling.

    Steps, in order:
      1. Binarise ``sex`` with ``LabelBinarizer`` into one ``male`` column
         (appended as the last column) and drop ``sex``.
      2. Drop ``province``, ``country``, ``Province_State``,
         ``Country_Region``, ``Lat`` and ``Long_``.
      3. Replace ``date_confirmation`` by its date ordinal (an integer).
      4. Drop ``additional_information``, ``source`` and ``Last_Update``.
      5. Optionally drop every row that still has a missing value.

    Parameters
    ----------
    raw_data : pandas.DataFrame
        Must contain all columns named above. It is not modified; every step
        works on a new frame.
    drop_missing : bool, default True
        When True, rows containing any missing value are removed at the end.
        Pass False to keep one output row per input row.

    Returns
    -------
    pandas.DataFrame
        The transformed frame, carrying the index labels of ``raw_data``.

    Raises
    ------
    KeyError
        If one of the expected columns is absent.
    ValueError
        If ``sex`` has more than two distinct values (the binariser would then
        produce more than the single ``male`` column).
    """
    # The binariser is fit on the frame passed in, so which value maps to 1
    # depends on the sorted distinct values of 'sex'.
    # NOTE(review): presumably 'female' -> 0 and 'male' -> 1; confirm against the data.
    sex_flags = LabelBinarizer().fit_transform(raw_data['sex'])

    # Reuse the input's index: pd.concat(axis=1) aligns on index labels, so a
    # fresh 0..n-1 index would misalign rows (and introduce NaNs) whenever
    # raw_data does not carry the default RangeIndex.
    male = pd.DataFrame(sex_flags, columns=['male'], index=raw_data.index)
    data = pd.concat([raw_data, male], axis=1).drop(['sex'], axis=1)

    # Location columns are discarded rather than encoded.
    data = data.drop(columns=['province', 'country', 'Province_State', 'Country_Region', 'Lat', 'Long_'])

    # Dates become integers so the estimators can consume them.
    data['date_confirmation'] = pd.to_datetime(data.date_confirmation, infer_datetime_format=True).apply(lambda x: x.toordinal())

    # Free-text / bookkeeping columns are not used as features.
    data = data.drop(columns=['additional_information', 'source', 'Last_Update'])

    if drop_missing:
        data = data.dropna()

    return data

In [None]:
# Read the processed training cases and convert them straight into model-ready features.
train_data = preprocessing(pd.read_csv("../data/cases_train_processed.csv"))

In [None]:
# Separate the label column from the feature matrix.
y = train_data['outcome']
X = train_data.drop(columns=['outcome'])

## Tuning Hyperparameters

In [None]:
def param_scores(model):
    """Tabulate the mean cross-validation scores of every candidate in a fitted search.

    Parameters
    ----------
    model : object
        Anything exposing a ``cv_results_`` mapping with a ``'params'`` list of
        dicts and the ``mean_test_<metric>`` entries for the four metrics below.

    Returns
    -------
    pandas.DataFrame
        One row per candidate. ``hyperparameters`` holds the candidate's
        parameter values as a list (names are not kept); the remaining columns
        are ``F1_Deceased``, ``Recall_Deceased``, ``Overall_Accuracy`` and
        ``Overall_Recall``.
    """
    cv = model.cv_results_

    # Values only, in the order each params dict stores them.
    value_lists = pd.Series([list(candidate.values()) for candidate in cv['params']])
    pieces = [pd.DataFrame(value_lists, columns=['hyperparameters'])]

    # One single-column frame per metric, in the output column order.
    for metric in ('F1_Deceased', 'Recall_Deceased', 'Overall_Accuracy', 'Overall_Recall'):
        pieces.append(pd.DataFrame(cv['mean_test_' + metric], columns=[metric]))

    return pd.concat(pieces, axis=1)

### KNN

In [None]:
def tune_knn(X, y):
    """Grid-search a ``KNeighborsClassifier`` with 5-fold CV and return the fitted search.

    The grid covers ``leaf_size``, ``n_neighbors`` and ``p``. Four scorers are
    recorded for every candidate; the model is refit using ``F1_Deceased``.
    The winning values of the three tuned parameters are printed.

    Parameters
    ----------
    X : feature table passed to ``GridSearchCV.fit``.
    y : labels passed to ``GridSearchCV.fit``; the two class-specific scorers
        look for the label ``'deceased'``.

    Returns
    -------
    The fitted ``GridSearchCV`` object.
    """
    param_grid = {
        'leaf_size': [5, 10, 15],
        'n_neighbors': [5, 10, 15],
        'p': [1, 2],
    }

    # Restricting `labels` to one class makes the weighted average equal to
    # that single class's score.
    scorers = {
        'Overall_Accuracy': make_scorer(accuracy_score),
        'Overall_Recall': make_scorer(recall_score, average='weighted'),
        'F1_Deceased': make_scorer(f1_score, labels=['deceased'], average='weighted'),
        'Recall_Deceased': make_scorer(recall_score, labels=['deceased'], average='weighted'),
    }

    search = GridSearchCV(
        estimator=KNeighborsClassifier(),
        param_grid=param_grid,
        scoring=scorers,
        cv=5,
        refit='F1_Deceased',
        n_jobs=3,
    )
    fitted = search.fit(X, y)

    chosen = fitted.best_estimator_.get_params()
    print('Best leaf_size:', chosen['leaf_size'])
    print('Best n_neighbors:', chosen['n_neighbors'])
    print('Best p:', chosen['p'])
    return fitted

In [None]:
%%time
# Run the KNN grid search on the training features/labels; the cell magic above reports how long it took.
best_knn = tune_knn(X,y)

In [None]:
# Tabulate the mean CV scores of every KNN candidate.
knn_scores = param_scores(best_knn)
# Persist the table (without the row index) alongside the other results.
knn_scores.to_csv('../results/knn_tuning.csv', index=False)
# Trailing expression: display the table as the cell output.
knn_scores

### XGBoost Model

In [None]:
def tune_XGBoost(X, y):
    """Grid-search an ``XGBClassifier`` with 5-fold CV and return the fitted search.

    The grid covers ``learning_rate``, ``max_depth`` and ``min_child_weight``.
    Four scorers are recorded for every candidate; the model is refit using
    ``F1_Deceased``. The winning values of the tuned parameters are printed.

    Parameters
    ----------
    X : feature table passed to ``GridSearchCV.fit``.
    y : labels passed to ``GridSearchCV.fit``; the two class-specific scorers
        look for the label ``'deceased'``.

    Returns
    -------
    The fitted ``GridSearchCV`` object.
    """
    # Restricting `labels` to one class makes the weighted average equal to
    # that single class's score.
    scorers = {
        'Overall_Accuracy': make_scorer(accuracy_score),
        'Overall_Recall': make_scorer(recall_score, average='weighted'),
        'F1_Deceased': make_scorer(f1_score, labels=['deceased'], average='weighted'),
        'Recall_Deceased': make_scorer(recall_score, labels=['deceased'], average='weighted'),
    }

    param_grid = {
        "learning_rate": [0.10, 0.20, 0.30],
        "max_depth": [8, 12, 15],
        "min_child_weight": [50, 100, 150],
    }

    search = GridSearchCV(
        estimator=xgboost.XGBClassifier(use_label_encoder=True),
        scoring=scorers,
        param_grid=param_grid,
        refit='F1_Deceased',
        cv=5,
        n_jobs=3,
    )
    fitted = search.fit(X, y)

    chosen = fitted.best_estimator_.get_params()
    print('Best learning_rate:', chosen['learning_rate'])
    print('Best max_depth:', chosen['max_depth'])
    print('Best min_child_weight:', chosen['min_child_weight'])

    return fitted

In [None]:
%%time
# Run the XGBoost grid search on the training features/labels; the cell magic above reports how long it took.
best_xgboost = tune_XGBoost(X,y)

In [None]:
# Tabulate the mean CV scores of every XGBoost candidate.
xgboost_scores = param_scores(best_xgboost)
# Persist the table (without the row index) alongside the other results.
xgboost_scores.to_csv('../results/xgboost_tuning.csv', index=False)
# Trailing expression: display the table as the cell output.
xgboost_scores

### Random Forest Model

In [None]:
def tune_rf(X, y, random_state=42):
    """Grid-search a ``RandomForestClassifier`` with 5-fold CV and return the fitted search.

    The grid covers ``n_estimators``, ``max_depth`` and ``min_samples_leaf``.
    Four scorers are recorded for every candidate; the model is refit using
    ``F1_Deceased``. The winning values of the tuned parameters are printed.

    Parameters
    ----------
    X : feature table passed to ``GridSearchCV.fit``.
    y : labels passed to ``GridSearchCV.fit``; the two class-specific scorers
        look for the label ``'deceased'``.
    random_state : int or None, default 42
        Seed handed to the forest. An unseeded forest gives a different
        tuning table (and possibly a different winner) on every run; pass
        ``None`` to get that unseeded behaviour.

    Returns
    -------
    The fitted ``GridSearchCV`` object.
    """
    # Restricting `labels` to one class makes the weighted average equal to
    # that single class's score.
    scoring = {
        'Overall_Accuracy': make_scorer(accuracy_score),
        'Overall_Recall': make_scorer(recall_score, average='weighted'),
        'F1_Deceased': make_scorer(f1_score, labels=['deceased'], average='weighted'),
        'Recall_Deceased': make_scorer(recall_score, labels=['deceased'], average='weighted')
    }
    rf_parameters = {
        'n_estimators': [250, 500, 1000],
        'max_depth': [10, 20, 30],
        'min_samples_leaf': [10, 25, 50]
    }
    rf_grid_search = GridSearchCV(
        estimator=RandomForestClassifier(random_state=random_state),
        scoring=scoring,
        param_grid=rf_parameters,
        refit='F1_Deceased',
        cv=5,
        n_jobs=3,
    )
    best_model = rf_grid_search.fit(X, y)

    best_params = best_model.best_estimator_.get_params()
    print('Best n_estimators:', best_params['n_estimators'])
    print('Best max_depth:', best_params['max_depth'])
    print('Best min_samples_leaf:', best_params['min_samples_leaf'])

    return best_model

In [None]:
%%time
# Run the random-forest grid search on the training features/labels; the cell magic above reports how long it took.
best_rf = tune_rf(X,y)

In [None]:
# Tabulate the mean CV scores of every random-forest candidate.
rf_scores = param_scores(best_rf)
# Persist the table (without the row index) alongside the other results.
rf_scores.to_csv('../results/randomforest_tuning.csv', index=False)
# Trailing expression: display the table as the cell output.
rf_scores

## Prediction on test dataset

In [None]:
def preprocessing(raw_data, drop_missing=False):
    """Turn a raw cases frame into the numeric feature table, keeping every row by default.

    This cell reuses the name of the training-time ``preprocessing`` helper
    defined earlier in the notebook and replaces it once executed. The steps
    are the same; the only difference is that rows with missing values are
    kept unless ``drop_missing`` is set, so that each input row can still
    receive a prediction.

    Steps, in order:
      1. Binarise ``sex`` with ``LabelBinarizer`` into one ``male`` column
         (appended as the last column) and drop ``sex``.
      2. Drop ``province``, ``country``, ``Province_State``,
         ``Country_Region``, ``Lat`` and ``Long_``.
      3. Replace ``date_confirmation`` by its date ordinal (an integer).
      4. Drop ``additional_information``, ``source`` and ``Last_Update``.
      5. Only if ``drop_missing`` is True, drop rows with any missing value.

    Parameters
    ----------
    raw_data : pandas.DataFrame
        Must contain all columns named above. It is not modified; every step
        works on a new frame.
    drop_missing : bool, default False
        When True, rows containing any missing value are removed at the end.

    Returns
    -------
    pandas.DataFrame
        The transformed frame, carrying the index labels of ``raw_data``.

    Raises
    ------
    KeyError
        If one of the expected columns is absent.
    ValueError
        If ``sex`` has more than two distinct values (the binariser would then
        produce more than the single ``male`` column).
    """
    # The binariser is fit on the frame passed in, so which value maps to 1
    # depends on the sorted distinct values of 'sex' in *this* frame.
    # NOTE(review): presumably 'female' -> 0 and 'male' -> 1, matching the
    # training frame; confirm both frames contain the same two values.
    sex_flags = LabelBinarizer().fit_transform(raw_data['sex'])

    # Reuse the input's index: pd.concat(axis=1) aligns on index labels, so a
    # fresh 0..n-1 index would misalign rows (and introduce NaNs) whenever
    # raw_data does not carry the default RangeIndex.
    male = pd.DataFrame(sex_flags, columns=['male'], index=raw_data.index)
    data = pd.concat([raw_data, male], axis=1).drop(['sex'], axis=1)

    # Location columns are discarded rather than encoded.
    data = data.drop(columns=['province', 'country', 'Province_State', 'Country_Region', 'Lat', 'Long_'])

    # Dates become integers so the estimators can consume them.
    data['date_confirmation'] = pd.to_datetime(data.date_confirmation, infer_datetime_format=True).apply(lambda x: x.toordinal())

    # Free-text / bookkeeping columns are not used as features.
    data = data.drop(columns=['additional_information', 'source', 'Last_Update'])

    if drop_missing:
        data = data.dropna()

    return data

In [None]:
# Read the processed test cases, discard their 'outcome' column, then build
# the feature table with the preprocessing helper.
test_data = preprocessing(
    pd.read_csv("../data/cases_test_processed.csv").drop(columns=['outcome'])
)

In [None]:
# Predict test outcomes with the estimator refit by the XGBoost grid search
# (the KNN and random-forest searches are not used for the final predictions).
predictions = best_xgboost.best_estimator_.predict(test_data)
# Trailing expression: show how many predictions were produced.
len(predictions)

In [None]:
# Write one predicted label per line with no newline after the last one:
# the validity check further down splits the file on '\n' and counts the
# pieces, so a trailing newline would add an empty extra item.
with open('../results/predictions.txt', 'w') as f:
    f.write('\n'.join(predictions))

### Check predictions file

In [None]:
def check_if_file_valid(filename, expected_rows=46500, expected_classes=4):
    """Sanity-check the predictions file: path, item count, number of distinct labels.

    The file is split on ``'\\n'``, so a trailing newline counts as one extra
    (empty) item.

    Parameters
    ----------
    filename : str
        Path to the file; must end with ``'../results/predictions.txt'``.
    expected_rows : int, default 46500
        Required number of newline-separated items.
    expected_classes : int, default 4
        Required number of distinct items.

    Returns
    -------
    str
        ``'The predictions file is valid'`` when every check passes.

    Raises
    ------
    AssertionError
        With message ``'Incorrect filename'``, ``'Incorrect number of items'``
        or ``'Wrong class labels'`` for the first check that fails.
    """
    assert filename.endswith('../results/predictions.txt'), 'Incorrect filename'
    # The context manager closes the handle as soon as the content is read.
    with open(filename) as handle:
        labels = handle.read().split('\n')
    assert len(labels) == expected_rows, 'Incorrect number of items'
    assert len(set(labels)) == expected_classes, 'Wrong class labels'
    return 'The predictions file is valid'
# Validate the predictions file on disk; the returned message is the cell output.
check_if_file_valid('../results/predictions.txt')