In [1]:
# import ipython
import matplotlib.pyplot as plt
import nbgrader
import notebook
import numpy as np
import pandas as pd
import sklearn
import seaborn as sns
import time
import pyarrow

from sklearn.svm import SVC

from sklearn.model_selection import train_test_split
from sklearn.ensemble import (
    RandomForestClassifier,
    GradientBoostingClassifier
)
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import GridSearchCV

from sklearn.metrics import accuracy_score, precision_score, \
                            recall_score, f1_score, \
                            roc_auc_score, roc_curve, \
                            confusion_matrix
 
# import category-encoders

In [2]:
# This cell is just for the purpose of testing this code, we should load a clean dataset for both train and test

def clean(df, max_rows=1000):
    """Build a small, model-ready sample from a raw dataframe.

    Steps, in order: use ``id`` as the index, sort by ``unix_timestamp``
    ascending, drop every row containing a missing value, keep the first
    ``max_rows`` rows, and encode the result with ``pd.get_dummies``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data. Must contain the columns ``id`` and ``unix_timestamp``.
    max_rows : int or None, default 1000
        How many of the earliest complete rows to keep. ``None`` keeps all
        of them. The default reproduces the previous hard-coded cap.

    Returns
    -------
    pandas.DataFrame
        New dataframe; ``df`` itself is not modified.

    Notes
    -----
    ``pd.get_dummies`` derives its output columns from the values present in
    the frame it is given, so calling this separately on train and test data
    can yield different column sets.
    """
    return pd.get_dummies(
        df.set_index('id')
        .sort_values('unix_timestamp')
        .dropna()
        # .iloc makes the truncation explicitly positional ("first N rows"),
        # independent of the dtype of the ``id`` index.
        .iloc[:max_rows]
    )

#train_df = clean(pd.read_csv('data/train.csv'))
#test_df = clean(pd.read_csv('data/test.csv'))

In [5]:
# Load the pre-cleaned train and test sets from parquet files in the working directory.
train_df, test_df = map(
    pd.read_parquet,
    ('cleaned_train_set.parquet', 'cleaned_test_set.parquet'),
)

# Model Training

This assumes we have a variable `train_df` with the training data (and `test_df` with the test data).
Assumptions:
* Dataframe is sorted by `timestamp` ascending
* The `id` of the trade is the `index`
* We don't have anything other than numerical and categorical columns in the dataset

## Retrieving values for model training

In [14]:
# Turn the dataframes into plain arrays for scikit-learn.
# Every training column except the target "success" is a feature.
feature_columns = train_df.columns.drop("success")
X = train_df[feature_columns].values
y = train_df["success"].values

# Select the test features by name, in the training column order. Taking
# `test_df.values` directly would silently feed misaligned features to the
# models if the test frame had extra or reordered columns; a missing column
# now raises a KeyError here instead.
X_test = test_df[feature_columns].values

In [15]:
# 75/25 train/validation split - the ratio can be tuned later.
# shuffle=False keeps the row order, so the validation set is the last 25% of rows.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.25,
    shuffle=False,
)

## Model Training

### Functions for the Model Training

In [16]:
def perform_grid_search_and_return_best_model(grid_search_input, X=None, y=None):
    """Fit a search object and return its best estimator.

    Parameters
    ----------
    grid_search_input : object
        Unfitted search object exposing ``fit(X, y)`` and, once fitted,
        ``best_estimator_`` (e.g. ``GridSearchCV``). It is fitted in place.
    X, y : array-like, optional
        Training features and targets. When omitted, the notebook-level
        ``X_train`` / ``y_train`` are used, which is the original behavior.

    Returns
    -------
    The ``best_estimator_`` of the fitted search object.
    """
    # Fall back to the notebook globals only when the caller passes nothing,
    # so existing one-argument calls keep working.
    if X is None:
        X = X_train
    if y is None:
        y = y_train

    grid_search_input.fit(X, y)
    best_model = grid_search_input.best_estimator_

    print("Best Parameters:")
    # Shows every parameter of the winning estimator, not only the tuned ones.
    display(best_model.get_params())
    return best_model

In [17]:
# Plotting the AUC Score
def plot_roc_curve(roc_auc, fpr, tpr):
    """Draw a ROC curve next to the chance diagonal and show the figure.

    Parameters
    ----------
    roc_auc : float
        Area under the ROC curve; only used in the legend label.
    fpr : array-like
        False positive rates (as returned by ``roc_curve()``).
    tpr : array-like
        True positive rates (as returned by ``roc_curve()``).
    """
    line_width = 2
    fig, ax = plt.subplots(figsize=(8, 6))

    # The model's curve, then the diagonal of a random classifier for reference.
    ax.plot(fpr, tpr, color='orange', lw=line_width,
            label='ROC curve (AUROC = %0.2f)' % roc_auc)
    ax.plot([0, 1], [0, 1], color='navy', lw=line_width, linestyle='--',
            label='random')

    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.05])  # small headroom above TPR = 1
    ax.grid()
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title('Receiver operating characteristic example')
    ax.legend(loc="lower right")
    plt.show()

In [18]:
# Compute the ROC AUC of a given model on the validation split and plot its ROC curve
def auc_against_validation_set(best_model):
    """Score a fitted model on the validation split and plot its ROC curve.

    Reads the notebook-level ``X_val`` / ``y_val``.

    Parameters
    ----------
    best_model : estimator
        Fitted classifier exposing ``predict_proba``.
    """
    # Column 1 of predict_proba - presumably the probability of the positive
    # ("success") class; confirm against best_model.classes_.
    positive_scores = best_model.predict_proba(X_val)[:, 1]

    false_pos_rate, true_pos_rate, _ = roc_curve(y_true=y_val, y_score=positive_scores)
    plot_roc_curve(
        roc_auc=roc_auc_score(y_true=y_val, y_score=positive_scores),
        fpr=false_pos_rate,
        tpr=true_pos_rate,
    )


In [11]:
# Get the results for the Test Set and write to the filesystem

def test_set_results(model_input, model_name):
    """Predict on the test set and write the probabilities to a CSV file.

    Reads the notebook-level ``X_test`` and ``test_df``. The file is written
    to ``predictions_to_submit/<model_name>_<timestamp>.csv`` with a single
    ``success`` column, indexed like ``test_df``.

    Parameters
    ----------
    model_input : estimator
        Fitted classifier exposing ``predict_proba``.
    model_name : str
        Prefix of the output file name.
    """
    import os  # local import keeps this cell self-contained

    # Create the output folder on first use; to_csv does not create
    # missing parent directories.
    output_dir = "predictions_to_submit"
    os.makedirs(output_dir, exist_ok=True)

    # No ':' in the timestamp (illegal in Windows file names); seconds are
    # included so two runs within the same minute do not overwrite each other.
    timestamp = time.strftime("%Y-%m-%d_%H-%M-%S")
    file_to_save = os.path.join(output_dir, "{0}_{1}.csv".format(model_name, timestamp))

    y_test_probas = model_input.predict_proba(X_test)[:, 1]
    results = pd.DataFrame(data=y_test_probas, index=test_df.index, columns=["success"])
    results.to_csv(file_to_save)


### Random Forest Classifier

In [21]:
# Hyperparameter grid for the random forest: 3 depths x 4 forest sizes = 12 candidates.
hyper_parameters_grid_rf = {
    'max_depth': [2, 6, 13],
    'n_estimators': [100, 200, 300, 500]
}

grid_search_rf = GridSearchCV(
    # Fixed seed: a random forest is stochastic, so without it the selected
    # parameters and scores change from run to run.
    RandomForestClassifier(random_state=42),
    hyper_parameters_grid_rf,
    cv=3,
    scoring="roc_auc",  # select on the same metric we report on the validation split
    n_jobs=-1,  # 12 candidates x 3 folds = 36 fits; run them on all cores
)

best_rf_model = perform_grid_search_and_return_best_model(grid_search_rf)

KeyboardInterrupt: 

In [None]:
# Validation ROC/AUC for the tuned random forest, then export its test-set predictions.
auc_against_validation_set(best_model=best_rf_model)

test_set_results(model_input=best_rf_model, model_name="random_forest")

### Gradient Boosting

In [None]:
# Hyperparameter grid for gradient boosting. With a single candidate, the
# search amounts to a 3-fold cross-validation followed by a refit.
hyper_parameters_grid_gb = {
    'max_depth': [1],
    'n_estimators': [500]
}

grid_search_gb = GridSearchCV(
    # Fixed seed so that repeated runs produce the same model.
    GradientBoostingClassifier(random_state=42),
    hyper_parameters_grid_gb,
    cv=3,
    scoring="roc_auc",  # select on the same metric we report on the validation split
)

best_gb_model = perform_grid_search_and_return_best_model(grid_search_gb)

In [None]:
# Validation ROC/AUC for the gradient boosting model, then export its test-set predictions.
auc_against_validation_set(best_model=best_gb_model)

test_set_results(model_input=best_gb_model, model_name="gradient_boosting")

### Logistic Regression

In [None]:
# Hyperparameter grid for logistic regression: 2 regularisation strengths x 2 penalties.
hyper_parameters_grid_lr = {
    "C": [.1, 1],
    "penalty": ["l1", "l2"]
}

grid_search_lr = GridSearchCV(
    # solver="liblinear" supports both the l1 and l2 penalties. The default
    # solver (lbfgs) rejects penalty="l1", so half of the grid would fail to
    # fit. random_state pins liblinear's data shuffling.
    LogisticRegression(solver="liblinear", random_state=42),
    hyper_parameters_grid_lr,
    cv=3,
    scoring="roc_auc",  # select on the same metric we report on the validation split
)

best_lr_model = perform_grid_search_and_return_best_model(grid_search_lr)

In [None]:
# Validation ROC/AUC for the logistic regression, then export its test-set predictions.
auc_against_validation_set(best_model=best_lr_model)

test_set_results(model_input=best_lr_model, model_name="logistical_regression")

### K-Nearest Neighbors

In [None]:
# Hyperparameter grid for k-nearest neighbours (a single candidate for now).
hyper_parameters_grid_knn = dict(
    n_neighbors=[5],
    weights=['uniform'],
)

grid_search_knn = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=hyper_parameters_grid_knn,
    scoring="roc_auc",  # select on the same metric we report on the validation split
    cv=3,
)

best_knn_model = perform_grid_search_and_return_best_model(grid_search_knn)

In [None]:
# Validation ROC/AUC for the k-nearest-neighbours model, then export its test-set predictions.
auc_against_validation_set(best_model=best_knn_model)

test_set_results(model_input=best_knn_model, model_name="knn")

### SVM

In [None]:
# Hyperparameter grid for the support vector machine (a single candidate for now).
hyper_parameters_grid_svm = {
    "kernel": ['rbf'],
    "C": [1]
}

grid_search_svm = GridSearchCV(
    # probability=True is required for predict_proba, which both
    # auc_against_validation_set and test_set_results call; a plain SVC()
    # does not provide it. The probability calibration is randomised, so the
    # seed is fixed as well.
    SVC(probability=True, random_state=42),
    hyper_parameters_grid_svm,
    cv=3,
    scoring="roc_auc",  # select on the same metric we report on the validation split
)

best_svm_model = perform_grid_search_and_return_best_model(grid_search_svm)

In [None]:
# Validation ROC/AUC for the SVM (this previously evaluated best_knn_model
# by mistake), then export its test-set predictions.
auc_against_validation_set(best_svm_model)

test_set_results(best_svm_model, "svm")