In [None]:
# Mount Google Drive at /content/drive so that the Drive-backed paths used by
# this notebook (DATA_LOC / REPO_LOC) are reachable from the Colab runtime.
from google.colab import drive

drive.mount("/content/drive")

In [None]:
# Dataset directory on the mounted Drive.
# NOTE(review): not referenced by any cell visible in this notebook — confirm
# whether it is still needed.
DATA_LOC = "/content/drive/MyDrive/Rhexis/datasets/test_pulls"
# Root of the project repository on the mounted Drive; its
# Trajectory_Classification folder is put on sys.path by the import cell.
REPO_LOC = "/content/drive/MyDrive/Trajectories/rhexis-trajectory"

In [None]:
# Import libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os
import sys

sys.path.insert(0, f"{REPO_LOC}/Trajectory_Classification")
from utils import *

%load_ext autoreload
%autoreload 2

## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic-regression baseline, seeded and allowed up to 1000 solver iterations.
clf_lr = LogisticRegression(max_iter=1000, random_state=0)

# Search space: PCA dimensionality 3..19, swept for every bin count 5..24.
param_grid = {"pca__n_components": [*range(3, 20)]}
bin_grid = [*range(5, 25)]

# NOTE(review): this cell wraps the estimator with make_custom_pipeline(...)
# before searching, whereas the QDA cell hands its bare estimator to
# grid_search_with_bins — confirm which form the helper expects.
results_lr = grid_search_with_bins(
    make_custom_pipeline(clf_lr, True),
    param_grid,
    bin_grid,
)

In [None]:
# Best score recorded by the bin-wise search for logistic regression
# (presumably the top cross-validation score over all bin counts — confirm in utils).
results_lr["best_score"]

In [None]:
# Parameters behind the best logistic-regression score; the ROC cell reads the
# "data__n_bins" entry from this mapping.
results_lr["best_params"]

In [None]:
# Score of the winning search object on the X_test / y_test pair stored in the
# results dict.
results_lr["best_search"].score(results_lr["X_test"], results_lr["y_test"])

## Quadratic Gaussian Discriminant Analysis

In [None]:
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

clf_qda = QuadraticDiscriminantAnalysis()

# Search space: PCA dimensionality 3..19 crossed with the QDA reg_param values,
# swept for every bin count 5..24.
param_grid = {
    "pca__n_components": [*range(3, 20)],
    "quadraticdiscriminantanalysis__reg_param": [
        1e-6, 1e-3, 1e-2, 1e-1, 2e-1, 3e-1, 4e-1, 5e-1,
    ],
}
bin_grid = [*range(5, 25)]

# NOTE(review): the bare estimator is passed here, whereas the logistic
# regression cell passes make_custom_pipeline(clf_lr, True) — confirm which
# form grid_search_with_bins expects.
results_qda = grid_search_with_bins(clf_qda, param_grid, bin_grid)

In [None]:
# Best score recorded by the bin-wise search for QDA
# (presumably the top cross-validation score over all bin counts — confirm in utils).
results_qda["best_score"]

In [None]:
# Parameters behind the best QDA score; the ROC cell reads the "data__n_bins"
# entry from this mapping.
results_qda["best_params"]

In [None]:
# Score of the winning search object on the X_test / y_test pair stored in the
# results dict.
results_qda["best_search"].score(results_qda["X_test"], results_qda["y_test"])

## Multi-Layer Perceptron

Based on the experiments above, 21 bins is the optimal number of bins, so we fix the bin count at 21 and no longer grid search over that parameter.

In [None]:
from sklearn.neural_network import MLPClassifier

# Data for the fixed bin count of 21. The HistGradientBoosting, Random Forest
# and ROC cells further down reuse these four split variables.
X_train, X_test, y_train, y_test = get_data_for_fixed_bins(21)

# Three hidden layers of 1000 units each, trained with Adam; seeded.
clf_mlp = MLPClassifier(
    hidden_layer_sizes=(1000, 1000, 1000),
    solver="adam",
    alpha=1e-5,
    random_state=1,
)

# Search space: PCA dimensionality 10..14 crossed with the initial learning rate.
param_grid = {
    "pca__n_components": [*range(10, 15)],
    "mlpclassifier__learning_rate_init": [1e-9, 1e-6, 1e-3, 1e-2, 1e-1],
}
search = grid_search(clf_mlp, param_grid, X_train, y_train)

In [None]:
# Best score found by the MLP search (presumably the mean cross-validation
# score of the best parameter combination — confirm what grid_search returns).
search.best_score_

In [None]:
# Parameter combination that achieved the best MLP search score.
search.best_params_

In [None]:
# Score of the MLP search object on the 21-bin X_test / y_test split.
search.score(X_test, y_test)

## Generate ROC Curves for Logistic Regression, Quadratic GDA, and Multilayer Perceptron

In [None]:
from sklearn.metrics import RocCurveDisplay

# Overlay the ROC curves of the three tuned models on a single axes and save
# the figure. Title, labels and the saved file go through the display's own
# Axes/Figure handles instead of pyplot's implicit "current figure", so they
# always refer to this plot even if another figure becomes current.

# Logistic Regression — this first call creates the figure.
# NOTE: `fig` is a RocCurveDisplay (exposing .ax_ and .figure_), not a
# matplotlib Figure; the name is kept for compatibility.
lr_best_n_bins = results_lr["best_params"]["data__n_bins"]
best_clf_lr = results_lr["results"][lr_best_n_bins].best_estimator_
fig = RocCurveDisplay.from_estimator(
    best_clf_lr,
    results_lr["X_test"],
    results_lr["y_test"],
    name="Logistic Regression",
)
roc_ax = fig.ax_

# Quadratic GDA — drawn onto the same axes.
qda_best_n_bins = results_qda["best_params"]["data__n_bins"]
best_clf_qda = results_qda["results"][qda_best_n_bins].best_estimator_
RocCurveDisplay.from_estimator(
    best_clf_qda,
    results_qda["X_test"],
    results_qda["y_test"],
    name="Quadratic GDA",
    ax=roc_ax,
)

# Multilayer Perceptron — evaluated on the 21-bin split from the MLP cell.
best_clf_mlp = search.best_estimator_
RocCurveDisplay.from_estimator(
    best_clf_mlp, X_test, y_test, name="Multilayer Perceptron", ax=roc_ax
)

roc_ax.set_title("ROC Curves for LR, Quadratic GDA, MLP")
roc_ax.set_xlabel("False Positive Rate")
roc_ax.set_ylabel("True Positive Rate")
fig.figure_.savefig("ROC_Curves.pdf", dpi=120)

## Histogram-based Gradient Boosting (LightGBM-inspired)

In [None]:
from sklearn.ensemble import HistGradientBoostingClassifier

# Baseline with default hyper-parameters, trained and scored on the 21-bin
# split created in the MLP cell (X_train / X_test / y_train / y_test).
# random_state is pinned, as for the logistic-regression and MLP estimators,
# so that re-running the notebook reproduces the same score.
clf_hgbc = HistGradientBoostingClassifier(random_state=0)
clf_hgbc.fit(X_train, y_train)
clf_hgbc.score(X_test, y_test)

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Baseline with default hyper-parameters, trained and scored on the 21-bin
# split created in the MLP cell (X_train / X_test / y_train / y_test).
# Random forests bootstrap samples and subsample features at random, so
# random_state is pinned to make the reported score reproducible across runs.
clf_rf = RandomForestClassifier(random_state=0)
clf_rf.fit(X_train, y_train)
clf_rf.score(X_test, y_test)
