In [None]:
import numpy as np
import pandas as pd
import scikitplot as skplt
import matplotlib.pyplot as plt

from xgboost import XGBClassifier

from sklearn.metrics import classification_report
from utils.model_inference_plots import *

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
# Read the modelling table from its parquet file into a DataFrame.
df = pd.read_parquet(path='data/final_model_data_all_scaled.parquet')

In [3]:
# Feature columns, grouped by kind: the Bx/By/Bz columns themselves, their
# lag-1 and lag-2 columns, the conditional-vol columns and the rolling-stdev
# columns. The order here fixes the column order of X.
feature_columns = [
    'Bx', 'By', 'Bz',
    'Bx_lag_1', 'Bx_lag_2',
    'By_lag_1', 'By_lag_2',
    'Bz_lag_1', 'Bz_lag_2',
    'Bx_conditional_vol', 'By_conditional_vol', 'Bz_conditional_vol',
    'Bx_rolling_stdev', 'By_rolling_stdev', 'Bz_rolling_stdev',
]
X = df[feature_columns].values

# Target vector taken from the 'Event_label_80' column.
y = df['Event_label_80'].values

In [4]:
def make_metrics_plots(y_test, y_pred, y_pred_proba):
    """Draw a 2x2 grid of classification diagnostics and display it.

    Panels: confusion matrix (top-left), ROC curves (top-right),
    precision-recall curves (bottom-left) and cumulative gain (bottom-right).

    Parameters
    ----------
    y_test : array-like
        True labels.
    y_pred : array-like
        Predicted labels; used only for the confusion matrix.
    y_pred_proba : array-like
        Predicted probabilities. Either a 2-D array with one column per
        class, or a single column / 1-D vector (for example
        ``predict_proba(X)[:, 1]``). A single column or vector ``p`` is
        expanded to the two-column form ``[1 - p, p]``.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    y_pred_proba = np.asarray(y_pred_proba)

    # A flat vector has no second axis, so ``shape[1]`` below would raise
    # IndexError; promote it to a single column first.
    if y_pred_proba.ndim == 1:
        y_pred_proba = y_pred_proba.reshape(-1, 1)

    # The probability plots are given one column per class, so expand a lone
    # column p into [1 - p, p].
    if y_pred_proba.shape[1] == 1:
        y_pred_proba = np.hstack([1 - y_pred_proba, y_pred_proba])

    fig, axes = plt.subplots(2, 2, figsize=(15, 12))

    skplt.metrics.plot_confusion_matrix(y_test, y_pred, ax=axes[0, 0])
    skplt.metrics.plot_roc(y_test, y_pred_proba, ax=axes[0, 1])
    skplt.metrics.plot_precision_recall(y_test, y_pred_proba, ax=axes[1, 0])
    skplt.metrics.plot_cumulative_gain(y_test, y_pred_proba, ax=axes[1, 1])

    plt.show()

In [None]:
# Ordered hold-out: the first 80% of rows are used for training and the
# remaining rows for testing. Rows are sliced in their existing order, with
# no shuffling.
split = int(0.8 * len(df))

X_train, y_train = X[:split], y[:split]
X_test, y_test = X[split:], y[split:]

In [None]:
# Gradient-boosted tree classifier with depth-5 trees and a fixed seed.
clf = XGBClassifier(
    random_state=42,
    max_depth=5,
)

# Fit on the training slice, then predict labels for the held-out slice.
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

In [6]:
# classification_report returns a plain-text table, so it is printed rather
# than left as the cell's last expression.
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.82      0.89      0.85   7474465
           1       0.76      0.64      0.69   4009036

    accuracy                           0.80  11483501
   macro avg       0.79      0.76      0.77  11483501
weighted avg       0.80      0.80      0.80  11483501

