In [1]:
from sklearnex import patch_sklearn
patch_sklearn()

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [2]:
import os
import time
import glob
import json

import numpy as np
import pandas as pd
from matplotlib.figure import Figure
import matplotlib.pyplot as plt

from xgboost import XGBClassifier
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import HistGradientBoostingClassifier, RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    matthews_corrcoef,
    roc_auc_score,
    ConfusionMatrixDisplay,
    RocCurveDisplay,
    PrecisionRecallDisplay,
)

In [3]:
def create_model(model_name: str):
    with open(f"../scripts/models/best_params_{model_name}.json", "r") as f:
        params = json.load(f)
        if model_name == "knn":
            return KNeighborsClassifier(**params)
        elif model_name == "logistic_regression":
            return LogisticRegression(**params)
        elif model_name == "decision_tree":
            return DecisionTreeClassifier(**params)
        elif model_name == "random_forest":
            return RandomForestClassifier(**params)
        elif model_name == "gradient_boosting":
            return HistGradientBoostingClassifier(**params)
        elif model_name == "xgboost":
            return XGBClassifier(**params)
        elif model_name == "svm":
            return LinearSVC(**params)
        
        raise ValueError("Unknown model!")

## Cross Val Training

In [4]:
def cross_val(model_name: str, X, y):
    cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    scores = {
        "model_name": model_name,
        "accuracy": [],
        "precision": [],
        "recall": [],
        "f1": [],
        "mcc": [],
        "roc_auc": [],
        "train_time": [],
        "test_time": [],
    }

    for fold_i, (train_idx, test_idx) in enumerate(cv.split(X, y)):
        print(f">>> Training fold {fold_i + 1}")

        # split data
        X_train, X_test = (
            X.iloc[train_idx],
            X.iloc[test_idx],
        )
        y_train, y_test = (
            y.iloc[train_idx],
            y.iloc[test_idx],
        )

        # fit model
        clf = create_model(model_name)

        start_train = time.time()
        clf.fit(X_train, y_train)
        elapsed_train = time.time() - start_train

        # run prediction
        start_test = time.time()
        y_pred = clf.predict(X_test)
        elapsed_test = time.time() - start_test

        # log metrics
        scores["accuracy"].append(accuracy_score(y_test, y_pred))
        scores["precision"].append(precision_score(y_test, y_pred))
        scores["recall"].append(recall_score(y_test, y_pred))
        scores["f1"].append(f1_score(y_test, y_pred))
        scores["mcc"].append(matthews_corrcoef(y_test, y_pred))
        scores["roc_auc"].append(roc_auc_score(y_test, y_pred))
        scores["train_time"].append(elapsed_train)
        scores["test_time"].append(elapsed_test)

        # plot confusion matrix
        fig = Figure()
        ax = fig.subplots()
        ConfusionMatrixDisplay.from_predictions(y_test, y_pred, ax=ax)
        fig.savefig(f"./plots/{model_name}-cm-{fold_i + 1}.png")

        # plot ROC
        fig = Figure()
        ax = fig.subplots()
        RocCurveDisplay.from_predictions(y_test, y_pred, ax=ax)
        fig.savefig(f"./plots/{model_name}-roc-{fold_i + 1}.png")

        # plot precision-recall
        fig = Figure()
        ax = fig.subplots()
        PrecisionRecallDisplay.from_predictions(y_test, y_pred, ax=ax)
        fig.savefig(f"./plots/{model_name}-prc-{fold_i + 1}.png")

    with open("metrics-cv-best.jsonl", "a+") as f:
        json.dump(scores, f)
        f.write("\n")

In [5]:
df_train = pd.read_parquet("../dataset/paper2-train.parquet")
df_train.head()

Unnamed: 0,zone_id,ts,sob_mean,sob_sum,o2_mean,o2_sum,uo_mean,uo_sum,chl_mean,chl_sum,...,nppv_mean,nppv_sum,pbo_mean,pbo_sum,tob_mean,tob_sum,si_mean,si_sum,target,continent
20397,1122,2022-06-01,35.379967,566.079468,235.254257,470.508514,0.256492,4.10387,0.358892,0.717784,...,8.032279,16.064558,428.261902,6852.19043,14.342039,229.472626,2.163302,4.326605,0,australia
636005,1088,2024-07-22,34.913521,384.048737,209.65889,209.65889,-0.012077,-0.132846,0.128557,0.128557,...,4.192322,4.192322,13.917703,153.094727,24.964533,274.609863,3.268391,3.268391,1,australia
222201,1944,2022-10-05,34.711334,416.536011,264.833496,529.666992,-0.007808,-0.093692,0.309889,0.619778,...,5.436712,10.873424,5090.45459,61085.457031,0.60819,7.298274,1.739208,3.478416,0,australia
440583,1535,2023-09-07,34.629536,415.554413,207.639297,207.639297,0.020311,0.243727,0.179565,0.179565,...,6.522992,6.522992,28.203934,338.447205,27.189264,326.271179,3.155294,3.155294,0,australia
1476632,1658,2023-10-06,34.817574,557.081177,206.917999,413.835999,-0.751882,-12.030111,0.087428,0.174856,...,2.692275,5.384549,2469.494385,39511.910156,2.7023,43.236801,3.234651,6.469302,0,africa


In [6]:
X_train = df_train.drop(columns=["zone_id", "ts", "target", "country", "continent"], errors="ignore")
y_train = df_train["target"]

In [7]:
for params_file in glob.glob("../scripts/models/*.json"):
    model_name = os.path.splitext(os.path.basename(params_file))[0].replace("best_params_", "")
    print("Training:", model_name)
    
    cross_val(model_name, X_train, y_train)

Training: decision_tree
>>> Training fold 1
>>> Training fold 2
>>> Training fold 3
>>> Training fold 4
>>> Training fold 5
>>> Training fold 6
>>> Training fold 7
>>> Training fold 8
>>> Training fold 9
>>> Training fold 10
Training: gradient_boosting
>>> Training fold 1
>>> Training fold 2
>>> Training fold 3
>>> Training fold 4
>>> Training fold 5
>>> Training fold 6
>>> Training fold 7
>>> Training fold 8
>>> Training fold 9
>>> Training fold 10
Training: logistic_regression
>>> Training fold 1


  alpha_star, phi_star, old_fval, derphi_star = scalar_search_wolfe2(
  ret = line_search_wolfe2(


>>> Training fold 2


  alpha_star, phi_star, old_fval, derphi_star = scalar_search_wolfe2(
  ret = line_search_wolfe2(


>>> Training fold 3


  alpha_star, phi_star, old_fval, derphi_star = scalar_search_wolfe2(
  ret = line_search_wolfe2(


>>> Training fold 4


  alpha_star, phi_star, old_fval, derphi_star = scalar_search_wolfe2(
  ret = line_search_wolfe2(


>>> Training fold 5




>>> Training fold 6


  alpha_star, phi_star, old_fval, derphi_star = scalar_search_wolfe2(
  ret = line_search_wolfe2(


>>> Training fold 7


  alpha_star, phi_star, old_fval, derphi_star = scalar_search_wolfe2(
  ret = line_search_wolfe2(


>>> Training fold 8


  alpha_star, phi_star, old_fval, derphi_star = scalar_search_wolfe2(
  ret = line_search_wolfe2(


>>> Training fold 9


  alpha_star, phi_star, old_fval, derphi_star = scalar_search_wolfe2(
  ret = line_search_wolfe2(


>>> Training fold 10


  alpha_star, phi_star, old_fval, derphi_star = scalar_search_wolfe2(
  ret = line_search_wolfe2(


Training: random_forest
>>> Training fold 1
>>> Training fold 2
>>> Training fold 3
>>> Training fold 4
>>> Training fold 5
>>> Training fold 6
>>> Training fold 7
>>> Training fold 8
>>> Training fold 9
>>> Training fold 10
Training: knn
>>> Training fold 1
>>> Training fold 2
>>> Training fold 3
>>> Training fold 4
>>> Training fold 5
>>> Training fold 6
>>> Training fold 7
>>> Training fold 8
>>> Training fold 9
>>> Training fold 10
Training: xgboost
>>> Training fold 1


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




>>> Training fold 2
>>> Training fold 3
>>> Training fold 4
>>> Training fold 5
>>> Training fold 6
>>> Training fold 7
>>> Training fold 8
>>> Training fold 9
>>> Training fold 10


## Testing

In [8]:
df_test = pd.read_parquet("../dataset/paper2-test.parquet")
df_test.head()

Unnamed: 0,zone_id,ts,sob_mean,sob_sum,o2_mean,o2_sum,uo_mean,uo_sum,chl_mean,chl_sum,...,nppv_mean,nppv_sum,pbo_mean,pbo_sum,tob_mean,tob_sum,si_mean,si_sum,target,continent
184492,1170,2022-09-14,35.402901,424.834808,247.178192,494.356384,0.13366,1.603925,0.281907,0.563815,...,10.481349,20.962698,53.40979,640.91748,15.253552,183.042633,2.106932,4.213865,0,australia
95711,545,2022-07-23,35.318733,565.099731,228.894806,457.789612,0.120387,1.926192,0.353271,0.706542,...,9.804804,19.609608,42.582443,681.319092,19.636709,314.187347,2.842084,5.684168,1,australia
1669081,1144,2024-07-11,34.814644,557.034302,204.732193,204.732193,0.087027,1.392434,0.343167,0.343167,...,15.556914,15.556914,1719.839355,27517.429688,3.775412,60.406586,3.728522,3.728522,0,africa
1548872,1538,2023-11-07,34.746643,555.946289,201.42244,201.42244,0.309198,4.947171,0.112134,0.112134,...,3.571119,3.571119,3328.337891,53253.40625,1.379301,22.06881,3.141167,3.141167,0,africa
152732,1160,2022-08-26,35.164505,562.63208,206.091309,412.182617,-0.015524,-0.248387,0.399768,0.799536,...,17.786886,35.573772,102.341019,1637.456299,26.623756,425.980103,2.716478,5.432956,0,australia


In [9]:
X_test = df_test.drop(columns=["zone_id", "ts", "target", "country", "continent"], errors="ignore")
y_test = df_test["target"]

In [10]:
for params_file in glob.glob("../scripts/models/*.json"):
    model_name = os.path.splitext(os.path.basename(params_file))[0].replace("best_params_", "")
    print("Training:", model_name)
    
    # fit model
    clf = create_model(model_name)

    start_train = time.time()
    clf.fit(X_train, y_train)
    elapsed_train = time.time() - start_train

    # run prediction
    start_test = time.time()
    y_pred = clf.predict(X_test)
    elapsed_test = time.time() - start_test

    # store metrics
    scores = {
        "accuracy": accuracy_score(y_test, y_pred),
        "precision": precision_score(y_test, y_pred),
        "recall": recall_score(y_test, y_pred),
        "f1": f1_score(y_test, y_pred),
        "mcc": matthews_corrcoef(y_test, y_pred),
        "roc_auc": roc_auc_score(y_test, y_pred),
        "train_time": elapsed_train,
        "test_time": elapsed_test,
    }

    with open("metrics-test-best.jsonl", "a+") as f:
        json.dump(scores, f)
        f.write("\n")

Training: decision_tree
Training: gradient_boosting
Training: logistic_regression


  alpha_star, phi_star, old_fval, derphi_star = scalar_search_wolfe2(
  ret = line_search_wolfe2(


Training: random_forest
Training: knn
Training: xgboost


# EDA

In [11]:
folds = []
with open("./metrics-cv-best.jsonl", "r") as f:
    for line in f:
        row = json.loads(line.strip())
        for i in range(1, len(row["accuracy"])):
            folds.append({
                "fold": i + 1,
                "model_name": row["model_name"],
                "accuracy": row["accuracy"][i],
                "precision": row["precision"][i],
                "recall": row["recall"][i],
                "f1": row["f1"][i],
                "mcc": row["mcc"][i],
                "roc_auc": row["roc_auc"][i],
                "train_time": row["train_time"][i],
                "test_time": row["test_time"][i],
            })

eda_folds = pd.DataFrame(folds)
eda_folds.head()

Unnamed: 0,fold,model_name,accuracy,precision,recall,f1,mcc,roc_auc,train_time,test_time
0,2,decision_tree,0.972192,0.75899,0.207966,0.326477,0.388377,0.602877,32.659856,0.008805
1,3,decision_tree,0.974026,0.912021,0.219543,0.353896,0.440458,0.609417,31.362199,0.008768
2,4,decision_tree,0.973999,0.918295,0.216817,0.350806,0.439296,0.608085,34.005343,0.008639
3,5,decision_tree,0.974508,0.919209,0.233802,0.372785,0.456557,0.616557,33.143779,0.008833
4,6,decision_tree,0.974169,0.92301,0.22122,0.356901,0.444981,0.610301,33.116637,0.010048


In [12]:
eda_folds.to_excel("metricssss.xlsx")