In [None]:
import json
import pickle
from pathlib import Path

import numpy as np
import torch
from sklearn.dummy import DummyClassifier
from sklearn.metrics import balanced_accuracy_score
from sklearn.model_selection import train_test_split
from tqdm.autonotebook import tqdm

from nlp_assemblee.datasets import build_dataset_and_dataloader_from_config
from nlp_assemblee.models import build_classifier_from_config
from nlp_assemblee.simple_visualisation import (
    calculate_metrics,
    plot_precision_recall_curve,
    plot_roc_curve,
)

In [None]:
# Default to the CPU and switch to the GPU only when torch reports CUDA support.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"
device

In [None]:
def load_records(path, phase="train"):
    """Unpickle and return the contents of ``precomputed_<phase>.pkl``.

    Args:
        path: Directory (str or ``Path``) that contains the pickle files.
        phase: Suffix used in the file name; defaults to ``"train"``.

    Returns:
        Whatever object was stored in the pickle file.

    Note:
        ``pickle.load`` can execute arbitrary code, so only point this at
        files produced by a trusted pipeline.
    """
    pickle_file = Path(path) / f"precomputed_{phase}.pkl"
    with pickle_file.open("rb") as handle:
        return pickle.load(handle)

In [None]:
# Only the "label" entry of each precomputed records object is used here.
precomputed_dir = "../../data/precomputed/paraphrase-multilingual-MiniLM-L12-v2"
y_train = load_records(precomputed_dir, phase="train")["label"]
y_test = load_records(precomputed_dir, phase="test")["label"]

In [None]:
# Share of training samples per class id 0, 1 and 2.
# NOTE(review): the element-wise comparison assumes y_train is a NumPy array — confirm.
n = len(y_train)
class_counts = [np.sum(y_train == i) for i in range(3)]
probas = [count / n for count in class_counts]

In [None]:
# Display the per-class training frequencies computed in the previous cell.
probas

## Uniform

In [None]:
# Baseline: draw a class uniformly at random for every sample.
strategy = "uniform"
# "uniform" is a stochastic strategy: pin the seed so the metrics and figures
# written to disk are reproducible across Restart & Run All.
dummy = DummyClassifier(strategy=strategy, random_state=42)
# DummyClassifier ignores the feature values (see the scikit-learn docs), so the
# labels are passed as a placeholder X.
dummy.fit(y_train, y_train)

In [None]:
# Hard predictions and class probabilities for the test labels. y_test is passed
# as X only as a placeholder: DummyClassifier does not look at feature values
# (see the scikit-learn docs).
# This strategy is random; keep the call order unchanged so the RNG draws stay the same.
y_pred = dummy.predict(y_test)
probs = dummy.predict_proba(y_test)

In [None]:
# Bundle ground truth, predictions and probabilities under the keys that the
# metric and plotting helpers are called with below.
results = dict(labels=y_test, predictions=y_pred, probs=probs)

In [None]:
# Compute the evaluation metrics with the project helper from
# nlp_assemblee.simple_visualisation.
# NOTE(review): the result is dumped with json later, so it is presumably a
# JSON-serializable mapping — confirm.
metrics = calculate_metrics(results)

In [None]:
# One output folder per strategy; create it (and any parents) if missing,
# then persist the metrics as JSON.
RESULTS_PATH = f"../../results/dummy_classifier_{strategy}"
results_dir = Path(RESULTS_PATH)
results_dir.mkdir(parents=True, exist_ok=True)
with (results_dir / "logs.json").open("w") as log_file:
    json.dump(metrics, log_file)

In [None]:
# Draw the ROC curve for this baseline and store it next to the metrics.
roc_fig = plot_roc_curve(results, figsize=(6, 6), palette="deep")
roc_curve_file = Path(RESULTS_PATH) / "roc_curve.png"
roc_fig.savefig(roc_curve_file)

In [None]:
# Draw the precision-recall curve for this baseline and store it next to the metrics.
pr_fig = plot_precision_recall_curve(results, figsize=(6, 6), palette="deep")
pr_curve_file = Path(RESULTS_PATH) / "precision_recall_curve.png"
pr_fig.savefig(pr_curve_file)

## Most frequent

In [None]:
# Baseline: always predict the most frequent class of the training labels.
strategy = "most_frequent"
# fit() returns the estimator itself, so construction and fitting can be chained.
# The labels double as a placeholder X because DummyClassifier ignores features.
dummy = DummyClassifier(strategy=strategy).fit(y_train, y_train)
dummy

In [None]:
# Hard predictions and class probabilities for the test labels. y_test is passed
# as X only as a placeholder: DummyClassifier does not look at feature values
# (see the scikit-learn docs).
y_pred = dummy.predict(y_test)
probs = dummy.predict_proba(y_test)

In [None]:
# Bundle ground truth, predictions and probabilities under the keys that the
# metric and plotting helpers are called with below.
results = dict(labels=y_test, predictions=y_pred, probs=probs)

In [None]:
# Compute the evaluation metrics with the project helper from
# nlp_assemblee.simple_visualisation.
# NOTE(review): the result is dumped with json later, so it is presumably a
# JSON-serializable mapping — confirm.
metrics = calculate_metrics(results)

In [None]:
# One output folder per strategy; create it (and any parents) if missing,
# then persist the metrics as JSON.
RESULTS_PATH = f"../../results/dummy_classifier_{strategy}"
results_dir = Path(RESULTS_PATH)
results_dir.mkdir(parents=True, exist_ok=True)
with (results_dir / "logs.json").open("w") as log_file:
    json.dump(metrics, log_file)

In [None]:
# Draw the ROC curve for this baseline and store it next to the metrics.
roc_fig = plot_roc_curve(results, figsize=(6, 6), palette="deep")
roc_curve_file = Path(RESULTS_PATH) / "roc_curve.png"
roc_fig.savefig(roc_curve_file)

In [None]:
# Draw the precision-recall curve for this baseline and store it next to the metrics.
pr_fig = plot_precision_recall_curve(results, figsize=(6, 6), palette="deep")
pr_curve_file = Path(RESULTS_PATH) / "precision_recall_curve.png"
pr_fig.savefig(pr_curve_file)

## Stratified

In [None]:
# Baseline: sample predictions at random following the training class distribution.
strategy = "stratified"
# "stratified" is a stochastic strategy: pin the seed so the metrics and figures
# written to disk are reproducible across Restart & Run All.
dummy = DummyClassifier(strategy=strategy, random_state=42)
# DummyClassifier ignores the feature values (see the scikit-learn docs), so the
# labels are passed as a placeholder X.
dummy.fit(y_train, y_train)

In [None]:
# Hard predictions and class probabilities for the test labels. y_test is passed
# as X only as a placeholder: DummyClassifier does not look at feature values
# (see the scikit-learn docs).
# This strategy is random; keep the call order unchanged so the RNG draws stay the same.
y_pred = dummy.predict(y_test)
probs = dummy.predict_proba(y_test)

In [None]:
# Bundle ground truth, predictions and probabilities under the keys that the
# metric and plotting helpers are called with below.
results = dict(labels=y_test, predictions=y_pred, probs=probs)

In [None]:
# Compute the evaluation metrics with the project helper from
# nlp_assemblee.simple_visualisation.
# NOTE(review): the result is dumped with json later, so it is presumably a
# JSON-serializable mapping — confirm.
metrics = calculate_metrics(results)

In [None]:
# One output folder per strategy; create it (and any parents) if missing,
# then persist the metrics as JSON.
RESULTS_PATH = f"../../results/dummy_classifier_{strategy}"
results_dir = Path(RESULTS_PATH)
results_dir.mkdir(parents=True, exist_ok=True)
with (results_dir / "logs.json").open("w") as log_file:
    json.dump(metrics, log_file)

In [None]:
# Draw the ROC curve for this baseline and store it next to the metrics.
roc_fig = plot_roc_curve(results, figsize=(6, 6), palette="deep")
roc_curve_file = Path(RESULTS_PATH) / "roc_curve.png"
roc_fig.savefig(roc_curve_file)

In [None]:
# Draw the precision-recall curve for this baseline and store it next to the metrics.
pr_fig = plot_precision_recall_curve(results, figsize=(6, 6), palette="deep")
pr_curve_file = Path(RESULTS_PATH) / "precision_recall_curve.png"
pr_fig.savefig(pr_curve_file)

## Prior

In [None]:
# Baseline: predict the most frequent class, with probabilities given by the
# empirical class distribution of the training labels (scikit-learn "prior").
strategy = "prior"
# fit() returns the estimator itself, so construction and fitting can be chained.
# The labels double as a placeholder X because DummyClassifier ignores features.
dummy = DummyClassifier(strategy=strategy).fit(y_train, y_train)
dummy

In [None]:
# Hard predictions and class probabilities for the test labels. y_test is passed
# as X only as a placeholder: DummyClassifier does not look at feature values
# (see the scikit-learn docs).
y_pred = dummy.predict(y_test)
probs = dummy.predict_proba(y_test)

In [None]:
# Bundle ground truth, predictions and probabilities under the keys that the
# metric and plotting helpers are called with below.
results = dict(labels=y_test, predictions=y_pred, probs=probs)

In [None]:
# Compute the evaluation metrics with the project helper from
# nlp_assemblee.simple_visualisation.
# NOTE(review): the result is dumped with json later, so it is presumably a
# JSON-serializable mapping — confirm.
metrics = calculate_metrics(results)

In [None]:
# One output folder per strategy; create it (and any parents) if missing,
# then persist the metrics as JSON.
RESULTS_PATH = f"../../results/dummy_classifier_{strategy}"
results_dir = Path(RESULTS_PATH)
results_dir.mkdir(parents=True, exist_ok=True)
with (results_dir / "logs.json").open("w") as log_file:
    json.dump(metrics, log_file)

In [None]:
# Draw the ROC curve for this baseline and store it next to the metrics.
roc_fig = plot_roc_curve(results, figsize=(6, 6), palette="deep")
roc_curve_file = Path(RESULTS_PATH) / "roc_curve.png"
roc_fig.savefig(roc_curve_file)

In [None]:
# Draw the precision-recall curve for this baseline and store it next to the metrics.
pr_fig = plot_precision_recall_curve(results, figsize=(6, 6), palette="deep")
pr_curve_file = Path(RESULTS_PATH) / "precision_recall_curve.png"
pr_fig.savefig(pr_curve_file)