In [None]:
from __future__ import annotations
import os

# When on Aura, it is important to first set CUDA_VISIBLE_DEVICES environment variable directly from notebook
# For available GPUs, see https://www.fi.muni.cz/tech/unix/aura.html.cs
# Must be done before any related package that leverages cuda is imported
os.environ["CUDA_VISIBLE_DEVICES"] = "MIG-56c53afb-6f08-5e5b-83fa-32fc6f09eeb0"
os.environ["TOKENIZERS_PARALLELISM"] = "FALSE"

import pandas as pd
from sec_certs.dataset import CCDataset
from shutil import copy
from pathlib import Path
from sec_certs.model.references.segment_extractor import ReferenceSegmentExtractor
from sec_certs.utils.nlp import prec_recall_metric
from sklearn.dummy import DummyClassifier
from sec_certs.utils.nlp import prec_recall_metric
from sec_certs.model.references.annotator_trainer import ReferenceAnnotatorTrainer
from sklearn.metrics import ConfusionMatrixDisplay
from sec_certs.utils.helpers import compute_heuristics_version
from rapidfuzz import fuzz
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, f1_score
import torch
import optuna
from matplotlib import pyplot as plt


# Absolute path of the current working directory.
# NOTE(review): the name suggests the notebook is expected to be started from the repository root - confirm.
REPO_ROOT = Path(".").resolve()
# Serialized dataset that is loaded with CCDataset.from_json() further below.
DATASET_PATH = REPO_ROOT / "dataset/cc_final_run_may_23/dataset.json"
# Directory from which the train.csv and valid.csv annotation files are read further below.
ANNOTATIONS_PATH = REPO_ROOT / "src/sec_certs/data/reference_annotations/manual_annotations/"

def replace_all(text: str, to_replce: set[str]) -> str:
    """Remove every occurrence of each string in ``to_replce`` from ``text``.

    Longer strings are removed first. Otherwise the outcome would depend on the
    (arbitrary) iteration order of the set whenever one entry is a substring of
    another: removing ``"1.0"`` before ``"1.0.1"`` would leave a stray ``".1"``.
    Ties in length are broken alphabetically so the result is fully deterministic.

    :param text: string to clean
    :param to_replce: substrings to delete from ``text``
    :return: ``text`` with all given substrings removed
    """
    for fragment in sorted(to_replce, key=lambda item: (-len(item), item)):
        text = text.replace(fragment, "")
    return text

# Report whether PyTorch detects a usable CUDA device.
print(f"GPU available: {torch.cuda.is_available()}")

# Load data

Enrich the annotations with the string similarity between the name of each certificate and the name of the certificate it references.

In [None]:
# Load the manually annotated references and merge the train and validation splits.
train_annotations = pd.read_csv(ANNOTATIONS_PATH / "train.csv")
valid_annotations = pd.read_csv(ANNOTATIONS_PATH / "valid.csv")
all_annotations = pd.concat([train_annotations, valid_annotations])
# Drop rows whose label is the literal string "None" and normalize the remaining labels to upper case.
all_annotations = all_annotations[all_annotations.label != "None"].assign(label=lambda df: df.label.str.upper())

dset = CCDataset.from_json(DATASET_PATH)
# Keep a digest -> certificate mapping of the full dataset; it is needed to resolve referenced certificates.
all_certs = {x.dgst: x for x in dset.certs.values()}
# Compute the annotated digests once; a set gives constant-time membership tests instead of
# re-running `.unique()` and scanning the resulting array for every certificate.
annotated_dgsts = set(all_annotations.dgst.unique())
# Restrict the dataset to the annotated certificates only.
dset.certs = {x.dgst: x for x in dset.certs.values() if x.dgst in annotated_dgsts}

# Names of the referencing and the referenced certificates.
cert_id_to_name_mapping = {x.heuristics.cert_id: x.name for x in all_certs.values()}
all_annotations["referenced_cert_name"] = all_annotations["referenced_cert_id"].map(cert_id_to_name_mapping)
all_annotations["cert_name"] = all_annotations["dgst"].map(lambda x: dset[x].name)
all_annotations["cert_versions"] = all_annotations["cert_name"].map(compute_heuristics_version)
# References whose certificate id could not be mapped to a name are discarded.
all_annotations = all_annotations.loc[all_annotations["referenced_cert_name"].notnull()].copy()
all_annotations["referenced_cert_versions"] = all_annotations["referenced_cert_name"].map(compute_heuristics_version)

# Names with the detected version strings removed.
all_annotations["cert_name_stripped_version"] = all_annotations.apply(lambda x: replace_all(x["cert_name"], x["cert_versions"]), axis=1)
all_annotations["referenced_cert_name_stripped_version"] = all_annotations.apply(lambda x: replace_all(x["referenced_cert_name"], x["referenced_cert_versions"]), axis=1)

# Similarity features: fuzzy token-set ratio on the full and on the version-stripped names,
# plus the absolute difference in length of the version-stripped names.
all_annotations["name_similarity"] = all_annotations.apply(lambda x: fuzz.token_set_ratio(x["cert_name"], x["referenced_cert_name"]), axis=1)
all_annotations["name_similarity_stripped_version"] = all_annotations.apply(lambda x: fuzz.token_set_ratio(x["cert_name_stripped_version"], x["referenced_cert_name_stripped_version"]), axis=1)
all_annotations["name_len_diff"] = all_annotations.apply(lambda x: abs(len(x["cert_name_stripped_version"]) - len(x["referenced_cert_name_stripped_version"])), axis=1)

## Retrieve segments

In [None]:
# Annotation columns that are joined onto the extracted segments.
annotation_columns = [
    "dgst",
    "referenced_cert_id",
    "name_similarity_stripped_version",
    "name_len_diff",
    "cert_name",
    "referenced_cert_name",
]

# Extract the reference segments for the annotated certificates and keep only labeled rows.
segment_extractor = ReferenceSegmentExtractor()
df = segment_extractor(dset.certs.values())
df = df.loc[df.label.notnull()].copy()

# Attach the name-based features computed on the annotations (inner join on the reference key).
df = df.merge(
    all_annotations.loc[:, annotation_columns],
    on=["dgst", "referenced_cert_id"],
)

# Simplified binary labels
# label_mapping = {"COMPONENT_USED": "COMPONENT_SHARED", "RECERTIFICATION": "PREVIOUS_VERSION"}
# df.label = df.label.map(lambda x: label_mapping[x] if x in label_mapping else x)

## Segment post-processing

In [None]:
def process_segment(segment: str, referenced_cert_id: str) -> str:
    """Return ``segment`` with each occurrence of ``referenced_cert_id`` replaced by a fixed placeholder phrase."""
    placeholder = "the referenced product"
    return segment.replace(referenced_cert_id, placeholder)

def _mask_row_segments(row):
    """Apply process_segment to every segment of a single row, using that row's referenced certificate id."""
    return [process_segment(text, row.referenced_cert_id) for text in row.segments]


# Replace the referenced certificate id in all segments, row by row.
df.segments = df.apply(_mask_row_segments, axis=1)

## Train & evaluate the baseline classifier (majority class)

In [None]:
# Boolean masks of the two splits, computed once and reused below.
is_train = df.split == "train"
is_valid = df.split == "valid"

# Baseline: DummyClassifier with default settings, fitted on the train split and scored on the valid split.
dummy_clf = DummyClassifier()
dummy_clf.fit(df.loc[is_train, ["segments"]], df.loc[is_train].label)
y_pred_dummy = dummy_clf.predict(df.loc[is_valid, ["segments"]])

baseline_report = classification_report(df.loc[is_valid].label, y_pred_dummy, zero_division=0)
print(baseline_report)

## Train & evaluate the transformer

In [None]:
# Train the reference annotator with a fixed set of hyper-parameters and evaluate it.
trainer = ReferenceAnnotatorTrainer.from_df(
    df,
    prec_recall_metric,
    mode="training",
    use_analytical_rule_name_similarity=True,
    n_iterations=20,
    n_epochs=1,
    batch_size=16,
    segmenter_metric="f1",
    ensemble_soft_voting_power=2,
)
trainer.train()
trainer.evaluate()

annotator = trainer.clf
df_predicted = annotator.predict_df(df)

valid_predictions = df_predicted.loc[df_predicted.split == "valid"]
label_names = list(trainer.label_mapping.values())

# scikit-learn metrics take (y_true, y_pred) in this order. Passing them the other way round
# swaps precision and recall for every class in the report.
print(classification_report(valid_predictions["label"], valid_predictions["y_pred"], zero_division=0))

# Print confusion matrix
ConfusionMatrixDisplay.from_predictions(
    valid_predictions["label"],
    valid_predictions["y_pred"],
    labels=label_names,
    display_labels=label_names,
    xticks_rotation=90,
)

# Serialize errors into file
# Each probability vector is turned into a {label name: probability} dict first.
df_predicted.y_proba = df_predicted.y_proba.map(lambda x: dict(zip(label_names, x)))
# NOTE(review): absolute, user-specific output path - consider deriving it from REPO_ROOT.
df_predicted.loc[~df_predicted.correct].to_json("/var/tmp/xjanovsk/certs/sec-certs/dataset/annotator_errors.json", orient="records", indent=4)

## Hyperparameter optimization

In [None]:
def define_trainer(trial, df):
    """Build a ReferenceAnnotatorTrainer whose hyper-parameters are sampled from an Optuna trial.

    :param trial: Optuna trial used to suggest the hyper-parameter values
    :param df: dataframe handed over to ReferenceAnnotatorTrainer.from_df
    :return: trainer configured with the suggested hyper-parameters
    """
    # The dict literal is evaluated top to bottom, so parameters are suggested in this exact order.
    hyperparameters = {
        "use_analytical_rule_name_similarity": trial.suggest_categorical("use_analytical_rule_name_similarity", [True, False]),
        "n_iterations": trial.suggest_int("n_iterations", 1, 50),
        "n_epochs": trial.suggest_int("n_epochs", 1, 5),
        "batch_size": trial.suggest_int("batch_size", 8, 32),
        "segmenter_metric": trial.suggest_categorical("segmenter_metric", ["accuracy", "f1"]),
        "ensemble_soft_voting_power": trial.suggest_int("ensemble_soft_voting_power", 1, 5),
    }
    return ReferenceAnnotatorTrainer.from_df(df, prec_recall_metric, mode="training", **hyperparameters)

def objective(trial):
    """Optuna objective: train an annotator for the trial and return its weighted F1 on the valid split.

    :param trial: Optuna trial from which the hyper-parameters are sampled
    :return: weighted F1 score of the predictions on rows with ``split == "valid"``
    """
    trainer = define_trainer(trial, df)
    trainer.train()

    df_predicted = trainer.clf.predict_df(df)
    valid_predictions = df_predicted.loc[df_predicted.split == "valid"]
    # f1_score expects (y_true, y_pred). With average="weighted" the per-class scores are weighted
    # by the support of y_true, so swapping the arguments would weight by predicted counts instead.
    return f1_score(valid_predictions["label"], valid_predictions["y_pred"], zero_division="warn", average="weighted")

In [None]:
# Run the hyper-parameter search; the study maximizes the value returned by `objective`.
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=3)

# Report the parameters and the objective value of the best trial.
best_trial = study.best_trial
for caption, reported in (("Best Trial:", best_trial.params), ("Best Trial Value:", best_trial.value)):
    print(caption, reported)

In [None]:
# NOTE(review): absolute, user-specific output location - consider deriving it from REPO_ROOT.
output_dir = Path("/var/tmp/xjanovsk/certs/sec-certs/dataset/cc_refs_hyperparam_search")
# savefig() does not create missing directories, so make sure the target exists first.
output_dir.mkdir(parents=True, exist_ok=True)

# Output file name -> Optuna plotting function; each plot is drawn and saved in this order.
study_plots = {
    "optimization_history.pdf": optuna.visualization.matplotlib.plot_optimization_history,
    "param_importances.pdf": optuna.visualization.matplotlib.plot_param_importances,
    "timeline.pdf": optuna.visualization.matplotlib.plot_timeline,
}

for filename, plot_fn in study_plots.items():
    ax = plot_fn(study)
    ax.figure.savefig(output_dir / filename, bbox_inches="tight")