In [None]:
from __future__ import annotations
import os

# On Aura, the GPU has to be selected by setting CUDA_VISIBLE_DEVICES from inside the notebook.
# The list of available GPUs is at https://www.fi.muni.cz/tech/unix/aura.html.cs
# This must run before importing any package that makes use of CUDA.
os.environ.update(
    {
        "CUDA_VISIBLE_DEVICES": "MIG-56c53afb-6f08-5e5b-83fa-32fc6f09eeb0",
        "TOKENIZERS_PARALLELISM": "FALSE",
    }
)

import pandas as pd
from sec_certs.dataset import CCDataset
from pathlib import Path
from sec_certs.model.references.segment_extractor import ReferenceSegmentExtractor
from sklearn.metrics import ConfusionMatrixDisplay
from sec_certs.utils.helpers import compute_heuristics_version
from rapidfuzz import fuzz
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from sec_certs.utils.nlp import softmax
import torch


# All paths are anchored at the current working directory, resolved to an absolute path
# (presumably the notebook is started from the repository root - confirm before running elsewhere).
REPO_ROOT = Path(".").resolve()
# Serialized dataset that is loaded with CCDataset.from_json.
DATASET_PATH = REPO_ROOT.joinpath("dataset/cc_final_run_may_23/dataset.json")
# Directory holding the train.csv / valid.csv annotation files.
ANNOTATIONS_PATH = REPO_ROOT.joinpath("src/sec_certs/data/reference_annotations/final/")

def replace_all(text: str, to_replce: set[str]) -> str:
    """Remove every occurrence of each string in ``to_replce`` from ``text``.

    The strings are removed longest-first, with ties broken alphabetically.
    Iterating the set directly would apply the removals in arbitrary order, so
    when one string is contained in (or overlaps) another, e.g. ``"1.0"`` and
    ``"1.0.1"``, the shorter one could be removed first and leave a dangling
    fragment such as ``".1"`` behind. The fixed order makes the result
    deterministic and lets the longest match win.

    :param text: string to strip the substrings from
    :param to_replce: substrings to remove (any iterable of strings works)
    :return: ``text`` with all given substrings removed
    """
    for fragment in sorted(to_replce, key=lambda s: (-len(s), s)):
        text = text.replace(fragment, "")
    return text

# Report whether torch can see a CUDA device (CUDA_VISIBLE_DEVICES was set at the top of this cell).
print(f"GPU available: {torch.cuda.is_available()}")


## Compute name similarity

In [None]:
# Load the annotated references of both splits into a single frame.
train_annotations = pd.read_csv(ANNOTATIONS_PATH / "train.csv")
valid_annotations = pd.read_csv(ANNOTATIONS_PATH / "valid.csv")
all_annotations = pd.concat([train_annotations, valid_annotations])
# Drop rows whose label is the literal string "None" and upper-case the remaining labels.
# NOTE(review): depending on the pandas version, read_csv may already parse "None" as NaN,
# in which case those rows are NOT removed by this filter - confirm.
all_annotations = all_annotations[all_annotations.label != "None"].assign(label=lambda df: df.label.str.upper())

dset = CCDataset.from_json(DATASET_PATH)
# Keep a digest -> certificate view of the full dataset before it is narrowed down below.
all_certs = {x.dgst: x for x in dset.certs.values()}
# Build the set of annotated digests once; calling `.unique()` inside the comprehension
# recomputed (and linearly scanned) the array for every certificate in the dataset.
annotated_dgsts = set(all_annotations.dgst.unique())
dset.certs = {x.dgst: x for x in dset.certs.values() if x.dgst in annotated_dgsts}

# Names of the referenced certificates are looked up in the full dataset via their certificate id.
cert_id_to_name_mapping = {x.heuristics.cert_id: x.name for x in all_certs.values()}
all_annotations["referenced_cert_name"] = all_annotations["referenced_cert_id"].map(cert_id_to_name_mapping)
all_annotations["cert_name"] = all_annotations["dgst"].map(lambda x: dset[x].name)
# presumably compute_heuristics_version returns the version strings found in a name - confirm
all_annotations["cert_versions"] = all_annotations["cert_name"].map(compute_heuristics_version)
# Rows whose referenced certificate id has no entry in the mapping end up without a name; discard them.
all_annotations = all_annotations.loc[all_annotations["referenced_cert_name"].notnull()].copy()
all_annotations["referenced_cert_versions"] = all_annotations["referenced_cert_name"].map(compute_heuristics_version)
# Variants of both names with the extracted versions removed.
all_annotations["cert_name_stripped_version"] = all_annotations.apply(lambda x: replace_all(x["cert_name"], x["cert_versions"]), axis=1)
all_annotations["referenced_cert_name_stripped_version"] = all_annotations.apply(lambda x: replace_all(x["referenced_cert_name"], x["referenced_cert_versions"]), axis=1)
# Fuzzy similarity between the referencing and the referenced name, with and without versions.
all_annotations["name_similarity"] = all_annotations.apply(lambda x: fuzz.token_set_ratio(x["cert_name"], x["referenced_cert_name"]), axis=1)
all_annotations["name_similarity_stripped_version"] = all_annotations.apply(lambda x: fuzz.token_set_ratio(x["cert_name_stripped_version"], x["referenced_cert_name_stripped_version"]), axis=1)


## Retrieve segments

In [None]:
# Run the segment extractor over the annotated certificates and keep only the rows that carry a label.
segment_extractor = ReferenceSegmentExtractor()
df = segment_extractor(dset.certs.values())
df = df[df["label"].notna()].copy()


## Train TF-IDF

(Train on individual segments, then aggregate the per-segment results by summing the squared class probabilities and picking the highest-scoring label.)

In [None]:
# One row per individual segment, separately for the train and valid splits.
df_train_exploded = df[df["split"] == "train"].explode("segments")
df_valid_exploded = df[df["split"] == "valid"].explode("segments")

x_train, y_train = df_train_exploded["segments"], df_train_exploded["label"]
x_valid, y_valid = df_valid_exploded["segments"], df_valid_exploded["label"]

# The TF-IDF vocabulary is fitted on the training segments only; validation segments are just transformed.
vectorizer = TfidfVectorizer()
x_train_tfidf = vectorizer.fit_transform(x_train)
x_valid_tfidf = vectorizer.transform(x_valid)

clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(x_train_tfidf, y_train)
# Segment-level predictions for the validation split.
y_pred = clf.predict(x_valid_tfidf)

# Store the class-probability vector of every segment as a plain list in its row.
df_train_exploded["y_proba"] = clf.predict_proba(x_train_tfidf).tolist()
df_valid_exploded["y_proba"] = clf.predict_proba(x_valid_tfidf).tolist()

# Position in the probability vector -> class label of the fitted random forest
label_mapping = dict(enumerate(clf.classes_))

# Collect the per-segment probability vectors back into one list per (dgst, referenced_cert_id) pair;
# the final label of each pair is derived from these lists afterwards.
reference_key = ["dgst", "referenced_cert_id"]
df_valid_retrieved = df_valid_exploded[reference_key + ["y_proba"]].groupby(reference_key).agg(list).reset_index()
df_train_retrieved = df_train_exploded[reference_key + ["y_proba"]].groupby(reference_key).agg(list).reset_index()

def aggregate_results(x):
    """Turn the probability vectors of all segments of one reference into a single label.

    The per-segment probabilities are squared, summed per class over all segments,
    passed through ``softmax``, and the label of the highest-scoring class is returned
    (looked up in ``label_mapping``).

    :param x: list of per-segment class-probability vectors
    :return: the predicted label
    """
    squared_probas = np.power(x, 2)
    class_scores = softmax(squared_probas.sum(axis=0))
    best_class_index = int(np.argmax(class_scores))
    return label_mapping[best_class_index]

# Derive one predicted label per (dgst, referenced_cert_id) pair from its segment probabilities.
for retrieved in (df_valid_retrieved, df_train_retrieved):
    retrieved["y_pred"] = retrieved["y_proba"].map(aggregate_results)

# Attach the aggregated predictions of both splits to the original per-reference frame.
df_predictions = pd.concat([df_train_retrieved, df_valid_retrieved])
df_final = df.merge(df_predictions, on=["dgst", "referenced_cert_id"])

# Evaluate on the validation split only: classification report first, then the confusion matrix.
df_final_valid = df_final.loc[df_final.split == "valid"]
print(classification_report(df_final_valid.label, df_final_valid.y_pred, zero_division=0))

ConfusionMatrixDisplay.from_predictions(df_final_valid.label, df_final_valid.y_pred, xticks_rotation=90)
