# Prediction of reference annotations in CC Certificates (Raw)

This notebook:
- Loads a dataframe of the dataset with columns `(dgst, cert_id, sentences, label)`
- Trains a model that classifies the sentences surrounding a certificate reference by their common sentiment (i.e., the meaning of the reference)

In [1]:
# When on Aura, it is important to first set CUDA_VISIBLE_DEVICES environment variable directly from notebook
# For available GPUs, see https://www.fi.muni.cz/tech/unix/aura.html.cs
# NOTE(review): the assignment below sits ahead of the remaining imports on purpose — presumably so that it is
# already in effect when the sec_certs imports load their GPU backend; confirm before reordering this cell.

import os

# Machine-specific device identifier (looks like a MIG instance UUID); adjust when running on another host.
os.environ["CUDA_VISIBLE_DEVICES"] = "MIG-a5459e6a-b26d-5985-874c-528458a7728b"
# Echo the value so the selected device is recorded in the cell output.
print(os.getenv("CUDA_VISIBLE_DEVICES"))

import pandas as pd
from sec_certs.utils.nlp import prec_recall_metric
from ast import literal_eval
from pathlib import Path
from sec_certs.model.reference_classification import ReferenceClassifierTrainer
import numpy as np

# Absolute path three levels above the current working directory; the dataset path is built from it below.
REPO_ROOT = Path("../../../").resolve()


def predict_and_fill_df(clf, df, label_mapping):
    """
    Given the classifier, dataframe and label mapping, will populate dataframe with predictions for simple inspection.

    The input dataframe is left untouched; a copy with three extra columns is returned:
    `y_proba` (the per-class scores of each row), `y_pred` (the label of the highest-scoring class)
    and `correct` (whether `y_pred` equals the `label` column).

    :param clf: object exposing `predict_proba(sentences)` that yields one sequence of class scores per row
    :param df: dataframe with at least the `sentences` and `label` columns
    :param label_mapping: lookup (dict or sequence) from class position to label name
    :return: copy of `df` with the `y_proba`, `y_pred` and `correct` columns added
    """
    df_new = df.copy()
    # Materialise exactly one entry per row. A 2-D array of shape (n_rows, n_classes) cannot be assigned to a
    # single column directly (pandas raises ValueError), while a list of per-row score vectors can. A result
    # that already is a list of lists passes through unchanged.
    row_scores = list(clf.predict_proba(df_new.sentences))
    df_new["y_proba"] = row_scores
    # Position of the highest score selects the predicted label.
    df_new["y_pred"] = df_new.y_proba.map(lambda x: label_mapping[np.argmax(x)])
    df_new["correct"] = df_new.label == df_new.y_pred
    return df_new

def eval_strings(series):
    """
    Parse every string of `series` as a Python literal and return the results as a list of lists.

    Each element is evaluated with `ast.literal_eval` and the resulting iterable is converted to a `list`.
    """
    parsed = []
    for raw in series:
        parsed.append(list(literal_eval(raw)))
    return parsed

MIG-a5459e6a-b26d-5985-874c-528458a7728b


In [14]:
# Prepare dataset

# Labels kept for training; ON_PLATFORM is folded into COMPONENT_USED right after loading.
KEPT_LABELS = {"COMPONENT_USED", "RECERTIFICATION", "ON_PLATFORM"}
# Size of the few-shot training subset and the seed that makes its selection repeatable across runs.
N_TRAIN_SAMPLES = 30
SAMPLING_SEED = 42

df = pd.read_csv(REPO_ROOT / "datasets/reference_classification_dataset.csv").loc[
    lambda df_: (df_.label.notnull()) & (df_.label.isin(KEPT_LABELS))
].assign(sentences=lambda df_: eval_strings(df_.sentences))
df.label = df.label.map(lambda x: x if x != "ON_PLATFORM" else "COMPONENT_USED")

# Split into train/valid
df_train = df.loc[df.split == "train"].drop(columns="split")
df_valid = df.loc[df.split == "valid"].drop(columns="split")

# Use just few examples for learning.
# A fixed random_state keeps the subset (and therefore the results below) reproducible on Restart & Run All;
# the size is capped so that a training split smaller than N_TRAIN_SAMPLES does not raise.
df_train = df_train.sample(n=min(N_TRAIN_SAMPLES, len(df_train)), random_state=SAMPLING_SEED)

In [15]:
# Build the trainer from the few-shot training split and the validation split, passing prec_recall_metric as the
# metric and "transformer" as the fourth argument (presumably the training mode/model kind — TODO confirm).
trainer = ReferenceClassifierTrainer(df_train, df_valid, prec_recall_metric, "transformer")
# Fit the classifier, then run the evaluation; the cell output reports a per-sentence ("internal") score and
# a score after ensemble soft voting.
trainer.train()
trainer.evaluate()

config.json not found in HuggingFace Hub
model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.
Applying column mapping to training dataset
***** Running training *****
  Num examples = 7200
  Num epochs = 1
  Total optimization steps = 450
  Total train batch size = 16


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/450 [00:00<?, ?it/s]

Applying column mapping to evaluation dataset
***** Running evaluation *****


Internal evaluation (of model working on individual sentences)
{'precision': 0.08035714285714286, 'recall': 0.08035714285714286}
Actual evaluation after ensemble soft voting
{'precision': 0.9014084507042254, 'recall': 0.9014084507042254}


In [16]:
# Take a look at misclassified instances
# Both splits get the prediction columns (y_proba, y_pred, correct) attached in a single pass.
df_train, df_valid = [
    predict_and_fill_df(trainer.clf, split_df, trainer.label_mapping) for split_df in (df_train, df_valid)
]

In [17]:
# Training rows whose predicted label differs from the annotated one
df_train.loc[~df_train["correct"]]

Unnamed: 0,dgst,referenced_cert_id,label,sentences,y_proba,y_pred,correct


In [20]:
# Validation rows whose predicted label differs from the annotated one
df_valid.loc[~df_valid["correct"]]

Unnamed: 0,dgst,referenced_cert_id,label,sentences,y_proba,y_pred,correct
251,2c2244c35d126bfb,BSI-DSZ-CC-0794-2011,RECERTIFICATION,[This is a re-certification based on BSI-DSZ-C...,"[0.47754689036832726, 0.5224531096316728]",COMPONENT_USED,False
364,4489bfc781a82281,BSI-DSZ-CC-0817-2013,RECERTIFICATION,[This is a re-certification based on BSI-DSZ-C...,"[0.11195564561722356, 0.8880443543827765]",COMPONENT_USED,False
505,647d17d44745a532,BSI-DSZ-CC-0904-2015,RECERTIFICATION,"[and BSI-DSZ-CC-0904-2015-\n, This is a re-cer...","[0.16312949852765818, 0.8368705014723418]",COMPONENT_USED,False
616,7e58bfc14edf68e4,OCSI/CERT/TEC/01/2013/RC,RECERTIFICATION,"[OCSI/CERT/TEC/01/2013/RC, versione 1.0,]","[0.2810731555375018, 0.7189268444624982]",COMPONENT_USED,False
628,81273108dd167b98,BSI-DSZ-CC-0523-2008,RECERTIFICATION,[Specific results from the\nevaluation process...,"[0.4308279930268643, 0.5691720069731356]",COMPONENT_USED,False
709,9664c0f0ec6401b9,BSI-DSZ-CC-0891-V2-2016,RECERTIFICATION,[This is a re-certification based on BSI-DSZ-C...,"[0.4303467603163502, 0.5696532396836499]",COMPONENT_USED,False
719,983d16512ae92d46,BSI-DSZ-CC-0891-V3-2018,RECERTIFICATION,[The updated documents in compare to the forer...,"[0.48619060832541505, 0.513809391674585]",COMPONENT_USED,False
