# Prediction of reference annotations in CC Certificates (Raw)

This notebook:
- Loads a dataset into a dataframe with columns `(dgst, cert_id, sentences, label)`.
- Trains a model that classifies the sentences surrounding a certificate reference according to their common sentiment (i.e., the meaning of the reference).

In [8]:
# When on Aura, it is important to first set CUDA_VISIBLE_DEVICES environment variable directly from notebook
# For available GPUs, see https://www.fi.muni.cz/tech/unix/aura.html.cs

import os

# Restrict the devices visible to CUDA to the single identifier below, then echo the value back as a sanity check.
# NOTE(review): this runs ahead of the remaining imports, presumably so that CUDA-backed libraries
# only ever see this device -- confirm before reordering the imports.
os.environ["CUDA_VISIBLE_DEVICES"] = "MIG-a5459e6a-b26d-5985-874c-528458a7728b"
print(os.getenv("CUDA_VISIBLE_DEVICES"))

import pandas as pd
from sec_certs.utils.nlp import prec_recall_metric
from ast import literal_eval
from pathlib import Path
from sec_certs.model.reference_classification import ReferenceClassifierTrainer
import numpy as np

# Absolute path three directory levels above the current working directory (the relative path is
# resolved against wherever the kernel was started).
REPO_ROOT = Path("../../../").resolve()


def predict_and_fill_df(clf, df, label_mapping):
    """
    Given the classifier, dataframe and label mapping, return a copy of the dataframe populated with
    predictions for simple inspection.

    :param clf: object exposing ``predict_proba``. It is called once with the ``sentences`` column of ``df``
        and must yield one sequence of class probabilities per row, in row order. Both a list of
        per-row sequences and a 2D array of shape ``(n_rows, n_classes)`` are accepted.
    :param df: dataframe with ``sentences`` and ``label`` columns. It is not modified.
    :param label_mapping: container indexed by class position (the result of ``np.argmax`` over a row of
        probabilities) that yields the label name.
    :return: copy of ``df`` with three extra columns: ``y_proba`` (per-row probabilities), ``y_pred``
        (label with the highest probability) and ``correct`` (whether ``y_pred`` equals ``label``).
    """
    df_new = df.copy()
    y_proba = clf.predict_proba(df_new["sentences"])
    # Store one probability vector per cell. Going through list() keeps list-of-sequences output as is
    # and additionally splits a 2D array into rows, which pandas refuses to assign to a single column.
    df_new["y_proba"] = list(y_proba)
    df_new["y_pred"] = df_new["y_proba"].map(lambda x: label_mapping[np.argmax(x)])
    df_new["correct"] = df_new["label"] == df_new["y_pred"]
    return df_new

MIG-a5459e6a-b26d-5985-874c-528458a7728b


In [9]:
# Prepare dataset

SEED = 42  # fixed so that the few-shot training sample below is the same on every run
N_TRAIN_SAMPLES = 10  # number of training examples kept for learning

df = pd.read_csv(REPO_ROOT / "datasets/reference_classification_dataset_merged.csv", sep=";")
df = df.loc[df.label.notnull()]
# only the most popular labels; copy so that the column assignment below does not write into a slice of the raw frame
df = df.loc[df.label.isin({"COMPONENT_USED", "BASIS_OF_RECERTIFICATION", "BASIS_FOR"})].copy()
# `sentences` is stored in the CSV as the string form of a Python literal; parse it back into a list
df["sentences"] = df["sentences"].map(lambda x: list(literal_eval(x)))

# Split into train/valid
df_train = df.loc[df.split == "train"].drop(columns="split")
df_valid = df.loc[df.split == "valid"].drop(columns="split")

# Use just few examples for learning (never more than are available), drawn reproducibly
df_train = df_train.sample(n=min(N_TRAIN_SAMPLES, len(df_train)), random_state=SEED)

In [10]:
# Preview the first rows of the sampled training set
df_train.head()

Unnamed: 0,dgst,cert_id,label,sentences
12,99223aca5d9eb3b3,DCSSI-2009/11,COMPONENT_USED,[Toolbox Certificate DCSSI-2009/11\nTable 1:]
4,0f3900cdcd0c7f3e,BSI-DSZ-CC-1072-V4-2021-MA-01,COMPONENT_USED,[Certification Report NXP Secure Smart Card Co...
9,6d6ade44dcc497dd,BSI-DSZ-CC-0227-2004,BASIS_OF_RECERTIFICATION,[This is a\nre-certification based on BSI-DSZ-...
5,0f3900cdcd0c7f3e,NSCIB-CC-66030-CR5,COMPONENT_USED,[certificate identification NSCIB-CC-66030-CR5...
6,1fb1564dfb0f0b04,ANSSI-CC-2020/34,COMPONENT_USED,[[CER_IC] Rapport de certification ANSSI-CC-20...


In [11]:
# Build the trainer from the training/validation frames, the prec_recall_metric callable and the
# "transformer" mode string, then fit it and report its metrics.
# NOTE(review): what "transformer" selects is defined by ReferenceClassifierTrainer, which is not visible here.
trainer = ReferenceClassifierTrainer(df_train, df_valid, prec_recall_metric, "transformer")
trainer.train()
trainer.evaluate()

config.json not found in HuggingFace Hub
model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.
Applying column mapping to training dataset
***** Running training *****
  Num examples = 1760
  Num epochs = 1
  Total optimization steps = 110
  Total train batch size = 16


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/110 [00:00<?, ?it/s]

Applying column mapping to evaluation dataset
***** Running evaluation *****


Internal evaluation (of model working on individual sentences)
{'precision': 0.45454545454545453, 'recall': 0.45454545454545453}
Actual evaluation after ensemble soft voting
{'precision': 0.2857142857142857, 'recall': 0.2857142857142857}


In [12]:
# Take a look at misclassified instances: annotate both splits with the classifier's predictions
df_train, df_valid = (
    predict_and_fill_df(trainer.clf, split_df, trainer.label_mapping) for split_df in (df_train, df_valid)
)

In [13]:
# Training rows whose predicted label differs from the annotated label
df_train.loc[~df_train.correct]

Unnamed: 0,dgst,cert_id,label,sentences,y_proba,y_pred,correct
9,6d6ade44dcc497dd,BSI-DSZ-CC-0227-2004,BASIS_OF_RECERTIFICATION,[This is a\nre-certification based on BSI-DSZ-...,"[0.5461188093773812, 0.45388119062261884]",COMPONENT_USED,False
19,ca5da2fe138af656,BSI-DSZ-CC-0413-2007,BASIS_OF_RECERTIFICATION,[This is a re-certification based on\nBSI-DSZ-...,"[0.5465589745598575, 0.4534410254401425]",COMPONENT_USED,False


In [14]:
# Validation rows whose predicted label differs from the annotated label
df_valid.loc[~df_valid.correct]

Unnamed: 0,dgst,cert_id,label,sentences,y_proba,y_pred,correct
1,0c7ef6c32cbdee47,BSI-DSZ-CC-1074-2019,BASIS_FOR,[The BAC+PACE configuration is subject of the ...,"[0.9330686268852108, 0.06693137311478929]",COMPONENT_USED,False
2,0e22fe4e4e58faf4,BSI-DSZ-CC-1052-V4-2021,BASIS_OF_RECERTIFICATION,[basierend auf BSI-DSZ-CC-1052-V4-2021.],"[0.7070543956916182, 0.2929456043083818]",COMPONENT_USED,False
7,238f8edc5eda1358,BSI-DSZ-CC-0222-2003,BASIS_OF_RECERTIFICATION,[This certification is a re-certification of B...,"[0.5998535578550636, 0.4001464421449364]",COMPONENT_USED,False
8,29964f32c68b0ce8,BSI-DSZ-CC-0519-V3-2021,BASIS_OF_RECERTIFICATION,[This is a re-certification based on BSI-DSZ-C...,"[0.8727371952470533, 0.12726280475294682]",COMPONENT_USED,False
13,a6fac58198296194,BSI-DSZ-CC-0555-2009,BASIS_OF_RECERTIFICATION,[Specific results from the evaluation process\...,"[0.8670438280210987, 0.13295617197890133]",COMPONENT_USED,False
