In [1]:
# It is important to set the CUDA_VISIBLE_DEVICES environment variable first, directly from the notebook.
# For the list of available GPUs, see https://www.fi.muni.cz/tech/unix/aura.html.cs

import os

# Restrict this kernel to a single device and echo the value back as a sanity check.
os.environ.update({"CUDA_VISIBLE_DEVICES": "MIG-a5459e6a-b26d-5985-874c-528458a7728b"})
print(os.environ.get("CUDA_VISIBLE_DEVICES"))

# import spacy
# from spacy_cld import LanguageDetector

# nlp = spacy.load("en_core_web_sm")
# language_detector = LanguageDetector()
# nlp.add_pipe(language_detector)

MIG-a5459e6a-b26d-5985-874c-528458a7728b


In [13]:
# installed packages: setfit

import pandas as pd
from datasets import Dataset, DatasetDict, ClassLabel, Features, Value, Sequence
from sentence_transformers.losses import CosineSimilarityLoss
from ast import literal_eval
from setfit import SetFitModel, SetFitTrainer, sample_dataset

from sklearn.metrics import precision_score, recall_score
from pathlib import Path

# Repository root: three directory levels above the current working directory, as an absolute path.
REPO_ROOT = Path("../../../").resolve()

def prec_recall_metric(y_pred, y_test):
    """Compute micro-averaged precision and recall of predictions against references.

    Args:
        y_pred: Predicted labels.
        y_test: Reference (ground-truth) labels.

    Returns:
        A dict with the keys ``"precision"`` and ``"recall"``.
    """
    # NOTE(review): for single-label multiclass data, micro-averaged precision and recall
    # are expected to be identical (both equal accuracy) — confirm this is the intended metric.
    scoring_options = {"zero_division": "warn", "average": "micro"}
    precision = precision_score(y_test, y_pred, **scoring_options)
    recall = recall_score(y_test, y_pred, **scoring_options)
    return {"precision": precision, "recall": recall}

def predict_and_fill_df(model, df, train_dataset) -> pd.DataFrame:
    """Return a copy of ``df`` extended with the model's predictions.

    Args:
        model: Object exposing ``predict_proba(list_of_inputs)``; the result must support ``.tolist()``.
        df: Frame with a ``sentences`` column (model inputs) and a ``label`` column (reference labels).
        train_dataset: Dataset whose ``features["label"].names`` gives the class names by index.

    Returns:
        A new frame with three added columns: ``y_proba`` (per-class probabilities as a list),
        ``y_pred`` (name of the most probable class) and ``correct`` (``y_pred == label``).
        The input frame is left untouched.
    """
    class_names = train_dataset.features["label"].names
    index_to_label = dict(enumerate(class_names))

    def _most_probable_label(probabilities):
        # On ties, list.index returns the first (lowest-index) maximum.
        return index_to_label[probabilities.index(max(probabilities))]

    result = df.copy()
    model_inputs = result.sentences.tolist()
    result["y_proba"] = model.predict_proba(model_inputs).tolist()
    result["y_pred"] = result.y_proba.map(_most_probable_label)
    result["correct"] = result.y_pred == result.label

    return result


In [None]:
# EXPLODED variant
# # Prepare dataset

# df = pd.read_csv(REPO_ROOT / "datasets/reference_classification_dataset_exploded.csv", sep=";")
# df = df.loc[(df.label.notnull()) & ((df.location == "report"))]

# # # Get language of the sentence, quite unreliable for the moment
# # df["lang"] = df.sentence.map(lambda x: nlp(x)._.languages)
# # df["is_en"] = df.lang.map(lambda x: x == ["en"])

# # # Take suitable subset of the dataframe
# # df = df.loc[df.is_en] # only english

# df = df.loc[df.label.isin({"COMPONENT_USED", "BASIS_OF_RECERTIFICATION", "BASIS_FOR"})]  # only the most popular labels

# # Split into train/valid
# df_train = df.loc[df.split == "train"].drop(columns="split")
# df_valid = df.loc[df.split == "valid"].drop(columns="split")

# dataset_features = Features(
#     {
#         "dgst": Value("string"),
#         "cert_id": Value("string"),
#         "location": Value("string"),
#         "sentences": Value("string"),
#         "label": ClassLabel(names=list(df.label.unique())),
#     }
# )
# train_dataset = Dataset.from_pandas(df_train, features=dataset_features, split="train", preserve_index=False)
# valid_dataset = Dataset.from_pandas(df_valid, features=dataset_features, split="validation", preserve_index=False)

# dataset = DatasetDict()
# dataset['train'] = train_dataset
# dataset['validation'] = valid_dataset

# train_dataset = sample_dataset(dataset["train"], label_column="label", num_samples=10)


In [15]:
# GROUPED variant
# Prepare dataset


def _parse_sentences(raw):
    """Parse the serialized `sentences` cell into a list of strings with a reproducible order.

    The cell is parsed with `literal_eval`. When it evaluates to a set, the elements are sorted:
    iteration order of a set of strings depends on per-process string hashing, so `list(a_set)`
    can differ between kernel restarts. Any other container is converted with `list()` as before.
    """
    parsed = literal_eval(raw)
    if isinstance(parsed, set):
        return sorted(parsed)
    return list(parsed)


df = pd.read_csv(REPO_ROOT / "datasets/reference_classification_dataset_merged.csv", sep=";")
df = df.loc[(df.label.notnull())]

# # Get language of the sentence, quite unreliable for the moment
# df["lang"] = df.sentence.map(lambda x: nlp(x)._.languages)
# df["is_en"] = df.lang.map(lambda x: x == ["en"])

# # Take suitable subset of the dataframe
# df = df.loc[df.is_en] # only english

# Take an explicit copy of the filtered rows so that the column assignment below writes to an
# independent frame instead of to the result of a filter.
df = df.loc[df.label.isin({"COMPONENT_USED", "BASIS_OF_RECERTIFICATION", "BASIS_FOR"})].copy()  # only the most popular labels
df["sentences"] = df.sentences.map(_parse_sentences)

# Split into train/valid according to the precomputed `split` column
df_train = df.loc[df.split == "train"].drop(columns="split")
df_valid = df.loc[df.split == "valid"].drop(columns="split")

# Schema of the datasets built below; class names are taken in order of first appearance in `df`
dataset_features = Features(
    {
        "dgst": Value("string"),
        "cert_id": Value("string"),
        "sentences": Sequence(feature=Value("string")),
        "label": ClassLabel(names=list(df.label.unique())),
    }
)
train_dataset = Dataset.from_pandas(df_train, features=dataset_features, split="train", preserve_index=False)
valid_dataset = Dataset.from_pandas(df_valid, features=dataset_features, split="validation", preserve_index=False)

dataset = DatasetDict()
dataset['train'] = train_dataset
dataset['validation'] = valid_dataset

# From here on `train_dataset` refers to the sampled subset, not the full training split
# (num_samples=10; presumably per label — confirm against the setfit documentation).
train_dataset = sample_dataset(dataset["train"], label_column="label", num_samples=10)


  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [12]:
# Preview the first rows of the filtered frame
df.head()

Unnamed: 0,dgst,cert_id,label,split,sentences
0,0c7ef6c32cbdee47,ANSSI-CC-2017/61,COMPONENT_USED,valid,"{'Elixir-2 Project, Certification ID ANSSI-CC-..."
1,0c7ef6c32cbdee47,BSI-DSZ-CC-1074-2019,BASIS_FOR,valid,{'The BAC+PACE configuration is subject of the...
2,0e22fe4e4e58faf4,BSI-DSZ-CC-1052-V4-2021,BASIS_OF_RECERTIFICATION,valid,{'basierend auf BSI-DSZ-CC-1052-V4-2021.'}
3,0f3900cdcd0c7f3e,BSI-DSZ-CC-1072-V4-2021,COMPONENT_USED,train,{'BSI-DSZ-CC-1072-V4-2021 and the Crypto Libra...
4,0f3900cdcd0c7f3e,BSI-DSZ-CC-1072-V4-2021-MA-01,COMPONENT_USED,train,{'Certification Report NXP Secure Smart Card C...


In [31]:
# Fetch the pretrained checkpoint from the Hub as a SetFit model
model = SetFitModel.from_pretrained("all-mpnet-base-v2")

# Create trainer
trainer = SetFitTrainer(
    model=model,
    # --- data ---
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    column_mapping={"sentences": "text", "label": "label"},  # dataset columns -> text/label expected by trainer
    # --- contrastive learning ---
    loss_class=CosineSimilarityLoss,
    num_iterations=40,  # number of text pairs to generate
    num_epochs=1,  # number of epochs
    batch_size=16,
    # --- evaluation ---
    metric=prec_recall_metric,
)
# trainer.unfreeze(keep_body_frozen=False)

# Fit on the training dataset, then score on the evaluation dataset
trainer.train(show_progress_bar=True)
metrics = trainer.evaluate()
print(metrics)

config.json not found in HuggingFace Hub
model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.
Applying column mapping to training dataset
***** Running training *****
  Num examples = 640
  Num epochs = 1
  Total optimization steps = 40
  Total train batch size = 16


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/40 [00:00<?, ?it/s]

Applying column mapping to evaluation dataset
***** Running evaluation *****


{'precision': 0.3684210526315789, 'recall': 0.3684210526315789}


In [34]:
# Index -> class-name lookup; train_dataset.features["label"].int2str(index) can be used to cross-check it
label_mapping = dict(enumerate(train_dataset.features["label"].names))

# Attach y_proba / y_pred / correct columns to both splits
df_train = predict_and_fill_df(model=model, df=df_train, train_dataset=train_dataset)
df_valid = predict_and_fill_df(model=model, df=df_valid, train_dataset=train_dataset)

In [36]:
# Inspect the validation rows together with their predictions.
# NOTE(review): the saved output has a `location` column and two-element `y_proba` lists, which does
# not match the three-label grouped variant — it looks stale; re-run on a fresh kernel to refresh it.
df_valid

Unnamed: 0,dgst,cert_id,location,label,sentences,y_proba,y_pred,correct
9,0e22fe4e4e58faf4,BSI-DSZ-CC-1052-V4-2021,report,BASIS_OF_RECERTIFICATION,basierend auf BSI-DSZ-CC-1052-V4-2021.,"[0.9006353496264655, 0.0993646503735345]",COMPONENT_USED,False
10,29964f32c68b0ce8,BSI-DSZ-CC-0519-V3-2021,report,BASIS_OF_RECERTIFICATION,Specific results from\nthe evaluation process ...,"[0.9018767488720597, 0.09812325112794029]",COMPONENT_USED,False
11,29964f32c68b0ce8,BSI-DSZ-CC-0519-V3-2021,report,BASIS_OF_RECERTIFICATION,This is a re-certification based on BSI-DSZ-CC...,"[0.694838178998959, 0.30516182100104094]",COMPONENT_USED,False
12,29964f32c68b0ce8,BSI-DSZ-CC-0519-V3-2021,report,BASIS_OF_RECERTIFICATION,As the evaluation work performed for this cert...,"[0.8982407935537878, 0.1017592064462122]",COMPONENT_USED,False
22,c1d88ce9dadd7d2d,BSI-DSZ-CC-0312-2005,report,COMPONENT_USED,[13] Certification Report BSI-DSZ-CC-0312-2005...,"[0.8947294027870802, 0.1052705972129198]",COMPONENT_USED,True
23,c1d88ce9dadd7d2d,BSI-DSZ-CC-0312-2005,report,COMPONENT_USED,P5CT072V0N refer to the certification report B...,"[0.900841506996804, 0.09915849300319597]",COMPONENT_USED,True
28,238f8edc5eda1358,BSI-DSZ-CC-0222-2003,report,BASIS_OF_RECERTIFICATION,This is a re-\ncertification based on BSI-DSZ-...,"[0.6922405130591214, 0.3077594869408786]",COMPONENT_USED,False
29,238f8edc5eda1358,BSI-DSZ-CC-0222-2003,report,BASIS_OF_RECERTIFICATION,This certification is a re-certification of BS...,"[0.7146859285928153, 0.2853140714071847]",COMPONENT_USED,False
31,a6fac58198296194,BSI-DSZ-CC-0555-2009,report,BASIS_OF_RECERTIFICATION,Specific results from the evaluation process\n...,"[0.9003949217128049, 0.09960507828719516]",COMPONENT_USED,False
32,a6fac58198296194,BSI-DSZ-CC-0555-2009,report,BASIS_OF_RECERTIFICATION,As the evaluation work performed for this cert...,"[0.8962471709210816, 0.10375282907891832]",COMPONENT_USED,False


In [1]:
import numpy as np

In [10]:
# Three example per-class probability vectors
first = np.array([0.1, 0.2, 0.7])
second = np.array([0.2, 0.4, 0.4])
third = np.array([0.5, 0.5, 0])

# Square every probability, then add the vectors up class by class
preds = np.power(np.vstack((first, second, third)), 2).sum(axis=0)

In [11]:
# Per-class sums of the squared probabilities
preds

array([0.3 , 0.45, 0.65])

In [12]:
# Index of the class with the highest aggregated score
np.argmax(preds)

2