## Step 1: Installing Required Packages


In [None]:
!pip install -q transformers datasets
!pip install pytorch-lightning
!pip install -q git+https://github.com/huggingface/peft.git

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m14.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m401.2/401.2 kB[0m [31m14.8 MB/s[0m eta [36m0:00:00[0m
[?25h

## Step 2: Importing Libraries and Setting Up Environment
This step imports the necessary libraries and sets up the environment for the project.
### Importing Libraries
In this step, essential libraries are imported for data manipulation (`pandas`, `numpy`), random operations (`random`, `math`), deep learning with PyTorch (`torch`, `torch.nn`), loading datasets (`load_dataset`), data splitting and evaluation metrics (`train_test_split`, `f1_score`, `roc_auc_score`, `accuracy_score`), and working with pre-trained models and training utilities from the Transformers library (`AutoTokenizer`, `AutoModelForSequenceClassification`, `TrainingArguments`, `Trainer`, `EvalPrediction`).

### Setting Up Environment
- A random state (`state = 42`) is set for reproducibility.
- Seeds are set for the PyTorch, NumPy, and Python `random` number generators to ensure consistent results across runs.
- `torch.backends.cudnn.enabled` is set to `False` (cuDNN is disabled) and `torch.backends.cudnn.deterministic` is set to `True`, to control the deterministic behavior of CUDA operations.
- The `set_seed` function from `transformers` is used to set a consistent seed for the library.

In [None]:
import pandas as pd
import numpy as np
import random
import math
import torch
import torch.nn as nn
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score

from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
from transformers import EvalPrediction

# Single seed shared by every random number generator below, so that a fresh
# "Restart & Run All" reproduces the same run.
state = 42
torch.manual_seed(state)
# Seed every visible GPU rather than only the current device.
torch.cuda.manual_seed_all(state)
np.random.seed(state)
random.seed(state)
# cuDNN is switched off entirely; the deterministic flag is kept so that
# re-enabling cuDNN later still selects deterministic algorithms.
torch.backends.cudnn.enabled = False
torch.backends.cudnn.deterministic = True

from transformers import set_seed
# Reuse `state` instead of repeating the literal so the seed is defined in one place.
set_seed(state)

# Optional: mounting Google Drive only works on Google Colab. The import is guarded
# so the cell no longer fails with ImportError when run anywhere else.
try:
    from google.colab import drive
except ImportError:
    print("google.colab is not available - skipping the Google Drive mount.")
else:
    # Colab-specific: mounts the user's Google Drive into the notebook environment.
    drive.mount('/content/drive')


## Step 3: Loading Dataset
This step loads the dataset for training and evaluation. The code provides two options for loading the dataset: using the entire dataset for training, or using data that has already been split into training, test, and validation sets.
<br>The `load_dataset` function from the Hugging Face `datasets` library allows you to easily load and access various datasets for machine learning tasks, supporting multiple data formats and efficient caching.


In [None]:
# Option 1: Using the entire dataset for training
# The single CSV listed here is loaded as the 'train' split; no other split is defined.
train_csv_files = [
    '/content/drive/MyDrive/projects/finetuning_LLMs_with_MIRTE_data/data/cleaned_data_for_multiclassification_task/cleaned_shuffled_new_MITRE.csv',
]
dataset = load_dataset('csv', data_files={'train': train_csv_files})

# Option 2: Using pre-split data for training, testing, and validation
# dataset = load_dataset('csv', data_files={'train': ['/content/drive/MyDrive/projects/finetuning_LLMs_with_MIRTE_data/data/cleaned_data_for_multiclassification_task/splited_data_into_train_test_val/cleaned_MITRE_data_trainset.csv',],
#                                           'test': ['/content/drive/MyDrive/projects/finetuning_LLMs_with_MIRTE_data/data/cleaned_data_for_multiclassification_task/splited_data_into_train_test_val/cleaned_MITRE_data_testset.csv',],
#                                           'validation': ['/content/drive/MyDrive/projects/finetuning_LLMs_with_MIRTE_data/data/cleaned_data_for_multiclassification_task/splited_data_into_train_test_val/cleaned_MITRE_data_valset.csv',]
#                                           })

# Show the loaded splits as the cell output.
dataset

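## Step 4: Creating Label Mappings
This step creates a sorted list of labels (every column except `ID` and `Description`) and two dictionaries: `label2id`, which maps labels to integers, and `id2label`, which maps integers back to labels.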

In [None]:
# Every column except the identifier and the input text is treated as a label.
non_label_columns = ('ID', 'Description')
labels = sorted(
    column for column in dataset['train'].features
    if column not in non_label_columns
)
# Index <-> label lookups, both following the sorted order of `labels`.
id2label = dict(enumerate(labels))
label2id = {label: idx for idx, label in id2label.items()}
labels

## Step 5: Preprocessing Data


Models like BERT don't expect text as direct input, but rather `input_ids`, etc., so we tokenize the text using the tokenizer. Here I'm using the `AutoTokenizer` API, which will automatically load the appropriate tokenizer based on the checkpoint on the hub.
<br> In this cell, we can choose which tokenizer to use (`bert-base-uncased`, `roberta-base`, `roberta-large`, or SecureBERT).
<br>What's a bit tricky is that we also need to provide labels to the model. For multi-label text classification, this is a matrix of shape (batch_size, num_labels). Also important: this should be a tensor of floats rather than integers, otherwise PyTorch's `BCEWithLogitsLoss` (which the model will use) will complain, as explained [here](https://discuss.pytorch.org/t/multi-label-binary-classification-result-type-float-cant-be-cast-to-the-desired-output-type-long/117915/3).

In [None]:

# Hub checkpoint whose tokenizer is loaded. Alternatives that can be swapped in:
#   "bert-base-uncased", "roberta-base", "roberta-large"
tokenizer_checkpoint = "ehsanaghaei/SecureBERT"
tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint)

def preprocess_data(examples):
  """Tokenize a batch of descriptions and attach the multi-label target matrix.

  Args:
    examples: a batch mapping column names to lists of values (this function is
      used with `dataset.map(..., batched=True)`). It must contain "Description"
      and every column named in the global `labels`.

  Returns:
    The tokenizer output with an added "labels" entry: one list of floats per
    example, ordered like the global `labels`.
  """
  descriptions = examples["Description"]
  # Pad or truncate every description to a fixed length of 512 tokens.
  encoding = tokenizer(descriptions, padding="max_length", truncation=True, max_length=512)

  # One row per label column, cast to float as required by the loss function.
  per_label_rows = np.array([examples[name] for name in labels], dtype=float)
  # The reshape pins the shape even for an empty label list or empty batch;
  # the transpose yields the (batch_size, num_labels) layout.
  labels_matrix = per_label_rows.reshape(len(labels), len(descriptions)).T

  encoding["labels"] = labels_matrix.tolist()
  return encoding

# Tokenize batch by batch; the original CSV columns are dropped so that only the
# fields returned by `preprocess_data` remain.
raw_columns = dataset['train'].column_names
encoded_dataset = dataset.map(
    preprocess_data,
    batched=True,
    remove_columns=raw_columns,
)

Finally, we set the format of our data to PyTorch tensors. This will turn the training, validation and test sets into standard PyTorch [datasets](https://pytorch.org/docs/stable/data.html).

In [None]:
# Make the encoded splits return PyTorch tensors when indexed.
encoded_dataset.set_format(type="torch")

## Step 6: Define model

Here we define a model that consists of a pre-trained base (i.e. the weights from SecureBERT are loaded) with a randomly initialized classification head (a linear layer) on top. One should fine-tune this head, together with the pre-trained base, on a labeled dataset.

This is also printed by the warning.

We set the `problem_type` to be "multi_label_classification", as this will make sure the appropriate loss function is used (namely [`BCEWithLogitsLoss`](https://pytorch.org/docs/stable/generated/torch.nn.BCEWithLogitsLoss.html)). We also make sure the output layer has `len(labels)` output neurons, and we set the id2label and label2id mappings.

In [None]:
# Settings for the classification head: multi-label problem type, one output
# per label, and the id <-> label mappings stored in the model config.
classification_head_config = dict(
    problem_type="multi_label_classification",
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id,
)
model = AutoModelForSequenceClassification.from_pretrained(
    "ehsanaghaei/SecureBERT",
    **classification_head_config,
)

## Step 7: Train the model!

We are going to train the model using HuggingFace's Trainer API. This requires us to define 2 things:

* `TrainingArguments`, which specify training hyperparameters. All options can be found in the [docs](https://huggingface.co/transformers/main_classes/trainer.html#trainingarguments). Below, we for example specify that we want to evaluate after every epoch of training, we would like to save the model every epoch, we set the learning rate, the batch size to use for training/evaluation, how many epochs to train for, and so on.
* a `Trainer` object (docs can be found [here](https://huggingface.co/transformers/main_classes/trainer.html#id1)).

In [None]:
batch_size = 16
metric_name = "f1"  # only consumed by Option 2 (metric_for_best_model)

#Option 1: Training without Evaluation
#This option is for training the model without performing any evaluation during the training process
# The evaluation-strategy keyword is intentionally not passed: its default is "no",
# and the keyword was renamed from `evaluation_strategy` to `eval_strategy` in newer
# transformers releases, so spelling it out breaks on one side of the rename.
# `do_eval=False` keeps the "no evaluation" intent explicit on every version.
args = TrainingArguments(
    "roberta-finetuned",  # output directory for checkpoints
    do_eval=False,
    learning_rate=5e-5,
    per_device_train_batch_size=batch_size,
    num_train_epochs=30,
    weight_decay=0.01,
)

#Option 2: Training with Evaluation
#This option is for training the model with evaluation performed at the end of each epoch.
#NOTE: on newer transformers releases the keyword below is spelled `eval_strategy`.

# args = TrainingArguments(
#     "roberta-finetuned",
#     evaluation_strategy = "epoch",
#     save_strategy = "epoch",
#     learning_rate=2e-5,
#     per_device_train_batch_size=batch_size,
#     per_device_eval_batch_size=batch_size,
#     num_train_epochs=12,
#     weight_decay=0.01,
#     load_best_model_at_end=True,
#     metric_for_best_model=metric_name,
# )

We are also going to compute metrics while training. For this, we need to define a `compute_metrics` function, that returns a dictionary with the desired metric values.

In [None]:
def multi_label_metrics(predictions, labels, threshold=0.5):
    """Compute micro-averaged F1, ROC AUC and subset accuracy for multi-label output.

    Args:
        predictions: raw model logits, array-like of shape (batch_size, num_labels).
        labels: ground-truth 0/1 indicator matrix of the same shape.
        threshold: probability at or above which a label counts as predicted.

    Returns:
        dict with the keys 'f1', 'roc_auc' and 'accuracy'.
    """
    # Logits -> per-label probabilities, kept as a NumPy array for scikit-learn.
    logits = torch.as_tensor(np.asarray(predictions), dtype=torch.float32)
    probs = torch.sigmoid(logits).numpy()
    # Hard 0/1 decisions, used by the threshold-dependent metrics (F1, accuracy).
    y_pred = (probs >= threshold).astype(float)

    y_true = labels
    f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average='micro')
    # ROC AUC is a ranking metric, so it must be given the continuous scores.
    # Passing the thresholded predictions collapses the curve to a single
    # operating point and misreports the value.
    roc_auc = roc_auc_score(y_true, probs, average='micro')
    accuracy = accuracy_score(y_true, y_pred)

    return {'f1': f1_micro_average,
            'roc_auc': roc_auc,
            'accuracy': accuracy}

def compute_metrics(p: EvalPrediction):
    """Adapt an `EvalPrediction` to `multi_label_metrics` and return its metrics dict."""
    raw_predictions = p.predictions
    # When the predictions arrive as a tuple, only its first element is scored.
    if isinstance(raw_predictions, tuple):
        preds = raw_predictions[0]
    else:
        preds = raw_predictions
    return multi_label_metrics(predictions=preds, labels=p.label_ids)

In [None]:
# To evaluate during training with a validation or test set, uncomment
# `eval_dataset` and `compute_metrics` below.
train_split = encoded_dataset["train"]
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_split,
    #eval_dataset=encoded_dataset["test"],
    tokenizer=tokenizer,
    #compute_metrics=compute_metrics,
)
trainer.train()

In [None]:
# Save the fine-tuned model in the desired output directory.
finetuned_model_dir = '/content/drive/MyDrive/finetuned/secure_bert'
trainer.save_model(finetuned_model_dir)

In [None]:
# If a test or validation set was passed to the Trainer (eval_dataset=...), uncomment the next line to evaluate on it.
#trainer.evaluate()