In [None]:
!pip install transformers
!pip install datasets
!pip install huggingface_hub
!pip install ipywidgets==7.7.1
!pip install seqeval
!pip install evaluate
!pip install accelerate -U
!pip install setfit==1.0.2

In [None]:
from datasets import load_dataset
from huggingface_hub import notebook_login
import json
import evaluate
import numpy as np
from transformers import RobertaTokenizerFast, DataCollatorForTokenClassification, AutoModelForTokenClassification, TrainingArguments, Trainer
import torch
from collections import defaultdict
from torch import nn
from setfit import SetFitModel
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from collections import defaultdict
import nltk
# Pick the compute device once: the GPU when CUDA is usable, the CPU otherwise,
# and report which one was selected.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using GPU." if device.type == "cuda" else "No GPU available, using CPU.")

In [None]:
# Log in to the Hugging Face Hub (interactive prompt); the cells below load models and
# datasets from the Hub by repository name.
notebook_login()

In [5]:
def modify_labels(labels_file):

    """
    Build the BIO-format label mappings from the original labels file.

    The file is a JSON object; only its values (the label names) are used. The labels are
    numbered 1..N in file order: label number ``i`` gets the tag ``B-<label>`` with id ``i``
    and the tag ``I-<label>`` with id ``i + N``. Id 0 is reserved for the outside tag ``"O"``.

    @:param labels_file: the path to the labels file

    @:return:
        id2label: mapping of ids to BIO-format labels   (first element of the returned tuple)
        label2id: mapping of BIO-format labels to ids   (second element of the returned tuple)

    """
    with open(labels_file, mode="r", encoding="utf-8") as lab_file:
        labels = json.load(lab_file)

    num_original_labels = len(labels)
    id2label = {}
    label2id = {}

    for index, label in enumerate(labels.values(), start=1):
        bio_tags = (
            (index, "B-" + label),                        # beginning-of-entity tag
            (index + num_original_labels, "I-" + label),  # inside-of-entity tag
        )
        for tag_id, tag in bio_tags:
            id2label[tag_id] = tag
            label2id[tag] = tag_id

    id2label[0] = "O"
    label2id["O"] = 0

    return id2label, label2id

def get_original_labels(labels_file):
  """
    Read the labels file and invert it, yielding the original (non-BIO) label lookup.

    @:param labels_file: the path to the labels file (a JSON object)

    @:return label2id: mapping from each value of the JSON object (the label name)
                       to its key (the id, exactly as stored in the file)

  """
  with open(labels_file, "r", encoding="utf-8") as lab_file:
    id_to_name = json.load(lab_file)

  label2id = {}
  for label_id, label_name in id_to_name.items():
    # If two ids share a name, the one appearing later in the file wins
    label2id[label_name] = label_id
  return label2id

In [None]:
# Current fold used as test set. It is defined first so that the SetFit checkpoint loaded
# below is the one belonging to this fold (the name used to be hard-coded to fold 4 and
# could silently drift from this constant).
CURRENT_FOLD = 4

# Load our fine-tuned SetFit model from Hugging Face
setfit_model = SetFitModel.from_pretrained("fede-m/setfit_fold_" + str(CURRENT_FOLD))
# Load tokenizer
tokenizer = RobertaTokenizerFast.from_pretrained("osiria/roberta-base-italian", add_prefix_space=True)
# Get BIO labels (modify_labels returns the id -> label mapping first, then label -> id)
id2label, label2id = modify_labels("labels/id2labels.json")
# Load RoBERTa model with a token-classification head sized for the BIO label set
num_labels = len(id2label)
roberta_model = AutoModelForTokenClassification.from_pretrained(
    "osiria/roberta-base-italian",
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
)
# Load seqeval
seqeval = evaluate.load("seqeval")


In [None]:
# ------------------------------------------------------- Load dataset from local folder ------------------------------------------------

def get_dataset(path, num_folds=5, test_fold=None):
  """
      Reads the fold files of a dataset from a local folder and returns the train and test set.

      Fold ``i`` is read from the file ``<path><i>.json``; the "Data" list of the fold selected
      as test fold goes to the test set, the "Data" lists of all other folds go to the train set.

      NOTE: another cell of this notebook defines a function with the same name that loads the
      folds from Hugging Face; whichever of the two cells was executed last is the one in effect.

      @:param path: The path prefix of the dataset fold files for the pipeline.
      @:param num_folds: The number of fold files to read (default: 5).
      @:param test_fold: The index of the fold to use as test set; when None (default),
                         the global CURRENT_FOLD is used.

      @:return train_set, test_set: The train and test set in the dataset.

  """
  if test_fold is None:
    test_fold = CURRENT_FOLD

  train = []
  test = []

  for i in range(num_folds):
    with open(f"{path}{i}.json", mode="r", encoding="utf-8") as data:
      dataset = json.load(data)

    # Report the size of each fold as it is read
    print(len(dataset["Data"]))
    target = test if i == test_fold else train
    target.extend(dataset["Data"])
  return train, test

In [4]:
# ------------------------------------------------------- Load dataset from Hugging Face ------------------------------------------------
def get_dataset(name, num_folds=5, test_fold=None):
  """
      Reads the folds of a dataset from Hugging Face and returns the train and test set.

      Fold ``i`` is loaded as the configuration ``"fold<i>"`` of the dataset; the first entry
      of the "Data" column of its "train" split supplies the examples of that fold. The fold
      selected as test fold goes to the test set, all other folds go to the train set.

      NOTE: another cell of this notebook defines a function with the same name that reads the
      folds from a local folder; whichever of the two cells was executed last is the one in effect.

      @:param name: The name of the dataset on Hugging Face.
      @:param num_folds: The number of folds to load (default: 5).
      @:param test_fold: The index of the fold to use as test set; when None (default),
                         the global CURRENT_FOLD is used.

      @:return train_set, test_set: The train and test set in the dataset.

  """
  if test_fold is None:
    test_fold = CURRENT_FOLD

  train = []
  test = []
  for i in range(num_folds):
    dataset = load_dataset(name, f"fold{i}")
    fold_examples = dataset["train"]["Data"][0]
    target = test if i == test_fold else train
    target.extend(fold_examples)

  return train, test


In [None]:


def get_roberta_datasets():
  """
      The function applies the filtering step by using the pre-trained SetFit for inference:
      only the sentences SetFit predicts as class 1 are kept. It also prints the accuracy,
      precision, recall and F1 of the filtering step on the test set.

      @:param None

      @:return roberta_train, roberta_test: The filtered train and test set RoBERTa will be trained on for sequence labelling.

  """
  # Get dataset from Hugging Face
  train, test = get_dataset("fede-m/setfit_dataset_coreference_folds")
  # Get dataset from local folder
  # train, test = get_dataset("dataset/pipeline/data_sent_coref_fold")

  def _predict(sentences):
    # Run SetFit once over the whole list (it batches internally) instead of once per sentence.
    # int(...) turns every prediction into a plain Python int, so the results can be compared
    # and handed to scikit-learn whatever device the model output lives on.
    texts = [sentence["text"] for sentence in sentences]
    if not texts:
      return []
    return [int(prediction) for prediction in setfit_model(texts)]

  # Filter the train and test sets using SetFit for inference
  train_pred_labels = _predict(train)
  test_pred_labels = _predict(test)
  roberta_train = [sentence for sentence, pred in zip(train, train_pred_labels) if pred == 1]
  roberta_test = [sentence for sentence, pred in zip(test, test_pred_labels) if pred == 1]

  # Calculate the evaluation metrics for the filtering step (on the full, unfiltered test set)
  test_true_labels = [int(sentence["label"]) for sentence in test]
  filtering_metrics = {
      "accuracy": accuracy_score(test_true_labels, test_pred_labels),
      "precision": precision_score(test_true_labels, test_pred_labels),
      "recall": recall_score(test_true_labels, test_pred_labels),
      "f1": f1_score(test_true_labels, test_pred_labels),
  }
  print("SetFit filtering metrics:", filtering_metrics)

  return roberta_train, roberta_test


# Run the SetFit filtering step; the filtered sets are the input of the RoBERTa training below
roberta_train, roberta_test = get_roberta_datasets()

In [None]:
# Hyperparameters for RoBERTa model
stride = 150  # nominal overlap between consecutive windows; only used to derive window_size
max_length = 512  # length every segment is padded to, and maximum window length in truncate()
window_size = max_length - stride  # step between consecutive window starts in truncate()
LEARNING_RATE = 6e-5  # passed to TrainingArguments(learning_rate=...)
EPOCHS = 5  # passed to TrainingArguments(num_train_epochs=...)

In [3]:
def adjust_tokens_labels_attention(segment):
  """
  Make a segment ready for the transformer model: trim it if needed, wrap it in the
  special tokens and pad it. The lists inside the segment are modified in place.

  @:param segment: Output of the tokenizer containing

                 - "input_ids": A list of integers representing the tokens ids of the segment
                 - "labels": A list of integers representing the labels assigned to the tokens
                 - "attention_mask": A list of integers representing the attention mask of the tokens

  @:return: The adjusted segment
  """
  fields = ("input_ids", "labels", "attention_mask")

  # A segment with more than max_length - 2 tokens would exceed max_length once "<s>" and
  # "</s>" are added, so its last two entries are dropped from all three lists.
  if len(segment["input_ids"]) > max_length - 2:
    for _ in range(2):
      for field in fields:
        segment[field].pop()

  # Prepend "<s>" (id 0) unless the segment already starts with it; the special token gets
  # the ignored label -100 and attention 1.
  if segment["input_ids"][0] != 0:
    for field, value in zip(fields, (0, -100, 1)):
      segment[field].insert(0, value)

  # Append "</s>" (id 2) unless the segment already ends with it, with the same label and attention.
  if segment["input_ids"][-1] != 2:
    for field, value in zip(fields, (2, -100, 1)):
      segment[field].append(value)

  # Add padding to the segment
  return add_padding(segment)


def add_padding(segment):

  """
  Add padding to a segment of tokens, labels and attention mask so that it matches max_length
  (the global hyperparameter, 512 tokens for the RoBERTa model used here).

  @:param segment: Output of the tokenizer containing

                 - "input_ids": A list of integers representing the tokens ids of the segment
                 - "labels": A list of integers representing the labels assigned to the tokens
                 - "attention_mask": A list of integers representing the attention mask of the tokens

  @:return: The segment adjusted with padding
  """
  # Pad to the same max_length that is used for the labels below, so the two can never diverge
  padded_segment = tokenizer.pad(segment, padding="max_length", max_length=max_length)

  # The labels are filled up by hand with the ignored label -100
  # (presumably because tokenizer.pad only pads the model inputs -- TODO confirm)
  missing = max_length - len(padded_segment["labels"])
  if missing > 0:
    padded_segment["labels"].extend([-100] * missing)
  return padded_segment


def truncate(tokenized_input, new_labels):

  """
  Split a tokenized input that is longer than max_length into several segments, cutting at
  dots (token id 5) where possible. Every segment is passed through
  adjust_tokens_labels_attention (special tokens + padding) before being collected.

  Window starts advance in steps of window_size; every window covers at most max_length ids.

  @:param tokenized_input: Output of the tokenizer; the function reads

                  - "input_ids": A list of integers representing the tokens ids of the document
                  - "attention_mask": A list of integers representing the attention mask of the tokens

  @:param new_labels: A list of integers representing the labels assigned to the tokens
                      (sliced with the same indices as the input ids)

  @:return: The modified segments
  """

  segments = []

  for j in range(0, len(tokenized_input["input_ids"]), window_size):
    # One fresh container per window (a defaultdict without factory behaves like a plain dict)
    segment = defaultdict()
    # 5: id of the dot (".") --> when dividing the text in segments, we want to truncate the text at the last dot in the segment
    # Also labels and attention mask must be adjusted accordingly
    if 5 not in tokenized_input["input_ids"][j:j+max_length]:
      # No dot in this window: take the raw window as it is
      segment["input_ids"] = tokenized_input["input_ids"][j:j+max_length]
      segment["labels"] = new_labels[j:j+max_length]
      segment["attention_mask"] = tokenized_input["attention_mask"][j:j+max_length]
      segment = adjust_tokens_labels_attention(segment) # Add initial and last special tokens and apply padding

    else:
      if j != 0:
        # Reassigning j only affects this iteration; the next window start still comes from range()
        j += tokenized_input["input_ids"][j:j+max_length].index(5) +1 # Find the first dot in text and start from there

      curr_segm = tokenized_input["input_ids"][j:j+max_length]

      if j+max_length >= len(tokenized_input["input_ids"]):
            # The window reaches the end of the document: keep everything that is left
            segment["input_ids"] = curr_segm
            segment["labels"] = new_labels[j:]
            segment["attention_mask"] = tokenized_input["attention_mask"][j:]

      else:
        # Find the last dot in the segment and break the segment there
        # NOTE(review): the dot check above was made on the window before j was shifted; if the
        # shifted window holds no dot, index(5) raises ValueError -- confirm this cannot happen
        curr_segm.reverse()
        last_dot_index = len(curr_segm) - curr_segm.index(5)
        curr_segm.reverse()
        segment["input_ids"] = curr_segm[:last_dot_index]
        segment["labels"] = new_labels[j:j+last_dot_index]
        segment["attention_mask"] = tokenized_input["attention_mask"][j:j+last_dot_index]

      segment =  adjust_tokens_labels_attention(segment)

    segments.append(segment)

  return segments


def tokenize_and_align_labels(tokens, labels):

    """
    Tokenize a word-split text with the RoBERTa tokenizer and align the word labels to the
    resulting sub-tokens: the first sub-token of every word receives the label of the word,
    all other sub-tokens and the special tokens receive the ignored label -100.

    @:param tokens: A list of strings, where each string represents a word in the text.
    @:param labels: A list of labels, one per word in the text.

    @:return segments: A list of segments resulting from the tokenization, truncation, and padding of the original document.
    """

    # Tokenize once, declaring the input as already split into words, so that word_ids() maps
    # every sub-token back to the index of the word it belongs to (None for special tokens).
    # Deriving the labels from this same encoding guarantees one label per input id.
    tokenized_input = tokenizer(tokens, is_split_into_words=True)

    new_labels = []
    previous_word_id = None
    for word_id in tokenized_input.word_ids():
        if word_id is None:
            new_labels.append(-100)              # special token
        elif word_id != previous_word_id:
            new_labels.append(labels[word_id])   # first sub-token of a word
        else:
            new_labels.append(-100)              # continuation sub-token of the same word
        previous_word_id = word_id

    # Check if we have too many tokens per document and truncate it in smaller segments if needed
    if len(tokenized_input["input_ids"]) > max_length:
        return truncate(tokenized_input, new_labels)

    tokenized_input["labels"] = new_labels
    return [adjust_tokens_labels_attention(tokenized_input)]


def get_tokens_labels(docs, label2id: dict):

    """
    Word-tokenize every document and turn its span annotations into one label per word.

    A word receives the id of the first entity (in the order of doc["entities"]) whose
    [start_offset, end_offset] range fully contains the word's character span; words that
    are not covered by any entity keep the label 0.

    @:param docs: A list of dictionaries containing the documents ("text" and "entities" are read).
    @:param label2id: A dictionary mapping labels to ids.

    @:return A list of dictionaries ("words", "labels") containing the new dataset to use for training and testing the RoBERTa model.
    """

    word_tokenizer = nltk.WordPunctTokenizer()
    converted_docs = []

    for doc in docs:
        text = doc["text"]
        word_spans = list(word_tokenizer.span_tokenize(text))

        converted = defaultdict()
        converted["words"] = list(word_tokenizer.tokenize(text))

        # Start from "no entity" (0) everywhere and overwrite the words covered by an entity
        token_labels = [0] * len(word_spans)
        entities = doc["entities"]
        if entities:
            for position, word_span in enumerate(word_spans):
                covering = next(
                    (
                        entity
                        for entity in entities
                        if entity["start_offset"] <= word_span[0] <= word_span[1] <= entity["end_offset"]
                    ),
                    None,
                )
                if covering is not None:
                    token_labels[position] = int(label2id[covering["label"]])
        converted["labels"] = token_labels

        converted_docs.append(converted)
    return converted_docs

In [None]:
# CustomTrainer class to change the compute loss function and add higher weights to minority classes
class CustomTrainer(Trainer):
    """Trainer whose loss is a class-weighted cross entropy: class 0 gets weight 0.05, every other class 2.00."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """
        Compute the weighted cross-entropy loss for one batch.

        @:param model: the model being trained
        @:param inputs: the batch; "labels" is removed from it before the forward pass
        @:param return_outputs: when True, return (loss, outputs) instead of only the loss
        @:param num_items_in_batch: accepted (and ignored) so that Trainer versions which pass
                                    this extra argument can call the override

        @:return: the loss, or the tuple (loss, outputs) when return_outputs is True
        """
        labels = inputs.pop("labels")
        # forward pass
        outputs = model(**inputs)
        logits = outputs.get("logits")

        num_labels = self.model.config.num_labels
        # Assign different weights to the classes; the tensor is created on the device of the
        # logits, so loss inputs and weights always live together
        weights = torch.tensor([0.05] + [2.00] * (num_labels - 1), device=logits.device)
        loss_fct = nn.CrossEntropyLoss(weight=weights)
        loss = loss_fct(logits.view(-1, num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss


def train_test(train, test):
    """
    Fine-tune the global `model` with CustomTrainer, evaluating on the test set after every epoch.

    @:param train: the training examples
    @:param test: the evaluation examples

    @:return None
    """
    training_options = dict(
        output_dir="fgsd_models",
        learning_rate=LEARNING_RATE,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=8,
        num_train_epochs=EPOCHS,
        weight_decay=0.4,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=False,
        push_to_hub=False,
    )
    collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

    CustomTrainer(
        model=model,
        args=TrainingArguments(**training_options),
        train_dataset=train,
        eval_dataset=test,
        tokenizer=tokenizer,
        data_collator=collator,
        compute_metrics=compute_metrics,
    ).train()


def compute_metrics(p):
    """
    Compute the seqeval metrics for one evaluation pass.

    Positions whose gold label is -100 (special tokens, continuation sub-tokens, padding)
    are left out of both the predictions and the references.

    @:param p: pair (predictions, labels); predictions holds the scores per token and class
               (the arg-max over the last of its three axes is the predicted class id)

    @:return: dictionary with the overall precision, recall, f1 and accuracy, which is what
              the Trainer expects back from compute_metrics; the full seqeval report is printed
    """
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = []
    true_labels = []
    for prediction, label in zip(predictions, labels):
        kept = [(pred_id, gold_id) for pred_id, gold_id in zip(prediction, label) if gold_id != -100]
        true_predictions.append([id2label[pred_id] for pred_id, _ in kept])
        true_labels.append([id2label[gold_id] for _, gold_id in kept])

    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    print(results)

    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }
    



def get_roberta_predictions(train, test):
  """
    Prepare the pre-filtered sets for sequence labelling and train the RoBERTa model on them.

    @:param train, test: The train and test set to train the RoBERTa model.
    @:return None

  """
  original_labels = get_original_labels("labels/id2labels.json")

  # Word tokenize the sentences and assign the corresponding label to each token according to the entities spans
  train_docs, test_docs = (get_tokens_labels(split, original_labels) for split in (train, test))

  # Adjust labels, truncate and add padding if necessary; one sentence may yield several segments
  new_train = [
      segment
      for doc in train_docs
      for segment in tokenize_and_align_labels(doc["words"], doc["labels"])
  ]
  new_test = [
      segment
      for doc in test_docs
      for segment in tokenize_and_align_labels(doc["words"], doc["labels"])
  ]

  # Train the RoBERTa model for sequence labelling
  train_test(new_train, new_test)

# Load the pre-trained model that train_test() fine-tunes (it reads the global `model`),
# with a token-classification head sized for the BIO label set
num_labels = len(id2label)
model = AutoModelForTokenClassification.from_pretrained("osiria/roberta-base-italian", num_labels =num_labels, id2label=id2label, label2id=label2id )
# Load seqeval for evaluation (used by compute_metrics)
seqeval = evaluate.load("seqeval")

# Prepare the filtered sets and run the training
get_roberta_predictions(roberta_train, roberta_test)
# Push the model to hub
#model.push_to_hub("fede-m/setfit_roberta_pipeline_"+str(CURRENT_FOLD))

In [None]:
# Load fine-tuned SetFit model from Hugging Face (the checkpoint of the fold used as test set,
# so that the evaluation below never runs a model on data it was trained on)
setfit_model = SetFitModel.from_pretrained("fede-m/setfit_fold_" + str(CURRENT_FOLD))
# Load tokenizer
tokenizer = RobertaTokenizerFast.from_pretrained("osiria/roberta-base-italian", add_prefix_space=True)
# Get BIO labels (modify_labels returns the id -> label mapping first, then label -> id)
id2label, label2id = modify_labels("labels/id2labels.json")
# Load RoBERTa model
num_labels = len(id2label)
roberta_model = AutoModelForTokenClassification.from_pretrained(
    "osiria/roberta-base-italian",
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
)
# Load seqeval
seqeval = evaluate.load("seqeval")

# Load the fine-tuned token classifier of the same fold and move it to the device selected
# at the top of the notebook (GPU when available, CPU otherwise)
model = AutoModelForTokenClassification.from_pretrained(
    "fede-m/setfit_roberta_pipeline_" + str(CURRENT_FOLD)
).to(device)

In [None]:
# ---------------------------- Test Final Pipeline Complete ------------------------------------------

# The test fold is the one selected by the global CURRENT_FOLD (see get_dataset); the global
# `setfit_model` and `model` must be the checkpoints that were trained with that fold held out.
def test_roberta_classifier_results():
  """
    Function tests the result of the whole pipeline by subsequently applying both Setfit and RoBERTa on the test set.

    For every test sentence SetFit decides whether RoBERTa is applied: if SetFit predicts 1 the
    token classifier labels the sentence, otherwise every token is labelled "O". Only positions
    that carry a real gold label are scored; positions labelled -100 (special tokens,
    continuation sub-tokens, padding) are left out of both predictions and references, as in
    compute_metrics. The seqeval results are printed.

    @:param None
    @:return sentences_to_print: list of dictionaries ("sentence", "tokens", "true_labels",
                                 "predictions"), one per sentence that was not labelled perfectly

  """
  test_pred_labels = []
  test_true_labels = []
  sentences_to_print = []

  # Get dataset from Hugging Face
  _, test = get_dataset("fede-m/setfit_dataset_coreference_folds")
  # Get dataset from local folder
  # _, test = get_dataset("dataset/pipeline/data_sent_coref_fold")

  original_labels = get_original_labels("labels/id2labels.json")
  # Get test set word-tokenized and with aligned labels per token
  test_word_tok = get_tokens_labels(test, original_labels)

  # Inference only: switch off dropout and run on whatever device the model lives on
  model.eval()
  model_device = model.device

  for test_sent, tokenized_test_sent in zip(test, test_word_tok):

    pred = setfit_model(test_sent["text"]) # Use the non-word tokenized version for SetFit prediction
    curr_sent = tokenize_and_align_labels(tokenized_test_sent["words"], tokenized_test_sent["labels"]) # Get RoBERTa tokenization and labels alignment
    sent_tokens = tokenizer.tokenize(test_sent["text"])

    # NOTE: only the first segment of a sentence is evaluated; sentences long enough to be
    # split into several segments are scored on their first segment alone
    segment = curr_sent[0]
    gold_ids = segment["labels"]
    scored_positions = [position for position, gold_id in enumerate(gold_ids) if gold_id != -100]
    true_labels = [id2label[gold_ids[position]] for position in scored_positions]

    # Check Setfit prediction --> if prediction is one, then we apply the RoBERTa model for token classification
    if pred == 1:
      with torch.no_grad():
        token_classifier_output = model(
            input_ids=torch.tensor(segment["input_ids"]).unsqueeze(0).to(model_device),
            attention_mask=torch.tensor(segment["attention_mask"]).unsqueeze(0).to(model_device),
        )
      predicted_ids = torch.argmax(token_classifier_output.logits, dim=2)[0].cpu().tolist()
      predicted_labels = [id2label[predicted_ids[position]] for position in scored_positions]
    else:
      # If SetFit prediction is 0: assign label 0 ("O") to every scored token in the sentence
      predicted_labels = [id2label[0]] * len(scored_positions)

    test_pred_labels.append(predicted_labels)
    test_true_labels.append(true_labels)

    if predicted_labels == true_labels:
      print("This sentence was correctly classified")
    else:
      sentences_to_print.append({
          "sentence": test_sent["text"],
          "tokens": sent_tokens,
          "true_labels": true_labels,
          "predictions": predicted_labels,
      })

  # The label sequences have different lengths, so they are passed on as plain lists
  results = seqeval.compute(predictions=test_pred_labels, references=test_true_labels)
  print(results)

  return sentences_to_print


# Evaluate the complete pipeline; the returned list holds the sentences that were not labelled perfectly
sentences_to_print = test_roberta_classifier_results()

