# Fine Tuning RoBERTa
In this notebook, we fine-tune RoBERTa using the `roberta-base` checkpoint (the base-size, case-sensitive model).

### Set the right GPU to use
We set `CUDA_DEVICE_ORDER` so that CUDA devices are enumerated by PCI bus ID, and `CUDA_VISIBLE_DEVICES` so that only device 0 is visible to this notebook.

In [None]:
%env CUDA_DEVICE_ORDER=PCI_BUS_ID
%env CUDA_VISIBLE_DEVICES=0

### Import Libraries
We use pandas to read the dataset, scikit-learn to split it, the Hugging Face `datasets` library to wrap the splits, and the Hugging Face `transformers` library to download the model and perform training.

In [None]:
import transformers
import torch
import numpy as np
import random
import ast
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict

In [None]:
from transformers import (
    RobertaTokenizerFast,
    RobertaForSequenceClassification,
    TrainingArguments,
    Trainer,
    AutoConfig,
)

### Functions for preprocessing the Clean dataset

In [None]:
#FUNCTIONS DEFINITION

#READ SPLIT TOKENS
def safe_literal_eval(val):
    """Parse `val` as a Python literal, falling back to `val` itself on failure.

    Args:
        val: Value handed to `ast.literal_eval` (e.g. the string form of a token list).

    Returns:
        The evaluated literal, or the unchanged `val` when `ast.literal_eval`
        raises `ValueError` or `SyntaxError` (the error is printed in that case).
    """
    try:
        parsed = ast.literal_eval(val)
    except (ValueError, SyntaxError) as e:
        # Report the problem but keep the pipeline going with the raw value.
        print(f"Error parsing value {val}: {e}")
        return val
    else:
        return parsed

#MERGE TOKENS AS A WHOLE TEXT
def join_tokens(token_list):
    """Join a list of tokens into one space-separated string.

    Args:
        token_list: A list of string tokens, or any other value.

    Returns:
        The tokens joined with single spaces when `token_list` is a `list`;
        otherwise `token_list` is returned unchanged.
    """
    if not isinstance(token_list, list):
        # Anything that is not a list is passed through untouched.
        return token_list
    return ' '.join(token_list)


## Original Dataset

### Read the Dataset
In this case we use the original version of the dataset.

In [None]:
# Load the raw IMDB reviews; `lines=True` reads the file as one JSON record per line.
dataRew = pd.read_json("../Dataset/IMDB_reviews.json", lines=True)

In [None]:
# Drop the metadata columns that the rest of the notebook does not use.
# The result is reassigned rather than mutated with `inplace=True`, so the cell
# reads as a plain transformation of its input.
dataRew = dataRew.drop(
    columns=["movie_id", "rating", "review_date", "user_id", "review_summary"]
)

Convert the dataset into a suitable form: the boolean `is_spoiler` flag becomes an integer `label` column.

In [None]:
# Encode the boolean spoiler flag as an integer class id (True -> 1, False -> 0)
# and expose it under the column name `label`.
dataRew = dataRew.assign(
    is_spoiler=dataRew['is_spoiler'].map({True: 1, False: 0})
).rename(columns={'is_spoiler': 'label'})

### Split the Dataset

In [None]:
# Hold out 20% of the reviews as the test set. Stratifying on `label` keeps the
# class ratio the same in both parts; `random_state` fixes the shuffle.
train, test = train_test_split(
    dataRew,
    test_size=0.2,
    stratify=dataRew['label'],
    random_state=42,
)

### Divide the Dataset into Validation and Training

In [None]:
# Carve a validation set (20%) out of the remaining training data, again stratified.
# NOTE: `train` is rebound here, so re-running this cell alone shrinks it further.
train, val = train_test_split(
    train,
    test_size=0.2,
    stratify=train['label'],
    random_state=42,
)

In [None]:
# Class distribution of the training split (sanity check on the stratified split).
train['label'].value_counts()

In [None]:
# Class distribution of the validation split.
val['label'].value_counts()

In [None]:
# Class distribution of the test split.
test['label'].value_counts()

### Convert the DataFrames to `Dataset` objects

In [None]:
# Wrap each pandas split in a `datasets.Dataset` (same order: train, validation, test).
Train, Eval, Test = (Dataset.from_pandas(frame) for frame in (train, val, test))

In [None]:
# Remove the `__index_level_0__` column from every split.
# presumably this column holds the pandas index carried over by `from_pandas` — confirm.
Train, Eval, Test = (
    split.remove_columns("__index_level_0__") for split in (Train, Eval, Test)
)

### Tokenization

In [None]:
# Load the fast RoBERTa tokenizer for the `roberta-base` checkpoint
# (the same checkpoint name that the model is loaded from).
tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")

In [None]:
def encodeBig(text):
    """Tokenize the `review_text` field of a batch with the notebook-level `tokenizer`.

    Args:
        text: Mapping with a `review_text` entry (this function is used with
            `Dataset.map(..., batched=True)`).

    Returns:
        The tokenizer output, truncated and padded to exactly 512 tokens.
    """
    tokenizer_options = dict(padding="max_length", truncation=True, max_length=512)
    return tokenizer(text['review_text'], **tokenizer_options)

In [None]:
# Tokenize the training split in batches.
Train = Train.map(encodeBig, batched=True)

In [None]:
# Tokenize the validation split in batches.
Eval = Eval.map(encodeBig, batched=True)

In [None]:
# Tokenize the test split in batches.
Test = Test.map(encodeBig, batched=True)

### Apply the Model

In [None]:
# Hyperparameters consumed by the `TrainingArguments` cell.
EPOCHS = 5            # number of training epochs
LR = 2e-5             # learning rate
WEIGHT_DECAY = 0.01   # weight decay
BATCH_SIZE = 16       # per-device batch size (used for both training and evaluation)


In [None]:

# Load the pretrained `roberta-base` weights into a sequence-classification model.
# No `num_labels` is passed, so the size of the classification head comes from the
# default configuration — presumably 2 classes; confirm.
model = RobertaForSequenceClassification.from_pretrained("roberta-base")

In [None]:


# Training configuration shared by both Trainer instances built in this notebook.
training_args = TrainingArguments(
    output_dir="test_dirRob",
    # Optimisation settings come from the hyperparameter cell.
    num_train_epochs=EPOCHS,
    learning_rate=LR,
    weight_decay=WEIGHT_DECAY,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    fp16=True,
    # Evaluate once per epoch, log to TensorBoard, and do not save checkpoints.
    evaluation_strategy="epoch",
    report_to="tensorboard",
    save_strategy='no',
)

### Function to compute the metrics
We tried two different approaches: the **weighted metric**, which averages the per-class scores weighted by the number of samples in each class, and the **binary metric**, which computes the scores only for the positive class. In the end, we report only the results of the binary metric, as it is the most informative one.

In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
def compute_metricsweighted(pred):
    """Compute accuracy plus weighted-average precision, recall and F1.

    Args:
        pred: Object exposing `label_ids` (reference labels) and `predictions`
            (scores reduced with `argmax` over the last axis to get the predicted class).

    Returns:
        dict with the keys 'accuracy', 'f1', 'precision' and 'recall'.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    # The fourth element of the returned tuple is not needed here.
    prec, rec, f_score, _ = precision_recall_fscore_support(
        y_true, y_pred, average='weighted'
    )
    return dict(
        accuracy=accuracy_score(y_true, y_pred),
        f1=f_score,
        precision=prec,
        recall=rec,
    )

In [None]:
def compute_metricsbinary(pred):
    """Compute accuracy plus precision, recall and F1 with `average='binary'`.

    Args:
        pred: Object exposing `label_ids` (reference labels) and `predictions`
            (scores reduced with `argmax` over the last axis to get the predicted class).

    Returns:
        dict with the keys 'accuracy', 'f1', 'precision' and 'recall'.
    """
    predicted = pred.predictions.argmax(-1)
    expected = pred.label_ids
    # Tuple layout: (precision, recall, f1, support); support is ignored.
    scores = precision_recall_fscore_support(expected, predicted, average='binary')
    results = {'accuracy': accuracy_score(expected, predicted)}
    results['f1'] = scores[2]
    results['precision'] = scores[0]
    results['recall'] = scores[1]
    return results

### Train the model

In [None]:
# Move the model to the GPU. As the last expression of the cell, the value returned
# by the call is also echoed as the cell output.
model.cuda()

In [None]:
# Trainer for the original dataset: trains on `Train`, evaluates on `Eval`,
# and reports the binary (positive-class) metrics.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metricsbinary,
    train_dataset=Train,
    eval_dataset=Eval,
)

In [None]:
# Run the fine-tuning and keep the object returned by `train()`.
history = trainer.train()

In [None]:
# Keep a handle on the trainer's log entries; the cells below treat this as a
# sequence of dict-like entries.
log_history = trainer.state.log_history

In [None]:
# Display the raw log entries collected during training.
log_history

In [None]:
 # Show which keys each evaluation entry (one containing 'eval_loss') carries.
 for log in log_history:
     if 'eval_loss' not in log:
         continue
     print(list(log.keys()))
        

### Save the output in a file

In [None]:
# Append the validation metrics of every evaluation entry to the results file.
with open("../Output/outputRoBERTa.txt", "a") as f:
    # Entries without an 'eval_loss' key are skipped.
    for log in (entry for entry in log_history if 'eval_loss' in entry):
        print(f"Epoch: {log.get('epoch')}, Eval Loss: {log['eval_loss']}, Accuracy: {log['eval_accuracy']}, F1: {log['eval_f1']}, Precision: {log['eval_precision']}, Recall: {log['eval_recall']}",file=f)

Apply the model to the test set

In [None]:
# Evaluate the fine-tuned model on the held-out test split.
# NOTE: this rebinds `log_history` from the training log entries to the metrics
# mapping returned by `evaluate` (it is indexed by metric name below).
log_history = trainer.evaluate(Test)

In [None]:
# Display the test-set metrics.
log_history

In [None]:
# Append the test-set metrics to the results file, under a "Result on Test" header line.
with open("../Output/outputRoBERTa.txt", "a") as f:
    print(
        "Result on Test",
        f"Eval Loss: {log_history['eval_loss']}, Accuracy: {log_history['eval_accuracy']}, F1: {log_history['eval_f1']}, Precision: {log_history['eval_precision']}, Recall: {log_history['eval_recall']}",
        sep="\n",
        file=f,
    )

## Clean Dataset

### Read the Dataset

In [None]:
# Load the cleaned dataset and keep only the review column and the spoiler flag.
CleanData = pd.read_csv("../Dataset/datiClean.csv").loc[:, ["clean_review", "is_spoiler"]]

In [None]:
# Parse each stored review back into a Python object, then build `whole__text`
# by joining the parsed tokens into one string.
CleanData = CleanData.assign(
    clean_review=CleanData["clean_review"].apply(safe_literal_eval),
    whole__text=lambda frame: frame["clean_review"].apply(join_tokens),
)

In [None]:
# Integer class id: 1 where `is_spoiler` equals True, 0 for every other value.
CleanData['is_spoiler_numeric'] = np.where(CleanData['is_spoiler'] == True, 1, 0)

In [None]:
# Rename the working columns to `text` / `label` and keep only those two.
CleanData = CleanData.rename(
    columns={'is_spoiler_numeric': 'label', 'whole__text': 'text'}
)[['text', 'label']]

In [None]:
# Same splitting scheme as for the original dataset: 20% test, then 20% of the
# remainder for validation, both stratified on `label` with a fixed seed.
train, test = train_test_split(
    CleanData, test_size=0.2, stratify=CleanData['label'], random_state=42
)
train, val = train_test_split(
    train, test_size=0.2, stratify=train['label'], random_state=42
)

### Tokenize the Dataset

In [None]:
# Wrap the pandas splits as Hugging Face datasets. `preserve_index=False` keeps the
# DataFrame index out of the dataset, so there is no `__index_level_0__` column to
# strip afterwards (and no failure if that column happens to be absent).
Train = Dataset.from_pandas(train, preserve_index=False)
Eval = Dataset.from_pandas(val, preserve_index=False)
Test = Dataset.from_pandas(test, preserve_index=False)

In [None]:
def encodeBig(text):
    """Tokenize the `text` field of a batch with the notebook-level `tokenizer`.

    This redefinition replaces the earlier `encodeBig`, which read `review_text`.

    Args:
        text: Mapping with a `text` entry (this function is used with
            `Dataset.map(..., batched=True)`).

    Returns:
        The tokenizer output, truncated and padded to exactly 512 tokens.
    """
    tokenizer_options = dict(padding="max_length", truncation=True, max_length=512)
    return tokenizer(text['text'], **tokenizer_options)

In [None]:
# Tokenize the clean training split in batches.
Train = Train.map(encodeBig, batched=True)

In [None]:
# Tokenize the clean validation split in batches.
Eval = Eval.map(encodeBig, batched=True)

In [None]:
# Tokenize the clean test split in batches.
Test = Test.map(encodeBig, batched=True)

### Train the model

In [None]:
# Start the clean-dataset experiment from fresh pretrained weights. The existing
# `model` object has already been fine-tuned on the original dataset; reusing it
# would continue that training, so the two experiments would not be comparable.
# NOTE(review): no explicit `.cuda()` here — the Trainer is expected to place the
# model on the training device itself; confirm against the Trainer documentation.
model = RobertaForSequenceClassification.from_pretrained("roberta-base")

# Trainer for the clean dataset: same arguments and metrics as the first experiment.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=Train,
    eval_dataset=Eval,
    compute_metrics=compute_metricsbinary,
)

In [None]:
# Run the fine-tuning on the clean dataset and keep the object returned by `train()`.
history = trainer.train()

In [None]:
# Log entries of the clean-dataset training run.
ROBERTAChistory = trainer.state.log_history

In [None]:
# Print the per-epoch validation metrics of the clean-dataset run.
# The loop reads `ROBERTAChistory` (this run's `trainer.state.log_history`).
# `log_history` cannot be used here: it was rebound to the metrics dict returned by
# `trainer.evaluate(Test)` in the first experiment, so iterating it yields key
# strings and `log.get(...)` fails.
for log in ROBERTAChistory:
    if 'eval_loss' in log:
        print(f"Epoch: {log.get('epoch')}, Eval Loss: {log['eval_loss']}, Accuracy: {log['eval_accuracy']}, F1: {log['eval_f1']}, Precision: {log['eval_precision']}, Recall: {log['eval_recall']}")

### Save the output in a file

In [None]:
# Append the per-epoch validation metrics of the clean-dataset run to its results file.
# The entries come from `ROBERTAChistory` (this run's `trainer.state.log_history`).
# `log_history` cannot be used here: it was rebound to the metrics dict returned by
# `trainer.evaluate(Test)` in the first experiment.
with open("../Output/outputRoBERTaClean.txt", "a") as f:
    for log in ROBERTAChistory:
        if 'eval_loss' in log:
            print(f"Epoch: {log.get('epoch')}, Eval Loss: {log['eval_loss']}, Accuracy: {log['eval_accuracy']}, F1: {log['eval_f1']}, Precision: {log['eval_precision']}, Recall: {log['eval_recall']}",file=f)

Apply the model to the test set

In [None]:
# Evaluate the clean-dataset model on its held-out test split; the result is a
# metrics mapping indexed by metric name in the next cell.
log_history = trainer.evaluate(Test)

In [None]:
# Append the clean-dataset test metrics to the results file, under a "Result on Test" header line.
with open("../Output/outputRoBERTaClean.txt", "a") as f:
    print(
        "Result on Test",
        f"Eval Loss: {log_history['eval_loss']}, Accuracy: {log_history['eval_accuracy']}, F1: {log_history['eval_f1']}, Precision: {log_history['eval_precision']}, Recall: {log_history['eval_recall']}",
        sep="\n",
        file=f,
    )