# Fine Tuning RoBERTa
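In this notebook, we fine-tune RoBERTa (the `roberta-base` checkpoint, which is case-sensitive) as a binary classifier that predicts whether an IMDB review contains a spoiler.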

### Import Libraries
We use scikit-learn for dataset splitting, the Hugging Face `datasets` library to wrap the splits, and the Hugging Face `transformers` library to download the model and perform training.

In [1]:
# Number GPUs by PCI bus ID and make only device 0 visible to this kernel.
# These must be set before any CUDA-using library initialises the GPU.
%env CUDA_DEVICE_ORDER=PCI_BUS_ID
%env CUDA_VISIBLE_DEVICES=0

env: CUDA_DEVICE_ORDER=PCI_BUS_ID
env: CUDA_VISIBLE_DEVICES=0


In [2]:
import transformers
import torch
import numpy as np
import random
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict

In [3]:
from transformers import (
    RobertaTokenizerFast,
    RobertaForSequenceClassification,
    TrainingArguments,
    Trainer,
    AutoConfig,
)

2024-06-15 15:57:21.658346: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


### Read the Dataset
Here we use the original version of the dataset (`IMDB_reviews.json`, stored as JSON Lines).

In [4]:
# Load the raw reviews; lines=True reads the file as one JSON object per line.
dataRew = pd.read_json(
    "../Dataset/IMDB_reviews.json",
    lines=True,
)

In [5]:
# Drop the metadata columns listed below.
# Reassigning (instead of inplace=True) together with errors="ignore" keeps
# this cell idempotent: re-running it no longer raises KeyError because the
# columns were already removed by the previous execution.
dataRew = dataRew.drop(
    columns=["movie_id", "rating", "review_date", "user_id", "review_summary"],
    errors="ignore",
)

Convert the dataset into a suitable form: encode `is_spoiler` as 0/1 and rename the column to `label`.

In [6]:
# Encode the boolean spoiler flag as an integer (True -> 1, False -> 0) and
# expose it under the column name 'label'.
dataRew = (
    dataRew
    .assign(is_spoiler=dataRew['is_spoiler'].map({True: 1, False: 0}))
    .rename(columns={'is_spoiler': 'label'})
)

## Split the Dataset

In [7]:
# Hold out 20% of the reviews as the test set, stratified on the label so both
# parts keep the same class proportions; the fixed seed makes the split repeatable.
train, test = train_test_split(
    dataRew,
    test_size=0.2,
    random_state=42,
    stratify=dataRew['label'],
)

#### Split the Training Set into Training and Validation

In [8]:
# Carve 20% of the remaining training data out as the validation set,
# again stratified on the label.
# NOTE(review): `train` is both the input and an output of this call, so
# re-running this cell alone splits the already-reduced training set again.
train, val = train_test_split(
    train,
    test_size=0.2,
    random_state=42,
    stratify=train['label'],
)

In [9]:
# Class balance of the training split.
train["label"].value_counts()

label
0    270713
1     96591
Name: count, dtype: int64

In [10]:
# Class balance of the validation split.
val["label"].value_counts()

label
0    67678
1    24148
Name: count, dtype: int64

In [11]:
# Class balance of the test split.
test["label"].value_counts()

label
0    84598
1    30185
Name: count, dtype: int64

### Convert the Splits to Hugging Face `Dataset` Objects

In [12]:
# Wrap each pandas split in a Hugging Face Dataset (same order: train, val, test).
Train, Eval, Test = (
    Dataset.from_pandas(frame) for frame in (train, val, test)
)

In [13]:
# Remove the "__index_level_0__" column (the pandas index carried over by
# Dataset.from_pandas) from every split.
# The membership check makes the cell idempotent: on a re-run the column is
# already gone and remove_columns would otherwise raise.
Train, Eval, Test = (
    split.remove_columns("__index_level_0__")
    if "__index_level_0__" in split.column_names
    else split
    for split in (Train, Eval, Test)
)

### Tokenization

In [14]:
# Tokenizer matching the "roberta-base" checkpoint that is fine-tuned below.
tokenizer = RobertaTokenizerFast.from_pretrained(
    "roberta-base",
)



In [15]:
def encodeBig(text):
    """Tokenize the ``review_text`` entry of ``text`` with the notebook's tokenizer.

    Every sequence is truncated to, and padded up to, 256 tokens.

    Args:
        text: Mapping with a ``'review_text'`` key. It is used with
            ``Dataset.map(..., batched=True)``, so the value is a batch of reviews.

    Returns:
        Whatever the module-level ``tokenizer`` returns for that input.
    """
    reviews = text['review_text']
    encoded = tokenizer(
        reviews,
        max_length=256,
        truncation=True,
        padding="max_length",
    )
    return encoded

In [16]:
# Tokenize the training split in batches.
Train = Train.map(
    encodeBig,
    batched=True,
)

Map:   0%|          | 0/367304 [00:00<?, ? examples/s]

In [17]:
# Tokenize the validation split in batches.
Eval = Eval.map(
    encodeBig,
    batched=True,
)

Map:   0%|          | 0/91826 [00:00<?, ? examples/s]

In [18]:
# Tokenize the test split in batches.
Test = Test.map(
    encodeBig,
    batched=True,
)

Map:   0%|          | 0/114783 [00:00<?, ? examples/s]

### Apply the Model

Related fine-tuned model on the Hugging Face Hub, for reference: https://huggingface.co/bhavyagiri/roberta-base-finetuned-imdb-spoilers?text=Jack+Ryan+is+so+amazing

In [20]:
# Fine-tuning hyperparameters, consumed by TrainingArguments below.
EPOCHS = 5            # number of training epochs
BATCH_SIZE = 16       # per-device batch size for both training and evaluation
LR = 2e-5             # learning rate
WEIGHT_DECAY = 0.01   # weight decay


In [21]:

# Load the pretrained encoder with a sequence-classification head on top.
# The head weights (classifier.*) are not in the checkpoint and are newly
# initialised, hence the warning printed below.
model = RobertaForSequenceClassification.from_pretrained(
    "roberta-base",
)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [22]:


# Training configuration: evaluate once per epoch, log to TensorBoard,
# write no checkpoints (save_strategy='no') and train with fp16 enabled.
training_args = TrainingArguments(
    output_dir="test_dirRob",
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    learning_rate=LR,
    weight_decay=WEIGHT_DECAY,
    fp16=True,
    evaluation_strategy="epoch",
    save_strategy='no',
    report_to="tensorboard",
)

In [23]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
def compute_metricsweighted(pred):
    """Compute accuracy and weighted-average precision/recall/F1.

    Args:
        pred: Object exposing ``label_ids`` (true labels) and ``predictions``
            (per-class scores; the predicted class is the argmax over the
            last axis).

    Returns:
        dict with the keys ``'accuracy'``, ``'f1'``, ``'precision'`` and
        ``'recall'``.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    scores = precision_recall_fscore_support(y_true, y_pred, average='weighted')
    # The fourth element (support) is not reported.
    precision, recall, f1 = scores[0], scores[1], scores[2]
    return dict(
        accuracy=accuracy_score(y_true, y_pred),
        f1=f1,
        precision=precision,
        recall=recall,
    )

In [24]:
def compute_metricsbinary(pred):
    """Compute accuracy and binary-average precision/recall/F1.

    Args:
        pred: Object exposing ``label_ids`` (true labels) and ``predictions``
            (per-class scores; the predicted class is the argmax over the
            last axis).

    Returns:
        dict with the keys ``'accuracy'``, ``'f1'``, ``'precision'`` and
        ``'recall'``.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    scores = precision_recall_fscore_support(y_true, y_pred, average='binary')
    # The fourth element (support) is not reported.
    precision, recall, f1 = scores[0], scores[1], scores[2]
    return dict(
        accuracy=accuracy_score(y_true, y_pred),
        f1=f1,
        precision=precision,
        recall=recall,
    )

In [25]:
# Move the model to the GPU; as the last expression of the cell, the returned
# module is also displayed (the architecture printout below).
model.cuda()

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
             

In [26]:
# Assemble the Trainer: train on `Train`, evaluate on `Eval`, and report the
# binary-average metrics from compute_metricsbinary at each evaluation.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metricsbinary,
    eval_dataset=Eval,
    train_dataset=Train,
)

In [None]:
# Run the fine-tuning loop; the value returned by train() is kept in `history`.
history=trainer.train()

Epoch,Training Loss,Validation Loss


In [32]:
# Log records collected by the trainer: a list of dicts (training-loss entries
# and per-epoch evaluation entries), as shown in the next cell's output.
log_history=trainer.state.log_history

In [33]:
# Display the collected log records.
log_history

[{'loss': 0.5211,
  'grad_norm': 4.651061534881592,
  'learning_rate': 1.99135775580433e-05,
  'epoch': 0.021779849283442957,
  'step': 500},
 {'loss': 0.4945,
  'grad_norm': 4.778931617736816,
  'learning_rate': 1.9826458160909527e-05,
  'epoch': 0.043559698566885914,
  'step': 1000},
 {'loss': 0.4917,
  'grad_norm': 5.919809341430664,
  'learning_rate': 1.9739338763775758e-05,
  'epoch': 0.06533954785032887,
  'step': 1500},
 {'loss': 0.4788,
  'grad_norm': 4.403026103973389,
  'learning_rate': 1.9652219366641986e-05,
  'epoch': 0.08711939713377183,
  'step': 2000},
 {'loss': 0.4932,
  'grad_norm': 4.562783241271973,
  'learning_rate': 1.9565099969508213e-05,
  'epoch': 0.1088992464172148,
  'step': 2500},
 {'loss': 0.471,
  'grad_norm': 5.766238212585449,
  'learning_rate': 1.947798057237444e-05,
  'epoch': 0.13067909570065775,
  'step': 3000},
 {'loss': 0.481,
  'grad_norm': 5.613156795501709,
  'learning_rate': 1.939086117524067e-05,
  'epoch': 0.1524589449841007,
  'step': 3500},

In [40]:
 # Show which fields each evaluation record carries (one line per epoch).
 for entry in log_history:
     if 'eval_loss' not in entry:
         # Training-loss records have no 'eval_loss'; skip them.
         continue
     print(list(entry))
        

['eval_loss', 'eval_accuracy', 'eval_f1', 'eval_precision', 'eval_recall', 'eval_runtime', 'eval_samples_per_second', 'eval_steps_per_second', 'epoch', 'step']
['eval_loss', 'eval_accuracy', 'eval_f1', 'eval_precision', 'eval_recall', 'eval_runtime', 'eval_samples_per_second', 'eval_steps_per_second', 'epoch', 'step']
['eval_loss', 'eval_accuracy', 'eval_f1', 'eval_precision', 'eval_recall', 'eval_runtime', 'eval_samples_per_second', 'eval_steps_per_second', 'epoch', 'step']
['eval_loss', 'eval_accuracy', 'eval_f1', 'eval_precision', 'eval_recall', 'eval_runtime', 'eval_samples_per_second', 'eval_steps_per_second', 'epoch', 'step']
['eval_loss', 'eval_accuracy', 'eval_f1', 'eval_precision', 'eval_recall', 'eval_runtime', 'eval_samples_per_second', 'eval_steps_per_second', 'epoch', 'step']


In [41]:
# Append one summary line per evaluation record to the results file.
# Mode "a" appends, so re-running this cell adds the same lines again.
with open("../Dataset/outputRoB.txt", "a") as f:
    for log in log_history:
        if 'eval_loss' not in log:
            # Only evaluation records are written; training-loss records are skipped.
            continue
        line = f"Epoch: {log.get('epoch')}, Eval Loss: {log['eval_loss']}, Accuracy: {log['eval_accuracy']}, F1: {log['eval_f1']}, Precision: {log['eval_precision']}, Recall: {log['eval_recall']}"
        print(line, file=f)

In [42]:
# Evaluate the fine-tuned model on the held-out test split.
# NOTE(review): this rebinds `log_history` from the list of training log
# records to a single dict of test metrics, so the earlier cells that iterate
# over `log_history` must not be re-run after this one.
log_history = trainer.evaluate(
    eval_dataset=Test,
)

In [43]:
# Display the test-set metrics returned by trainer.evaluate above.
log_history

{'eval_loss': 0.5364547371864319,
 'eval_accuracy': 0.7842799020760914,
 'eval_f1': 0.5449265773464924,
 'eval_precision': 0.6119458433088417,
 'eval_recall': 0.49113798244161005,
 'eval_runtime': 199.5676,
 'eval_samples_per_second': 575.158,
 'eval_steps_per_second': 35.948,
 'epoch': 5.0}

In [45]:
# Append the test-set metrics to the same results file, under a header line.
with open("../Dataset/outputRoB.txt", "a") as f:
    f.write("Result on Test" + "\n")
    summary = f"Eval Loss: {log_history['eval_loss']}, Accuracy: {log_history['eval_accuracy']}, F1: {log_history['eval_f1']}, Precision: {log_history['eval_precision']}, Recall: {log_history['eval_recall']}"
    print(summary, file=f)

In [30]:
# Completion marker printed when execution reaches this cell ("Finito" is Italian for "finished").
print("Finito")

Finito


In [None]:
# Second completion marker; this cell has no recorded execution count or output.
print("Finito2")