# Fine Tuning BERT 
Fine-tuning of the BERT model to classify IMDB reviews as spoiler / non-spoiler.

### Set the right GPU to use

In [1]:
# Enumerate GPUs in PCI bus order and expose only device 0 to this kernel.
# NOTE(review): presumably these must be set before CUDA is initialised (i.e. before
# the torch import below) to take effect — confirm when reordering cells.
%env CUDA_DEVICE_ORDER=PCI_BUS_ID
%env CUDA_VISIBLE_DEVICES=0

env: CUDA_DEVICE_ORDER=PCI_BUS_ID
env: CUDA_VISIBLE_DEVICES=0


### Import Libraries
We used scikit-learn for dataset splitting, and the Hugging Face `transformers` library to download the model and perform training.

In [2]:
import transformers
import torch
import numpy as np
import random
import pandas as pd
from transformers import AutoTokenizer,AutoModelForSequenceClassification
import ast
from sklearn.model_selection import train_test_split
import datasets
from datasets import Dataset, DatasetDict

In [3]:
from transformers import Trainer,TrainingArguments

2024-06-18 09:36:19.763085: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-06-18 09:36:19.799208: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-18 09:36:19.799231: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-18 09:36:19.800204: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-06-18 09:36:19.806014: I tensorflow/core/platform/cpu_feature_guar

### Read the dataset

In [4]:
# Load the raw IMDB reviews; lines=True reads the file as one JSON record per line.
dataRew = pd.read_json(path_or_buf="../Dataset/IMDB_reviews.json", lines=True)

In [5]:
dataRew

Unnamed: 0,review_date,movie_id,user_id,is_spoiler,review_text,rating,review_summary
0,10 February 2006,tt0111161,ur1898687,True,"In its Oscar year, Shawshank Redemption (writt...",10,A classic piece of unforgettable film-making.
1,6 September 2000,tt0111161,ur0842118,True,The Shawshank Redemption is without a doubt on...,10,Simply amazing. The best film of the 90's.
2,3 August 2001,tt0111161,ur1285640,True,I believe that this film is the best story eve...,8,The best story ever told on film
3,1 September 2002,tt0111161,ur1003471,True,"**Yes, there are SPOILERS here**This film has ...",10,Busy dying or busy living?
4,20 May 2004,tt0111161,ur0226855,True,At the heart of this extraordinary movie is a ...,8,"Great story, wondrously told and acted"
...,...,...,...,...,...,...,...
573908,8 August 1999,tt0139239,ur0100166,False,"Go is wise, fast and pure entertainment. Assem...",10,The best teen movie of the nineties
573909,31 July 1999,tt0139239,ur0021767,False,"Well, what shall I say. this one´s fun at any ...",9,Go - see the movie
573910,20 July 1999,tt0139239,ur0392750,False,"Go is the best movie I have ever seen, and I'v...",10,It's the best movie I've ever seen
573911,11 June 1999,tt0139239,ur0349105,False,Call this 1999 teenage version of Pulp Fiction...,3,Haven't we seen this before?


Drop the columns that are not needed for classification.

In [6]:
# Keep only `is_spoiler` and `review_text`; the remaining columns are discarded.
dataRew = dataRew.drop(
    columns=["movie_id", "rating", "review_date", "user_id", "review_summary"]
)

In [7]:
dataRew

Unnamed: 0,is_spoiler,review_text
0,True,"In its Oscar year, Shawshank Redemption (writt..."
1,True,The Shawshank Redemption is without a doubt on...
2,True,I believe that this film is the best story eve...
3,True,"**Yes, there are SPOILERS here**This film has ..."
4,True,At the heart of this extraordinary movie is a ...
...,...,...
573908,False,"Go is wise, fast and pure entertainment. Assem..."
573909,False,"Well, what shall I say. this one´s fun at any ..."
573910,False,"Go is the best movie I have ever seen, and I'v..."
573911,False,Call this 1999 teenage version of Pulp Fiction...


### Map True and False to 1 and 0


In [8]:
# Encode the boolean spoiler flag as an integer class id (True -> 1, False -> 0).
spoiler_flags = dataRew['is_spoiler']
dataRew['is_spoiler'] = spoiler_flags.map({False: 0, True: 1})

In [9]:
# Rename the target column to 'label'.
dataRew = dataRew.rename({'is_spoiler': 'label'}, axis="columns")

### Split Dataset
We split the dataset using the `stratify` parameter so that every split keeps the same class proportions as the full dataset.

### Train & Test

In [10]:
# Hold out 20% of the reviews as the test set, stratified on the label.
train, test = train_test_split(
    dataRew,
    test_size=0.2,
    random_state=42,
    stratify=dataRew['label'],
)

In [11]:
train['label'].value_counts()

label
0    338391
1    120739
Name: count, dtype: int64

In [12]:
test['label'].value_counts()

label
0    84598
1    30185
Name: count, dtype: int64

### Split the training set into Training and Validation

In [13]:
# Carve 20% of the remaining training data out as the validation set, stratified on the label.
train, val = train_test_split(
    train,
    test_size=0.2,
    random_state=42,
    stratify=train['label'],
)

In [14]:
train['label'].value_counts()

label
0    270713
1     96591
Name: count, dtype: int64

In [15]:
val['label'].value_counts()

label
0    67678
1    24148
Name: count, dtype: int64

In [16]:
test['label'].value_counts()

label
0    84598
1    30185
Name: count, dtype: int64

In [17]:
test

Unnamed: 0,label,review_text
322757,1,After seeing Sixth sense and Unbreakable i had...
280946,0,I rented this movie from my local video shop y...
269020,0,I was lucky enough to win tickets to an advanc...
342401,0,"Despite some awful critics, I have to say that..."
420689,0,"Found it a fascinating film as it went along, ..."
...,...,...
339747,0,Amazing. Absolutely astounding. Tom Cruise i...
11746,0,This movie defines excellence in the field of ...
45415,0,A gripping tale of murder and one of the best ...
321264,0,Having seen this movie twice over the weekend ...


In [18]:
train

Unnamed: 0,label,review_text
82889,0,Welcome to my Punner Island where all your pun...
198457,1,The movie is not bad. That's what I gotta say ...
330749,0,"An intense, depressing movie. It sticks pretty..."
57902,0,"If you are just an average movie fan, then you..."
71005,0,Their whole life is a solid game from start to...
...,...,...
545532,0,I absolutely hated Dumb & Dumber and only got ...
57859,0,I sat down through 2 hours of pure boredom. I ...
101697,0,The Wizard of Oz is an American classic undoub...
328531,1,I didn't like this movie at all. The plot was ...


### Convert the Dataset
Convert the pandas DataFrames into Hugging Face `Dataset` objects.

In [19]:
# Wrap each pandas split (train / validation / test) in a Hugging Face Dataset.
Train, Eval, Test = (Dataset.from_pandas(frame) for frame in (train, val, test))

In [20]:
# Drop the "__index_level_0__" column from every split.
Train, Eval, Test = (
    split.remove_columns("__index_level_0__") for split in (Train, Eval, Test)
)

In [21]:
# Tokenizer matching the cased BERT-base checkpoint, identified by its Hub id.
TOKENIZER_ID = "google-bert/bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_ID)



### Tokenization
Tokenization is done with padding and truncation to a fixed maximum length, measured in tokens (not characters).

We tried different values of `max_length`, increasing it from 256 to 512; the code below uses 512.

In [22]:
def encodeBig(text):
    """Tokenize the `review_text` field of a batch with the module-level `tokenizer`.

    Every sequence is truncated or padded to exactly 512 tokens.
    """
    reviews = text['review_text']
    return tokenizer(reviews, max_length=512, truncation=True, padding="max_length")

In [23]:
# Tokenize the training split in batches.
Train = Train.map(function=encodeBig, batched=True)


Map:   0%|          | 0/367304 [00:00<?, ? examples/s]

In [24]:
# Tokenize the validation split in batches.
Eval = Eval.map(function=encodeBig, batched=True)

Map:   0%|          | 0/91826 [00:00<?, ? examples/s]

In [25]:
# Tokenize the test split in batches.
Test = Test.map(function=encodeBig, batched=True)

Map:   0%|          | 0/114783 [00:00<?, ? examples/s]

In [26]:
from transformers import TrainingArguments, Trainer

#### Parameters of the models

In [27]:
# Hyperparameters consumed by TrainingArguments below.
EPOCHS = 5            # number of training epochs
BATCH_SIZE = 16       # per-device batch size (training and evaluation)
LR = 2e-5             # learning rate
WEIGHT_DECAY = 0.01   # weight decay


In [28]:
torch.cuda.current_device()

0

In [29]:
#model = AutoModelForSequenceClassification.from_pretrained("google-bert/bert-base-cased", num_labels=2)

In [30]:
# Seed Python, NumPy and PyTorch before loading: the classification head is not in
# the checkpoint and is randomly initialised, so without a seed every run starts
# from different classifier weights and results are not reproducible.
transformers.set_seed(42)

# num_labels=2 (spoiler / non-spoiler) is stated explicitly instead of relying on
# whatever default the local checkpoint's config carries.
model = AutoModelForSequenceClassification.from_pretrained(
    "/opt/models/bert-base-cased",
    num_labels=2,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at /opt/models/bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [31]:


# Training configuration: evaluate after every epoch, never save checkpoints,
# and enable fp16.
training_args = TrainingArguments(
    output_dir="test_dir",
    num_train_epochs=EPOCHS,
    learning_rate=LR,
    weight_decay=WEIGHT_DECAY,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    evaluation_strategy="epoch",
    save_strategy='no',
    fp16=True,
)


### Functions used to compute the metrics
We tried two different approaches: the **weighted metric**, which averages the per-class metrics weighted by the number of samples in each class, and the **binary metric**, which calculates the result only for the positive class.

In [32]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
def compute_metricsweighted(pred):
    """Return accuracy and weighted-average F1 / precision / recall.

    `pred` must expose `label_ids` (true labels) and `predictions`; the
    predicted class is the argmax over the last axis of `predictions`.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    prf = precision_recall_fscore_support(y_true, y_pred, average='weighted')
    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'f1': prf[2],
        'precision': prf[0],
        'recall': prf[1],
    }

In [33]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
def compute_metricsbinary(pred):
    """Return accuracy and binary-average F1 / precision / recall.

    `pred` must expose `label_ids` (true labels) and `predictions`; the
    predicted class is the argmax over the last axis of `predictions`.
    With average='binary' the precision, recall and F1 are computed for the
    positive class only.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    prf = precision_recall_fscore_support(y_true, y_pred, average='binary')
    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'f1': prf[2],
        'precision': prf[0],
        'recall': prf[1],
    }

In [34]:
# Move the model to the GPU; the cell output is the module's repr.
# NOTE(review): presumably redundant, since Trainer places the model on args.device — confirm.
model.cuda()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [35]:
#model=model.to(device)

In [36]:
# Trainer for the raw-review run: validates on Eval and reports the binary
# (positive-class) metrics.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metricsbinary,
    train_dataset=Train,
    eval_dataset=Eval,
)

In [37]:
trainer.args.device

device(type='cuda', index=0)

In [None]:
# Run fine-tuning; the returned object exposes the run summary via `.metrics`.
history = trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.4645,0.456971,0.793773,0.44406,0.762784,0.313194
2,0.4272,0.45163,0.798423,0.543459,0.671932,0.456228
3,0.396,0.491398,0.795973,0.531847,0.670531,0.440699
4,0.3376,0.522656,0.786662,0.548638,0.618365,0.493043


In [46]:
history.metrics

{'train_runtime': 23038.5872,
 'train_samples_per_second': 79.715,
 'train_steps_per_second': 4.982,
 'total_flos': 4.832087153897472e+17,
 'train_loss': 0.38466114573099214,
 'epoch': 5.0}

In [47]:
# Keep a handle on the entries the trainer logged during training and evaluation.
log_history = trainer.state.log_history

In [None]:
# Append one line per evaluation entry of the log history to the results file.
with open("../Output/outputBert.txt", "a") as f:
    for log in (entry for entry in log_history if 'eval_loss' in entry):
        print(f"Epoch: {log.get('epoch')}, Eval Loss: {log['eval_loss']}, Accuracy: {log['eval_accuracy']}, F1: {log['eval_f1']}, Precision: {log['eval_precision']}, Recall: {log['eval_recall']}", file=f)

In [48]:
# Show one line per evaluation entry of the log history.
for log in (entry for entry in log_history if 'eval_loss' in entry):
    print(f"Epoch: {log.get('epoch')}, Eval Loss: {log['eval_loss']}, Accuracy: {log['eval_accuracy']}, F1: {log['eval_f1']}, Precision: {log['eval_precision']}, Recall: {log['eval_recall']}")

Epoch: 1.0, Eval Loss: 0.45697101950645447, Accuracy: 0.7937730054668612, F1: 0.4440595367407451, Precision: 0.7627836611195159, Recall: 0.3131936392247805
Epoch: 2.0, Eval Loss: 0.45162999629974365, Accuracy: 0.7984231045673339, F1: 0.5434589581689029, Precision: 0.6719321785801415, Recall: 0.45622825906907405
Epoch: 3.0, Eval Loss: 0.49139803647994995, Accuracy: 0.7959728181560778, F1: 0.5318473724980635, Precision: 0.670531157456997, Recall: 0.44069902269339073
Epoch: 4.0, Eval Loss: 0.5226563811302185, Accuracy: 0.7866617297933047, F1: 0.5486383115985438, Precision: 0.6183650150618053, Recall: 0.4930429021036939
Epoch: 5.0, Eval Loss: 0.5912967920303345, Accuracy: 0.7824363470041165, F1: 0.5401648022832942, Precision: 0.6080422841745259, Recall: 0.48592015901938046
Epoch: 5.0, Eval Loss: 0.5912967920303345, Accuracy: 0.7824363470041165, F1: 0.5401648022832942, Precision: 0.6080422841745259, Recall: 0.48592015901938046
Epoch: 5.0, Eval Loss: 0.5896179676055908, Accuracy: 0.783783312

In [None]:
# Evaluate on the trainer's default eval dataset (the validation split passed as eval_dataset).
evaluationRes = trainer.evaluate()

In [None]:
evaluationRes

In [None]:
# Evaluate on the held-out test split.
# NOTE: this rebinds `log_history` from trainer.state.log_history to the
# metrics returned by evaluate(), which the next cell indexes by 'eval_*' keys.
log_history = trainer.evaluate(eval_dataset=Test)

In [None]:
# Append the test-set metrics (a header line, then the metrics line) to the results file.
with open("../Output/outputBert.txt", "a") as f:
    print(
        "Result on Test",
        f"Eval Loss: {log_history['eval_loss']}, Accuracy: {log_history['eval_accuracy']}, F1: {log_history['eval_f1']}, Precision: {log_history['eval_precision']}, Recall: {log_history['eval_recall']}",
        sep="\n",
        file=f,
    )

# BERT applied to the clean dataset

## Read the clean Dataset

In [None]:
# Load the cleaned dataset and keep only the cleaned review and the spoiler flag.
CleanData = pd.read_csv("../Dataset/datiClean.csv")[["clean_review", "is_spoiler"]]

In [None]:
# The two helpers below were referenced but never defined in this notebook, so the
# cell failed with NameError on a fresh kernel.
# NOTE(review): their semantics are inferred from their names and usage (the CSV is
# assumed to store each review as the string repr of a token list) — confirm
# against the notebook that produced datiClean.csv.
def safe_literal_eval(value):
    """Parse the string repr of a token list; return [] when it cannot be parsed."""
    if isinstance(value, (list, tuple)):
        return list(value)
    try:
        parsed = ast.literal_eval(value)
    except (ValueError, SyntaxError, TypeError):
        # Missing values (NaN) and malformed strings end up here.
        return []
    return list(parsed) if isinstance(parsed, (list, tuple)) else []


def join_tokens(tokens):
    """Join a sequence of tokens into a single space-separated string."""
    return " ".join(str(token) for token in tokens)


CleanData["clean_review"] = CleanData["clean_review"].apply(safe_literal_eval)
CleanData["whole__text"] = CleanData["clean_review"].apply(join_tokens)

In [None]:
# Encode the spoiler flag as 1 where it equals True and 0 everywhere else.
is_spoiler_mask = CleanData['is_spoiler'] == True
CleanData['is_spoiler_numeric'] = np.where(is_spoiler_mask, 1, 0)

In [None]:
# Rename the working columns to 'text' / 'label' and keep only those two.
column_names = {'whole__text': 'text', 'is_spoiler_numeric': 'label'}
CleanData = CleanData.rename(columns=column_names)[['text', 'label']]

In [None]:
# 80/20 stratified hold-out for the test set, then a second 80/20 stratified
# split of the remainder into training and validation.
train, test = train_test_split(
    CleanData, test_size=0.2, random_state=42, stratify=CleanData['label']
)
train, val = train_test_split(
    train, test_size=0.2, random_state=42, stratify=train['label']
)

### Tokenize the Dataset

In [None]:
# Convert each pandas split to a Hugging Face Dataset.  preserve_index=False keeps
# the DataFrame index out of the dataset, so there is no "__index_level_0__"
# column to remove afterwards (and no failure when that column is absent).
Train = Dataset.from_pandas(train, preserve_index=False)
Eval = Dataset.from_pandas(val, preserve_index=False)
Test = Dataset.from_pandas(test, preserve_index=False)

In [None]:
def encodeBig(text):
    """Tokenize the `text` field of a batch with the module-level `tokenizer`.

    Redefines the earlier `encodeBig`, which read `review_text` instead.
    Every sequence is truncated or padded to exactly 512 tokens.
    """
    reviews = text['text']
    return tokenizer(reviews, max_length=512, truncation=True, padding="max_length")

In [None]:
# Tokenize the clean training split in batches.
Train = Train.map(function=encodeBig, batched=True)

In [None]:
# Tokenize the clean validation split in batches.
Eval = Eval.map(function=encodeBig, batched=True)

In [None]:
# Tokenize the clean test split in batches.
Test = Test.map(function=encodeBig, batched=True)

### Train the model

In [None]:
# Reload the pretrained checkpoint: `model` was already fine-tuned for 5 epochs on
# the raw reviews above, so reusing it would make the clean-data run a continuation
# of that training rather than an independent experiment.  The clean validation and
# test splits presumably also contain reviews that model was trained on.
transformers.set_seed(42)  # reproducible initialisation of the new classifier head
model = AutoModelForSequenceClassification.from_pretrained(
    "/opt/models/bert-base-cased",
    num_labels=2,
)

# Same training configuration and metrics as the raw-review run, on the clean splits.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=Train,
    eval_dataset=Eval,
    compute_metrics=compute_metricsbinary,
)

In [None]:
# Run fine-tuning on the clean dataset.
history = trainer.train()

In [None]:
# Keep a handle on the entries logged during the clean-data run.
BERTchistory = trainer.state.log_history

In [None]:
# Show one line per evaluation entry logged during the clean-data run.
for log in (entry for entry in BERTchistory if 'eval_loss' in entry):
    print(f"Epoch: {log.get('epoch')}, Eval Loss: {log['eval_loss']}, Accuracy: {log['eval_accuracy']}, F1: {log['eval_f1']}, Precision: {log['eval_precision']}, Recall: {log['eval_recall']}")

In [None]:
# Append the per-epoch evaluation logs of the clean-data run to its results file.
# Iterate BERTchistory (this run's trainer.state.log_history): at this point
# `log_history` holds the test metrics of the raw-review run, and iterating it
# yields its string keys, on which `log.get(...)` raises AttributeError.
with open("../Output/outputBertClean.txt", "a") as f:
    for log in BERTchistory:
        if 'eval_loss' in log:
            print(f"Epoch: {log.get('epoch')}, Eval Loss: {log['eval_loss']}, Accuracy: {log['eval_accuracy']}, F1: {log['eval_f1']}, Precision: {log['eval_precision']}, Recall: {log['eval_recall']}", file=f)

In [None]:
# Evaluate the clean-data model on the held-out clean test split; the next cell
# indexes the result by 'eval_*' keys.
log_history = trainer.evaluate(eval_dataset=Test)

In [None]:
# Append the clean test-set metrics (a header line, then the metrics line) to the results file.
with open("../Output/outputBertClean.txt", "a") as f:
    print(
        "Result on Test",
        f"Eval Loss: {log_history['eval_loss']}, Accuracy: {log_history['eval_accuracy']}, F1: {log_history['eval_f1']}, Precision: {log_history['eval_precision']}, Recall: {log_history['eval_recall']}",
        sep="\n",
        file=f,
    )