## Grid Search

In [1]:
%env CUDA_DEVICE_ORDER=PCI_BUS_ID
%env CUDA_VISIBLE_DEVICES=0

env: CUDA_DEVICE_ORDER=PCI_BUS_ID
env: CUDA_VISIBLE_DEVICES=0


In [2]:
import transformers
import torch
import numpy as np
import random
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict

In [3]:
from transformers import (
    RobertaTokenizerFast,
    RobertaForSequenceClassification,
    TrainingArguments,
    Trainer,
    AutoConfig,
)

2024-06-16 11:27:11.988404: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


In [4]:
dataRew=pd.read_json("Dataset/IMDB_reviews.json",lines=True)

In [5]:
dataRew.drop(columns=["movie_id","rating","review_date","user_id","review_summary"],inplace=True)

In [6]:
dataRew['is_spoiler'] = dataRew['is_spoiler'].map({True: 1, False: 0})
dataRew = dataRew.rename(columns={'is_spoiler': 'label'})

In [7]:
train_temp, test_eval = train_test_split(dataRew, train_size=100000, stratify=dataRew['label'])

# Dividi il temp dataset in test ed evaluation
test, evaluation = train_test_split(test_eval, train_size=25000, stratify=test_eval['label'])

# Dividi il train dataset in train ed evaluation
train, train_eval = train_test_split(train_temp, train_size=50000, stratify=train_temp['label'])

In [8]:
Train = Dataset.from_pandas(train)
Eval = Dataset.from_pandas(train_eval)
Test = Dataset.from_pandas(test)

In [9]:
dataRewHug = Dataset.from_pandas(dataRew)

### Tokenization Roberta

In [10]:
tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")



In [11]:
def encodeBig(text):
    return tokenizer(text['review_text'], padding="max_length", truncation=True, max_length=256)

In [12]:
Train=Train.map(encodeBig,batched=True)

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [13]:
Eval=Eval.map(encodeBig,batched=True)

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [14]:
Test=Test.map(encodeBig,batched=True)

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

### Apply the Model

In [15]:
BATCH_SIZE = 16
WEIGHT_DECAY=0.01
LR = 2e-5
EPOCHS = 3


In [16]:
def compute_metrics2(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    return {
    'accuracy': acc,
    'f1': f1,
    'precision': precision,
    'recall': recall
    }

### Try Grid Search on Roberta

In [17]:
import optuna
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

In [18]:
def objective(trial):
    # Define hyperparameters to optimize
    learning_rate = trial.suggest_categorical("learning_rate", [2e-5, 1e-4,1e-6])
    batch_size = trial.suggest_categorical("per_device_train_batch_size", [16, 32, 64])
    weight_decay = trial.suggest_categorical("weight_decay", [0.0, 0.01,0.3])
    num_train_epochs = 3

    # Define training arguments
    training_args = TrainingArguments(
        output_dir="test_dirRob",
        evaluation_strategy='epoch',
        learning_rate=learning_rate,
        weight_decay=weight_decay,
        num_train_epochs=EPOCHS,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        save_strategy='no',
        fp16=True
    )
    
    # Define Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=Train,
        eval_dataset=Eval,
        compute_metrics=compute_metrics2,
    )
    
    # Train model
    trainer.train()
    
    # Evaluate model
    eval_result = trainer.evaluate()
    accuracy = eval_result['eval_accuracy']
    
    return accuracy

In [19]:

model = RobertaForSequenceClassification.from_pretrained("roberta-base")

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [20]:
model.cuda()

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
             

In [None]:
# Create a study and optimize
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=20)

[I 2024-06-16 11:27:54,648] A new study created in memory with name: no-name-24695b55-727d-497c-ad97-07392d64f8e9


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.5886,0.579478,0.73702,0.0,0.0,0.0


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
# Retrieve the best trial
best_trial = study.best_trial



In [None]:
# Print the results
print(f"Best Trial Number: {best_trial.number}")
print(f"Best Value: {best_trial.value}")
print("Best Parameters:")
for key, value in best_trial.params.items():
    print(f"    {key}: {value}")

### Print results of grid seacrh in a file


In [None]:
# Print the results
with open("outputGridSearchRob.txt", "a") as f:
    print(f"Best Trial Number: {best_trial.number}",file=f)
    print(f"Best Value: {best_trial.value}",file=f)
    print("Best Parameters:",file=f)
    for key, value in best_trial.params.items():
        print(f"    {key}: {value}",file=f)

### Tokenization BERT

In [None]:
tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-cased")

In [None]:
def encodeBig(text):
    return tokenizer(text['review_text'], padding="max_length", truncation=True, max_length=256)

In [None]:
Train=Train.map(encodeBig,batched=True)
Eval=Eval.map(encodeBig,batched=True)

### Try Grid Search on BERT

In [None]:
import numpy as np
import evaluate


model=AutoModelForSequenceClassification.from_pretrained("/opt/models/bert-base-cased")

In [None]:
model.cuda()

In [None]:
def objective(trial):
    # Define hyperparameters to optimize
    learning_rate = trial.suggest_categorical("learning_rate", [2e-5, 1e-4,1e-6])
    batch_size = trial.suggest_categorical("per_device_train_batch_size", [16, 32, 64])
    weight_decay = trial.suggest_categorical("weight_decay", [0.0, 0.01,0.3])
    num_train_epochs = 3

    # Define training arguments
    training_args = TrainingArguments(
        output_dir="test_dirRob",
        evaluation_strategy='epoch',
        learning_rate=learning_rate,
        weight_decay=weight_decay,
        num_train_epochs=EPOCHS,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        save_strategy='no',
        fp16=True
    )
    
    # Define Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=Train,
        eval_dataset=Eval,
        compute_metrics=compute_metrics2,
    )
    
    # Train model
    trainer.train()
    
    # Evaluate model
    eval_result = trainer.evaluate()
    accuracy = eval_result['eval_accuracy']
    
    return accuracy

In [None]:
# Create a study and optimize
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=20)

In [None]:
# Retrieve the best trial
best_trial = study.best_trial

In [26]:


# Print the results
print(f"Best Trial Number: {best_trial.number}")
print(f"Best Value: {best_trial.value}")
print("Best Parameters:")
for key, value in best_trial.params.items():
    print(f"    {key}: {value}")

Best Trial Number: 0
Best Value: 0.73702
Best Parameters:
    learning_rate: 0.0001
    per_device_train_batch_size: 16
    weight_decay: 0.0


In [27]:
# Print the results on a file
with open("outputGridSearchBERT.txt", "a") as f:
    print(f"Best Trial Number: {best_trial.number}",file=f)
    print(f"Best Value: {best_trial.value}",file=f)
    print("Best Parameters:",file=f)
    for key, value in best_trial.params.items():
        print(f"    {key}: {value}",file=f)