#DeBERTa for Sentiment Analysis


In [2]:
!pip install transformers datasets scikit-learn --quiet

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import DebertaTokenizer, DebertaForSequenceClassification, Trainer, TrainingArguments
import torch
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import time

In [5]:
# Load dataset
df = pd.read_csv('/content/Reviews.csv', on_bad_lines='warn', engine='python')
df = df[['Text', 'Score']].dropna()
df = df[df['Score'] != 3]
df['label'] = df['Score'].apply(lambda x: 1 if x > 3 else 0)

In [6]:
df.shape
df = df.sample(15000)

In [7]:
# Split data
train_texts, val_texts, train_labels, val_labels = train_test_split(df['Text'], df['label'], test_size=0.2, random_state=42)

In [8]:
# Tokenization
tokenizer = DebertaTokenizer.from_pretrained('microsoft/deberta-base')
train_encodings = tokenizer(list(train_texts), truncation=True, padding=True, max_length=128)
val_encodings = tokenizer(list(val_texts), truncation=True, padding=True, max_length=128)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [9]:
# Dataset class
class AmazonDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()} | {'labels': torch.tensor(self.labels[idx])}
    def __len__(self):
        return len(self.labels)

train_dataset = AmazonDataset(train_encodings, train_labels.tolist())
val_dataset = AmazonDataset(val_encodings, val_labels.tolist())

In [10]:
# Load model
model = DebertaForSequenceClassification.from_pretrained('microsoft/deberta-base', num_labels=2)

Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
from transformers import RobertaForSequenceClassification, Trainer, TrainingArguments
import os, time

os.environ["WANDB_DISABLED"] = "true"  # Disable W&B tracking

model = DebertaForSequenceClassification.from_pretrained('microsoft/deberta-base', num_labels=2)

training_args = TrainingArguments(
    output_dir='/content/drive/MyDrive/RoBERTa_Sentiment/results',
    num_train_epochs=2,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    eval_strategy="epoch",
    logging_dir='/content/drive/MyDrive/RoBERTa_Sentiment/logs',
    logging_steps=10,
    save_strategy="no"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [17]:
start_time = time.time()
trainer.train()
training_time = time.time() - start_time
print(f"Training time: {training_time:.2f} seconds")

Epoch,Training Loss,Validation Loss
1,0.3142,0.270833
2,0.0976,0.206156


Training time: 243.82 seconds


In [18]:
# Evaluation
start_test = time.time()
predictions = trainer.predict(val_dataset)
test_time = time.time() - start_test

y_pred = predictions.predictions.argmax(axis=-1)
y_true = val_labels[:len(y_pred)]

acc = accuracy_score(y_true, y_pred)
prec, rec, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='binary')

print(f"Accuracy: {acc}")
print(f"Precision: {prec}")
print(f"Recall: {rec}")
print(f"F1 Score: {f1}")
print(f"Training Time: {training_time:.2f} seconds")
print(f"Testing Time: {test_time:.2f} seconds")

Accuracy: 0.935
Precision: 0.9578454332552693
Recall: 0.9657615112160567
F1 Score: 0.9617871840094062
Training Time: 243.82 seconds
Testing Time: 8.77 seconds


In [None]:
# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)
# Train model
start_time = time.time()
trainer.train()
training_time = time.time() - start_time