<a href="https://colab.research.google.com/github/f247805/ABSA-Analysis-with-Explainability-Methods/blob/main/RobertbaseA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
from google.colab import files

# Colab-only step: opens the browser upload dialog and stores the chosen
# file(s) in the current working directory. The dataset file read by the
# loading cell below must be selected here.
uploaded = files.upload()  # Blocks until the upload dialog is completed


Saving FINAL_CLEANED_CORRECTED_SHUFFLED_DATASET_NO_DUPLICATE.jsonl to FINAL_CLEANED_CORRECTED_SHUFFLED_DATASET_NO_DUPLICATE.jsonl


In [3]:
# Import required libraries
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW
from torch.utils.data import TensorDataset, DataLoader
from sklearn.metrics import accuracy_score

In [4]:
from tqdm import tqdm

In [5]:
import pandas as pd

# Path of the JSON-lines dataset (one JSON object per line).
DATA_PATH = "FINAL_CLEANED_CORRECTED_SHUFFLED_DATASET_NO_DUPLICATE.jsonl"

df = pd.read_json(DATA_PATH, lines=True)

# Fail early with a clear message: preprocessing reads exactly these two columns.
_missing_columns = {"text", "labels"} - set(df.columns)
if _missing_columns:
    raise KeyError(f"Dataset is missing required column(s): {sorted(_missing_columns)}")

# Initialize a label encoder for polarity.
# LabelEncoder keeps its classes in sorted order, so the resulting ids are
# negative=0, neutral=1, positive=2 regardless of the order listed here.
label_encoder = LabelEncoder()
label_encoder.fit(["positive", "negative", "neutral"])  # Fit on possible polarities

# Function to preprocess the data
def preprocess_data(df, encoder=None):
    """Flatten the dataset into one (text, aspect, polarity) row per label.

    Each row of ``df`` holds a sentence in ``"text"`` and an iterable of
    dictionaries in ``"labels"``; every dictionary must provide the keys
    ``"aspect"`` and ``"polarity"``. A sentence with several labels is
    repeated once per label; a sentence with no labels contributes nothing.

    Args:
        df: DataFrame with ``"text"`` and ``"labels"`` columns.
        encoder: Object with a ``transform(list_of_polarities)`` method used
            to turn polarity strings into numeric ids. Defaults to the
            module-level ``label_encoder``.

    Returns:
        Tuple ``(texts, aspects, polarities)`` where ``texts`` and ``aspects``
        are lists of equal length and ``polarities`` is whatever
        ``encoder.transform`` returns for the collected polarity strings.

    Raises:
        KeyError: if a label dictionary lacks ``"aspect"`` or ``"polarity"``.
    """
    if encoder is None:
        encoder = label_encoder

    texts = []
    aspects = []
    polarities = []

    # Iterate the two columns directly; iterrows() builds a Series per row
    # and is needlessly slow for a plain two-column read.
    for text, labels in zip(df["text"], df["labels"]):
        for label in labels:
            aspect = label["aspect"]
            polarity = label["polarity"]
            texts.append(text)
            aspects.append(aspect)
            polarities.append(polarity)

    # Encode polarities into numerical labels
    polarities = encoder.transform(polarities)

    return texts, aspects, polarities

# Preprocess the dataset: one (text, aspect, polarity) row per annotated aspect
texts, aspects, polarities = preprocess_data(df)

# Split the dataset into train, validation, and test sets
# First split: 70% train / 30% held out (fixed seed for reproducibility).
# NOTE(review): the split is done on the flattened rows, so a sentence with
# several aspects can have its rows spread over different splits; the splits
# are also not stratified by polarity. Consider a group-aware, stratified
# split if the reported scores must be free of sentence-level leakage.
train_texts, temp_texts, train_aspects, temp_aspects, train_polarities, temp_polarities = train_test_split(
    texts, aspects, polarities, test_size=0.3, random_state=42
)

# Second split: the held-out 30% is halved into validation and test (15% each).
val_texts, test_texts, val_aspects, test_aspects, val_polarities, test_polarities = train_test_split(
    temp_texts, temp_aspects, temp_polarities, test_size=0.5, random_state=42
)

# Report the number of (text, aspect) rows in each split
print(f"Train size: {len(train_texts)}")
print(f"Validation size: {len(val_texts)}")
print(f"Test size: {len(test_texts)}")

# Tokenize the data
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")


def tokenize_data(texts, aspects):
    """Encode every text together with its aspect as a sentence pair.

    Args:
        texts: list of sentences (first segment of each pair).
        aspects: list of aspect terms (second segment), aligned with ``texts``.

    Returns:
        The tokenizer output for the whole list, with padding and truncation
        enabled and PyTorch tensors requested.
    """
    encoding_options = {
        "padding": True,
        "truncation": True,
        "return_tensors": "pt",
    }
    return tokenizer(texts, aspects, **encoding_options)


# Encode the three splits in the order train, validation, test.
train_inputs, val_inputs, test_inputs = (
    tokenize_data(split_texts, split_aspects)
    for split_texts, split_aspects in (
        (train_texts, train_aspects),
        (val_texts, val_aspects),
        (test_texts, test_aspects),
    )
)

# Convert labels to PyTorch tensors (train, validation, test in that order)
train_labels, val_labels, test_labels = (
    torch.tensor(split_polarities)
    for split_polarities in (train_polarities, val_polarities, test_polarities)
)


def _as_tensor_dataset(encoded, label_tensor):
    """Bundle input ids, attention mask and labels into one TensorDataset."""
    return TensorDataset(encoded["input_ids"], encoded["attention_mask"], label_tensor)


# Create the datasets that the DataLoaders iterate over
train_dataset = _as_tensor_dataset(train_inputs, train_labels)
val_dataset = _as_tensor_dataset(val_inputs, val_labels)
test_dataset = _as_tensor_dataset(test_inputs, test_labels)

from torch.utils.data import DataLoader

# Options shared by all three loaders: two worker processes for loading and
# pinned host memory to speed up transfers to the GPU.
_loader_options = dict(num_workers=2, pin_memory=True)

# Create DataLoaders; only the training loader shuffles its samples.
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, **_loader_options)
val_loader = DataLoader(val_dataset, batch_size=16, **_loader_options)
test_loader = DataLoader(test_dataset, batch_size=16, **_loader_options)

# Debug DataLoader: show the tensor shapes of the first training batch only
for i, batch in enumerate(train_loader):
    input_ids, attention_mask, labels = batch
    print(f"Batch {i + 1}:")
    print(f"Input IDs shape: {input_ids.shape}")
    print(f"Attention mask shape: {attention_mask.shape}")
    print(f"Labels shape: {labels.shape}")
    break  # Only check the first batch
# Define device (CPU/GPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load the model exactly once and move it to the device *before* the optimizer
# is created, so the optimizer is bound to the parameters that get trained.
# (Previously the model and optimizer were built twice; the first pair was
# discarded after a wasted checkpoint load.)
model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=3)
model.to(device)  # Move model to GPU/CPU

# Initialize optimizer.
# torch.optim.AdamW replaces the AdamW class exported by `transformers`, which
# is deprecated and no longer available in recent releases. eps and
# weight_decay are set explicitly to the values the transformers class used
# by default, so the training hyperparameters stay the same.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)

# Training loop: fine-tune for a fixed number of epochs and report the mean
# training loss per batch after each epoch.
for epoch in range(3):  # Adjust epochs as needed
    model.train()
    total_loss = 0

    for batch in tqdm(train_loader, desc=f"Epoch {epoch + 1}"):
        input_ids, attention_mask, labels = batch

        # Move data to the same device as the model. The loaders pin host
        # memory, so the copies can be issued asynchronously.
        input_ids = input_ids.to(device, non_blocking=True)
        attention_mask = attention_mask.to(device, non_blocking=True)
        labels = labels.to(device, non_blocking=True)

        # Clear gradients before the forward pass so that nothing left over
        # from an interrupted or earlier run leaks into this update.
        optimizer.zero_grad()

        # Forward pass (the model computes the loss itself when labels are given)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        # Backward pass
        loss.backward()

        # Update weights
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch + 1}, Loss: {total_loss / len(train_loader)}")



Train size: 21569
Validation size: 4622
Test size: 4622


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Batch 1:
Input IDs shape: torch.Size([32, 129])
Attention mask shape: torch.Size([32, 129])
Labels shape: torch.Size([32])


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Using device: cuda
Using device: cuda


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1: 100%|██████████| 675/675 [07:10<00:00,  1.57it/s]


Epoch 1, Loss: 0.4905567312364777


Epoch 2: 100%|██████████| 675/675 [07:24<00:00,  1.52it/s]


Epoch 2, Loss: 0.3226955328847247


Epoch 3: 100%|██████████| 675/675 [07:25<00:00,  1.52it/s]

Epoch 3, Loss: 0.2519020741902016





In [7]:
# Evaluation: accuracy of the fine-tuned model on a held-out DataLoader
def evaluate(model, val_loader, device=None):
    """Compute the classification accuracy of ``model`` over ``val_loader``.

    The model is switched to evaluation mode and left in that mode.

    Args:
        model: model whose forward output exposes ``.logits`` with one row of
            class scores per sample.
        val_loader: iterable of batches indexed as
            ``(input_ids, attention_mask, labels)``.
        device: device on which inference runs. Defaults to the device of the
            model's first parameter, so the function does not depend on a
            global variable.

    Returns:
        The value returned by ``accuracy_score`` for the collected true labels
        and predictions.
    """
    if device is None:
        device = next(model.parameters()).device

    model.eval()
    true_labels = []
    preds = []
    with torch.no_grad():
        for batch in val_loader:
            # Access tensors by index (input_ids, attention_mask, labels)
            input_ids = batch[0].to(device)
            attention_mask = batch[1].to(device)
            # Labels are only compared on the host, so they are not sent to
            # the device and back.
            labels = batch[2]

            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            batch_preds = torch.argmax(logits, dim=1)

            true_labels.extend(labels.cpu().numpy())
            preds.extend(batch_preds.cpu().numpy())

    return accuracy_score(true_labels, preds)

# Evaluate on validation set (accuracy is a fraction; printed as a percentage)
val_accuracy = evaluate(model, val_loader)
print(f"Validation Accuracy: {val_accuracy * 100:.2f}%")
# Evaluate on test set
test_accuracy = evaluate(model, test_loader)
print(f"Test Accuracy: {test_accuracy * 100:.2f}%")

# Save the model
# Note: model and tokenizer are written to two different directories, so each
# has to be reloaded from its own path. The tokenizer call is the cell's last
# expression, so the tuple of written file paths is shown as the cell output.
model.save_pretrained("absa_roberta_model")
tokenizer.save_pretrained("absa_roberta_tokenizer")

Validation Accuracy: 87.11%
Test Accuracy: 86.69%


('absa_roberta_tokenizer/tokenizer_config.json',
 'absa_roberta_tokenizer/special_tokens_map.json',
 'absa_roberta_tokenizer/vocab.json',
 'absa_roberta_tokenizer/merges.txt',
 'absa_roberta_tokenizer/added_tokens.json')