In [8]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.utils.data import DataLoader, Dataset
from tqdm.auto import tqdm

In [9]:
# Read the preprocessed reviews CSV into the working DataFrame.
input_csv_path = '../../data/processed/brad_reviews_preprocessed.csv'
df = pd.read_csv(input_csv_path)

In [2]:
# Identifier passed to `from_pretrained` for both the tokenizer and the model.
MODEL_NAME = 'CAMeL-Lab/bert-base-arabic-camelbert-da-sentiment'

In [None]:
# Pick the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the tokenizer and the classification model from the same checkpoint.
# `.eval()` returns the module itself, so it can be chained onto the load;
# it switches layers such as dropout to inference behaviour.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME).eval()

# Kept as the last expression so the cell still displays the model summary.
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [None]:
# Let PyTorch use up to 12 threads for intra-op parallelism on the CPU.
# This does not control DataLoader worker processes; those are set
# separately through the DataLoader's `num_workers` argument.
torch.set_num_threads(12)

In [5]:
# Translate the model's string labels into signed integer sentiment codes:
# negative -> -1, neutral -> 0, positive -> 1.
label_map = dict(negative=-1, neutral=0, positive=1)

In [6]:
class ReviewsDataset(Dataset):
    """Map-style dataset that serves review texts by position.

    Every stored item is guaranteed to be a ``str``. Missing entries
    (``None`` or float ``NaN``, which is what pandas yields for empty CSV
    cells) are replaced with ``""`` instead of being dropped, so the dataset
    keeps exactly one item per input element and predictions stay aligned
    with the source rows. Any other non-string value is converted with
    ``str()``.

    Args:
        texts: Iterable of review texts (for example a DataFrame column).
    """

    def __init__(self, texts):
        self.texts = [self._as_text(value) for value in texts]

    @staticmethod
    def _as_text(value):
        """Return ``value`` as a string, mapping missing values to ``""``."""
        if isinstance(value, str):
            return value
        # NaN is the only float that is not equal to itself.
        if value is None or (isinstance(value, float) and value != value):
            return ""
        return str(value)

    def __len__(self):
        """Return the number of stored texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the text stored at position ``idx``."""
        return self.texts[idx]

def collate_fn(batch_texts):
    """Tokenize one batch of raw texts with the module-level ``tokenizer``.

    Args:
        batch_texts: The items gathered for one batch by the DataLoader.

    Returns:
        Whatever ``tokenizer`` returns when called with padding and
        truncation enabled, ``max_length=256`` and ``return_tensors="pt"``.
    """
    tokenizer_options = {
        "padding": True,
        "truncation": True,
        "max_length": 256,
        "return_tensors": "pt",
    }
    return tokenizer(batch_texts, **tokenizer_options)

In [10]:
# Wrap the cleaned review column so the DataLoader can index it.
review_texts = df["review_clean"]
dataset = ReviewsDataset(review_texts)

# shuffle=False keeps batches in row order, which the later column
# assignment back onto `df` relies on.
# NOTE(review): worker processes combined with a notebook-defined
# `collate_fn` can misbehave on platforms that spawn workers — confirm that
# num_workers=10 works in the target environment.
loader_options = dict(
    batch_size=64,
    shuffle=False,
    num_workers=10,
    collate_fn=collate_fn,
)
loader = DataLoader(dataset, **loader_options)

In [11]:
# Accumulators for the per-review mapped label and its confidence score.
all_labels, all_scores = [], []

In [None]:
# Run batched inference over `loader` and collect, for every review, the
# mapped sentiment label (-1 / 0 / 1) and the probability of the predicted class.

# Reset the accumulators here so that re-running this cell starts from a
# clean slate instead of appending a second copy of the predictions.
all_labels = []
all_scores = []

# Class index -> label string; loop-invariant, so look it up once.
id2label = model.config.id2label

with torch.no_grad():
    for batch in tqdm(loader):
        # Move every tensor of the tokenized batch onto the model's device.
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        probs = torch.softmax(outputs.logits, dim=-1)
        # Highest class probability and its class index for each review.
        scores, preds = torch.max(probs, dim=-1)

        # Class index -> label string -> custom numeric label.
        all_labels.extend(label_map[id2label[p]] for p in preds.cpu().tolist())
        all_scores.extend(scores.cpu().tolist())

# One prediction per dataset item is required for the results to line up
# with the source rows when they are written back.
assert len(all_labels) == len(all_scores) == len(loader.dataset), (
    "number of predictions does not match the number of dataset items"
)

  0%|          | 0/7868 [00:00<?, ?it/s]

In [None]:
# Attach the predicted sentiment codes and their confidence scores to the
# DataFrame as two new columns (label column first, then score column).
df["camel_sentiment"], df["camel_score"] = all_labels, all_scores

In [None]:
# Write the annotated DataFrame to disk without the index column.
output_csv_path = "../../data/processed/brad_reviews_with_camel_sentiment.csv"
df.to_csv(output_csv_path, index=False)
print("Saved brad_reviews_with_camel_sentiment.csv")