In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.utils.data import Dataset, DataLoader
from tqdm.auto import tqdm
import pandas as pd
print("Imported.")

Imported.


In [2]:
# -------- 1. Load data --------
# Preprocessed BRAD reviews; change the path if the dataset is mounted elsewhere.
df = pd.read_csv("/kaggle/input/brad-review-processed/brad_reviews_preprocessed.csv")
# Empty reviews are read back from the CSV as NaN. Fill them before casting,
# otherwise astype(str) turns them into the literal text "nan", which would
# then be tokenized and scored as if it were a real review.
texts = df["review_clean"].fillna("").astype(str).tolist()
print("Dataset Loaded.")

Dataset Loaded.


In [3]:
# -------- 2. Device (GPU) --------
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Using device:", device)

Using device: cuda


In [4]:
# -------- 3. Load model & tokenizer --------
# Hugging Face checkpoint used for 3-class sentiment classification.
MODEL_NAME = "CAMeL-Lab/bert-base-arabic-camelbert-da-sentiment"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Load the classifier and place its weights on the selected device in one step
# (`.to()` returns the module itself).
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME).to(device)
# Switch to evaluation mode for inference. eval() returns the model, so as the
# last expression of the cell it also displays the architecture.
model.eval()

tokenizer_config.json:   0%|          | 0.00/86.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/841 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

2025-12-22 15:09:35.226546: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1766416175.433391      55 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1766416175.493849      55 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1766416175.944608      55 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1766416175.944642      55 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1766416175.944646      55 computation_placer.cc:177] computation placer alr

pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [5]:
# -------- 4. Dataset & DataLoader (with workers) --------
class ReviewsDataset(Dataset):
    """Map-style dataset that serves raw review strings by position.

    No tokenization happens here; each item is returned exactly as it was
    stored, so batching/encoding is left to the DataLoader's collate function.
    """

    def __init__(self, texts: list) -> None:
        """Keep a reference to the sequence of review texts."""
        self.texts = texts

    def __len__(self) -> int:
        """Return how many reviews the dataset holds."""
        n_reviews = len(self.texts)
        return n_reviews

    def __getitem__(self, idx):
        """Return the review stored at position ``idx``."""
        review = self.texts[idx]
        return review

def collate_fn(batch_texts):
    """Tokenize one batch of raw review strings for the model.

    Uses the notebook-level ``tokenizer``. Sequences are padded to the longest
    item in the batch, truncated to at most 256 tokens, and returned as
    PyTorch tensors.
    """
    encode_options = {
        "padding": True,
        "truncation": True,
        "max_length": 256,
        "return_tensors": "pt",
    }
    return tokenizer(batch_texts, **encode_options)

BATCH_SIZE = 128   # try 128, lower to 64 if CUDA OOM
NUM_WORKERS = 4    # Kaggle usually handles 2-4 workers fine

dataset = ReviewsDataset(texts)

# shuffle=False keeps the batches in dataset order, so the collected
# predictions line up row-for-row with the source DataFrame.
loader = DataLoader(
    dataset=dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=NUM_WORKERS,
    collate_fn=collate_fn,
)

In [6]:
# -------- 5. Label mapping --------
# Numeric polarity assigned to each sentiment label name.
label_map = {
    "negative": -1,
    "neutral": 0,
    "positive": 1,
}
# Class-index -> label-name mapping shipped with the checkpoint. Always look
# labels up through this mapping instead of assuming an index order: this
# checkpoint reports {0: 'positive', 1: 'negative', 2: 'neutral'}.
id2label = model.config.id2label
print(id2label)

# Fail fast if the model can emit a label that label_map does not cover,
# rather than hitting a KeyError part-way through the long inference loop.
unmapped_labels = set(id2label.values()) - set(label_map)
if unmapped_labels:
    raise KeyError(f"Model labels missing from label_map: {sorted(unmapped_labels)}")

{0: 'positive', 1: 'negative', 2: 'neutral'}


In [7]:
# -------- 6. Inference on GPU --------
# Accumulators for the per-review mapped sentiment label and its confidence.
all_labels, all_scores = [], []

In [8]:
# Inference only: turn autograd off globally for the rest of the session.
torch.set_grad_enabled(mode=False)

torch.autograd.grad_mode.set_grad_enabled(mode=False)

In [9]:
# Start from empty result lists so that re-running this cell never appends
# duplicates to results left over from a previous run (which would make the
# lists longer than the DataFrame they are later attached to).
all_labels = []
all_scores = []

# Resolve class index -> numeric polarity once, outside the loop.
index_to_polarity = {idx: label_map[name] for idx, name in id2label.items()}

# Disable autograd locally so this cell does not depend on a global flag
# having been set in another cell.
with torch.inference_mode():
    for batch in tqdm(loader):
        # move batch to the selected device
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        probs = torch.softmax(outputs.logits, dim=-1)
        # highest class probability and the index of that class, per review
        scores, preds = torch.max(probs, dim=-1)

        all_labels.extend(index_to_polarity[p] for p in preds.cpu().tolist())
        all_scores.extend(scores.cpu().tolist())

  0%|          | 0/3934 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

In [10]:
# -------- 7. Save results --------
# Attach the mapped sentiment first and its confidence second, so the two new
# columns appear in that order in the exported file.
for column_name, column_values in (
    ("camel_sentiment", all_labels),
    ("camel_score", all_scores),
):
    df[column_name] = column_values

# index=False: do not write the DataFrame index as an extra CSV column.
df.to_csv(path_or_buf="brad_reviews_with_camel_sentiment_gpu.csv", index=False)

print("Saved brad_reviews_with_camel_sentiment_gpu.csv")

Saved brad_reviews_with_camel_sentiment_gpu.csv
