In [None]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, get_linear_schedule_with_warmup
from tqdm.auto import tqdm
import numpy as np

In [None]:
# Read the cleaned LABR reviews from the Kaggle input mount, then display the column names.
labr_csv_path = '/kaggle/input/labr-clean/labr_cleaned.csv'
df = pd.read_csv(labr_csv_path)
df.columns

In [None]:
# 2. Create ground_truth: 1,2 -> 0, 3 -> 0.5, 4,5 -> 1
rating_to_norm = {1: 0, 2: 0, 3: 0.5, 4: 1, 5: 1}
df["ground_truth"] = df["rating"].map(rating_to_norm)

# Series.map leaves NaN for every rating that is not a key of the dict.
# Stop here instead of letting NaN targets reach the MSE loss during training.
unmapped_ratings = df.loc[df["ground_truth"].isna(), "rating"]
if not unmapped_ratings.empty:
    raise ValueError(
        f"{len(unmapped_ratings)} rows have a rating outside {sorted(rating_to_norm)}; "
        f"examples: {unmapped_ratings.unique()[:10].tolist()}"
    )

In [None]:
# 3. Prepare dataset for fine-tuning
class LABRSentimentDataset(Dataset):
    """Map-style dataset pairing review texts with float sentiment targets.

    Texts and labels are copied out of the dataframe into plain lists at
    construction time; tokenization happens lazily, one item per access.

    Args:
        df: dataframe holding the text and label columns.
        text_col: name of the column with the review text (cast to ``str``).
        label_col: name of the column with the numeric target.
        tokenizer: callable invoked as
            ``tokenizer(text, truncation=True, padding="max_length",
            max_length=max_len, return_tensors="pt")``. Required.
        max_len: length every encoded sequence is truncated/padded to.

    Raises:
        ValueError: if ``tokenizer`` is not supplied.
    """

    def __init__(self, df, text_col="review_text_clean", label_col="ground_truth", tokenizer=None, max_len=256):
        # The None default exists only to keep the keyword-style call signature;
        # reject it here so the failure is not deferred to the first __getitem__.
        if tokenizer is None:
            raise ValueError("LABRSentimentDataset requires a tokenizer")
        self.texts = df[text_col].astype(str).tolist()
        self.labels = df[label_col].tolist()
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of reviews in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the encoded review at ``idx`` plus its target under ``"labels"``.

        Every tensor produced by the tokenizer has its leading dimension
        squeezed away so the default DataLoader collation can stack items
        into a batch. ``"labels"`` is a float32 scalar tensor.
        """
        encoding = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt",
        )
        item = {key: tensor.squeeze(0) for key, tensor in encoding.items()}
        item["labels"] = torch.tensor(self.labels[idx], dtype=torch.float)
        return item

In [None]:
# 4. Split data
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows for validation; the fixed seed makes the split repeatable.
train_df, val_df = train_test_split(
    df,
    test_size=0.1,
    random_state=42,
)

In [None]:
# 5. Initialize tokenizer and model
# Both are loaded from the same checkpoint name; num_labels=1 requests a single output per review.
model_name = "CAMeL-Lab/bert-base-arabic-camelbert-da"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=1,
)

In [None]:
# 6. Create datasets and loaders
# Wrap each split in the dataset class, sharing the same tokenizer.
train_ds, val_ds = (
    LABRSentimentDataset(split_df, tokenizer=tokenizer)
    for split_df in (train_df, val_df)
)
# Only the training loader shuffles; validation keeps dataframe order and uses a larger batch.
train_loader = DataLoader(train_ds, shuffle=True, batch_size=16)
val_loader = DataLoader(val_ds, batch_size=32)

In [None]:
# 7. Fine-tune model
# Use the GPU when one is available, otherwise stay on the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
model.to(device)

# MSE loss on the single model output, optimized with AdamW.
criterion = torch.nn.MSELoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

# Schedule length is three passes over the training loader; the first 10% of steps are warmup.
num_training_steps = 3 * len(train_loader)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_training_steps=num_training_steps,
    num_warmup_steps=int(0.1 * num_training_steps),
)

In [None]:
# Run three epochs over train_loader, stepping the optimizer and the LR
# scheduler once per batch and showing the current batch loss in the progress bar.
model.train()
for epoch in range(3):
    loop = tqdm(train_loader, desc=f"Epoch {epoch+1}/3")
    for batch in loop:
        # Move every tensor of the batch (inputs and labels) to the training device.
        batch = {k: v.to(device) for k, v in batch.items()}
        optimizer.zero_grad()
        outputs = model(**batch)
        # Drop only the trailing logit dimension. A bare squeeze() would also
        # remove the batch dimension when the final batch holds a single review,
        # producing a 0-d prediction against a (1,)-shaped target.
        predictions = outputs.logits.squeeze(-1)
        loss = criterion(predictions, batch["labels"])
        loss.backward()
        optimizer.step()
        scheduler.step()
        loop.set_postfix(loss=loss.item())


In [None]:
# 8. Predict sentiment for all reviews
# Put the model into evaluation mode before running inference.
model.eval()
def predict_sentiment(texts, batch_size=32, max_length=256):
    """Score review texts with the notebook-level ``model``.

    Relies on the globals ``tokenizer``, ``model`` and ``device`` defined in
    earlier cells.

    Args:
        texts: list of strings to score.
        batch_size: number of texts tokenized and scored per forward pass.
        max_length: token limit handed to the tokenizer; longer texts are
            truncated. Each batch is padded to its own longest sequence.

    Returns:
        A flat list of floats, one per input text and in input order, given
        the single-output head the model is created with. Empty input gives
        an empty list.
    """
    # Set eval mode on every call so the result does not depend on whether a
    # training cell (which calls model.train()) was the last one executed.
    model.eval()
    all_preds = []
    for start in range(0, len(texts), batch_size):
        enc = tokenizer(
            texts[start:start + batch_size],
            truncation=True,
            padding=True,
            max_length=max_length,
            return_tensors="pt",
        )
        enc = {k: v.to(device) for k, v in enc.items()}
        with torch.no_grad():
            logits = model(**enc).logits
        # squeeze(-1) removes only the logit dimension, so a batch with a
        # single text still converts to a one-element list rather than a bare float.
        all_preds.extend(logits.squeeze(-1).cpu().tolist())
    return all_preds

In [None]:
# Score every review in the dataframe (train and validation rows alike).
review_texts = df["review_text_clean"].astype(str).tolist()
preds = predict_sentiment(review_texts)

In [None]:
# 9. Convert predictions to 0, 0.5, or 1
def round_to_sentiment(score):
    """Bucket a continuous score into one of the labels 0, 0.5 or 1.

    Scores below 0.25 map to 0, scores below 0.75 map to 0.5, and anything
    else (including NaN, which fails both comparisons) maps to 1.
    """
    # (exclusive upper bound, label) pairs, checked in ascending order.
    for upper_bound, label in ((0.25, 0), (0.75, 0.5)):
        if score < upper_bound:
            return label
    return 1

# Store the bucketed prediction for every row alongside the original columns.
df["camel_sentiment"] = list(map(round_to_sentiment, preds))

In [None]:
# 10. Save updated dataframe
# Written to the working directory without the index column.
# NOTE(review): the file name says "balanced" while the input was labr_cleaned.csv — confirm the name is intended.
output_csv = "labr_balanced_with_sentiment.csv"
df.to_csv(output_csv, index=False)