In [1]:
# !pip install gdown



In [2]:
import gdown

file_id = ""
url = f"https://drive.google.com/uc?id={file_id}"

gdown.download(url, "arabic_sentence_level_back_translation.csv", quiet=False)

file_id = ""
url = f"https://drive.google.com/uc?id={file_id}"

gdown.download(url, "english_sentiment_150k_balanced.csv", quiet=False)

Downloading...
From: https://drive.google.com/uc?id=1yyfpg4bOtSZahmLAGublRveIphRT_ByB
To: /content/arabic_sentence_level_back_translation.csv
100%|██████████| 55.1M/55.1M [00:00<00:00, 60.9MB/s]
Downloading...
From (original): https://drive.google.com/uc?id=1Uxs_xFpQ-cUIEjbmOStamaAssNJX5O3M
From (redirected): https://drive.google.com/uc?id=1Uxs_xFpQ-cUIEjbmOStamaAssNJX5O3M&confirm=t&uuid=6befcc56-dd2f-4253-8dc0-8f27f044e667
To: /content/english_sentiment_150k_balanced.csv
100%|██████████| 73.5M/73.5M [00:00<00:00, 99.6MB/s]


'english_sentiment_150k_balanced.csv'

In [3]:
import pandas as pd
# Load Arabic CSV
arabic_df = pd.read_csv(
    "arabic_sentence_level_back_translation.csv"
)
arabic_df.sample(5)

Unnamed: 0,text,label
26482,في هذا الكتاب يتحدث الدكتور على الوردي عن ابن ...,2
60064,مرضي. الموقع. صغر حجم الغرفه والسرير غير مريح ...,1
19725,عجبتني أوي الأدعيه اللي كان بيبتهل بيها الحاكم...,2
31870,فندق جميل جدا . الموقع المميز توافر الخدمات ال...,2
6401,تجربتي للمطعم المطعم اكلة رائع واصحاب المطعم ج...,2


In [4]:
# Load English CSV
english_df = pd.read_csv(
    "english_sentiment_150k_balanced.csv"
)
english_df.sample(5)


Unnamed: 0,text,label
55506,"Different product, expired 2 years ago [SEP] I...",0
136077,Best when warm! [SEP] This bread is excellent ...,2
131749,"So good, even the cats eat it! [SEP] I have a ...",2
57639,"Excellent [SEP] The tea smells amazing, and th...",2
53271,"Great snack, great dessert [SEP] These chips a...",2


In [5]:
print(arabic_df.shape)
print(arabic_df.columns)
print(english_df.shape)
print(english_df.columns)

(115338, 2)
Index(['text', 'label'], dtype='object')
(150000, 2)
Index(['text', 'label'], dtype='object')


In [6]:
df_all = pd.concat([arabic_df, english_df], ignore_index=True)

df_all = df_all.sample(frac=1, random_state=42).reset_index(drop=True)

df_all = df_all[["text", "label"]]


In [7]:
print(df_all.isna().sum())
df_all = df_all.dropna()


text     1
label    0
dtype: int64


In [8]:
print(df_all.shape)
print(df_all["label"].value_counts())


(265337, 2)
label
2    119237
0     78128
1     67972
Name: count, dtype: int64


In [9]:
from sklearn.model_selection import train_test_split
# train data into train , val , split 
# First split:
# 80% training data
# 20% temporary data (will be split into validation and test)
# Stratification keeps the same label distribution across splits
train_df, temp_df = train_test_split(
    df_all,
    test_size=0.2,
    stratify=df_all["label"],
    random_state=42
)
# Second split:
# Split the remaining 20% into:
# 10% validation and 10% test
val_df, test_df = train_test_split(
    temp_df,
    test_size=0.5,
    stratify=temp_df["label"],
    random_state=42
)


 Load Tokenizer (XLM-RoBERTa)
 
 XLM-RoBERTa is a multilingual transformer model
 
 suitable for both Arabic and English text

In [10]:
from transformers import AutoTokenizer

model_name = "xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize(batch):
    """
    Convert raw text into model-readable tokens.
    - truncation: cut long sentences
    - padding: pad short sentences
    - max_length: fixed input size for the model
    """
    return tokenizer(
        batch["text"],
        truncation=True,
        padding="max_length",
        max_length=128
    )


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

In [11]:
from datasets import Dataset
#Convert Pandas DataFrames to HuggingFace Datasets
# Trainer works with HuggingFace Dataset objects, not Pandas
train_ds = Dataset.from_pandas(train_df)
val_ds   = Dataset.from_pandas(val_df)
test_ds  = Dataset.from_pandas(test_df)
# Apply Tokenization to Datasets
# Tokenize all datasets in batches for efficiency
train_ds = train_ds.map(tokenize, batched=True)
val_ds   = val_ds.map(tokenize, batched=True)
test_ds  = test_ds.map(tokenize, batched=True)
# Remove Raw Text Column
# After tokenization, the model only needs token IDs and masks
train_ds = train_ds.remove_columns(["text"])
val_ds   = val_ds.remove_columns(["text"])
test_ds  = test_ds.remove_columns(["text"])
# Set Dataset Format to PyTorch
# Required for Trainer to work with PyTorch tensors
train_ds.set_format("torch")
val_ds.set_format("torch")
test_ds.set_format("torch")


Map:   0%|          | 0/212269 [00:00<?, ? examples/s]

Map:   0%|          | 0/26534 [00:00<?, ? examples/s]

Map:   0%|          | 0/26534 [00:00<?, ? examples/s]

In [12]:
from transformers import AutoModelForSequenceClassification
# Load Model for Sequence Classification
# Number of sentiment classes (e.g., Negative, Neutral, Positive)

num_labels = train_df["label"].nunique()
# Load pretrained XLM-RoBERTa with a classification head

model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels
)

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation Metrics

In [13]:
from transformers import TrainingArguments, Trainer

from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import numpy as np
def compute_metrics(eval_pred):
    """
    Compute evaluation metrics during training:
    - Accuracy
    - Precision (weighted)
    - Recall (weighted)
    - F1-score (weighted)
    """
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)

    return {
        "accuracy": accuracy_score(labels, preds),
        "precision": precision_score(labels, preds, average="weighted"),
        "recall": recall_score(labels, preds, average="weighted"),
        "f1": f1_score(labels, preds, average="weighted")
    }

# Training Configuration

training_args = TrainingArguments(
    output_dir="./xlm_sentiment",       # Directory to save model checkpoints
    eval_strategy="epoch",              # Evaluate after each epoch
    save_strategy="epoch",              # Save model after each epoch
    learning_rate=2e-5,                 # Standard learning rate for transformers
    per_device_train_batch_size=8,      # Training batch size per GPU
    per_device_eval_batch_size=2,       # Evaluation batch size
    num_train_epochs=3,                 # Number of training epochs
    weight_decay=0.01,                  # Regularization to prevent overfitting
    logging_steps=100,                  # Log training info every 100 steps
    load_best_model_at_end=True,         # Load best model based on evaluation metric
    metric_for_best_model="f1",          # Use F1-score to select best model
    fp16=True                            # Use mixed precision for faster training
)

# Initialize Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)
# Start Training
# Train the sentiment classification model
trainer.train()


  trainer = Trainer(
  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: (1) Create a W&B account
[34m[1mwandb[0m: (2) Use an existing W&B account
[34m[1mwandb[0m: (3) Don't visualize my results
[34m[1mwandb[0m: Enter your choice:

 3


[34m[1mwandb[0m: You chose "Don't visualize my results"


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5195,0.505242,0.780885,0.782077,0.780885,0.781387
2,0.4606,0.510372,0.790495,0.789947,0.790495,0.789748
3,0.4188,0.553806,0.794264,0.794357,0.794264,0.794094


TrainOutput(global_step=79602, training_loss=0.4874940259663718, metrics={'train_runtime': 11145.346, 'train_samples_per_second': 57.137, 'train_steps_per_second': 7.142, 'total_flos': 4.188811655077862e+16, 'train_loss': 0.4874940259663718, 'epoch': 3.0})

In [14]:
from google.colab import drive
# Connect Google Drive to Colab to save the trained model permanently
drive.mount('/content/drive')
# Define the path where the model and tokenizer will be saved
save_path = "/content/drive/MyDrive/xlm_sentiment_model"
# Save the trained model weights
trainer.save_model(save_path)
# Save the tokenizer configuration and vocabulary
tokenizer.save_pretrained(save_path)


Mounted at /content/drive


('/content/drive/MyDrive/xlm_sentiment_model/tokenizer_config.json',
 '/content/drive/MyDrive/xlm_sentiment_model/special_tokens_map.json',
 '/content/drive/MyDrive/xlm_sentiment_model/sentencepiece.bpe.model',
 '/content/drive/MyDrive/xlm_sentiment_model/added_tokens.json',
 '/content/drive/MyDrive/xlm_sentiment_model/tokenizer.json')

In [15]:
# Evaluate the final model on unseen test data
trainer.evaluate(test_ds)

{'eval_loss': 0.5622738599777222,
 'eval_accuracy': 0.7947915881510514,
 'eval_precision': 0.7948292914015919,
 'eval_recall': 0.7947915881510514,
 'eval_f1': 0.7946899274877105,
 'eval_runtime': 240.2561,
 'eval_samples_per_second': 110.44,
 'eval_steps_per_second': 55.22,
 'epoch': 3.0}

In [16]:
import numpy as np
from sklearn.metrics import confusion_matrix, classification_report
# Get model predictions (logits) and true labels
preds_output = trainer.predict(test_ds)
# Convert logits to predicted class labels
preds = np.argmax(preds_output.predictions, axis=1)
# Extract true labels
labels = preds_output.label_ids
# Compute confusion matrix to analyze classification errors
cm = confusion_matrix(labels, preds)
print(cm)


[[ 6140  1320   353]
 [ 1145  4406  1246]
 [  239  1142 10543]]


In [17]:
# Display precision, recall, F1-score, and support for each class
print(classification_report(labels, preds))

              precision    recall  f1-score   support

           0       0.82      0.79      0.80      7813
           1       0.64      0.65      0.64      6797
           2       0.87      0.88      0.88     11924

    accuracy                           0.79     26534
   macro avg       0.78      0.77      0.77     26534
weighted avg       0.79      0.79      0.79     26534



**Load Trained Model and Tokenizer**

In [21]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch
# Path where the trained model and tokenizer are saved
save_path = "/content/drive/MyDrive/xlm_sentiment_model"

model = AutoModelForSequenceClassification.from_pretrained(save_path)## Load the fine-tuned sentiment classification model
tokenizer = AutoTokenizer.from_pretrained(save_path)## Load the corresponding tokenizer
# Set the model to evaluation mode (disables dropout)
model.eval()


The tokenizer you are loading from '/content/drive/MyDrive/xlm_sentiment_model' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.


XLMRobertaForSequenceClassification(
  (roberta): XLMRobertaModel(
    (embeddings): XLMRobertaEmbeddings(
      (word_embeddings): Embedding(250002, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): XLMRobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x XLMRobertaLayer(
          (attention): XLMRobertaAttention(
            (self): XLMRobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): XLMRobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=

In [31]:
# Sample Texts for Inference
# Test sentences in Arabic and English
test_texts = [
    "أنا مبسوطة جدًا بالمنتج ده",        # Expected: Positive
    "This product is terrible",          # Expected: Negative
    "الخدمة ممتازة",                     # Expected: Positive
    "اليوم كان عادي، لا حلو ولا وحش",    # Expected: Neutral
    "The movie was okay, nothing special" # Expected: Neutral
]


In [32]:
# Tokenize Input Texts
# Convert raw text into token IDs and attention masks
inputs = tokenizer(test_texts, padding=True, truncation=True, return_tensors="pt")


In [33]:
# Model Prediction
with torch.no_grad():
# Turn off gradient calculation since we are only testing the model
# This makes prediction faster and uses less memoryabs    
    outputs = model(**inputs)# Run the model on the input texts
    logits = outputs.logits    # Get the prediction scores for each class
    predictions = logits.argmax(dim=1)    # Choose the class with the highest score as the final prediction

In [34]:
# Display the model's internal label mapping
id2label = model.config.id2label
print(id2label)


{0: 'LABEL_0', 1: 'LABEL_1', 2: 'LABEL_2'}


In [35]:
# Manually define readable sentiment labels
id2label = {0: "Negative", 1: "Neutral", 2: "Positive"}


In [36]:
# Convert predicted class IDs to sentiment names
pred_labels = [id2label[p.item()] for p in predictions]
# Print each text with its predicted sentiment
for text, label in zip(test_texts, pred_labels):
    print(f"Text: {text} => Prediction: {label}")
# Create a DataFrame to store texts and their predicted sentiments
df_results = pd.DataFrame({
    "text": test_texts,
    "prediction": pred_labels
})

df_results

Text: أنا مبسوطة جدًا بالمنتج ده => Prediction: Positive
Text: This product is terrible => Prediction: Negative
Text: الخدمة ممتازة => Prediction: Positive
Text: اليوم كان عادي، لا حلو ولا وحش => Prediction: Neutral
Text: The movie was okay, nothing special => Prediction: Neutral


Unnamed: 0,text,prediction
0,أنا مبسوطة جدًا بالمنتج ده,Positive
1,This product is terrible,Negative
2,الخدمة ممتازة,Positive
3,اليوم كان عادي، لا حلو ولا وحش,Neutral
4,"The movie was okay, nothing special",Neutral
