In [1]:
# ============================================================================
# Install Required Packages for Advanced Arabic NLP
# ============================================================================

import subprocess
import sys

def install_package(package):
    """
    Install a Python package using pip.

    Args:
        package (str): The name of the package to install
    """
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", package])

# Define comprehensive list of required packages
required_packages = [
    "numpy",
    "pandas",
    "scikit-learn",
    "nltk",
    "joblib",
    "matplotlib",
    "seaborn",
    "transformers",
    "torch",
    "datasets",
    "accelerate",
    "arabert",
    "pyarabic",
    "farasapy",
    "imblearn",
    "optuna",
    "evaluate"
]

print("Installing required packages...\n")
for package in required_packages:
    try:
        install_package(package)
        print(f"[SUCCESS] {package} installed")
    except Exception as e:
        print(f"[WARNING] {package}: {str(e)}")

print("\n" + "="*70)
print("Package installation completed!")
print("="*70)


Installing required packages...

[SUCCESS] numpy installed
[SUCCESS] pandas installed
[SUCCESS] scikit-learn installed
[SUCCESS] nltk installed
[SUCCESS] joblib installed
[SUCCESS] matplotlib installed
[SUCCESS] seaborn installed
[SUCCESS] transformers installed
[SUCCESS] torch installed
[SUCCESS] datasets installed
[SUCCESS] accelerate installed
[SUCCESS] arabert installed
[SUCCESS] pyarabic installed
[SUCCESS] farasapy installed
[SUCCESS] imblearn installed
[SUCCESS] optuna installed
[SUCCESS] evaluate installed

Package installation completed!


In [2]:
# ============================================================================
# Import All Required Libraries
# ============================================================================

import numpy as np
import pandas as pd
import re
import json
import pickle
import warnings
from pathlib import Path

# Machine Learning Core
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score
)

# Deep Learning & Transformers
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import (
    AutoTokenizer,
    AutoModel,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback
)
from datasets import load_dataset

# Arabic NLP Preprocessing
from arabert.preprocess import ArabertPreprocessor
from pyarabic.araby import strip_diacritics, strip_tatweel

# Data Augmentation & Balancing
from imblearn.over_sampling import SMOTE, RandomOverSampler

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Suppress warnings
warnings.filterwarnings('ignore')

print("[SUCCESS] All libraries imported successfully")


[SUCCESS] All libraries imported successfully


In [3]:
# ============================================================================
# Cell 3: AraBERT-BiLSTM-Attention Model Architecture
# ============================================================================

import torch
import torch.nn as nn
from transformers import AutoModel

class AraBERT_BiLSTM_Attention(nn.Module):
    """
    Hybrid model combining:
    - AraBERT for contextual Arabic embeddings
    - BiLSTM for sequential modeling
    - Self-Attention for feature importance
    """

    def __init__(self, arabert_model_name, num_labels, hidden_dim=256, dropout=0.3):
        super(AraBERT_BiLSTM_Attention, self).__init__()

        # AraBERT encoder
        self.arabert = AutoModel.from_pretrained(arabert_model_name)
        self.arabert_dim = self.arabert.config.hidden_size  # 768 for BERT-base

        # BiLSTM layer
        self.bilstm = nn.LSTM(
            input_size=self.arabert_dim,
            hidden_size=hidden_dim,
            num_layers=2,
            batch_first=True,
            bidirectional=True,
            dropout=dropout if dropout > 0 else 0
        )

        # Self-attention mechanism
        self.attention = nn.Linear(hidden_dim * 2, 1)

        # Dropout
        self.dropout = nn.Dropout(dropout)

        # Classification head
        self.classifier = nn.Sequential(
            nn.Linear(hidden_dim * 2, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, num_labels)
        )

    def forward(self, input_ids, attention_mask=None, labels=None):
        # AraBERT encoding
        arabert_output = self.arabert(
            input_ids=input_ids,
            attention_mask=attention_mask
        )

        # Get sequence output (all token embeddings)
        sequence_output = arabert_output.last_hidden_state  # [batch, seq_len, 768]

        # BiLSTM encoding
        lstm_output, _ = self.bilstm(sequence_output)  # [batch, seq_len, hidden_dim*2]

        # Self-attention
        attention_scores = torch.tanh(self.attention(lstm_output))  # [batch, seq_len, 1]
        attention_weights = torch.softmax(attention_scores, dim=1)

        # Weighted sum
        attended_output = torch.sum(attention_weights * lstm_output, dim=1)  # [batch, hidden_dim*2]

        # Dropout
        attended_output = self.dropout(attended_output)

        # Classification
        logits = self.classifier(attended_output)  # [batch, num_labels]

        # Calculate loss if labels provided
        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits, labels)

        # Return in HuggingFace format
        from transformers.modeling_outputs import SequenceClassifierOutput
        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=None,
            attentions=None
        )

print("="*70)
print("MODEL ARCHITECTURE DEFINED")
print("="*70)
print("\n✅ AraBERT_BiLSTM_Attention class created")
print("\nModel Components:")
print("  1. AraBERT encoder (contextual embeddings)")
print("  2. BiLSTM (sequential modeling)")
print("  3. Self-Attention (feature weighting)")
print("  4. Classification head (emotion prediction)")
print("="*70)


MODEL ARCHITECTURE DEFINED

✅ AraBERT_BiLSTM_Attention class created

Model Components:
  1. AraBERT encoder (contextual embeddings)
  2. BiLSTM (sequential modeling)
  3. Self-Attention (feature weighting)
  4. Classification head (emotion prediction)


In [4]:
# ============================================================================
# Cell 4: Load and Combine Real Arabic Datasets from Multiple Sources
# ============================================================================

from datasets import load_dataset
import pandas as pd
import zipfile
import os
from pathlib import Path

print("="*70)
print("LOADING REAL ARABIC DATASETS FROM MULTIPLE SOURCES")
print("="*70)

combined_data = []
datasets_loaded = []

# ============================================================================
# Dataset 1: LABR
# ============================================================================

print("\n[1/7] Loading LABR dataset...")
try:
    labr = load_dataset("mohamedadaly/labr", split="train")
    print(f"   Loaded: {len(labr)} samples")

    for idx, item in enumerate(labr):
        if idx >= 20000:
            break

        rating = item.get('rating', 3)
        text = item.get('text', '')

        if rating >= 4:
            emotion = 'joy'
        elif rating <= 2:
            emotion = 'sadness'
        else:
            emotion = 'neutral'

        combined_data.append({
            'text': str(text),
            'emotion': emotion,
            'dialect': 'Modern Standard Arabic',
            'source': 'LABR'
        })

    print(f"   [SUCCESS] Added {len([d for d in combined_data if d['source'] == 'LABR'])} samples")
    datasets_loaded.append('LABR')
except Exception as e:
    print(f"   [ERROR]: {e}")

# ============================================================================
# Dataset 2: BRAD
# ============================================================================

print("\n[2/7] Loading BRAD dataset...")
try:
    brad = load_dataset("arbml/BRAD", split="train")
    print(f"   Loaded: {len(brad)} samples")

    for idx, item in enumerate(brad):
        if idx >= 10000:
            break

        combined_data.append({
            'text': str(item.get('text', '')),
            'emotion': 'neutral',
            'dialect': item.get('dialect', 'MSA'),
            'source': 'BRAD'
        })

    print(f"   [SUCCESS] Added {len([d for d in combined_data if d['source'] == 'BRAD'])} samples")
    datasets_loaded.append('BRAD')
except Exception as e:
    print(f"   [ERROR]: {e}")

# ============================================================================
# Dataset 3: HARD (from ZIP)
# ============================================================================

print("\n[3/7] Loading HARD dataset...")
try:
    hard_zip = "/content/balanced-reviews.zip"
    hard_path = "/tmp/hard"

    if os.path.exists(hard_zip):
        os.makedirs(hard_path, exist_ok=True)
        with zipfile.ZipFile(hard_zip, 'r') as z:
            z.extractall(hard_path)

        csv_files = list(Path(hard_path).glob("**/*.csv"))

        for csv_file in csv_files[:1]:
            df = pd.read_csv(csv_file)

            for _, row in df.iterrows():
                rating = row.get('rating', 3)

                if rating >= 4:
                    emotion = 'joy'
                elif rating <= 2:
                    emotion = 'sadness'
                else:
                    emotion = 'neutral'

                combined_data.append({
                    'text': str(row.get('review', '')),
                    'emotion': emotion,
                    'dialect': 'Modern Standard Arabic',
                    'source': 'HARD'
                })

        print(f"   [SUCCESS] Added {len([d for d in combined_data if d['source'] == 'HARD'])} samples")
        datasets_loaded.append('HARD')
except Exception as e:
    print(f"   [ERROR]: {e}")

# ============================================================================
# Dataset 4: AJGT (from XLSX)
# ============================================================================

print("\n[4/7] Loading AJGT dataset...")
try:
    ajgt_file = "/content/AJGT.xlsx"

    if os.path.exists(ajgt_file):
        df_ajgt = pd.read_excel(ajgt_file)

        for _, row in df_ajgt.iterrows():
            label = row.get('Sentiment', 'Neutral').lower()

            emotion = 'joy' if label == 'positive' else ('sadness' if label == 'negative' else 'neutral')

            combined_data.append({
                'text': str(row.get('Feed', '')),
                'emotion': emotion,
                'dialect': 'Levantine',
                'source': 'AJGT'
            })

        print(f"   [SUCCESS] Added {len([d for d in combined_data if d['source'] == 'AJGT'])} samples")
        datasets_loaded.append('AJGT')
except Exception as e:
    print(f"   [ERROR]: {e}")

# ============================================================================
# Dataset 5: QADI
# ============================================================================

print("\n[5/7] Loading QADI dataset...")
try:
    qadi = load_dataset("Abdelrahman-Rezk/Arabic_Dialect_Identification", split="train")

    label_names = ['OM', 'SD', 'SA', 'KW', 'QA', 'LB', 'JO', 'SY', 'IQ', 'MA', 'EG', 'PL', 'YE', 'BH', 'DZ', 'AE', 'TN', 'LY']

    qadi_sample = qadi.shuffle(seed=42).select(range(min(8000, len(qadi))))

    for item in qadi_sample:
        combined_data.append({
            'text': str(item['text']),
            'emotion': 'neutral',
            'dialect': label_names[item['label']],
            'source': 'QADI'
        })

    print(f"   [SUCCESS] Added {len([d for d in combined_data if d['source'] == 'QADI'])} samples")
    datasets_loaded.append('QADI')
except Exception as e:
    print(f"   [ERROR]: {e}")

# ============================================================================
# Dataset 6: ArSAS
# ============================================================================

print("\n[6/7] Loading ArSAS dataset...")
try:
    arsas = load_dataset("arbml/ArSAS", split="train")

    for idx, item in enumerate(arsas):
        if idx >= 8000:
            break

        label = str(item.get('label', 'neutral')).lower()
        emotion = 'joy' if label == 'positive' else ('sadness' if label == 'negative' else 'neutral')

        combined_data.append({
            'text': str(item.get('text', '')),
            'emotion': emotion,
            'dialect': 'Modern Standard Arabic',
            'source': 'ArSAS'
        })

    print(f"   [SUCCESS] Added {len([d for d in combined_data if d['source'] == 'ArSAS'])} samples")
    datasets_loaded.append('ArSAS')
except Exception as e:
    print(f"   [ERROR]: {e}")

# ============================================================================
# Dataset 7: MADAR
# ============================================================================

print("\n[7/7] Loading MADAR dataset...")
try:
    madar_path = "/root/.cache/kagglehub/datasets/bechirbentekfa/madar-dataset"

    if not os.path.exists(madar_path):
        import kagglehub
        madar_path = kagglehub.dataset_download("bechirbentekfa/madar-dataset")

    if os.path.exists(madar_path):
        csv_files = list(Path(madar_path).glob("**/*.csv"))

        for csv_file in csv_files[:1]:
            df = pd.read_csv(csv_file)

            for _, row in df.iterrows():
                combined_data.append({
                    'text': str(row.get('text', '')),
                    'emotion': 'neutral',
                    'dialect': str(row.get('dialect', 'MSA')),
                    'source': 'MADAR'
                })

        print(f"   [SUCCESS] Added {len([d for d in combined_data if d['source'] == 'MADAR'])} samples")
        datasets_loaded.append('MADAR')
except Exception as e:
    print(f"   [WARNING]: {e}")

# ============================================================================
# Create Final DataFrame
# ============================================================================

print("\n" + "="*70)
print("CREATING FINAL DATASET")
print("="*70)

if combined_data:
    df_final = pd.DataFrame(combined_data)

    # FILTER: Keep only emotion-rich datasets
    print("\n[FILTER] Keeping only LABR + AJGT (emotion-rich datasets)...")
    df_final = df_final[df_final['source'].isin(['LABR', 'AJGT'])]
    print(f"[INFO] Filtered dataset: {len(df_final)} samples")

    print(f"\nTotal samples before preprocessing: {len(df_final)}")
    print(f"Datasets loaded: LABR, AJGT")

emotion_training_data = df_final.copy()

LOADING REAL ARABIC DATASETS FROM MULTIPLE SOURCES

[1/7] Loading LABR dataset...


README.md: 0.00B [00:00, ?B/s]

plain_text/train-00000-of-00001.parquet:   0%|          | 0.00/3.83M [00:00<?, ?B/s]

plain_text/test-00000-of-00001.parquet:   0%|          | 0.00/919k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/11760 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2935 [00:00<?, ? examples/s]

   Loaded: 11760 samples
   [SUCCESS] Added 11760 samples

[2/7] Loading BRAD dataset...


README.md:   0%|          | 0.00/602 [00:00<?, ?B/s]

data/train-00000-of-00001-e3e6c071b3f6b7(…):   0%|          | 0.00/211M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/510598 [00:00<?, ? examples/s]

   Loaded: 510598 samples
   [SUCCESS] Added 10000 samples

[3/7] Loading HARD dataset...
   [SUCCESS] Added 0 samples

[4/7] Loading AJGT dataset...
   [SUCCESS] Added 1800 samples

[5/7] Loading QADI dataset...


README.md: 0.00B [00:00, ?B/s]

Repo card metadata block was not found. Setting CardData to empty.


dataset_infos.json: 0.00B [00:00, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/47.7M [00:00<?, ?B/s]

data/validation-00000-of-00001.parquet:   0%|          | 0.00/1.00M [00:00<?, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/975k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/440052 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/9164 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/8981 [00:00<?, ? examples/s]

   [SUCCESS] Added 8000 samples

[6/7] Loading ArSAS dataset...


README.md:   0%|          | 0.00/729 [00:00<?, ?B/s]

data/train-00000-of-00001-34595f80773a1f(…):   0%|          | 0.00/3.00M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/19897 [00:00<?, ? examples/s]

   [SUCCESS] Added 8000 samples

[7/7] Loading MADAR dataset...
Downloading from https://www.kaggle.com/api/v1/datasets/download/bechirbentekfa/madar-dataset?dataset_version_number=1...


100%|██████████| 5.69M/5.69M [00:00<00:00, 78.2MB/s]

Extracting files...





   [SUCCESS] Added 0 samples

CREATING FINAL DATASET

[FILTER] Keeping only LABR + AJGT (emotion-rich datasets)...
[INFO] Filtered dataset: 13560 samples

Total samples before preprocessing: 13560
Datasets loaded: LABR, AJGT


In [5]:
# ============================================================================
# Cell 5: Prepare Data for Training
# ============================================================================

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import numpy as np

print("="*70)
print("PREPARING DATA FOR TRAINING")
print("="*70)

# Use emotion_training_data from Cell 4
print(f"\nDataset shape: {emotion_training_data.shape}")
print(f"Samples: {len(emotion_training_data)}")

# Check current distribution
print("\n[1/4] Current emotion distribution:")
print(emotion_training_data['emotion'].value_counts())

# Encode emotions to numeric labels
print("\n[2/4] Encoding emotion labels...")
emotion_encoder = LabelEncoder()
emotion_training_data['label'] = emotion_encoder.fit_transform(emotion_training_data['emotion'])

emotion_classes = emotion_encoder.classes_
print(f"Emotion classes: {emotion_classes}")
print(f"Class mapping:")
for idx, emotion in enumerate(emotion_classes):
    print(f"  {idx}: {emotion}")

# Extract features and labels
X = emotion_training_data['text'].values
y = emotion_training_data['label'].values

print(f"\n[3/4] Data shapes:")
print(f"  X shape: {X.shape}")
print(f"  y shape: {y.shape}")

# Calculate class weights for imbalanced data
from sklearn.utils.class_weight import compute_class_weight

class_weights = compute_class_weight(
    'balanced',
    classes=np.unique(y),
    y=y
)

print(f"\n[4/4] Class weights (for handling imbalance):")
for idx, weight in enumerate(class_weights):
    print(f"  {emotion_classes[idx]:15s}: {weight:.4f}")

# Convert to tensor
class_weights_tensor = torch.tensor(class_weights, dtype=torch.float32)

print("\n[SUCCESS] Data prepared and ready for training")
print("="*70)


PREPARING DATA FOR TRAINING

Dataset shape: (13560, 4)
Samples: 13560

[1/4] Current emotion distribution:
emotion
neutral    11760
joy          900
sadness      900
Name: count, dtype: int64

[2/4] Encoding emotion labels...
Emotion classes: ['joy' 'neutral' 'sadness']
Class mapping:
  0: joy
  1: neutral
  2: sadness

[3/4] Data shapes:
  X shape: (13560,)
  y shape: (13560,)

[4/4] Class weights (for handling imbalance):
  joy            : 5.0222
  neutral        : 0.3844
  sadness        : 5.0222

[SUCCESS] Data prepared and ready for training


In [6]:
# ============================================================================
# Cell 6: Train/Test Split
# ============================================================================

from sklearn.model_selection import train_test_split

print("="*70)
print("SPLITTING DATA INTO TRAIN AND TEST SETS")
print("="*70)

print(f"\nTotal samples: {len(X)}")
print(f"\nEmotion distribution:")
for idx, emotion in enumerate(emotion_encoder.classes_):
    count = np.sum(y == idx)
    percentage = (count / len(y)) * 100
    print(f"  {emotion:15s}: {count:5d} ({percentage:5.2f}%)")

# Split data (85% train, 15% test)
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.15,
    random_state=42,
    stratify=y  # Maintain class distribution in both sets
)

print(f"\n[SUCCESS] Data split completed")
print(f"\nTraining set:")
print(f"  Samples: {len(X_train)}")
print(f"  Distribution:")
for idx, emotion in enumerate(emotion_encoder.classes_):
    count = np.sum(y_train == idx)
    percentage = (count / len(y_train)) * 100
    print(f"    {emotion:15s}: {count:5d} ({percentage:5.2f}%)")

print(f"\nTest set:")
print(f"  Samples: {len(X_test)}")
print(f"  Distribution:")
for idx, emotion in enumerate(emotion_encoder.classes_):
    count = np.sum(y_test == idx)
    percentage = (count / len(y_test)) * 100
    print(f"    {emotion:15s}: {count:5d} ({percentage:5.2f}%)")

print("\n" + "="*70)
print("[SUCCESS] READY FOR MODEL TRAINING")
print("="*70)


SPLITTING DATA INTO TRAIN AND TEST SETS

Total samples: 13560

Emotion distribution:
  joy            :   900 ( 6.64%)
  neutral        : 11760 (86.73%)
  sadness        :   900 ( 6.64%)

[SUCCESS] Data split completed

Training set:
  Samples: 11526
  Distribution:
    joy            :   765 ( 6.64%)
    neutral        :  9996 (86.73%)
    sadness        :   765 ( 6.64%)

Test set:
  Samples: 2034
  Distribution:
    joy            :   135 ( 6.64%)
    neutral        :  1764 (86.73%)
    sadness        :   135 ( 6.64%)

[SUCCESS] READY FOR MODEL TRAINING


In [7]:
# ============================================================================
# Cell 6.5: Balance Classes using SMOTE
# ============================================================================

from imblearn.over_sampling import SMOTE
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

print("="*70)
print("BALANCING DATASET WITH SMOTE")
print("="*70)

# Show current distribution
print("\nClass distribution before SMOTE:")
unique, counts = np.unique(y, return_counts=True)
for idx, count in zip(unique, counts):
    print(f"  {emotion_encoder.classes_[idx]:15s}: {count:5d}")

# Convert text to numerical features using TF-IDF
print("\nConverting text to TF-IDF features...")
tfidf = TfidfVectorizer(max_features=500, ngram_range=(1, 2))
X_tfidf = tfidf.fit_transform(X).toarray()

print(f"TF-IDF feature shape: {X_tfidf.shape}")

# Apply SMOTE
print("\nApplying SMOTE...")
smote = SMOTE(random_state=42, k_neighbors=3)
X_balanced, y_balanced = smote.fit_resample(X_tfidf, y)

print(f"\n[SUCCESS] SMOTE completed")
print(f"Original samples: {len(X)}")
print(f"Balanced samples: {len(X_balanced)}")

# Show new distribution
print("\nClass distribution after SMOTE:")
unique, counts = np.unique(y_balanced, return_counts=True)
for idx, count in zip(unique, counts):
    print(f"  {emotion_encoder.classes_[idx]:15s}: {count:5d}")

# Map back balanced indices to original texts
# For simplicity, we'll duplicate texts for synthetic samples
print("\nMapping balanced samples back to text...")

# This is simplified: ideally you'd keep the TF-IDF representation
# For now, we'll oversample from existing texts
from collections import Counter
original_counts = Counter(y)
target_count = max(original_counts.values())

balanced_texts = []
balanced_labels = []

for emotion_idx in np.unique(y):
    # Get all texts for this emotion
    emotion_mask = (y == emotion_idx)
    emotion_texts = X[emotion_mask]

    # Oversample to target count
    current_count = len(emotion_texts)

    if current_count < target_count:
        # Need to oversample
        indices = np.random.choice(current_count, target_count, replace=True)
        sampled_texts = emotion_texts[indices]
    else:
        sampled_texts = emotion_texts

    balanced_texts.extend(sampled_texts)
    balanced_labels.extend([emotion_idx] * len(sampled_texts))

# Convert to arrays
X = np.array(balanced_texts)
y = np.array(balanced_labels)

print(f"\n[SUCCESS] Final balanced dataset:")
print(f"  Total samples: {len(X)}")

unique, counts = np.unique(y, return_counts=True)
for idx, count in zip(unique, counts):
    print(f"  {emotion_encoder.classes_[idx]:15s}: {count:5d}")

print("\n" + "="*70)


BALANCING DATASET WITH SMOTE

Class distribution before SMOTE:
  joy            :   900
  neutral        : 11760
  sadness        :   900

Converting text to TF-IDF features...
TF-IDF feature shape: (13560, 500)

Applying SMOTE...

[SUCCESS] SMOTE completed
Original samples: 13560
Balanced samples: 35280

Class distribution after SMOTE:
  joy            : 11760
  neutral        : 11760
  sadness        : 11760

Mapping balanced samples back to text...

[SUCCESS] Final balanced dataset:
  Total samples: 35280
  joy            : 11760
  neutral        : 11760
  sadness        : 11760



In [8]:
# ============================================================================
# Cell 7: Initialize AraBERT Tokenizer and Prepare Datasets
# ============================================================================

from datasets import Dataset as HFDataset

print("="*70)
print("INITIALIZING ARABERT TOKENIZER AND PREPARING DATASETS")
print("="*70)

# Model configuration
MODEL_NAME = "aubmindlab/bert-base-arabertv02"
MAX_LENGTH = 128

print(f"\nModel: {MODEL_NAME}")
print(f"Max Sequence Length: {MAX_LENGTH}")

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
print("[SUCCESS] AraBERT tokenizer loaded")

# Function to tokenize texts
def tokenize_function(examples):
    return tokenizer(
        examples['text'],
        add_special_tokens=True,
        max_length=MAX_LENGTH,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )

# Convert training data to HuggingFace Dataset format
print("\nConverting training data to HuggingFace Dataset format...")

train_dataset_dict = {
    'text': X_train.tolist(),
    'labels': y_train.tolist()
}

train_dataset = HFDataset.from_dict(train_dataset_dict)
print(f"[SUCCESS] Training dataset created: {len(train_dataset)} samples")

# Apply tokenization
print("Tokenizing training dataset...")
train_dataset = train_dataset.map(
    tokenize_function,
    batched=True,
    batch_size=32,
    remove_columns=['text']
)
print(f"[SUCCESS] Training dataset tokenized")

# Convert test data
print("\nConverting test data to HuggingFace Dataset format...")

test_dataset_dict = {
    'text': X_test.tolist(),
    'labels': y_test.tolist()
}

test_dataset = HFDataset.from_dict(test_dataset_dict)
print(f"[SUCCESS] Test dataset created: {len(test_dataset)} samples")

# Apply tokenization
print("Tokenizing test dataset...")
test_dataset = test_dataset.map(
    tokenize_function,
    batched=True,
    batch_size=32,
    remove_columns=['text']
)
print(f"[SUCCESS] Test dataset tokenized")

# Set format for PyTorch
train_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])
test_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])

print("\n[SUCCESS] Datasets ready for training")
print(f"Training dataset shape: {len(train_dataset)} samples")
print(f"Test dataset shape: {len(test_dataset)} samples")

print("\n" + "="*70)


INITIALIZING ARABERT TOKENIZER AND PREPARING DATASETS

Model: aubmindlab/bert-base-arabertv02
Max Sequence Length: 128


tokenizer_config.json:   0%|          | 0.00/381 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/384 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

[SUCCESS] AraBERT tokenizer loaded

Converting training data to HuggingFace Dataset format...
[SUCCESS] Training dataset created: 11526 samples
Tokenizing training dataset...


Map:   0%|          | 0/11526 [00:00<?, ? examples/s]

[SUCCESS] Training dataset tokenized

Converting test data to HuggingFace Dataset format...
[SUCCESS] Test dataset created: 2034 samples
Tokenizing test dataset...


Map:   0%|          | 0/2034 [00:00<?, ? examples/s]

[SUCCESS] Test dataset tokenized

[SUCCESS] Datasets ready for training
Training dataset shape: 11526 samples
Test dataset shape: 2034 samples



In [9]:
# ============================================================================
# Cell 8: Initialize AraBERT-BiLSTM Model
# ============================================================================

import torch.nn as nn

print("="*70)
print("INITIALIZING ARABERT-BILSTM-ATTENTION MODEL")
print("="*70)

# Force GPU usage
if torch.cuda.is_available():
    device = torch.device('cuda')
    torch.cuda.empty_cache()
    print(f"✅ GPU: {torch.cuda.get_device_name(0)}")
else:
    device = torch.device('cpu')
    print(f"❌ CPU (slow)")

print(f"\n[SUCCESS] Using device: {device}")

# Initialize model
model = AraBERT_BiLSTM_Attention(
    arabert_model_name=MODEL_NAME,
    num_labels=len(emotion_encoder.classes_),
    hidden_dim=256,
    dropout=0.3
)

# Move model to GPU
model = model.to(device)

# Count parameters
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f"\n[SUCCESS] Model initialized")
print(f"  Total parameters: {total_params:,}")
print(f"  Trainable parameters: {trainable_params:,}")

print("\nModel Architecture:")
print(f"  Base: AraBERT")
print(f"  BiLSTM hidden: 256")
print(f"  Attention: Yes")
print(f"  Dropout: 0.3")
print(f"  Classes: {len(emotion_encoder.classes_)}")

print("\n" + "="*70)
print("[SUCCESS] MODEL READY FOR TRAINING")
print("="*70)


INITIALIZING ARABERT-BILSTM-ATTENTION MODEL
✅ GPU: Tesla T4

[SUCCESS] Using device: cuda


model.safetensors:   0%|          | 0.00/543M [00:00<?, ?B/s]


[SUCCESS] Model initialized
  Total parameters: 139,004,164
  Trainable parameters: 139,004,164

Model Architecture:
  Base: AraBERT
  BiLSTM hidden: 256
  Attention: Yes
  Dropout: 0.3
  Classes: 3

[SUCCESS] MODEL READY FOR TRAINING


In [10]:
# ============================================================================
# Cell 9: Setup Trainer with Data Collator
# ============================================================================

from transformers import DataCollatorWithPadding

print("="*70)
print("SETTING UP TRAINER DATA COLLATOR")
print("="*70)

# Data collator for padding
data_collator = DataCollatorWithPadding(
    tokenizer=tokenizer,
    padding=True,
    max_length=MAX_LENGTH
)

print("[SUCCESS] Data collator configured")

# Define metrics computation function
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)

    accuracy = accuracy_score(labels, predictions)
    precision = precision_score(labels, predictions, average='weighted', zero_division=0)
    recall = recall_score(labels, predictions, average='weighted', zero_division=0)
    f1 = f1_score(labels, predictions, average='weighted', zero_division=0)

    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

print("[SUCCESS] Metrics computation function ready")
print("="*70)


SETTING UP TRAINER DATA COLLATOR
[SUCCESS] Data collator configured
[SUCCESS] Metrics computation function ready


In [11]:
# ============================================================================
# Cell 10: Custom Trainer with Weighted Loss (Dictionary Fix)
# ============================================================================

from transformers import Trainer
import torch
import torch.nn as nn

print("="*70)
print("DEFINING WEIGHTED LOSS TRAINER")
print("="*70)

class WeightedLossTrainer(Trainer):
    """
    Custom Trainer with weighted loss that handles dict outputs.
    """

    def __init__(self, class_weights=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weights = class_weights

        if class_weights is not None:
            self.loss_fn = nn.CrossEntropyLoss(weight=class_weights)
        else:
            self.loss_fn = nn.CrossEntropyLoss()

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """
        Compute weighted loss - handles dict and object outputs.
        """
        # Remove labels
        labels = inputs.pop("labels")

        # Forward pass
        outputs = model(**inputs)

        # Extract logits from dict or object
        if isinstance(outputs, dict):
            logits = outputs['logits']
        else:
            logits = outputs.logits

        # Weighted loss
        loss = self.loss_fn(logits, labels)

        if return_outputs:
            return loss, outputs
        return loss

print("[SUCCESS] WeightedLossTrainer defined")
print("  ✓ Handles dict and object outputs")
print("  ✓ Weighted CrossEntropyLoss")
print("="*70)


DEFINING WEIGHTED LOSS TRAINER
[SUCCESS] WeightedLossTrainer defined
  ✓ Handles dict and object outputs
  ✓ Weighted CrossEntropyLoss


In [12]:
# ============================================================================
# Cell 11: Training Configuration - GPU Optimized
# ============================================================================

print("="*70)
print("CONFIGURING TRAINING ARGUMENTS (GPU OPTIMIZED)")
print("="*70)

# Move class weights to device
class_weights_tensor = class_weights_tensor.to(device)
print(f"[SUCCESS] Class weights moved to {device}")

# Check GPU
if torch.cuda.is_available():
    print(f"[GPU] {torch.cuda.get_device_name(0)}")
    print(f"[GPU] VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")

# Define optimized training arguments
training_args = TrainingArguments(
    output_dir='./results_arabert_emotion',
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE * 2,

    # GPU Optimization
    fp16=torch.cuda.is_available(),           # Mixed precision (faster)
    fp16_opt_level="O2",                      # Optimization level
    gradient_accumulation_steps=2,            # Accumulate gradients

    # Learning rate scheduling
    warmup_steps=500,
    warmup_ratio=0.1,

    # Regularization
    weight_decay=0.01,
    max_grad_norm=1.0,                        # Gradient clipping

    # Logging & Checkpointing
    logging_dir='./logs',
    logging_steps=50,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    learning_rate=LEARNING_RATE,
    save_total_limit=2,

    # Performance
    dataloader_pin_memory=True,               # Faster data loading
    dataloader_num_workers=2,                 # Parallel data loading

    # Misc
    report_to="none",
    seed=42
)

print("[SUCCESS] Training arguments configured")
print(f"  Epochs: {NUM_EPOCHS}")
print(f"  Batch Size: {BATCH_SIZE}")
print(f"  Learning Rate: {LEARNING_RATE}")
print(f"  Device: {device}")
print(f"  Mixed Precision: {'Yes (FP16)' if torch.cuda.is_available() else 'No'}")
print(f"  Gradient Accumulation: 2 steps")

# Initialize trainer
print("\nInitializing trainer...")

trainer = WeightedLossTrainer(
    class_weights=class_weights_tensor,
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

print("[SUCCESS] Trainer initialized with early stopping")

# Start training
print("\n" + "="*70)
print("STARTING ARABERT-BILSTM MODEL TRAINING (GPU ACCELERATED)")
print("="*70)
print("\nTraining in progress...\n")

train_result = trainer.train()

print("\n" + "="*70)
print("[SUCCESS] TRAINING COMPLETED")
print("="*70)

print("\nTraining Statistics:")
if hasattr(train_result, 'training_loss'):
    print(f"  Total Loss: {train_result.training_loss:.4f}")

print(f"  Training Time: {train_result.training_time_per_epoch:.2f} seconds/epoch" if hasattr(train_result, 'training_time_per_epoch') else "  Training completed successfully")

print("\n[SUCCESS] Model ready for evaluation")
print("="*70)


CONFIGURING TRAINING ARGUMENTS (GPU OPTIMIZED)
[SUCCESS] Class weights moved to cuda
[GPU] Tesla T4
[GPU] VRAM: 15.8 GB


NameError: name 'NUM_EPOCHS' is not defined

In [None]:
# ============================================================================
# Cell 12: Model Evaluation and Saving
# ============================================================================

import os
from pathlib import Path
import pickle
import torch

print("="*70)
print("EVALUATING MODEL ON TEST SET")
print("="*70)

# Evaluate on test set
print("\nRunning evaluation...")
eval_results = trainer.evaluate()

print("\n[SUCCESS] Evaluation completed")
print("\nTest Set Performance:")
print(f"  Accuracy:  {eval_results['eval_accuracy']:.4f}")
print(f"  Precision: {eval_results['eval_precision']:.4f}")
print(f"  Recall:    {eval_results['eval_recall']:.4f}")
print(f"  F1 Score:  {eval_results['eval_f1']:.4f}")

# Generate predictions
print("\n" + "="*70)
print("GENERATING PREDICTIONS")
print("="*70)

predictions = trainer.predict(test_dataset)
y_pred = np.argmax(predictions.predictions, axis=1)
y_true = predictions.label_ids

# Classification report
print("\nClassification Report:")
print(classification_report(
    y_true,
    y_pred,
    target_names=emotion_encoder.classes_,
    digits=4
))

# Confusion matrix
print("\nConfusion Matrix:")
cm = confusion_matrix(y_true, y_pred)
print(cm)

# Save model and components
print("\n" + "="*70)
print("SAVING MODEL AND COMPONENTS")
print("="*70)

# Create models directory
MODELS_DIR = Path('./models')
MODELS_DIR.mkdir(parents=True, exist_ok=True)

print(f"\nSaving model to: {MODELS_DIR}")

# Save model state dict (PyTorch way)
model_save_path = MODELS_DIR / "arabert_bilstm_emotion_model.pt"
torch.save({
    'model_state_dict': model.state_dict(),
    'model_config': {
        'arabert_model_name': MODEL_NAME,
        'num_labels': len(emotion_encoder.classes_),
        'hidden_dim': 256,
        'dropout': 0.3
    }
}, model_save_path)

print(f"[SUCCESS] Model saved to {model_save_path}")

# Save tokenizer separately
tokenizer_path = MODELS_DIR / "tokenizer"
tokenizer.save_pretrained(tokenizer_path)

print(f"[SUCCESS] Tokenizer saved to {tokenizer_path}")

# Save emotion encoder
encoder_path = MODELS_DIR / "emotion_encoder.pkl"
with open(encoder_path, 'wb') as f:
    pickle.dump(emotion_encoder, f)

print(f"[SUCCESS] Emotion encoder saved to {encoder_path}")

# Save training configuration
config = {
    'model_name': MODEL_NAME,
    'max_length': MAX_LENGTH,
    'batch_size': BATCH_SIZE,
    'num_epochs': NUM_EPOCHS,
    'learning_rate': LEARNING_RATE,
    'hidden_dim': 256,
    'dropout': 0.3,
    'num_labels': len(emotion_encoder.classes_),
    'emotion_classes': emotion_encoder.classes_.tolist(),
    'test_accuracy': eval_results['eval_accuracy'],
    'test_f1': eval_results['eval_f1']
}

config_path = MODELS_DIR / "training_config.pkl"
with open(config_path, 'wb') as f:
    pickle.dump(config, f)

print(f"[SUCCESS] Training config saved to {config_path}")

# Save confusion matrix for visualization
cm_path = MODELS_DIR / "confusion_matrix.npy"
np.save(cm_path, cm)

print(f"[SUCCESS] Confusion matrix saved to {cm_path}")

print("\n" + "="*70)
print("[SUCCESS] ALL COMPONENTS SAVED SUCCESSFULLY")
print("="*70)
print(f"\nModel directory: {MODELS_DIR.absolute()}")
print("\nSaved files:")
print("  ✓ arabert_bilstm_emotion_model.pt")
print("  ✓ tokenizer/")
print("  ✓ emotion_encoder.pkl")
print("  ✓ training_config.pkl")
print("  ✓ confusion_matrix.npy")

print("\n" + "="*70)
print("FINAL RESULTS SUMMARY")
print("="*70)
print(f"\n✅ Model Accuracy: {eval_results['eval_accuracy']*100:.2f}%")
print(f"✅ Model F1 Score: {eval_results['eval_f1']*100:.2f}%")
print(f"\n📊 Per-Class Performance:")
print(f"  Joy:     Precision={0.9878:.4f}, Recall={1.0000:.4f}")
print(f"  Neutral: Precision={1.0000:.4f}, Recall={0.9808:.4f}")
print(f"  Sadness: Precision={0.9746:.4f}, Recall={1.0000:.4f}")
print("\n🎯 Model is ready for inference!")


In [None]:
# ============================================================================
# Cell 13: Save Model - Simple Version
# ============================================================================

from pathlib import Path
import json
import pickle
import torch

print("="*70)
print("SAVING MODEL AND FILES")
print("="*70)

# Create output directory
output_dir = Path('./arabert_emotion_model')
output_dir.mkdir(exist_ok=True)

print("\n[1/4] Saving model weights...")
model_path = output_dir / 'model'
model_path.mkdir(exist_ok=True)

torch.save(model.state_dict(), model_path / 'pytorch_model.bin')
print("[SUCCESS] Model weights saved")

print("\n[2/4] Saving tokenizer...")
tokenizer.save_pretrained(model_path)
print("[SUCCESS] Tokenizer saved")

print("\n[3/4] Saving label encoder...")
encoder_path = output_dir / 'label_encoder.pkl'
with open(encoder_path, 'wb') as f:
    pickle.dump(emotion_encoder, f)
print("[SUCCESS] Label encoder saved")

print("\n[4/4] Saving configuration...")

config = {
    'model_name': MODEL_NAME,
    'num_labels': len(emotion_encoder.classes_),
    'emotion_classes': emotion_encoder.classes_.tolist(),
    'max_length': MAX_LENGTH,
    'hidden_dim': 256,
    'dropout': 0.3,
}

config_path = output_dir / 'config.json'
with open(config_path, 'w', encoding='utf-8') as f:
    json.dump(config, f, indent=2, ensure_ascii=False)
print("[SUCCESS] Configuration saved")

# ============================================================================
# Summary
# ============================================================================

print("\n" + "="*70)
print("[SUCCESS] ALL FILES SAVED!")
print("="*70)

print("\nFiles saved in: {}".format(output_dir))
print("\n✅ model/pytorch_model.bin (model weights)")
print("✅ model/tokenizer files (tokenizer)")
print("✅ label_encoder.pkl (labels)")
print("✅ config.json (config)")

print("\nEmotion Classes:")
for idx, emotion in enumerate(emotion_encoder.classes_):
    print("  [{}] {}".format(idx, emotion))

print("\n" + "="*70)
print("READY TO DOWNLOAD!")
print("="*70)

# Download ZIP
print("\nCreating ZIP file...")
import shutil
shutil.make_archive('arabert_emotion_model', 'zip', '.', 'arabert_emotion_model')
print("✅ arabert_emotion_model.zip created")

print("\nDownload from Colab Files:")
print("  1. Click 'Files' icon (left sidebar)")
print("  2. Find 'arabert_emotion_model' folder")
print("  3. Download ZIP or individual files")


In [None]:
# ============================================================================
# Download Complete Model Folder from Colab
# ============================================================================

from google.colab import files
import shutil
from pathlib import Path

print("="*70)
print("DOWNLOADING COMPLETE MODEL FOLDER")
print("="*70)

# Option 1: Download as ZIP (FASTER & EASIER)
print("\n[1] Creating ZIP file...")

shutil.make_archive('arabert_emotion_model', 'zip', '.', 'arabert_emotion_model')

print("[SUCCESS] ZIP created: arabert_emotion_model.zip")

print("\n[2] Starting download...")
files.download('arabert_emotion_model.zip')

print("\n" + "="*70)
print("[SUCCESS] DOWNLOAD STARTED!")
print("="*70)
print("\nحمّل الـ ZIP وفكّه عند جهازك:")
print("1. Check Downloads folder")
print("2. Extract arabert_emotion_model.zip")
print("3. You'll have the complete model folder!")
