<a href="https://colab.research.google.com/github/ebrahimhalaby/ebrahimsal/blob/master/Arabic_Commands_BERT_Benchmark.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 🧪 Arabic Commands BERT Benchmark (Colab-Ready)
This notebook benchmarks multiple Arabic/Multilingual BERT models on your **commands dataset** (`text,intent`) with rigorous evaluation for research:
- K-Fold CV over multiple random seeds.
- Metrics: Accuracy, F1 (macro/micro), per-class PR/F1, confusion matrices.
- Optional Arabic normalization ablation & class weighting.
- Exports full results (CSVs, figures) to a ZIP and (optionally) Google Drive.

> **Tip:** Start with a light configuration (fewer models/folds) to validate, then switch to the full configuration.

In [1]:
!pip uninstall -y -q opencv-python opencv-contrib-python opencv-python-headless \
  tsfresh thinc umap-learn gcsfs cupy-cuda12x tensorflow numba diffusers gradio
!pip install -U --no-cache-dir pip >/dev/null


!pip install --no-cache-dir -q \
  "numpy==1.26.4" \
  "scipy==1.12.0" \
  "pandas==2.2.2" \
  "scikit-learn==1.5.2" \
  "transformers==4.46.3" \
  "accelerate==1.0.1" \
  "tokenizers==0.20.3" \
  "huggingface_hub==0.34.1" \
  "datasets==2.21.0" \
  "evaluate==0.4.2" \
  "onnx" "onnxruntime" \
  "matplotlib==3.8.4" \
  "fsspec==2024.6.1"

# 4) فحص سريع للإصدارات + اختبار الـ Trainer
import numpy, scipy, pandas, sklearn, transformers, datasets, onnxruntime, matplotlib
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
print("✅ Ready:",
      "\n  numpy", numpy.__version__,
      "\n  scipy", scipy.__version__,
      "\n  transformers", transformers.__version__,
      "\n  tokenizers", __import__("tokenizers").__version__,
      "\n  hub", __import__("huggingface_hub").__version__)


[0m✅ Ready: 
  numpy 1.26.4 
  scipy 1.12.0 
  transformers 4.46.3 
  tokenizers 0.20.3 
  hub 0.34.1


In [2]:
# ===============================
# 1) Mount Google Drive (optional)
# ===============================
USE_DRIVE = True   # <- set False if you don't want to use Drive

if USE_DRIVE:
    from google.colab import drive
    drive.mount('/content/drive')

DATA_FROM_DRIVE = True  # If True, the CSV is read from Drive path; else we'll use manual upload dialog.
DRIVE_CSV_PATH = "/content/drive/MyDrive/Colab Notebooks/arabic_game_commands_10k.csv"  # <- change if using Drive


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
import pandas as pd
if DATA_FROM_DRIVE:
    CSV_PATH = DRIVE_CSV_PATH
else:
    # Manual upload (dialog)
    from google.colab import files
    up = files.upload()  # choose your CSV
    CSV_PATH = list(up.keys())[0]

print("Using CSV_PATH:", CSV_PATH)
df = pd.read_csv(CSV_PATH)
assert {"text","intent"}.issubset(df.columns), "CSV must have 'text' and 'intent' columns"
df.head(5)


Using CSV_PATH: /content/drive/MyDrive/Colab Notebooks/arabic_game_commands_10k.csv


Unnamed: 0,text,intent
0,حالا دوّر دغري بعدين يمين,RIGHT
1,لو سمحت قيف حالاً,STOP
2,حالا ستوب و وبعدين,STOP
3,إجري للهدف وبعدها خليك مركز,RUN
4,تحت لتحت,CROUCH


In [4]:
# ===============================
# 3) Benchmark configuration
# ===============================
import os, re, json, numpy as np, matplotlib.pyplot as plt
from pathlib import Path

OUTPUT_ROOT = "/content/bert_ar_commands_benchmark"
os.makedirs(OUTPUT_ROOT, exist_ok=True)

# --- Toggle between LIGHT and FULL runs ---
LIGHT_RUN = True  # True = faster sanity-check; False = full research run

if LIGHT_RUN:
    MODELS = [
        "aubmindlab/bert-base-arabertv2",
        "UBC-NLP/MARBERT",
        "xlm-roberta-base",
    ]
    KFOLDS = 3
    SEEDS = [42]
    MAX_EPOCHS = 3
else:
    MODELS = [
        "aubmindlab/bert-base-arabertv2",
        "asafaya/bert-base-arabic",
        "UBC-NLP/ARBERT",
        "UBC-NLP/MARBERT",
        "bert-base-multilingual-cased",
        "xlm-roberta-base",
    ]
    KFOLDS = 5
    SEEDS = [42, 77, 123]
    MAX_EPOCHS = 6

LR = 2e-5
BATCH = 32
PATIENCE = 2                     # Early stopping
USE_CLASS_WEIGHTS = True
USE_NORMALIZE = True

print("Models:", MODELS)
print("KFOLDS:", KFOLDS, "| SEEDS:", SEEDS, "| EPOCHS:", MAX_EPOCHS)


Models: ['aubmindlab/bert-base-arabertv2', 'UBC-NLP/MARBERT', 'xlm-roberta-base']
KFOLDS: 3 | SEEDS: [42] | EPOCHS: 3


In [5]:
# ===============================
# 4) Preprocess (Arabic normalization optional) + labels
# ===============================
import numpy as np
df = df.dropna(subset=["text","intent"]).reset_index(drop=True)
df["text"] = df["text"].astype(str).str.strip()
df["intent"] = df["intent"].astype(str).str.strip()

ARABIC_INDIC = "٠١٢٣٤٥٦٧٨٩"
WESTERN = "0123456789"
DIGIT_MAP = str.maketrans(ARABIC_INDIC, WESTERN)

def normalize_ar(text: str) -> str:
    t = text.strip()
    t = re.sub(r"[\u0640]+", "", t)             # Tatweel
    t = t.translate(DIGIT_MAP)                  # ١٢٣ -> 123
    t = re.sub(r"[ـ]+", "", t)                  # Madd
    t = re.sub(r"\s+", " ", t)                  # Spaces
    t = re.sub("[إأآا]", "ا", t)
    t = re.sub("ى", "ي", t)
    t = re.sub("ؤ", "و", t)
    t = re.sub("ئ", "ي", t)
    t = t.replace("ة", "ه")
    t = re.sub(r"[\u0610-\u061A\u064B-\u065F\u0670\u06D6-\u06ED]", "", t)  # Diacritics
    t = re.sub(r"(.)\1{2,}", r"\1\1", t)       # Limit long char repeats
    return t

df["text_norm"] = df["text"].map(normalize_ar) if USE_NORMALIZE else df["text"]

labels = sorted(df["intent"].unique().tolist())
label2id = {l:i for i,l in enumerate(labels)}
id2label = {i:l for l,i in label2id.items()}
df["label"] = df["intent"].map(label2id)

print("Num samples:", len(df), "| Num classes:", len(labels))
print("Classes:", labels)


Num samples: 10000 | Num classes: 12
Classes: ['ATTACK', 'BACKWARD', 'CROUCH', 'DEFEND', 'FORWARD', 'JUMP', 'LEFT', 'RIGHT', 'RUN', 'STOP', 'TURN_AROUND', 'WALK']


In [8]:
# ===============================
# 5) Training & Cross-Validation
# ===============================
import warnings, math, os, json, numpy as np, matplotlib.pyplot as plt
warnings.filterwarnings("ignore")

import torch
from datasets import Dataset
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import (
    confusion_matrix, precision_recall_fscore_support,
    accuracy_score, f1_score
)
from sklearn.utils.class_weight import compute_class_weight

from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    DataCollatorWithPadding, TrainingArguments, Trainer,
    EarlyStoppingCallback, set_seed
)

# بديل evaluate: دالة مقاييس تعتمد على scikit-learn فقط
def compute_metrics_builder():
    def compute_metrics(eval_pred):
        logits, labels_np = eval_pred
        preds = np.argmax(logits, axis=-1)
        acc = float(accuracy_score(labels_np, preds))
        f1_macro = float(f1_score(labels_np, preds, average="macro"))
        f1_micro = float(f1_score(labels_np, preds, average="micro"))
        return {"accuracy": acc, "f1_macro": f1_macro, "f1_micro": f1_micro}
    return compute_metrics

USE_FP16 = torch.cuda.is_available()

def train_eval_one_fold(model_name, seed, tr_idx, va_idx, outdir):
    set_seed(seed)
    tok = AutoTokenizer.from_pretrained(model_name)

    # تحويل بيانات الطية إلى HF Dataset
    train_ds = Dataset.from_pandas(
        df.iloc[tr_idx][["text_norm","label"]].rename(columns={"text_norm":"text"})
    )
    val_ds   = Dataset.from_pandas(
        df.iloc[va_idx][["text_norm","label"]].rename(columns={"text_norm":"text"})
    )

    def tok_fn(batch):
        return tok(batch["text"], truncation=True)
    train_ds = train_ds.map(tok_fn, batched=True, remove_columns=["text"])
    val_ds   = val_ds.map(tok_fn,   batched=True, remove_columns=["text"])
    data_collator = DataCollatorWithPadding(tokenizer=tok)

    # أوزان الفئات (اختياري)
    loss_weights = None
    if USE_CLASS_WEIGHTS:
        y = df.iloc[tr_idx]["label"].values
        cw = compute_class_weight("balanced", classes=np.arange(len(labels)), y=y)
        loss_weights = torch.tensor(cw, dtype=torch.float)

    # تحميل النموذج
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name, num_labels=len(labels), id2label=id2label, label2id=label2id
    )

    # تخصيص دالة الخسارة لدعم أوزان الفئات (تمت إضافة **kwargs لدعم num_items_in_batch)
    def compute_loss(model, inputs, return_outputs=False, **kwargs):
        labels_t = inputs.get("labels")
        outputs = model(**{k: v for k, v in inputs.items() if k != "labels"})
        logits = outputs.logits
        loss_fct = torch.nn.CrossEntropyLoss(
            weight=(loss_weights.to(logits.device) if loss_weights is not None else None)
        )
        loss = loss_fct(logits, labels_t)
        return (loss, outputs) if return_outputs else loss

    args = TrainingArguments(
        output_dir=outdir,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="f1_macro",
        greater_is_better=True,
        learning_rate=LR,
        per_device_train_batch_size=BATCH,
        per_device_eval_batch_size=BATCH,
        num_train_epochs=MAX_EPOCHS,
        warmup_ratio=0.1,
        weight_decay=0.01,
        fp16=USE_FP16,
        logging_steps=50,
        report_to="none",
        seed=seed
    )

    trainer = Trainer(
        model=model, args=args,
        train_dataset=train_ds, eval_dataset=val_ds,
        tokenizer=tok, data_collator=data_collator,
        compute_metrics=compute_metrics_builder(),
        callbacks=[EarlyStoppingCallback(early_stopping_patience=PATIENCE)]
    )
    trainer.compute_loss = compute_loss

    # تدريب وتقييم
    trainer.train()
    out = trainer.predict(val_ds)
    preds = np.argmax(out.predictions, axis=-1)
    y_true = np.array(val_ds["label"])

    # مقاييس تفصيلية بالطية
    acc = float(accuracy_score(y_true, preds))
    p, r, f1s, _ = precision_recall_fscore_support(
        y_true, preds, labels=np.arange(len(labels))
    )
    f1_macro = float(f1s.mean())
    f1_micro = float(f1_score(y_true, preds, average="micro"))
    cm = confusion_matrix(y_true, preds, labels=np.arange(len(labels)))

    # حفظ مصفوفة الالتباس
    os.makedirs(outdir, exist_ok=True)
    plt.figure()
    plt.imshow(cm, interpolation='nearest')
    plt.title(f"CM {model_name} s{seed}")
    plt.colorbar()
    plt.xticks(range(len(labels)), labels, rotation=45, ha="right")
    plt.yticks(range(len(labels)), labels)
    plt.tight_layout()
    cm_path = os.path.join(outdir, "confusion_matrix.png")
    plt.savefig(cm_path); plt.close()

    # حفظ تقرير الفئات
    per_class = {
        labels[i]: {
            "precision": float(p[i]),
            "recall": float(r[i]),
            "f1": float(f1s[i])
        } for i in range(len(labels))
    }
    with open(os.path.join(outdir, "per_class.json"), "w", encoding="utf-8") as f:
        json.dump(per_class, f, ensure_ascii=False, indent=2)

    return {
        "metrics": {"accuracy": acc, "f1_macro": f1_macro, "f1_micro": f1_micro},
        "val_true": y_true.tolist(),
        "val_pred": preds.tolist()
    }

# تنفيذ الـ CV عبر جميع النماذج والبذور
all_rows = []
store_preds = {}
from datetime import datetime
RUN_TAG = datetime.now().strftime("%Y%m%d_%H%M%S")

for model_name in MODELS:
    for seed in SEEDS:
        skf = StratifiedKFold(n_splits=KFOLDS, shuffle=True, random_state=seed)
        for fold, (tr_idx, va_idx) in enumerate(skf.split(df["text_norm"], df["label"])):
            outdir = f"{OUTPUT_ROOT}/{RUN_TAG}/{model_name.replace('/','_')}/seed{seed}_fold{fold}"
            print(f"\n=== {model_name} | seed={seed} | fold={fold+1}/{KFOLDS} ===")
            os.makedirs(outdir, exist_ok=True)
            res = train_eval_one_fold(model_name, seed, tr_idx, va_idx, outdir)
            row = {"model": model_name, "seed": seed, "fold": fold}
            row.update(res["metrics"])
            all_rows.append(row)
            store_preds[f"{model_name}|seed{seed}|fold{fold}"] = res

# حفظ النتائج المجمّعة
import pandas as pd
cv_df = pd.DataFrame(all_rows)
cv_df.to_csv(f"{OUTPUT_ROOT}/{RUN_TAG}/cv_results.csv", index=False)
summary = cv_df.groupby("model")[["accuracy","f1_macro","f1_micro"]].agg(["mean","std"]).reset_index()
summary_path = f"{OUTPUT_ROOT}/{RUN_TAG}/cv_summary.csv"
os.makedirs(os.path.dirname(summary_path), exist_ok=True)
summary.to_csv(summary_path, index=False)
summary



=== aubmindlab/bert-base-arabertv2 | seed=42 | fold=1/3 ===


Map:   0%|          | 0/6666 [00:00<?, ? examples/s]

Map:   0%|          | 0/3334 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,F1 Micro
1,0.6189,0.260795,0.965807,0.965296,0.965807
2,0.072,0.044325,0.993701,0.99371,0.993701
3,0.031,0.030281,0.995201,0.995201,0.995201



=== aubmindlab/bert-base-arabertv2 | seed=42 | fold=2/3 ===


Map:   0%|          | 0/6667 [00:00<?, ? examples/s]

Map:   0%|          | 0/3333 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,F1 Micro
1,0.5777,0.238588,0.975998,0.975781,0.975998
2,0.0768,0.036161,0.9967,0.996695,0.9967
3,0.0355,0.022367,0.9982,0.9982,0.9982



=== aubmindlab/bert-base-arabertv2 | seed=42 | fold=3/3 ===


Map:   0%|          | 0/6667 [00:00<?, ? examples/s]

Map:   0%|          | 0/3333 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,F1 Micro
1,0.583,0.25486,0.970297,0.970122,0.970297
2,0.0605,0.046353,0.992499,0.992492,0.992499
3,0.0293,0.034916,0.993399,0.993399,0.993399



=== UBC-NLP/MARBERT | seed=42 | fold=1/3 ===


tokenizer_config.json:   0%|          | 0.00/376 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/701 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Map:   0%|          | 0/6666 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/3334 [00:00<?, ? examples/s]

pytorch_model.bin:   0%|          | 0.00/654M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at UBC-NLP/MARBERT and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,F1 Micro
1,0.0629,0.027999,0.9979,0.997903,0.9979
2,0.0153,0.012473,0.9985,0.9985,0.9985
3,0.0132,0.009993,0.9991,0.9991,0.9991


model.safetensors:   0%|          | 0.00/654M [00:00<?, ?B/s]


=== UBC-NLP/MARBERT | seed=42 | fold=2/3 ===


Map:   0%|          | 0/6667 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/3333 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at UBC-NLP/MARBERT and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,F1 Micro
1,0.0643,0.029239,0.997,0.997003,0.997
2,0.0142,0.015229,0.9985,0.998501,0.9985
3,0.0093,0.017256,0.9976,0.997598,0.9976



=== UBC-NLP/MARBERT | seed=42 | fold=3/3 ===


Map:   0%|          | 0/6667 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/3333 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at UBC-NLP/MARBERT and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,F1 Micro
1,0.0744,0.03737,0.9958,0.995798,0.9958
2,0.0142,0.020932,0.9973,0.997299,0.9973
3,0.0093,0.015015,0.9979,0.9979,0.9979



=== xlm-roberta-base | seed=42 | fold=1/3 ===


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

Map:   0%|          | 0/6666 [00:00<?, ? examples/s]

Map:   0%|          | 0/3334 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,F1 Micro
1,1.0529,0.346339,0.966407,0.966391,0.966407
2,0.0816,0.022621,0.9979,0.997897,0.9979
3,0.026,0.013114,0.9982,0.998197,0.9982



=== xlm-roberta-base | seed=42 | fold=2/3 ===


Map:   0%|          | 0/6667 [00:00<?, ? examples/s]

Map:   0%|          | 0/3333 [00:00<?, ? examples/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,F1 Micro
1,0.939,0.296673,0.972397,0.972133,0.972397
2,0.0761,0.022498,0.9976,0.997606,0.9976
3,0.0256,0.014338,0.9979,0.997902,0.9979



=== xlm-roberta-base | seed=42 | fold=3/3 ===


Map:   0%|          | 0/6667 [00:00<?, ? examples/s]

Map:   0%|          | 0/3333 [00:00<?, ? examples/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,F1 Micro
1,1.0515,0.340718,0.969697,0.969598,0.969697
2,0.0652,0.028613,0.9961,0.9961,0.9961
3,0.0167,0.020512,0.9967,0.9967,0.9967


Unnamed: 0_level_0,model,accuracy,accuracy,f1_macro,f1_macro,f1_micro,f1_micro
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,std,mean,std,mean,std
0,UBC-NLP/MARBERT,0.9985,0.0006,0.9985,0.0006,0.9985,0.0006
1,aubmindlab/bert-base-arabertv2,0.9956,0.002425,0.9956,0.002425,0.9956,0.002425
2,xlm-roberta-base,0.9976,0.000794,0.9976,0.000793,0.9976,0.000794


In [9]:
# ===============================
# 6) Pairwise bootstrap significance (ΔF1_macro) between models
# ===============================
import numpy as np, pandas as pd
from sklearn.metrics import precision_recall_fscore_support

cv_df = pd.read_csv(f"{OUTPUT_ROOT}/{RUN_TAG}/cv_results.csv")
models_unique = cv_df["model"].unique().tolist()

def paired_bootstrap_f1(y_true, pred_a, pred_b, B=2000):
    y_true = np.array(y_true); pa = np.array(pred_a); pb = np.array(pred_b)
    n = len(y_true)
    rng = np.random.default_rng(2024)
    def f1_macro(y, p):
        _, _, f1s, _ = precision_recall_fscore_support(y, p, labels=np.unique(y))
        return float(np.mean(f1s))
    diffs = []
    for _ in range(B):
        idx = rng.integers(0, n, n)
        diffs.append(f1_macro(y_true[idx], pa[idx]) - f1_macro(y_true[idx], pb[idx]))
    diffs = np.array(diffs)
    lo, hi = np.quantile(diffs, [0.025, 0.975])
    return float(diffs.mean()), float(lo), float(hi)

stats_rows = []
for i in range(len(models_unique)):
    for j in range(i+1, len(models_unique)):
        A, Bm = models_unique[i], models_unique[j]
        diffs = []
        for seed in SEEDS:
            for fold in range(KFOLDS):
                keyA = f"{A}|seed{seed}|fold{fold}"
                keyB = f"{Bm}|seed{seed}|fold{fold}"
                if keyA in store_preds and keyB in store_preds:
                    y_true = store_preds[keyA]["val_true"]
                    pa = store_preds[keyA]["val_pred"]
                    pb = store_preds[keyB]["val_pred"]
                    mean_d, lo, hi = paired_bootstrap_f1(y_true, pa, pb, B=1500 if LIGHT_RUN else 3000)
                    diffs.append((mean_d, lo, hi))
        if diffs:
            mean_over = float(np.mean([d[0] for d in diffs]))
            lo_over   = float(np.mean([d[1] for d in diffs]))
            hi_over   = float(np.mean([d[2] for d in diffs]))
            stats_rows.append({"A":A, "B":Bm, "ΔF1_macro_mean": mean_over, "CI_low": lo_over, "CI_high": hi_over})

stats_df = pd.DataFrame(stats_rows)
stats_df.to_csv(f"{OUTPUT_ROOT}/{RUN_TAG}/pairwise_bootstrap_macroF1.csv", index=False)
stats_df


Unnamed: 0,A,B,ΔF1_macro_mean,CI_low,CI_high
0,aubmindlab/bert-base-arabertv2,UBC-NLP/MARBERT,-0.002903,-0.005149,-0.000761
1,aubmindlab/bert-base-arabertv2,xlm-roberta-base,-0.001994,-0.004482,0.000517
2,UBC-NLP/MARBERT,xlm-roberta-base,0.000909,-0.001064,0.002963


In [10]:
# ===============================
# 7) Retrain best model on all data & export (Torch/ONNX/int8)
# ===============================
best_model = summary.sort_values(("f1_macro","mean"), ascending=False)["model"].iloc[0]
best_model


'UBC-NLP/MARBERT'

In [13]:
# ===============================
# 7) Retrain best model on all data & export (Torch/ONNX/int8)
# يعتمد على المتغيرات المعرفة سابقاً:
# best_model, OUTPUT_ROOT, RUN_TAG, df, labels, id2label, label2id,
# USE_CLASS_WEIGHTS, LR, BATCH, MAX_EPOCHS, PATIENCE
# ===============================

import os, json, numpy as np, torch
from datasets import Dataset
from sklearn.model_selection import StratifiedKFold
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import accuracy_score, f1_score
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    DataCollatorWithPadding, TrainingArguments, Trainer,
    EarlyStoppingCallback
)

# مجلد الإخراج للنموذج النهائي
final_dir = f"{OUTPUT_ROOT}/{RUN_TAG}/final_{best_model.replace('/','_')}"
os.makedirs(final_dir, exist_ok=True)

# تحميل التوكنايزر + إعداد طول أقصى مناسب للأوامر القصيرة
tok = AutoTokenizer.from_pretrained(best_model)
MAXLEN = 64
if hasattr(tok, "model_max_length"):
    tok.model_max_length = MAXLEN

# تقسيم صغير داخلي لأجل EarlyStopping فقط
skf = StratifiedKFold(n_splits=8, shuffle=True, random_state=999)
tr_idx, va_idx = list(skf.split(df["text_norm"], df["label"]))[0]

train_ds = Dataset.from_pandas(df.iloc[tr_idx][["text_norm","label"]].rename(columns={"text_norm":"text"}))
val_ds   = Dataset.from_pandas(df.iloc[va_idx][["text_norm","label"]].rename(columns={"text_norm":"text"}))

def tok_fn(b):
    return tok(b["text"], truncation=True, max_length=MAXLEN)

train_ds = train_ds.map(tok_fn, batched=True, remove_columns=["text"])
val_ds   = val_ds.map(tok_fn,   batched=True, remove_columns=["text"])

data_collator = DataCollatorWithPadding(tokenizer=tok)

# تحميل النموذج
model = AutoModelForSequenceClassification.from_pretrained(
    best_model, num_labels=len(labels), id2label=id2label, label2id=label2id
)

# أوزان الفئات (اختياري)
loss_weights = None
if USE_CLASS_WEIGHTS:
    y = df.iloc[tr_idx]["label"].values
    cw = compute_class_weight("balanced", classes=np.arange(len(labels)), y=y)
    loss_weights = torch.tensor(cw, dtype=torch.float)

# ✅ دالة المقاييس لإنتاج eval_f1_macro المطلوبة لـ metric_for_best_model
def compute_metrics_builder():
    def compute_metrics(eval_pred):
        logits, labels_np = eval_pred
        preds = np.argmax(logits, axis=-1)
        return {
            "accuracy": float(accuracy_score(labels_np, preds)),
            "f1_macro": float(f1_score(labels_np, preds, average="macro")),
            "f1_micro": float(f1_score(labels_np, preds, average="micro")),
        }
    return compute_metrics

# ✅ تعديل توقيع compute_loss ليتقبل **kwargs (مثل num_items_in_batch)
def compute_loss(model, inputs, return_outputs=False, **kwargs):
    labels_t = inputs.get("labels")
    outputs = model(**{k: v for k, v in inputs.items() if k != "labels"})
    logits = outputs.logits
    loss_fct = torch.nn.CrossEntropyLoss(
        weight=(loss_weights.to(logits.device) if loss_weights is not None else None)
    )
    loss = loss_fct(logits, labels_t)
    return (loss, outputs) if return_outputs else loss

args = TrainingArguments(
    output_dir=final_dir,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",   # ✅ سيبحث عن eval_f1_macro
    greater_is_better=True,
    learning_rate=LR,
    per_device_train_batch_size=BATCH,
    per_device_eval_batch_size=BATCH,
    num_train_epochs=MAX_EPOCHS,
    warmup_ratio=0.1,
    weight_decay=0.01,
    fp16=torch.cuda.is_available(),
    logging_steps=50,
    report_to="none",
    seed=2025
)

trainer = Trainer(
    model=model, args=args,
    train_dataset=train_ds, eval_dataset=val_ds,
    tokenizer=tok, data_collator=data_collator,
    compute_metrics=compute_metrics_builder(),               # ✅ مهم لتوليد eval_f1_macro
    callbacks=[EarlyStoppingCallback(early_stopping_patience=PATIENCE)]
)
trainer.compute_loss = compute_loss

# تدريب النموذج النهائي
trainer.train()

# حفظ (Torch) + التوكنايزر + خرائط التسميات
trainer.save_model(final_dir)
tok.save_pretrained(final_dir)
with open(os.path.join(final_dir, "label_map.json"), "w", encoding="utf-8") as f:
    json.dump(
        {
            "labels": labels,
            "label2id": {k: int(v) for k, v in label2id.items()},
            "id2label": {int(k): v for k, v in id2label.items()},
        },
        f, ensure_ascii=False, indent=2
    )

# تصدير ONNX (مع محاور ديناميكية) + تكميم INT8 اختياري
dummy = tok("اختبار", return_tensors="pt", truncation=True, max_length=MAXLEN)
model.to("cpu").eval()
onnx_path = os.path.join(final_dir, "model.onnx")
torch.onnx.export(
    model,
    (dummy["input_ids"], dummy["attention_mask"]),
    onnx_path,
    input_names=["input_ids","attention_mask"],
    output_names=["logits"],
    dynamic_axes={
        "input_ids": {0: "batch", 1: "seq"},
        "attention_mask": {0: "batch", 1: "seq"},
        "logits": {0: "batch"}
    },
    opset_version=17
)

# التكميم (اختياري)
try:
    from onnxruntime.quantization import quantize_dynamic, QuantType
    quantize_dynamic(onnx_path, os.path.join(final_dir, "model.int8.onnx"), weight_type=QuantType.QInt8)
except Exception as e:
    print("Quantization skipped:", e)

final_dir


Map:   0%|          | 0/8750 [00:00<?, ? examples/s]

Map:   0%|          | 0/1250 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at UBC-NLP/MARBERT and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,F1 Micro
1,0.0458,0.019563,0.9984,0.998401,0.9984
2,0.0195,0.007928,0.9984,0.998401,0.9984
3,0.0065,0.005275,0.9992,0.999203,0.9992




'/content/bert_ar_commands_benchmark/20251020_101222/final_UBC-NLP_MARBERT'

In [23]:
# ------------------------------------------
# 🧹 تنظيف مساحة كولاب واسترجاع المساحة المفقودة
# ------------------------------------------
import shutil, os, subprocess

paths_to_clean = [
    "/root/.cache/huggingface",
    "/root/.cache/torch",
    "/root/.cache/pip",
    "/content/sample_data",
    "/content/__pycache__",
    "/content/cache",
]

for p in paths_to_clean:
    if os.path.exists(p):
        try:
            shutil.rmtree(p)
            print(f"🧹 Deleted: {p}")
        except Exception as e:
            print(f"⚠️ Skip {p}: {e}")

# تنظيف النماذج والملفات الضخمة التي تم توليدها مؤقتاً
os.system("rm -rf /content/*.zip /content/*.pt /content/*.bin /content/*.safetensors /content/checkpoint*")
os.system("rm -rf /tmp/*")

print("\n📊 المساحة بعد التنظيف:\n")
print(subprocess.getoutput("df -h | head -n 5"))



📊 المساحة بعد التنظيف:

Filesystem      Size  Used Avail Use% Mounted on
overlay         113G  106G  7.6G  94% /
tmpfs            64M     0   64M   0% /dev
shm             5.7G  4.0K  5.7G   1% /dev/shm
/dev/root       2.0G  1.2G  750M  62% /usr/sbin/docker-init


In [25]:
# ===============================
# 8) Export all results as ZIP (and copy to Drive)
# ===============================
!pip install -q pydrive2

import os, time, pathlib, fnmatch
from pydrive2.auth import GoogleAuth
from pydrive2.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# -------- إعدادات قابلة للتعديل --------
TOP_FOLDER_NAME = "BERT_final_model"   # اسم مجلد الوجهة في My Drive
UPLOAD_MINIMAL  = True                 # True = ارفع الملفات الضرورية فقط
MIN_PATTERNS = [                       # ما الذي يُرفع في الوضع المصغّر
    "model.int8.onnx", "model.onnx",
    "tokenizer.json", "tokenizer_config.json", "special_tokens_map.json",
    "vocab.txt", "merges.txt", "sentencepiece.*",
    "config.json", "label_map.json"
]
# ---------------------------------------

# ✅ تأكيد وجود final_dir
try:
    final_dir
except NameError:
    raise RuntimeError("🔴 المتغير final_dir غير معرّف. شغّل خلية التدريب/التصدير أولاً.")
if not os.path.isdir(final_dir):
    raise RuntimeError(f"🔴 المسار غير موجود: {final_dir}")
print(f"✅ final_dir = {final_dir}")

# 🔐 مصادقة Google عبر آلية Colab (لا تحتاج client_secrets.json)
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)
print("✅ تم تسجيل الدخول إلى Google Drive عبر مصادقة Colab.")

# ---------- دوال مساعدة ----------
def ensure_folder(name, parent_id="root"):
    """إنشاء/إيجاد مجلد باسم name داخل parent_id وإرجاع folder_id"""
    q = (
        f"'{parent_id}' in parents and trashed=false "
        f"and mimeType='application/vnd.google-apps.folder' and title='{name}'"
    )
    items = drive.ListFile({'q': q}).GetList()
    if items:
        return items[0]['id']
    f = drive.CreateFile({
        'title': name,
        'mimeType': 'application/vnd.google-apps.folder',
        'parents': [{'id': parent_id}]
    })
    f.Upload()
    return f['id']

def upload_file(local_path, parent_id):
    """رفع ملف واحد إلى Drive (أجزاء 8MB لتقليل الذاكرة المؤقتة)"""
    f = drive.CreateFile({'title': os.path.basename(local_path),
                          'parents': [{'id': parent_id}]})
    f['chunksize'] = 8 * 1024 * 1024
    f.SetContentFile(local_path)
    f.Upload()
    return f['id']

def should_keep(relpath: str) -> bool:
    """تحديد ما إذا كان الملف يُرفع في وضع النسخة المصغّرة"""
    if not UPLOAD_MINIMAL:
        return True
    for pat in MIN_PATTERNS:
        if fnmatch.fnmatch(os.path.basename(relpath), pat) or fnmatch.fnmatch(relpath, pat):
            return True
    return False
# -----------------------------------

# إنشاء مسار الوجهة في Drive
top_id = ensure_folder(TOP_FOLDER_NAME, "root")
ts = time.strftime("%Y%m%d_%H%M%S")
export_id = ensure_folder(f"export_{ts}", top_id)
print(f"📁 Drive path: My Drive/{TOP_FOLDER_NAME}/export_{ts}")

# السير على محتويات final_dir ورفعها
base = pathlib.Path(final_dir)
folders_cache = {str(base): export_id}
files_count = 0

for p in base.rglob("*"):
    rel = str(p.relative_to(base))
    if p.is_dir():
        parent_local = str(p.parent)
        parent_id = folders_cache.get(parent_local, export_id)
        new_id = ensure_folder(p.name, parent_id)
        folders_cache[str(p)] = new_id
        continue

    # ملف
    if not should_keep(rel):
        continue
    parent_id = folders_cache.get(str(p.parent), export_id)
    files_count += 1
    print(f"⏫ [{files_count}] {rel}")
    upload_file(str(p), parent_id)

print(f"\n✅ تم الرفع: {files_count} ملف(ات).")
print("📎 ستجدها في: My Drive →", TOP_FOLDER_NAME, "→", f"export_{ts}")


✅ final_dir = /content/bert_ar_commands_benchmark/20251020_101222/final_UBC-NLP_MARBERT
✅ تم تسجيل الدخول إلى Google Drive عبر مصادقة Colab.
📁 Drive path: My Drive/BERT_final_model/export_20251020_122854
⏫ [1] vocab.txt
⏫ [2] model.int8.onnx
⏫ [3] config.json
⏫ [4] tokenizer.json
⏫ [5] tokenizer_config.json
⏫ [6] special_tokens_map.json
⏫ [7] label_map.json
⏫ [8] model.onnx
⏫ [9] checkpoint-822/vocab.txt
⏫ [10] checkpoint-822/config.json
⏫ [11] checkpoint-822/tokenizer.json
⏫ [12] checkpoint-822/tokenizer_config.json
⏫ [13] checkpoint-822/special_tokens_map.json
⏫ [14] checkpoint-274/vocab.txt
⏫ [15] checkpoint-274/config.json
⏫ [16] checkpoint-274/tokenizer.json
⏫ [17] checkpoint-274/tokenizer_config.json
⏫ [18] checkpoint-274/special_tokens_map.json
⏫ [19] checkpoint-548/vocab.txt
⏫ [20] checkpoint-548/config.json
⏫ [21] checkpoint-548/tokenizer.json
⏫ [22] checkpoint-548/tokenizer_config.json
⏫ [23] checkpoint-548/special_tokens_map.json

✅ تم الرفع: 23 ملف(ات).
📎 ستجدها في: My Driv

In [73]:
# ===============================
# 9) Quick inference function (loads final best model)
# ===============================
import json, numpy as np, torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

def normalize_ar(text: str) -> str:
    t = text.strip()
    t = re.sub(r"[\u0640]+", "", t)
    t = t.translate(str.maketrans("٠١٢٣٤٥٦٧٨٩","0123456789"))
    t = re.sub(r"[ـ]+", "", t)
    t = re.sub(r"\s+", " ", t)
    t = re.sub("[إأآا]", "ا", t)
    t = re.sub("ى", "ي", t)
    t = re.sub("ؤ", "و", t)
    t = re.sub("ئ", "ي", t)
    t = t.replace("ة", "ه")
    t = re.sub(r"[\u0610-\u061A\u064B-\u065F\u0670\u06D6-\u06ED]", "", t)
    t = re.sub(r"(.)\1{2,}", r"\1\1", t)
    return t

best_dir = final_dir  # from previous cell
tok = AutoTokenizer.from_pretrained(best_dir)
mdl = AutoModelForSequenceClassification.from_pretrained(best_dir)
id2label = mdl.config.id2label

def predict(text, threshold=0.8):
    t = normalize_ar(text)
    enc = tok(t, return_tensors="pt", truncation=True)
    with torch.no_grad():
        logits = mdl(**enc).logits
        probs = torch.softmax(logits, dim=-1).numpy().squeeze()
    idx = int(np.argmax(probs))
    conf = float(probs[idx])
    label = id2label[idx] if conf >= threshold else "UNKNOWN"
    return {"text": text, "norm": t, "label": label, "confidence": round(conf, 4)}

print(predict("كفي لورا"))



{'text': 'كفي لورا', 'norm': 'كفي لورا', 'label': 'BACKWARD', 'confidence': 0.996}


In [77]:
import pandas as pd
df_orig = pd.read_csv("/mnt/data/arabic_game_commands_10k.csv")  # ملفك
df_aug  = pd.read_csv("/mnt/data/arabic_game_commands_15k.csv")
df = pd.concat([df_orig, df_aug], ignore_index=True)
df.drop_duplicates(subset=["text_norm"], inplace=True)
df.to_csv("/mnt/data/arabic_game_commands_merged.csv", index=False, encoding="utf-8-sig")


FileNotFoundError: [Errno 2] No such file or directory: '/mnt/data/arabic_game_commands_10k.csv'