In [28]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

MODEL_NAME = "UBC-NLP/MARBERT"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=3)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at UBC-NLP/MARBERT and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [31]:
from datasets import Dataset
import pandas as pd

df = pd.read_parquet('/content/translated_data_cleaned.parquet')
dataset = Dataset.from_pandas(df)

from sklearn.model_selection import train_test_split

train_df, test_df = train_test_split(df, test_size=0.1, random_state=42, stratify=df['label'])
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

# Tokenization
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=256)

train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

train_dataset = train_dataset.remove_columns(["__index_level_0__"])
test_dataset = test_dataset.remove_columns(["__index_level_0__"])

train_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])
test_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])



Map:   0%|          | 0/45000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

In [33]:
from transformers import Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# ---------------- Training Arguments ----------------
training_args = TrainingArguments(
    output_dir="./marbert-egyptian-guard",
    learning_rate=2e-5,
    eval_strategy="steps",
    save_total_limit=2,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=1,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=50,
    no_cuda=False,
    save_strategy="steps",
    save_steps=500,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    report_to=[]
)

# ---------------- Compute Metrics ----------------
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = logits.argmax(axis=-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average="weighted")
    acc = accuracy_score(labels, predictions)
    return {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall}

# ---------------- Trainer ----------------
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)


  trainer = Trainer(


In [34]:

trainer.train()


Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
50,0.7759,0.541186,0.7364,0.733761,0.743951,0.7364
100,0.4349,0.429464,0.815,0.814085,0.815433,0.815
150,0.389,0.406145,0.8178,0.814578,0.83386,0.8178
200,0.3581,0.371954,0.8562,0.855774,0.858011,0.8562
250,0.3913,0.274882,0.8892,0.888661,0.891285,0.8892
300,0.3262,0.289903,0.8986,0.898388,0.898463,0.8986
350,0.3252,0.48367,0.8606,0.859633,0.869169,0.8606
400,0.3696,0.327232,0.8796,0.879249,0.885582,0.8796
450,0.3347,0.363473,0.8656,0.865122,0.867527,0.8656
500,0.3499,0.294276,0.9068,0.906799,0.907189,0.9068


TrainOutput(global_step=16875, training_loss=0.16355215939769038, metrics={'train_runtime': 10440.4221, 'train_samples_per_second': 12.931, 'train_steps_per_second': 1.616, 'total_flos': 1.776015569664e+16, 'train_loss': 0.16355215939769038, 'epoch': 3.0})

In [35]:

# حفظ الموديل النهائي
model.save_pretrained("./arabguard-egyptian-v1")
tokenizer.save_pretrained("./arabguard-egyptian-v1")

('./arabguard-egyptian-v1/tokenizer_config.json',
 './arabguard-egyptian-v1/special_tokens_map.json',
 './arabguard-egyptian-v1/vocab.txt',
 './arabguard-egyptian-v1/added_tokens.json',
 './arabguard-egyptian-v1/tokenizer.json')

In [37]:
import zipfile
import os

!zip -r arabguard-egyptian-v1.zip /content/arabguard-egyptian-v1

from google.colab import files
files.download('/content/arabguard-egyptian-v1.zip')

  adding: content/marbert-egyptian-guard/ (stored 0%)
  adding: content/marbert-egyptian-guard/checkpoint-14500/ (stored 0%)
  adding: content/marbert-egyptian-guard/checkpoint-14500/tokenizer_config.json (deflated 74%)
  adding: content/marbert-egyptian-guard/checkpoint-14500/training_args.bin (deflated 53%)
  adding: content/marbert-egyptian-guard/checkpoint-14500/scheduler.pt (deflated 61%)
  adding: content/marbert-egyptian-guard/checkpoint-14500/trainer_state.json (deflated 83%)
  adding: content/marbert-egyptian-guard/checkpoint-14500/config.json (deflated 55%)
  adding: content/marbert-egyptian-guard/checkpoint-14500/scaler.pt (deflated 64%)
  adding: content/marbert-egyptian-guard/checkpoint-14500/model.safetensors (deflated 7%)
  adding: content/marbert-egyptian-guard/checkpoint-14500/tokenizer.json (deflated 72%)
  adding: content/marbert-egyptian-guard/checkpoint-14500/rng_state.pth (deflated 26%)
  adding: content/marbert-egyptian-guard/checkpoint-14500/special_tokens_map.j

FileNotFoundError: Cannot find file: /content/marbert-egyptian-guard.zip

In [38]:
# ضغط الفولدر كله
!zip -r marbert-egyptian-guard.zip /content/marbert-egyptian-guard

# داونلود فوري
from google.colab import files
files.download("/content/marbert-egyptian-guard.zip")

  adding: content/marbert-egyptian-guard/ (stored 0%)
  adding: content/marbert-egyptian-guard/checkpoint-14500/ (stored 0%)
  adding: content/marbert-egyptian-guard/checkpoint-14500/tokenizer_config.json (deflated 74%)
  adding: content/marbert-egyptian-guard/checkpoint-14500/training_args.bin (deflated 53%)
  adding: content/marbert-egyptian-guard/checkpoint-14500/scheduler.pt (deflated 61%)
  adding: content/marbert-egyptian-guard/checkpoint-14500/trainer_state.json (deflated 83%)
  adding: content/marbert-egyptian-guard/checkpoint-14500/config.json (deflated 55%)
  adding: content/marbert-egyptian-guard/checkpoint-14500/scaler.pt (deflated 64%)
  adding: content/marbert-egyptian-guard/checkpoint-14500/model.safetensors (deflated 7%)
  adding: content/marbert-egyptian-guard/checkpoint-14500/tokenizer.json (deflated 72%)
  adding: content/marbert-egyptian-guard/checkpoint-14500/rng_state.pth (deflated 26%)
  adding: content/marbert-egyptian-guard/checkpoint-14500/special_tokens_map.j

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
!tar -czvf marbert-egyptian-guard.tar.gz /content/marbert-egyptian-guard


tar: Removing leading `/' from member names
/content/marbert-egyptian-guard/
/content/marbert-egyptian-guard/checkpoint-14500/
/content/marbert-egyptian-guard/checkpoint-14500/tokenizer_config.json
/content/marbert-egyptian-guard/checkpoint-14500/training_args.bin
/content/marbert-egyptian-guard/checkpoint-14500/scheduler.pt
/content/marbert-egyptian-guard/checkpoint-14500/trainer_state.json
/content/marbert-egyptian-guard/checkpoint-14500/config.json
/content/marbert-egyptian-guard/checkpoint-14500/scaler.pt
/content/marbert-egyptian-guard/checkpoint-14500/model.safetensors


In [None]:
from google.colab import files
files.download("marbert-egyptian-guard.tar.gz")
