In [2]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

MODEL_NAME = "microsoft/mdeberta-v3-base"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=3)


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/mdeberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
from datasets import Dataset
import pandas as pd

df = pd.read_parquet('/content/final_training_dataset.parquet')
dataset = Dataset.from_pandas(df)

from sklearn.model_selection import train_test_split

train_df, test_df = train_test_split(df, test_size=0.1, random_state=42, stratify=df['label'])
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

# Tokenization
def tokenize_function(examples):
    return tokenizer(examples["input_text"], padding="max_length", truncation=True, max_length=256)

train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

train_dataset = train_dataset.remove_columns(["__index_level_0__"])
test_dataset = test_dataset.remove_columns(["__index_level_0__"])

train_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])
test_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])



Map:   0%|          | 0/104409 [00:00<?, ? examples/s]

Map:   0%|          | 0/11602 [00:00<?, ? examples/s]

In [11]:
from transformers import Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_recall_fscore_support


# ---------------- Training Arguments ----------------
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    eval_strategy="steps",
    save_total_limit=2,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=1,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=50,
    no_cuda=False,
    save_strategy="steps",
    save_steps=500,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    report_to=[]
)

# ---------------- Compute Metrics ----------------
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = logits.argmax(axis=-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average="weighted")
    acc = accuracy_score(labels, predictions)
    return {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall}

# ---------------- Trainer ----------------
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)


  trainer = Trainer(


In [None]:
trainer.train()


Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
50,0.5265,0.480112,0.851577,0.850242,0.85468,0.851577
100,0.3327,0.439476,0.875366,0.875316,0.875683,0.875366
150,0.4349,0.321648,0.886571,0.88673,0.886993,0.886571
200,0.3634,0.556809,0.790726,0.785101,0.846873,0.790726
250,0.3558,0.331278,0.850198,0.849597,0.856849,0.850198
300,0.3793,0.297477,0.889243,0.888992,0.891694,0.889243
350,0.2293,0.354349,0.900879,0.900449,0.901741,0.900879
400,0.3552,0.272626,0.902,0.9013,0.905095,0.902
450,0.3173,0.284956,0.909154,0.909122,0.909231,0.909154
500,0.3194,0.27529,0.912774,0.912404,0.913912,0.912774


In [None]:
trainer.save_model("arabguard-mdeberta")
tokenizer.save_pretrained("arabguard-mdeberta")
