In [1]:
import torch
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import pandas as pd
import sqlite3
import re
from dateutil.parser import parse

# Determine the device to use (GPU if available, otherwise CPU)
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

Using device: cuda


In [2]:
from transformers import pipeline
import torch

# Load a high-performance multilingual zero-shot classification model with strong Arabic support
classifier = pipeline(
    "zero-shot-classification",
    model="MoritzLaurer/mDeBERTa-v3-base-mnli-xnli",  # Top multilingual model for zero-shot tasks with excellent Arabic performance
    device=0 if torch.cuda.is_available() else -1,  # Utilize GPU for faster inference
    model_kwargs={"ignore_mismatched_sizes": True}  # Ignore optional files that may cause 404 errors (fixes the additional_chat_templates issue)
)

# Define the supported intent categories for the virtual assistant
candidate_labels = [
    "greeting",
    "add_reminder",
    "view_reminders",
    "general_question",
    "delete_reminder"
]

# Test with an English input
sample_text = "Remind me to call Ahmed tomorrow at 5 PM"
result = classifier(sample_text, candidate_labels, multi_label=False)
print(f"English Detected Intent: {result['labels'][0]} (Confidence: {result['scores'][0]:.2f})")

# Test with an Arabic input
arabic_text = "ذكرني أتصل بأحمد غدا الساعة 5 مساء"
arabic_result = classifier(arabic_text, candidate_labels, multi_label=False)
print(f"Arabic Detected Intent: {arabic_result['labels'][0]} (Confidence: {arabic_result['scores'][0]:.2f})")

Device set to use cuda:0


English Detected Intent: add_reminder (Confidence: 0.45)
Arabic Detected Intent: add_reminder (Confidence: 0.40)


In [3]:
from datasets import load_dataset

# Load the complete dataset from local Parquet files
full_dataset = load_dataset("parquet", data_files="massive_data/**/*.parquet")["train"]

# Filter for Arabic examples only (locale = "ar-SA")
arabic_train = full_dataset.filter(lambda example: example["locale"] == "ar-SA")

# Create a DatasetDict containing only the Arabic training split
# Note: Validation and test splits are not present in the loaded file; training data alone is sufficient for fine-tuning
massive_arabic = {
    "train": arabic_train
}

# Display loading summary
print("MASSIVE Arabic dataset loaded successfully from local Parquet files.")
print(f"Number of training examples: {len(massive_arabic['train'])}")

# Display available columns for verification
print("Available columns:", full_dataset.column_names)

# Retrieve intent label names for mapping
intent_names = full_dataset.features["intent"].names

# Display a safe sample from the Arabic training data
sample = massive_arabic["train"][0]

print("\n--- Sample Arabic Example ---")
print("Utterance:", sample["utt"])
print("Intent ID:", sample["intent"])
print("Intent Name:", intent_names[sample["intent"]])
print("Annotated Utterance (with slots):", sample.get("annot_utt", "Not available"))
print("Slots:", sample.get("slots", "Not available in this version"))

MASSIVE Arabic dataset loaded successfully from local Parquet files.
Number of training examples: 11514
Available columns: ['id', 'locale', 'partition', 'scenario', 'intent', 'utt', 'annot_utt', 'worker_id', 'slot_method', 'judgments']

--- Sample Arabic Example ---
Utterance: صحيني تسعة الصباح يوم الجمعة
Intent ID: 48
Intent Name: alarm_set
Annotated Utterance (with slots): صحيني [time : تسعة الصباح] يوم [date : الجمعة]
Slots: Not available in this version


In [4]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset
import torch

# Load the full dataset from local Parquet files
full_dataset = load_dataset("parquet", data_files="massive_data/**/*.parquet")["train"]

# Filter for Arabic examples
arabic_dataset = full_dataset.filter(lambda example: example["locale"] == "ar-SA")

# Map MASSIVE intents to our custom 5 categories
intent_mapping = {
    "alarm_set": "add_reminder",
    "alarm_query": "view_reminders",
    "reminder_set": "add_reminder",
    "reminder_query": "view_reminders",
    "calendar_set": "add_reminder",
    "calendar_query": "view_reminders",
    "greeting": "greeting",
    "general_quirky": "general_question",
    "general_question": "general_question",
    # Default fallback
}

def map_intent(example):
    original_intent = full_dataset.features["intent"].names[example["intent"]]
    example["label_str"] = intent_mapping.get(original_intent, "general_question")
    return example

arabic_dataset = arabic_dataset.map(map_intent)

# Create label to ID mapping
unique_labels = sorted(set(arabic_dataset["label_str"]))
label2id = {label: id for id, label in enumerate(unique_labels)}
id2label = {id: label for label, id in label2id.items()}

# Add numeric label
arabic_dataset = arabic_dataset.map(lambda example: {"label": label2id[example["label_str"]]})

# Load Arabic tokenizer and model
model_name = "aubmindlab/bert-base-arabertv2"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_function(examples):
    return tokenizer(examples["utt"], padding="max_length", truncation=True, max_length=128)

tokenized_dataset = arabic_dataset.map(tokenize_function, batched=True)

# Load model with correct number of labels
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(unique_labels),
    id2label=id2label,
    label2id=label2id
)

# Training arguments (updated for latest Transformers version)
training_args = TrainingArguments(
    output_dir="./arabic_finetuned_model",
    num_train_epochs=4,
    per_device_train_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=50,
    eval_strategy="no",  # Fixed: changed from evaluation_strategy
    save_strategy="epoch",
    load_best_model_at_end=False,
    fp16=True if torch.cuda.is_available() else False  # Mixed precision on GPU
)

# Initialize trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset
)

# Start fine-tuning
print("Starting fine-tuning on Arabic MASSIVE data (11514 examples)...")
trainer.train()

# Save the fine-tuned model and tokenizer
model.save_pretrained("./arabic_finetuned_model")
tokenizer.save_pretrained("./arabic_finetuned_model")

print("Fine-tuning completed successfully! Model saved in ./arabic_finetuned_model")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting fine-tuning on Arabic MASSIVE data (11514 examples)...


Step,Training Loss
50,1.0076
100,0.5287
150,0.4534
200,0.3672
250,0.2937
300,0.2668
350,0.2392
400,0.3112
450,0.2428
500,0.2247


Fine-tuning completed successfully! Model saved in ./arabic_finetuned_model
