In [None]:
!pip install accelerate -U

In [None]:
!pip install transformers

In [None]:
!pip install evaluate

In [None]:
!pip install trl peft

In [None]:
!pip install -U bitsandbytes


In [None]:
import pandas as pd
import numpy as np
from transformers import (
    AutoTokenizer,
    Trainer,
    AutoModelForSeq2SeqLM,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    EarlyStoppingCallback,
    set_seed
)

import torch
from torch.utils.tensorboard import SummaryWriter
from peft import get_peft_model, LoraConfig, prepare_model_for_kbit_training
from transformers import BitsAndBytesConfig
# from trl import 
from torch.utils.data import Dataset, DataLoader
import evaluate


In [None]:
# Load the parallel corpus from an Excel sheet.
# Later cells read the "msa" and "hijazi" columns by name and also index
# columns 0, 1 and 2 by position, so the sheet has to satisfy both.
# NOTE(review): "<Dataset name>" is a placeholder — replace it with the real file path.
df = pd.read_excel("<Dataset name>")

In [None]:
# bitsandbytes settings used when the base model is loaded: 4-bit weights,
# with the compute dtype given by name as float16.
quantization = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype="float16",
)

In [None]:

# Hub checkpoint shared by the model and its tokenizer.
_checkpoint = "PRAli22/arat5-arabic-dialects-translation"

# Seq2seq model loaded with the quantization settings defined in `quantization`.
model = AutoModelForSeq2SeqLM.from_pretrained(
    _checkpoint,
    quantization_config=quantization,
)

# The name `tokonizer` is referenced by later cells, so it is kept unchanged.
tokonizer = AutoTokenizer.from_pretrained(_checkpoint)


In [None]:
# Display the loaded model object as the cell output.
model

In [None]:
# Print the model configuration for inspection.
print(model.config)


In [None]:
from torch.nn import Dropout

new_dropout_rate = 0.4

# Collect every Dropout layer in the model, then overwrite its drop probability.
dropout_layers = [layer for layer in model.modules() if isinstance(layer, Dropout)]
for layer in dropout_layers:
    layer.p = new_dropout_rate

In [None]:
# Store the 0.4 dropout rate on the model config as well.
# NOTE(review): presumably this keeps the config in line with the per-layer
# override done in the previous cell — confirm anything still reads this field.
model.config.dropout_rate = 0.4

In [None]:
# LoRA configuration
#
# task_type has to be one of PEFT's TaskType values; "SEQ_2_SEQ_LM" is the one for
# encoder-decoder language models. The previous value "Seq2Seq" is not a valid
# task type: depending on the installed PEFT version it is either rejected with a
# ValueError or the model is wrapped in the generic PeftModel instead of the
# seq2seq-specific wrapper.
lora_config = LoraConfig(
    r=18,
    lora_alpha=32,
    lora_dropout=0.50,
    bias="none",
    task_type="SEQ_2_SEQ_LM",
    # Module names that receive LoRA adapters.
    target_modules=["k", "v", "q", "o", "wi_0", "wi_1", "wo", "lm_head"],
)

# Prepare the quantized base model for training, then attach the adapters.
model = prepare_model_for_kbit_training(model)
model_with_lora = get_peft_model(model, lora_config)


In [None]:
def _mean_word_count(column):
    """Return the mean number of whitespace-separated words per entry of ``column``."""
    return column.apply(lambda sentence: len(sentence.split())).mean()


msa_length = _mean_word_count(df["msa"])
hijazi_length = _mean_word_count(df["hijazi"])

In [None]:
# Show the average words per sentence for the "msa" and "hijazi" columns.
# These are whitespace word counts, not tokenizer token counts.
msa_length, hijazi_length

In [None]:
class CustomDataset(Dataset):
    """Translation examples in every direction between MSA, Janoubiyah and Hijazi.

    Each row of ``df`` produces six (source, target) examples, one per ordered
    pair of the three varieties. Columns are read by position: 0 = MSA,
    1 = Janoubiyah, 2 = Hijazi. Every source text is prefixed with an Arabic
    instruction that names the translation direction.

    Args:
        df: DataFrame whose first three columns hold the parallel sentences.
        tokenizer: Callable used to encode source and target texts. It is called
            with ``max_length``, ``padding="max_length"``, ``truncation=True`` and
            ``return_tensors="pt"`` and must return ``input_ids`` and
            ``attention_mask``.
        max_length: Length every encoded sequence is padded or truncated to.
        label_pad_token_id: Optional value written over the padding positions of
            ``labels`` (positions equal to ``tokenizer.pad_token_id``). ``None``
            (the default) leaves the tokenizer's pad id in place. Passing ``-100``
            is the usual way to keep padding out of the loss (assumes the model's
            loss ignores -100 — confirm for the model in use); any code that
            decodes the labels afterwards must first map that value back to a
            valid token id.
    """

    def __init__(self, df, tokenizer, max_length=50, label_pad_token_id=None):
        self.src = []
        self.dest = []
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.label_pad_token_id = label_pad_token_id

        # Walk the three columns once instead of doing a scalar df.iloc lookup
        # for every generated example.
        rows = zip(df.iloc[:, 0], df.iloc[:, 1], df.iloc[:, 2])
        for msa, janoubi, hijazi in rows:
            # Add translation pairs
            self.src.append(f"ترجم من حجازي الى فصحى: {hijazi}")  # Hijazi to MSA
            self.dest.append(msa)  # MSA

            self.src.append(f"ترجم من فصحى الى حجازي: {msa}")  # MSA to Hijazi
            self.dest.append(hijazi)  # Hijazi

            self.src.append(f"ترجم من جنوبي الى فصحى: {janoubi}")  # Janoubiyah to MSA
            self.dest.append(msa)  # MSA

            self.src.append(f"ترجم من فصحى الى جنوبي: {msa}")  # MSA to Janoubiyah
            self.dest.append(janoubi)  # Janoubiyah

            self.src.append(f"ترجم من جنوبي الى حجازي: {janoubi}")  # Janoubiyah to Hijazi
            self.dest.append(hijazi)  # Hijazi

            self.src.append(f"ترجم من حجازي الى جنوبي: {hijazi}")  # Hijazi to Janoubiyah
            self.dest.append(janoubi)  # Janoubiyah

    def __len__(self):
        """Return the number of generated examples (six per DataFrame row)."""
        return len(self.src)

    def _encode(self, text):
        """Tokenize ``text`` into fixed-length tensors with a leading batch dimension."""
        return self.tokenizer(
            text,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` for example ``idx``."""
        inputs = self._encode(self.src[idx])
        label_ids = self._encode(self.dest[idx])["input_ids"].squeeze(0)

        if self.label_pad_token_id is not None:
            # Opt-in: overwrite padding positions in the target sequence.
            label_ids = label_ids.masked_fill(
                label_ids == self.tokenizer.pad_token_id,
                self.label_pad_token_id,
            )

        # squeeze(0) drops the batch dimension added by return_tensors="pt".
        return {
            "input_ids": inputs["input_ids"].squeeze(0),
            "attention_mask": inputs["attention_mask"].squeeze(0),
            "labels": label_ids
        }

In [None]:
# Build the full example set from the DataFrame, using the default max_length of 50.
dataset = CustomDataset(df, tokonizer)

In [None]:
# Inspect the first encoded example (input_ids, attention_mask, labels).
dataset[0]

In [None]:
# 80/20 train/test split over the generated examples.
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size

# The global seed is only set in a later cell, so the split gets its own seeded
# generator; without it the partition changes on every run.
# NOTE(review): the split is per example, not per DataFrame row. Each row yields
# six examples built from the same three sentences, so the same sentences can end
# up in both train and test — consider splitting the rows before building examples.
split_generator = torch.Generator().manual_seed(16)
train, test = torch.utils.data.random_split(
    dataset,
    [train_size, test_size],
    generator=split_generator,
)

In [None]:
bleu_metric = evaluate.load("bleu")
writer = SummaryWriter()


def compute_cost(eval_pred):
    """Compute BLEU for one evaluation pass and log it to TensorBoard.

    Args:
        eval_pred: Object exposing ``predictions`` (generated token ids, or a
            tuple whose first element holds them) and ``label_ids`` (reference
            token ids).

    Returns:
        The dictionary returned by the BLEU metric; it includes the ``"bleu"``
        score and the per-n-gram ``"precisions"`` list.
    """
    preds, trues = eval_pred.predictions, eval_pred.label_ids
    if isinstance(preds, tuple):
        # Some models return extra outputs next to the token ids; keep the ids only.
        preds = preds[0]
    preds = np.asarray(preds)
    trues = np.asarray(trues)

    pad_id = tokonizer.pad_token_id
    # Ids outside the tokenizer vocabulary cannot be decoded, so they are
    # replaced with the pad id, which skip_special_tokens then drops.
    pred_ids = np.where((preds >= 0) & (preds < tokonizer.vocab_size), preds, pad_id)
    # Labels may carry negative ignore markers such as -100; map those to pad too.
    true_ids = np.where(trues >= 0, trues, pad_id)

    preds_text = tokonizer.batch_decode(pred_ids.tolist(), skip_special_tokens=True)
    trues_text = tokonizer.batch_decode(true_ids.tolist(), skip_special_tokens=True)

    bleu = bleu_metric.compute(
        predictions=preds_text,
        references=[[reference] for reference in trues_text],
    )

    # This function is not told the training step, so the number of evaluations
    # run so far is used as the TensorBoard x-axis. Without a global_step every
    # point would be written at the same step.
    compute_cost.eval_round = getattr(compute_cost, "eval_round", 0) + 1
    step = compute_cost.eval_round

    writer.add_scalar(tag="eval/bleu", scalar_value=bleu["bleu"], global_step=step)
    for idx, precision in enumerate(bleu["precisions"]):
        writer.add_scalar(tag=f"precisions{idx+1}-gram", scalar_value=precision, global_step=step)
    return bleu


In [None]:

import warnings

# Suppress FutureWarning messages for the rest of the session.
warnings.filterwarnings("ignore", category=FutureWarning)


In [None]:
# Fix the random seed for the training run that follows.
# Randomness used by cells that already ran before this one is not covered by this call.
set_seed(16)

In [None]:
import inspect

# Newer transformers releases rename `evaluation_strategy` to `eval_strategy` and
# the trainer's `tokenizer` argument to `processing_class`, deprecating the old
# names. transformers is installed unpinned, so the accepted keyword is picked
# from the installed signature and this cell runs on either side of the rename.
_args_params = inspect.signature(Seq2SeqTrainingArguments.__init__).parameters
_eval_strategy_key = (
    "eval_strategy" if "eval_strategy" in _args_params else "evaluation_strategy"
)

trainerarg = Seq2SeqTrainingArguments(
    output_dir="Chickpoints",
    save_strategy="steps",
    num_train_epochs=50,
    logging_dir='logging/',
    logging_steps=100,
    save_steps=5000,
    # Evaluate with generate() so compute_metrics receives generated token ids.
    predict_with_generate=True,
    remove_unused_columns=False,
    learning_rate=0.0003,
    # NOTE(review): 0.80 is an unusually large weight decay — confirm it is intended.
    weight_decay=0.80,
    lr_scheduler_type="linear",
    warmup_ratio=0.05,
    generation_max_length=50,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    # Run evaluation once per epoch.
    **{_eval_strategy_key: "epoch"},
)

_trainer_params = inspect.signature(Seq2SeqTrainer.__init__).parameters
_tokenizer_key = (
    "processing_class" if "processing_class" in _trainer_params else "tokenizer"
)

trainer = Seq2SeqTrainer(
    model=model_with_lora,
    args=trainerarg,
    train_dataset=train,
    eval_dataset=test,
    compute_metrics=compute_cost,
    **{_tokenizer_key: tokonizer},
)


In [None]:
# Start fine-tuning with the trainer configured in the previous cell.
trainer.train()