In [1]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch

# French sentence translated by both checkpoints for a side-by-side look.
text_fr = "Il va pleuvoir demain."

# A single tokenizer (from the base checkpoint) is used for both models.
tokenizer = AutoTokenizer.from_pretrained("facebook/nllb-200-distilled-600M")
tokenizer.src_lang = "fra_Latn"
inputs = tokenizer(text_fr, return_tensors="pt", padding=True, truncation=True)

# ID of the Darija language token (`ary_Arab`), passed to `generate` as the
# forced start token; the same generation settings are used for both models.
forced_bos_token_id = tokenizer.convert_tokens_to_ids("ary_Arab")
generation_kwargs = dict(forced_bos_token_id=forced_bos_token_id, max_length=128)

# A - original checkpoint
model_base = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-600M")
outputs_base = model_base.generate(**inputs, **generation_kwargs)
translation_base = tokenizer.decode(outputs_base[0], skip_special_tokens=True)

# B - LoRA fine-tuned checkpoint
model_lora = AutoModelForSeq2SeqLM.from_pretrained("nllb-darija-lora-model")
outputs_lora = model_lora.generate(**inputs, **generation_kwargs)
translation_lora = tokenizer.decode(outputs_lora[0], skip_special_tokens=True)

# Show both translations
print("🔹 Base     :", translation_base)
print("🔸 LoRA     :", translation_lora)


  from .autonotebook import tqdm as notebook_tqdm


🔹 Base     : غاديا غاديا يطيح.
🔸 LoRA     : غادي الشتا غدّا.


In [3]:
from datasets import load_dataset
from evaluate import load
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Load the shared tokenizer and both checkpoints (base and LoRA fine-tuned)
tokenizer = AutoTokenizer.from_pretrained("facebook/nllb-200-distilled-600M")
model_base = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-600M")
model_lora = AutoModelForSeq2SeqLM.from_pretrained("nllb-darija-lora-model")

# BLEU metric (the "sacrebleu" metric loaded through `evaluate`)
bleu = load("sacrebleu")

# Evaluation sample: the first 100 rows of the JSON file.
# NOTE(review): these rows come from the `train` split; if the LoRA model was
# fine-tuned on this same file, the comparison is biased in its favour —
# confirm, and prefer a held-out split if so.
dataset = load_dataset("json", data_files="all_translations_dataset.json", split="train[:100]")

preds_base, preds_lora, refs = [], [], []

tokenizer.src_lang = "fra_Latn"
forced_bos_token_id = tokenizer.convert_tokens_to_ids("ary_Arab")


def generate_translation(model, inputs):
    """Generate with `model` from tokenized `inputs` and return the decoded text.

    Args:
        model: Model whose `generate` method is called.
        inputs: Mapping of tokenizer outputs, unpacked into `generate`.

    Returns:
        The first generated sequence decoded with special tokens removed.
    """
    generated = model.generate(**inputs, forced_bos_token_id=forced_bos_token_id)
    return tokenizer.decode(generated[0], skip_special_tokens=True)


for example in dataset:
    translation = example["translation"]

    src = translation.get("fra_Latn")
    tgt = translation.get("ary_Arab")

    # Skip rows where either side is missing *or empty*: an empty reference
    # cannot be scored meaningfully and an empty source has nothing to translate.
    if not src or not tgt:
        continue

    # Tokenize once and feed the same tensors to both models
    inputs = tokenizer(src, return_tensors="pt", padding=True, truncation=True)

    preds_base.append(generate_translation(model_base, inputs))
    preds_lora.append(generate_translation(model_lora, inputs))
    # One single-element reference list per prediction
    refs.append([tgt])

# Fail with a clear message instead of an opaque error inside the metric
if not refs:
    raise ValueError(
        "No usable fra_Latn/ary_Arab pairs found in the selected rows; cannot compute BLEU."
    )

# BLEU scores
score_base = bleu.compute(predictions=preds_base, references=refs)
score_lora = bleu.compute(predictions=preds_lora, references=refs)

print(f"📊 BLEU modèle original : {score_base['score']:.2f}")
print(f"📊 BLEU modèle LoRA     : {score_lora['score']:.2f}")


📊 BLEU modèle original : 1.37
📊 BLEU modèle LoRA     : 5.94


In [5]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import random

# Shared tokenizer, configured for French input and a forced Darija start token
tokenizer = AutoTokenizer.from_pretrained("facebook/nllb-200-distilled-600M")
tokenizer.src_lang = "fra_Latn"
forced_bos_token_id = tokenizer.convert_tokens_to_ids("ary_Arab")

# The two checkpoints being compared
model_base = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-600M")
model_lora = AutoModelForSeq2SeqLM.from_pretrained("nllb-darija-lora-model")

# Full dataset, shuffled with a fixed seed; the first 10 rows are displayed
dataset = load_dataset("json", data_files="all_translations_dataset.json", split="train")
examples = dataset.shuffle(seed=42).select(range(10))

for idx, example in enumerate(examples):
    pair = example["translation"]
    src, tgt = pair.get("fra_Latn"), pair.get("ary_Arab")

    # Only rows carrying both the French source and the Darija reference are shown
    if src is not None and tgt is not None:
        inputs = tokenizer(src, return_tensors="pt", padding=True, truncation=True)

        # Original checkpoint
        out_base = model_base.generate(**inputs, forced_bos_token_id=forced_bos_token_id)
        pred_base = tokenizer.decode(out_base[0], skip_special_tokens=True)

        # LoRA fine-tuned checkpoint
        out_lora = model_lora.generate(**inputs, forced_bos_token_id=forced_bos_token_id)
        pred_lora = tokenizer.decode(out_lora[0], skip_special_tokens=True)

        # Emit the five report lines in one write
        report = [
            f"\n================ EXEMPLE {idx+1} ================",
            f"🧾 FRANÇAIS      : {src}",
            f"🎯 RÉFÉRENCE     : {tgt}",
            f"🔹 ORIGINAL      : {pred_base}",
            f"🔸 LoRA FINE-TUNE: {pred_lora}",
        ]
        print("\n".join(report))



🧾 FRANÇAIS      : Tout ce qu'il fait, c'est nous rabaisser et nous rappeler qu'il est intellectuellement supérieur
🎯 RÉFÉRENCE     : لحاجا لواحيدا لي كايدير هيا كايطيّح منّا, وي فكّرنا أنّاهو موتقّاف علينا
🔹 ORIGINAL      : كَيْدِيرْ كُلّْشِي بَاشْ يْخْلِّينَا نْتّْهَيّْنُو وْيْتّْفَكّْرُو بْلِّي هُوَ أَعْظِمْ فْالْفْهْمَة.
🔸 LoRA FINE-TUNE: لحاجا ليا كايدير هووا هووا هووا كايخايبنا أُ كايعزز علينا بلي هووّا فاقلي

🧾 FRANÇAIS      : En somme, la Renaissance a été à l'origine d'un changement majeur dans la façon d'envisager l'apprentissage et la diffusion des connaissances.
🎯 RÉFÉRENCE     : وفي جوهر الأمر، دار عصر النهضة تغيير كبير في نهج التعلم ونشر المعرفة.
🔹 ORIGINAL      : وْبْالْكْتَابْ، رَاهْ التّْوْقِيَّة دْيَالْ النّْقْدَامَة دْيَالْهَا دَارَاتْ تّْبْدِيلْ فْالْمْقَادَة دْيَالْ التَّعْلِيمْ وْالْمَعْرِفَة.
🔸 LoRA FINE-TUNE: في الختصار، أدى عصر النهضة إلى تغيير كبير في طريقة التعلم ونشر المعرفة.

🧾 FRANÇAIS      : Sais-tu te diriger en fonction de la position du soleil ?
🎯 RÉFÉ