## Fine-tune a BERT-based encoder-decoder for English -> Arabic translation

In [2]:
%%capture
!pip install evaluate sacrebleu

In [3]:
import re
import os
import argparse
from typing import List
import inspect

import pandas as pd
import numpy as np
import torch

# HF libs
from datasets import Dataset
import evaluate
import transformers
from transformers import (
    AutoTokenizer,
    EncoderDecoderModel,
    DataCollatorForSeq2Seq,
    TrainingArguments,
    Trainer,
)

2025-09-11 15:36:56.622565: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1757605016.861169      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1757605016.930774      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [4]:
# -------------------------
# Small helpers / cleaning
# -------------------------
MAX_LENGTH = 64

def clean_arabic(text: str) -> str:
    if not isinstance(text, str):
        return ""
    text = re.sub(r"[^\u0600-\u06FFa-zA-Z0-9\s]", "", text)
    arabic_diacritics = re.compile(r"[\u0617-\u061A\u064B-\u0652\u0670\u0653-\u0655\u06D6-\u06ED]")
    text = re.sub(arabic_diacritics, "", text)
    replacements = {
        "آ": "ا", "أ": "ا", "إ": "ا", "ٱ": "ا",
        "ة": "ه", "ى": "ي",
        "ؤ": "و", "ئ": "ي", "ء": "",
        "ـ": "",
        "٠":"0","١":"1","٢":"2","٣":"3","٤":"4","٥":"5","٦":"6","٧":"7","٨":"8","٩":"9"
    }
    for k, v in replacements.items():
        text = text.replace(k, v)
    text = re.sub(r"\s+", " ", text).strip()
    return text

def normalize_sentence_series(series: pd.Series) -> pd.Series:
    s = series.astype(str).str.strip()
    s = s.str.replace(r'[^A-Za-z\u0600-\u06FF0-9\s]+', '', regex=True)
    s = s.str.normalize("NFC")
    s = s.str.replace(r"\s+", " ", regex=True).str.strip()
    return s

def read_file(loc: str, lang1="eng", lang2="ara") -> pd.DataFrame:
    df = pd.read_csv(loc, delimiter='\t', header=None, quoting=3, engine="python")
    df = df.iloc[:, :2]
    df.columns = [lang1, lang2]
    df[lang2] = df[lang2].apply(clean_arabic)
    df[lang1] = df[lang1].astype(str).str.replace(r"[^\w\s]", "", regex=True).str.strip()
    df[lang1] = normalize_sentence_series(df[lang1])
    df[lang2] = normalize_sentence_series(df[lang2])
    df = df.dropna(subset=[lang1, lang2]).drop_duplicates(subset=[lang1, lang2]).reset_index(drop=True)
    return df

def filter_by_max_len(df: pd.DataFrame, max_words: int=MAX_LENGTH) -> pd.DataFrame:
    mask = (df['eng'].str.split().str.len() <= max_words) & (df['ara'].str.split().str.len() <= max_words)
    return df[mask].reset_index(drop=True)

In [5]:
# -------------------------
# Tokenization & dataset
# -------------------------
def build_hf_dataset(df: pd.DataFrame, tokenizer, src_col="eng", tgt_col="ara", max_source_length=MAX_LENGTH, max_target_length=MAX_LENGTH):
    def preprocess_fn(examples):
        inputs = examples[src_col]
        targets = examples[tgt_col]
        model_inputs = tokenizer(inputs, max_length=max_source_length, truncation=True, padding="max_length")
        # as_target_tokenizer context (works with modern tokenizers)
        try:
            with tokenizer.as_target_tokenizer():
                labels = tokenizer(targets, max_length=max_target_length, truncation=True, padding="max_length")
        except Exception:
            labels = tokenizer(targets, max_length=max_target_length, truncation=True, padding="max_length")
        model_inputs["labels"] = labels["input_ids"]
        return model_inputs

    ds = Dataset.from_pandas(df[[src_col, tgt_col]])
    tokenized = ds.map(preprocess_fn, batched=True, remove_columns=[src_col, tgt_col])
    return tokenized

In [6]:
# -------------------------
# Metrics
# -------------------------
sacrebleu = evaluate.load("sacrebleu")

def postprocess_text(preds: List[str], labels: List[str]):
    preds = [p.strip() for p in preds]
    labels = [[l.strip()] for l in labels]
    return preds, labels

Downloading builder script: 0.00B [00:00, ?B/s]

In [7]:
class Args:
    output_dir = "./outputs"
    num_train_epochs = 3
    per_device_train_batch_size = 8
    per_device_eval_batch_size = 8
    learning_rate = 5e-5
    save_steps = 500
    eval_steps = 500
    max_source_length = 64
    max_target_length = 64

args = Args()

In [8]:
# -------------------------
# Main train function
# -------------------------
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--data_path", type=str, default="/kaggle/input/eng-ara/eng-ara.txt")
    parser.add_argument("--output_dir", type=str, default="./bert-encdec-eng-ara")
    parser.add_argument("--num_train_epochs", type=int, default=12)
    parser.add_argument("--per_device_train_batch_size", type=int, default=8)
    parser.add_argument("--per_device_eval_batch_size", type=int, default=8)
    parser.add_argument("--learning_rate", type=float, default=5e-5)
    parser.add_argument("--max_source_length", type=int, default=64)
    parser.add_argument("--max_target_length", type=int, default=64)
    args = parser.parse_args([])

    print("transformers version:", transformers.__version__)
    df = read_file(args.data_path, "eng", "ara")
    df = filter_by_max_len(df, max_words=MAX_LENGTH)
    print(f"Total pairs after cleaning: {len(df)}")
    if len(df) == 0:
        raise ValueError("No pairs after cleaning. Check dataset path/format.")

    # split
    df = df.sample(frac=1, random_state=42).reset_index(drop=True)
    train_df = df.iloc[: int(len(df) * 0.9)].reset_index(drop=True)
    val_df = df.iloc[int(len(df) * 0.9):].reset_index(drop=True)
    print("Train / Val sizes:", len(train_df), len(val_df))

    model_name = "bert-base-multilingual-cased"
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
    model = EncoderDecoderModel.from_encoder_decoder_pretrained(model_name, model_name)

    # config tokens
    model.config.decoder_start_token_id = tokenizer.cls_token_id or tokenizer.bos_token_id or tokenizer.cls_token_id
    model.config.eos_token_id = tokenizer.sep_token_id or tokenizer.eos_token_id
    model.config.pad_token_id = tokenizer.pad_token_id
    model.config.max_length = args.max_target_length
    model.config.vocab_size = model.config.encoder.vocab_size

    tokenized_train = build_hf_dataset(train_df, tokenizer, "eng", "ara", args.max_source_length, args.max_target_length)
    tokenized_val = build_hf_dataset(val_df, tokenizer, "eng", "ara", args.max_source_length, args.max_target_length)

    data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, padding="longest", label_pad_token_id=-100)

    # compute_metrics
    def compute_metrics(eval_preds):
        preds, labels = eval_preds
        if isinstance(preds, tuple):
            preds = preds[0]
        decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
        labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
        decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
        decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)
        result = sacrebleu.compute(predictions=decoded_preds, references=decoded_labels)
        return {"bleu": result["score"]}

    # -------------------------
    # Compatibility: detect transformers version
    # -------------------------
    tver = tuple(int(x) for x in transformers.__version__.split(".")[:2])
    supported_args = inspect.signature(TrainingArguments).parameters

    # Base kwargs for TrainingArguments (safe)
    train_args_kwargs = {
        "output_dir": args.output_dir,
        "num_train_epochs": args.num_train_epochs,
        "per_device_train_batch_size": args.per_device_train_batch_size,
        "per_device_eval_batch_size": args.per_device_eval_batch_size,
        "learning_rate": args.learning_rate,
        "save_total_limit": 3,
        "fp16": torch.cuda.is_available(),
        # make training logging visible on Kaggle / notebooks
        "logging_steps": 100 if "logging_steps" in supported_args else None,
    }
    # remove None values
    train_args_kwargs = {k: v for k, v in train_args_kwargs.items() if v is not None}

    # Add modern args if supported
    if "evaluation_strategy" in supported_args:
        train_args_kwargs["evaluation_strategy"] = "steps"
        train_args_kwargs["eval_steps"] = getattr(args, "eval_steps", 500)
    if "save_steps" in supported_args:
        train_args_kwargs["save_steps"] = getattr(args, "save_steps", 500)
    if "remove_unused_columns" in supported_args:
        train_args_kwargs["remove_unused_columns"] = False
    # disable reporting to wandb/other remote loggers so Kaggle output is immediate
    if "report_to" in supported_args:
        train_args_kwargs["report_to"] = "none"

    # Build TrainingArguments (this will automatically use the appropriate signature)
    training_args = TrainingArguments(**train_args_kwargs)

    # Build trainer kwargs:
    trainer_kwargs = dict(
        model=model,
        args=training_args,
        tokenizer=tokenizer,
        data_collator=data_collator,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_val,
        compute_metrics=compute_metrics,
    )

    # Try to pass predict_with_generate to Trainer only if accepted
    try:
        trainer = Trainer(**trainer_kwargs, predict_with_generate=True)
    except TypeError:
        trainer = Trainer(**trainer_kwargs)

    # Make logs visible and confirm device
    import transformers as _tf
    _tf.logging.set_verbosity_info()
    print("Starting training — device:", torch.device("cuda" if torch.cuda.is_available() else "cpu"))
    print("TrainingArguments:", training_args)

    # Start training (you can resume from checkpoint by passing resume_from_checkpoint="path")
    train_result = trainer.train()
    print("Training finished. Saving model ...")
    trainer.save_model(args.output_dir)
    tokenizer.save_pretrained(args.output_dir)


    # Quick generation check
    sample_texts = val_df['eng'].tolist()[:8]
    inputs = tokenizer(sample_texts, return_tensors="pt", padding=True, truncation=True, max_length=args.max_source_length).to(model.device)
    # Use model.generate (works with EncoderDecoderModel)
    generated_ids = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=args.max_target_length,
        num_beams=4,
        early_stopping=True,
    )
    preds = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
    for s, p, t in zip(sample_texts, preds, val_df['ara'].tolist()[:len(preds)]):
        print("SRC:", s)
        print("PRED:", p)
        print("TGT :", t)
        print("---")


In [9]:
if __name__ == "__main__":
    main()

transformers version: 4.52.4
Total pairs after cleaning: 12468
Train / Val sizes: 11221 1247


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of BertLMHeadModel were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['bert.encoder.layer.0.crossattention.output.LayerNorm.bias', 'bert.encoder.layer.0.crossattention.output.LayerNorm.weight', 'bert.encoder.layer.0.crossattention.output.dense.bias', 'bert.encoder.layer.0.crossattention.output.dense.weight', 'bert.encoder.layer.0.crossattention.self.key.bias', 'bert.encoder.layer.0.crossattention.self.key.weight', 'bert.encoder.layer.0.crossattention.self.query.bias', 'bert.encoder.layer.0.crossattention.self.query.weight', 'bert.encoder.layer.0.crossattention.self.value.bias', 'bert.encoder.layer.0.crossattention.self.value.weight', 'bert.encoder.layer.1.crossattention.output.LayerNorm.bias', 'bert.encoder.layer.1.crossattention.output.LayerNorm.weight', 'bert.encoder.layer.1.crossattention.output.dense.bias', 'bert.encoder.layer.1.crossattention.output.dense.weight', 'bert.encoder.layer.1.crossattention.self.key.bia

Map:   0%|          | 0/11221 [00:00<?, ? examples/s]



Map:   0%|          | 0/1247 [00:00<?, ? examples/s]

  trainer = Trainer(**trainer_kwargs, predict_with_generate=True)
  trainer = Trainer(**trainer_kwargs)
***** Running training *****
  Num examples = 11,221
  Num Epochs = 12
  Instantaneous batch size per device = 8
  Training with DataParallel so batch size has been adjusted to: 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 8,424
  Number of trainable parameters = 384,194,811


Starting training — device: cuda
TrainingArguments: TrainingArguments(
_n_gpu=2,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
average_tokens_across_devices=False,
batch_eval_metrics=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=False,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_do_concat_batches=True,
eval_on_start=False,
eval_steps=None,
eval_strategy

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


Step,Training Loss
100,1.4152
200,0.7249
300,0.6833
400,0.6412
500,0.6166
600,0.6218
700,0.582
800,0.5558
900,0.5339
1000,0.5128


Saving model checkpoint to ./bert-encdec-eng-ara/checkpoint-500
Configuration saved in ./bert-encdec-eng-ara/checkpoint-500/config.json
Configuration saved in ./bert-encdec-eng-ara/checkpoint-500/generation_config.json
Model weights saved in ./bert-encdec-eng-ara/checkpoint-500/model.safetensors
tokenizer config file saved in ./bert-encdec-eng-ara/checkpoint-500/tokenizer_config.json
Special tokens file saved in ./bert-encdec-eng-ara/checkpoint-500/special_tokens_map.json
Saving model checkpoint to ./bert-encdec-eng-ara/checkpoint-1000
Configuration saved in ./bert-encdec-eng-ara/checkpoint-1000/config.json
Configuration saved in ./bert-encdec-eng-ara/checkpoint-1000/generation_config.json
Model weights saved in ./bert-encdec-eng-ara/checkpoint-1000/model.safetensors
tokenizer config file saved in ./bert-encdec-eng-ara/checkpoint-1000/tokenizer_config.json
Special tokens file saved in ./bert-encdec-eng-ara/checkpoint-1000/special_tokens_map.json
Saving model checkpoint to ./bert-encdec

Training finished. Saving model ...


Model weights saved in ./bert-encdec-eng-ara/model.safetensors
tokenizer config file saved in ./bert-encdec-eng-ara/tokenizer_config.json
Special tokens file saved in ./bert-encdec-eng-ara/special_tokens_map.json
tokenizer config file saved in ./bert-encdec-eng-ara/tokenizer_config.json
Special tokens file saved in ./bert-encdec-eng-ara/special_tokens_map.json


ValueError: `decoder_start_token_id` or `bos_token_id` has to be defined for encoder-decoder generation.

In [10]:
import pandas as pd
import torch
from transformers import AutoTokenizer, EncoderDecoderModel

# -------------------------
# Load model + tokenizer
# -------------------------
model_name = "/kaggle/working/bert-encdec-eng-ara/checkpoint-8424" 
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = EncoderDecoderModel.from_pretrained(model_name)

# Fix missing config values
if model.config.decoder_start_token_id is None:
    model.config.decoder_start_token_id = tokenizer.cls_token_id or tokenizer.bos_token_id
if model.config.eos_token_id is None:
    model.config.eos_token_id = tokenizer.sep_token_id or tokenizer.eos_token_id
if model.config.pad_token_id is None:
    model.config.pad_token_id = tokenizer.pad_token_id

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Force set special tokens for generation
model.config.decoder_start_token_id = tokenizer.cls_token_id or 101
model.config.eos_token_id = tokenizer.sep_token_id or 102
model.config.pad_token_id = tokenizer.pad_token_id or 0

# -------------------------
# Load dataset
# -------------------------
file_path = "/kaggle/input/eng-ara/eng-ara.txt" 
df = pd.read_csv(file_path, sep="\t", header=None, usecols=[0,1], names=["eng","ara"])

# Take 20 random samples
sample_df = df.sample(20, random_state=42).reset_index(drop=True)

# -------------------------
# Prediction function
# -------------------------
def translate(sentence, max_len=64):
    inputs = tokenizer(sentence, return_tensors="pt", truncation=True, padding=True).to(device)
    # outputs = model.generate(**inputs, max_length=max_len)
    outputs = model.generate(
        **inputs,
        max_length=max_len,
        decoder_start_token_id=model.config.decoder_start_token_id,
        eos_token_id=model.config.eos_token_id,
        pad_token_id=model.config.pad_token_id
    )

    pred = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return pred.strip()

# -------------------------
# Run predictions
# -------------------------
for i, row in sample_df.iterrows():
    eng = row["eng"]
    ara = row["ara"]
    pred = translate(eng)
    print(f"\nExample {i+1}")
    print(f"English   : {eng}")
    print(f"Arabic    : {ara}")
    print(f"Predicted : {pred}")


loading file vocab.txt
loading file tokenizer.json
loading file added_tokens.json
loading file special_tokens_map.json
loading file tokenizer_config.json
loading file chat_template.jinja
loading configuration file /kaggle/working/bert-encdec-eng-ara/checkpoint-8424/config.json
Model config EncoderDecoderConfig {
  "architectures": [
    "EncoderDecoderModel"
  ],
  "decoder": {
    "_name_or_path": "bert-base-multilingual-cased",
    "add_cross_attention": true,
    "architectures": [
      "BertForMaskedLM"
    ],
    "attention_probs_dropout_prob": 0.1,
    "classifier_dropout": null,
    "directionality": "bidi",
    "hidden_act": "gelu",
    "hidden_dropout_prob": 0.1,
    "hidden_size": 768,
    "initializer_range": 0.02,
    "intermediate_size": 3072,
    "is_decoder": true,
    "layer_norm_eps": 1e-12,
    "max_position_embeddings": 512,
    "model_type": "bert",
    "num_attention_heads": 12,
    "num_hidden_layers": 12,
    "pooler_fc_size": 768,
    "pooler_num_attention_head


Example 1
English   : Do you have a friend named Tom?
Arabic    : هل لديك صديق يسمى توم ؟
Predicted : 

Example 2
English   : She called me in the afternoon.
Arabic    : اتصلت بي بعد الظهر.
Predicted : 

Example 3
English   : I had nothing else to do.
Arabic    : لم يكن لديّ أيّ شيء آخر أقوم به.
Predicted : 

Example 4
English   : I said I would make her happy.
Arabic    : قلتُ أني سأسعدها.
Predicted : 

Example 5
English   : He asked us to help him.
Arabic    : طلب منا المساعدة.
Predicted : 

Example 6
English   : All of us should go.
Arabic    : علينا جميعا أن نذهب.
Predicted : علينا جميعا ان نذهب

Example 7
English   : That was years ago.
Arabic    : حصل ذلك منذ سنوات عدة.
Predicted : 

Example 8
English   : The doctor took his pulse.
Arabic    : قاس الطبيب نبضه.
Predicted : 

Example 9
English   : This story is based on a true story.
Arabic    : هذه القصة مُستمّدة من قصّةٍ واقعية.
Predicted : 

Example 10
English   : What are you up to tomorrow afternoon?
Arabic    : ماذا عندك بعد

In [18]:
!zip -r bert_checkpoint.zip /kaggle/working/bert-encdec-eng-ara/checkpoint-8424

  adding: kaggle/working/bert-encdec-eng-ara/checkpoint-8424/ (stored 0%)
  adding: kaggle/working/bert-encdec-eng-ara/checkpoint-8424/scheduler.pt (deflated 57%)
  adding: kaggle/working/bert-encdec-eng-ara/checkpoint-8424/special_tokens_map.json (deflated 42%)
  adding: kaggle/working/bert-encdec-eng-ara/checkpoint-8424/model.safetensors

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


 (deflated 7%)
  adding: kaggle/working/bert-encdec-eng-ara/checkpoint-8424/generation_config.json (deflated 14%)
  adding: kaggle/working/bert-encdec-eng-ara/checkpoint-8424/vocab.txt (deflated 45%)
  adding: kaggle/working/bert-encdec-eng-ara/checkpoint-8424/tokenizer.json (deflated 67%)
  adding: kaggle/working/bert-encdec-eng-ara/checkpoint-8424/trainer_state.json (deflated 80%)
  adding: kaggle/working/bert-encdec-eng-ara/checkpoint-8424/rng_state.pth (deflated 25%)
  adding: kaggle/working/bert-encdec-eng-ara/checkpoint-8424/config.json (deflated 75%)
  adding: kaggle/working/bert-encdec-eng-ara/checkpoint-8424/tokenizer_config.json (deflated 75%)
  adding: kaggle/working/bert-encdec-eng-ara/checkpoint-8424/training_args.bin (deflated 52%)
  adding: kaggle/working/bert-encdec-eng-ara/checkpoint-8424/scaler.pt (deflated 60%)
  adding: kaggle/working/bert-encdec-eng-ara/checkpoint-8424/optimizer.pt (deflated 30%)


In [20]:
# rm -rf /kaggle/working/bert-encdec-eng-ara/checkpoint-7500
# rm -rf /kaggle/working/bert-encdec-eng-ara/checkpoint-8000
# !rm -rf /kaggle/working/bert-encdec-eng-ara/checkpoint-8424
# !rm -f /kaggle/working/bert_checkpoint.zip

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [21]:
!zip -r bert_encdec_eng_ara.zip ./bert-encdec-eng-ara

  adding: bert-encdec-eng-ara/ (stored 0%)
  adding: bert-encdec-eng-ara/special_tokens_map.json (deflated 42%)
  adding: bert-encdec-eng-ara/model.safetensors

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


 (deflated 7%)
  adding: bert-encdec-eng-ara/generation_config.json (deflated 14%)
  adding: bert-encdec-eng-ara/vocab.txt (deflated 45%)
  adding: bert-encdec-eng-ara/tokenizer.json (deflated 67%)
  adding: bert-encdec-eng-ara/config.json (deflated 75%)
  adding: bert-encdec-eng-ara/tokenizer_config.json (deflated 75%)
  adding: bert-encdec-eng-ara/training_args.bin (deflated 52%)


---

In [22]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [23]:
from huggingface_hub import create_repo

# replace with your username
create_repo("bert-encdec-eng-ara", private=True)  

RepoUrl('https://huggingface.co/idrisskh/bert-encdec-eng-ara', endpoint='https://huggingface.co', repo_type='model', repo_id='idrisskh/bert-encdec-eng-ara')

In [24]:
from huggingface_hub import upload_folder

upload_folder(
    repo_id="idrisskh/bert-encdec-eng-ara",  
    folder_path="./bert-encdec-eng-ara",       
    path_in_repo="."                             
)

Uploading...:   0%|          | 0.00/1.54G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/idrisskh/bert-encdec-eng-ara/commit/c2cb31b63f59217a3d2a68d6079aa5269cf119f7', commit_message='Upload folder using huggingface_hub', commit_description='', oid='c2cb31b63f59217a3d2a68d6079aa5269cf119f7', pr_url=None, repo_url=RepoUrl('https://huggingface.co/idrisskh/bert-encdec-eng-ara', endpoint='https://huggingface.co', repo_type='model', repo_id='idrisskh/bert-encdec-eng-ara'), pr_revision=None, pr_num=None)

In [25]:
from transformers import AutoTokenizer, EncoderDecoderModel

# -------------------------
# Load model + tokenizer
# -------------------------

model_name = "idrisskh/bert-encdec-eng-ara"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = EncoderDecoderModel.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/1.22k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.92M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

loading file vocab.txt from cache at /root/.cache/huggingface/hub/models--idrisskh--bert-encdec-eng-ara/snapshots/c2cb31b63f59217a3d2a68d6079aa5269cf119f7/vocab.txt
loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--idrisskh--bert-encdec-eng-ara/snapshots/c2cb31b63f59217a3d2a68d6079aa5269cf119f7/tokenizer.json
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at /root/.cache/huggingface/hub/models--idrisskh--bert-encdec-eng-ara/snapshots/c2cb31b63f59217a3d2a68d6079aa5269cf119f7/special_tokens_map.json
loading file tokenizer_config.json from cache at /root/.cache/huggingface/hub/models--idrisskh--bert-encdec-eng-ara/snapshots/c2cb31b63f59217a3d2a68d6079aa5269cf119f7/tokenizer_config.json
loading file chat_template.jinja from cache at None


config.json:   0%|          | 0.00/2.05k [00:00<?, ?B/s]

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--idrisskh--bert-encdec-eng-ara/snapshots/c2cb31b63f59217a3d2a68d6079aa5269cf119f7/config.json
Model config EncoderDecoderConfig {
  "architectures": [
    "EncoderDecoderModel"
  ],
  "decoder": {
    "_name_or_path": "bert-base-multilingual-cased",
    "add_cross_attention": true,
    "architectures": [
      "BertForMaskedLM"
    ],
    "attention_probs_dropout_prob": 0.1,
    "classifier_dropout": null,
    "directionality": "bidi",
    "hidden_act": "gelu",
    "hidden_dropout_prob": 0.1,
    "hidden_size": 768,
    "initializer_range": 0.02,
    "intermediate_size": 3072,
    "is_decoder": true,
    "layer_norm_eps": 1e-12,
    "max_position_embeddings": 512,
    "model_type": "bert",
    "num_attention_heads": 12,
    "num_hidden_layers": 12,
    "pooler_fc_size": 768,
    "pooler_num_attention_heads": 12,
    "pooler_num_fc_layers": 3,
    "pooler_size_per_head": 128,
    "pooler_type": "fir

model.safetensors:   0%|          | 0.00/1.54G [00:00<?, ?B/s]

loading weights file model.safetensors from cache at /root/.cache/huggingface/hub/models--idrisskh--bert-encdec-eng-ara/snapshots/c2cb31b63f59217a3d2a68d6079aa5269cf119f7/model.safetensors
Generate config GenerationConfig {
  "decoder_start_token_id": 101,
  "eos_token_id": 102,
  "pad_token_id": 0
}

Instantiating BertModel model under default dtype torch.float32.
Instantiating BertLMHeadModel model under default dtype torch.float32.
Generate config GenerationConfig {
  "pad_token_id": 0
}

All model checkpoint weights were used when initializing EncoderDecoderModel.

All the weights of EncoderDecoderModel were initialized from the model checkpoint at idrisskh/bert-encdec-eng-ara.
If your task is similar to the task the model of the checkpoint was trained on, you can already use EncoderDecoderModel for predictions without further training.


generation_config.json:   0%|          | 0.00/110 [00:00<?, ?B/s]

loading configuration file generation_config.json from cache at /root/.cache/huggingface/hub/models--idrisskh--bert-encdec-eng-ara/snapshots/c2cb31b63f59217a3d2a68d6079aa5269cf119f7/generation_config.json
Generate config GenerationConfig {
  "max_length": 64,
  "pad_token_id": 0
}



In [27]:
import pandas as pd
import torch

# Fix missing config values
if model.config.decoder_start_token_id is None:
    model.config.decoder_start_token_id = tokenizer.cls_token_id or tokenizer.bos_token_id
if model.config.eos_token_id is None:
    model.config.eos_token_id = tokenizer.sep_token_id or tokenizer.eos_token_id
if model.config.pad_token_id is None:
    model.config.pad_token_id = tokenizer.pad_token_id


print("CLS:", tokenizer.cls_token_id)
print("SEP:", tokenizer.sep_token_id)
print("PAD:", tokenizer.pad_token_id)
print("Decoder start:", model.config.decoder_start_token_id)


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Force set special tokens for generation
model.config.decoder_start_token_id = tokenizer.cls_token_id or 101
model.config.eos_token_id = tokenizer.sep_token_id or 102
model.config.pad_token_id = tokenizer.pad_token_id or 0

# -------------------------
# Load dataset
# -------------------------
file_path = "/kaggle/input/eng-ara/eng-ara.txt" 
df = pd.read_csv(file_path, sep="\t", header=None, usecols=[0,1], names=["eng","ara"])

# Take 20 random samples
sample_df = df.sample(20, random_state=42).reset_index(drop=True)

# -------------------------
# Prediction function
# -------------------------
def translate(sentence, max_len=64):
    inputs = tokenizer(sentence, return_tensors="pt", truncation=True, padding=True).to(device)
    
    outputs = model.generate(
        **inputs,
        max_length=max_len,
        num_beams=5,                # beam search (better than greedy)
        early_stopping=True,
        decoder_start_token_id=tokenizer.cls_token_id or model.config.decoder_start_token_id,
        eos_token_id=tokenizer.sep_token_id or model.config.eos_token_id,
        pad_token_id=tokenizer.pad_token_id or model.config.pad_token_id
    )

    pred = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return pred.strip()


# -------------------------
# Run predictions
# -------------------------
for i, row in sample_df.iterrows():
    eng = row["eng"]
    ara = row["ara"]
    pred = translate(eng)
    print(f"\nExample {i+1}")
    print(f"English   : {eng}")
    print(f"Arabic    : {ara}")
    print(f"Predicted : {pred}")

CLS: 101
SEP: 102
PAD: 0
Decoder start: 101

Example 1
English   : Do you have a friend named Tom?
Arabic    : هل لديك صديق يسمى توم ؟
Predicted : هل لديك صديق توم ؟

Example 2
English   : She called me in the afternoon.
Arabic    : اتصلت بي بعد الظهر.
Predicted : اتصلت بي بعد الظهر

Example 3
English   : I had nothing else to do.
Arabic    : لم يكن لديّ أيّ شيء آخر أقوم به.
Predicted : لم يكن لدي اي شي اخر اقوم به

Example 4
English   : I said I would make her happy.
Arabic    : قلتُ أني سأسعدها.
Predicted : 

Example 5
English   : He asked us to help him.
Arabic    : طلب منا المساعدة.
Predicted : طلب منا المساعده

Example 6
English   : All of us should go.
Arabic    : علينا جميعا أن نذهب.
Predicted : علينا جميعا ان نذهب

Example 7
English   : That was years ago.
Arabic    : حصل ذلك منذ سنوات عدة.
Predicted : كان ذلك منذ عده سنين

Example 8
English   : The doctor took his pulse.
Arabic    : قاس الطبيب نبضه.
Predicted : قاس الطبيب

Example 9
English   : This story is based on a true st