In [1]:
pip install transformers datasets sentencepiece torch scikit-learn



In [4]:
!pip install --upgrade transformers
# cross_language_translation_train.py

# Step 1: Import libraries
import pandas as pd
import torch
from transformers import MarianMTModel, MarianTokenizer, MarianConfig, Trainer, TrainingArguments
from datasets import load_dataset, Dataset
from sklearn.model_selection import train_test_split

# Step 2: Load a multilingual translation dataset (e.g., Tatoeba)
# Corrected dataset loading line
dataset = load_dataset("opus_books", "en-fr", split='train[:1%]')  # ✅ Correct config

# Step 3: Prepare data
data_list = dataset["translation"]  # get the 'translation' field which is a dict
en_sentences = [item["en"] for item in data_list if item["en"] and item["fr"]]
fr_sentences = [item["fr"] for item in data_list if item["en"] and item["fr"]]

# Build DataFrame
df = pd.DataFrame({"en": en_sentences, "fr": fr_sentences})

# Step 4: Preprocess - Clean and filter
df["en"] = df["en"].astype(str)
df["fr"] = df["fr"].astype(str)
df = df[df["en"].str.strip() != '']
df = df[df["fr"].str.strip() != '']



# Train-test split
train_texts, val_texts = train_test_split(df, test_size=0.1)

# Step 5: Load MarianMT model and tokenizer
model_name = "Helsinki-NLP/opus-mt-en-fr"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

# Step 6: Tokenization Function
def preprocess_function(examples):
    inputs = tokenizer(examples["en"], padding="max_length", truncation=True, max_length=128)
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["fr"], padding="max_length", truncation=True, max_length=128)
    inputs["labels"] = labels["input_ids"]
    return inputs

# Step 7: Convert train and val sets to Hugging Face format
train_ds = Dataset.from_pandas(train_texts[['en', 'fr']])
val_ds = Dataset.from_pandas(val_texts[['en', 'fr']])

train_tokenized = train_ds.map(preprocess_function, batched=True)
val_tokenized = val_ds.map(preprocess_function, batched=True)

# Step 7.5: Disable W&B (prevents API key errors)
import os
os.environ["WANDB_DISABLED"] = "true"


# Step 8: Define Training Arguments
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=2,
    weight_decay=0.01,
    logging_dir="./logs",
    save_strategy="no"
)

# Step 9: Trainer Setup
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized,
    eval_dataset=val_tokenized
)

# Step 10: Train the Model
trainer.train()

# Step 11: Save Fine-Tuned Model
model.save_pretrained("./fine_tuned_translation_model")
tokenizer.save_pretrained("./fine_tuned_translation_model")

print("✅ Model fine-tuned and saved successfully.")






Map:   0%|          | 0/1143 [00:00<?, ? examples/s]



Map:   0%|          | 0/128 [00:00<?, ? examples/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Step,Training Loss




✅ Model fine-tuned and saved successfully.


In [8]:
from transformers import MarianMTModel, MarianTokenizer

model_path = "./fine_tuned_translation_model"
tokenizer = MarianTokenizer.from_pretrained(model_path)
model = MarianMTModel.from_pretrained(model_path)

# Input
text = "Babu sir is best than RL Sir"
input_tokens = tokenizer(text, return_tensors="pt", padding=True, truncation=True)

# Translation
translated_tokens = model.generate(**input_tokens)
translated_text = tokenizer.decode(translated_tokens[0], skip_special_tokens=True)

print("🔁 Translated:", translated_text)


🔁 Translated: Babu monsieur est mieux que RL Sir
