In [None]:
import pandas as pd

# Lift pandas' row limit first so the frame below is rendered without truncation
pd.set_option("display.max_rows", None)

# Read the multilingual parallel corpus from /content
df = pd.read_csv("/content/Multi_language_data.csv")

# Bare trailing expression: the notebook renders the complete DataFrame
df

Unnamed: 0,source,target,target_language_code
0,Wash your hands.,अपने हाथ धोएं。,hi
1,Wash your hands.,Lavez-vous les mains.,fr
2,Wash your hands.,Waschen Sie Ihre Hände.,de
3,Wash your hands.,Lávate las manos.,es
4,Drink clean water.,साफ पानी पिएं。,hi
5,Drink clean water.,Buvez de l'eau propre.,fr
6,Drink clean water.,Trinken Sie sauberes Wasser.,de
7,Drink clean water.,Bebe agua limpia.,es
8,Use compost for better yield.,बेहतर उपज के लिए कंपोस्ट का उपयोग करें。,hi
9,Use compost for better yield.,Utilisez du compost pour un meilleur rendement.,fr


In [None]:
!pip install transformers datasets sacrebleu sentencepiece


Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
Collecting portalocker (from sacrebleu)
  Downloading portalocker-3.2.0-py3-none-any.whl.metadata (8.7 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Downloading sacrebleu-2.5.1-py3-none-any.whl (104 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Downloading portalocker-3.2.0-py3-none-any.whl (22 kB)
Installing collected packages: portalocker, colorama, sacrebleu
Successfully installed colorama-0.4.6 portalocker-3.2.0 sacrebleu-2.5.1


In [None]:
import pandas as pd
from datasets import Dataset
from transformers import MarianTokenizer, MarianMTModel, Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq

# Load dataset
df = pd.read_csv("Multi_language_data.csv")

# Filter for one language pair: English to Hindi.
# The CSV labels Hindi rows with the code "hi"; "hi_IN" is the mBART language
# code and matches no row in this file, which would leave an empty training set.
# The index is reset so the filtered frame's old row labels are not carried
# into the Dataset as an extra column.
df = df[df["target_language_code"] == "hi"].reset_index(drop=True)
if df.empty:
    # Fail here rather than silently tokenizing/training on zero examples.
    raise ValueError("No rows with target_language_code == 'hi' found in Multi_language_data.csv")

# Rename columns for HuggingFace dataset
df = df.rename(columns={"source": "translation_source", "target": "translation_target"})
dataset = Dataset.from_pandas(df)

# Load MarianMT model and tokenizer
model_name = "Helsinki-NLP/opus-mt-en-hi"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

# Tokenization
def preprocess_function(examples):
    """Tokenize a batch of source/target sentence pairs for seq2seq training.

    Args:
        examples: Batch mapping with "translation_source" and
            "translation_target" lists of strings.

    Returns:
        The tokenizer output for the source sentences, with a "labels" entry
        holding the target token ids in which padding positions are -100.
    """
    # `text_target=` tokenizes the targets in the same call; it replaces the
    # deprecated `as_target_tokenizer()` context manager.
    model_inputs = tokenizer(
        examples["translation_source"],
        text_target=examples["translation_target"],
        padding="max_length",
        truncation=True,
        max_length=128,
    )

    # Targets are padded to max_length, so replace pad ids with -100 (the index
    # the loss ignores); otherwise padding positions would count towards the loss.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token_id if token_id != pad_id else -100 for token_id in label_ids]
        for label_ids in model_inputs["labels"]
    ]
    return model_inputs

# Tokenize in batches and drop every original column, keeping only what
# preprocess_function returns.
tokenized_dataset = dataset.map(
    preprocess_function,
    remove_columns=dataset.column_names,
    batched=True,
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/812k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/1.07M [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]



pytorch_model.bin:   0%|          | 0.00/306M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/306M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

In [None]:
from transformers import Seq2SeqTrainingArguments

import torch  # needed only to probe for a CUDA device below

# Fine-tuning configuration; save_strategy="epoch" checkpoints once per epoch under output_dir.
training_args = Seq2SeqTrainingArguments(
    output_dir="./marian-en-hi-model",
    per_device_train_batch_size=8,
    num_train_epochs=3,
    logging_steps=10,
    save_strategy="epoch",
    # Mixed precision needs a CUDA GPU; fall back to full precision on a CPU
    # runtime instead of failing when the arguments are validated.
    fp16=torch.cuda.is_available(),
)


In [None]:
import gradio as gr
from transformers import MarianMTModel, MarianTokenizer

# Supported translation directions: each UI label maps to the MarianMT
# checkpoint name that translate_text loads for it. Pairs are grouped per language.
LANGUAGE_MODELS = dict(
    (
        # Hindi
        ("English to Hindi", "Helsinki-NLP/opus-mt-en-hi"),
        ("Hindi to English", "Helsinki-NLP/opus-mt-hi-en"),
        # German
        ("English to German", "Helsinki-NLP/opus-mt-en-de"),
        ("German to English", "Helsinki-NLP/opus-mt-de-en"),
        # Spanish
        ("English to Spanish", "Helsinki-NLP/opus-mt-en-es"),
        ("Spanish to English", "Helsinki-NLP/opus-mt-es-en"),
        # French
        ("English to French", "Helsinki-NLP/opus-mt-en-fr"),
        ("French to English", "Helsinki-NLP/opus-mt-fr-en"),
        # Kannada
        ("English to Kannada", "Helsinki-NLP/opus-mt-en-kn"),
        ("Kannada to English", "Helsinki-NLP/opus-mt-kn-en"),
    )
)

# Loaded (tokenizer, model) pairs keyed by checkpoint name, so each checkpoint
# is loaded once per kernel session instead of on every request.
_MODEL_CACHE = {}


def _load_translation_model(model_name):
    """Return the cached (tokenizer, model) pair for `model_name`, loading it on first use."""
    if model_name not in _MODEL_CACHE:
        _MODEL_CACHE[model_name] = (
            MarianTokenizer.from_pretrained(model_name),
            MarianMTModel.from_pretrained(model_name),
        )
    return _MODEL_CACHE[model_name]


# Translation function with forced "नमस्ते" for Hindi greeting
def translate_text(text, direction):
    """Translate `text` using the MarianMT checkpoint registered for `direction`.

    Args:
        text: Text to translate.
        direction: A key of LANGUAGE_MODELS, e.g. "English to Hindi".

    Returns:
        The decoded translation, or "" when `text` is empty or whitespace only.
        For "English to Hindi", "नमस्ते " is prepended when the input contains
        a greeting word and the translation does not already contain "नमस्ते".

    Raises:
        KeyError: If `direction` is not a key of LANGUAGE_MODELS.
    """
    import re  # function-scope import keeps this cell self-contained

    model_name = LANGUAGE_MODELS[direction]

    # Nothing to translate: skip loading/running the model on blank input.
    if not text or not text.strip():
        return ""

    tokenizer, model = _load_translation_model(model_name)

    inputs = tokenizer([text], return_tensors="pt", padding=True, truncation=True)
    translated = model.generate(**inputs)
    output = tokenizer.decode(translated[0], skip_special_tokens=True)

    # Force prepend "नमस्ते" if the input contains a greeting
    if direction == "English to Hindi":
        greetings = {"hello", "hi", "hey"}
        # Compare whole words: a substring test would treat "this" or "which"
        # as containing the greeting "hi".
        words = re.findall(r"[a-z]+", text.lower())
        if any(word in greetings for word in words):
            if "नमस्ते" not in output:
                output = "नमस्ते " + output.lstrip("।").lstrip(", ")

    return output

# Gradio UI: a free-text box plus a dropdown listing every configured direction
text_input = gr.Textbox(label="Enter Text")
direction_input = gr.Dropdown(
    choices=list(LANGUAGE_MODELS.keys()),
    label="Translation Direction",
)

translator_app = gr.Interface(
    fn=translate_text,
    inputs=[text_input, direction_input],
    outputs="text",
    title="Cross Language Translation Tool",
    description="Translate between different languages",
)
translator_app.launch()


It looks like you are running Gradio on a hosted a Jupyter notebook. For the Gradio app to work, sharing must be enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://f2dc5d464e475bd25e.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [None]:
from transformers import MarianMTModel, MarianTokenizer, MBartForConditionalGeneration, MBart50TokenizerFast


In [None]:
# Checkpoint names for the two systems being compared
marian_model_name = "Helsinki-NLP/opus-mt-en-hi"  # MarianMT, English to Hindi only
mbart_model_name = "facebook/mbart-large-50-many-to-many-mmt"  # mBART, multilingual: language codes must be specified

# MarianMT tokenizer and model
marian_tokenizer = MarianTokenizer.from_pretrained(marian_model_name)
marian_model = MarianMTModel.from_pretrained(marian_model_name)

# mBART tokenizer and model
mbart_tokenizer = MBart50TokenizerFast.from_pretrained(mbart_model_name)
mbart_model = MBartForConditionalGeneration.from_pretrained(mbart_model_name)




tokenizer_config.json:   0%|          | 0.00/529 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/649 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/261 [00:00<?, ?B/s]

In [None]:
def marian_translate(text):
    """Translate `text` with the module-level MarianMT tokenizer/model pair.

    The text is encoded as a one-item batch and the first generated sequence
    is decoded with special tokens removed.
    """
    batch = marian_tokenizer(
        [text],
        return_tensors="pt",
        padding=True,
        truncation=True,
    )
    generated = marian_model.generate(**batch)
    first_sequence = generated[0]
    return marian_tokenizer.decode(first_sequence, skip_special_tokens=True)


In [None]:
def mbart_translate(text, src_lang="en_XX", tgt_lang="hi_IN"):
    """Translate `text` with the module-level mBART tokenizer/model pair.

    Args:
        text: Text to translate.
        src_lang: mBART language code of `text`; defaults to "en_XX".
        tgt_lang: mBART language code to translate into; defaults to "hi_IN".

    Returns:
        The first generated sequence, decoded with special tokens removed.

    Raises:
        KeyError: If `tgt_lang` is missing from the tokenizer's
            `lang_code_to_id` mapping.
    """
    # The source language is set on the shared tokenizer before encoding.
    mbart_tokenizer.src_lang = src_lang
    encoded = mbart_tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    generated_tokens = mbart_model.generate(
        **encoded,
        # Forcing the first generated token to the target language code is how
        # the many-to-many checkpoint is told which language to produce.
        forced_bos_token_id=mbart_tokenizer.lang_code_to_id[tgt_lang],
    )
    return mbart_tokenizer.decode(generated_tokens[0], skip_special_tokens=True)


In [None]:
import gradio as gr

def compare_models(text):
    """Translate `text` with both systems and return (MarianMT output, mBART output)."""
    # Tuple elements are evaluated left to right, so MarianMT still runs first.
    return marian_translate(text), mbart_translate(text)

# Side-by-side UI: one English input, one output box per model
english_input = gr.Textbox(label="Enter text in English")
model_outputs = [
    gr.Textbox(label="MarianMT Output"),
    gr.Textbox(label="mBART Output"),
]

comparison_app = gr.Interface(
    fn=compare_models,
    inputs=english_input,
    outputs=model_outputs,
    title="MarianMT vs mBART: English to Hindi Translator",
    description="Compare translations between MarianMT and mBART for the same input.",
)
comparison_app.launch()


It looks like you are running Gradio on a hosted a Jupyter notebook. For the Gradio app to work, sharing must be enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://5d8ee638f19dffca3d.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


