<a href="https://colab.research.google.com/github/eshandutta23/Cross-Language-Translation-Tool-NLP-Project-/blob/main/NLP_Cross_language_Translation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd

# Read the multilingual corpus CSV from its absolute /content path
df = pd.read_csv("/content/Multi_language_data.csv")

# Lift the row cap so the frame is rendered in full rather than as a truncated preview.
# This is a global pandas option, so it also applies to every later output.
pd.options.display.max_rows = None

# Bare last expression -> rich display of the whole DataFrame
df

In [None]:
!pip install transformers datasets sacrebleu sentencepiece


In [None]:
import pandas as pd
from datasets import Dataset
from transformers import MarianTokenizer, MarianMTModel, Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq

# Read the corpus, keep only the rows whose target language is Hindi, and give
# the text columns the names that the tokenisation step looks up.
df = (
    pd.read_csv("Multi_language_data.csv")
    .loc[lambda frame: frame["target_language_code"] == "hi_IN"]
    .rename(columns={"source": "translation_source", "target": "translation_target"})
)

# Wrap the filtered frame as a HuggingFace Dataset
dataset = Dataset.from_pandas(df)

# English-to-Hindi MarianMT checkpoint: tokenizer first, then the model weights
model_name = "Helsinki-NLP/opus-mt-en-hi"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

# Tokenization
def preprocess_function(examples):
    """Tokenise source/target text and attach loss-ready labels.

    Args:
        examples: Mapping with "translation_source" and "translation_target"
            entries. With ``batched=True`` each entry is a list of strings;
            a single (unbatched) example with plain strings is also accepted.

    Returns:
        The tokenizer output for the source text, with an extra "labels" entry
        holding the target token ids. Padding positions in the labels are
        replaced by -100 so they are excluded from the training loss.
    """
    model_inputs = tokenizer(
        examples["translation_source"],
        padding="max_length",
        truncation=True,
        max_length=128,
    )
    # `text_target=` is the supported replacement for the deprecated
    # `with tokenizer.as_target_tokenizer():` context manager.
    labels = tokenizer(
        text_target=examples["translation_target"],
        padding="max_length",
        truncation=True,
        max_length=128,
    )

    pad_id = tokenizer.pad_token_id

    def _mask_padding(sequence):
        # -100 is the label value the loss ignores; without this the model
        # would be trained to predict the padding that fills up to max_length.
        return [-100 if token_id == pad_id else token_id for token_id in sequence]

    label_ids = labels["input_ids"]
    if label_ids and isinstance(label_ids[0], (list, tuple)):
        # Batched call: one id sequence per example.
        model_inputs["labels"] = [_mask_padding(sequence) for sequence in label_ids]
    else:
        # Unbatched call: a single flat id sequence.
        model_inputs["labels"] = _mask_padding(label_ids)
    return model_inputs

# Tokenise in batches. Every original column is dropped, so the result holds
# only the fields returned by preprocess_function.
tokenized_dataset = dataset.map(
    preprocess_function,
    remove_columns=dataset.column_names,
    batched=True,
)


In [None]:
import torch
from transformers import Seq2SeqTrainingArguments

# Fine-tuning configuration: checkpoints are written to output_dir once per epoch.
training_args = Seq2SeqTrainingArguments(
    output_dir="./marian-en-hi-model",
    per_device_train_batch_size=8,
    num_train_epochs=3,
    logging_steps=10,
    save_strategy="epoch",
    # fp16 mixed precision needs a CUDA device; a hard-coded True makes this
    # cell fail on a CPU runtime, so enable it only when a GPU is visible.
    fp16=torch.cuda.is_available(),
)


In [None]:
import gradio as gr
from transformers import MarianMTModel, MarianTokenizer

# Supported translation directions and models.
# Each (language name, checkpoint language code) pair expands to two entries,
# "English to X" followed by "X to English", mapped to the matching
# Helsinki-NLP/opus-mt checkpoint id.
# NOTE(review): confirm that every generated checkpoint id (in particular the
# en-kn / kn-en pair) actually exists on the Hugging Face Hub.
_PARTNER_LANGUAGES = (
    ("Hindi", "hi"),
    ("German", "de"),
    ("Spanish", "es"),
    ("French", "fr"),
    ("Kannada", "kn"),
)

LANGUAGE_MODELS = {
    label: f"Helsinki-NLP/opus-mt-{src}-{dst}"
    for name, code in _PARTNER_LANGUAGES
    for label, src, dst in (
        (f"English to {name}", "en", code),
        (f"{name} to English", code, "en"),
    )
}

import re
from functools import lru_cache

# Whole-word greeting matcher. A plain substring test would treat the "hi"
# inside words such as "this" or "which" as a greeting.
_GREETING_RE = re.compile(r"\b(?:hello|hi|hey)\b", re.IGNORECASE)


@lru_cache(maxsize=4)
def _load_translation_model(model_name):
    """Load the (tokenizer, model) pair for a checkpoint, memoised per name."""
    tokenizer = MarianTokenizer.from_pretrained(model_name)
    model = MarianMTModel.from_pretrained(model_name)
    return tokenizer, model


# Translation function with forced "नमस्ते" for Hindi greeting
def translate_text(text, direction):
    """Translate ``text`` using the MarianMT checkpoint for ``direction``.

    Args:
        text: Input sentence. Empty or whitespace-only input returns "".
        direction: A key of LANGUAGE_MODELS, e.g. "English to Hindi".

    Returns:
        The decoded translation. For "English to Hindi", if the input contains
        "hello", "hi" or "hey" as a whole word and the translation does not
        already contain "नमस्ते", that greeting is prepended.

    Raises:
        KeyError: If ``direction`` is not a key of LANGUAGE_MODELS.
    """
    # Nothing to translate; avoid running generation on blank input.
    if not text or not text.strip():
        return ""

    model_name = LANGUAGE_MODELS[direction]

    # Cached: reloading the checkpoint on every request dominates the latency.
    tokenizer, model = _load_translation_model(model_name)

    inputs = tokenizer([text], return_tensors="pt", padding=True, truncation=True)
    translated = model.generate(**inputs)
    output = tokenizer.decode(translated[0], skip_special_tokens=True)

    # Force prepend "नमस्ते" if the input contains a greeting
    if direction == "English to Hindi" and _GREETING_RE.search(text):
        if "नमस्ते" not in output:
            # Strip a leading danda / comma / space so the greeting joins cleanly.
            output = "नमस्ते " + output.lstrip("।").lstrip(", ")

    return output

# Gradio UI: a free-text box plus a dropdown listing every supported direction.
# The two inputs are passed to translate_text in this order (text, direction).
gr.Interface(
    title="Cross Language Translation Tool",
    description="Translate between different languages",
    fn=translate_text,
    inputs=[
        gr.Textbox(label="Enter Text"),
        gr.Dropdown(label="Translation Direction", choices=list(LANGUAGE_MODELS)),
    ],
    outputs="text",
).launch()


In [None]:
from transformers import MarianMTModel, MarianTokenizer, MBartForConditionalGeneration, MBart50TokenizerFast


In [None]:
# Checkpoint ids for the two systems being compared
marian_model_name = "Helsinki-NLP/opus-mt-en-hi"
mbart_model_name = "facebook/mbart-large-50-many-to-many-mmt"

# MarianMT (English to Hindi): single-direction checkpoint
marian_tokenizer = MarianTokenizer.from_pretrained(marian_model_name)
marian_model = MarianMTModel.from_pretrained(marian_model_name)

# mBART (multilingual: language codes must be specified at translation time)
mbart_tokenizer = MBart50TokenizerFast.from_pretrained(mbart_model_name)
mbart_model = MBartForConditionalGeneration.from_pretrained(mbart_model_name)


In [None]:
def marian_translate(text):
    """Translate one string with the module-level MarianMT tokenizer and model.

    The text is tokenised as a one-item batch, passed through
    ``marian_model.generate``, and the first generated sequence is decoded
    with special tokens removed.
    """
    batch = marian_tokenizer(
        [text],
        return_tensors="pt",
        padding=True,
        truncation=True,
    )
    output_ids = marian_model.generate(**batch)
    first_sequence = output_ids[0]
    return marian_tokenizer.decode(first_sequence, skip_special_tokens=True)


In [None]:
def mbart_translate(text, src_lang="en_XX", tgt_lang="hi_IN"):
    """Translate ``text`` with the module-level mBART tokenizer and model.

    Args:
        text: Input string to translate.
        src_lang: Language code assigned to ``mbart_tokenizer.src_lang`` before
            tokenising. Defaults to "en_XX", the value previously hard-coded.
        tgt_lang: Language code looked up in ``mbart_tokenizer.lang_code_to_id``
            and forced as the first generated token. Defaults to "hi_IN", the
            value previously hard-coded.

    Returns:
        The first generated sequence, decoded with special tokens removed.

    Raises:
        KeyError: If ``tgt_lang`` is not present in ``lang_code_to_id``.
    """
    # Note: this mutates the shared tokenizer's source-language setting.
    mbart_tokenizer.src_lang = src_lang
    encoded = mbart_tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    generated_tokens = mbart_model.generate(
        **encoded,
        # Forcing the first token to the target language code selects the output language.
        forced_bos_token_id=mbart_tokenizer.lang_code_to_id[tgt_lang]
    )
    return mbart_tokenizer.decode(generated_tokens[0], skip_special_tokens=True)


In [None]:
# Qualitative model evaluation: compare MarianMT and mBART outputs side by side
import gradio as gr

def compare_models(text):
    """Return the (MarianMT, mBART) translations of ``text`` as a 2-tuple.

    MarianMT is run first, then mBART; the tuple order matches the order of
    the two output boxes in the comparison UI.
    """
    return marian_translate(text), mbart_translate(text)

# Side-by-side UI: one English input box, one output box per model.
# The output boxes are listed in the order compare_models returns its results.
gr.Interface(
    title="MarianMT vs mBART: English to Hindi Translator",
    description="Compare translations between MarianMT and mBART for the same input.",
    fn=compare_models,
    inputs=gr.Textbox(label="Enter text in English"),
    outputs=[
        gr.Textbox(label="MarianMT Output"),
        gr.Textbox(label="mBART Output"),
    ],
).launch()


In [None]:
# `datasets.load_metric` has been removed from current `datasets` releases, so
# call sacrebleu (installed above) directly instead.
import sacrebleu

# Toy sanity check of the BLEU computation on one hard-coded English pair.
predictions = ["He goes to school."]
references = [["He is going to school."]]

# `references` holds one list of references per prediction; sacrebleu expects
# the transposed layout (one stream per reference index, aligned with predictions).
reference_streams = [list(stream) for stream in zip(*references)]

bleu_result = sacrebleu.corpus_bleu(predictions, reference_streams)
bleu_score = {"score": bleu_result.score}
print("BLEU Score:", bleu_score["score"])
# The previous message claimed the model was accurate and well fine-tuned, but
# no model output is scored in this cell, so state what the number really is.
print("Note: this BLEU value comes from a single hard-coded sentence pair and does not measure the translation models above.")

