In [4]:
import tensorflow as tf

from transformers import AutoTokenizer
from transformers import TFAutoModelForSeq2SeqLM # Its purpose is to automatically load a pre-trained 
                                                 # sequence-to-sequence language model that is 
                                                 # compatible with TensorFlow.


print(f"TensorFlow version: {tf.__version__}") # Print the TensorFlow version for verification.

  from .autonotebook import tqdm as notebook_tqdm


TensorFlow version: 2.19.0


In [5]:
#MODEL_NAME = "Helsinki-NLP/opus-mt-en-fr"
MODEL_NAME = "Helsinki-NLP/opus-mt-en-hi"
# Hugging Face Hub checkpoint ID of the translation model used by this notebook.
# The active value is the Helsinki-NLP English ('en') to Hindi ('hi') model; the
# commented-out line above is the English-to-French ('fr') variant. These are
# MarianMT models, which are efficient and widely used for various language pairs.

In [None]:
#!pip install sentencepiece

Collecting sentencepiece
  Downloading sentencepiece-0.2.0-cp312-cp312-win_amd64.whl.metadata (8.3 kB)
Downloading sentencepiece-0.2.0-cp312-cp312-win_amd64.whl (991 kB)
   ---------------------------------------- 0.0/992.0 kB ? eta -:--:--
   ---------------------------------------- 0.0/992.0 kB ? eta -:--:--
   ---------------------------------------- 0.0/992.0 kB ? eta -:--:--
   ---------------------------------------- 0.0/992.0 kB ? eta -:--:--
   ---------- ----------------------------- 262.1/992.0 kB ? eta -:--:--
   ------------------------------- -------- 786.4/992.0 kB 1.5 MB/s eta 0:00:01
   ---------------------------------------- 992.0/992.0 kB 1.8 MB/s eta 0:00:00
Installing collected packages: sentencepiece
Successfully installed sentencepiece-0.2.0


In [6]:
# NOTE(review): `sentencepiece` is not referenced by name anywhere below; it is
# presumably imported here only to fail fast if the package is missing — confirm.
import sentencepiece
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Load the tokenizer associated with the pre-trained model named by MODEL_NAME.
# The tokenizer converts text into numerical token IDs for the model, and
# decodes generated token IDs back into text.



In [7]:
model = TFAutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)

# Load the pre-trained sequence-to-sequence model used for translation.
# TFAutoModelForSeq2SeqLM selects the architecture that matches the checkpoint
# (the load log reports TFMarianMTModel for this one) and loads its pre-trained
# weights as a TensorFlow model. The model has an encoder (BERT-like) component
# and a decoder (GPT-like) component.
# NOTE(review): the load log warns that the TensorFlow/JAX classes are deprecated
# and will be removed in Transformers v5 — pin the transformers version or plan
# a migration to the PyTorch classes.





TensorFlow and JAX classes are deprecated and will be removed in Transformers v5. We recommend migrating to PyTorch classes or pinning your version of Transformers.
All model checkpoint layers were used when initializing TFMarianMTModel.

All the layers of TFMarianMTModel were initialized from the model checkpoint at Helsinki-NLP/opus-mt-en-hi.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFMarianMTModel for predictions without further training.


In [8]:
def translate_text(text_to_translate, max_length=50, num_beams=5):
    """Translate text using the notebook-level ``tokenizer`` and ``model``.

    Args:
        text_to_translate: Source-language text to translate.
        max_length: Maximum length of the generated output sequence.
        num_beams: Number of beams used for beam-search decoding.

    Returns:
        The decoded translation of the first generated sequence, or an empty
        string when ``text_to_translate`` is a blank / whitespace-only string.
    """
    # Nothing to translate: skip tokenization and the expensive generate call
    # instead of running beam search on an empty sequence.
    if isinstance(text_to_translate, str) and not text_to_translate.strip():
        return ""

    inputs = tokenizer(
        text_to_translate,
        return_tensors="tf",
        truncation=True,
        padding=True,
    )

    # Beam search keeps `num_beams` candidate sequences at each step;
    # `early_stopping=True` ends generation once all beam hypotheses have
    # finished, and `no_repeat_ngram_size=2` forbids any 2-gram from appearing
    # twice in the output.
    # The attention mask is forwarded together with the input IDs so that
    # positions added by `padding=True` are not attended to by the model.
    translated_tokens = model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=max_length,
        num_beams=num_beams,
        early_stopping=True,
        no_repeat_ngram_size=2,
    )

    # Decode the first generated sequence back into text.
    # `skip_special_tokens=True` drops the tokenizer's special tokens
    # (e.g. padding / end-of-sequence markers) from the decoded output.
    return tokenizer.decode(translated_tokens[0], skip_special_tokens=True)

In [None]:
#T = input('-> ')
#print(T,'\n')
#print(translate_text(T))

What is that one word that ignited to minds of Indians? 

वह एक शब्द क्या है जो आदिवासियों के मन में उभरता था?


In [None]:
import streamlit as st

# Minimal Streamlit front end: read a line of text and show its translation.
# NOTE(review): Streamlit widgets are rendered by `streamlit run <script>.py`;
# confirm this cell is meant to be exported to a script rather than executed
# inside the notebook kernel.
T = st.text_input("Enter text to translate:")

# Require at least one non-whitespace character so a blank submission does not
# trigger a model call.
if T and T.strip():
    translated_text = translate_text(T)
    st.write(f'Input Text: {T}')
    st.write(f'Translated Text: {translated_text}')