<a href="https://colab.research.google.com/github/govardhan-06/Eng-Mal-Translator/blob/main/Translator.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [27]:
!pip install datasets -q
!pip install --upgrade accelerate -q
!pip uninstall -y transformers accelerate -q
!pip install transformers[torch] accelerate -q

In [28]:
from datasets import load_dataset

dataset= load_dataset("Govardhan-06/flores_eng_mal")
dataset

DatasetDict({
    train: Dataset({
        features: ['sentence_eng_Latn', 'sentence_mal_Mlym'],
        num_rows: 1808
    })
    test: Dataset({
        features: ['sentence_eng_Latn', 'sentence_mal_Mlym'],
        num_rows: 201
    })
})

In [29]:
#Renaming test data to validation data
dataset['validation']=dataset.pop('test')

In [30]:
dataset['train'][1500]

{'sentence_eng_Latn': 'Since 1988, ballot boxes must be transparent so that voters and observers can witness that no envelopes are present at the start of the vote and that no envelopes are added except those of the duly counted and authorized voters.',
 'sentence_mal_Mlym': '1988 മുതൽ ബാലറ്റ് ബോക്സുകൾ സുതാര്യമാണ്, അതിലൂടെ വോട്ടർമാർക്കും നിരീക്ഷകർക്കും വോട്ടെടുപ്പിന്റെ തുടക്കത്തിൽ ഒരു എൻ\u200cവലപ്പുകളും ഇല്ലെന്നതും ശരിയായി എണ്ണിയിട്ടുള്ളതും അംഗീകൃതവുമായ വോട്ടർമാർക്കും ഒഴികെ എൻ\u200cവലപ്പുകളൊന്നും ചേർത്തിട്ടില്ലെന്നതും കൃത്യമായി കാണാനും നിരീക്ഷിക്കാനും സാധിക്കും.'}

In [31]:
from transformers import AutoTokenizer,AutoModelForSeq2SeqLM

model_ckpt="Helsinki-NLP/opus-mt-en-ml"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt,return_tensors='pt')
model = AutoModelForSeq2SeqLM.from_pretrained(model_ckpt)



In [32]:
dataset['train']['sentence_eng_Latn'][0]

'Many buildings are quite beautiful to look at and the view from a tall building or from a cleverly-positioned window can be a beauty to behold.'

In [33]:
dataset['train']['sentence_mal_Mlym'][0]

'പല കെട്ടിടങ്ങളും കാണാൻ നല്ല ഭംഗിയാണ് കൂടാതെ ഉയരമുള്ള ഒരു കെട്ടിടത്തിൽ നിന്നോ അല്ലെങ്കിൽ ഉചിതമായി സ്ഥാനം നിശ്ചയിച്ച് പണിത ഒരു ജനലിലൂടെയോ കാണുന്ന കാഴ്ച്ച മനോഹരമായിരിക്കും.'

In [34]:
eng_sent= dataset['train']['sentence_eng_Latn'][0]
mal_sent= dataset['train']['sentence_mal_Mlym'][0]

model_input = tokenizer(eng_sent, text_target=mal_sent)
model_input

{'input_ids': [2186, 15180, 41, 6223, 2931, 8, 1034, 145, 6, 4, 1883, 52, 13, 13060, 3901, 121, 52, 13, 19575, 191, 20, 15930, 55, 8281, 137, 33, 13, 6070, 8, 2229, 2, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'labels': [788, 5675, 465, 1995, 432, 9884, 291, 1752, 3132, 381, 32, 5675, 109, 10460, 1060, 8863, 45, 4240, 4658, 25, 12024, 32, 1127, 180, 2091, 256, 3694, 7226, 389, 9572, 3213, 2, 0]}

In [35]:
dataset['train'].column_names

['sentence_eng_Latn', 'sentence_mal_Mlym']

In [36]:
max_length=128

def tokenize_function(examples):
  input= examples["sentence_eng_Latn"]
  target= examples["sentence_mal_Mlym"]
  model_input = tokenizer(input, text_target=target,max_length=max_length, truncation=True)
  return model_input

In [37]:
tokenized_inputs=dataset.map(tokenize_function,batched=True,remove_columns=dataset['train'].column_names)

Map:   0%|          | 0/1808 [00:00<?, ? examples/s]

Map:   0%|          | 0/201 [00:00<?, ? examples/s]

### Fine-tuning

In [38]:
from transformers import AutoModelForSeq2SeqLM

model = AutoModelForSeq2SeqLM.from_pretrained(model_ckpt)

In [39]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [40]:
batch = data_collator([tokenized_inputs["train"][i] for i in range(1, 3)])
batch.keys()

dict_keys(['input_ids', 'attention_mask', 'labels', 'decoder_input_ids'])

In [41]:
batch['labels']

tensor([[ 1859,   205,  1184, 10500,   193,  1897,    99,   513,  1463,   160,
          9223, 15540,  7590,   308,   567,  2162,  7971,  9206,     2,     0,
          -100,  -100,  -100],
        [ 2896,    95,  2439,  2784,   705,   176,  1568,  6230,  6972,  3583,
          3739,  1454,  2959,  2519,  2467,  1133, 16000,    25,  1959,   270,
         14102,     2,     0]])

In [42]:
!pip install evaluate -q
!pip install sacrebleu -q

In [43]:
import evaluate

metric = evaluate.load("sacrebleu")

In [44]:
import numpy as np
from datasets import load_metric
from transformers import EvalPrediction

# Load the BLEU metric
metric = load_metric("sacrebleu")

def compute_metrics(eval_preds: EvalPrediction):
    preds, labels = eval_preds

    # If preds is a tuple, extract the first element
    if isinstance(preds, tuple):
        preds = preds[0]

    # Ensure that preds is a numpy array
    preds = np.array(preds)

    # If preds are logits, get the token IDs by taking argmax
    if preds.ndim == 3:  # Assuming shape (batch_size, sequence_length, vocab_size)
        preds = np.argmax(preds, axis=-1)

    # Replace -100 in labels with the tokenizer pad token ID
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    # Decode the predictions and labels
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Post-process the decoded outputs
    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [label.strip() for label in decoded_labels]

    # SacreBLEU metric expects references to be a list of lists
    decoded_labels = [[label] for label in decoded_labels]

    # Compute the BLEU score
    result = metric.compute(predictions=decoded_preds, references=decoded_labels)

    # Return the BLEU score
    return {"bleu": result["score"]}


In [45]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [48]:
from transformers import Seq2SeqTrainingArguments

# Adjusting the dropout rate in the model configuration
model.config.dropout_rate = 0.3

args = Seq2SeqTrainingArguments(
    model_ckpt,
    eval_strategy="no",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=30,
    predict_with_generate=True,
    fp16=True,
    push_to_hub=True,
)

In [60]:
from transformers import Seq2SeqTrainer

trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_inputs["train"],
    eval_dataset=tokenized_inputs["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()

In [51]:
trainer.evaluate()

{'eval_loss': 3.3388736248016357,
 'eval_bleu': 9.690105080502189,
 'eval_runtime': 18.484,
 'eval_samples_per_second': 10.874,
 'eval_steps_per_second': 0.703,
 'epoch': 30.0}

In [52]:
trainer.push_to_hub(model_name="eng-mal-translator", commit_message="Model Training complete")

Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[24660]], 'forced_eos_token_id': 0}


model.safetensors:   0%|          | 0.00/227M [00:00<?, ?B/s]

events.out.tfevents.1720117777.444fcd30b326.928.3:   0%|          | 0.00/7.39k [00:00<?, ?B/s]

events.out.tfevents.1720118500.444fcd30b326.928.4:   0%|          | 0.00/726 [00:00<?, ?B/s]

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

CommitInfo(commit_url='https://huggingface.co/Govardhan-06/opus-mt-en-ml/commit/db7a0778893188cc74a45395040a7f82a9befcdd', commit_message='Model Training complete', commit_description='', oid='db7a0778893188cc74a45395040a7f82a9befcdd', pr_url=None, pr_revision=None, pr_num=None)

In [56]:
from transformers import pipeline
from transformers import AutoTokenizer,AutoModelForSeq2SeqLM

model_ckpt = "Govardhan-06/nllb-200-distilled-600M"
translator = pipeline("translation", model=model_ckpt)
model = AutoModelForSeq2SeqLM.from_pretrained(model_ckpt)
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

config.json:   0%|          | 0.00/896 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.46G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/154 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/39.9k [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/4.85M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.3M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/4.23k [00:00<?, ?B/s]

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [57]:
user_input=input("Enter the text to be translated: ")
user_input

Enter the text to be translated: Space Exploration Technologies Corporation, commonly referred to as SpaceX, is an American spacecraft manufacturer, launch service provider and satellite communications company headquartered in Hawthorne, California. The company was founded in 2002 by Elon Musk with the goal of reducing space transportation costs and ultimately developing a sustainable colony on Mars. The company currently produces and operates the Falcon 9 and Falcon Heavy rockets along with the Dragon and Starship spacecraft.


'Space Exploration Technologies Corporation, commonly referred to as SpaceX, is an American spacecraft manufacturer, launch service provider and satellite communications company headquartered in Hawthorne, California. The company was founded in 2002 by Elon Musk with the goal of reducing space transportation costs and ultimately developing a sustainable colony on Mars. The company currently produces and operates the Falcon 9 and Falcon Heavy rockets along with the Dragon and Starship spacecraft.'

In [58]:
input_ids = tokenizer(user_input, return_tensors="pt").input_ids
translated_ids = model.generate(input_ids=input_ids, num_beams=4, early_stopping=True)
translated_text = tokenizer.decode(translated_ids[0], skip_special_tokens=True)
print("Translated Malayalam text:", translated_text)

Translated Malayalam text: സ്പേസ് എക്സ്പ്ലോറേഷൻ ടെക്നോളജീസ് കോർപ്പറേഷൻ, സാധാരണയായി സ്പേസ് എക്സ് എന്ന് വിളിക്കപ്പെടുന്ന, ഒരു അമേരിക്കൻ ബഹിരാകാശ കപ്പൽ നിർമ്മാതാവ്, ലോഞ്ച് സർവീസ് പ്രൊവൈഡർ, കാലിഫോർണിയയിലെ ഹാവോൺ ആസ്ഥാനമായുള്ള സാറ്റലൈറ്റ് കമ്മ്യൂണിക്കേഷൻ കമ്പനിയാണ്. ബഹിരാകാശ ഗതാഗത ചെലവുകൾ കുറയ്ക്കാനും ഒടുവിൽ ചൊവ്വയിൽ സുസ്ഥിരമായ ഒരു കോളനി വികസിപ്പിക്കാനും ലക്ഷ്യമിട്ടാണ് കമ്പനി 2002-ൽ ഇലോൺ മസ്ക് സ്ഥാപിച്ചത്. നിലവിൽ കമ്പനി ഡ്രാഗൺ, സ്റ്റാർ ഷിപ്പ് ബഹിരാകാശ കപ്പുകൾക്കൊപ്പം ഫാൽക്കൺ 9, ഫാൽക്കൺ ഹെവി റോക്കറ്റുകൾ ഉൽപാദിപ്പിക്കുകയും പ്രവർത്തിക്കുകയും ചെയ്യുന്നു.
