In [1]:
import pandas as pd
import json
import re
from tqdm.auto import tqdm
from datasets import Dataset

In [2]:
def load_data(filepath):
    with open(filepath, 'r', encoding='utf-8') as file:
        lines = file.readlines()
    return lines

en_lines = load_data('/kaggle/input/transl/v2/en-hi/train.en')
hi_lines = load_data('/kaggle/input/transl/v2/en-hi/train.hi')

In [3]:
train_data = {'input_text': en_lines, 'target_text': hi_lines}
df = pd.DataFrame(train_data)

In [4]:
df = df.sample(n=4000, random_state=42)
df

Unnamed: 0,input_text,target_text
5760611,Chief metropolitan magistrate Arun Kumar Garg ...,मुख्य मेट्रोपोलिटन मजिस्ट्रेट अरुण कुमार गर्ग ...
5114175,"Among these rulers, the most prominent ruler e...",इन शासकों में सबसे प्रमुख मिनांदर ने बौद्ध धर्...
1572385,Steps to check rising accidents of MIG planes\n,मिग विमानों की बढती दुर्घटनाओं को रोकने के लिए...
4552449,Another Witness provided us with an apartment.\n,"साथ ही, मुझे नाज़ है कि मैं एक यहोवा का साक्षी..."
2700552,They can take an informed decision based on in...,इस मोबाइल एप से किसान बाजार भावों के बारे में ...
...,...,...
4209266,"""""""Heartfelt congratulations to the people of ...",आप प्रमुख केजरीवाल ने ट्विटर पर राज्य में 'नई ...
8521330,IPL 2020 Mumbai Indians vs Royal Challengers B...,RCB vs MI IPL 2020 Super Over Match Report: इं...
2006386,The phone is powered by a 2759 mAh battery and...,फोन के 4 जीबी रैम और 128 जीबी स्टोरेज वेरिएंट ...
2664233,"""and spoke to them according to the counsel of...","और बूढ़ों की दी हुई सम्मति छोड़कर, जवानों की स..."


In [5]:
def preprocess_text(text):
    text = re.sub(r"\s+", " ", text).strip()
    return text

df['input_text'] = df['input_text'].apply(preprocess_text)
df['target_text'] = df['target_text'].apply(preprocess_text)

In [6]:
from sklearn.model_selection import train_test_split

train_df, temp_df = train_test_split(df, test_size=0.2, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

In [7]:
train_df

Unnamed: 0,input_text,target_text
2521371,"According to The Washington Post, residents in...","""एक्सेस नाउ के आंकड़ों के अनुसार द वाशिंगटन पोस..."
7124581,"In the previous session on Friday, the gold ha...","इससे पहले पिछले सत्र में बुधवार को सोना 50,138..."
4015210,Simplified Chinese characters,सरलीकृत चीनी वर्ण
8269749,Verma had challenged before the SC the Governm...,वर्मा ने अपने खिलाफ लगे आरोपों और सरकार द्वारा...
7198652,And how good it is that our overseers display ...,और यह कितना अच्छा है कि हमारे अध्यक्ष पौलुस के...
...,...,...
4617125,"He says that these days, hospitals in Syria ar...",उनका कहना है कि सीरिया के अस्पतालों में इस समय...
273776,BJP releases list of candidates,अपने साथ कई सवाल लेकर आई बीजेपी के उम्मीदवारों...
9313387,And we gotta tell this to everybody in New Yor...,और हमें न्यू यॉर्क सिटी में सबको यह बताना होगा!
5311592,"For India, Spotify is currently focusing on En...","Spotify म्युज़िक भारत में हिंदी, अंग्रेजी, पंज..."


In [8]:
train_dataset = Dataset.from_pandas(train_df[['input_text', 'target_text']])
val_dataset = Dataset.from_pandas(val_df[['input_text', 'target_text']])
test_dataset = Dataset.from_pandas(test_df[['input_text', 'target_text']])

# BART-base

In [9]:
from transformers import BartForConditionalGeneration, BartTokenizer

In [10]:
model = BartForConditionalGeneration.from_pretrained('facebook/bart-base')
tokenizer = BartTokenizer.from_pretrained('facebook/bart-base')

config.json:   0%|          | 0.00/1.72k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [11]:
def tokenize_function(examples):
    inputs = tokenizer(examples['input_text'], max_length=1024, truncation=True, padding="max_length")
    outputs = tokenizer(examples['target_text'], max_length=128, truncation=True, padding="max_length")
    
    labels = outputs.input_ids
    labels = [[label if label != tokenizer.pad_token_id else -100 for label in labels_example] for labels_example in labels]
    
    inputs["labels"] = labels
    return inputs

In [12]:
train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/3200 [00:00<?, ? examples/s]

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

In [13]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

2024-04-21 15:32:21.618034: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-21 15:32:21.618132: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-21 15:32:21.724811: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [14]:
trainer.train()

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011112745588889084, max=1.0…



Epoch,Training Loss,Validation Loss
1,No log,1.424799
2,1.626700,1.3581
3,1.466200,1.341986


Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


TrainOutput(global_step=1200, training_loss=1.528558349609375, metrics={'train_runtime': 1100.363, 'train_samples_per_second': 8.724, 'train_steps_per_second': 1.091, 'total_flos': 5853469999104000.0, 'train_loss': 1.528558349609375, 'epoch': 3.0})

In [15]:
model.save_pretrained("./bartbase-model")
tokenizer.save_pretrained("./bartbase-model")

Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


('./bartbase-model/tokenizer_config.json',
 './bartbase-model/special_tokens_map.json',
 './bartbase-model/vocab.json',
 './bartbase-model/merges.txt',
 './bartbase-model/added_tokens.json')

In [18]:
def translate_text(text, model, tokenizer, device):
    model.to(device)

    inputs = tokenizer(text, return_tensors="pt", max_length=1024, truncation=True)
    input_ids = inputs['input_ids'].to(device)
    attention_mask = inputs['attention_mask'].to(device)

    outputs = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_length=128,
        num_beams=4,
        early_stopping=True
    )

    translated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return translated_text


In [20]:
test_df

Unnamed: 0,input_text,target_text
6427512,"It is priced at Rs 8,787.","इसकी कीमत 12,700 रुपये रखी गई है।"
5275160,Mohammed Shami picked up three wickets for Ind...,युजवेंद्र चहल ने तीन और मोहम्मद शमी एवं हार्दि...
4289042,Of over 70 earthquakes recorded in the Hindu K...,इस शताब्दी में हिंदूकुश क्षेत्र के लगभग 70 अभि...
1364027,The motifs are distributed over planes of colo...,ये बनावट रंगो और टुकड़ो की सतह पर हज़ारो रंगे ...
2943759,This is good for your health.,यह आपके स्वास्थ्य के लिए काफी फायदेमंद होगी।
...,...,...
5452264,They are really an attempt to make us believe ...,शैतान की परीक्षाओं का सामना करने में हम यीशु क...
4130961,The video has gone viral on social media and p...,जिसका विडियो सोशल मीडिया में जमकर वायरल हो रहा...
1969049,Banbasa (Hindi: ) is a census town in Champawa...,बनबसा (Banbasa) भारत के उत्तराखण्ड राज्य के चम...
5576969,I am financially weak.,मैं आर्थिक रूप से कमजोर हूं।


In [21]:
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

first_entry = test_df.iloc[0]['input_text']
print("Original text:", first_entry)

translated_text = translate_text(first_entry, model, tokenizer, device)
print("Translated text:", translated_text)

Original text: It is priced at Rs 8,787.
Translated text: प्रधानियों में 8,787 के बाद है।


In [22]:
from tqdm.auto import tqdm

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

test_df['translated_text'] = ''

for index, row in tqdm(test_df.iterrows(), total=test_df.shape[0]):
    translated_text = translate_text(row['input_text'], model, tokenizer, device)
    test_df.at[index, 'translated_text'] = translated_text

print(test_df[['input_text', 'target_text', 'translated_text']])

  0%|          | 0/400 [00:00<?, ?it/s]

                                                input_text  \
6427512                          It is priced at Rs 8,787.   
5275160  Mohammed Shami picked up three wickets for Ind...   
4289042  Of over 70 earthquakes recorded in the Hindu K...   
1364027  The motifs are distributed over planes of colo...   
2943759                      This is good for your health.   
...                                                    ...   
5452264  They are really an attempt to make us believe ...   
4130961  The video has gone viral on social media and p...   
1969049  Banbasa (Hindi: ) is a census town in Champawa...   
5576969                             I am financially weak.   
2048248  He played just 8 Tests, 79 ODIs and 25 T20Is f...   

                                               target_text  \
6427512                  इसकी कीमत 12,700 रुपये रखी गई है।   
5275160  युजवेंद्र चहल ने तीन और मोहम्मद शमी एवं हार्दि...   
4289042  इस शताब्दी में हिंदूकुश क्षेत्र के लगभग 70 अभि...   
1364027

In [23]:
test_df

Unnamed: 0,input_text,target_text,translated_text
6427512,"It is priced at Rs 8,787.","इसकी कीमत 12,700 रुपये रखी गई है।","प्रधानियों में 8,787 के बाद है।"
5275160,Mohammed Shami picked up three wickets for Ind...,युजवेंद्र चहल ने तीन और मोहम्मद शमी एवं हार्दि...,प्रधानियों के बाद मुखारी पर संबराध कि साथ नहीं।
4289042,Of over 70 earthquakes recorded in the Hindu K...,इस शताब्दी में हिंदूकुश क्षेत्र के लगभग 70 अभि...,मुख्यों के अधिकार में 70 करना है।
1364027,The motifs are distributed over planes of colo...,ये बनावट रंगो और टुकड़ो की सतह पर हज़ारो रंगे ...,"प्रधानियों के बाद कहा, ‘‘मुख्’ का संसकार नहीं,..."
2943759,This is good for your health.,यह आपके स्वास्थ्य के लिए काफी फायदेमंद होगी।,उनके कहा है।
...,...,...,...
5452264,They are really an attempt to make us believe ...,शैतान की परीक्षाओं का सामना करने में हम यीशु क...,"इसके लिए कहा हैं, ‘प्रधान को पहलेंगों का मुखार..."
4130961,The video has gone viral on social media and p...,जिसका विडियो सोशल मीडिया में जमकर वायरल हो रहा...,प्रधानियों के बाद कहा है।
1969049,Banbasa (Hindi: ) is a census town in Champawa...,बनबसा (Banbasa) भारत के उत्तराखण्ड राज्य के चम...,प्रधानियों के बाद कि अपनी संबस्‍तार में।
5576969,I am financially weak.,मैं आर्थिक रूप से कमजोर हूं।,उन्हेंगों के लिए है।


In [None]:
test_df['translations_new'] = test_translations

### Bleu Score

In [30]:
import nltk
from nltk.translate.bleu_score import corpus_bleu

nltk.download('wordnet')
nltk.download('punkt')

references = [[ref.split()] for ref in test_df['target_text'].tolist()]
translated_texts = [translation.split() for translation in test_df['translated_text'].tolist()]

bleu1 = corpus_bleu(references, translated_texts, weights=(1, 0, 0, 0))
bleu2 = corpus_bleu(references, translated_texts, weights=(0.5, 0.5, 0, 0))
bleu3 = corpus_bleu(references, translated_texts, weights=(0.33, 0.33, 0.33, 0))
bleu4 = corpus_bleu(references, translated_texts, weights=(0.25, 0.25, 0.25, 0.25))

print(f"BLEU-1: {bleu1*100}")
print(f"BLEU-2: {bleu2*100}")
print(f"BLEU-3: {bleu3*100}")
print(f"BLEU-4: {bleu4*100}")

[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
BLEU-1: 3.2213694753984305
BLEU-2: 0.7368021936167561
BLEU-3: 0.1873038305497764
BLEU-4: 0.6017495802211925


### Rouge-score

In [29]:
!pip install rouge-score
from rouge_score import rouge_scorer

scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

rouge1_scores = []
rouge2_scores = []
rougeL_scores = []

for i in range(len(test_df)):
    reference = test_df['target_text'].iloc[i]
    translation = test_df['translated_text'].iloc[i]

    scores = scorer.score(reference, translation)

    rouge1_scores.append(scores['rouge1'].fmeasure)
    rouge2_scores.append(scores['rouge2'].fmeasure)
    rougeL_scores.append(scores['rougeL'].fmeasure)

average_rouge1 = sum(rouge1_scores) / len(rouge1_scores)
average_rouge2 = sum(rouge2_scores) / len(rouge2_scores)
average_rougeL = sum(rougeL_scores) / len(rougeL_scores)

print(f"Average ROUGE-1: {average_rouge1*100}")
print(f"Average ROUGE-2: {average_rouge2*100}")
print(f"Average ROUGE-L: {average_rougeL*100}")

Average ROUGE-1: 7.962402015768891
Average ROUGE-2: 1.9349834805717159
Average ROUGE-L: 7.811666721651243


### ChrF Score

In [27]:
!pip install sacrebleu
import sacrebleu

references = [[ref] for ref in test_df['target_text'].tolist()]  # List of lists
translations = test_df['translated_text'].tolist()

chrf_scores = []

for reference, translation in zip(references, translations):
    chrf = sacrebleu.sentence_chrf(translation, reference)  # Note the order: (hypothesis, reference)
    chrf_scores.append(chrf.score)

average_chrf_score = sum(chrf_scores) / len(chrf_scores)

print(f"Average ChrF Score: {average_chrf_score}")


Collecting sacrebleu
  Downloading sacrebleu-2.4.2-py3-none-any.whl.metadata (58 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.0/58.0 kB[0m [31m992.8 kB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
[?25hCollecting portalocker (from sacrebleu)
  Downloading portalocker-2.8.2-py3-none-any.whl.metadata (8.5 kB)
Downloading sacrebleu-2.4.2-py3-none-any.whl (106 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m106.7/106.7 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
[?25hDownloading portalocker-2.8.2-py3-none-any.whl (17 kB)
Installing collected packages: portalocker, sacrebleu
Successfully installed portalocker-2.8.2 sacrebleu-2.4.2
Average ChrF Score: 9.008351128044833


# M2M-100 is a multilingual model

In [31]:
!pip install transformers




In [37]:
pip install transformers sentencepiece


Note: you may need to restart the kernel to use updated packages.


In [38]:
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer

model_name = "facebook/m2m100_418M"
model = M2M100ForConditionalGeneration.from_pretrained(model_name)
tokenizer = M2M100Tokenizer.from_pretrained(model_name)


config.json:   0%|          | 0.00/908 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.94G [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


generation_config.json:   0%|          | 0.00/233 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/298 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/3.71M [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/1.14k [00:00<?, ?B/s]

In [39]:
def translate_text(text, model, tokenizer):

    tokenizer.src_lang = "en"
    model.config.forced_bos_token_id = tokenizer.get_lang_id("hi")

    encoded = tokenizer(text, return_tensors="pt")

    generated_tokens = model.generate(**encoded, forced_bos_token_id=model.config.forced_bos_token_id)

    translated_text = tokenizer.decode(generated_tokens[0], skip_special_tokens=True)
    return translated_text

sample_text = "Hello world!"
translated_text = translate_text(sample_text, model, tokenizer)
print("Translated text:", translated_text)



Translated text: हैलो दुनिया!


In [43]:
top10_translations = []
for index, row in tqdm(test_df.head(10).iterrows(), total=10):
    translation = translate_text(row['input_text'], model, tokenizer)
    top10_translations.append(translation)

test_df.head(10)

  0%|          | 0/10 [00:00<?, ?it/s]

Unnamed: 0,input_text,target_text,translated_text
6427512,"It is priced at Rs 8,787.","इसकी कीमत 12,700 रुपये रखी गई है।","इसकी कीमत 8,787 रुपये है।"
5275160,Mohammed Shami picked up three wickets for Ind...,युजवेंद्र चहल ने तीन और मोहम्मद शमी एवं हार्दि...,"मोहम्मद शमी ने भारत के लिए तीन विकेट उठाए, जिस..."
4289042,Of over 70 earthquakes recorded in the Hindu K...,इस शताब्दी में हिंदूकुश क्षेत्र के लगभग 70 अभि...,इस शताब्दी के दौरान हिंदू कुश क्षेत्र में 70 स...
1364027,The motifs are distributed over planes of colo...,ये बनावट रंगो और टुकड़ो की सतह पर हज़ारो रंगे ...,प्रेरणाओं को रंग की प्लेटों पर वितरित किया जात...
2943759,This is good for your health.,यह आपके स्वास्थ्य के लिए काफी फायदेमंद होगी।,यह आपके स्वास्थ्य के लिए अच्छा है।
7178817,"And yes, one more thing.","हाँ, और एक ज़रूरी बात .","और हाँ, एक और बात।"
7337618,These accounts have been frozen.,इन खातों को फ्रीज कराया जा रहा है।,इन खातों को ठंडा कर दिया गया है।
2483861,Security of Vikas Dubey should have been ensur...,विकास दुबे को न्यायिक हिरासत में रखते हुए इसकी...,Vikas Dubey की सुरक्षा को अदालत में सुरक्षित र...
8221597,Directs review of loan for Kakardooma Multi-st...,देरी के कारण दिल्ली में कड़कड़डूमा बहु-मंजिला ...,देरी के कारण रद्द करने के लिए दिल्ली में Kakar...
8568543,Shraddha will be seen next in a Luv Ranjan dir...,श्रद्धा जल्द ही रणबीर कपूर के साथ लव रंजन के न...,Shraddha अगली बार Ranbir Kapoor के साथ एक Luv ...
