In [1]:
import pandas as pd
import json
import re
from tqdm.auto import tqdm
from transformers import BartForConditionalGeneration, BartTokenizer
from datasets import Dataset

In [2]:
def load_data(filepath):
    with open(filepath, 'r', encoding='utf-8') as file:
        lines = file.readlines()
    return lines

en_lines = load_data('/kaggle/input/transl/v2/en-hi/train.en')
hi_lines = load_data('/kaggle/input/transl/v2/en-hi/train.hi')

In [3]:
train_data = {'input_text': en_lines, 'target_text': hi_lines}
df = pd.DataFrame(train_data)

In [4]:
df = df.sample(n=4000, random_state=42)
df

Unnamed: 0,input_text,target_text
5760611,Chief metropolitan magistrate Arun Kumar Garg ...,मुख्य मेट्रोपोलिटन मजिस्ट्रेट अरुण कुमार गर्ग ...
5114175,"Among these rulers, the most prominent ruler e...",इन शासकों में सबसे प्रमुख मिनांदर ने बौद्ध धर्...
1572385,Steps to check rising accidents of MIG planes\n,मिग विमानों की बढती दुर्घटनाओं को रोकने के लिए...
4552449,Another Witness provided us with an apartment.\n,"साथ ही, मुझे नाज़ है कि मैं एक यहोवा का साक्षी..."
2700552,They can take an informed decision based on in...,इस मोबाइल एप से किसान बाजार भावों के बारे में ...
...,...,...
4209266,"""""""Heartfelt congratulations to the people of ...",आप प्रमुख केजरीवाल ने ट्विटर पर राज्य में 'नई ...
8521330,IPL 2020 Mumbai Indians vs Royal Challengers B...,RCB vs MI IPL 2020 Super Over Match Report: इं...
2006386,The phone is powered by a 2759 mAh battery and...,फोन के 4 जीबी रैम और 128 जीबी स्टोरेज वेरिएंट ...
2664233,"""and spoke to them according to the counsel of...","और बूढ़ों की दी हुई सम्मति छोड़कर, जवानों की स..."


In [5]:
def preprocess_text(text):
    text = re.sub(r"\s+", " ", text).strip()
    return text

df['input_text'] = df['input_text'].apply(preprocess_text)
df['target_text'] = df['target_text'].apply(preprocess_text)

In [7]:
from sklearn.model_selection import train_test_split

train_df, temp_df = train_test_split(df, test_size=0.2, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

In [8]:
train_df

Unnamed: 0,input_text,target_text
2521371,"According to The Washington Post, residents in...","""एक्सेस नाउ के आंकड़ों के अनुसार द वाशिंगटन पोस..."
7124581,"In the previous session on Friday, the gold ha...","इससे पहले पिछले सत्र में बुधवार को सोना 50,138..."
4015210,Simplified Chinese characters,सरलीकृत चीनी वर्ण
8269749,Verma had challenged before the SC the Governm...,वर्मा ने अपने खिलाफ लगे आरोपों और सरकार द्वारा...
7198652,And how good it is that our overseers display ...,और यह कितना अच्छा है कि हमारे अध्यक्ष पौलुस के...
...,...,...
4617125,"He says that these days, hospitals in Syria ar...",उनका कहना है कि सीरिया के अस्पतालों में इस समय...
273776,BJP releases list of candidates,अपने साथ कई सवाल लेकर आई बीजेपी के उम्मीदवारों...
9313387,And we gotta tell this to everybody in New Yor...,और हमें न्यू यॉर्क सिटी में सबको यह बताना होगा!
5311592,"For India, Spotify is currently focusing on En...","Spotify म्युज़िक भारत में हिंदी, अंग्रेजी, पंज..."


In [9]:
train_dataset = Dataset.from_pandas(train_df[['input_text', 'target_text']])
val_dataset = Dataset.from_pandas(val_df[['input_text', 'target_text']])
test_dataset = Dataset.from_pandas(test_df[['input_text', 'target_text']])

In [10]:
from transformers import T5ForConditionalGeneration, T5Tokenizer

In [11]:
model_name = 't5-base'  # You can use 't5-base', 't5-large', 't5-3b', or 't5-11b' depending on your requirements
model = T5ForConditionalGeneration.from_pretrained(model_name)
tokenizer = T5Tokenizer.from_pretrained(model_name)

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [12]:
def tokenize_function(examples):
    inputs = tokenizer(
        examples['input_text'],
        max_length=512,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )
    
    outputs = tokenizer(
        examples['target_text'],
        max_length=512,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )
    
    labels = outputs['input_ids']
    labels[labels == tokenizer.pad_token_id] = -100
    
    inputs["labels"] = labels
    return inputs


In [13]:
train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/3200 [00:00<?, ? examples/s]

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

In [15]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    evaluation_strategy="epoch",
    logging_dir="./logs",
    logging_steps=10,
    save_steps=500,
    warmup_steps=500,
    weight_decay=0.01,
    logging_first_step=True,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    save_strategy="epoch",  # Changed to match evaluation_strategy
)


In [16]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [17]:
trainer.train()

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss
1,0.3519,0.279046
2,0.329,0.266081
3,0.2538,0.265246


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=2400, training_loss=0.33587370147307716, metrics={'train_runtime': 1651.5055, 'train_samples_per_second': 5.813, 'train_steps_per_second': 1.453, 'total_flos': 5845995749376000.0, 'train_loss': 0.33587370147307716, 'epoch': 3.0})

In [18]:
model.save_pretrained("./t5-model")
tokenizer.save_pretrained("./t5-model")

('./t5-model/tokenizer_config.json',
 './t5-model/special_tokens_map.json',
 './t5-model/spiece.model',
 './t5-model/added_tokens.json')

In [28]:
def translate_text(text, model, tokenizer, device):
    input_ids = tokenizer.encode(text, return_tensors="pt", max_length=512, truncation=True).to(device)
    
    outputs = model.generate(
        input_ids=input_ids,
        max_length=128,
        num_beams=4,
        early_stopping=True
    )
    
    translated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return translated_text

In [29]:
import torch
from tqdm.auto import tqdm

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

test_translations = []

for text in tqdm(test_df['input_text'].tolist()):
    translation = translate_text(text, model, tokenizer, device)
    test_translations.append(translation)


  0%|          | 0/400 [00:00<?, ?it/s]

In [30]:
test_df['translations_new'] = test_translations

### Bleu Score

In [36]:
import nltk
from nltk.translate.bleu_score import corpus_bleu

In [37]:
nltk.download('wordnet')
nltk.download('punkt')

[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [38]:
references = [[ref.split()] for ref in test_df['target_text'].tolist()]
translated_texts = [translation.split() for translation in test_df['translations_new'].tolist()]

In [39]:
bleu1 = corpus_bleu(references, translated_texts, weights=(1, 0, 0, 0))
bleu2 = corpus_bleu(references, translated_texts, weights=(0.5, 0.5, 0, 0))
bleu3 = corpus_bleu(references, translated_texts, weights=(0.33, 0.33, 0.33, 0))
bleu4 = corpus_bleu(references, translated_texts, weights=(0.25, 0.25, 0.25, 0.25))

In [40]:
print(f"BLEU-1: {bleu1}")
print(f"BLEU-2: {bleu2}")
print(f"BLEU-3: {bleu3}")
print(f"BLEU-4: {bleu4}")

BLEU-1: 7.313334957916342e-25
BLEU-2: 2.0840685874253587e-25
BLEU-3: 1.1387783349603243e-25
BLEU-4: 6.680176605986215e-26


### Rouge-score

In [58]:
!pip install rouge-score
from rouge_score import rouge_scorer

scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

rouge1_scores = []
rouge2_scores = []
rougeL_scores = []

for i in range(len(test_df)):
    reference = test_df['target_text'].iloc[i]
    translation = test_df['translations_new'].iloc[i]

    scores = scorer.score(reference, translation)

    rouge1_scores.append(scores['rouge1'].fmeasure)
    rouge2_scores.append(scores['rouge2'].fmeasure)
    rougeL_scores.append(scores['rougeL'].fmeasure)

average_rouge1 = sum(rouge1_scores) / len(rouge1_scores)
average_rouge2 = sum(rouge2_scores) / len(rouge2_scores)
average_rougeL = sum(rougeL_scores) / len(rougeL_scores)

print(f"Average ROUGE-1: {average_rouge1}")
print(f"Average ROUGE-2: {average_rouge2}")
print(f"Average ROUGE-L: {average_rougeL}")


Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25ldone
[?25h  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=4c2bbd79664408d39891a104be1e46658de5e86a5014e3d76f2a550b75bfd7b1
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2
Average ROUGE-1: 0.09826839826839824
Average ROUGE-2: 0.0335233918128655
Average ROUGE-L: 0.09826839826839824


### ChrF Score

In [63]:
import sacrebleu

references = [[ref] for ref in test_df['target_text'].tolist()]  # List of lists
translations = test_df['translations_new'].tolist()

chrf_scores = []

for reference, translation in zip(references, translations):
    chrf = sacrebleu.sentence_chrf(translation, reference)  # Note the order: (hypothesis, reference)
    chrf_scores.append(chrf.score)

average_chrf_score = sum(chrf_scores) / len(chrf_scores)

print(f"Average ChrF Score: {average_chrf_score}")


Average ChrF Score: 0.7693213418945618
