In [1]:
import transformers 
import torch
import datasets
from datasets import Dataset

In [2]:
import pandas as pd
data_dev = pd.read_xml("./data/NEWS2018_M-EnHi_dev.xml")
data_train = pd.read_xml("./data/NEWS2018_M-EnHi_trn.xml")

In [3]:
data_dev.head()

Unnamed: 0,ID,SourceName,TargetName
0,1,aachaaryanandana,आचार्यनंदना
1,2,aachaarysut,आचार्यसुत
2,3,aacharynandan,आचार्यनंदन
3,4,aacharynandanaa,आचार्यनंदना
4,5,aadamkhor,आदमखोर


In [4]:
translation_list = []
for src, trgt in zip(data_dev['SourceName'], data_dev['TargetName']):
    dic = {}
    dic['en'] = src
    dic['hi'] = trgt
    translation_list.append(dic)
    
data_dev['translation'] = translation_list

In [5]:
translation_list = []
for src, trgt in zip(data_train['SourceName'], data_train['TargetName']):
    dic = {}
    dic['en'] = src
    dic['hi'] = trgt
    translation_list.append(dic)
    
data_train['translation'] = translation_list

In [6]:
for col in ['SourceName', 'TargetName']:
    del data_dev[col]
    del data_train[col]

In [7]:
data_dev.columns = ['id', 'translation']
data_train.columns = ['id', 'translation']

In [8]:
ds_dev = Dataset.from_pandas(data_dev)
ds_train = Dataset.from_pandas(data_train)

In [9]:
split_datasets = ds_train.train_test_split(train_size=0.9, seed=20)
split_datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'translation'],
        num_rows: 11643
    })
    test: Dataset({
        features: ['id', 'translation'],
        num_rows: 1294
    })
})

In [10]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-hi")
model = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-en-hi")

In [11]:
# for i in range(ds_dev.shape[0]):
#     target = ds_dev[i]['TargetName']
#     hin_snippet = ds_dev[i]['SourceName']

#     inputs = tokenizer.encode(
#         hin_snippet, return_tensors="pt",padding=True,max_length=512,truncation=True)

#     outputs = model.generate(
#         inputs, max_length=128, num_beams=None, early_stopping=True)

#     translated = tokenizer.decode(outputs[0]).replace('<pad>',"").strip().lower()
#     print(hin_snippet, translated, target)
#     break

In [28]:
max_length = 128

def preprocess_function(examples):
    inputs = [ex['en'] for ex in examples['translation']]
    targets = [ex['hi'] for ex in examples['translation']]
    model_inputs = tokenizer(
        inputs, 
        text_target=targets,
        max_length=max_length,
        padding=True,
#         truncation=True
    )
    return model_inputs
    

In [29]:
tokenized_datasets_train = split_datasets.map(
                    preprocess_function,
                    batched=True,
                    remove_columns=ds_train.column_names,
)

  0%|          | 0/12 [00:00<?, ?ba/s]



  0%|          | 0/2 [00:00<?, ?ba/s]

In [30]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors='pt')

In [31]:
# tokenized_datasets_train['train'][1]

In [32]:
batch = data_collator([tokenized_datasets_train['train'][i] for i in range(1, 3)])
batch.keys()

dict_keys(['input_ids', 'attention_mask', 'labels', 'decoder_input_ids'])

In [33]:
tokenized_datasets_train

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 11643
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 1294
    })
})

In [34]:
import evaluate

metric = evaluate.load("sacrebleu")

In [35]:
import numpy as np

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    
    if isinstance(preds, tuple):
        preds =  preds[0]
        
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    
    labels = np.where(labels!=-100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [[label.strip()] for label in decoded_labels]
    
    result = metric.compute(predictions=decoded_preds,
                           references=decoded_labels)
    return {        'bleu': result['score']    }

In [51]:
from transformers import Seq2SeqTrainingArguments

args = Seq2SeqTrainingArguments(
    f'divyanshu-finetuned-hi-en',
    evaluation_strategy='epoch',
    save_strategy='epoch',
    learning_rate=2e-5,
    per_device_eval_batch_size=16,
    per_device_train_batch_size=4,
    weight_decay=0.01,
    save_total_limit=2,
    num_train_epochs=5,
#     gradient_accumulation_steps=4,
    report_to='wandb',
    predict_with_generate=True,
    fp16=True
)

PyTorch: setting up devices


In [52]:
from transformers import Seq2SeqTrainer

In [53]:
import wandb

In [54]:
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_datasets_train['train'],
    data_collator=data_collator,
    eval_dataset=tokenized_datasets_train['test'],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

Using cuda_amp half precision backend


In [55]:
# for i in tokenized_datasets_train['train']:
#     print(i)

In [56]:
# trainer.evaluate(max_length=max_length)

In [57]:
trainer.train()

***** Running training *****
  Num examples = 11643
  Num Epochs = 5
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 14555
  Number of trainable parameters = 75856896
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Epoch,Training Loss,Validation Loss,Bleu
1,0.6363,0.725785,11.080936
2,0.5053,0.702442,11.760029
3,0.4226,0.692455,16.148158
4,0.3665,0.686729,15.230108
5,0.3313,0.685952,15.028662


***** Running Evaluation *****
  Num examples = 1294
  Batch size = 16
Saving model checkpoint to divyanshu-finetuned-hi-en/checkpoint-2911
Configuration saved in divyanshu-finetuned-hi-en/checkpoint-2911/config.json
Model weights saved in divyanshu-finetuned-hi-en/checkpoint-2911/pytorch_model.bin
tokenizer config file saved in divyanshu-finetuned-hi-en/checkpoint-2911/tokenizer_config.json
Special tokens file saved in divyanshu-finetuned-hi-en/checkpoint-2911/special_tokens_map.json
Deleting older checkpoint [divyanshu-finetuned-hi-en/checkpoint-2908] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 1294
  Batch size = 16
Saving model checkpoint to divyanshu-finetuned-hi-en/checkpoint-5822
Configuration saved in divyanshu-finetuned-hi-en/checkpoint-5822/config.json
Model weights saved in divyanshu-finetuned-hi-en/checkpoint-5822/pytorch_model.bin
tokenizer config file saved in divyanshu-finetuned-hi-en/checkpoint-5822/tokenizer_config.json
Special tokens f

TrainOutput(global_step=14555, training_loss=0.45544327775541754, metrics={'train_runtime': 1484.6663, 'train_samples_per_second': 39.211, 'train_steps_per_second': 9.804, 'total_flos': 355716200005632.0, 'train_loss': 0.45544327775541754, 'epoch': 5.0})

Dev set evaluation

In [58]:
trainer.evaluate(max_length=max_length)

***** Running Evaluation *****
  Num examples = 1294
  Batch size = 16


{'eval_loss': 0.6859515905380249,
 'eval_bleu': 15.0286623765927,
 'eval_runtime': 13.6359,
 'eval_samples_per_second': 94.897,
 'eval_steps_per_second': 5.94,
 'epoch': 5.0}