# This is a tutorial for machine translation with T5

In [1]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch
import pandas as pd
import torch.nn as nn
import numpy as np
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
import evaluate

In [2]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# The tokenizer and the seq2seq model are loaded from the same pretrained
# checkpoint, so the name is spelled out once and reused for both.
MODEL_CHECKPOINT = "google/mt5-base"
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_CHECKPOINT)



## We will fine-tune the pretrained `google/mt5-base` model to convert Simplified Chinese (`sim`) into Traditional Chinese (`tra`).

In [3]:
# BLEU is the evaluation metric for the generated translations; it is loaded
# through the `evaluate` library and kept as a notebook-level object.
bleu = evaluate.load("bleu")

## First, we read the CSV file into a DataFrame

In [4]:
# Read the parallel-sentence CSV from the working directory. The dataset class
# defined later in this notebook reads its `sim` and `tra` columns.
df=pd.read_csv('chn_parallel.csv')

In [5]:
# Display the DataFrame to inspect its columns and a sample of its rows.
df

Unnamed: 0,eng,sim,tra
0,"On Monday, scientists from the Stanford Univer...",周一，斯坦福大学医学院的科学家宣布，他们发明了一种可以将细胞按类型分类的新型诊断工具：一种可...,史丹佛大學醫學院的科學家於週一宣布發明一項新型診斷工具，可依類型將細胞分類：這是一種細小的可...
1,Lead researchers say this may bring early dete...,主要研究人员表示，这可以让低收入国家/地区的患者尽早发现癌症、肺结核、艾滋病和疟疾。在这些国...,主要研究人員表示，這或許可以讓低收入國家的癌症、肺結核、愛滋病毒及瘧疾病患早期發現病症。在這...
2,The JAS 39C Gripen crashed onto a runway at ar...,当地时间上午 9:30 左右 (UTC 0230)，JAS 39C 鹰狮战斗机撞上跑道并发生...,JAS 39C 獅鷲戰鬥機在當地時間上午 9 點 30 分（世界協調時間 02:30）墜落在...
3,The pilot was identified as Squadron Leader Di...,涉事飞行员是空军中队长迪罗里·帕塔维 (Dilokrit Pattavee)。,駕駛員的身分確認是空軍少校帕塔維 (Dilokrit Pattavee)。
4,Local media reports an airport fire vehicle ro...,当地媒体报道，一辆机场消防车在响应火警时翻了车。,當地媒體報導一輛機場消防車在出勤時翻覆。
...,...,...,...
992,The tourist season for the hill stations gener...,山中避暑之地的旅游旺季通常是在印度的夏季。,避暑勝地的旅遊季節，通常在印度的夏季期間達到頂峰。
993,"However, they have a different kind of beauty ...",然而，冬天却另有一番不同的美景和魅力，许多山中避暑之地的雪量恰到好处。这些地方会提供活动项目...,然而，它們在冬天有不同的美與魅力。許多山間小鎮都會下不少的雪，且那裡也會提供滑雪和滑雪板等活動。
994,Only a few airlines still offer bereavement fa...,只有少数几家航空公司扔提供丧亲票价，这比为赶赴葬礼的临时出行稍微便宜一些。,只有少數航空公司仍然提供喪親票價，為臨時的葬禮旅行提供些微折扣。
995,"Airlines that offer these include Air Canada, ...",提供此类票价的航空公司包括加拿大航空公司、达美航空公司、德国汉莎航空公司（从美国或加拿大起飞...,提供這些服務的航空公司包括加拿大航空、達美航空、漢莎航空（從美國或加拿大出發的航班）和西捷航空。


In [6]:
#Create a customized dataset class
class CustomDataset(torch.utils.data.Dataset):
    """Map-style dataset that yields tokenized (source, target) sentence pairs.

    Each item is a tuple of three ``torch.long`` tensors of length
    ``max_length``: the source ``input_ids``, the source ``attention_mask``
    and the target token ids used as ``labels``.

    Args:
        dataframe: table holding one sentence pair per row.
        tokenizer: object exposing ``encode_plus`` that returns a mapping with
            ``'input_ids'`` and ``'attention_mask'`` entries.
        max_length: every sequence is padded/truncated to this many tokens.
            The default (512) matches the previous hard-coded value; a much
            smaller value reduces memory use when the sentences are short.
        source_column: name of the column holding the input sentences.
        target_column: name of the column holding the reference translations.
    """

    def __init__(self, dataframe, tokenizer, max_length=512,
                 source_column='sim', target_column='tra'):
        self.tokenizer = tokenizer
        self.max_length = max_length
        # The attribute names are kept from the original class so that code
        # reading `dataset.sim` / `dataset.tra` keeps working.
        self.sim = list(dataframe[source_column])
        self.tra = list(dataframe[target_column])

    def __len__(self):
        """Return the number of sentence pairs."""
        return len(self.sim)

    def _encode(self, text):
        """Tokenize one sentence, padded/truncated to ``self.max_length``."""
        return self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True
        )

    def __getitem__(self, index):
        """Return ``(input_ids, attention_mask, labels)`` for row ``index``."""
        # Source and target go through the same tokenizer settings; only the
        # token ids of the target are needed because they serve as the labels.
        inputs = self._encode(self.sim[index])
        outputs = self._encode(self.tra[index])

        return (
            torch.tensor(inputs['input_ids'], dtype=torch.long),
            torch.tensor(inputs['attention_mask'], dtype=torch.long),
            torch.tensor(outputs['input_ids'], dtype=torch.long),
        )

In [7]:
#Train test split and create dataloders

#train, test = train_test_split(df, test_size=0.5,random_state=12345)

train_set = CustomDataset(df, tokenizer)
trainloader = DataLoader(train_set, batch_size=2,shuffle=True)
#we only randomly pick 2000 samples as test_set
#test_set = CustomDataset(test[:2000], tokenizer)
#testloader = DataLoader(test_set, batch_size=2,shuffle=False)

In [8]:
#ground truth of test_set
true_list=test[:2000]['fra'].to_list()

NameError: name 'test' is not defined

In [9]:
def training(train_loader, model,optimizer, pad_token_id=0):
    """Train ``model`` for one epoch and return the mean batch loss.

    Args:
        train_loader: iterable yielding ``(input_ids, attention_mask, labels)``
            batches of integer tensors.
        model: ``torch.nn.Module`` whose forward pass accepts ``input_ids``,
            ``attention_mask`` and ``labels`` keyword arguments and returns an
            object with a ``.loss`` attribute.
        optimizer: optimizer that updates the model parameters.
        pad_token_id: token id used for padding in ``labels``. Defaults to 0,
            the value that was previously hard-coded.

    Returns:
        The average of the per-batch losses as a float, or ``nan`` when the
        loader yields no batches.
    """
    model.train()

    # Follow the model's own device instead of assuming CUDA: batches must live
    # where the parameters live, whether or not a GPU is available.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else None

    epoch_loss = 0.0
    num_batches = 0

    for input_ids, attention_mask, labels in train_loader:
        # replace padding token ids of the labels by -100 so they are ignored
        # by the loss; masked_fill returns a copy, so the batch handed in by
        # the loader is left untouched
        labels = labels.masked_fill(labels == pad_token_id, -100)

        if device is not None:
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            labels = labels.to(device)

        # calculate the loss
        loss = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels).loss

        # one optimizer step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # accumulate the loss
        epoch_loss += loss.item()
        num_batches += 1

    # Counting batches (rather than calling len()) also supports loaders
    # without __len__ and avoids a ZeroDivisionError on an empty loader.
    if num_batches == 0:
        return float('nan')
    return epoch_loss / num_batches

In [10]:
def evaluating(eval_loader, model,optimizer, references=None, **generate_kwargs):
    """Generate translations for every batch and score them with BLEU.

    Relies on the notebook-level ``tokenizer`` (to decode generated ids) and
    ``bleu`` (the loaded metric) objects.

    Args:
        eval_loader: iterable yielding ``(input_ids, attention_mask, labels)``
            batches; the labels are not used here.
        model: model exposing ``generate``; it is switched to eval mode.
        optimizer: unused; kept so existing calls keep working.
        references: reference translations, in the same order as the samples
            produced by ``eval_loader``. When omitted, the notebook-level
            ``true_list`` is used, as before.
        **generate_kwargs: extra keyword arguments forwarded to
            ``model.generate`` (for example a generation length limit).
            NOTE(review): with no extra arguments the library's default
            length limit applies, which may cut long translations short —
            confirm against the generation documentation.

    Returns:
        The dictionary returned by ``bleu.compute``.
    """
    model.eval()

    # Follow the model's own device instead of assuming CUDA.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else None

    pred_list = []
    with torch.no_grad():
        for input_ids, attention_mask, _ in eval_loader:
            if device is not None:
                input_ids = input_ids.to(device)
                attention_mask = attention_mask.to(device)

            #get the output sequence
            output_sequences = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                **generate_kwargs,
            )
            pred_list.extend(
                tokenizer.batch_decode(output_sequences, skip_special_tokens=True)
            )

    if references is None:
        references = true_list

    #calculate bleu score
    # NOTE(review): the metric's default tokenization presumably splits on
    # whitespace, which may not suit unsegmented Chinese text — verify.
    return bleu.compute(predictions=pred_list, references=references)

In [11]:
# Use the GPU when PyTorch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
model = model.to(device)

# AdamW over all model parameters; the 1e-4 learning rate follows the
# huggingface recommendation cited by the original author.
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)

In [12]:
# Fine-tune for five epochs, reporting the mean training loss after each one.
for epoch in range(1, 6):
    print(f'epochs:{epoch}')

    # one full pass over the training batches
    tr_loss = training(trainloader, model, optimizer)
    print(f'training_loss:{round(tr_loss, 5)}')

    # per-epoch evaluation is currently switched off
    #bleu_score=evaluating(testloader, model,optimizer)
    #print('bleu_score:'+str(round(bleu_score['bleu'], 5)))

epochs:1


RuntimeError: CUDA out of memory. Tried to allocate 978.00 MiB (GPU 0; 8.00 GiB total capacity; 6.66 GiB already allocated; 0 bytes free; 6.78 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

## Because of the large amount of training data, the training is done on a cluster.
We can run the `evaluating` function again to check the BLEU score after training.

In [22]:
# Score the trained model. This call needs `testloader` (and the `true_list`
# references read inside `evaluating`) to exist in the session.
bleu_score=evaluating(testloader, model,optimizer)

In [23]:
# Display the result returned by evaluating().
bleu_score

{'bleu': 0.4875162591403384,
 'precisions': [0.7436706689536878,
  0.5589662027833002,
  0.4481323877068558,
  0.3667598416026089],
 'brevity_penalty': 0.9535654925674059,
 'length_ratio': 0.9546109510086456,
 'translation_length': 14575,
 'reference_length': 15268}

## A simple example

In [14]:
#Check the model outputs after training
# NOTE(review): these example sentences are English, while the dataset class in
# this notebook feeds the model the `sim` column — confirm they match the task.
sentences = ["HuggingFace is a company.", "Welcome to NYC."]
# The model was moved to `device` earlier in the notebook, so the tokenized
# inputs must be placed on the same device before calling generate().
inputs = tokenizer(sentences, return_tensors="pt", padding=True).to(model.device)
output_sequences = model.generate(
    input_ids=inputs["input_ids"],
    attention_mask=inputs["attention_mask"],
)
print(tokenizer.batch_decode(output_sequences, skip_special_tokens=True))


['HuggingFace est une société.', 'Bienvenue à NYC.']


In [21]:
# Sanity-check the BLEU metric on two hand-written prediction/reference pairs
# (the first pair differs by one word, the second is identical).
bleu.compute(predictions=['HuggingFace est une société.', 'Bienvenue à NYC.'], references=["HuggingFace est une entreprise.", "Bienvenue à NYC."])

{'bleu': 0.5969491792019646,
 'precisions': [0.8888888888888888,
  0.7142857142857143,
  0.6,
  0.3333333333333333],
 'brevity_penalty': 1.0,
 'length_ratio': 1.0,
 'translation_length': 9,
 'reference_length': 9}