In [133]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset,DataLoader
from torch.nn.utils.rnn import pad_sequence

import re
import string
import numpy as np
from pyvi import ViTokenizer
from torchvision import transforms
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [11]:
engs = []
vies = []
with open('Data/eng-vie.txt','r') as f:
    for line in f: 
        line = line.split('\t')
        engs.append(line[0])
        vies.append(line[1])

In [15]:
print(len(engs), len(vies))
print(engs[1000], vies[1000])

6369 6369
He has a foreign car. Ông ấy có một chiếc xe hơi nước ngoài.


In [70]:
#tách từ
def token_vi(sentence):
        return ViTokenizer.tokenize(sentence)
    
# xử lý
def clean_sentence(text, language = 'en'):
    text = text.lower()
    if language == 'en':
        text = re.sub(r"i'm", "i am", text)
        text = re.sub(r"he's", "he is", text)
        text = re.sub(r"she's", "she is", text)
        text = re.sub(r"it's", "it is", text)
        text = re.sub(r"that's", "that is", text)
        text = re.sub(r"what's", "that is", text)
        text = re.sub(r"where's", "where is", text)
        text = re.sub(r"how's", "how is", text)
        text = re.sub(r"\'ll", " will", text)
        text = re.sub(r"\'ve", " have", text)
        text = re.sub(r"\'re", " are", text)
        text = re.sub(r"\'d", " would", text)
        text = re.sub(r"\'re", " are", text)
        text = re.sub(r"won't", "will not", text)
        text = re.sub(r"can't", "cannot", text)
        text = re.sub(r"n't", " not", text)
        text = re.sub(r"n'", "ng", text)
        text = re.sub(r"'bout", "about", text)
        text = re.sub(r"'til", "until", text)
        #text = re.sub(r"([.!?])", r" \1", text)
        text = re.sub(r"[-()\"#/@;:<>{}`+=~|,!.?]", " ", text)      
    else:
        table = str.maketrans('','',string.punctuation)
        text = text.translate(table)
    text = text.replace('  ',' ',5)
    text = text.strip()
    return text


In [83]:
engs_all = [clean_sentence(i) for i in engs]
vies_all = [token_vi(clean_sentence(i)) for i in vies]


In [84]:
#view
list(zip(engs_all[100:120], vies_all[100:120]))

[('hang on tom', 'chờ chút tom'),
 ('i am at home', 'tôi đang ở nhà'),
 ('i am curious', 'tôi tò_mò'),
 ('i do not know', 'tôi không biết'),
 ('i do not know', 'tôi không hiểu'),
 ('i hear music', 'tôi nghe nhạc'),
 ('i like bread', 'tôi thích bánh_mì'),
 ('i like bread', 'tôi thích bánh mỳ'),
 ('i will not lose', 'tôi không thua đâu'),
 ('i would buy that', 'lấy cho tôi cái kia'),
 ('i am not free', 'tôi không rảnh'),
 ('i am not free', 'em không rảnh'),
 ('i am not free', 'mình không rảnh'),
 ('i am not free', 'tao không rảnh'),
 ('i am not free', 'tớ không rảnh'),
 ('i am retiring', 'tôi sắp nghỉ hưu'),
 ('is tom there', 'tom có ở đó không'),
 ('it is raining', 'trời đang mưa'),
 ('it is so easy', 'quá dễ'),
 ('it is so hard', 'việc này thật là khó')]

In [89]:
class Vocab():
    def __init__(self):
        self.n_words = 3
        self.word2index = {'<startseq>':1,'<endseq>':2,'<pad>':0, '<none>':3}
        self.word2count = {}
        self.index2word = {1:'<startseq>',2:'<endseq>',0:'<pad>',3:'<none>'}
        self.max_length = 0

        self._word_count = {}
    
    def fit(self, all_train_caption):
        # Thêm các từ vào từ điển và cập nhập các thông tin cơ bản của vocab
        for sentence in all_train_caption: 
          self.addSentence(sentence)
    
    def addSentence(self,sentence):
        sentence = sentence.split()
        if len(sentence)>self.max_length:
          self.max_length = len(sentence)
        for word in sentence:
          self.addWord(word)
    def addWord(self, word):
        if (word not in self.word2index):
          self.word2index[word] = self.n_words
          self.word2count[word] = 1
          self.index2word[self.n_words] = word
          self.n_words += 1
        else:
          self.word2count[word] += 1


In [90]:
vi_vocab = Vocab()
vi_vocab.fit(vies_all)


en_vocab = Vocab()
en_vocab.fit(engs_all)

In [94]:
print(f'Độ dài bộ từ điển \nvi:{vi_vocab.n_words}\nen:{en_vocab.n_words}')

Độ dài bộ từ điển 
vi:3383
en:3503


In [127]:
# tạo dataset 
class FlickrDataset(Dataset):
    def __init__(self,engs, vies,vi_vocab,en_vocab):

        self.engs = engs
        self.vies = vies
        self.transforms = transforms
        self.vi_vocab = vi_vocab
        self.en_vocab = en_vocab
    def tensorFromSentence(self,sentence,vocab,input= False):
        if input ==False:
            sentence = '<startseq> '+sentence +' <endseq>' 
        indexes = []
        for word in sentence.split():
            if word in vocab.word2index:
                indexes.append(vocab.word2index[word])
        return torch.tensor(indexes, dtype = torch.long)
    def __getitem__(self, index):
        input = self.tensorFromSentence(self.engs[index],en_vocab, input = True)
        target = self.tensorFromSentence(self.vies[index],vi_vocab)
        return input, target
    def __len__(self):
        return len(self.engs)

In [128]:
dataset = FlickrDataset(engs_all ,vies_all,vi_vocab,en_vocab)
dataset.__len__()

6369

In [129]:
train_dataset, test_dataset = torch.utils.data.random_split(dataset, [5000, 1369])

In [131]:
input, target = train_dataset[21]
print(' '.join([en_vocab.index2word[i.item()] for i in input ]))
print(' '.join([vi_vocab.index2word[i.item()] for i in target ]))

that is where i grew up
<startseq> đó là nơi tôi đã lớn lên <endseq>


In [138]:
# padding theo độ dài của batch giảm thiểu chi phí
def collate_fcn(batch):
    inputs = [item[0] for item in batch]
    inputs = pad_sequence(inputs, batch_first = True, padding_value = en_vocab.word2index['<pad>'])
    
    targets = [item[1] for item in batch]
    targets = pad_sequence(targets, batch_first=True, padding_value=vi_vocab.word2index['<pad>'])
    return inputs,targets

In [324]:
# dataloader
train_dataloader = DataLoader(train_dataset, batch_size=2, num_workers=1, collate_fn=collate_fcn)
test_dataloader = DataLoader(test_dataset, batch_size=2, num_workers=1,collate_fn=collate_fcn)

In [325]:
input = next(iter(test_dataloader))[0][2]
target = next(iter(test_dataloader))[1][2]
print(' '.join([en_vocab.index2word[i.item()] for i in input ]))
print(' '.join([vi_vocab.index2word[i.item()] for i in target ]))

IndexError: index 2 is out of bounds for dimension 0 with size 2

In [None]:
class Encoder(nn.Module):
    def __init__(self,embedding_size, hidden_size, en_vocab_size,num_layers,bidirectional, p):
        super(Encoder, self).__init__()
        
        self.dropout = nn.Dropout(p)
        self.embedding = nn.Embedding(en_vocab_size, embedding_size)
        self.lstm = nn.LSTM(embedding_size, hidden_size, batch_first= True,num_layers = num_layers ,dropout = p ,bidirectional = bidirectional)
        
    def forward(self,inputs):
        embed = self.embedding(inputs)
        
        outputs, (hidden,cell) = self.lstm(embed)
        return hidden,cell# bidirectional(2)* num_layers, batch_size, hidden_size

In [326]:
class Decoder(nn.Module):
    def __init__(self,embedidng_size, hidden_size, vi_vocab_size, num_layers,bidirectional, p):
        super(Decoder, self).__init__()
        
        self.embedding = nn.Embedding(vi_vocab_size, embedding_size)
        self.lstm = nn.LSTM(embedding_size, hidden_size, num_layers = num_layers, dropout = p, bidirectional = bidirectional,batch_first = True)
        self.fc = nn.Linear(hidden_size, vi_vocab_size)
    def forward(self, hidden, cell, x):
        x = x.unsqueeze(1)# batch_size,1
        embed = self.embedding(x)
        print(hidden.size())
        output, (hidden,cell) = self.lstm(embed,(hidden,cell))
        # output.shape = batch_size,1, hidden_dim
        
        output = self.fc(output)# batch_size,1, vocab_size
        output = output.squeeze(1)# batch_size, vocab_size
        
        return output, hidden, cell
    

In [327]:
class Seq2Seq(nn.Module):
    def __init__(self,embedding_size, hidden_size, en_vocab_size,vi_vocab_size,num_layers,bidirectional , p):
        super(Seq2Seq, self).__init__()
        self.vi_vocab_size = vi_vocab_size
        self.encoder = Encoder(embedding_size, hidden_size, en_vocab_size, num_layers,bidirectional,p)
        self.decoder = Decoder(embedding_size, hidden_size, vi_vocab_size, num_layers,bidirectional,p)
    def forward(self, input, targets, teacher_force_radio = 0.5):
        batch_size = input.size(1)
        targets_len = targets.size(1)
        
        outputs = torch.zeros((batch_size, targets_len, self.vi_vocab_size)).to(device)
        
        
        hidden, cell = self.encoder(input)# bidirectional(2 or 1)* num_layers, batch_size, hidden_size
        
        x = targets[:, 0]# (batch-size,)
        for i in range(1, targets_len):

            ouput, hidden,cell = self.decoder(hidden,cell,x)
            
            outputs[:,t,:] = output
            
            pred = output.argmax(1) 
            
            x= target[:,t] if teacher_force_radio>0.5 else pred
        return outputs#batch_size, targets_len, vi_vocab_size
            
        

In [328]:
embedding_size = 300
hidden_size = 512
en_vocab_size = en_vocab.n_words
vi_vocab_size = vi_vocab.n_words
num_layers = 2
bidirectional = False
p = 0.3

learning_rate = 0.001


In [329]:
model = Seq2Seq(embedding_size, hidden_size, en_vocab_size,vi_vocab_size,num_layers,bidirectional , p).to(device)
criterion= nn.CrossEntropyLoss(ignore_index= vi_vocab.word2index['<pad>'])
optimizer = torch.optim.Adam(model.parameters(), lr = learning_rate)

RuntimeError: CUDA out of memory. Tried to allocate 20.00 MiB (GPU 0; 1.96 GiB total capacity; 1023.94 MiB already allocated; 17.69 MiB free; 1.04 GiB reserved in total by PyTorch)

In [330]:
lst_loss = []
steps_loss = []
epochs = 5
for epoch in range(epochs):
    model.train()
    
    for i, (inputs, targets) in enumerate(train_dataloader):
        inputs = inputs.to(device)#batch_size, seq_len
        targets = targets.to(device)

        model.zero_grad()

        outputs = model(inputs, targets[:,:-1])

        loss = criterion(outputs[:,1:].view(-1,vi_vocab.n_words), targets[:,1:].reshape(-1))

    loss.backward()
    optimizer.step()

    steps_loss.append(loss.item())

    if i %500 ==0:
        print(f'Epoch: {epoch}, Step: {i}, Loss:{np.array(steps_loss).mean():.5f}')
        lst_loss.append(np.array(steps_loss).mean)
        steps_loss =[]
    torch.save(model.state_dict(),'model.pth')

torch.Size([2, 2, 512])


RuntimeError: Expected hidden[0] size (2, 1, 512), got [2, 2, 512]