# 1. Load pretrained Vietnamese word2vec

In [1]:
import torch
import torchtext.vocab as vocab

word_embedding = vocab.Vectors(name="vi_word2vec.txt",
                               unk_init=torch.Tensor.normal_)

word_embedding.vectors.shape

  from .autonotebook import tqdm as notebook_tqdm


torch.Size([1587507, 100])

In [2]:
def get_vector(embeddings, word):
    """ Get embedding vector of the word
    @param embeddings (torchtext.vocab.vectors.Vectors)
    @param word (str)
    @return vector (torch.Tensor)
    """
    assert word in embeddings.stoi, f'*{word}* is not in the vocab!'
    return embeddings.vectors[embeddings.stoi[word]]

def closest_words(embeddings, vector, n=10):
    """ Return n words closest in meaning to the word
    @param embeddings (torchtext.vocab.vectors.Vectors)
    @param vector (torch.Tensor)
    @param n (int)
    @return words (list(tuple(str, float)))
    """
    distances = [(word, torch.dist(vector, get_vector(embeddings, word)).item())
                 for word in embeddings.itos]
    
    return sorted(distances, key = lambda w: w[1])[:n]


word_vector = get_vector(word_embedding, "Việt_Nam")

closest_words(word_embedding, word_vector)

[('Việt_Nam', 0.0),
 ('VN', 0.6608753204345703),
 ('Trung_Quốc', 0.6805075407028198),
 ('nước', 0.7456551790237427),
 ('TQ', 0.7542526721954346),
 ('của', 0.7784993648529053),
 ('biển', 0.7814522385597229),
 ('vùng_biển', 0.7835540175437927),
 ('Singapore', 0.7879586219787598),
 ('và', 0.7881313562393188)]

# 2. Vocabulary.py

In [3]:
import torch
from tqdm import tqdm
from underthesea import word_tokenize


class Vocabulary:
    """ The Vocabulary class is used to record words, which are used to convert 
        text to numbers and vice versa.
    """

    def __init__(self):
        self.word2id = dict()
        self.word2id['<pad>'] = 0   # Pad Token
        self.word2id['<unk>'] = 1   # Unknown Token
        self.unk_id = self.word2id['<unk>']
        self.id2word = {v: k for k, v in self.word2id.items()}

    def __getitem__(self, word):
        return self.word2id.get(word, self.unk_id)

    def __contains__(self, word):
        return word in self.word2id 
    
    def __len__(self):
        return len(self.word2id)

    def id2word(self, word_index):
        """
        @param word_index (int)
        @return word (str)
        """
        return self.id2word[word_index]

    def add(self, word):
        """ Add word to vocabulary
        @param word (str)
        @return index (str): index of the word just added
        """
        if word not in self:
            word_index = self.word2id[word] = len(self.word2id)
            self.id2word[word_index] = word
            return word_index
        else:
            return self[word]

    @staticmethod
    def tokenize_corpus(corpus):
        """Split the documents of the corpus into words
        @param corpus (list(str)): list of documents
        @return tokenized_corpus (list(list(str))): list of words
        """
        print("Tokenize the corpus...")
        tokenized_corpus = list()
        for document in tqdm(corpus):
            tokenized_document = [word.replace(" ", "_") for word in word_tokenize(document)]
            tokenized_corpus.append(tokenized_document)

        return tokenized_corpus

    def corpus_to_tensor(self, corpus, is_tokenized=False):
        """ Convert corpus to a list of indices tensor
        @param corpus (list(str) if is_tokenized==False else list(list(str)))
        @param is_tokenized (bool)
        @return indicies_corpus (list(tensor))
        """
        if is_tokenized:
            tokenized_corpus = corpus
        else:
            tokenized_corpus = self.tokenize_corpus(corpus)
        indicies_corpus = list()
        for document in tqdm(tokenized_corpus):
            indicies_document = torch.tensor(list(map(lambda word: self[word], document)),
                                             dtype=torch.int64)
            indicies_corpus.append(indicies_document)

        return indicies_corpus

    def tensor_to_corpus(self, tensor):
        """ Convert list of indices tensor to a list of tokenized documents
        @param indicies_corpus (list(tensor))
        @return corpus (list(list(str)))
        """
        corpus = list()
        for indicies in tqdm(tensor):
            document = list(map(lambda index: self.id2word[index.item()], indicies))
            corpus.append(document)

        return corpus

In [4]:
corpus_sample = ["Với cộng đồng người Bách Việt trước đây, việc thuần hóa mèo cũng có thể theo cách thức như vậy.",
                 "Tuy nhiên, rất khó xác định được thời gian cụ thể loài mèo được thuần hóa.",
                 "Chỉ biết rằng, từ xa xưa, mèo đã là vật nuôi thân quen trong hầu hết gia đình nông dân Việt Nam."]

Vocabulary.tokenize_corpus(corpus_sample)

Tokenize the corpus...


100%|██████████| 3/3 [00:00<00:00, 36.55it/s]


[['Với',
  'cộng_đồng',
  'người',
  'Bách',
  'Việt',
  'trước',
  'đây',
  ',',
  'việc',
  'thuần_hóa',
  'mèo',
  'cũng',
  'có_thể',
  'theo',
  'cách_thức',
  'như_vậy',
  '.'],
 ['Tuy_nhiên',
  ',',
  'rất',
  'khó',
  'xác_định',
  'được',
  'thời_gian',
  'cụ_thể',
  'loài',
  'mèo',
  'được',
  'thuần_hóa',
  '.'],
 ['Chỉ',
  'biết',
  'rằng',
  ',',
  'từ',
  'xa_xưa',
  ',',
  'mèo',
  'đã',
  'là',
  'vật_nuôi',
  'thân_quen',
  'trong',
  'hầu_hết',
  'gia_đình',
  'nông_dân',
  'Việt_Nam',
  '.']]

In [5]:
vocab = Vocabulary()

# create vocabulary from pretrained word2vec
words_list = list(word_embedding.stoi.keys())
for word in words_list:
    vocab.add(word)

# test the vocabulary
tensor = vocab.corpus_to_tensor(corpus_sample)
corpus = vocab.tensor_to_corpus(tensor)
" ".join(corpus[0])

Tokenize the corpus...


100%|██████████| 3/3 [00:00<00:00, 373.75it/s]
100%|██████████| 3/3 [00:00<?, ?it/s]
100%|██████████| 3/3 [00:01<00:00,  2.27it/s]


'Với cộng_đồng người Bách Việt trước đây , việc <unk> mèo cũng có_thể theo cách_thức như_vậy .'

# 3. IMDBDataset.py

In [6]:
import pandas as pd
import torch
from torch.utils.data import Dataset


class IMDBDataset(Dataset):
    """ Load dataset from file csv
    """

    def __init__(self, vocab, csv_fpath=None, tokenized_fpath=None):
        """
        @param vocab (Vocabulary)
        @param csv_fpath (str)
        @param tokenized_fpath (str)
        """
        self.vocab = vocab
        df = pd.read_csv(csv_fpath)
        self.sentiments_list = list(df.sentiment)
        self.reviews_list = list(df.vi_review)

        sentiments_type = list(set(self.sentiments_list))
        sentiments_type.sort()
        
        self.sentiment2id = {sentiment: i for i, sentiment in enumerate(sentiments_type)}

        if tokenized_fpath:
            self.tokenized_reviews = torch.load(tokenized_fpath)
        else:
            self.tokenized_reviews = self.vocab.tokenize_corpus(self.reviews_list)

        self.tensor_data = self.vocab.corpus_to_tensor(self.tokenized_reviews, is_tokenized=True)
        self.tensor_label = torch.tensor([self.sentiment2id[sentiment] for sentiment in self.sentiments_list],
                                         dtype=torch.float64)
        
    def __len__(self):
        return len(self.tensor_data)

    def __getitem__(self, idx):
        return self.tensor_data[idx], self.tensor_label[idx]

In [7]:
# dataset = IMDBDataset(vocab, "VI_IMDB.csv", "tokenized.pt")
dataset = IMDBDataset(vocab, "VI_IMDB.csv")

Tokenize the corpus...


100%|██████████| 50000/50000 [31:42<00:00, 26.28it/s]  
100%|██████████| 50000/50000 [00:09<00:00, 5137.69it/s]


In [8]:
torch.save(dataset.tokenized_reviews, "tokenized.pt")

# 4. Split data

In [10]:
from torch.utils.data import random_split


split_rate = 0.8
full_size = len(dataset)
train_size = (int)(split_rate * full_size)
valid_size = (int)((full_size - train_size)/2)
test_size = full_size - train_size - valid_size
train_dataset, valid_dataset, test_dataset = random_split(dataset, 
                                                          lengths=[train_size, valid_size, test_size])

len(train_dataset), len(valid_dataset), len(test_dataset)

(40000, 5000, 5000)

# 5. Create batch iterator

In [11]:
import numpy as np
import math

import torch


def batch_iterator(dataset, batch_size, pad_idx, device):
    """ Yield the reviews and sentiments of the dataset in batches
    @param dataset (IMDBDataset)
    @param batch_size (int)
    @param pad_idx (int)
    @param device (torch.device)
    @yield dict {"reviews": tuple(torch.Tensor, torch.Tensor), "sentiments": torch.Tensor} 
    """
    batch_num = math.ceil(len(dataset) / batch_size)
    index_array = list(range(len(dataset)))

    np.random.shuffle(index_array)

    for i in range(batch_num):
        indices = index_array[i * batch_size: (i + 1) * batch_size]
        examples = [dataset[idx] for idx in indices]

        examples = sorted(examples, key=lambda e: len(e[0]), reverse=True)

        reviews = [e[0] for e in examples]
        reviews = torch.nn.utils.rnn.pad_sequence(reviews, 
                                                  batch_first=False, 
                                                  padding_value=pad_idx).to(device)
        reviews_lengths = torch.tensor([len(e[0]) for e in examples])
        sentiments = torch.tensor([e[1] for e in examples]).to(device)

        yield {"reviews": (reviews, reviews_lengths), "sentiments": sentiments} 

# 6. Recurrent Neural Network model

In [12]:
import torch.nn as nn


class RNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, n_layers, 
                 bidirectional, dropout, pad_idx):
        """
        @param vocab_size (int)
        @param embedding_dim (int)
        @param hidden_dim (int)
        @param n_layers (int)
        @param bidirectional (bool)
        @param dropout (float)
        @param pad_idx (int)
        """
        super().__init__()
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx = pad_idx)
        
        self.rnn = nn.LSTM(embedding_dim, 
                           hidden_dim, 
                           num_layers=n_layers, 
                           bidirectional=bidirectional, 
                           dropout=dropout)
        
        self.fc = nn.Linear(hidden_dim * 2, 1)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, text, text_lengths):
        """
        @param text (torch.Tensor): shape = [sent len, batch size]
        @param text_lengths (torch.Tensor): shape = [batch size]
        @return
        """
        #text = [sent len, batch size]
        embedded = self.dropout(self.embedding(text))
        
        #embedded = [sent len, batch size, emb dim]
        
        #pack sequence
        # lengths need to be on CPU!
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths.to('cpu'))
        
        packed_output, (hidden, cell) = self.rnn(packed_embedded)
        
        #unpack sequence
        output, output_lengths = nn.utils.rnn.pad_packed_sequence(packed_output)

        #output = [sent len, batch size, hid dim * num directions]
        #output over padding tokens are zero tensors
        
        #hidden = [num layers * num directions, batch size, hid dim]
        #cell = [num layers * num directions, batch size, hid dim]
        
        #concat the final forward (hidden[-2,:,:]) and backward (hidden[-1,:,:]) hidden layers
        #and apply dropout
        
        hidden = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1))
                
        #hidden = [batch size, hid dim * num directions]
            
        return self.fc(hidden)

In [13]:
INPUT_DIM = word_embedding.vectors.shape[0]
EMBEDDING_DIM = 100
BATCH_SIZE = 100
HIDDEN_DIM = 256
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.5
PAD_IDX = vocab["<pad>"]
UNK_IDX = vocab["<unk>"]

model = RNN(INPUT_DIM, 
            EMBEDDING_DIM, 
            HIDDEN_DIM, 
            N_LAYERS, 
            BIDIRECTIONAL, 
            DROPOUT, 
            PAD_IDX)

In [14]:
model.embedding.weight.data.copy_(word_embedding.vectors)
model.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)

In [15]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 161,061,357 trainable parameters


# 7. Train the model

In [17]:
import torch.optim as optim

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

optimizer = optim.Adam(model.parameters())
criterion = nn.BCEWithLogitsLoss().to(device)

model = model.to(device)

In [18]:
def binary_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    @param preds (torch.Tensor): shape = [batch_size]
    @param y (torch.Tensor): shape = [batch_size]
    @return acc (torch.Tensor): shape = [1]
    """
    #round predictions to the closest integer
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float() #convert into float for division 
    acc = correct.sum() / len(correct)
    return acc

In [19]:
def train(model, dataset, batch_size, optimizer, criterion, pad_idx, device):
    """
    @param model (RNN)
    @param dataset (IMDBDataset)
    @param batch_size (int)
    @param optimizer (torch.optim)
    @param criterion (torch.nn.modules.loss)
    @param pad_idx (int)
    @param device (torch.device)
    @return epoch_loss (float): model's loss of this epoch
    @return epoch_acc (float): model's accuracy of this epoch 
    """
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    for batch in batch_iterator(dataset, batch_size, pad_idx, device):
        
        optimizer.zero_grad()
        
        reviews, reviews_lengths = batch["reviews"]
        
        predictions = model(reviews, reviews_lengths).squeeze(1)
        
        loss = criterion(predictions, batch["sentiments"])
        
        acc = binary_accuracy(predictions, batch["sentiments"])
        
        loss.backward()
        
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
    
    batch_num = math.ceil(len(dataset) / batch_size)
    return epoch_loss / batch_num, epoch_acc / batch_num

In [20]:
def evaluate(model, dataset, batch_size, criterion, pad_idx, device):
    """
    @param model (RNN)
    @param dataset (IMDBDataset)
    @param batch_size (int)
    @param criterion (torch.nn.modules.loss)
    @param pad_idx (int)
    @param device (torch.device)
    @return epoch_loss (float): model's loss of this epoch
    @return epoch_acc (float): model's accuracy of this epoch 
    """
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    
    with torch.no_grad():
    
        for batch in batch_iterator(dataset, batch_size, pad_idx, device):

            reviews, reviews_lengths = batch["reviews"]
            
            predictions = model(reviews, reviews_lengths).squeeze(1)
            
            loss = criterion(predictions, batch["sentiments"])
            
            acc = binary_accuracy(predictions, batch["sentiments"])

            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    batch_num = math.ceil(len(dataset) / batch_size)
    return epoch_loss / batch_num, epoch_acc / batch_num

In [21]:
import time

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [22]:
N_EPOCHS = 5

best_valid_loss = float("inf")

for epoch in range(N_EPOCHS):

    start_time = time.time()
    
    train_loss, train_acc = train(model, train_dataset, BATCH_SIZE, 
                                  optimizer, criterion, PAD_IDX, device)
    valid_loss, valid_acc = evaluate(model, valid_dataset, BATCH_SIZE, 
                                     criterion, PAD_IDX, device)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), "model.pt")
    
    print(f"Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s")
    print(f"\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%")
    print(f"\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%")

# 8. Test the model

In [None]:
test_loss, test_acc = evaluate(model, test_dataset, BATCH_SIZE, 
                               criterion, PAD_IDX, device)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

Test Loss: 0.270 | Test Acc: 89.48%


In [None]:
def predict_sentiment(model, sentence, vocab, device):
    model.eval()
    corpus = [sentence]
    tensor = vocab.corpus_to_tensor(corpus)[0].to(device)
    tensor = tensor.unsqueeze(1)
    length = [len(tensor)]
    length_tensor = torch.LongTensor(length)
    prediction = torch.sigmoid(model(tensor, length_tensor))
    return prediction.item()

In [None]:
dataset.sentiment2id

{'positive': 0, 'negative': 1}

In [None]:
sentence = "Bộ phim này rất dở! Nội dung cực kì nhàm chán"

predict_sentiment(model, sentence, vocab, device)

Tokenize the corpus...


100%|██████████| 1/1 [00:00<00:00, 270.29it/s]
100%|██████████| 1/1 [00:00<00:00, 6232.25it/s]


0.9958381652832031

In [None]:
sentence = "Bộ phim này rất hay! Nhiều tình tiết rất kịch tính."

predict_sentiment(model, sentence, vocab, device)

Tokenize the corpus...


100%|██████████| 1/1 [00:00<00:00, 1456.86it/s]
100%|██████████| 1/1 [00:00<00:00, 8422.30it/s]


0.032079365104436874