<a href="https://colab.research.google.com/github/fromstar/NLU-Project-LM2/blob/main/223727_NLU_LM2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Import Libraries

In [1]:
import torch
import math
import torch.nn as nn
import time
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence, pad_sequence
from torch.optim import optimizer
import torch.nn.functional as F
import os


from google.colab import drive
from google.colab import files
from sklearn.preprocessing import MultiLabelBinarizer


In [2]:
#Main path to dataset:

drive.mount('/content/drive/', force_remount = False)

Mounted at /content/drive/


In [3]:
!cp -R "/content/drive/MyDrive/input/"  "/content/input" 

Global Variables

In [4]:
data = "/content/input"
device = 'cuda:0'

embedding_size = 650
hidden_size =650
nlayers = 1
learning_rate = 0.001
clip = 0.35
epochs = 25
batch_size = 64
eval_batch_size = 1
dropout = 0.5
interval = 50


Corpus

In [5]:
class Corpus(object):
    def __init__(self):
        self.dictionary = {0: '<pad>', 1: '<unk>', 2: '<bos>', 3: '<eos>'}
        self.len_dict = len(self.dictionary)
        self.train = self.to_token(os.path.join(data, 'ptb.train.txt'))
        self.test = self.to_token(os.path.join(data, 'ptb.test.txt'))
        self.valid = self.to_token(os.path.join(data, 'ptb.valid.txt'))

# Fill the dictionary and return an array with the corresponding key of the words read
    def to_token(self, path):
        if os.path.exists(path):  # check if the file I need to read exists
            with open(path) as txt:
                key = self.len_dict
                sentences = []
                values = list(self.dictionary.values())

                for line in txt:
                    tmp = []
                    # line = line.strip()
                    words = ['<bos>'] + line.split() + ['<eos>']

                    # scroll through the words of a sentence
                    for word in words:
                        # if the world is not in the dictionary I add it.
                        if word not in values:
                            # the length of the dictionary coincides with the index of insertion in it
                            self.dictionary[key] = word
                            tmp.append(key)
                            key += 1
                            values.append(word)
                        else:
                            tmp.append(values.index(word))
                    sentences.append(torch.LongTensor(tmp).to(device))

            print("Sentences loaded")
            self.len_dict = len(self.dictionary)
            return sentences
        else:
            raise ValueError(path +" doesn't exist.")

    def print_dic(self):
        print(self.dictionary)
        print("Number of tokens: " + str(len(self.dictionary)))

GRU

In [6]:
class GRUCell(nn.Module):
    def __init__(self, input_size, hidden_size):
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size

        self.update_gate = nn.Linear(hidden_size * 2, hidden_size)
        self.reset_gate = nn.Linear(hidden_size * 2, hidden_size)
        self.out_gate = nn.Linear(hidden_size, hidden_size)
        self.x = nn.Linear(hidden_size, hidden_size)

    def forward(self, input, prev_state):

        seq_size, _, _ = input.size()
        hidden_seq = []

        for t in range(seq_size):

            x_t = input[t, :, :]
            x_h = torch.cat((x_t, prev_state), dim=1)

            reset = torch.sigmoid(self.reset_gate(x_h))
            update = torch.sigmoid(self.update_gate(x_h))

            n1 = self.out_gate(prev_state) * reset
            n2 = n1 + self.x(x_t)
            out = torch.tanh(n2)

            new_state = (1 - update) * out + update * prev_state
            hidden_seq.append(new_state.unsqueeze(0))

        hidden_seq = torch.cat(hidden_seq, dim=0)

        return hidden_seq, new_state

Model

In [7]:
class RNN(nn.Module):
    def __init__(self, ntoken):
        super(RNN, self).__init__()

        self.ntoken = ntoken
        self.nlayers = nlayers
        self.input_size = embedding_size
        self.hidden_size = hidden_size
        self.drop = nn.Dropout(dropout)

        self.encoder = nn.Embedding(ntoken, self.input_size, padding_idx=0)

        self.rnn = nn.ModuleList()
        # N GRU layers
        for i in range(nlayers):
            self.rnn.append(GRUCell(self.input_size, hidden_size))

        self.fc = nn.Linear(hidden_size, hidden_size)
        self.decoder = nn.Linear(hidden_size, ntoken)
        self.init_weights()

    def init_weights(self):
        initrange = 0.05
        self.encoder.weight.data.uniform_(-initrange, initrange)
        self.decoder.weight.data.uniform_(-initrange, initrange)

    def forward(self, input, hidden):
        output = self.drop(self.encoder(input))

        for i in range(len(self.rnn)):
            output, hidden[i] = self.rnn[i](output, hidden[i])
            output = self.drop(output)

        output = self.fc(output)
        decoded = self.decoder(output)
        decoded = decoded.view(-1,self.ntoken)

        return F.log_softmax(decoded, dim=1), hidden

    def init_hidden(self, batch_size):

        hidden = []
        weight = next(self.parameters())
        for i in range(nlayers):
            hidden.append(weight.new_zeros(batch_size, self.hidden_size))
        return hidden


Main

In [8]:
def get_batch(source, i, batch_size):
    data = []
    target = []
    size = 0
    for sentence in source[batch_size * i: batch_size * (i+1)]:
        data.append(sentence[:-1])
        target.append(sentence[1:])
        size += len(sentence[:-1])

    # Fill the sentences with the pad tag in order to make them the same length
    # The key for the pad tag is 0.
    data = pad_sequence(data, padding_value=0)
    target = pad_sequence(target, padding_value=0)

    return data, target, size

In [9]:
def train(model, train, opt, epoch):
    model.train()
    total_loss = 0
    start_time = time.time()
    hidden = model.init_hidden(batch_size)
    total_size = 0

    for batch_idx in range(0, len(train) // batch_size):
        data, target, size = get_batch(train, batch_idx, batch_size)
        output, hidden = model(data, hidden)
        # Dropout to recurrent element
        # output = nn.Dropout(dropout)(output)

        loss = F.nll_loss(
                output,
                target.view(-1),
                reduction='sum',
                ignore_index=0,
            )

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        opt.step()

        total_loss += loss.item()

        total_size += size
        if batch_idx % interval == 0 and batch_idx > 0:
            cur_loss = total_loss / total_size
            ppl = round(math.exp(cur_loss),2)
            elapsed = time.time() - start_time
            print('epoch: ', epoch, ' | batches: ', batch_idx+1, '/', (len(train) // batch_size), ' | learning_rate: ', learning_rate,
                  '| ms/batch: ',round(elapsed * 1000 / interval,2), ' | loss: ', round(cur_loss,3), ' | perplexity: ', ppl)
            total_loss = 0
            start_time = time.time()
            total_size = 0
        
        for i in range(len(hidden)):
            hidden[i] = hidden[i].detach()

def evaluate(data_source, model):
    model.eval()
    total_loss = 0
    total_size = 0
    with torch.no_grad():
        for i in range(0, len(data_source)//eval_batch_size):
            hidden = model.init_hidden(eval_batch_size)
            data, target, size = get_batch(data_source, i, eval_batch_size)
            output, hidden = model(data, hidden)
            for i in range(len(hidden)):
             hidden[i] = hidden[i].detach()
             
            total_loss += F.nll_loss(
                output,
                target.view(-1),
                reduction='sum',
                ignore_index=0,
            )
            total_size += size

    return total_loss / total_size

In [None]:
def main():

    save = 'model_test.pt'
    torch.manual_seed(1111)
    corpus = Corpus()
    print("len ditc: ", corpus.len_dict)
    model = RNN(corpus.len_dict).to(device)
    print("Dictionary's length: ", corpus.len_dict, " words.")

    train_data = corpus.train
    test_data = corpus.test
    val_data = corpus.valid

    opt = torch.optim.Adam(model.parameters(), lr=learning_rate, betas=(0.9, 0.99))
    # opt = torch.optim.SGD(model.parameters(), lr=learning_rate)
    best_val_loss = None

    try:
        for epoch in range(0, epochs):
            epoch_start_time = time.time()
            train(model, train_data, opt, epoch)
            val_loss = evaluate(val_data, model)
            ppl = round(math.exp(val_loss),2)

            print("-----------------------------------------------------------------------------------------")
            print('end epoch: ', epoch, '| time: ', round((time.time() - epoch_start_time),2), 's | valid loss: ', round(val_loss.item(),3),
                  '| valid ppl: ', ppl)
            print("-----------------------------------------------------------------------------------------\n")

            if not best_val_loss or val_loss < best_val_loss:
                with open(save, 'wb') as f:
                    torch.save(model, f)
                best_val_loss = val_loss

    except KeyboardInterrupt:
        print('Exiting from training early')

    with open(save, 'rb') as f:
        model = torch.load(f)

    test_loss = evaluate(test_data,model)
    print("-----------------------------------------------------------------------------------------")
    print('End of training\ntest loss: ', round(test_loss.item(),3), '\ntest ppl: ', math.exp(test_loss))


main()

Sentences loaded
Sentences loaded
Sentences loaded
len ditc:  10002
Dictionary's length:  10002  words.
epoch:  0  | batches:  51 / 657  | learning_rate:  0.001 | ms/batch:  233.3  | loss:  12.46  | perplexity:  257837.74
epoch:  0  | batches:  101 / 657  | learning_rate:  0.001 | ms/batch:  205.7  | loss:  6.46  | perplexity:  638.85
