# 3. Redes Neurais Recorrentes

Modelo de linguagem baseado em uma rede neural recorrente de Elman.

Após o treinamento do modelo em um corpus, ele é utilizado para prever a próxima palavra de um texto de entrada.

In [41]:
import torch
import torch.nn as nn
import pandas as pd
from torch.utils.data import Dataset, DataLoader
import numpy as np
import string
import re

Criação do conjunto de dados

In [42]:
class CustomDataset(Dataset):
    """Next-word-prediction dataset built from the ``text`` column of a TSV file.

    Every row of the file is one training example. A text is lower-cased,
    stripped of ASCII punctuation and split on whitespace; the resulting
    tokens are the inputs, and the labels are the same tokens shifted one
    position to the left, with ``'<END>'`` as the label of the last token.

    Attributes:
        data: DataFrame returned by ``pd.read_csv(csv_file, sep='\\t')``.
        word_to_idx: dict mapping each vocabulary word to an integer id.
        idx_to_word: inverse mapping of ``word_to_idx``.
    """

    # ``re.escape`` is required here: without it the ``\]`` sequence inside
    # ``string.punctuation`` is read as an escaped bracket, so backslashes
    # would never be removed from the text.
    _PUNCTUATION_RE = re.compile('[' + re.escape(string.punctuation) + ']')

    # Label assigned to the last token of every text.
    END_TOKEN = '<END>'

    def __init__(self, csv_file):
        """Read the tab-separated file ``csv_file`` and build the vocabulary.

        Args:
            csv_file: path or URL accepted by ``pd.read_csv``; the file must
                contain a ``text`` column.
        """
        self.data = pd.read_csv(csv_file, sep='\t')
        self.word_to_idx = {}
        self.idx_to_word = {}
        self.build_vocab()

    @classmethod
    def _tokenize(cls, text):
        """Lower-case ``text``, drop ASCII punctuation and split on whitespace."""
        return cls._PUNCTUATION_RE.sub('', text.lower()).split()

    def build_vocab(self):
        """Populate ``word_to_idx`` / ``idx_to_word`` from every text in ``data``.

        Words are numbered in sorted order so that the ids are identical on
        every run (iterating a ``set`` of strings is not reproducible across
        interpreter sessions). ``'<END>'`` receives the last id.
        """
        unique_words = set()
        for text in self.data['text']:
            unique_words.update(self._tokenize(text))

        self.word_to_idx = {word: idx for idx, word in enumerate(sorted(unique_words))}
        self.idx_to_word = {idx: word for word, idx in self.word_to_idx.items()}

        end_idx = len(self.word_to_idx)
        self.word_to_idx[self.END_TOKEN] = end_idx
        self.idx_to_word[end_idx] = self.END_TOKEN

    def __len__(self):
        """Return the number of rows (texts) in the file."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the token ids and next-token ids of the text at row ``idx``.

        Returns:
            dict with two 1-D ``torch.long`` tensors of equal length:
            ``'input_ids'`` (the tokens) and ``'labels'`` (the following
            token for each position, ``'<END>'`` for the last one).
        """
        tokens = self._tokenize(self.data.iloc[idx]['text'])
        # Shift by one position; a text with no tokens has no labels either.
        labels = tokens[1:] + [self.END_TOKEN] if tokens else []

        input_ids = [self.word_to_idx[token] for token in tokens]
        label_ids = [self.word_to_idx[label] for label in labels]
        # Explicit dtype: an empty list would otherwise produce a float
        # tensor, which nn.Embedding rejects.
        return {
            'input_ids': torch.tensor(input_ids, dtype=torch.long),
            'labels': torch.tensor(label_ids, dtype=torch.long),
        }

Criação da classe relacionada ao modelo RNN

In [43]:
class RNNModel(nn.Module):
    """Elman-RNN language model: embedding -> ``nn.RNN`` -> linear projection.

    Args:
        vocab_size: number of distinct token ids; sizes both the embedding
            table and the output layer.
        embedding_dim: size of each token embedding.
        hidden_dim: size of the recurrent hidden state.
        num_layers: number of stacked recurrent layers.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers):
        super(RNNModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # batch_first=True: inputs and outputs are laid out (batch, seq, feature).
        self.rnn = nn.RNN(embedding_dim, hidden_dim, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)
        self.num_layers = num_layers
        self.hidden_dim = hidden_dim

    def forward(self, x, hidden=None):
        """Compute next-token scores for every position of ``x``.

        Args:
            x: integer tensor of token ids, laid out (batch, seq).
            hidden: optional initial hidden state, e.g. the value returned by
                ``init_hidden`` or by a previous call. When ``None`` (the
                default) ``nn.RNN`` starts from an all-zero state.

        Returns:
            ``(output, hidden)``: ``output`` holds the unnormalised scores
            over the vocabulary for each position, ``hidden`` is the final
            hidden state returned by ``nn.RNN``.
        """
        embedded = self.embedding(x)
        output, hidden = self.rnn(embedded, hidden)
        return self.fc(output), hidden

    def init_hidden(self, batch_size):
        """Return an all-zero initial hidden state for ``batch_size`` sequences.

        The tensor is created with the device and dtype of the model's
        parameters so it can be passed straight to ``forward`` even after
        the model has been moved with ``.to(...)`` or cast with ``.double()``.
        """
        reference = self.fc.weight
        return torch.zeros(
            self.num_layers,
            batch_size,
            self.hidden_dim,
            device=reference.device,
            dtype=reference.dtype,
        )

Hiperparâmetros

In [44]:
# Architecture: embedding size, hidden-state size and number of stacked RNN layers.
embedding_dim, hidden_dim, num_layers = 100, 256, 2

# Optimisation: Adam step size and number of passes over the corpus.
learning_rate, num_epochs = 1e-4, 50

Carrega o conjunto de dados

In [45]:
# Tab-separated corpus with a `text` column, fetched directly from GitHub.
corpus_url = 'https://raw.githubusercontent.com/giacicunb/enap_pln2024/main/corpora/simple_corpus.csv'
dataset = CustomDataset(csv_file=corpus_url)

# One text per batch: the examples have different lengths and no collate
# function is supplied, so presumably they cannot be stacked into larger batches.
dataloader = DataLoader(dataset=dataset, batch_size=1)

Verificando se está tudo correto com nosso dataset

Obtém-se o tamanho do vocabulário

In [46]:
# Number of entries in the vocabulary mapping; this count already includes
# the '<END>' token that build_vocab appends.
vocab_size = len(dataset.word_to_idx)
print(f'O vocaculario possui {vocab_size} palavras')

O vocaculario possui 362 palavras


Instanciando o modelo baseado em RNN

In [47]:
# vocab_size is len(dataset.word_to_idx), which already counts the '<END>'
# token, so token ids span 0 .. vocab_size-1. Passing vocab_size+1 would add
# an output class with no entry in idx_to_word, and predicting it would raise
# a KeyError when mapping the id back to a word.
model = RNNModel(vocab_size, embedding_dim, hidden_dim, num_layers)

 Definindo a função de perda, em que a função softmax está implementada internamente:

In [48]:
# Cross-entropy over the vocabulary; it takes the model's raw scores
# (softmax is applied internally) and the target token ids.
loss_function = nn.CrossEntropyLoss()

Definindo-se o otimizador Adam

In [49]:
# Adam over all model parameters, using the step size set in the hyperparameter cell.
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

Treinamento da RNN



1.   O treinamento é feito manualmente, pegando-se batch por batch
2.   Primeiramente, pegam-se os textos do batch em análise, que são preparados para entrada no modelo
3.   Inicializa-se $h^{(0)}$ (com zeros, o comportamento padrão de `nn.RNN` quando nenhum estado oculto inicial é fornecido)
4.   Depois, calcula $h^{(t)}$
5.   Após processar todas as palavras do texto de entrada, calcula-se a função loss
6.   Atualiza os pesos da rede fazendo-se o backpropagation

In [50]:
# Training loop: one optimiser step per batch, mean batch loss reported per epoch.
for epoch in range(num_epochs):
    total_loss = 0

    for batch in dataloader:
        inputs = batch['input_ids']
        targets = batch['labels']

        # Gradients accumulate by default, so clear them before each step.
        optimizer.zero_grad()

        logits, _ = model(inputs)

        # CrossEntropyLoss takes 2-D scores and 1-D targets: collapse every
        # leading dimension of the scores into one, keeping the class dimension.
        num_classes = logits.shape[-1]
        loss = loss_function(logits.view(-1, num_classes), targets.view(-1))

        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f'Epoch {epoch+1}, Loss: {total_loss/len(dataloader)}')

Epoch 1, Loss: 5.9012486934661865
Epoch 2, Loss: 5.8495674928029375
Epoch 3, Loss: 5.806120236714681
Epoch 4, Loss: 5.763212601343791
Epoch 5, Loss: 5.719970305760701
Epoch 6, Loss: 5.675849358240764
Epoch 7, Loss: 5.630305528640747
Epoch 8, Loss: 5.582701921463013
Epoch 9, Loss: 5.532245635986328
Epoch 10, Loss: 5.477983395258586
Epoch 11, Loss: 5.418991009394328
Epoch 12, Loss: 5.355111996332805
Epoch 13, Loss: 5.288232723871867
Epoch 14, Loss: 5.222163041432698
Epoch 15, Loss: 5.159573952356975
Epoch 16, Loss: 5.1003719965616865
Epoch 17, Loss: 5.043552398681641
Epoch 18, Loss: 4.9885936578114825
Epoch 19, Loss: 4.9352452754974365
Epoch 20, Loss: 4.883035659790039
Epoch 21, Loss: 4.831312259038289
Epoch 22, Loss: 4.779558340708415
Epoch 23, Loss: 4.727481047312419
Epoch 24, Loss: 4.67491348584493
Epoch 25, Loss: 4.621768395105998
Epoch 26, Loss: 4.568012078603108
Epoch 27, Loss: 4.513624946276347
Epoch 28, Loss: 4.458606680234273
Epoch 29, Loss: 4.402997255325317
Epoch 30, Loss: 4.3

Predição

In [51]:
def predict_next_word(model, input_text, vocab=None):
    """Return the word the model scores highest as the continuation of ``input_text``.

    Args:
        model: callable that maps a (1, seq) tensor of token ids to a tuple
            ``(output, hidden)``, where ``output[0, -1]`` holds the scores
            for the next word.
        input_text: whitespace-separated words. A word that is not in the
            vocabulary as written is lower-cased and stripped of ASCII
            punctuation (the same normalisation applied to the training
            texts) before being looked up again.
        vocab: object exposing ``word_to_idx`` and ``idx_to_word`` mappings.
            Defaults to the notebook-level ``dataset``.

    Returns:
        The vocabulary word with the highest score at the last position.

    Raises:
        ValueError: if ``input_text`` contains no usable word.
        KeyError: if a word is not in the vocabulary, even after normalisation.
    """
    if vocab is None:
        vocab = dataset  # notebook-level CustomDataset instance

    punctuation_re = re.compile('[' + re.escape(string.punctuation) + ']')

    token_ids = []
    for raw_word in input_text.split():
        # Exact match first, so inputs that already worked (including the
        # special '<END>' token) keep resolving to the same id.
        word = raw_word
        if word not in vocab.word_to_idx:
            word = punctuation_re.sub('', raw_word.lower())
        if not word:
            continue  # the word consisted only of punctuation
        if word not in vocab.word_to_idx:
            raise KeyError(f"word {raw_word!r} is not in the vocabulary")
        token_ids.append(vocab.word_to_idx[word])

    if not token_ids:
        raise ValueError("input_text must contain at least one word")

    input_data = torch.tensor([token_ids], dtype=torch.long)

    # Inference only: do not build an autograd graph.
    with torch.no_grad():
        output, _ = model(input_data)

    predicted_idx = int(output[0, -1].argmax())
    return vocab.idx_to_word[predicted_idx]

Fazendo o modelo de linguagem funcionar

In [52]:
# Ask the trained model for the most likely word to follow a one-word prompt.
input_text = "termo"
predicted_word = predict_next_word(model, input_text)
print(f"A proxima palavra apos {input_text} eh {predicted_word}")

A proxima palavra apos termo eh europa
