<a href="https://colab.research.google.com/github/essat20/NLP_CW_210021102/blob/main/NLP_RNN_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# downloading datasets
!pip install datasets # install the datasets from huggingface

Collecting datasets
  Downloading datasets-2.19.0-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub>=0.21.2 (from datasets)
  Downloading huggingface_hub-0.23.0-py3-none-any

In [22]:
# import the libraries

import torch
import torch.nn as nn
from transformers import DistilBertModel, DistilBertTokenizer, AdamW
from datasets import load_dataset
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence
from torch.optim import AdamW as aw

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions import Categorical
import numpy as np

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer

import matplotlib.pyplot as plt # this can be used to display loss/epoch graphs
from sklearn.model_selection import learning_curve

from sklearn.metrics import confusion_matrix # so i can display the confusion matrix

from torch.nn.utils.rnn import pad_sequence  # have to add padding library to ensure all sequences have the same length

import seaborn as sb

from torch.distributions import Categorical


In [23]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Fetch the 'emotion' dataset; trust_remote_code=True allows the dataset's own loading script to run
dataset = load_dataset("emotion", trust_remote_code=True)

# Raw text column of the training split
texts = dataset["train"]["text"]


In [24]:
class RNN(nn.Module):
    """Embedding -> stacked LSTM -> linear decoder.

    Args:
        input_size: Number of distinct input indices; also used as the embedding width.
        output_size: Number of scores produced for every time step.
        hidden_size: Width of the LSTM hidden state.
        num_layers: Number of stacked LSTM layers.
    """

    def __init__(self, input_size, output_size, hidden_size, num_layers):
        super().__init__()
        # Each index is embedded into a vector as wide as the vocabulary itself,
        # which is why the LSTM input width is input_size as well.
        self.embedding = nn.Embedding(num_embeddings=input_size, embedding_dim=input_size)
        self.rnn = nn.LSTM(input_size=input_size, hidden_size=hidden_size, num_layers=num_layers)
        self.decoder = nn.Linear(in_features=hidden_size, out_features=output_size)

    def forward(self, input_seq, hidden_state):
        """Run one chunk of indices through the network.

        Args:
            input_seq: Integer index tensor fed to the embedding layer.
            hidden_state: ``(h, c)`` pair returned by a previous call, or ``None``
                to start from the LSTM's default initial state.

        Returns:
            A ``(scores, hidden_state)`` pair: the decoder output for every LSTM
            output step, and the new ``(h, c)`` pair detached from the autograd graph.
        """
        lstm_out, next_state = self.rnn(self.embedding(input_seq), hidden_state)
        scores = self.decoder(lstm_out)
        # Detach both state tensors so a later backward pass stops at this call
        carried_state = tuple(state.detach() for state in next_state)
        return scores, carried_state


In [28]:
def _sample_text(rnn, seed_seq, ix_to_char, num_chars):
    """Generate ``num_chars`` characters from ``rnn``, starting from ``seed_seq``.

    Args:
        rnn: Model called as ``rnn(input_seq, hidden_state)`` and returning
            ``(scores, hidden_state)``.
        seed_seq: Index tensor of shape ``(1, 1)`` holding the first input character.
        ix_to_char: Mapping from character index to character.
        num_chars: Exact number of characters to generate.

    Returns:
        The generated characters joined into one string.
    """
    # Clone first: the seed is a slice (view) of the training tensor, and the
    # in-place write below must not overwrite the training data.
    input_seq = seed_seq.clone()
    hidden_state = None
    sampled = []

    # No gradients are needed while generating text
    with torch.no_grad():
        for _ in range(num_chars):
            output, hidden_state = rnn(input_seq, hidden_state)

            # Turn the scores of the single time step into a distribution and sample from it
            probs = F.softmax(output.reshape(-1), dim=0)
            index = Categorical(probs).sample()
            sampled.append(ix_to_char[index.item()])

            # The sampled character becomes the next input
            input_seq[0][0] = index.item()

    return ''.join(sampled)


def train(hidden_size=512, seq_len=100, num_layers=3, lr=0.002, epochs=3,
          op_seq_len=200, save_path="charRNN_emotion.pth"):
    """Train a character-level LSTM on the 'emotion' training texts.

    After every epoch the average loss is printed, the weights are saved to
    ``save_path`` and a sampled text sequence is printed. Relies on the
    module-level ``device`` and ``RNN`` defined in earlier cells.

    Args:
        hidden_size: Size of the LSTM hidden state.
        seq_len: Number of characters per training chunk.
        num_layers: Number of layers in the LSTM stack.
        lr: Learning rate for the Adagrad optimizer.
        epochs: Number of passes over the data.
        op_seq_len: Number of characters in the text sampled after each epoch.
        save_path: File the model ``state_dict`` is written to after each epoch.

    Raises:
        ValueError: If ``seq_len`` is smaller than 1 or the corpus is too short
            to form a single (input, target) chunk.
    """
    # Load the training split; trust_remote_code=True matches the module-level load,
    # which notes that the dataset needs its custom loading code.
    dataset = load_dataset('emotion', split='train', trust_remote_code=True)

    # Combine all texts into a single string
    corpus = ' '.join(dataset['text'])

    chars = sorted(set(corpus))
    data_size, vocab_size = len(corpus), len(chars)
    if seq_len < 1:
        raise ValueError("seq_len must be at least 1, got {}".format(seq_len))
    if data_size <= seq_len:
        raise ValueError(
            "Data has {} characters but seq_len + 1 = {} are needed for one training chunk".format(
                data_size, seq_len + 1))
    print("----------------------------------------")
    print("Data has {} characters, {} unique".format(data_size, vocab_size))
    print("----------------------------------------")

    # char to index and index to char maps
    char_to_ix = {ch: i for i, ch in enumerate(chars)}
    ix_to_char = dict(enumerate(chars))

    # Encode the corpus as a (data_size, 1) index tensor on the target device
    data = torch.tensor([char_to_ix[ch] for ch in corpus], dtype=torch.long, device=device).unsqueeze(1)

    # model instance
    rnn = RNN(vocab_size, vocab_size, hidden_size, num_layers).to(device)

    # loss function and optimizer
    loss_fn = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adagrad(rnn.parameters(), lr=lr)

    # Start offsets are drawn from the first 10000 characters, clamped so that the
    # first chunk and its shifted target always fit inside the data.
    max_start = min(10000, data_size - seq_len)

    # training loop
    for i_epoch in range(1, epochs + 1):
        data_ptr = np.random.randint(max_start)
        n = 0
        running_loss = 0.0
        hidden_state = None

        while data_ptr + seq_len + 1 <= data_size:
            input_seq = data[data_ptr: data_ptr + seq_len]
            # Target is the input shifted by one character
            target_seq = data[data_ptr + 1: data_ptr + seq_len + 1]

            # forward pass
            output, hidden_state = rnn(input_seq, hidden_state)

            # Flatten explicitly (instead of squeeze) so seq_len == 1 keeps a valid shape
            loss = loss_fn(output.reshape(-1, vocab_size), target_seq.reshape(-1))
            running_loss += loss.item()

            # compute gradients and take optimizer step
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # update the data pointer
            data_ptr += seq_len
            n += 1

        # print loss and save weights after every epoch
        print("Epoch: {0} \t Loss: {1:.8f}".format(i_epoch, running_loss / n))
        torch.save(rnn.state_dict(), save_path)

        # sample / generate a text sequence after every epoch,
        # starting from a random character of the data
        rand_index = np.random.randint(data_size - 1)
        seed_seq = data[rand_index: rand_index + 1]

        print("----------------------------------------")
        print(_sample_text(rnn, seed_seq, ix_to_char, op_seq_len), end='')
        print("\n----------------------------------------")

In [29]:
# Run training: prints the loss and a sampled text sequence after every epoch
train() # call the training loop

----------------------------------------
Data has 1565532 characters, 27 unique
----------------------------------------
Epoch: 1 	 Loss: 1.71560139
----------------------------------------
ont dreally stoll someble bettamed i feil hot shated praally feeling wating exptirate or happelle ening best pith as i mave lre while i feel oucless with by thile wrepted her were theme sat i feel vinm
----------------------------------------
Epoch: 2 	 Loss: 1.48649242
----------------------------------------
to qrodush pilled at they its clacely discalrisged i blil that onerran is belazenent make him hers afe that moy brace i still feel bitteny i unching even for the capilet beantared whethere must people 
----------------------------------------
Epoch: 3 	 Loss: 1.40871943
----------------------------------------
lblly feel apuse up there it of the foratm mirs at theneings inmodidastion about keeps i was sweeache is a caons feating and been feeling rany mess i feel like if its fine pustant and i 