# CSCI 4435/5435: Text Mining & Natural Language Processing
## Assignment 4: Recurrent Neural Networks
### Student: Miguel Guirao
### Aggie ID: 800699208

## Summary
- Import al requiered libraries
- Load the dataset and perform data pre-processing

In [1]:
# import all required libraries
import pandas as pd
import numpy as np
import time
import matplotlib.pyplot as plt
from nltk.tokenize import SpaceTokenizer
import torch
import gensim.downloader as api
import gensim

from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.utils.tensorboard import SummaryWriter

In [2]:
# Let's setup TensorBoard
writer = SummaryWriter()

# Let's verify some libraries versions
# Let's check the versions of our main libraries
pyversion = !python --version
print(f"Python version: {pyversion[0]}\nPyTorch version: {torch.__version__}\nGensim version: {gensim.__version__}")

# USING GPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device_name = torch.cuda.get_device_name()  
print(f"Total GPU devices available for use: {torch.cuda.device_count()}")
print("CUDA device name:", device_name) 
print(f"Using compute device: {device}.")

Python version: Python 3.12.3
PyTorch version: 2.6.0+cu124
Gensim version: 4.3.3
Total GPU devices available for use: 1
CUDA device name: NVIDIA GeForce RTX 3050 Ti Laptop GPU
Using compute device: cuda:0.


## Load the dataset and perform pre-processing

In [3]:
# https://pandas.pydata.org/docs/reference/api/pandas.read_json.html
dataset = pd.read_json("dataset/News_Category_Dataset_v2.json", lines=True) # Read the file as a json object per line.
dataset.head(15)

Unnamed: 0,category,headline,authors,link,short_description,date
0,CRIME,There Were 2 Mass Shootings In Texas Last Week...,Melissa Jeltsen,https://www.huffingtonpost.com/entry/texas-ama...,She left her husband. He killed their children...,2018-05-26
1,ENTERTAINMENT,Will Smith Joins Diplo And Nicky Jam For The 2...,Andy McDonald,https://www.huffingtonpost.com/entry/will-smit...,Of course it has a song.,2018-05-26
2,ENTERTAINMENT,Hugh Grant Marries For The First Time At Age 57,Ron Dicker,https://www.huffingtonpost.com/entry/hugh-gran...,The actor and his longtime girlfriend Anna Ebe...,2018-05-26
3,ENTERTAINMENT,Jim Carrey Blasts 'Castrato' Adam Schiff And D...,Ron Dicker,https://www.huffingtonpost.com/entry/jim-carre...,The actor gives Dems an ass-kicking for not fi...,2018-05-26
4,ENTERTAINMENT,Julianna Margulies Uses Donald Trump Poop Bags...,Ron Dicker,https://www.huffingtonpost.com/entry/julianna-...,"The ""Dietland"" actress said using the bags is ...",2018-05-26
5,ENTERTAINMENT,Morgan Freeman 'Devastated' That Sexual Harass...,Ron Dicker,https://www.huffingtonpost.com/entry/morgan-fr...,"""It is not right to equate horrific incidents ...",2018-05-26
6,ENTERTAINMENT,Donald Trump Is Lovin' New McDonald's Jingle I...,Ron Dicker,https://www.huffingtonpost.com/entry/donald-tr...,"It's catchy, all right.",2018-05-26
7,ENTERTAINMENT,What To Watch On Amazon Prime That’s New This ...,Todd Van Luling,https://www.huffingtonpost.com/entry/amazon-pr...,There's a great mini-series joining this week.,2018-05-26
8,ENTERTAINMENT,Mike Myers Reveals He'd 'Like To' Do A Fourth ...,Andy McDonald,https://www.huffingtonpost.com/entry/mike-myer...,"Myer's kids may be pushing for a new ""Powers"" ...",2018-05-26
9,ENTERTAINMENT,What To Watch On Hulu That’s New This Week,Todd Van Luling,https://www.huffingtonpost.com/entry/hulu-what...,You're getting a recent Academy Award-winning ...,2018-05-26


In [4]:
# Drop unneeded features
dataset.drop(labels=['category', 'headline', 'authors', 'link', 'date'], axis=1, inplace=True)
dataset.head()

Unnamed: 0,short_description
0,She left her husband. He killed their children...
1,Of course it has a song.
2,The actor and his longtime girlfriend Anna Ebe...
3,The actor gives Dems an ass-kicking for not fi...
4,"The ""Dietland"" actress said using the bags is ..."


In [5]:
dataset.loc[:, 'short_description'] = dataset['short_description'].str.lower()
dataset.loc[:, 'short_description'] = dataset['short_description'].str.replace("-", " ", regex=True)
dataset.loc[:, 'short_description'] = dataset['short_description'].str.replace(r"[^'\&\w\s]", "", regex=True)
dataset.loc[:, 'short_description'] = dataset['short_description'].str.strip()
dataset.head()

Unnamed: 0,short_description
0,she left her husband he killed their children ...
1,of course it has a song
2,the actor and his longtime girlfriend anna ebe...
3,the actor gives dems an ass kicking for not fi...
4,the dietland actress said using the bags is a ...


In [6]:
train_data = dataset["short_description"]
train_data

0         she left her husband he killed their children ...
1                                   of course it has a song
2         the actor and his longtime girlfriend anna ebe...
3         the actor gives dems an ass kicking for not fi...
4         the dietland actress said using the bags is a ...
                                ...                        
200848    verizon wireless and at&t are already promotin...
200849    afterward azarenka more effusive with the pres...
200850    leading up to super bowl xlvi the most talked ...
200851    correction an earlier version of this story in...
200852    the five time all star center tore into his te...
Name: short_description, Length: 200853, dtype: object

In [7]:
train_data = [" ".join(["<start>", x, "<end>"]) for x in train_data]
# print first row
train_data[:5]

['<start> she left her husband he killed their children just another day in america <end>',
 '<start> of course it has a song <end>',
 '<start> the actor and his longtime girlfriend anna eberstein tied the knot in a civil ceremony <end>',
 '<start> the actor gives dems an ass kicking for not fighting hard enough against donald trump <end>',
 '<start> the dietland actress said using the bags is a really cathartic therapeutic moment <end>']

## Tokenization

In [8]:
###
# define Vocab
###
class Vocab:
    def __init__(self, list_of_sentence, tokenization, special_token, max_tokens=None):
        # count vocab frequency
        vocab_freq = {}
        tokens = tokenization(list_of_sentence)
        for t in tokens:
            for vocab in t:
                if vocab not in vocab_freq:
                    vocab_freq[vocab] = 0 
                vocab_freq[vocab] += 1
        # sort by frequency
        vocab_freq = {k: v for k, v in sorted(vocab_freq.items(), key=lambda i: i[1], reverse=True)}
        # create vocab list
        self.vocabs = [special_token] + list(vocab_freq.keys())
        if max_tokens:
            self.vocabs = self.vocabs[:max_tokens]
        self.stoi = {v: i for i, v in enumerate(self.vocabs)}

    def _get_tokens(self, list_of_sentence):
        for sentence in list_of_sentence:
            tokens = tokenizer.tokenize(sentence)
            yield tokens

    def get_itos(self):
        return self.vocabs

    def get_stoi(self):
        return self.stoi

    def append_token(self, token):
        self.vocabs.append(token)
        self.stoi = {v: i for i, v in enumerate(self.vocabs)}

    def __call__(self, list_of_tokens):
        def get_token_index(token):
            if token in self.stoi:
                return self.stoi[token]
            else:
                return 0
        return [get_token_index(t) for t in list_of_tokens]

    def __len__(self):
        return len(self.vocabs)

In [9]:
###
# generate Vocab
###
max_word = 50000

# create tokenizer
tokenizer = SpaceTokenizer()

# define tokenization function
def yield_tokens(data):
    for text in data:
        tokens = tokenizer.tokenize(text)
        yield tokens

# build vocabulary list, of size max_word
vocab = Vocab(train_data, tokenization=yield_tokens, special_token="<unk>", max_tokens=max_word)

In [10]:
pad_index = vocab.__len__()
vocab.append_token("<pad>")
print(f"pad_index: {pad_index}, vocab size: {len(vocab.vocabs)}")

pad_index: 50000, vocab size: 50001


In [11]:
itos = vocab.get_itos()
stoi = vocab.get_stoi()
# test
print("The number of token index is {}.".format(vocab.__len__()))
print("The padded index is {}.".format(stoi["<pad>"]))

The number of token index is 50001.
The padded index is 50000.


In [12]:
max_seq_len = 256

def collate_batch(batch):
    label_list, feature_list = [], []
    for text in batch:
        # tokenize to a list of word's indices
        tokens = vocab(tokenizer.tokenize(text))
        # separate into features and labels
        y = tokens[1:]
        y.append(-100)
        x = tokens
        # limit length to max_seq_len
        y = y[:max_seq_len]
        x = x[:max_seq_len]
        # pad features and labels
        y += [-100] * (max_seq_len - len(y))
        x += [pad_index] * (max_seq_len - len(x))
        # add to list
        label_list.append(y)
        feature_list.append(x)
    # convert to tensor
    label_list = torch.tensor(label_list, dtype=torch.int64).to(device)
    feature_list = torch.tensor(feature_list, dtype=torch.int64).to(device)
    return label_list, feature_list

dataloader = DataLoader(train_data, batch_size=32, shuffle=True, collate_fn=collate_batch)

In [13]:
# test
for labels, features in dataloader:
    break

print("label shape in batch : {}".format(labels.size()))
print("feature shape in batch : {}\n".format(features.size()))
print("***** label sample *****")
print(labels[0], "\n")
print("***** features sample *****")
print(features[0])

label shape in batch : torch.Size([32, 256])
feature shape in batch : torch.Size([32, 256])

***** label sample *****
tensor([  39,   12,   14,   39,   12,   26,  446,  859,    2, -100, -100, -100,
        -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
        -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
        -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
        -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
        -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
        -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
        -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
        -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
        -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
        -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
  

## Embeddings: Option A with a pre-trained Word2Vec

In [1]:
import gensim.downloader as api
import gensim.models

In [2]:
word_2_vec = api.load('word2vec-google-news-300')

# Task 1: Language Modeling

## RNN

In [14]:
embedding_dim = 64
rnn_units = 512

class SimpleRnnModel(nn.Module):
    def __init__(self, vocab_size, seq_len, embedding_dim, rnn_units, padding_idx):
        super().__init__()

        self.seq_len = seq_len
        self.padding_idx = padding_idx

        self.embedding = nn.Embedding(
            vocab_size,
            embedding_dim,
            padding_idx=padding_idx,
        )
        self.rnn = nn.RNN(
            input_size=embedding_dim,
            hidden_size=rnn_units,
            num_layers=1,
            batch_first=True,
        )
        self.classify = nn.Linear(rnn_units, vocab_size)

    def forward(self, inputs, states=None, return_final_state=False):
        # embedding
        #   --> (batch_size, seq_len, embedding_dim)
        outs = self.embedding(inputs)
        # build "lengths" property to pack inputs (see above)
        lengths = (inputs != self.padding_idx).int().sum(dim=1, keepdim=False)
        # pack inputs for RNN
        packed_inputs = torch.nn.utils.rnn.pack_padded_sequence(
            outs,
            lengths.cpu(),
            batch_first=True,
            enforce_sorted=False,
        )
        # apply RNN
        if states is None:
            packed_outs, final_state = self.rnn(packed_inputs)
        else:
            packed_outs, final_state = self.rnn(packed_inputs, states)
        # unpack results
        #   --> (batch_size, seq_len, rnn_units)
        outs, _ = torch.nn.utils.rnn.pad_packed_sequence(
            packed_outs,
            batch_first=True,
            padding_value=0.0,
            total_length=self.seq_len,
        )
        # apply feed-forward to classify
        #   --> (batch_size, seq_len, vocab_size)
        logits = self.classify(outs)
        # return results
        if return_final_state:
            return logits, final_state  # This is used in prediction
        else:
            return logits               # This is used in training

In [15]:
modelRNN = SimpleRnnModel(vocab_size=vocab.__len__(), seq_len=max_seq_len, embedding_dim=embedding_dim, rnn_units=rnn_units, padding_idx=pad_index).to(device)

In [16]:
num_epochs = 50
torch.cuda.empty_cache()

optimizer = torch.optim.AdamW(modelRNN.parameters(), lr=0.001)

for epoch in range(num_epochs):
    for labels, seqs in dataloader:
        # optimize
        optimizer.zero_grad()
        logits = modelRNN(seqs)
        loss = F.cross_entropy(logits.transpose(1,2), labels)
        loss.backward()
        optimizer.step()
        # calculate accuracy
        pred_labels = logits.argmax(dim=2)
        num_correct = (pred_labels == labels).float().sum()
        num_total = (labels != -100).float().sum()
        accuracy = num_correct / num_total
        print("Epoch {} - loss: {:2.4f} - accuracy: {:2.4f}".format(epoch+1, loss.item(), accuracy), end="\r")
    writer.add_scalar('Loss/train_rnn', loss.item(), epoch+1)
    writer.add_scalar('Accuracy/train_rnn', accuracy, epoch+1)
    torch.cuda.empty_cache()
    print("")

torch.cuda.empty_cache()
writer.flush()

OutOfMemoryError: CUDA out of memory. Tried to allocate 1.53 GiB. GPU 0 has a total capacity of 3.69 GiB of which 375.12 MiB is free. Including non-PyTorch memory, this process has 3.30 GiB memory in use. Of the allocated memory 3.19 GiB is allocated by PyTorch, and 13.74 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
writer.close()

In [None]:
torch.cuda.empty_cache()