## A Recurrent Neural Network for text classification

https://towardsdatascience.com/lstm-text-classification-using-pytorch-2c6c657f8fc0 

https://keras.io/examples/nlp/text_classification_with_transformer/ 

In [1]:
import pandas as pd
import numpy as np
import torch
import torchtext
import torch.nn as nn
import spacy
import matplotlib.pyplot as plt
import collections
from torch.utils.data import Dataset, DataLoader

In [2]:
train_data = pd.read_csv("../Datasets/data_train.csv")
val_data = pd.read_csv("../Datasets/data_val.csv")
test_data = pd.read_csv("../Datasets/data_test.csv")

In [70]:
# Hyperparameters
EMBEDDING_DIM = 100
HIDDEN_DIM = 64
OUTPUT_DIM = 8
BATCH_SIZE = 32
NUM_EPOCHS = 10
LEARNING_RATE = 0.001

In [39]:
# create tokenizer
tokenizer = torchtext.data.utils.get_tokenizer('basic_english')

# tokenize sentences
train_data['tokens'] = train_data['Text'].apply(tokenizer)
val_data['tokens'] = val_data['Text'].apply(tokenizer)

In [79]:
# count the frequency of each word in the training data
word_freq = Counter()
for sentence in train_data['Text']:
    word_freq.update(tokenizer(sentence))

# build vocabulary
vocab = Vocab(word_freq)

# convert tokens to indices
train_data['text_tensor'] = train_data['Text'].apply(lambda x: torch.LongTensor([vocab[token] for token in tokenizer(x)]))
train_data['label_tensor'] = train_data['Emotion'].apply(lambda x: torch.LongTensor([vocab[token] for token in tokenizer(x)]))
val_data['text_tensor'] = val_data['Text'].apply(lambda x: torch.LongTensor([vocab[token] for token in tokenizer(x)]))
val_data['label_tensor'] = val_data['Emotion'].apply(lambda x: torch.LongTensor([vocab[token] for token in tokenizer(x)]))

## Using PyTorch

In [24]:
classes = [classes for classes in train_data['Emotion'].unique()]

In [25]:
tokenizer = torchtext.data.utils.get_tokenizer('basic_english')

In [26]:
counter = collections.Counter()
for line in train_data["Text"].to_list():
    counter.update(tokenizer(line))
vocab = torchtext.vocab.vocab(counter, min_freq=1)

In [27]:
first_sentence = train_data["Text"][0]
second_sentence = train_data["Text"][1]

In [28]:
vocab_size = len(vocab)
print(f"Vocab size if {vocab_size}")

def encode(x):
    return [vocab.get_stoi()[s] for s in tokenizer(x)]

vec = encode(first_sentence)
print(vec)

Vocab size if 15212
[0, 1, 2, 3, 4, 5, 6, 7, 5, 8, 9, 10, 3, 11, 12, 13, 14, 15, 16, 17, 18]


In [29]:
def decode(x):
    return [vocab.get_itos()[i] for i in x]

decode(vec)

['i',
 'can',
 'go',
 'from',
 'feeling',
 'so',
 'hopeless',
 'to',
 'so',
 'damned',
 'hopeful',
 'just',
 'from',
 'being',
 'around',
 'someone',
 'who',
 'cares',
 'and',
 'is',
 'awake']

In [30]:
def padify(b):
    # b is the list of tuples of length batch_size
    #   - first element of a tuple = label, 
    #   - second = feature (text sequence)
    # build vectorized sequence
    v = [encode(x[1]) for x in b]
    # first, compute max length of a sequence in this minibatch
    l = max(map(len,v))
    return ( # tuple of two tensors - labels and features
        torch.LongTensor([t[0]-1 for t in b]),
        torch.stack([torch.nn.functional.pad(torch.tensor(t),(0,l-len(t)),mode='constant',value=0) for t in v])
    )

In [31]:
f_tokens = encode(first_sentence)
s_tokens = encode(second_sentence)

print(f'First Sentence in dataset:\n{first_sentence}')
print("Length:", len(train_data["Text"][0]))
print(f'\nSecond Sentence in dataset:\n{second_sentence}')
print("Length: ", len(train_data["Text"][1]))

First Sentence in dataset:
i can go from feeling so hopeless to so damned hopeful just from being around someone who cares and is awake
Length: 108

Second Sentence in dataset:
im grabbing a minute to post i feel greedy wrong
Length:  48


In [42]:
def to_bow(text,bow_vocab_size=vocab_size):
    res = torch.zeros(bow_vocab_size,dtype=torch.float32)
    for i in encode(text):
        if i<bow_vocab_size:
            res[i] += 1
    return res

print(f"sample text:\n{train_data['Text'][0]}")
print(f"\nBoW vector:\n{to_bow(train_data['Text'][1])}")

sample text:
i can go from feeling so hopeless to so damned hopeful just from being around someone who cares and is awake

BoW vector:
tensor([1., 0., 0.,  ..., 0., 0., 0.])


In [53]:
# this collate function gets list of batch_size tuples, and needs to 
# return a pair of label-feature tensors for the whole minibatch
def bowify(b):
    return (
            torch.LongTensor([t[0]-1 for t in b]),
            torch.stack([to_bow(t[1]) for t in b])
    )

train_loader = DataLoader(train_data, batch_size=16, collate_fn=bowify, shuffle=True)
test_loader = DataLoader(test_data, batch_size=16, collate_fn=bowify, shuffle=True)

In [54]:
net = torch.nn.Sequential(torch.nn.Linear(vocab_size,4),torch.nn.LogSoftmax(dim=1))

In [55]:
def train_epoch(net, dataloader, lr=0.01, optimizer=None, loss_fn = torch.nn.NLLLoss(),epoch_size=None, report_freq=200):
    optimizer = optimizer or torch.optim.Adam(net.parameters(),lr=lr)
    net.train()
    total_loss,acc,count,i = 0,0,0,0
    for labels,features in dataloader:
        optimizer.zero_grad()
        out = net(features)
        loss = loss_fn(out,labels) #cross_entropy(out,labels)
        loss.backward()
        optimizer.step()
        total_loss+=loss
        _,predicted = torch.max(out,1)
        acc+=(predicted==labels).sum()
        count+=len(labels)
        i+=1
        if i%report_freq==0:
            print(f"{count}: acc={acc.item()/count}")
        if epoch_size and count>epoch_size:
            break
    return total_loss.item()/count, acc.item()/count

In [56]:
train_epoch(net,train_loader,epoch_size=1600)

KeyError: 6582

## This Part works now

In [20]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/giacomomunda/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/giacomomunda/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/giacomomunda/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [68]:
class TextClassificationDataset(Dataset):
    def __init__(self, df, vectorizer, label_encoder):
        self.df = df
        self.vectorizer = vectorizer
        self.label_encoder = label_encoder
        self.stop_words = set(stopwords.words('english'))
        self.punctuation = set(string.punctuation)
        self.lemmatizer = WordNetLemmatizer()

    def preprocess_text(self, text):
        tokens = word_tokenize(text)
        tokens = [token.lower() for token in tokens if token not in self.stop_words and token not in self.punctuation]
        tokens = [self.lemmatizer.lemmatize(token) for token in tokens]
        preprocess_text = " ".join(tokens)
        return preprocess_text
        
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        text = self.df.iloc[idx]["Text"]
        label = self.df.iloc[idx]["Emotion"]

        # Preprocess text
        text = self.preprocess_text(text)
        
        # Convert text to BOW vector
        bow = self.vectorizer.transform([text]).toarray()[0]
        
        # Convert label to numerical value
        label = self.label_encoder.transform([label])[0]
        
        return torch.LongTensor([label]), torch.FloatTensor(bow)
    

In [69]:
class NeuralNetwork(torch.nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.fc1 = torch.nn.Linear(input_size, hidden_size)
        self.relu = torch.nn.ReLU()
        self.fc2 = torch.nn.Linear(hidden_size, output_size)
        self.log_softmax = torch.nn.LogSoftmax(dim=1)

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        x = self.log_softmax(x)
        return x

In [70]:
# Load data from pandas dataframe
train_df = pd.read_csv("../Datasets/data_train.csv")
eval_df = pd.read_csv("../Datasets/data_val.csv")
test_df = pd.read_csv("../Datasets/data_test.csv")

# Create sorted list of unique labels
all_labels = np.concatenate([train_df["Emotion"].unique(), eval_df["Emotion"].unique()])
unique_labels = np.unique(all_labels)
sorted_labels = np.sort(unique_labels)

# Initialize vectorizer and label encoder
vectorizer = CountVectorizer()
label_encoder = LabelEncoder()

# Fit vectorizer and label encoder to training data
vectorizer.fit(train_df["Text"])
label_encoder.fit(train_df["Emotion"])

# Create dataset and data loader for training data
train_dataset = TextClassificationDataset(train_df, vectorizer, label_encoder)
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)

# Fit a new label encoder to the evaluation labels only
eval_labels = eval_df["Emotion"]
eval_label_encoder = LabelEncoder()
eval_label_encoder.fit(eval_labels)

# Create dataset and data loader for evaluation data
eval_dataset = TextClassificationDataset(eval_df, vectorizer, eval_label_encoder)
eval_dataloader = DataLoader(eval_dataset, batch_size=16, shuffle=True)

# Fit a new label encoder to the test labels only
test_labels = test_df["Emotion"]
test_label_encoder = LabelEncoder()
test_label_encoder.fit(test_labels)

# Create dataset and data loader for test data
test_dataset = TextClassificationDataset(test_df, vectorizer, test_label_encoder)
test_dataloader = DataLoader(test_dataset, batch_size=16, shuffle=True)


In [72]:
net = NeuralNetwork(len(vectorizer.vocabulary_), 20, 6)

# define loss function and optimizer
loss_fn = torch.nn.NLLLoss()
optimizer = torch.optim.Adam(net.parameters(), lr=0.001)

best_val_accuracy = 0
best_model = None

# Train model
for epoch in range(8):
    total_loss, total_acc, count = 0, 0, 0
    net.train()
    for labels, features in train_dataloader:
        optimizer.zero_grad()
        out = net(features)
        labels = labels.squeeze(1)
        loss = loss_fn(out, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch}: training loss = {total_loss/len(train_dataset)}")

    # Evaluate model on validation set
    net.eval()
    total_correct = 0
    with torch.no_grad():
        for labels, features in eval_dataloader:
            out = net(features)
            labels = labels.squeeze(1)
            _, predicted = torch.max(out, dim=1)
            total_correct += (predicted == labels).sum().item()
        val_accuracy = total_correct / len(eval_dataset)
        
    print(f"Accuracy: {val_accuracy}")

    # Save the best model based on validation accuracy
    if val_accuracy > best_val_accuracy:
        best_val_accuracy = val_accuracy
        best_model = net.state_dict()
        torch.save(best_model, "best_model.pt")

    # Evaluate model on test set
    net.load_state_dict(best_model)
    net.eval()
    total_correct = 0
    with torch.no_grad():
        for labels, features in test_dataloader:
            out = net(features)
            labels = labels.squeeze(1)
            _, predicted = torch.max(out, dim=1)
            total_correct += (predicted == labels).sum().item()
        test_accuracy = total_correct / len(test_dataset)

    print(f"Test accuracy: {test_accuracy}")


Epoch 0: training loss = 0.0688542762661928
Accuracy: 0.8494247123561781
Test accuracy: 0.8569284642321161
Epoch 1: training loss = 0.017362684891148502
Accuracy: 0.8934467233616809
Test accuracy: 0.8899449724862432
Epoch 2: training loss = 0.007776901100818828
Accuracy: 0.896448224112056
Test accuracy: 0.887943971985993
Epoch 3: training loss = 0.004871668008487518
Accuracy: 0.8959479739869936
Test accuracy: 0.8854427213606804
Epoch 4: training loss = 0.0034349635364349807
Accuracy: 0.8984492246123061
Test accuracy: 0.8844422211105553
Epoch 5: training loss = 0.0025763971321855826
Accuracy: 0.8959479739869936
Test accuracy: 0.8874437218609305
Epoch 6: training loss = 0.001983615933085245
Accuracy: 0.886943471735868
Test accuracy: 0.8804402201100551
Epoch 7: training loss = 0.0016071952151689394
Accuracy: 0.8929464732366184
Test accuracy: 0.8814407203601801


**Results:**

- *Fully Connected with CountVectorizer:*

    8 Epochs --> Accuracy: 0.8799 | Loss: 0.0017039 (best loss)

    Best accuracy score: 0.8909 (Epoch 3)

    After preprocessing --> best validation accuracy score:

                            Epoch 5: 0.8984


- *Fully Connected with TfidfVectorizer:*

    8 epochs --> Accuracy: 0.8799 | Loss: 0.0026595

    Best accuracy score: 0.8849

    After preprocessing --> best accuracy score: 0.8879 (Epoch 6)
                            
                            Epoch 8: 0.8854

**Doing inference with the trained model**

In [73]:
loaded_state_dict = torch.load("best_model.pt")
net.load_state_dict(loaded_state_dict)

# prepare input data for inference
new_data = ["Today I went to the supermarket and bought some fruits, they were delicious!", 
            "I hate this movie", 
            "I was very pleased to see my daughter today",
            "Why are you so angry?",
            "Studying computational linguistics can be hard, but very satisfying!",
            "What are you doing for Christmas? I hope you have a great time"]

# convert input data to BOW vectors
new_data_bow = vectorizer.transform(new_data).toarray()

# run model on input data
net.eval()
with torch.no_grad():
    output_tensor = net(torch.FloatTensor(new_data_bow))
    _, predicted = torch.max(output_tensor, dim=1)
    predicted_labels = label_encoder.inverse_transform(predicted)
    for i, sentence in enumerate(new_data):
        print(f"'{sentence}' | Predicted emotion: {predicted_labels[i]}\n")

'Today I went to the supermarket and bought some fruits, they were delicious!' | Predicted emotion: joy

'I hate this movie' | Predicted emotion: joy

'I was very pleased to see my daughter today' | Predicted emotion: joy

'Why are you so angry?' | Predicted emotion: anger

'Studying computational linguistics can be hard, but very satisfying!' | Predicted emotion: sadness

'What are you doing for Christmas? I hope you have a great time' | Predicted emotion: sadness

