<a href="https://colab.research.google.com/github/gupta24789/multiclass-classification/blob/main/multiclass_rnn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -q pytorch-lightning

In [2]:
import pandas as pd
import numpy as np
import re
import string
import nltk
import itertools
from nltk.stem import PorterStemmer
from nltk.tokenize import WhitespaceTokenizer
from nltk.corpus import stopwords

nltk.download('stopwords')

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader

import pytorch_lightning as pl
import torchmetrics

from sklearn.metrics import classification_report
from sklearn.utils.class_weight import compute_class_weight

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Set Seed

In [3]:
SEED = 121
torch.manual_seed(SEED)
np.random.seed(SEED)
pl.seed_everything(SEED)

INFO:lightning_fabric.utilities.seed:Seed set to 121


121

## Utilities

In [4]:
def process_text(text):

    stemmer = PorterStemmer()
    stopwords_english = stopwords.words('english')
    tokenizer = WhitespaceTokenizer()
    text_tokens = tokenizer.tokenize(text)

    clean_text = []
    for word in text_tokens:
      if word not in stopwords_english:
        stem_word = stemmer.stem(word).lower()
        clean_text.append(stem_word)

    return clean_text

## Load Read Data

In [5]:
train_df = pd.read_csv("https://raw.githubusercontent.com/gupta24789/multiclass-classification/main/data/train.txt", header = None, sep=';')
val_df = pd.read_csv("https://raw.githubusercontent.com/gupta24789/multiclass-classification/main/data/val.txt",header = None, sep=';')
train_df.columns = ['complaints','label']
val_df.columns = ['complaints','label']
train_df['processed_complaints'] = train_df.complaints.apply(lambda x: process_text(x))
val_df['processed_complaints'] = val_df.complaints.apply(lambda x: process_text(x))

In [6]:
print(train_df.label.value_counts())

joy         5362
sadness     4666
anger       2159
fear        1937
love        1304
surprise     572
Name: label, dtype: int64


In [7]:
print(val_df.label.value_counts())

joy         704
sadness     550
anger       275
fear        212
love        178
surprise     81
Name: label, dtype: int64


In [8]:
train_df.head()

Unnamed: 0,complaints,label,processed_complaints
0,i didnt feel humiliated,sadness,"[didnt, feel, humili]"
1,i can go from feeling so hopeless to so damned...,sadness,"[go, feel, hopeless, damn, hope, around, someo..."
2,im grabbing a minute to post i feel greedy wrong,anger,"[im, grab, minut, post, feel, greedi, wrong]"
3,i am ever feeling nostalgic about the fireplac...,love,"[ever, feel, nostalg, fireplac, know, still, p..."
4,i am feeling grouchy,anger,"[feel, grouchi]"


## Encode Label

In [9]:
## Encode Labels
label2idx_map = {w:i for i,w in enumerate(train_df.label.unique().tolist())}
idx2label_map = {i:w for w,i in label2idx_map.items()}

train_df['encoded_label'] = train_df.label.apply(lambda x: label2idx_map[x])
val_df['encoded_label'] = val_df.label.apply(lambda x: label2idx_map[x])

## Class Weight

In [10]:
class_weights = compute_class_weight(class_weight='balanced',classes=np.unique(train_df.encoded_label), y=train_df.encoded_label)
class_weights

array([0.57151022, 1.23513973, 2.04498978, 4.66200466, 1.37669936,
       0.49732687])

## Build Vocab

In [11]:
special_tokens = ['__PAD__','__UNK__']
vocab = list(set(itertools.chain.from_iterable(train_df.processed_complaints.tolist())))
vocab = special_tokens + vocab
token2idx = {w:i for i,w in enumerate(vocab)}
idx2token = {i:w for i,w in enumerate(vocab)}
print(f"vocab : {len(vocab)}")

PAD_ID = token2idx['__PAD__']
UNK_ID = token2idx['__UNK__']

vocab : 10379


## Convert text to numbers

In [12]:
def convert_to_number_tensor(text):
  encoded_text = []
  for w in text:
    encoded_text.append(token2idx.get(w, UNK_ID))

  return torch.tensor(encoded_text)

In [13]:
train_df['text'] = train_df.processed_complaints.apply(lambda x: convert_to_number_tensor(x))
val_df['text'] = val_df.processed_complaints.apply(lambda x: convert_to_number_tensor(x))

In [14]:
train_df.head(3)

Unnamed: 0,complaints,label,processed_complaints,encoded_label,text
0,i didnt feel humiliated,sadness,"[didnt, feel, humili]",0,"[tensor(8671), tensor(5016), tensor(3084)]"
1,i can go from feeling so hopeless to so damned...,sadness,"[go, feel, hopeless, damn, hope, around, someo...",0,"[tensor(2896), tensor(5016), tensor(3694), ten..."
2,im grabbing a minute to post i feel greedy wrong,anger,"[im, grab, minut, post, feel, greedi, wrong]",1,"[tensor(2600), tensor(4361), tensor(2197), ten..."


## DataLoaders

In [15]:
def custom_collate(batch):

  text = [torch.tensor(item['text']) for item in batch]
  label = [item['encoded_label'] for item in batch]

  ## for equal length use pad_sequence
  padded_text = nn.utils.rnn.pad_sequence(text, batch_first= True, padding_value= PAD_ID)
  label = torch.tensor(label, dtype = torch.long)

  batch = {"text": padded_text, "label": label}
  return batch

In [16]:
train_data = train_df[['text','encoded_label']].to_dict('records')
val_data = val_df[['text','encoded_label']].to_dict('records')

In [17]:
train_data[:2]

[{'text': tensor([8671, 5016, 3084]), 'encoded_label': 0},
 {'text': tensor([2896, 5016, 3694, 2257, 5185, 6600, 8625, 5214, 3353]),
  'encoded_label': 0}]

In [18]:
batch_size = 2
train_dl = DataLoader(train_data, batch_size = batch_size, shuffle = True, collate_fn= custom_collate)

In [19]:
example = next(iter(train_dl))
example['text'].shape, example['label'].shape

  text = [torch.tensor(item['text']) for item in batch]


(torch.Size([2, 22]), torch.Size([2]))

In [20]:
example['text']

tensor([[5016, 2622, 5700, 2458,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0],
        [7278, 3516, 7932, 9310, 4599, 5429,  935, 1940, 9049, 9310, 5016, 3682,
          748, 8583, 4069, 7562, 7932, 5195, 9310, 5016, 5039, 7882]])

In [21]:
example['label']

tensor([0, 5])

In [22]:
## dataloaders
batch_size = 64
train_dl = DataLoader(train_data, batch_size = batch_size, shuffle = True, collate_fn= custom_collate)
val_dl = DataLoader(val_data, batch_size = batch_size, shuffle = False, collate_fn= custom_collate)

## Build Model

In [72]:
class MultiClassRNN(pl.LightningModule):

  def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, learning_rate, dropout, num_layers = 1, bidirectional = False):
    super().__init__()
    self.learning_rate = learning_rate
    self.bidirectional = bidirectional

    ## define loss & accuracy
    self.loss_fn = nn.CrossEntropyLoss(weight= torch.tensor(class_weights, dtype = torch.float))
    self.train_f1 = torchmetrics.F1Score(task="multiclass", num_classes=output_dim)
    self.val_f1 = torchmetrics.F1Score(task="multiclass", num_classes=output_dim)

    ## define layers
    self.embedding = nn.Embedding(num_embeddings= vocab_size, embedding_dim= embedding_dim, padding_idx= PAD_ID)
    self.rnn = nn.RNN(embedding_dim, hidden_dim, batch_first = True, num_layers = num_layers, bidirectional = bidirectional, dropout=dropout)
    self.relu = nn.ReLU()
    self.linear1 = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, 32)
    self.linear2 = nn.Linear(32, output_dim)


  def forward(self, text):
    """
    No need to apply softmax at the end as crossentropy implicitly apply the softmax
    """
    embedded = self.embedding(text)
    output, hidden = self.rnn(embedded)

    if self.bidirectional:
       ## concatnate last hidden layer of forward & backward
      hidden_squeezed = torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1)
    else:
      hidden_squeezed = hidden[-1,:,:].squeeze(0)

    hidden_squeezed = self.relu(hidden_squeezed)
    out = self.linear1(hidden_squeezed)
    hidden = self.relu(out)
    logits = self.linear2(out)
    return logits

  def _shared_step(self, batch):
    text, label = batch['text'], batch['label']
    logits = self(text)
    loss = self.loss_fn(logits, label)
    return logits, loss, label

  def training_step(self, batch, batch_idx):
    logits, loss, label = self._shared_step(batch)
    self.train_f1.update(logits, label)
    self.log_dict({"train_loss": loss, "train_f1": self.train_f1}, on_step = False, on_epoch = True, prog_bar=True)
    return loss

  def validation_step(self,batch, batch_idx):
    logits, loss, label = self._shared_step(batch)
    self.val_f1.update(logits, label)
    self.log_dict({"val_loss": loss,  "val_f1": self.val_f1}, on_step = False, on_epoch = True, prog_bar=True)
    return loss

  def on_training_epoch_end(self):
    self.train_f1.reset()

  def on_validation_epoch_end(self):
    if self.current_epoch!=0:
      print(f"Epoch : {self.current_epoch} Val F1 : {self.val_f1.compute()}")
    self.val_f1.reset()

  def configure_optimizers(self):
    optimizer = torch.optim.Adam(self.parameters(), lr = self.learning_rate)
    return optimizer

In [73]:
# ## test model architecture
# model = MultiClassRNN(vocab_size = len(token2idx),
#                       embedding_dim=100,
#                       hidden_dim= 64,
#                       output_dim= len(label2idx_map),
#                       learning_rate= 1e-3,
#                       dropout = 0.5,
#                       num_layers= 2,
#                       bidirectional = True
#                       )

# logits = model(example['text'])
# model.loss_fn(logits, example['label'])

In [81]:
## Model Training

model = MultiClassRNN(vocab_size = len(token2idx),
                      embedding_dim=100,
                      hidden_dim= 256,
                      output_dim= len(label2idx_map),
                      learning_rate= 1e-3,
                      dropout = 0.25,
                      num_layers= 2,
                      bidirectional = True
                      )

callbacks = pl.callbacks.ModelCheckpoint(dirpath = "multiclass_logs",
                                         filename = '{epoch}-{val_loss:.2f}-{val_acc:.2f}',
                                          mode = "max",
                                          monitor = "val_f1",
                                          save_last = True,
                                          save_top_k=-1)


trainer = pl.Trainer(accelerator= "gpu",
           max_epochs=20,
           check_val_every_n_epoch = 2,
           callbacks = [callbacks])

trainer.fit(model, train_dl, val_dl)

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
/usr/local/lib/python3.10/dist-packages/pytorch_lightning/callbacks/model_checkpoint.py:639: Checkpoint directory /content/multiclass_logs exists and is not empty.
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name      | Type              | Params
------------------------------------------------
0 | loss_fn   | CrossEntropyLoss  | 0     
1 | train_f1  | MulticlassF1Score | 0     
2 | val_f1    | MulticlassF1Score | 0     
3 | embedding | Embedding         | 1.0 M 
4 | rnn       | RNN               | 577 K 
5 | relu      | ReLU              | 0     
6 | linear1   | Linear         

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=1` in the `DataLoader` to improve performance.
  text = [torch.tensor(item['text']) for item in batch]
/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=1` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Epoch : 1 Val F1 : 0.4034999907016754


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch : 3 Val F1 : 0.6269999742507935


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch : 5 Val F1 : 0.737500011920929


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch : 7 Val F1 : 0.7875000238418579


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch : 9 Val F1 : 0.8324999809265137


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch : 11 Val F1 : 0.8349999785423279


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch : 13 Val F1 : 0.8539999723434448


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch : 15 Val F1 : 0.8389999866485596


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch : 17 Val F1 : 0.8585000038146973


Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=20` reached.


Epoch : 19 Val F1 : 0.8460000157356262


In [82]:
model.eval()

MultiClassRNN(
  (loss_fn): CrossEntropyLoss()
  (train_f1): MulticlassF1Score()
  (val_f1): MulticlassF1Score()
  (embedding): Embedding(10379, 100, padding_idx=0)
  (rnn): RNN(100, 256, num_layers=2, batch_first=True, dropout=0.25, bidirectional=True)
  (relu): ReLU()
  (linear1): Linear(in_features=512, out_features=32, bias=True)
  (linear2): Linear(in_features=32, out_features=6, bias=True)
)

In [83]:
def predict(text):
  model.eval()
  tokenized_text = process_text(text)
  token_tensor = convert_to_number_tensor(tokenized_text)
  token_tensor = token_tensor.view(1,-1)
  preds = model(token_tensor)
  value, index = torch.topk(preds, k = 1)
  return index.item()

In [84]:
index = predict("I love you")
print(f"Label : {idx2label_map[index]}")

Label : love


In [85]:
index = predict("i hate you")
print(f"Label : {idx2label_map[index]}")

Label : anger


## Classification report

In [86]:
val_preds_index = [predict(text) for text in val_df.complaints]

In [87]:
print(classification_report(val_df.encoded_label, val_preds_index, target_names = list(label2idx_map.keys())))

              precision    recall  f1-score   support

     sadness       0.91      0.79      0.84       550
       anger       0.79      0.73      0.76       275
        love       0.53      0.80      0.64       178
    surprise       0.44      0.79      0.57        81
        fear       0.60      0.84      0.70       212
         joy       0.91      0.72      0.80       704

    accuracy                           0.76      2000
   macro avg       0.70      0.78      0.72      2000
weighted avg       0.81      0.76      0.77      2000



In [88]:
val_df.label.value_counts()

joy         704
sadness     550
anger       275
fear        212
love        178
surprise     81
Name: label, dtype: int64