<a href="https://colab.research.google.com/github/gupta24789/multilabel-classification/blob/main/multilabel_lstm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -q  pytorch-lightning

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m777.7/777.7 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m840.2/840.2 kB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import os
import random
import pandas as pd
import numpy as np
import itertools
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import WordPunctTokenizer
from nltk.stem import PorterStemmer
nltk.download('stopwords')

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

import pytorch_lightning as pl
import torchmetrics

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


## Set Seed

In [3]:
seed = 121
random.seed(seed)
torch.manual_seed(seed)
pl.seed_everything(seed)

INFO:lightning_fabric.utilities.seed:Seed set to 121


121

## Utilities

In [4]:
tokenizer = WordPunctTokenizer()
stemmer = PorterStemmer()
STOPWORDS = stopwords.words('english')


def process_context(context):

  clean_context = []

  context = context.lower()
  words_list = tokenizer.tokenize(context)


  for w in words_list:
    if w not in STOPWORDS and w not in string.punctuation:
      stem_word = stemmer.stem(w)
      clean_context.append(stem_word)

  return clean_context


def convert_word_to_number_tensor(context):

  encoded_context = []
  for w in context:
    encoded_context.append(token2idx.get(w, UNK_ID))

  return torch.tensor(encoded_context)

## Load Data

In [5]:
train_df = pd.read_csv("https://raw.githubusercontent.com/gupta24789/multilabel-classification/main/data/train.csv")
val_df = pd.read_csv("https://raw.githubusercontent.com/gupta24789/multilabel-classification/main/data/test.csv")

print(f"Train shape : {train_df.shape}")
print(f"Val shape : {val_df.shape}")

train_df.columns = train_df.columns.str.lower()
val_df.columns = val_df.columns.str.lower()

Train shape : (16777, 9)
Val shape : (4195, 9)


In [6]:
train_df.head(3)

Unnamed: 0,id,title,abstract,computer science,physics,mathematics,statistics,quantitative biology,quantitative finance
0,1,Reconstructing Subject-Specific Effect Maps,Predictive models allow subject-specific inf...,1,0,0,0,0,0
1,2,Rotation Invariance Neural Network,Rotation invariance and translation invarian...,1,0,0,0,0,0
2,3,Spherical polyharmonics and Poisson kernels fo...,We introduce and develop the notion of spher...,0,0,1,0,0,0


## Data Prep

In [7]:
train_df['context'] = train_df.title + ". " + train_df.abstract
val_df['context'] = val_df.title + ". " + val_df.abstract

In [8]:
target_columns = ['computer science', 'physics', 'mathematics',
       'statistics', 'quantitative biology', 'quantitative finance']

In [9]:
train_df = train_df[['context'] + target_columns]
val_df = val_df[['context'] + target_columns]

In [10]:
train_df.head(3)

Unnamed: 0,context,computer science,physics,mathematics,statistics,quantitative biology,quantitative finance
0,Reconstructing Subject-Specific Effect Maps. ...,1,0,0,0,0,0
1,Rotation Invariance Neural Network. Rotation...,1,0,0,0,0,0
2,Spherical polyharmonics and Poisson kernels fo...,0,0,1,0,0,0


In [11]:
train_df['context_clean'] = train_df.context.apply(lambda x: process_context(x))
val_df['context_clean'] = val_df.context.apply(lambda x: process_context(x))

## Build Vocab

In [12]:
special_tokens = ['__PAD__','__UNK__']
vocab = list(set(itertools.chain.from_iterable(train_df.context_clean.tolist())))
vocab = vocab + special_tokens
token2idx = {w:i for i,w in enumerate(vocab)}
idx2token = {i:w for w,i in token2idx.items()}

PAD_ID = token2idx['__PAD__']
UNK_ID = token2idx['__UNK__']

print(f"Vocab size : {len(vocab)}")

Vocab size : 38181


## Convert word to number tensor

In [135]:
train_df['encoded_context'] = train_df.context_clean.apply(lambda x: convert_word_to_number_tensor(x))
val_df['encoded_context'] = val_df.context_clean.apply(lambda x: convert_word_to_number_tensor(x))

train_df['labels'] = train_df[target_columns].values.tolist()
val_df['labels'] = val_df[target_columns].values.tolist()

train_df['lengths'] = train_df.encoded_context.str.len()
val_df['lengths'] = val_df.encoded_context.str.len()

In [136]:
train_df.head(3)

Unnamed: 0,context,computer science,physics,mathematics,statistics,quantitative biology,quantitative finance,context_clean,encoded_context,labels,lengths
0,Reconstructing Subject-Specific Effect Maps. ...,1,0,0,0,0,0,"[reconstruct, subject, specif, effect, map, pr...","[tensor(18596), tensor(31367), tensor(24791), ...","[1, 0, 0, 0, 0, 0]",198
1,Rotation Invariance Neural Network. Rotation...,1,0,0,0,0,0,"[rotat, invari, neural, network, rotat, invari...","[tensor(28906), tensor(864), tensor(6077), ten...","[1, 0, 0, 0, 0, 0]",55
2,Spherical polyharmonics and Poisson kernels fo...,0,0,1,0,0,0,"[spheric, polyharmon, poisson, kernel, polyhar...","[tensor(37566), tensor(33389), tensor(12923), ...","[0, 0, 1, 0, 0, 0]",64


In [139]:
train_data = train_df[['encoded_context','labels','lengths']].to_dict("records")
val_data = val_df[['encoded_context','labels','lengths']].to_dict("records")

In [140]:
train_data[:1]

[{'encoded_context': tensor([18596, 31367, 24791, 34856, 22348, 27528, 11431, 26648, 31367, 24791,
          32804, 27228, 23688, 12843, 22383, 13099, 21269, 12514, 31367, 21269,
          32804, 22260, 10848, 32448, 14328, 17245, 15785, 37197, 18730, 31367,
          32153, 17245, 12990, 37197, 34856,  6348, 13519, 13098, 31367, 21269,
          14328, 32804,  4145, 28208, 32153, 32804, 28208, 24522, 31367, 24791,
          34856, 22348, 15205, 28208,  3230, 11431,  9693, 15710,  8494, 12990,
           5210,   477,   595, 24133,  1182,  5904, 18596, 17821, 12169, 29292,
          24562, 31367, 24791, 12990, 27528, 11431, 21472,  4256, 22773, 15275,
          29292, 24791, 33737, 28730, 14058,  8973, 15128,  6452, 30223, 28208,
          11893, 15128,  3256, 37242, 15275,  5904, 17821, 26338,  4856, 37998,
          28208, 20815, 22773, 15275, 29563,  8415, 17245, 17944,  3251, 37197,
          18730, 18596, 31019, 14892, 29948, 16819,  5652, 11431, 25633, 34570,
          15395, 3724

## Data Loaders

In [141]:
train_df.encoded_context.str.len().describe([.99])

count    16777.000000
mean       107.690648
std         43.493761
min          5.000000
50%        104.000000
99%        218.000000
max        412.000000
Name: encoded_context, dtype: float64

In [142]:
def custom_collate(batch):

  context = [torch.tensor(item['encoded_context']) for item in batch]
  padded_context = nn.utils.rnn.pad_sequence(context, batch_first= True, padding_value= PAD_ID)

  labels = torch.tensor([item['labels'] for item in batch], dtype = torch.float)
  lengths = torch.tensor([item['lengths'] for item in batch], dtype = torch.float)

  batch = {"context": padded_context, "label": labels, "lengths": lengths}
  return batch

In [143]:
batch_size = 2
train_dl = DataLoader(train_data, batch_size = batch_size, shuffle = True, collate_fn= custom_collate)

In [144]:
example = next(iter(train_dl))
example['context'].shape, example['label'].shape, example['lengths'].shape

  context = [torch.tensor(item['encoded_context']) for item in batch]


(torch.Size([2, 110]), torch.Size([2, 6]), torch.Size([2]))

In [145]:
example['context']

tensor([[ 1085, 29728, 11785,  8627,  4581, 18810, 19229, 38033,  8393, 21210,
         16422, 18251, 34764, 22490,  5267, 36326, 17821,  5267, 33024, 34570,
         11481, 17750, 19229,  2808, 14104,  5952, 19152, 11431, 15037, 17821,
         18810, 27310, 11481, 18045, 14151,  4076, 34812, 29728, 18840,  4413,
         33737, 18045,  5267, 17821, 23022, 20543, 11481,  8627,  4581, 18810,
         19229,  1085, 29728, 28261, 11557, 20752, 22092,  5904,  1085, 29728,
         11481,  1085, 11481,  2623, 34570, 32976, 29728, 11481, 15304, 34570,
          5704, 18303, 34570, 13168,  1085, 11481, 11199, 33181,  1673, 10599,
         28706,  5904, 28208, 30951, 37998, 15395, 19128, 34570, 30454, 16321,
         17898, 36449, 20239, 34856,  5904, 30951, 37998, 29728, 21269, 32749,
         27228,  5704,  5904, 11431, 30951, 37998, 25424, 25113,  1673, 29025],
        [ 4145, 23601, 24500, 28871,  5730,   148, 12786, 11074, 27197, 31920,
         16321, 31698, 24500,  5730,   148, 12786, 

In [146]:
example['label']

tensor([[0., 0., 0., 1., 0., 0.],
        [0., 1., 0., 0., 0., 0.]])

In [147]:
example['lengths']

tensor([110.,  86.])

In [148]:
## dataloaders
batch_size = 64
train_dl = DataLoader(train_data, batch_size = batch_size, shuffle = True, collate_fn= custom_collate)
val_dl = DataLoader(val_data, batch_size = batch_size, shuffle = False, collate_fn= custom_collate)

## Build Model

In [149]:
class MultiLabelLSTM(pl.LightningModule):

  def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, learning_rate, dropout, num_layers = 1, bidirectional = False):
    super().__init__()
    self.learning_rate = learning_rate
    self.bidirectional = bidirectional

    ## define loss & accuracy
    self.loss_fn = nn.BCEWithLogitsLoss()
    self.train_f1 = torchmetrics.F1Score(task="multilabel", num_labels=output_dim)
    self.val_f1 = torchmetrics.F1Score(task="multilabel", num_labels=output_dim)
    self.train_ham = torchmetrics.HammingDistance(task="multilabel", num_labels=output_dim)
    self.val_ham = torchmetrics.HammingDistance(task="multilabel", num_labels=output_dim)

    ## define layers
    self.embedding = nn.Embedding(num_embeddings= vocab_size, embedding_dim= embedding_dim, padding_idx= PAD_ID)
    self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first = True, num_layers = num_layers, bidirectional = bidirectional, dropout=dropout)
    self.relu = nn.ReLU()
    self.linear1 = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, 32)
    self.linear2 = nn.Linear(32, output_dim)


  def forward(self, text, lengths):
    """
    No need to apply softmax at the end as crossentropy implicitly apply the softmax
    """
    embedded = self.embedding(text)

    packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, lengths.to('cpu'), batch_first = True, enforce_sorted = False)
    packed_output, (hidden, cell) = self.lstm(packed_embedded)
    #unpack sequence
    output, output_lengths = nn.utils.rnn.pad_packed_sequence(packed_output, batch_first = True)

    if self.bidirectional:
       ## concatnate last hidden layer of forward & backward
      hidden_squeezed = torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1)
    else:
      hidden_squeezed = hidden[-1,:,:].squeeze(0)

    hidden_squeezed = self.relu(hidden_squeezed)
    out = self.linear1(hidden_squeezed)
    hidden = self.relu(out)
    logits = self.linear2(out)
    return logits

  def _shared_step(self, batch):
    text, label, lengths = batch['context'], batch['label'], batch['lengths']
    logits = self(text, lengths)
    loss = self.loss_fn(logits, label)
    return logits, loss, label

  def training_step(self, batch, batch_idx):
    logits, loss, label = self._shared_step(batch)
    self.train_f1(logits, label)
    self.train_ham(logits, label)
    self.log_dict({"train_loss": loss, "train_f1": self.train_f1,"train_ham" : self.train_ham}, on_step = False, on_epoch = True, prog_bar=True)
    return loss

  def validation_step(self,batch, batch_idx):
    logits, loss, label = self._shared_step(batch)
    self.val_f1(logits, label)
    self.val_ham(logits, label)
    self.log_dict({"val_loss": loss,  "val_f1": self.val_f1, "val_ham": self.val_ham}, on_step = False, on_epoch = True, prog_bar=True)
    return loss

  def on_training_epoch_end(self):
    self.train_f1.reset()
    self.train_ham.reset()

  def on_validation_epoch_end(self):
    print(f"Epoch : {self.current_epoch} Val F1 : {self.val_f1.compute()}  val ham : {self.val_ham.compute()}")
    self.val_f1.reset()
    self.val_ham.reset()

  def configure_optimizers(self):
    optimizer = torch.optim.Adam(self.parameters(), lr = self.learning_rate)
    return optimizer

In [151]:
# ## test model architecture
# model = MultiLabelLSTM(vocab_size = len(token2idx),
#                       embedding_dim=100,
#                       hidden_dim= 64,
#                       output_dim= len(target_columns),
#                       learning_rate= 1e-3,
#                       dropout = 0.5,
#                       num_layers= 2,
#                       bidirectional = True
#                       )

# logits = model(example['context'], example['lengths'])
# model.loss_fn(logits, example['label'])

In [152]:
## Model Training

model = MultiLabelLSTM(vocab_size = len(token2idx),
                      embedding_dim=100,
                      hidden_dim= 256,
                      output_dim= len(target_columns),
                      learning_rate= 1e-4,
                      dropout = 0.25,
                      num_layers= 2,
                      bidirectional = True
                      )

callbacks = pl.callbacks.ModelCheckpoint(dirpath = "checkpoints_logs",
                                         filename = '{epoch}-{val_loss:.2f}-{val_ham:.2f}',
                                          mode = "min",
                                          monitor = "val_ham",
                                          save_last = True,
                                          save_top_k=-1)


trainer = pl.Trainer(accelerator= "gpu",
           max_epochs=20,
           check_val_every_n_epoch = 2,
           callbacks = [callbacks])

trainer.fit(model, train_dl, val_dl)

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
/usr/local/lib/python3.10/dist-packages/pytorch_lightning/callbacks/model_checkpoint.py:639: Checkpoint directory /content/checkpoints_logs exists and is not empty.
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name      | Type                      | Params
--------------------------------------------------------
0 | loss_fn   | BCEWithLogitsLoss         | 0     
1 | train_f1  | MultilabelF1Score         | 0     
2 | val_f1    | MultilabelF1Score         | 0     
3 | train_ham | MultilabelHammingDistance | 0     
4 | val_ham   | MultilabelHammingDistance | 0     
5 | embedding | E

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Epoch : 0 Val F1 : 0.3014925420284271  val ham : 0.609375


/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=1` in the `DataLoader` to improve performance.
  context = [torch.tensor(item['encoded_context']) for item in batch]
/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=1` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Epoch : 1 Val F1 : 0.43824800848960876  val ham : 0.18088996410369873


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch : 3 Val F1 : 0.5336759686470032  val ham : 0.17302346229553223


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch : 5 Val F1 : 0.6705539226531982  val ham : 0.12570518255233765


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch : 7 Val F1 : 0.7080573439598083  val ham : 0.11732220649719238


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch : 9 Val F1 : 0.7309674024581909  val ham : 0.10993248224258423


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch : 11 Val F1 : 0.748656690120697  val ham : 0.10035759210586548


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch : 13 Val F1 : 0.7804300785064697  val ham : 0.08843863010406494


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch : 15 Val F1 : 0.7885081768035889  val ham : 0.0865713357925415


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch : 17 Val F1 : 0.7879430055618286  val ham : 0.08748507499694824


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch : 19 Val F1 : 0.7882093787193298  val ham : 0.08792215585708618


INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=20` reached.


## Predict

In [163]:
def predict(context):
  clean_context = process_context(context)
  encoded_context = convert_word_to_number_tensor(clean_context)
  encoded_context = encoded_context.view(1, -1)
  lengths = torch.tensor([encoded_context.shape[1]], dtype = torch.long)
  preds = model(encoded_context, lengths)
  preds = preds.detach().numpy().flatten()
  preds = np.array(preds>=0.5).astype(int)
  print("Pred : ", [target_columns[i] for i, val in enumerate(preds) if val==1])

In [164]:
model = model.eval()

In [168]:
random_sample = val_df.sample().to_dict('records')[0]
context = random_sample['context']
label = random_sample['labels']
print("True : ",[target_columns[i] for i, val in enumerate(label) if val==1])
predict(context)

True :  ['computer science', 'statistics']
Pred :  ['computer science', 'statistics']
