<a href="https://colab.research.google.com/github/gupta24789/multiclass-classification/blob/main/multiclass_transformer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -q pytorch-lightning

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m777.7/777.7 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m840.2/840.2 kB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import pandas as pd
import numpy as np
import itertools

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader

import pytorch_lightning as pl
import torchmetrics
from transformers import AutoTokenizer, AutoModel

from sklearn.metrics import classification_report
from sklearn.utils.class_weight import compute_class_weight

## Set Seed

In [3]:
SEED = 121
torch.manual_seed(SEED)
np.random.seed(SEED)
pl.seed_everything(SEED)

INFO:lightning_fabric.utilities.seed:Seed set to 121


121

## Load Read Data

In [4]:
train_df = pd.read_csv("https://raw.githubusercontent.com/gupta24789/multiclass-classification/main/data/train.txt", header = None, sep=';')
val_df = pd.read_csv("https://raw.githubusercontent.com/gupta24789/multiclass-classification/main/data/val.txt",header = None, sep=';')
train_df.columns = ['text','label']
val_df.columns = ['text','label']

In [5]:
print(train_df.label.value_counts())

joy         5362
sadness     4666
anger       2159
fear        1937
love        1304
surprise     572
Name: label, dtype: int64


In [6]:
print(val_df.label.value_counts())

joy         704
sadness     550
anger       275
fear        212
love        178
surprise     81
Name: label, dtype: int64


In [7]:
train_df.head()

Unnamed: 0,text,label
0,i didnt feel humiliated,sadness
1,i can go from feeling so hopeless to so damned...,sadness
2,im grabbing a minute to post i feel greedy wrong,anger
3,i am ever feeling nostalgic about the fireplac...,love
4,i am feeling grouchy,anger


## Encode Label

In [8]:
## Encode Labels
label2idx_map = {w:i for i,w in enumerate(train_df.label.unique().tolist())}
idx2label_map = {i:w for w,i in label2idx_map.items()}

train_df['encoded_label'] = train_df.label.apply(lambda x: label2idx_map[x])
val_df['encoded_label'] = val_df.label.apply(lambda x: label2idx_map[x])

## Class Weight

In [9]:
class_weights = compute_class_weight(class_weight='balanced',classes=np.unique(train_df.encoded_label), y=train_df.encoded_label)
class_weights

array([0.57151022, 1.23513973, 2.04498978, 4.66200466, 1.37669936,
       0.49732687])

## Transformer Model Exploration

In [10]:
model_name = "albert-base-v2"
# model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
transformer_model = AutoModel.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/684 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/760k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.31M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/47.4M [00:00<?, ?B/s]

In [11]:
inputs = tokenizer("I hate you", return_tensors='pt')
inputs

{'input_ids': tensor([[   2,   31, 3223,   42,    3]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1]])}

In [12]:
embedding = transformer_model(**inputs)

In [13]:
last_hidden_state, pooler_output = embedding['last_hidden_state'], embedding['pooler_output']

In [14]:
last_hidden_state.shape, pooler_output.shape

(torch.Size([1, 5, 768]), torch.Size([1, 768]))

In [15]:
## last hidden state
last_hidden_state[:,0,:].shape

torch.Size([1, 768])

In [16]:
## average of last 4 hidden states
torch.mean(last_hidden_state[:,-4:,:], dim = 1).shape

torch.Size([1, 768])

## Data Loaders

In [17]:
train_df.text.str.split(" ").str.len().describe([.99])

count    16000.000000
mean        19.166313
std         10.986905
min          2.000000
50%         17.000000
99%         52.000000
max         66.000000
Name: text, dtype: float64

In [18]:
def custom_collate(batch):

  text = [item['text'] for item in batch]
  label = [item['encoded_label'] for item in batch]

  inputs = tokenizer(text, max_length= 60, truncation=True, padding='max_length', return_tensors='pt')
  label = torch.tensor(label, dtype = torch.long)

  batch = {"input_ids": inputs['input_ids'], "token_type_ids": inputs['token_type_ids'],"attention_mask": inputs['attention_mask'], "label": label}
  return batch

In [19]:
train_data = train_df[['text','encoded_label']].to_dict('records')
val_data = val_df[['text','encoded_label']].to_dict('records')

In [20]:
train_data[:2]

[{'text': 'i didnt feel humiliated', 'encoded_label': 0},
 {'text': 'i can go from feeling so hopeless to so damned hopeful just from being around someone who cares and is awake',
  'encoded_label': 0}]

In [21]:
batch_size = 2
train_dl = DataLoader(train_data, batch_size = batch_size, shuffle = True, collate_fn= custom_collate)

In [22]:
example = next(iter(train_dl))
example['input_ids'].shape, example['token_type_ids'].shape, example['attention_mask'].shape, example['label'].shape

(torch.Size([2, 60]),
 torch.Size([2, 60]),
 torch.Size([2, 60]),
 torch.Size([2]))

In [23]:
example['input_ids']

tensor([[    2,    31,    57,    14,  1249,    39,    23, 12824,    17, 17850,
             3,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0],
        [    2,    31,   289,    31,  7290,    44,  4114,    34,   184,  2680,
          1440,  1249,    47,    14, 12839,   270,    16,    42,  6131,   267,
            38,    55,     3,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0]])

In [24]:
example['token_type_ids']

tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

In [25]:
example['attention_mask']

tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

In [26]:
example['label']

tensor([5, 1])

In [27]:
## dataloaders
batch_size = 64
train_dl = DataLoader(train_data, batch_size = batch_size, shuffle = True, collate_fn= custom_collate, num_workers = 2)
val_dl = DataLoader(val_data, batch_size = batch_size, shuffle = False, collate_fn= custom_collate, num_workers = 2)

## Build Model

In [32]:
class MultiClassTransformer(pl.LightningModule):

  def __init__(self, output_dim, learning_rate, dropout, freeze = False):
    super().__init__()
    self.learning_rate = learning_rate

    ## define loss & accuracy
    self.loss_fn = nn.CrossEntropyLoss()
    # self.loss_fn = nn.CrossEntropyLoss(weight= torch.tensor(class_weights, dtype = torch.float))
    self.train_f1 = torchmetrics.F1Score(task="multiclass", num_classes=output_dim)
    self.val_f1 = torchmetrics.F1Score(task="multiclass", num_classes=output_dim)

    ## define layers
    self.transformer_model = AutoModel.from_pretrained(model_name)
    hidden_dim = self.transformer_model.config.hidden_size
    self.clf = nn.Linear(hidden_dim, output_dim)
    self.relu = nn.ReLU()
    self.dropout = nn.Dropout(dropout)

    ## freeze layers
    bertLayerList = ['pooler.weight','pooler.bias']
    if freeze:
      for name, params in self.transformer_model.named_parameters():
        if name not in bertLayerList:
          params.requires_grad = False


  def forward(self, inputs):
    """
    No need to apply softmax at the end as crossentropy implicitly apply the softmax
    """
    embeddings = self.transformer_model(**inputs)
    last_hidden_state, pooler_output = embeddings['last_hidden_state'], embeddings['pooler_output']

    ## last hidden state
    hidden_state = last_hidden_state[:,-1,:]
    ## pooler state
    # hidden_state = pooler_output
    ## average of last 4 hidden state
    # hidden_state = torch.mean(last_hidden_state[:,-4:,:], dim = 1)

    hidden_state = self.dropout(hidden_state)
    out = self.clf(self.relu(hidden_state))
    return out

  def _shared_step(self, batch):
    label = batch.pop('label')
    logits = self(batch)
    loss = self.loss_fn(logits, label)
    return logits, loss, label

  def training_step(self, batch, batch_idx):
    logits, loss, label = self._shared_step(batch)
    self.train_f1.update(logits, label)
    self.log_dict({"train_loss": loss, "train_f1": self.train_f1}, on_step = False, on_epoch = True, prog_bar=True)
    return loss

  def validation_step(self,batch, batch_idx):
    logits, loss, label = self._shared_step(batch)
    self.val_f1.update(logits, label)
    self.log_dict({"val_loss": loss,  "val_f1": self.val_f1}, on_step = False, on_epoch = True, prog_bar=True)
    return loss

  def on_training_epoch_end(self):
    self.train_f1.reset()

  def on_validation_epoch_end(self):
    print(f"Epoch : {self.current_epoch} Val F1 : {self.val_f1.compute()}")
    self.val_f1.reset()

  def configure_optimizers(self):
    optimizer = torch.optim.Adam(self.parameters(), lr = self.learning_rate)
    return optimizer

In [33]:
# ## test model architecture
# model = MultiClassTransformer(output_dim = len(label2idx_map), learning_rate = 1e-3, freeze = False)
# inputs = {
#     "input_ids": example['input_ids'],
#     "token_type_ids": example['token_type_ids'],
#     "attention_mask": example['attention_mask']
# }
# logits = model(inputs)
# model.loss_fn(logits, example['label'])

In [34]:
## Model Training

model = MultiClassTransformer(output_dim = len(label2idx_map), learning_rate = .0001, dropout = 0.5, freeze = False)

callbacks = pl.callbacks.ModelCheckpoint(dirpath = "multiclass_logs",
                                         filename = '{epoch}-{val_loss:.2f}-{val_f1:.2f}',
                                          mode = "min",
                                          monitor = "val_loss",
                                          save_last = True,
                                          save_top_k=-1)

trainer = pl.Trainer(accelerator= "gpu",
           max_epochs=3,
           check_val_every_n_epoch = 1,
           callbacks = [callbacks])

trainer.fit(model, train_dl, val_dl)

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
/usr/local/lib/python3.10/dist-packages/pytorch_lightning/callbacks/model_checkpoint.py:639: Checkpoint directory /content/multiclass_logs exists and is not empty.
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name              | Type              | Params
--------------------------------------------------------
0 | loss_fn           | CrossEntropyLoss  | 0     
1 | train_f1          | MulticlassF1Score | 0     
2 | val_f1            | MulticlassF1Score | 0     
3 | transformer_model | AlbertModel       | 11.7 M
4 | clf               | Linear            | 4.6 K 
5 | relu          

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Epoch : 0 Val F1 : 0.3046875




Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Epoch : 0 Val F1 : 0.8995000123977661


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch : 1 Val F1 : 0.9104999899864197


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch : 2 Val F1 : 0.934499979019165


INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=3` reached.


## Load Model

In [46]:
checkpoint = "multiclass_logs/epoch=2-val_loss=0.16-val_f1=0.93.ckpt"

In [47]:
# ## Load model in cpu
# model = MultiClassTransformer.load_from_checkpoint(checkpoint,
#                                             output_dim = len(label2idx_map),
#                                            learning_rate = 1e-3,
#                                            freeze = False,
#                                            dropout = 0.1,
#                                            map_location = "cpu")

In [48]:
model = MultiClassTransformer.load_from_checkpoint(checkpoint,
                                                   output_dim = len(label2idx_map),
                                                   learning_rate = 1e-3,
                                                   freeze = False,
                                                   dropout = 0.1,
                                                   map_location = "cuda")

In [50]:
model = model.eval()

In [51]:
def predict(text):
  inputs = tokenizer(text, return_tensors='pt')
  inputs.to('cuda:0')
  preds = model(inputs)
  value, index = torch.topk(preds.cpu(), k = 1)
  return index.item()

In [52]:
index = predict("I love you")
print(f"Label : {idx2label_map[index]}")

Label : joy


In [53]:
index = predict("i hate you")
print(f"Label : {idx2label_map[index]}")

Label : joy


## Classification report

In [54]:
val_preds_index = [predict(text) for text in val_df.text]

In [55]:
print(classification_report(val_df.encoded_label, val_preds_index, target_names = list(label2idx_map.keys())))

              precision    recall  f1-score   support

     sadness       0.96      0.93      0.94       550
       anger       0.91      0.91      0.91       275
        love       0.00      0.00      0.00       178
    surprise       0.00      0.00      0.00        81
        fear       0.00      0.00      0.00       212
         joy       0.59      0.99      0.74       704

    accuracy                           0.73      2000
   macro avg       0.41      0.47      0.43      2000
weighted avg       0.60      0.73      0.65      2000



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [56]:
val_df.label.value_counts()

joy         704
sadness     550
anger       275
fear        212
love        178
surprise     81
Name: label, dtype: int64