<a href="https://colab.research.google.com/github/gupta24789/multilabel-classification/blob/main/01_stackoverflow_bert_lighting.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -q transformers
!pip install -q  pytorch-lightning

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m777.7/777.7 kB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m806.1/806.1 kB[0m [31m59.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import os
import pandas as pd
import numpy as np
import random
import shutil
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import pytorch_lightning as pl
from torchmetrics import Accuracy
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AlbertModel

In [3]:
## set seed
random.seed(121)
np.random.seed(121)
torch.manual_seed(121)

<torch._C.Generator at 0x7ab46beff710>

In [4]:
train_df = pd.read_csv("https://raw.githubusercontent.com/gupta24789/multilabel-classification/main/data/stackoverflow_train.csv")
train_df.head(3)

Unnamed: 0,ID,TITLE,ABSTRACT,Computer Science,Physics,Mathematics,Statistics,Quantitative Biology,Quantitative Finance
0,1,Reconstructing Subject-Specific Effect Maps,Predictive models allow subject-specific inf...,1,0,0,0,0,0
1,2,Rotation Invariance Neural Network,Rotation invariance and translation invarian...,1,0,0,0,0,0
2,3,Spherical polyharmonics and Poisson kernels fo...,We introduce and develop the notion of spher...,0,0,1,0,0,0


In [5]:
train_df['CONTEXT'] = train_df['TITLE'] + ". " + train_df['ABSTRACT']
train_df.drop(labels=['TITLE', 'ABSTRACT', 'ID'], axis=1, inplace=True)
train_df = train_df[['CONTEXT', 'Computer Science', 'Physics', 'Mathematics', 'Statistics',
                     'Quantitative Biology', 'Quantitative Finance',]]
train_df = train_df.reset_index(drop = True)

train_df.head(3)

Unnamed: 0,CONTEXT,Computer Science,Physics,Mathematics,Statistics,Quantitative Biology,Quantitative Finance
0,Reconstructing Subject-Specific Effect Maps. ...,1,0,0,0,0,0
1,Rotation Invariance Neural Network. Rotation...,1,0,0,0,0,0
2,Spherical polyharmonics and Poisson kernels fo...,0,0,1,0,0,0


In [6]:
train_size = 0.8
index_list = train_df.index.tolist()
random.shuffle(index_list)
train_sample = int(train_size * len(index_list))
val_df = train_df.iloc[train_sample:]
train_df = train_df.iloc[:train_sample]

train_df = train_df.reset_index(drop = True)
val_df = val_df.reset_index(drop = True)

print(f"Train shape : {train_df.shape}")
print(f"Val shape : {val_df.shape}")

Train shape : (16777, 7)
Val shape : (4195, 7)


In [7]:
target_list = ['Computer Science', 'Physics', 'Mathematics', 'Statistics',
              'Quantitative Biology', 'Quantitative Finance']

In [8]:
train_x = train_df.CONTEXT.tolist()
train_y = train_df[target_list].values

val_x = val_df.CONTEXT.tolist()
val_y = val_df[target_list].values

del train_df, val_df

## Hyperparameter

In [9]:
# hyperparameters
MAX_LEN = 100
BATCH_SIZE = 32
LEARNING_RATE = 1e-04
MODEL_NAME = "albert-base-v2"
DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"
print(f"Device : {DEVICE}")

Device : cuda:0


## DataLoader

In [10]:
class MultiLabelDataset:

  def __init__(self, features, labels):
    self.tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, do_lower_case=True)
    self.features = features
    self.labels = labels

  def __getitem__(self, index):
    feature = self.features[index]
    label = torch.tensor(self.labels[index], dtype = torch.float32)

    ## get the output from tokenizer
    inputs = self.tokenizer.encode_plus(feature, padding='max_length',max_length=MAX_LEN,truncation=True, return_tensors='pt')
    inputs = {k: v.flatten() for k,v in inputs.items()}
    return (inputs,label)

  def __len__(self):
    return len(self.features)

In [11]:
train_ds = MultiLabelDataset(train_x, train_y)
val_ds = MultiLabelDataset(val_x, val_y)
train_dl = DataLoader(train_ds,shuffle=True,batch_size=BATCH_SIZE)
val_dl = DataLoader(val_ds,shuffle=False,batch_size=BATCH_SIZE)

config.json:   0%|          | 0.00/684 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/760k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.31M [00:00<?, ?B/s]

In [12]:
example = next(iter(train_dl))
inputs, label = example[0], example[1]
inputs['input_ids'].shape, inputs['token_type_ids'].shape, inputs['attention_mask'].shape, label.shape

(torch.Size([32, 100]),
 torch.Size([32, 100]),
 torch.Size([32, 100]),
 torch.Size([32, 6]))

## Model

In [13]:
class MultiLabelModel(pl.LightningModule):

  def __init__(self, num_classes, learning_rate):
    super().__init__()
    self.learning_rate = learning_rate

    ## config
    self.num_classes = num_classes
    self.hidden_unit = 32
    self.bert_emb_dim = 768

    ## Model
    self.bert_model = AlbertModel.from_pretrained(MODEL_NAME)
    bertLayerList = [
                #  'encoder.albert_layer_groups.0.albert_layers.0.ffn.weight',
                #  'encoder.albert_layer_groups.0.albert_layers.0.ffn.bias',
                #  'encoder.albert_layer_groups.0.albert_layers.0.ffn_output.weight',
                #  'encoder.albert_layer_groups.0.albert_layers.0.ffn_output.bias',
                 'pooler.weight',
                 'pooler.bias']
    ## freeze layer except in above list
    for name, params in self.bert_model.named_parameters():
      if name not in bertLayerList:
        params.requires_grad = False

    self.dropout = torch.nn.Dropout(0.2)
    self.dense1 = nn.Linear(in_features= self.bert_emb_dim, out_features= self.hidden_unit)
    self.relu = nn.ReLU()
    self.dense2 = nn.Linear(in_features= self.hidden_unit, out_features= num_classes)

    ## define loss
    self.loss_fn = nn.BCEWithLogitsLoss()
    ## define metrics
    self.train_accuracy = Accuracy(task = "multilabel", num_labels = num_classes, threshold= 0.5)
    self.val_accuracy = Accuracy(task = "multilabel", num_labels = num_classes, threshold= 0.5)


  def forward(self,inputs, verbose = False):

    emb = self.bert_model(**inputs)['pooler_output']
    out_dense1 = self.dense1(emb)
    out_relu = self.relu(out_dense1)
    out_dense2 = self.dense2(out_relu)
    logits = torch.squeeze(out_dense2, dim = 1)

    if verbose:
      print(f"Bert shape : {emb.shape}")
      print(f"Dense-1 shape : {out_dense1.shape}")
      print(f"Dense-2 shape : {out_dense2.shape}")
      print(f"logits shape : {logits.shape}")

    return logits

  def training_step(self, batch, batch_idx):
    inputs, label = batch[0], batch[1]
    logits = self(inputs)
    loss = self.loss_fn(logits, label)
    self.train_accuracy(logits,label)
    self.log_dict({"train_loss": loss,  "train_accuracy": self.train_accuracy}, on_step = False, on_epoch = True, prog_bar=True)
    return loss

  def validation_step(self, batch, batch_idx):
    inputs, label = batch[0], batch[1]
    logits = self(inputs)
    loss = self.loss_fn(logits, label)
    self.val_accuracy(logits,label)
    self.log_dict({"val_loss": loss,  "val_accuracy": self.val_accuracy}, on_step = False, on_epoch = True, prog_bar = True)

    return loss

  def on_train_epoch_end(self):
    self.train_accuracy.reset()

  def on_validation_epoch_end(self):
     print(f"Epoch : {self.current_epoch} Validation Accuracy : {self.val_accuracy.compute()}")
     self.val_accuracy.reset()

  def configure_optimizers(self):
     optimizer = optim.Adam(self.parameters(), lr =self.learning_rate)
     return optimizer

In [14]:
model = MultiLabelModel(learning_rate=0.001, num_classes=len(target_list))
logits = model(inputs,verbose = True)
print(f"Logits : {logits.shape}")
print(f"Loss : {model.loss_fn(logits, label)}")

model.safetensors:   0%|          | 0.00/47.4M [00:00<?, ?B/s]

Bert shape : torch.Size([32, 768])
Dense-1 shape : torch.Size([32, 32])
Dense-2 shape : torch.Size([32, 6])
logits shape : torch.Size([32, 6])
Logits : torch.Size([32, 6])
Loss : 0.6777482032775879


## Train Model

In [15]:
## logger
logger = pl.loggers.CSVLogger("logs", name="multi-label")

## checkpoints
checkpoint_callback  = pl.callbacks.ModelCheckpoint(
                                                filename='{epoch}-{val_loss:.2f}-{val_accuracy:.2f}',
                                                every_n_epochs = 1,
                                                save_top_k = -1,
                                                monitor='val_loss',
                                                )


model = MultiLabelModel(learning_rate=0.0001, num_classes=len(target_list))

trainer = pl.Trainer(accelerator="auto",
                     max_epochs = 5,
                     check_val_every_n_epoch=1,
                     callbacks=[checkpoint_callback],
                     logger=logger

                    )

## Train the Model
trainer.fit(model, train_dl, val_dl)

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name           | Type               | Params
------------------------------------------------------
0 | bert_model     | AlbertModel        | 11.7 M
1 | dropout        | Dropout            | 0     
2 | dense1         | Linear             | 24.6 K
3 | relu           | ReLU               | 0     
4 | dense2         | Linear             | 198   
5 | loss_fn        | BCEWithLogitsLoss  | 0     
6 | train_accuracy | MultilabelAccuracy | 0     
7 | val_accuracy   | MultilabelAccuracy | 0     
-------------------------------------------------

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=1` in the `DataLoader` to improve performance.


Epoch : 0 Validation Accuracy : 0.3385416567325592


/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=1` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Epoch : 0 Validation Accuracy : 0.8719507455825806


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch : 1 Validation Accuracy : 0.8802940249443054


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch : 2 Validation Accuracy : 0.8821613192558289


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch : 3 Validation Accuracy : 0.8876439929008484


Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=5` reached.


Epoch : 4 Validation Accuracy : 0.8895113468170166


## Load Model

In [16]:
# model = MultiLabelModel.load_from_checkpoint("logs/multi-label/version_6/checkpoints/epoch=9-val_loss=0.03-val_accuracy=0.99.ckpt",
#                   learning_rate=0.001, num_classes=len(target_list))


## Predict

In [17]:
# model = model.eval()
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, do_lower_case=True)

In [65]:
INDEX = 2
CONTEXT = val_x[INDEX]
TRUE_LABEL = val_y[INDEX]
TRUE_CLASSES = [target_list[i] for i, label in enumerate(TRUE_LABEL) if label==1]
TRUE_CLASSES

['Statistics']

In [66]:
model = model.eval()
inputs = tokenizer.encode_plus(CONTEXT, padding='max_length',max_length=MAX_LEN,truncation=True, return_tensors='pt')
preds = model(inputs)
preds = preds.flatten().detach().numpy()
preds

array([ 0.1682685, -3.5301564, -1.7493111,  1.27623  , -2.8906837,
       -3.320072 ], dtype=float32)

In [67]:
predictions = []
for idx, label in enumerate(target_list):
    if preds[idx] > 0.5:
        predictions.append((label,preds[idx]))

predictions

[('Statistics', 1.27623)]