<a href="https://colab.research.google.com/github/gupta24789/sentiment-analysis/blob/main/07_logistic_regression_bert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -q transformers
!pip install -q  pytorch-lightning

In [2]:
import pandas as pd
import numpy as np
import itertools

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader, TensorDataset, RandomSampler, SequentialSampler

from transformers import AutoTokenizer, AlbertModel
import pytorch_lightning as pl
from torchmetrics import Accuracy

In [3]:
## set seed
# Single named constant instead of repeating the literal in every call.
SEED = 121
# seed_everything seeds Python's `random`, NumPy and PyTorch in one call, so
# separate np.random.seed / torch.manual_seed calls are not needed. It returns
# the seed, which is the value this cell displays.
pl.seed_everything(SEED)

INFO:lightning_fabric.utilities.seed:Seed set to 121


121

## Read Data

In [4]:
# For each split: read the CSV, discard the pre-processed text column (only the
# raw tweet is used later), remove rows with missing values, and renumber the
# remaining rows from zero.
train_df = (
    pd.read_csv("https://raw.githubusercontent.com/gupta24789/sentiment-analysis/main/data/train.csv")
    .drop(columns="processed_tweet")
    .dropna()
    .reset_index(drop=True)
)
val_df = (
    pd.read_csv("https://raw.githubusercontent.com/gupta24789/sentiment-analysis/main/data/val.csv")
    .drop(columns="processed_tweet")
    .dropna()
    .reset_index(drop=True)
)

In [5]:
# Preview the first rows to confirm the expected columns were loaded.
train_df.head(n=3)

Unnamed: 0,raw_tweet,label
0,Want to say a huge thanks to @WarriorAssaultS ...,1.0
1,@jaynehh_ you just need a job and get a letter...,1.0
2,"@knhillrocks HA yes, make it quick tho :D",1.0


In [6]:
# Class balance of the training split.
train_df["label"].value_counts()

1.0    4000
0.0    4000
Name: label, dtype: int64

In [7]:
# Class balance of the validation split.
val_df["label"].value_counts()

1    1000
0    1000
Name: label, dtype: int64

In [8]:
# Pull the text and label columns out of each split as plain Python lists.
train_x, train_y = (train_df[col].tolist() for col in ('raw_tweet', 'label'))
val_x, val_y = (val_df[col].tolist() for col in ('raw_tweet', 'label'))

In [9]:
# The DataFrames are not referenced again after the lists were extracted.
del train_df
del val_df

## Config

In [10]:
# Use the first CUDA device when one is available, otherwise stay on the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda:0"
else:
    DEVICE = "cpu"
print(f"Device : {DEVICE}")

# Hugging Face checkpoint name used for both the tokenizer and the encoder.
MODEL_NAME = "albert-base-v2"
# Every tweet is padded or truncated to this many tokens.
MAX_LEN = 100

Device : cuda:0


## DataLoader

In [11]:
class SentimentDataset(Dataset):
  """Map-style dataset that tokenizes one tweet per item.

  Args:
    features: sequence of raw tweet strings.
    labels: sequence of numeric labels aligned with ``features``.
    tokenizer: optional ready-made tokenizer. When omitted, the tokenizer for
      ``MODEL_NAME`` is loaded. Passing one lets several datasets share a
      single tokenizer instead of each loading its own.
    max_len: optional padded/truncated sequence length. Defaults to ``MAX_LEN``.

  Raises:
    ValueError: if ``features`` and ``labels`` differ in length.
  """

  def __init__(self, features, labels, tokenizer=None, max_len=None):
    if len(features) != len(labels):
      raise ValueError(
        f"features and labels must have the same length, got {len(features)} and {len(labels)}"
      )
    if tokenizer is None:
      tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, do_lower_case=True)
    self.tokenizer = tokenizer
    # Resolve the module-level default lazily so an explicit value never touches the global.
    self.max_len = MAX_LEN if max_len is None else max_len
    self.features = features
    self.labels = labels

  def __getitem__(self, index):
    """Return ``(inputs, label)`` for one tweet.

    ``inputs`` maps each tokenizer output name to a 1-D tensor of length
    ``max_len``; ``label`` is a float32 scalar tensor.
    """
    feature = self.features[index]
    label = torch.tensor(self.labels[index], dtype = torch.float32)
    ## get the output from tokenizer
    # Calling the tokenizer directly replaces the deprecated `encode_plus`.
    encoded = self.tokenizer(feature, padding='max_length', max_length=self.max_len, truncation=True, return_tensors='pt')
    # return_tensors='pt' yields shape (1, max_len); drop the leading axis so
    # the DataLoader can stack items into (batch, max_len).
    inputs = {k: torch.squeeze(v, dim = 0) for k,v in encoded.items()}
    return (inputs,label)

  def __len__(self):
    """Number of tweets in the dataset."""
    return len(self.features)

In [12]:
BATCH_SIZE = 64

# Wrap each split in a dataset, then a loader; only the training loader shuffles.
train_ds, val_ds = SentimentDataset(train_x, train_y), SentimentDataset(val_x, val_y)
train_dl = DataLoader(dataset=train_ds, batch_size=BATCH_SIZE, shuffle=True)
val_dl = DataLoader(dataset=val_ds, batch_size=BATCH_SIZE, shuffle=False)

In [13]:
# Draw a single batch and show the shape of each collated input tensor.
example = next(iter(train_dl))
inputs, label = example
tuple(inputs[key].shape for key in ('input_ids', 'token_type_ids', 'attention_mask'))

(torch.Size([64, 100]), torch.Size([64, 100]), torch.Size([64, 100]))

## Model

In [14]:
## Model
class SentimentModel(pl.LightningModule):
  """Binary sentiment classifier: a frozen pretrained ALBERT encoder feeding a
  small trainable two-layer head that ends in a sigmoid.

  Args:
    in_features: size of the encoder's pooled output vector (input width of
      the first dense layer).
    learning_rate: learning rate passed to the Adam optimizer.
  """

  # Input tensors every training batch is expected to carry.
  _INPUT_KEYS = ('input_ids', 'token_type_ids', 'attention_mask')

  def __init__(self, in_features, learning_rate):
    super().__init__()
    # Record the constructor arguments in checkpoints so load_from_checkpoint
    # can rebuild the model without them being passed again (passing them
    # explicitly still works).
    self.save_hyperparameters()
    self.learning_rate = learning_rate

    ## pretrained model
    self.bert_model = AlbertModel.from_pretrained(MODEL_NAME)
    # Freeze the encoder: only the dense head below is trained.
    for param in self.bert_model.parameters():
      param.requires_grad = False

    ## Define Model
    self.num_classes = 1
    self.hidden_unit = 32
    self.dense1 = nn.Linear(in_features= in_features, out_features= self.hidden_unit)
    self.relu = nn.ReLU()
    self.dense2 = nn.Linear(in_features= self.hidden_unit, out_features= self.num_classes)
    self.sigmoid = nn.Sigmoid()

    ## define loss
    # BCELoss expects probabilities, which is what forward() returns.
    self.loss_fn = nn.BCELoss()
    ## define metrics
    self.train_accuracy = Accuracy(task = "binary", num_classes = 2, threshold= 0.5)
    self.val_accuracy = Accuracy(task = "binary", num_classes = 2, threshold= 0.5)


  def forward(self,inputs, verbose = False):
    """Return one positive-class probability per example.

    Args:
      inputs: mapping of tokenizer outputs, unpacked as keyword arguments into
        the encoder.
      verbose: when True, print the shape of each intermediate tensor.

    Returns:
      1-D tensor of sigmoid outputs, one per example in the batch.
    """
    emb = self.bert_model(**inputs)['pooler_output']
    out_dense1 = self.dense1(emb)
    out = self.relu(out_dense1)
    out_dense2 = self.dense2(out)
    out_sigmoid = self.sigmoid(out_dense2)

    if verbose:
      print(f"Bert shape : {emb.shape}")
      print(f"Dense-1 shape : {out_dense1.shape}")
      print(f"Dense-2 shape : {out_dense2.shape}")
      print(f"logits shape : {out_sigmoid.shape}")

    # (batch, 1) -> (batch,) so the output lines up with the label tensor.
    output = torch.squeeze(out_sigmoid, dim = 1)
    return output

  @classmethod
  def _check_batch(cls, inputs, label):
    """Assert that every input tensor is 2-D and shares the label's batch size.

    The check is relative to the batch actually received rather than to a
    global batch size, so a smaller final batch is accepted.
    """
    assert label.dim() == 1, "Train : label size mismatch"
    batch_size = label.shape[0]
    for key in cls._INPUT_KEYS:
      shape = inputs[key].shape
      assert len(shape) == 2 and shape[0] == batch_size, f"Train : {key} size mismatch"

  def training_step(self, batch, batch_idx):
    """Compute the loss for one training batch and log loss/accuracy per epoch."""
    inputs, label = batch[0], batch[1]
    self._check_batch(inputs, label)

    # forward() already applies the sigmoid, so these are probabilities.
    probs = self(inputs)
    loss = self.loss_fn(probs, label)
    self.train_accuracy(probs,label)
    self.log_dict({"train_loss": loss, "train_accuracy": self.train_accuracy}, on_step = False, on_epoch = True, prog_bar=True)
    return loss

  def validation_step(self, batch, batch_idx):
    """Compute the loss for one validation batch and log loss/accuracy per epoch."""
    inputs, label = batch[0], batch[1]
    probs = self(inputs)
    loss = self.loss_fn(probs, label)
    self.val_accuracy(probs,label)
    self.log_dict({"val_loss": loss,  "val_accuracy": self.val_accuracy}, on_step = False, on_epoch = True, prog_bar = True)

    return loss

  def on_train_epoch_end(self):
    """Clear the accumulated training-accuracy state at the end of each epoch."""
    self.train_accuracy.reset()

  def on_validation_epoch_end(self):
     """Print the epoch's validation accuracy, then clear the metric state."""
     print(f"Epoch : {self.current_epoch} Validation Accuracy : {self.val_accuracy.compute()}")
     self.val_accuracy.reset()

  def configure_optimizers(self):
     """Return the Adam optimizer configured with ``self.learning_rate``."""
     optimizer = optim.Adam(self.parameters(), lr =self.learning_rate)
     return optimizer

## Test Model & workflow

In [15]:
# Smoke test on the sample batch drawn earlier: one forward pass with shape
# tracing, then the loss of the still-untrained head. A value near
# ln(2) ~= 0.693 is what an uninformed binary classifier produces.
model = SentimentModel(learning_rate=0.001, in_features=768)

logits = model(inputs, verbose=True)
print(f"Logits : {logits}")
print(f"Loss : {model.loss_fn(logits, label)}")

Bert shape : torch.Size([64, 768])
Dense-1 shape : torch.Size([64, 32])
Dense-2 shape : torch.Size([64, 1])
logits shape : torch.Size([64, 1])
Logits : tensor([0.4248, 0.4311, 0.4272, 0.4316, 0.4525, 0.4556, 0.4265, 0.4292, 0.4218,
        0.4214, 0.4561, 0.4326, 0.4294, 0.4618, 0.4396, 0.4459, 0.4361, 0.4450,
        0.4589, 0.4260, 0.4528, 0.4563, 0.4274, 0.4526, 0.4899, 0.4347, 0.4239,
        0.4414, 0.4250, 0.4419, 0.4438, 0.4443, 0.4650, 0.4513, 0.4429, 0.4225,
        0.5111, 0.4233, 0.4377, 0.4382, 0.4277, 0.4331, 0.4470, 0.4261, 0.4353,
        0.4322, 0.4222, 0.4524, 0.4309, 0.4447, 0.4534, 0.4470, 0.4516, 0.5000,
        0.4321, 0.4385, 0.4564, 0.4273, 0.4359, 0.4351, 0.4688, 0.4390, 0.4286,
        0.4478], grad_fn=<SqueezeBackward1>)
Loss : 0.7035216689109802


## Train Model

In [16]:
## logger
# Fresh, untrained model for the real training run.
model = SentimentModel(learning_rate=0.001, in_features=768)

## logger
# Metrics are written as CSV files under logs/sentiment_analysis/.
logger = pl.loggers.CSVLogger(save_dir="logs", name="sentiment_analysis")

## checkpoints
# save_top_k=-1 keeps the checkpoint from every epoch instead of only the best.
checkpoint_callback = pl.callbacks.ModelCheckpoint(
    monitor='val_loss',
    save_top_k=-1,
    every_n_epochs=1,
    filename='{epoch}-{val_loss:.2f}-{val_accuracy:.2f}',
)

trainer = pl.Trainer(
    logger=logger,
    callbacks=[checkpoint_callback],
    max_epochs=6,
    check_val_every_n_epoch=1,
    accelerator="auto",
)

## Train the Model
trainer.fit(model, train_dl, val_dl)

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name           | Type           | Params
--------------------------------------------------
0 | bert_model     | AlbertModel    | 11.7 M
1 | dense1         | Linear         | 24.6 K
2 | relu           | ReLU           | 0     
3 | dense2         | Linear         | 33    
4 | sigmoid        | Sigmoid        | 0     
5 | loss_fn        | BCELoss        | 0     
6 | train_accuracy | BinaryAccuracy | 0     
7 | val_accuracy   | BinaryAccuracy | 0     
--------------------------------------------------
24.6 K    Trainable params
11.7 M    N

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=1` in the `DataLoader` to improve performance.


Epoch : 0 Validation Accuracy : 0.65625


/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=1` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Epoch : 0 Validation Accuracy : 0.7005000114440918


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch : 1 Validation Accuracy : 0.7770000100135803


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch : 2 Validation Accuracy : 0.7954999804496765


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch : 3 Validation Accuracy : 0.8245000243186951


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch : 4 Validation Accuracy : 0.8550000190734863


Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=6` reached.


Epoch : 5 Validation Accuracy : 0.8730000257492065


**Note**: If the data contains blank or missing labels, training will fail after some steps with `RuntimeError: CUDA error: device-side assert triggered`. If you run into this CUDA error, check the following:

1. Make sure the data has no missing values.
2. Test your code on the CPU.

## Load the model from checkpoints

In [17]:
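# Optional: restore a trained model from a checkpoint written during training.
# The path below is only an example -- replace it with a file from your own logs/ directory.
# model = SentimentModel.load_from_checkpoint("logs/sentiment_analysis/version_6/checkpoints/epoch=9-val_loss=0.03-val_accuracy=0.99.ckpt",
#                   in_features = 768, learning_rate=0.001)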

## Predict

In [18]:
# Switch the trained model to evaluation mode before running predictions.
model = model.eval()

In [19]:
# Load the tokenizer with the same checkpoint name and options the datasets use.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, do_lower_case=True)

In [24]:
tweet = "I love this movies"
# Tokenize straight to (1, seq_len) tensors, truncating the same way the training data is.
inputs = dict(tokenizer(tweet, truncation=True, max_length=MAX_LEN, return_tensors="pt"))
# Inference only: no need to track gradients for the trainable head.
with torch.no_grad():
  preds = model(inputs).item()
# NOTE(review): 0.55 differs from the 0.5 threshold used by the accuracy metrics -- confirm this is intentional.
int(preds>0.55)

1

In [26]:
tweet = "I hate this movies"
# Tokenize straight to (1, seq_len) tensors, truncating the same way the training data is.
inputs = dict(tokenizer(tweet, truncation=True, max_length=MAX_LEN, return_tensors="pt"))
# Inference only: no need to track gradients for the trainable head.
with torch.no_grad():
  preds = model(inputs).item()
# NOTE(review): 0.55 differs from the 0.5 threshold used by the accuracy metrics -- confirm this is intentional.
int(preds>0.55)

0

In [27]:
tweet = "thank you :)"
# Tokenize straight to (1, seq_len) tensors, truncating the same way the training data is.
inputs = dict(tokenizer(tweet, truncation=True, max_length=MAX_LEN, return_tensors="pt"))
# Inference only: no need to track gradients for the trainable head.
with torch.no_grad():
  preds = model(inputs).item()
# NOTE(review): 0.55 differs from the 0.5 threshold used by the accuracy metrics -- confirm this is intentional.
int(preds>0.55)

1