<a href="https://colab.research.google.com/github/gupta24789/sentiment-analysis/blob/main/07_logistic_regression_bert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [60]:
# !pip install -q transformers
# !pip install -q  pytorch-lightning
# !pip install -q neattext

In [20]:
import pandas as pd
import numpy as np
import itertools
import neattext as nt
from neattext.functions import clean_text

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader, TensorDataset, RandomSampler, SequentialSampler

from transformers import AutoTokenizer, AlbertModel
import pytorch_lightning as pl
from torchmetrics import Accuracy

In [21]:
## Fix every random number generator so repeated runs give the same results
SEED = 121
np.random.seed(SEED)
torch.manual_seed(SEED)
## last expression: seed_everything returns the seed, which the cell displays
pl.seed_everything(SEED)

INFO:lightning_fabric.utilities.seed:Seed set to 121


121

## Read Data

In [22]:
## Each split: download the csv, discard the pre-processed text column,
## remove rows with missing values and renumber the index from 0.
train_df = (
    pd.read_csv("https://raw.githubusercontent.com/gupta24789/sentiment-analysis/main/data/train.csv")
    .drop(columns="processed_tweet")
    .dropna()
    .reset_index(drop=True)
)
val_df = (
    pd.read_csv("https://raw.githubusercontent.com/gupta24789/sentiment-analysis/main/data/val.csv")
    .drop(columns="processed_tweet")
    .dropna()
    .reset_index(drop=True)
)

In [23]:
## Peek at the first three raw training rows
train_df.head(n=3)

Unnamed: 0,raw_tweet,label
0,Want to say a huge thanks to @WarriorAssaultS ...,1.0
1,@jaynehh_ you just need a job and get a letter...,1.0
2,"@knhillrocks HA yes, make it quick tho :D",1.0


In [24]:
## Class balance of the training split
train_df["label"].value_counts()

1.0    4000
0.0    4000
Name: label, dtype: int64

In [25]:
## Class balance of the validation split
val_df["label"].value_counts()

1    1000
0    1000
Name: label, dtype: int64

In [26]:
## Clean the data
def custom_clean_text(x):
  """Return a lower-cased, cleaned copy of the tweet text `x`.

  The text is wrapped in a neattext TextFrame and the removal steps are
  applied in this order: stopwords, urls, emails, dates, punctuation,
  numbers, user handles, repeated spaces.
  """
  frame = nt.TextFrame(x)
  cleaned = (
      frame.remove_stopwords()
           .remove_urls()
           .remove_emails()
           .remove_dates()
           .remove_puncts()
           .remove_numbers()
           .remove_userhandles()
           .remove_multiple_spaces()
  )
  return cleaned.text.lower()

## Overwrite raw_tweet in both splits with its cleaned version
train_df['raw_tweet'] = train_df['raw_tweet'].apply(custom_clean_text)
val_df['raw_tweet'] = val_df['raw_tweet'].apply(custom_clean_text)

In [27]:
## Same three rows after cleaning
train_df.head(n=3)

Unnamed: 0,raw_tweet,label
0,want huge thanks #ff thanks support :),1.0
1,need job letter work place saying work letter...,1.0
2,ha yes quick tho :d,1.0


In [28]:
## Pull texts and labels out of the frames as plain Python lists
train_x, train_y = (train_df[col].tolist() for col in ('raw_tweet', 'label'))
val_x, val_y = (val_df[col].tolist() for col in ('raw_tweet', 'label'))

In [29]:
## The frames are no longer needed once the lists exist
del train_df
del val_df

## Config

In [30]:
## Pretrained checkpoint and the fixed token length every tweet is padded/truncated to
MODEL_NAME = "albert-base-v2"
MAX_LEN = 100

## Use the first GPU when one is visible, otherwise stay on CPU
DEVICE = "cpu"
if torch.cuda.is_available():
  DEVICE = "cuda:0"
print(f"Device : {DEVICE}")

Device : cuda:0


## DataLoader

In [31]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, do_lower_case=True)

## Both splits are encoded identically: padded/truncated to MAX_LEN, returned as torch tensors
encode_kwargs = dict(
    padding='max_length',
    max_length=MAX_LEN,
    truncation=True,
    return_tensors='pt',
)
train_encoded = tokenizer(train_x, **encode_kwargs)
val_encoded = tokenizer(val_x, **encode_kwargs)

In [32]:
## Store the targets next to the encodings; float32 because the model is trained with BCELoss
train_encoded['label'] = torch.tensor(
    train_y,
    dtype=torch.float32,
)
val_encoded['label'] = torch.tensor(
    val_y,
    dtype=torch.float32,
)

In [33]:
## Texts and labels now live inside the encodings; drop the list copies
del train_x, train_y
del val_x, val_y

In [34]:
## The key order fixes the order in which each batch is unpacked later on
tensor_keys = ('input_ids', 'token_type_ids', 'attention_mask', 'label')
train_ds = TensorDataset(*(train_encoded[key] for key in tensor_keys))
val_ds = TensorDataset(*(val_encoded[key] for key in tensor_keys))

In [35]:
BATCH_SIZE = 32

## Only the training batches are shuffled; validation keeps the dataset order
train_dl = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
val_dl = DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=False)

In [36]:
## Grab one training batch and show the shape of each of its four tensors
example = next(iter(train_dl))
input_ids, token_type_ids, attention_mask, label = example
tuple(tensor.shape for tensor in example)

(torch.Size([32, 100]),
 torch.Size([32, 100]),
 torch.Size([32, 100]),
 torch.Size([32]))

## Model

In [41]:
## Model
class SentimentModel(pl.LightningModule):
  """Binary sentiment classifier: frozen ALBERT encoder + small trainable head.

  The encoder's `pooler_output` is passed through
  Linear(in_features -> 64) -> ReLU -> Linear(64 -> 1) -> Sigmoid, so
  `forward` returns one probability per example. Only the two Linear
  layers are trained; every encoder parameter has `requires_grad=False`.

  Args:
    in_features: width of the encoder's `pooler_output` (input size of `dense1`).
    learning_rate: learning rate handed to the Adam optimizer.
  """

  def __init__(self, in_features, learning_rate):
    super().__init__()
    self.learning_rate = learning_rate

    ## pretrained model, frozen so that it acts as a fixed feature extractor
    self.bert_model = AlbertModel.from_pretrained(MODEL_NAME)
    for param in self.bert_model.parameters():
      param.requires_grad = False

    ## Define Model
    self.num_classes = 1   # a single output unit: probability of the positive class
    self.hidden_unit = 64
    self.dense1 = nn.Linear(in_features= in_features, out_features= self.hidden_unit)
    self.relu = nn.ReLU()
    self.dense2 = nn.Linear(in_features= self.hidden_unit, out_features= self.num_classes)
    self.sigmoid = nn.Sigmoid()

    ## define loss (expects probabilities, i.e. the sigmoid output)
    self.loss_fn = nn.BCELoss()
    ## define metrics (one instance per split so their states never mix)
    self.train_accuracy = Accuracy(task = "binary", num_classes = 2, threshold= 0.5)
    self.val_accuracy = Accuracy(task = "binary", num_classes = 2, threshold= 0.5)


  def forward(self,input_ids, token_type_ids, attention_mask, verbose = False):
    """Return the positive-class probability for each example in the batch.

    Args:
      input_ids, token_type_ids, attention_mask: tokenizer outputs, passed
        unchanged to the encoder.
      verbose: when True, print the shape of every intermediate tensor.

    Returns:
      Tensor of sigmoid outputs with the trailing size-1 dimension squeezed away.
    """
    emb = self.bert_model(input_ids = input_ids, token_type_ids = token_type_ids, attention_mask = attention_mask)['pooler_output']
    out_dense1 = self.dense1(emb)
    out = self.relu(out_dense1)
    out_dense2 = self.dense2(out)
    out_sigmoid = self.sigmoid(out_dense2)

    if verbose:
      print(f"Bert shape : {emb.shape}")
      print(f"Dense-1 shape : {out_dense1.shape}")
      print(f"Dense-2 shape : {out_dense2.shape}")
      print(f"logits shape : {out_sigmoid.shape}")

    ## (batch, 1) -> (batch,) so the output lines up with the label tensor
    output = torch.squeeze(out_sigmoid, dim = 1)
    return output

  def training_step(self, batch, batch_idx):
    """Compute the loss for one training batch and accumulate train accuracy."""
    input_ids,token_type_ids,attention_mask, label = batch
    probs = self(input_ids,token_type_ids,attention_mask)
    loss = self.loss_fn(probs, label)
    ## only accumulate here; the epoch value is computed, logged and reset
    ## in on_train_epoch_end
    self.train_accuracy.update(probs, label)
    self.log("train_loss", loss, on_step = False, on_epoch = True, prog_bar=True)
    return loss

  def validation_step(self, batch, batch_idx):
    """Compute the loss for one validation batch and accumulate val accuracy."""
    input_ids,token_type_ids,attention_mask, label = batch
    probs = self(input_ids,token_type_ids,attention_mask)
    loss = self.loss_fn(probs, label)
    ## only accumulate here; see on_validation_epoch_end
    self.val_accuracy.update(probs, label)
    self.log("val_loss", loss, on_step = False, on_epoch = True, prog_bar = True)

    return loss

  def on_train_epoch_end(self):
    """Log the accuracy accumulated over the epoch, then clear the metric state."""
    self.log("train_accuracy", self.train_accuracy.compute(), prog_bar = True)
    self.train_accuracy.reset()

  def on_validation_epoch_end(self):
    """Log and print the epoch's validation accuracy, then clear the metric state."""
    ## compute once, before the reset, so the logged value (used in the
    ## checkpoint filename) and the printed value are the same number
    val_acc = self.val_accuracy.compute()
    self.log("val_accuracy", val_acc, prog_bar = True)
    print(f"Epoch : {self.current_epoch} Validation Accuracy : {val_acc}")
    self.val_accuracy.reset()

  def configure_optimizers(self):
    """Return an Adam optimizer over the module's parameters."""
    optimizer = optim.Adam(self.parameters(), lr =self.learning_rate)
    return optimizer

## Test Model & workflow

In [42]:
# model = SentimentModel(in_features = 768, learning_rate=0.001)
# logits = model(input_ids,token_type_ids,attention_mask,verbose = True)
# print(f"Logits : {logits}")
# print(f"Loss : {model.loss_fn(logits, label)}")

## Train Model

In [43]:
## logger: metrics are written as csv under logs/sentiment_analysis/
logger = pl.loggers.CSVLogger("logs", name="sentiment_analysis")

## checkpoints: save_top_k = -1 keeps the checkpoint of every epoch
checkpoint_callback  = pl.callbacks.ModelCheckpoint(
                                                filename='{epoch}-{val_loss:.2f}-{val_accuracy:.2f}',
                                                every_n_epochs = 1,
                                                save_top_k = -1,
                                                monitor='val_loss',
                                                )

## in_features = 768 is the input width of dense1 (768 * 64 + 64 = 49.2 K params in the summary)
model = SentimentModel(in_features = 768, learning_rate=0.001)

## pick the accelerator the same way the Config cell picks DEVICE, so the
## notebook also runs on machines without a GPU
accelerator = "cuda" if torch.cuda.is_available() else "cpu"
trainer = pl.Trainer(accelerator=accelerator,
                     max_epochs = 5,
                     check_val_every_n_epoch=1,
                     callbacks=[checkpoint_callback],
                     logger=logger
                    )

## Train the Model
trainer.fit(model, train_dl, val_dl)

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name           | Type           | Params
--------------------------------------------------
0 | bert_model     | AlbertModel    | 11.7 M
1 | dense1         | Linear         | 49.2 K
2 | relu           | ReLU           | 0     
3 | dense2         | Linear         | 65    
4 | sigmoid        | Sigmoid        | 0     
5 | loss_fn        | BCELoss        | 0     
6 | train_accuracy | BinaryAccuracy | 0     
7 | val_accuracy   | BinaryAccuracy | 0     
--------------------------------------------------
49.3 K    Trainable params
11.7 M    N

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Epoch : 0 Validation Accuracy : 1.0


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Epoch : 0 Validation Accuracy : 0.8585000038146973


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch : 1 Validation Accuracy : 0.8679999709129333


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch : 2 Validation Accuracy : 0.9125000238418579


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch : 3 Validation Accuracy : 0.9225000143051147


Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=5` reached.


Epoch : 4 Validation Accuracy : 0.9309999942779541


**Note** : If the data contains any blank or missing labels, training will fail after some steps with `RuntimeError: CUDA error: device-side assert triggered`. If you get this CUDA error, check the following:

1. Make sure there are no missing values in the data
2. Test your code on CPU

## Load the model from checkpoints

In [45]:
# model = SentimentModel.load_from_checkpoint("logs/sentiment_analysis/version_6/checkpoints/epoch=9-val_loss=0.93-val_accuracy=0.93.ckpt",
#                   in_features = 768, learning_rate=0.001)

## Predict

In [46]:
## Put the trained model into evaluation mode before predicting.
## The result is assigned back to `model`, so the cell displays nothing.
model = model.eval()

In [47]:
## Re-create the tokenizer for the prediction cells (same checkpoint as in the DataLoader section)
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME,
    do_lower_case=True,
)

In [51]:
## Show what the tokenizer returns for a single tweet.
## `inputs` used to be defined only by the prediction cells further down, so this
## cell raised NameError on a fresh kernel; it now builds the value itself.
## NOTE(review): the sample tweet is an assumption (it matches the token count of
## the saved output) - confirm or replace.
inputs = dict(tokenizer("thank you :)", return_tensors='pt'))
inputs

{'input_ids': tensor([[   2, 3531,   42,   13,   45,    6,    3]]),
 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0]]),
 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1]])}

In [54]:
tweet = "I love this movies"
## clean the tweet exactly like the training data before tokenizing, and put
## the tensors on the same device as the model
inputs = tokenizer(custom_clean_text(tweet), return_tensors='pt').to(model.device)
input_ids, token_type_ids, attention_mask = inputs['input_ids'],inputs['token_type_ids'],inputs['attention_mask']
## inference only: no gradients needed
with torch.no_grad():
  preds = model(input_ids,token_type_ids,attention_mask).item()
## 1 = positive, 0 = negative (same 0.5 threshold as the accuracy metric)
int(preds>0.5)

1

In [57]:
tweet = "I hate this movies :("
## clean the tweet exactly like the training data before tokenizing, and put
## the tensors on the same device as the model
inputs = tokenizer(custom_clean_text(tweet), return_tensors='pt').to(model.device)
input_ids, token_type_ids, attention_mask = inputs['input_ids'],inputs['token_type_ids'],inputs['attention_mask']
## inference only: no gradients needed
with torch.no_grad():
  preds = model(input_ids, token_type_ids, attention_mask).item()
## 1 = positive, 0 = negative (same 0.5 threshold as the accuracy metric)
int(preds>0.5)

0

In [58]:
tweet = "thank you :)"
## clean the tweet exactly like the training data before tokenizing, and put
## the tensors on the same device as the model
inputs = tokenizer(custom_clean_text(tweet), return_tensors='pt').to(model.device)
input_ids, token_type_ids, attention_mask = inputs['input_ids'],inputs['token_type_ids'],inputs['attention_mask']
## inference only: no gradients needed
with torch.no_grad():
  preds = model(input_ids, token_type_ids, attention_mask).item()
## 1 = positive, 0 = negative (same 0.5 threshold as the accuracy metric)
int(preds>0.5)

1

In [59]:
## Note : To improve the accuracy, you can clean the raw_text and then run the same model.