<a href="https://colab.research.google.com/github/gupta24789/hugging-face/blob/main/04_sentiment_analysis_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os

# Process-wide settings, applied before the heavy ML imports run:
#   CUDA_VISIBLE_DEVICES   -> expose only device index 0 to CUDA
#   TOKENIZERS_PARALLELISM -> presumably disables tokenizers' own thread pool
#                             ("0" is treated as a falsy value) — TODO confirm
os.environ.update({
    'CUDA_VISIBLE_DEVICES': "0",
    'TOKENIZERS_PARALLELISM': "0",
})

In [None]:
import evaluate
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
from transformers import pipeline
from datasets import load_dataset, Features, ClassLabel, Value
from transformers import AutoTokenizer, AutoModel, TrainingArguments, Trainer, AutoModelForSequenceClassification

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl
import torchmetrics

## Load Dataset

In [None]:
# First load, letting `datasets` infer the column types from the CSV files.
split_files = {"train": "train.csv", "test": "test.csv"}
dataset = load_dataset("sg247/binary-classification", data_files=split_files)
dataset

DatasetDict({
    train: Dataset({
        features: ['tweet', 'label'],
        num_rows: 8004
    })
    test: Dataset({
        features: ['tweet', 'label'],
        num_rows: 2000
    })
})

In [None]:
## Label must be of type ClassLabel
# Inspect the inferred schema of the train split; the output below shows
# `label` was read as a float64 Value, which is why the data is reloaded
# with an explicit schema in the next cell.
dataset['train'].features

{'tweet': Value(dtype='string', id=None),
 'label': Value(dtype='float64', id=None)}

In [None]:
## The label must be of ClassLabel type
# Reload the same CSV files with an explicit schema so that `label` becomes a
# two-class ClassLabel instead of the inferred float64 column.
label_feature = ClassLabel(num_classes=2, names=[0,1])
features = Features({"tweet": Value(dtype="string"), "label": label_feature})
dataset = load_dataset(
    "sg247/binary-classification",
    data_files={"train": "train.csv", "test": "test.csv"},
    features=features,
)
dataset

DatasetDict({
    train: Dataset({
        features: ['tweet', 'label'],
        num_rows: 8004
    })
    test: Dataset({
        features: ['tweet', 'label'],
        num_rows: 2000
    })
})

In [None]:
# Re-check the schema: `label` should now be reported as a ClassLabel.
dataset['train'].features

{'tweet': Value(dtype='string', id=None),
 'label': ClassLabel(names=[0, 1], id=None)}

## Remove NA from the data

In [None]:
def _is_usable(example):
    """Return True for rows that have a label and a non-empty tweet."""
    text = example['tweet']
    if text is None or example['label'] is None:
        return False
    return len(text) > 0


# Applied to every split of the DatasetDict; rows failing the check are dropped.
dataset = dataset.filter(_is_usable)
dataset

DatasetDict({
    train: Dataset({
        features: ['tweet', 'label'],
        num_rows: 8000
    })
    test: Dataset({
        features: ['tweet', 'label'],
        num_rows: 2000
    })
})

In [None]:
# Show the first training example (a dict with the tweet text and its label).
dataset['train'][0]

{'tweet': 'Want to say a huge thanks to @WarriorAssaultS @uktac @BolleSafety @Mechanix_Wear @Airtech_Studios @Hexmags #FF Thanks for the support :)',
 'label': 1}

## Tokenize Tweets

In [None]:
def tokenize_tweet(row, max_length=50):
    """Tokenize the ``tweet`` field of a dataset row.

    Relies on the module-level ``tokenizer`` being defined before the first
    call (it is looked up at call time, not at definition time).

    Args:
        row: Mapping with a ``'tweet'`` entry. Whatever is stored there is
            passed straight to the tokenizer.
        max_length: Length every sequence is padded / truncated to.
            Defaults to 50, the value previously hard-coded here.

    Returns:
        Whatever ``tokenizer(...)`` returns for the given text.
    """
    return tokenizer(
        row['tweet'],
        padding='max_length',
        truncation=True,
        max_length=max_length,
    )

# Checkpoint name shared by the tokenizer here and the encoder inside the model.
model_name = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)

# batched=True hands `tokenize_tweet` a batch whose 'tweet' entry is a list of
# strings, so the tokenizer encodes many tweets per call instead of one.
# The resulting columns are the same as with per-example mapping.
tokenized_datasets = dataset.map(tokenize_tweet, batched=True)
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['tweet', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 8000
    })
    test: Dataset({
        features: ['tweet', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2000
    })
})

In [None]:
## Drop the raw text column and rename `label` to `labels`
# NOTE: this cell reassigns `tokenized_datasets`, so it can only be run once
# per kernel session — a second run fails because 'tweet' is already gone.
tokenized_datasets = (
    tokenized_datasets
    .remove_columns('tweet')
    .rename_columns({"label": "labels"})
)
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 8000
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2000
    })
})

In [None]:
## create train and test dataset
# The train split is shuffled with a fixed seed so the order is reproducible;
# the test split is used unchanged as the evaluation set.
train_dataset = tokenized_datasets["train"].shuffle(seed=42)
eval_dataset = tokenized_datasets["test"]

In [None]:
# Display the shuffled training split (column names and row count).
train_dataset

Dataset({
    features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 8000
})

## Data Loaders

In [None]:
def custom_collator(batch):
    """Stack a list of tokenized examples into a dict of tensors.

    Args:
        batch: Sequence of mappings, each providing ``labels``, ``input_ids``,
            ``token_type_ids`` and ``attention_mask``.

    Returns:
        Dict with those four keys (in that order); each value is the
        ``torch.tensor`` built from the per-example values of that field.
    """
    field_order = ("labels", "input_ids", "token_type_ids", "attention_mask")
    return {
        field: torch.tensor([example[field] for example in batch])
        for field in field_order
    }

## Build Model

In [None]:
# NOTE(review): exploratory leftovers — these two attribute lookups only make
# the notebook echo the Recall class and have no effect on the pipeline.
torchmetrics.Precision
torchmetrics.Recall

torchmetrics.classification.precision_recall.Recall

In [None]:
class SentimentModel(pl.LightningModule):
    """Pretrained transformer encoder with a linear classification head.

    The encoder is loaded with ``AutoModel.from_pretrained(model_name)``
    (``model_name`` is a module-level variable) and its ``pooler_output`` is
    projected to ``output_dim`` logits. Training uses cross-entropy loss and
    Adam. At the end of every validation epoch a one-row summary table is
    printed and the accuracy is logged as ``val_accuracy``.

    Args:
        output_dim: Number of logits produced by the classification head.
        learning_rate: Learning rate handed to the Adam optimizer.
    """

    def __init__(self, output_dim, learning_rate):
        super().__init__()

        ## layers
        self.learning_rate = learning_rate
        self.transformer = AutoModel.from_pretrained(model_name)
        self.linear = nn.Linear(self.transformer.config.hidden_size, output_dim)

        ## loss
        self.loss_fn = nn.CrossEntropyLoss()
        # Per-batch losses collected between two validation epochs; only used
        # for the printed summary table.
        self.train_loss = []
        self.val_loss = []

        ## metrics
        self.accuracy = torchmetrics.Accuracy(task = 'binary', num_classes = output_dim)
        self.f1_score = torchmetrics.F1Score(task = 'binary', num_classes = output_dim)
        self.precision = torchmetrics.Precision(task = 'binary', num_classes = output_dim)
        self.recall = torchmetrics.Recall(task = 'binary', num_classes = output_dim)

    @staticmethod
    def _split_batch(batch):
        """Return ``(labels, inputs)`` without modifying ``batch`` itself."""
        labels = batch['labels']
        inputs = {key: value for key, value in batch.items() if key != 'labels'}
        return labels, inputs

    @staticmethod
    def _mean_or_nan(values):
        """Mean of ``values``, or NaN for an empty list.

        ``np.mean([])`` also yields NaN but emits RuntimeWarnings; this happens
        during the sanity-check validation pass, before any training loss has
        been recorded.
        """
        if len(values) == 0:
            return float('nan')
        return float(np.mean(values))

    def compute_metrics(self, logits, references):
        """Update all validation metrics with the argmax predictions of ``logits``."""
        predictions = torch.argmax(logits, axis = 1)
        self.accuracy(predictions, references)
        self.f1_score(predictions, references)
        self.precision(predictions, references)
        self.recall(predictions, references)

    def forward(self, inputs):
        """Run the encoder on ``inputs`` (unpacked as keyword arguments) and return logits."""
        output = self.transformer(**inputs)
        # output has two keys : last_hidden_state, pooler_output
        logits = self.linear(output['pooler_output'])
        return logits

    def training_step(self, batch):
        """Compute, record and log the cross-entropy loss for one training batch."""
        labels, inputs = self._split_batch(batch)
        logits = self(inputs)
        loss = self.loss_fn(logits, labels)
        self.train_loss.append(loss.item())
        self.log_dict({"train_loss": loss}, on_step = False, on_epoch = True, prog_bar = True)
        return loss

    def validation_step(self, batch):
        """Compute and log the loss for one validation batch and update the metrics."""
        labels, inputs = self._split_batch(batch)
        logits = self(inputs)
        loss = self.loss_fn(logits, labels)
        self.val_loss.append(loss.item())
        self.log_dict({"val_loss": loss}, on_step = False, on_epoch = True, prog_bar = True)
        self.compute_metrics(logits, labels)
        return loss

    def on_validation_epoch_end(self):
        """Print the epoch summary, log ``val_accuracy`` and reset all accumulators."""
        accuracy = self.accuracy.compute().item()
        metrics = {
            "Epoch": self.current_epoch,
            "Train Loss" : self._mean_or_nan(self.train_loss),
            "Val Loss": self._mean_or_nan(self.val_loss),
            "Accuracy ": accuracy,
            "F1": self.f1_score.compute().item(),
            "Precision": self.precision.compute().item(),
            "Recall": self.recall.compute().item()
        }

        print(pd.DataFrame(metrics.items()).T)

        # Expose the accuracy to callbacks under the name "val_accuracy", so a
        # checkpoint filename template that references it gets the real value.
        self.log("val_accuracy", accuracy)

        self.train_loss = []
        self.val_loss = []
        self.accuracy.reset()
        self.f1_score.reset()
        self.precision.reset()
        self.recall.reset()

    def configure_optimizers(self):
        """Adam over all parameters (encoder and head) with the configured learning rate."""
        return optim.Adam(self.parameters(), lr = self.learning_rate)

## Train

In [None]:
## config
BATCH_SIZE: int = 32  # batch size used by both DataLoaders
LEARNING_RATE: float = 1e-5  # passed to SentimentModel, which hands it to Adam
NUM_EPOCHS: int = 2  # used as the Trainer's max_epochs

In [None]:
# Settings common to both loaders; only the dataset and shuffling differ.
_loader_kwargs = dict(batch_size=BATCH_SIZE, collate_fn=custom_collator, num_workers=1)

train_dl = DataLoader(train_dataset, shuffle=True, **_loader_kwargs)
test_dl = DataLoader(eval_dataset, shuffle=False, **_loader_kwargs)

In [None]:
# Two output logits, one per class of the binary label.
model = SentimentModel(output_dim=2, learning_rate= LEARNING_RATE)

In [None]:
# Save a checkpoint after every validation run (save_top_k=-1 keeps all of
# them) plus a "last" checkpoint.
# NOTE(review): `val_accuracy` in the filename is only filled with a real value
# if the LightningModule logs a metric under exactly that name.
checkpoint_callback = pl.callbacks.ModelCheckpoint(
        dirpath = "checkpoints_logs",
        filename = '{epoch}-{val_loss:.2f}-{val_accuracy:.2f}',
        monitor = "val_loss",
        save_last = True,
        save_top_k = -1
)

# Stop when val_loss has not improved by at least min_delta for `patience`
# consecutive validation runs.
early_stoping_callback = pl.callbacks.EarlyStopping(
            monitor = "val_loss",
            min_delta = 0.001,
            patience = 3,
            mode = "min"
)

# accelerator="auto" still selects the GPU when one is visible, but lets the
# notebook run on CPU-only machines instead of raising at Trainer construction.
trainer = pl.Trainer(
        accelerator = "auto",
        callbacks = [checkpoint_callback, early_stoping_callback],
        max_epochs = NUM_EPOCHS,
        check_val_every_n_epoch = 1,
        gradient_clip_val = 1
)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [None]:
# Train on `train_dl`; `test_dl` is passed as the validation dataloader.
trainer.fit(model, train_dl, test_dl)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name        | Type             | Params
-------------------------------------------------
0 | transformer | BertModel        | 109 M 
1 | linear      | Linear           | 1.5 K 
2 | loss_fn     | CrossEntropyLoss | 0     
3 | accuracy    | BinaryAccuracy   | 0     
4 | f1_score    | BinaryF1Score    | 0     
5 | precision   | BinaryPrecision  | 0     
6 | recall      | BinaryRecall     | 0     
-------------------------------------------------
109 M     Trainable params
0         Non-trainable params
109 M     Total params
437.935   Total estimated model params size (MB)


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/home/saurabh/anaconda3/envs/lighting/lib/python3.8/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.


       0           1         2          3         4          5       6
0  Epoch  Train Loss  Val Loss  Accuracy         F1  Precision  Recall
1    0.0         NaN   0.76561     0.1875  0.315789        1.0  0.1875


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
/home/saurabh/anaconda3/envs/lighting/lib/python3.8/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

       0           1         2          3         4          5       6
0  Epoch  Train Loss  Val Loss  Accuracy         F1  Precision  Recall
1    0.0    0.071429  0.013807      0.997  0.997003   0.996008   0.998


Validation: |          | 0/? [00:00<?, ?it/s]

       0           1         2          3         4          5       6
0  Epoch  Train Loss  Val Loss  Accuracy         F1  Precision  Recall
1    1.0    0.005805  0.011801      0.997  0.997003   0.996008   0.998


`Trainer.fit` stopped: `max_epochs=2` reached.


## Inference

In [None]:
# Switch the model to evaluation mode before running inference.
model = model.eval()

In [None]:
# Pick one test example; shuffle() is unseeded, so this is not guaranteed to
# be the same example on every run.
sample = dataset['test'].shuffle()[0]
tweet, label = sample['tweet'], sample['label']

print(f'Tweet : {tweet}')
print(f'True : {label}')

# Truncate exactly as during training (max_length=50) and move the encoded
# tensors to whichever device the model currently lives on.
inputs = tokenizer(tweet, return_tensors='pt', truncation=True, max_length=50).to(model.device)

# Inference only: skip autograd bookkeeping instead of detaching afterwards.
with torch.no_grad():
    logits = model(inputs)

preds = torch.argmax(logits, dim=1)[0].item()
print(f'Pred : {preds}')

Tweet : thinking about @StereoKicks again :(
True : 0


Pred : 0
