In [13]:
def print_example(data, index, print_content=True, print_classification=True, print_rationales=True ):
    print(f'Retrieving Training Example [{index}].................\n')
    item = data[index]
    classification = item['classification']
    evidences = item['evidences']
    content = item['content']
    if print_content: print(f'Review content:\n{content}\n')
    if print_classification: print('----------------------------',
                                   '\n| Sentiment class:',
                                   classification,
                                   ("- NEG" if not classification else "- POS"),
                                   '|', '\n----------------------------')
    if print_rationales:
        print('\nHuman rationales / Supporting Evidence:')
        for evidence in evidences:
            print('     - ', evidence[0])

def get_content(data, index):
    item = data[index]
    content = item['content']
    return content

def get_classes(data, index):
    item = data[index]
    classification = item['classification']
    return torch.tensor(classification)

def get_annotations(data, index):
    item = data[index]
    content = item['evidences']
    annotations = [evidence for evidence in content]
    return annotations


In [14]:
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from typing import List, Dict, Union
from transformers import BertTokenizerFast

tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
max_length = 512
# With it being easy to generate batches of tokenized texts, it's actually easier
# not to do the tokenization beforehand, and just store texts
# It's a little bit slow though, so if you found this to be bottleneck
# you'd want to pre-tokenize everything and then batch/pad as necessary
class SST2TransformerDataset(Dataset):
  def __init__(self,
               labels=None,
               texts=None,
               evidences=None
               ):

    self.y = torch.tensor(labels,dtype=torch.int64)
    self.texts = texts
    self.evidences = evidences

  def __len__(self):
    return self.y.shape[0]

  def __getitem__(self, idx):
    rdict = {
      'y': self.y[idx],
      'text': self.texts[idx],
      'evidences': self.evidences[idx]
    }
    return rdict


def SST2_transformer_collate(batch:List[Dict[str, Union[torch.Tensor,str]]]):
  # print("BATCH: ", batch)
  y_batch = torch.tensor([example['y'] for example in batch])
  evidences = [example['evidences'].numpy() for example in batch]

  # We'll just reuse the tokenizer we created earlier, since it doesn't change
  tokenized_batch = tokenizer.batch_encode_plus([example['text'] for example in batch],
                                                return_tensors='pt',
                                                padding=True,
                                                max_length=max_length,
                                                truncation=True)

  return {
      'y':y_batch,
      'input_ids':tokenized_batch['input_ids'],
      'attention_mask':tokenized_batch['attention_mask'],
      'evidences':torch.tensor(evidences)
  }


# LOAD SAVED DATA

In [15]:
from torch.utils.data import Dataset, DataLoader, TensorDataset

train_data_path = 'drive/MyDrive/google_colab_data/train_data_loader.pt'
test_data_path = 'drive/MyDrive/google_colab_data/test_data_loader.pt'
val_data_path = 'drive/MyDrive/google_colab_data/val_data_loader.pt'

def load_dataLoader(path):
  loaded_data = torch.load(path)

  # Recreate the TensorDataset
  loaded_input = loaded_data['input']
  loaded_Y_star = loaded_data['Y_star']
  loaded_evidences = loaded_data['evidence']
  # print(loaded_evidences)

  loaded_dataset = SST2TransformerDataset(loaded_Y_star, loaded_input, loaded_evidences)

  # Recreate the DataLoader
  dataLoader = DataLoader(
      loaded_dataset,
      collate_fn = SST2_transformer_collate,
      batch_size=loaded_data['dataloader_params']['batch_size'],
      shuffle=loaded_data['dataloader_params']['shuffle'],  # Use shuffle from the saved data
      num_workers=loaded_data['dataloader_params']['num_workers']
  )
  return (dataLoader, len(loaded_input), loaded_input, loaded_Y_star, loaded_evidences)

(train_dataloader, train_size, train_in, train_classes, encoded_evidences) = load_dataLoader(train_data_path)
(dev_dataloader, test_size, test_in, test_classes, encoded_evidences_test) = load_dataLoader(test_data_path)
(val_dataloader, val_data, val_in, val_classes, encoded_evidences_val) = load_dataLoader(val_data_path)

print("ENCODED EVIDENCES : ", encoded_evidences.shape)
print("ENCODED EVIDENCES : ", encoded_evidences_test.shape)
print("ENCODED EVIDENCES : ", encoded_evidences_val.shape)

ENCODED EVIDENCES :  torch.Size([1600, 512])
ENCODED EVIDENCES :  torch.Size([199, 512])
ENCODED EVIDENCES :  torch.Size([200, 512])


  loaded_data = torch.load(path)


# BERT Classifier

In [16]:
! pip install --quiet "pytorch-lightning==1.9.4"


In [17]:
from transformers import BertModel
# Like the tokenizer, we can just download one of these from Hugging Face
bert = BertModel.from_pretrained('bert-base-uncased')

In [24]:
import pytorch_lightning as pl
from torchmetrics.classification import BinaryAccuracy
from transformers import BertModel
import torch

class BertClassifier(pl.LightningModule):
  def __init__(self,
               learning_rate:float,
               num_classes:int,
               freeze_bert:bool=False,
               **kwargs):
    super().__init__(**kwargs)

    # Like with the LSTM, we'll define a central BERT we're gonna use
    # Again, this will download this from Hugging Face in the background
    self.bert = BertModel.from_pretrained('bert-base-uncased')

    # If we want to speed up training, we can freeze the BERT module and train
    # just the output layer. This will hurt accuracy though.
    if freeze_bert:
      for param in self.bert.parameters():
        param.requires_grad = False

    # Then the only other thing we need is an output layer, whose input size will
    # be the BERT's output size (768), which can can find as follows:
    self.output_layer = torch.nn.Linear(self.bert.config.hidden_size, num_classes)

    self.learning_rate = learning_rate
    self.train_accuracy = BinaryAccuracy()
    self.val_accuracy = BinaryAccuracy()
    self.test_accuracy = BinaryAccuracy()


  def forward(self, y:torch.Tensor, input_ids:torch.Tensor,
              attention_mask:torch.Tensor, evidences:torch.Tensor):
    # And then the forward function is pretty simple--
    # way simpler than with the LSTM
    bert_result = self.bert(input_ids=input_ids,
                            attention_mask=attention_mask)

    # Typically we just use the pooler output for classification
    # Which, again, is the hidden state output for the [CLS] token
    cls_output = bert_result['pooler_output']

    py_logits = self.output_layer(cls_output)
    probs = torch.sigmoid(py_logits).view(-1)  # Convert logits to probabilities

    py = (probs > 0.5).float()
    if(y is not None):
      loss = torch.nn.functional.binary_cross_entropy_with_logits(py_logits.view(-1), y.float())
    else:
      loss = None
    return {'py':py,
            'probs':probs,
            'loss':loss}

  # Then do all the usual PyTorch Lightning functions
  def configure_optimizers(self):
    return [torch.optim.Adam(self.parameters(), lr=self.learning_rate)]

  def training_step(self, batch, batch_idx):
    result = self.forward(**batch)
    loss = result['loss']
    self.log('train_loss', result['loss'])
    self.train_accuracy.update(result['py'], batch['y'])
    return loss

  def training_epoch_end(self, outs):
    print(f' Epoch {self.current_epoch} training accuracy:', self.train_accuracy.compute())
    self.train_accuracy.reset()

  def validation_step(self, batch, batch_idx):
    result = self.forward(**batch)
    self.val_accuracy.update(result['py'], batch['y'])
    return result['loss']

  def validation_epoch_end(self, outs):
    print(f'Epoch {self.current_epoch} step {self.global_step} validation accuracy:', self.val_accuracy.compute())
    self.val_accuracy.reset()

  def test_step(self, batch, batch_idx):
    result = self.forward(**batch)
    self.test_accuracy.update(result['py'], batch['y'])
    return result['loss']

  def test_epoch_end(self, outs):
    print(f'Test accuracy:', self.test_accuracy.compute())
    self.test_accuracy.reset()

In [25]:
from collections import OrderedDict
import json

def stringify_ordered_dict_with_tensors(ordered_dict):
    serializable_dict = {k: v.tolist() for k, v in ordered_dict.items()}  # Convert tensors to lists
    return json.dumps(serializable_dict)

# Function to parse the string back to an OrderedDict of tensors
def parse_ordered_dict_with_tensors(stringified):
    deserialized_dict = json.loads(stringified)
    return OrderedDict({k: torch.tensor(v) for k, v in deserialized_dict.items()})

In [26]:
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks.progress import TQDMProgressBar

# And then training is easy with our old friend PyTorch Lightning
classifier_trainer = Trainer(
    accelerator="auto",
    devices=1 if torch.cuda.is_available() else None,
    max_epochs=2,
    callbacks=[TQDMProgressBar(refresh_rate=20)],
    val_check_interval = 0.2,
    )

# Note that this is the best accuracy we've seen on this dataset, by a pretty wide margin

INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


In [27]:
classifier_model_loaded = BertClassifier(learning_rate=2e-5, #if we were fine-tuning the BERT, we'd want to use something like 2e-5
                            num_classes=1)
# classifier_model = classifier_model.to('cuda')
print('Model:')
print(classifier_model_loaded)

Model:
BertClassifier(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, ele

In [29]:
params_path = 'drive/MyDrive/google_colab_data/params_std.txt'

data = ''
with open(params_path, 'r') as file:
  for line in file:
    data = data + line
# print(data)
parsed = parse_ordered_dict_with_tensors(data)
classifier_model_loaded.load_state_dict(parsed)


<All keys matched successfully>

In [30]:
val_results = classifier_trainer.validate(model=classifier_model_loaded, dataloaders=val_dataloader)

# Print the validation results
print("Validation Results:", val_results)

Validation: 0it [00:00, ?it/s]

Epoch 0 step 0 validation accuracy: tensor(0.8550)
Validation Results: [{}]


# CUSTOM MODEL

In [31]:
import torch
import torch.nn as nn
import math
from pprint import pprint


class CustomModel(pl.LightningModule):
    def __init__(self, transformer, max_length, hidden_size, learning_rate:float):
        super().__init__()

        self.transformer = transformer
        self.learning_rate = learning_rate

        self.fc1 = nn.Linear(max_length + 1, hidden_size)  # First fully connected layer
        self.relu = nn.ReLU()  # Activation function
        self.fc2 = nn.Linear(hidden_size, max_length)  # Second fully connected layer

        self.train_accuracy = BinaryAccuracy()
        self.val_accuracy = BinaryAccuracy()
        self.test_accuracy = BinaryAccuracy()

    def forward(self, y:torch.Tensor, input_ids:torch.Tensor,
              attention_mask:torch.Tensor, evidences:torch.Tensor):


        out = self.transformer(y, input_ids, attention_mask, evidences)

        mlp_in = torch.cat((out['py'].unsqueeze(1), input_ids), dim=1)
        # print("MLP IN: ", mlp_in)
        # print("SHAPE PROBS: ", out['py'].shape)
        # print("SHAPE input_ids: ", input_ids.shape)
        # print("SHAPE MLP IN: ", mlp_in.shape)

        logits = self.fc1(mlp_in)  # Apply first layer
        logits = self.relu(logits)  # Apply activation
        logits = self.fc2(logits)  # Apply second layer

        # print("LOGITS SHAPE: ", logits.shape)
        # print("LOGITS VALUE: ", logits)

        probs = torch.sigmoid(logits)

        # print("probs SHAPE: ", probs.shape)
        # print("probs VALUE: ", probs)


        py = (probs > 0.5).float()

        loss = torch.nn.functional.binary_cross_entropy_with_logits(logits, evidences)

        return {
          'class':out['py'],
          'probs':probs,
          'py':py,
          'loss': loss
        }

    # Then do all the usual PyTorch Lightning functions
    def configure_optimizers(self):
      return [torch.optim.Adam(self.parameters(), lr=self.learning_rate)]

    def training_step(self, batch, batch_idx):
      result = self.forward(**batch)
      loss = result['loss']
      self.log('train_loss', result['loss'])
      self.train_accuracy.update(result['py'], batch['evidences'])
      # self.train_accuracy.update(result['py'], batch['y'])
      return loss

    def training_epoch_end(self, outs):
      print(f' Epoch {self.current_epoch} training accuracy:', self.train_accuracy.compute())
      self.train_accuracy.reset()

    def validation_step(self, batch, batch_idx):
      result = self.forward(**batch)
      self.val_accuracy.update(result['py'], batch['evidences'])
      return result['loss']

    def validation_epoch_end(self, outs):
      print(f'Epoch {self.current_epoch} step {self.global_step} validation accuracy:', self.val_accuracy.compute())
      self.val_accuracy.reset()

    def test_step(self, batch, batch_idx):
      result = self.forward(**batch)
      self.test_accuracy.update(result['py'], batch['evidences'])
      return result['loss']

    def test_epoch_end(self, outs):
      print(f'Test accuracy:', self.test_accuracy.compute())
      self.test_accuracy.reset()

In [32]:
torch.random.manual_seed(10)
first_train_batch = next(iter(train_dataloader))
print('First training batch:')
print(first_train_batch)

print('First training batch sizes:')
print({key:value.shape for key, value in first_train_batch.items()})

First training batch:
{'y': tensor([1, 1, 0, 0, 1, 1, 1, 1, 0, 0]), 'input_ids': tensor([[  101,  2096,  3898,  ..., 16344,  1029,   102],
        [  101, 18269,  1024,  ...,  2872,  1999,   102],
        [  101, 19962, 22599,  ...,  1055,  1037,   102],
        ...,
        [  101,  8383,  1006,  ..., 11721, 15378,   102],
        [  101,  2009,  2003,  ...,  2187,  2017,   102],
        [  101,  2045,  2001,  ...,  1010, 19031,   102]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1]]), 'evidences': tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]], dtype=torch.float64)}
First training batch sizes:
{'

In [33]:
model = CustomModel(classifier_model_loaded, max_length, 600, learning_rate=2e-5)

with torch.no_grad():
  first_train_output = model(**first_train_batch)

print('First training output:')
pprint(first_train_output)

# print('Output item shapes:')
# pprint({key:value.shape for key, value in first_train_output.items()})


First training output:
{'class': tensor([1., 1., 0., 0., 1., 1., 1., 1., 0., 0.]),
 'loss': tensor(647.3324, dtype=torch.float64),
 'probs': tensor([[0., 1., 0.,  ..., 0., 0., 0.],
        [0., 1., 0.,  ..., 0., 0., 0.],
        [0., 1., 1.,  ..., 0., 0., 1.],
        ...,
        [0., 1., 1.,  ..., 1., 0., 1.],
        [0., 1., 1.,  ..., 0., 0., 0.],
        [0., 0., 1.,  ..., 1., 0., 1.]]),
 'py': tensor([[0., 1., 0.,  ..., 0., 0., 0.],
        [0., 1., 0.,  ..., 0., 0., 0.],
        [0., 1., 1.,  ..., 0., 0., 1.],
        ...,
        [0., 1., 1.,  ..., 1., 0., 1.],
        [0., 1., 1.,  ..., 0., 0., 0.],
        [0., 0., 1.,  ..., 1., 0., 1.]])}


In [34]:
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks.progress import TQDMProgressBar

# And then training is easy with our old friend PyTorch Lightning
classifier_trainer = Trainer(
    accelerator="auto",
    devices=1 if torch.cuda.is_available() else None,
    max_epochs=2,
    callbacks=[TQDMProgressBar(refresh_rate=20)],
    val_check_interval = 0.2,
    )

INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


In [36]:
classifier_trainer.fit(model=model,
            train_dataloaders=train_dataloader,
            val_dataloaders=dev_dataloader)



INFO:pytorch_lightning.callbacks.model_summary:
  | Name           | Type           | Params
--------------------------------------------------
0 | transformer    | BertClassifier | 109 M 
1 | fc1            | Linear         | 308 K 
2 | relu           | ReLU           | 0     
3 | fc2            | Linear         | 307 K 
4 | train_accuracy | BinaryAccuracy | 0     
5 | val_accuracy   | BinaryAccuracy | 0     
6 | test_accuracy  | BinaryAccuracy | 0     
--------------------------------------------------
110 M     Trainable params
0         Non-trainable params
110 M     Total params
440.396   Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Epoch 0 step 1 validation accuracy: tensor(0.4894)


Training: 1it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Epoch 0 step 33 validation accuracy: tensor(0.6262)


Validation: 0it [00:00, ?it/s]

Epoch 0 step 65 validation accuracy: tensor(0.7285)


Validation: 0it [00:00, ?it/s]

Epoch 0 step 97 validation accuracy: tensor(0.7942)


Validation: 0it [00:00, ?it/s]

Epoch 0 step 129 validation accuracy: tensor(0.8297)


Validation: 0it [00:00, ?it/s]

Epoch 0 step 161 validation accuracy: tensor(0.8510)
 Epoch 0 training accuracy: tensor(0.7163)


Validation: 0it [00:00, ?it/s]

Epoch 1 step 193 validation accuracy: tensor(0.8605)


Validation: 0it [00:00, ?it/s]

Epoch 1 step 225 validation accuracy: tensor(0.8633)


Validation: 0it [00:00, ?it/s]

Epoch 1 step 257 validation accuracy: tensor(0.8686)


Validation: 0it [00:00, ?it/s]

Epoch 1 step 289 validation accuracy: tensor(0.8702)


Validation: 0it [00:00, ?it/s]

Epoch 1 step 321 validation accuracy: tensor(0.8716)


INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=2` reached.


 Epoch 1 training accuracy: tensor(0.8426)


In [None]:
params_path = 'drive/MyDrive/google_colab_data/params_custom.txt'
!mkdir -p params_path
with open(params_path, 'w') as f:
  f.write(stringify_ordered_dict_with_tensors(model.state_dict()))

In [37]:
val_results = classifier_trainer.validate(model=model, dataloaders=val_dataloader)

# Print the validation results
print("Validation Results:", val_results)

Validation: 0it [00:00, ?it/s]

Epoch 2 step 321 validation accuracy: tensor(0.8603)
Validation Results: [{}]
