In [15]:
from __future__ import annotations

import logging
from typing import Any, Dict, List, Literal

import evaluate
import numpy as np
from datasets import Dataset, load_dataset
from rich.pretty import pprint
from torchinfo import summary
import psutil

from transformers import (
    AutoModelForMaskedLM,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    DistilBertTokenizer,
    RobertaConfig,
    RobertaModel,
    Trainer,
    TrainingArguments,
    PreTrainedTokenizer, PreTrainedModel
)
from collections import Counter, OrderedDict

from tqdm.notebook import tqdm  # Use notebook version for better UI in notebooks
from sklearn.metrics import classification_report

from omnivault.utils.reproducibility.seed import seed_all
import torch
from transformers import GPT2Tokenizer



In [9]:
# Single place to change the seed used by this notebook.
SEED = 42
# seed_all is a project helper (omnivault); presumably it seeds the Python /
# NumPy / torch RNGs -- confirm against its implementation.
seed_all(SEED, seed_torch=True, set_torch_deterministic=False)
# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

LOGGER = logging.getLogger(__name__)
LOGGER.setLevel(logging.INFO)
handler = logging.StreamHandler()
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
handler.setFormatter(formatter)
# logging.getLogger returns the same logger object on every call, so re-running
# this cell used to attach one more StreamHandler each time and every message
# was then printed once per attached handler. Attach only when none is present.
if not any(isinstance(existing, logging.StreamHandler) for existing in LOGGER.handlers):
    LOGGER.addHandler(handler)


In [12]:
# Load the 'sentences_allagree' configuration of financial_phrasebank and keep
# its "train" split (the only split indexed here).
# NOTE(review): trust_remote_code=True lets the dataset's own loading script
# run locally -- confirm the source is trusted before re-running.
dataset = load_dataset(
    path='financial_phrasebank',
    name='sentences_allagree',
    trust_remote_code=True,
)["train"]
dataset

Dataset({
    features: ['sentence', 'label'],
    num_rows: 2264
})

In [13]:
def count_labels(labels: List[int]) -> Dict[int, int]:
    """Tally how often each label occurs, keyed in ascending label order.

    Args:
        labels: The label values to count.

    Returns:
        A plain ``dict`` mapping every distinct label to its number of
        occurrences; keys are inserted in sorted order.
    """
    tally = Counter(labels)
    # Labels are unique keys, so sorting the keys alone gives the same order
    # as sorting the (label, count) pairs.
    return {label: tally[label] for label in sorted(tally)}


# Pull the two dataset columns out as plain Python sequences.
sentences_allagree, labels_allagree = dataset['sentence'], dataset['label']

# Show the class distribution, ordered by label id.
label_counts = count_labels(labels_allagree)
pprint(label_counts)



In [14]:
# Hold out 10% of the rows for validation, stratified on 'label' so both parts
# keep the same class proportions.
# An explicit seed (same value as passed to seed_all above) makes this cell
# idempotent: without it the shuffle depends on whatever RNG state is current
# when the cell happens to run, so re-running it alone yields a different split.
train_valid_split = dataset.train_test_split(
    test_size=0.1,
    shuffle=True,
    stratify_by_column='label',
    seed=42,
)
train_dataset = train_valid_split['train']
valid_dataset = train_valid_split['test']

In [17]:
# NOTE(review): no earlier cell in this notebook defines `tokenizer`, so on a
# fresh kernel this cell raised NameError. The definition below is reconstructed
# from the recorded output (GPT2Tokenizer, name_or_path='gpt2', pad token equal
# to the EOS token) -- confirm it matches the cell that was lost.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
# Reuse the end-of-text token for padding, as shown in the recorded repr.
tokenizer.pad_token = tokenizer.eos_token
tokenizer

GPT2Tokenizer(name_or_path='gpt2', vocab_size=50257, model_max_length=1024, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'pad_token': '<|endoftext|>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	50256: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
}

In [1]:
from __future__ import annotations

import torch
import torch.nn.functional as F
from torch import nn
from transformers.activations import ACT2FN
from transformers.modeling_outputs import BaseModelOutput
from transformers.models.deberta_v2.modeling_deberta_v2 import DebertaV2Config, StableDropout


class MeanPooler(nn.Module):
    """Mean-pool token-level hidden states into one embedding per sequence.

    Positions whose attention-mask entry is 0 (padding) are zeroed out before
    the sum, and the sum is divided by the number of unmasked positions, so
    padding never contributes to the average.

    Worked example (right-padded batch, B=2, T=3, D=4)::

        attention_mask:    [B, T]    -> [[1, 1, 0], [1, 0, 0]]
        last_hidden_state: [B, T, D] -> [
            [[1, 2, 3, 4],    [5, 6, 7, 8],     [1, 1, 5, 2]],
            [[9, 10, 11, 12], [13, 14, 15, 16], [1, 3, 2, 2]],
        ]
        mask with a trailing axis: [B, T, 1] -> [
            [[1], [1], [0]],
            [[1], [0], [0]],
        ]
        (broadcast against D this is the same as the fully expanded
        [B, T, D] mask [[1, 1, 1, 1], [1, 1, 1, 1], [0, 0, 0, 0]], ...)

    For the first sequence:

    1. Multiply every token embedding by its mask value, which nullifies the
       padded position::

           [1, 2, 3, 4] * 1 = [1, 2, 3, 4]
           [5, 6, 7, 8] * 1 = [5, 6, 7, 8]
           [1, 1, 5, 2] * 0 = [0, 0, 0, 0]      -> stacked shape [T, D]

    2. Sum over the sequence positions::

           [1, 2, 3, 4] + [5, 6, 7, 8] + [0, 0, 0, 0] = [6, 8, 10, 12]   -> [D]

    3. Divide by the sum of the attention mask (1 + 1 + 0 = 2 valid tokens)::

           [6, 8, 10, 12] / 2 = [3, 4, 5, 6]

    The second sequence has a single valid token, so its pooled embedding is
    simply that token's embedding, [9, 10, 11, 12].

    Args:
        output_dim: Optional size ``D`` of the pooled embedding. When omitted,
            it is inferred from the hidden states on the first forward pass.
    """

    def __init__(self, output_dim: int | None = None) -> None:
        super().__init__()
        # Previously this attribute was never assigned, so the `output_dim`
        # property could not succeed. None means "not known yet".
        self._output_dim: int | None = output_dim

    def forward(
        self,
        backbone_outputs: BaseModelOutput,
        _input_ids: torch.Tensor | None = None,
        _attention_mask: torch.Tensor | None = None,
    ) -> torch.Tensor:
        """Average the unmasked token embeddings of every sequence.

        Args:
            backbone_outputs: Object exposing ``last_hidden_state``, indexed
                here as [B, T, D].
            _input_ids: Unused; accepted only to keep the call signature.
            _attention_mask: [B, T] mask, non-zero for positions to keep.

        Returns:
            Tensor of shape [B, D] holding the masked mean over the T axis.

        Raises:
            ValueError: If ``_attention_mask`` is None.
        """
        if _attention_mask is None:
            raise ValueError("Attention mask is required for mean pooling.")

        last_hidden_state = backbone_outputs.last_hidden_state
        if self._output_dim is None:
            # Remember the hidden size so `output_dim` works after first use.
            self._output_dim = last_hidden_state.size(-1)

        # [B, T] -> [B, T, 1]; broadcasting over D gives the same values as
        # expanding the mask to [B, T, D] without building that view.
        mask = _attention_mask.unsqueeze(-1).float()
        sum_embeddings = torch.sum(last_hidden_state * mask, dim=1)
        # Clamp so a fully masked sequence yields zeros instead of 0 / 0 = NaN.
        token_counts = torch.clamp(mask.sum(dim=1), min=1e-9)
        return sum_embeddings / token_counts

    @property
    def output_dim(self) -> int:
        """Size of the pooled embedding's last dimension.

        Raises:
            AttributeError: If the size was neither passed to ``__init__`` nor
                observed in a forward pass yet. The exception type is the same
                one the original (always failing) property raised, so
                ``hasattr`` / ``getattr(..., default)`` behave as before.
        """
        if self._output_dim is None:
            raise AttributeError(
                "output_dim is unknown: pass output_dim to MeanPooler() or run a forward pass first."
            )
        return self._output_dim


In [2]:
# Toy batch for the walkthrough: B=2 sequences, T=3 positions, D=4 features.
# A 1 in the mask marks a real token, a 0 marks padding (right-padded).
attention_mask = torch.tensor(
    [
        [1, 1, 0],  # sequence 1: two real tokens, one pad
        [1, 0, 0],  # sequence 2: one real token, two pads
    ]
)
last_hidden_state = torch.tensor(
    [
        [[1, 2, 3, 4], [5, 6, 7, 8], [1, 1, 5, 2]],  # sequence 1
        [[9, 10, 11, 12], [13, 14, 15, 16], [1, 3, 2, 2]],  # sequence 2
    ]
)

attention_mask.shape, last_hidden_state.shape

(torch.Size([2, 3]), torch.Size([2, 3, 4]))

First, suppose there is no padding and every sequence in the batch has the same
context length $T$. The first sequence, of shape $3 \times 4$, is then simply a
stack of 3 token-level hidden embeddings. Pooling is an aggregation that turns
these "token"-level embeddings into a single "sequence"-level embedding, and simple
mean pooling computes $\frac{1}{3} \sum_{i=1}^{3} h_i$, where $h_i$ is the hidden
embedding of token $i$. The idea is similar to mean (average) pooling in computer vision.


In [4]:
# Give the [B, T] mask a trailing axis and stretch it to the hidden-state shape
# [B, T, D]; the float cast lets it scale the embeddings element-wise.
input_mask_expanded = attention_mask[..., None].expand_as(last_hidden_state).float()
input_mask_expanded

tensor([[[1., 1., 1., 1.],
         [1., 1., 1., 1.],
         [0., 0., 0., 0.]],

        [[1., 1., 1., 1.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.]]])

In [5]:
# Zero out the padded positions, then collapse the sequence axis:
# [B, T, D] -> [B, D].
sum_embeddings = (last_hidden_state * input_mask_expanded).sum(dim=1)
sum_embeddings

tensor([[ 6.,  8., 10., 12.],
        [ 9., 10., 11., 12.]])

In [6]:
# Count the real (unmasked) tokens of each sequence; the count is repeated
# across the D axis, giving shape [B, D].
sum_mask = torch.sum(input_mask_expanded, dim=1)
sum_mask

tensor([[2., 2., 2., 2.],
        [1., 1., 1., 1.]])

In [7]:
# Average over the real tokens only. The denominator is clamped exactly as in
# MeanPooler.forward, so a fully padded sequence would give zeros rather than
# NaN (0 / 0); for this toy batch the result is unchanged.
mean_embeddings = sum_embeddings / torch.clamp(sum_mask, min=1e-9)
mean_embeddings

tensor([[ 3.,  4.,  5.,  6.],
        [ 9., 10., 11., 12.]])