In [15]:
from __future__ import annotations

import logging
from typing import Any, Dict, List, Literal

import evaluate
import numpy as np
from datasets import Dataset, load_dataset
from rich.pretty import pprint
from torchinfo import summary
import psutil

from transformers import (
    AutoModelForMaskedLM,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    DistilBertTokenizer,
    RobertaConfig,
    RobertaModel,
    Trainer,
    TrainingArguments,
    PreTrainedTokenizer, PreTrainedModel
)
from collections import Counter, OrderedDict

from tqdm.notebook import tqdm  # Use notebook version for better UI in notebooks
from sklearn.metrics import classification_report

from omnivault.utils.reproducibility.seed import seed_all
import torch
from transformers import GPT2Tokenizer



In [9]:
# Seed the run via the project helper (presumably Python/NumPy/PyTorch RNGs —
# confirm against omnivault's seed_all); torch determinism is left off.
seed_all(42, seed_torch=True, set_torch_deterministic=False)

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

LOGGER = logging.getLogger(__name__)
LOGGER.setLevel(logging.INFO)

# logging.getLogger() hands back the same logger object on every call, so
# re-running this cell would otherwise stack one more StreamHandler each time
# and every record would be printed repeatedly. Attach a handler only once.
if not LOGGER.handlers:
    handler = logging.StreamHandler()
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    handler.setFormatter(formatter)
    LOGGER.addHandler(handler)


In [12]:
# Financial PhraseBank, 'sentences_allagree' configuration. Only the 'train'
# split of the returned mapping is kept. trust_remote_code=True permits the
# dataset's own loading script to run locally.
dataset = load_dataset(
    'financial_phrasebank',
    'sentences_allagree',
    trust_remote_code=True,
)["train"]

# Bare expression: shows the Dataset summary as the cell output.
dataset

Dataset({
    features: ['sentence', 'label'],
    num_rows: 2264
})

In [13]:
def count_labels(labels: List[int]) -> Dict[int, int]:
    """Count how often each label occurs, keyed in ascending label order.

    Args:
        labels: Label ids to tally. May be empty.

    Returns:
        A plain ``dict`` mapping each distinct label to its number of
        occurrences, with keys inserted in ascending order. An empty input
        yields an empty dict.
    """
    # A regular dict preserves insertion order, so sorting the (label, count)
    # pairs is sufficient; no intermediate OrderedDict is needed.
    return dict(sorted(Counter(labels).items()))


# Pull the text and label columns out of the dataset in one step.
sentences_allagree, labels_allagree = dataset['sentence'], dataset['label']

# Tally occurrences per label (sorted by label id) and pretty-print the result
# to inspect the class balance.
label_counts = count_labels(labels_allagree)
pprint(label_counts)



In [14]:
# Hold out 10% of the rows for validation, stratified on 'label' so both
# subsets keep the same class proportions.
# The seed is pinned explicitly (same value as passed to seed_all) so the
# split does not depend on ambient RNG state: without it, re-running this cell
# or running cells out of order yields a different partition.
train_valid_split = dataset.train_test_split(
    test_size=0.1,
    shuffle=True,
    stratify_by_column='label',
    seed=42,
)
train_dataset = train_valid_split['train']
valid_dataset = train_valid_split['test']  # the 'test' key holds the held-out 10%

In [18]:

# Load the pretrained 'gpt2' tokenizer and print its special-token map as it
# is BEFORE the pad token is assigned below.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
pprint(tokenizer.special_tokens_map)

# Reuse the end-of-sequence token as the padding token.
# NOTE(review): presumably needed because this checkpoint defines no pad token
# of its own — confirm against the map printed above.
tokenizer.pad_token = tokenizer.eos_token

In [None]:

# NOTE(review): `pd`, `ext_train` and `ori_train` are not defined in the
# visible part of this notebook (no pandas import in the import cell) and this
# cell has no execution count — confirm it belongs here; as shown it would
# fail on a fresh kernel.
# Presumably both inputs are DataFrames with a 'prompt' column: concatenate
# them with a renumbered index, then keep the first row for each distinct
# 'prompt' value.
combined_train = pd.concat([ext_train, ori_train], ignore_index=True)
unique_train = combined_train.drop_duplicates(subset='prompt', keep='first')

# Display the head of the DataFrame to verify
print(unique_train.head())


In [17]:
# Bare expression: show the tokenizer's repr as the cell output.
tokenizer

GPT2Tokenizer(name_or_path='gpt2', vocab_size=50257, model_max_length=1024, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'pad_token': '<|endoftext|>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	50256: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
}