In [9]:
!pip install transformers
!pip install datasets
!pip install torch



In [19]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import PreTrainedTokenizerFast, AutoTokenizer
from datasets import load_dataset
from dataclasses import dataclass
from typing import List, Tuple

import torch.nn as nn
from torch.optim import Optimizer, AdamW, Adam, SGD, RMSprop
from tqdm.notebook import tqdm
from transformers import AutoModelForSequenceClassification, RobertaForMultipleChoice

In [11]:
@dataclass
class AlphaNLIExample:
    choices: list[str]
    label: int

    @staticmethod
    def from_dict(data: dict):
      return AlphaNLIExample(
        choices=data["choices"],
        label=data["label"],
      )


def initialize_datasets(tokenizer: PreTrainedTokenizerFast, sample: bool, data_file: str, label_data: str, sample_size: int = 0) -> dict:
    # load dataset
    raw_data = load_dataset("json", data_files=data_file)

    # read labels
    labels = list()
    with open(label_data, "r") as f:
       labels = [int(line.strip())-1 for line in f]

    # add labels to dataset
    raw_data["train"] = raw_data["train"].add_column("label", labels)

    # generate our choices (based on linear chain: obs1 -> hyp -> obs2)
    raw_data["train"] = raw_data["train"].add_column("choices", [[x["obs1"] + " " + x["hyp1"] + " " + x["obs2"],
                                                                x["obs1"] + " " + x["hyp2"] + " " + x["obs2"]] for x in raw_data["train"]])

    # just for now - take random sampling of 1000
    #dataset = raw_data
    dataset = {
        "train": raw_data["train"].shuffle(seed=42).select(range(sample_size)) if sample else raw_data["train"]
    }

    # initialize as AlphaNLI Dataset
    split_datasets = {}
    for split_name in dataset.keys():
      split_data = list(dataset[split_name])
      split_datasets[split_name] = AlphaNLIDataset(tokenizer, split_data)

    return split_datasets

In [12]:
class AlphaNLIDataset(Dataset):
    tokenizer: PreTrainedTokenizerFast = None

    def __init__(self, tokenizer: PreTrainedTokenizerFast, raw_data_list: List[dict]):
        AlphaNLIDataset.tokenizer = tokenizer
        self.sample_list = [AlphaNLIExample.from_dict(data) for data in raw_data_list]

    def __len__(self):
        return len(self.sample_list)

    def __getitem__(self, idx):
        return self.sample_list[idx]

    def __iter__(self):
        return iter(self.sample_list)

    @staticmethod
    def collate_fn(batched_samples: List[AlphaNLIExample]) -> dict:
        batched_choices = [sample.choices for sample in batched_samples]
        batched_label = [sample.label for sample in batched_samples]

        # choice_encoding = AlphaNLIDataset.tokenizer(batched_choices,
        #                                       padding=True,
        #                                       max_length=512,
        #                                       truncation=True,
        #                                       return_tensors="pt")
        tokenized_choices = []
        for choices in batched_choices:
            # Tokenize each choice in the pair
            tokenized_pair = AlphaNLIDataset.tokenizer(
                choices,  # List of strings (e.g., ["obs1 hyp1 obs2", "obs1 hyp2 obs2"])
                padding="max_length",
                truncation=True,
                max_length=256,
                return_tensors="pt",
            )
            tokenized_choices.append(tokenized_pair)

        # Stack tokenized inputs into the correct shape
        input_ids = torch.stack([tc["input_ids"] for tc in tokenized_choices])  # (batch_size, num_choices, seq_len)
        attention_mask = torch.stack([tc["attention_mask"] for tc in tokenized_choices])

        label_encoding = torch.LongTensor(batched_label)

        return {'input_ids': input_ids, 'attention_mask': attention_mask, 'label': label_encoding}

In [13]:
# this is like ALL AI - so should change this up a little

def train_one_epoch(
    model: nn.Module,
    dataloader: DataLoader,
    optimizer: Optimizer,
    epoch: int,
    gradient_accumulation_steps: int = 4,  # Number of steps to accumulate gradients
    target_batch_size: int = 32,  # Simulated larger batch size
):
    model.train()
    optimizer.zero_grad()

    effective_batch_size = target_batch_size
    actual_batch_size = dataloader.batch_size
    assert effective_batch_size % actual_batch_size == 0, (
        f"Target batch size ({effective_batch_size}) must be divisible by "
        f"actual batch size ({actual_batch_size})."
    )
    gradient_accumulation_steps = effective_batch_size // actual_batch_size

    with tqdm(dataloader, desc=f"Train Ep {epoch}", total=len(dataloader)) as tq:
        for step, batch in enumerate(tq):
            input_ids = batch['input_ids'].to(model.device)
            attention_mask = batch['attention_mask'].to(model.device)
            label_encoding = batch['label'].to(model.device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=label_encoding)
            loss = outputs.loss / gradient_accumulation_steps  # Scale loss by accumulation steps

            loss.backward()

            # Perform optimizer step and zero gradients after accumulating enough steps
            if (step + 1) % gradient_accumulation_steps == 0:
                optimizer.step()
                optimizer.zero_grad()

            # Update progress bar
            tq.set_postfix({"loss": loss.detach().item() * gradient_accumulation_steps})  # Rescale loss for display

        # Handle the last batch if it doesn't align with gradient_accumulation_steps
        if (step + 1) % gradient_accumulation_steps != 0:
            optimizer.step()
            optimizer.zero_grad()

In [14]:
def evaluate(model: nn.Module, dataloader: DataLoader) -> float:
    model.eval()
    all_predictions = []
    all_labels = []
    with tqdm(dataloader, desc=f"Eval", total=len(dataloader)) as tq:
        for batch in tq:
            with torch.no_grad():
                input_ids = batch['input_ids'].to(model.device)
                attention_mask = batch['attention_mask'].to(model.device)
                label_encoding = batch['label'].to(model.device)

                outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=label_encoding)
                logits = outputs.logits

                predictions = torch.argmax(logits, dim=-1)
                labels = label_encoding

                all_predictions += predictions
                all_labels += labels

    all_predictions = torch.Tensor(all_predictions)
    all_labels = torch.Tensor(all_labels)
    accuracy = compute_accuracy(all_predictions, all_labels)

    print(f"Accuracy: {accuracy}")
    return accuracy


def compute_accuracy(predictions: torch.Tensor, labels: torch.Tensor) -> float:
    assert predictions.size(-1) == labels.size(-1)
    accuracy = (predictions == labels).sum().item() / len(labels)
    return accuracy

In [15]:
torch.manual_seed(64)

def main(batch_size, learning_rate, num_epochs, grad_accum):
    model_name = "distilroberta-base"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = GPT2LMHeadModel.from_pretrained(model_name)
    model = model.cuda()
    optimizer = AdamW(model.parameters(), lr=learning_rate, eps=1e-8)

    datasets = initialize_datasets(tokenizer, sample=True, data_file="train.jsonl", label_data="train-labels.lst", sample_size=10000)
    print(datasets['train'][0])

    train_dataloader = DataLoader(datasets['train'],
                                   batch_size=batch_size,
                                   shuffle=True,
                                   collate_fn=AlphaNLIDataset.collate_fn,
                                   num_workers=2)

    dev_datasets = initialize_datasets(tokenizer, sample=False, data_file="dev.jsonl", label_data="dev-labels.lst")
    dev_dataloader = DataLoader(dev_datasets['train'],
                                   batch_size=batch_size,
                                   shuffle=True,
                                   collate_fn=AlphaNLIDataset.collate_fn,
                                   num_workers=2)

    best_acc = 0.0
    for epoch in range(1, num_epochs + 1):
        train_one_epoch(model, train_dataloader, optimizer, epoch, gradient_accumulation_steps=grad_accum, target_batch_size=batch_size)
        valid_acc = evaluate(model, dev_dataloader)
        best_acc = max(best_acc, valid_acc)
    return best_acc

In [18]:
main(batch_size=32, learning_rate=5e-5, num_epochs=5, grad_accum=4)

Some weights of RobertaForMultipleChoice were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Generating train split: 0 examples [00:00, ? examples/s]

AlphaNLIExample(choices=['Albert was a weight loss guru. Albert increased his exercise regimen. He died of a heart attack on the last mile.', 'Albert was a weight loss guru. Albert stopped his exercise regimen. He died of a heart attack on the last mile.'], label=0)


Generating train split: 0 examples [00:00, ? examples/s]

Train Ep 1:   0%|          | 0/313 [00:00<?, ?it/s]

Eval:   0%|          | 0/48 [00:00<?, ?it/s]

Accuracy: 0.5678851174934726


Train Ep 2:   0%|          | 0/313 [00:00<?, ?it/s]

Eval:   0%|          | 0/48 [00:00<?, ?it/s]

Accuracy: 0.5731070496083551


Train Ep 3:   0%|          | 0/313 [00:00<?, ?it/s]

Eval:   0%|          | 0/48 [00:00<?, ?it/s]

Accuracy: 0.589425587467363


Train Ep 4:   0%|          | 0/313 [00:00<?, ?it/s]

Eval:   0%|          | 0/48 [00:00<?, ?it/s]

Accuracy: 0.5953002610966057


Train Ep 5:   0%|          | 0/313 [00:00<?, ?it/s]

Eval:   0%|          | 0/48 [00:00<?, ?it/s]

Accuracy: 0.5868146214099217


0.5953002610966057