In [5]:
!pip install -q transformers datasets tokenizers sentencepiece torch scikit-learn spacy

# Library Load

In [1]:
import datasets
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import spacy
import random
import pandas as pd
import torch
from typing import Mapping, Tuple
# import en_core_web_sm
from tqdm import tqdm
from torch.optim import AdamW
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics import accuracy_score
import huggingface_hub

  from .autonotebook import tqdm as notebook_tqdm


# Hyperparameters

In [6]:
# --- Model / tokenisation ---
question_ranking_model = "bert-base-cased"  # checkpoint name passed to from_pretrained()
max_length = 512  # every (question, context) pair is padded / truncated to this many tokens
pad_mask_id = -100  # not referenced by any other cell shown in this notebook

# --- Optimisation ---
train_batch_size = 16
valid_batch_size = 128
epochs = 20
# NOTE(review): 1e-3 is much larger than the 2e-5..5e-5 range commonly used when
# fine-tuning BERT end to end; value kept as-is, confirm it is intentional.
learning_rate = 1e-3

# --- Data loading ---
dataloader_workers = 0
pin_memory = False

# --- Hardware ---
# Pick the best available accelerator instead of hard-coding one backend, so the
# notebook also runs on machines without Apple-Silicon MPS (falls back to CPU).
# `torch.backends.mps` is looked up defensively because older torch builds lack it.
_mps_backend = getattr(torch.backends, "mps", None)
if torch.cuda.is_available():
    device = "cuda"
elif _mps_backend is not None and _mps_backend.is_available():
    device = "mps"
else:
    device = "cpu"

# --- Output ---
save_dir = "./IAmA-question-ranker"  # directory the best tokenizer + model are written to

# Dataset Class

In [3]:
class QuestionRankingDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenises (question, context) pairs for classification.

    Every row of ``data`` is unpacked positionally as ``(question, context, label)``,
    so the source table must expose exactly three columns in that order.
    """

    def __init__(self, data: datasets.Dataset, max_length: int, tokenizer: AutoTokenizer) -> None:
        """Store the rows and the tokenizer used to encode them.

        Args:
            data: Rows to serve; materialised once into a pandas DataFrame.
            max_length: Length every encoded pair is padded / truncated to.
            tokenizer: Callable tokenizer invoked with ``text`` / ``text_pair``.
        """
        self.data = pd.DataFrame(data)
        self.max_length = max_length
        self.hf_tokenizer = tokenizer
        # The spaCy pipeline is not used by __getitem__, so it is loaded on first
        # access only. Constructing a dataset therefore no longer pays the load
        # cost (nor fails when the `en_core_web_sm` package is missing).
        self._spacy_tokenizer = None

    @property
    def spacy_tokenizer(self):
        """spaCy ``en_core_web_sm`` pipeline, loaded lazily and cached on the instance."""
        if self._spacy_tokenizer is None:
            self._spacy_tokenizer = spacy.load('en_core_web_sm')
        return self._spacy_tokenizer

    def __len__(self) -> int:
        """Return the number of rows."""
        return len(self.data)

    def __getitem__(self, index: int) -> Mapping[str, torch.Tensor]:
        """Encode row ``index`` and return the tensors for one training example.

        Returns:
            A mapping with ``input_ids``, ``attention_mask`` and ``token_type_ids``
            (batch axis removed) plus ``labels`` as an int64 scalar tensor.

        Raises:
            IndexError: If ``index`` is outside the dataset.
        """
        # Positional lookup: identical to .loc on the default RangeIndex, but an
        # out-of-range index raises IndexError as the sequence protocol expects.
        question, context, label = self.data.iloc[index]
        encoded_data = self.hf_tokenizer(
            text=question,
            text_pair=context,
            padding="max_length",
            max_length=self.max_length,
            truncation=True,
            return_tensors="pt"
        )
        # return_tensors="pt" adds a leading batch axis of size 1; drop only that
        # axis (a bare squeeze() would also collapse a length-1 sequence axis).
        return {
            "input_ids": encoded_data["input_ids"].squeeze(0),
            "attention_mask": encoded_data["attention_mask"].squeeze(0),
            "token_type_ids": encoded_data["token_type_ids"].squeeze(0),
            "labels": torch.tensor(label, dtype=torch.int64)
        }


# Dataset Load

In [4]:
# Load the tokenizer from the same checkpoint that is fine-tuned later
# (`question_ranking_model` from the hyperparameter cell), so vocabulary and
# model always match. The previous name `qa_eval_model` is not defined anywhere
# in this notebook and raised NameError on a fresh kernel.
tokenizer = AutoTokenizer.from_pretrained(question_ranking_model)
dataset = datasets.load_dataset("ehsanul007/IAmA-question-ranking")
# The unpacking below fails loudly unless the dataset has exactly two splits;
# they are assumed to be ordered (train, validation) - TODO confirm.
train_set_name, valid_set_name = list(dataset.keys())
train_set = QuestionRankingDataset(dataset[train_set_name], max_length, tokenizer)
valid_set = QuestionRankingDataset(dataset[valid_set_name], max_length, tokenizer)

Downloading and preparing dataset csv/ehsanul007--IAmA-question-ranking to /Users/ehsanulkabir/.cache/huggingface/datasets/ehsanul007___csv/ehsanul007--IAmA-question-ranking-aa9f49d08f32f605/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1...


Downloading data: 100%|██████████| 4.17M/4.17M [00:00<00:00, 18.9MB/s]
Downloading data: 100%|██████████| 4.16M/4.16M [00:00<00:00, 18.7MB/s]
Downloading data files: 100%|██████████| 2/2 [00:01<00:00,  1.58it/s]
Extracting data files: 100%|██████████| 2/2 [00:00<00:00, 804.43it/s]
                                                             

Dataset csv downloaded and prepared to /Users/ehsanulkabir/.cache/huggingface/datasets/ehsanul007___csv/ehsanul007--IAmA-question-ranking-aa9f49d08f32f605/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1. Subsequent calls will reuse this data.


100%|██████████| 2/2 [00:00<00:00, 676.17it/s]


# Trainer Class

In [5]:
class Trainer:
    """Fine-tune a sequence-classification model and keep the best checkpoint.

    After every training epoch the model is scored on the validation set, either
    by accuracy (higher is better) or by loss (lower is better). Whenever the
    score improves, the tokenizer and model are written to ``save_dir``.
    """

    def __init__(
        self,
        dataloader_workers: int,
        device: str,
        epochs: int,
        learning_rate: float,
        model: torch.nn.Module,
        tokenizer: AutoTokenizer,
        pin_memory: bool,
        save_dir: str,
        train_batch_size: int,
        train_set: Dataset,
        valid_batch_size: int,
        valid_set: Dataset,
        evaluate_on_accuracy: bool = False,
        push_to_hub: bool = False
    ) -> None:
        """Build the data loaders, move the model to ``device`` and create the optimizer.

        Args:
            dataloader_workers: ``num_workers`` for both data loaders.
            device: Device string the model and every batch are moved to.
            epochs: Number of passes over the training set.
            learning_rate: Learning rate handed to AdamW.
            model: Model whose forward pass returns an object with ``loss`` and ``logits``.
            tokenizer: Tokenizer saved next to the model on every checkpoint.
            pin_memory: ``pin_memory`` for both data loaders.
            save_dir: Directory checkpoints are written to.
            train_batch_size: Batch size of the (shuffled) training loader.
            train_set: Training dataset.
            valid_batch_size: Batch size of the (unshuffled) validation loader.
            valid_set: Validation dataset.
            evaluate_on_accuracy: Select checkpoints by validation accuracy
                instead of validation loss.
            push_to_hub: Stored on the instance; nothing in this class acts on it.
        """
        self.device = device
        self.epochs = epochs
        self.save_dir = save_dir
        self.train_batch_size = train_batch_size
        self.valid_batch_size = valid_batch_size
        self.train_loader = DataLoader(
            train_set,
            batch_size=train_batch_size,
            num_workers=dataloader_workers,
            pin_memory=pin_memory,
            shuffle=True
        )
        # Validation uses its own batch size (this was wired to train_batch_size,
        # which silently ignored the valid_batch_size argument).
        self.valid_loader = DataLoader(
            valid_set,
            batch_size=valid_batch_size,
            num_workers=dataloader_workers,
            pin_memory=pin_memory,
            shuffle=False
        )
        self.tokenizer = tokenizer
        self.model = model.to(self.device)
        self.optimizer = AdamW(self.model.parameters(), lr=learning_rate)
        self.train_loss = AverageMeter()
        self.evaluate_on_accuracy = evaluate_on_accuracy
        self.push_to_hub = push_to_hub
        # Start from the worst possible score for the selected metric so the
        # first epoch always produces a checkpoint.
        if evaluate_on_accuracy:
            self.best_valid_score = 0
        else:
            self.best_valid_score = float("inf")

    def _to_device(self, batch: Mapping[str, torch.Tensor]) -> Mapping[str, torch.Tensor]:
        """Return a copy of ``batch`` with every tensor moved to ``self.device``."""
        return {key: value.to(self.device) for key, value in batch.items()}

    @staticmethod
    def _batch_len(batch: Mapping[str, torch.Tensor]) -> int:
        """Return the number of examples actually contained in ``batch``."""
        return len(next(iter(batch.values())))

    def train(self) -> None:
        """Run all epochs, validating and checkpointing after each one."""
        for epoch in range(1, self.epochs+1):
            self.model.train()
            self.train_loss.reset()

            with tqdm(total=len(self.train_loader), unit="batches") as tepoch:
                tepoch.set_description(f"epoch {epoch}")
                for data in self.train_loader:
                    self.optimizer.zero_grad()
                    data = self._to_device(data)
                    output = self.model(**data)
                    loss = output.loss
                    loss.backward()
                    self.optimizer.step()
                    # Weight by the real batch length: the final batch of an epoch
                    # may be shorter than train_batch_size.
                    self.train_loss.update(loss.item(), self._batch_len(data))
                    tepoch.set_postfix({"train_loss": self.train_loss.avg})
                    tepoch.update(1)

            if self.evaluate_on_accuracy:
                valid_accuracy = self.evaluate_accuracy(self.valid_loader)
                if valid_accuracy > self.best_valid_score:
                    print(
                        f"Validation accuracy improved from {self.best_valid_score:.4f} to {valid_accuracy:.4f}. Saving."
                    )
                    self.best_valid_score = valid_accuracy
                    self._save()
            else:
                valid_loss = self.evaluate(self.valid_loader)
                if valid_loss < self.best_valid_score:
                    print(
                        f"Validation loss decreased from {self.best_valid_score:.4f} to {valid_loss:.4f}. Saving.")
                    self.best_valid_score = valid_loss
                    self._save()

    @torch.no_grad()
    def evaluate(self, dataloader: DataLoader) -> float:
        """Return the example-weighted mean loss of the model over ``dataloader``."""
        self.model.eval()
        eval_loss = AverageMeter()
        with tqdm(total=len(dataloader), unit="batches") as tepoch:
            tepoch.set_description("validation")
            for data in dataloader:
                data = self._to_device(data)
                output = self.model(**data)
                loss = output.loss
                eval_loss.update(loss.item(), self._batch_len(data))
                tepoch.set_postfix({"valid_loss": eval_loss.avg})
                tepoch.update(1)
        return eval_loss.avg

    @torch.no_grad()
    def evaluate_accuracy(self, dataloader: DataLoader) -> float:
        """Return the accuracy of the arg-max predictions over ``dataloader``."""
        self.model.eval()
        accuracy = AverageMeter()
        with tqdm(total=len(dataloader), unit="batches") as tepoch:
            tepoch.set_description("validation")
            for data in dataloader:
                data = self._to_device(data)
                output = self.model(**data)
                preds = torch.argmax(output.logits, dim=1)
                score = accuracy_score(data["labels"].cpu(), preds.cpu())
                # Per-batch accuracy weighted by the real batch length yields the
                # exact accuracy over the whole loader.
                accuracy.update(score, self._batch_len(data))
                tepoch.set_postfix({"valid_acc": accuracy.avg})
                tepoch.update(1)
        return accuracy.avg

    def _save(self) -> None:
        """Write the tokenizer and the model to ``self.save_dir``."""
        self.tokenizer.save_pretrained(self.save_dir)
        self.model.save_pretrained(self.save_dir)
            

# Weighted running mean of a stream of values (e.g. per-batch losses).
class AverageMeter(object):
    """Keep the most recent value together with the weighted mean of all values seen."""

    def __init__(self) -> None:
        self.reset()

    def reset(self) -> None:
        """Zero every statistic."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val: float, n: int = 1) -> None:
        """Record ``val`` with weight ``n`` and refresh the running mean."""
        weighted_total = self.sum + val * n
        samples_seen = self.count + n
        self.val = val
        self.sum = weighted_total
        self.count = samples_seen
        self.avg = weighted_total / samples_seen

# Training

In [8]:
# Load the pretrained checkpoint with a sequence-classification head. No
# `num_labels` is passed, so the library default number of classes applies.
model = AutoModelForSequenceClassification.from_pretrained(question_ranking_model)
# To authenticate with the Hugging Face Hub, call `huggingface_hub.login()` and
# enter the token when prompted; never paste a literal token into the notebook.
# NOTE(review): an access token was previously hard-coded on this line. It has
# been removed here and should be revoked, since it was committed in plain text.
trainer = Trainer(
    dataloader_workers=dataloader_workers,
    device=device,
    epochs=epochs,
    learning_rate=learning_rate,
    model=model,
    pin_memory=pin_memory,
    save_dir=save_dir,
    tokenizer=tokenizer,
    train_batch_size=train_batch_size,
    train_set=train_set,
    valid_batch_size=valid_batch_size,
    valid_set=valid_set,
    evaluate_on_accuracy=True  # select checkpoints by validation accuracy, not loss
)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

In [None]:
# Run every epoch; after each one the model is validated and, if the score
# improved, the tokenizer and model are saved to `save_dir`.
trainer.train()