In [5]:
!pip install -q transformers datasets tokenizers sentencepiece torch scikit-learn spacy

In [6]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.5.0/en_core_web_sm-3.5.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.5.0
✔ Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')


# Library Load

In [18]:
import os
import random
from typing import Mapping, Tuple

import datasets
import huggingface_hub
import pandas as pd
import spacy
import torch
# import en_core_web_sm
from sklearn.metrics import accuracy_score
from torch.optim import AdamW
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
from transformers import AutoModelForSequenceClassification, AutoTokenizer


# Hyperparameters

In [8]:
# Pretrained checkpoint used for both the tokenizer and the classifier.
qa_eval_model = "bert-base-cased"
# Every encoded question/topic pair is padded or truncated to this many tokens.
max_length = 512
# Not referenced by the cells visible in this notebook; kept for compatibility.
pad_mask_id = -100
train_batch_size = 16
valid_batch_size = 128
epochs = 20
# Fine-tuning all BERT weights needs a small step size (the BERT authors recommend
# 2e-5 to 5e-5); 1e-3 typically makes the classifier collapse to a constant prediction.
learning_rate = 2e-5
dataloader_workers = 0
pin_memory = False
# Use the best accelerator available instead of hard-coding one, so the notebook
# also runs on machines without Apple's MPS backend.
if torch.cuda.is_available():
    device = "cuda"
elif getattr(torch.backends, "mps", None) is not None and torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
# Directory where the best tokenizer/model checkpoint is written.
save_dir = "./IAmA-qt-evaluator"

# Dataset Class

In [9]:
class QTEvalDataset(torch.utils.data.Dataset):
    """Question/topic pairs with on-the-fly negative sampling.

    Each item is built from one row of ``data`` (unpacked as ``question, topic``).
    A label is drawn at random for every access: ``1`` keeps the pair untouched,
    ``0`` replaces it with the output of one randomly chosen transform
    (``shuffle`` or ``corrupt``). The pair is then encoded with the Hugging Face
    tokenizer, padded/truncated to ``max_length``.
    """

    def __init__(self, data: "datasets.Dataset", max_length: int, tokenizer: "AutoTokenizer") -> None:
        """Store the rows as a DataFrame and load the spaCy pipeline used by ``corrupt``."""
        self.data = pd.DataFrame(data)
        self.max_length = max_length
        self.transforms = [self.shuffle, self.corrupt]
        self.hf_tokenizer = tokenizer
        self.spacy_tokenizer = spacy.load('en_core_web_sm')

    def __len__(self) -> int:
        """Return the number of rows."""
        return len(self.data)

    def __getitem__(self, index: int) -> Mapping[str, torch.Tensor]:
        """Return the encoded (possibly corrupted) pair at position ``index`` with its label."""
        # Positional lookup: a torch Dataset is indexed by position, not by DataFrame label.
        question, topic = self.data.iloc[index]
        label = random.choice([0, 1])
        if label == 0:
            question, topic = random.choice(self.transforms)(question, topic)
        encoded_data = self.hf_tokenizer(
            text=question,
            text_pair=topic,
            padding="max_length",
            max_length=self.max_length,
            truncation=True,
            return_tensors="pt"
        )
        # Drop only the leading batch axis added by return_tensors="pt"; a bare
        # squeeze() would also drop the sequence axis if it ever had length 1.
        return {
            "input_ids": encoded_data["input_ids"].squeeze(0),
            "attention_mask": encoded_data["attention_mask"].squeeze(0),
            "token_type_ids": encoded_data["token_type_ids"].squeeze(0),
            "labels": torch.tensor(label, dtype=torch.int64)
        }

    def shuffle(self, question: str, topic: str) -> Tuple[str, str]:
        """Pair ``question`` with the topic of a random row whose topic differs from ``topic``.

        Raises:
            ValueError: if no row has a different topic. Retrying random draws in
                that situation would never terminate.
        """
        topics = self.data['topic']
        candidates = topics[topics != topic]
        if candidates.empty:
            raise ValueError(
                "Cannot build a negative example: every row has the same topic."
            )
        # Uniform over rows with a different topic, i.e. the same distribution as
        # redrawing a random row until its topic differs.
        return question, candidates.sample(1).item()

    def corrupt(self, question: str, topic: str) -> Tuple[str, str]:
        """Build a negative example by tampering with the named entities of ``question``.

        * more than one entity: every entity text in the question is replaced by
          one randomly chosen entity;
        * exactly one entity: the topic is replaced by that entity;
        * otherwise, or when the steps above leave the pair unchanged (which would
          yield a positive pair carrying a negative label): fall back to ``shuffle``.
        """
        doc = self.spacy_tokenizer(question)
        corrupted_question, corrupted_topic = question, topic
        if len(doc.ents) > 1:
            # Replace all entities in the sentence with the same thing
            copy_ent = str(random.choice(doc.ents))
            for ent in doc.ents:
                corrupted_question = corrupted_question.replace(str(ent), copy_ent)
        elif len(doc.ents) == 1:
            # Replace the topic with an entity from the question
            corrupted_topic = str(doc.ents[0])
        if corrupted_question == question and corrupted_topic == topic:
            return self.shuffle(question, topic)
        return corrupted_question, corrupted_topic

# Dataset Load

In [12]:
# Tokenizer matching the checkpoint that will be fine-tuned.
tokenizer = AutoTokenizer.from_pretrained(qa_eval_model)
# Fetch the question/topic dataset from the Hugging Face hub (or the local cache).
dataset = datasets.load_dataset("ehsanul007/IAmA-qt-evaluator")
# The dataset must expose exactly two splits: the first is used for training,
# the second for validation.
train_set_name, valid_set_name = dataset.keys()
train_set, valid_set = (
    QTEvalDataset(dataset[split_name], max_length, tokenizer)
    for split_name in (train_set_name, valid_set_name)
)

Found cached dataset csv (/Users/ehsanulkabir/.cache/huggingface/datasets/ehsanul007___csv/ehsanul007--IAmA-qt-evaluator-9a106f4f52259aec/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)
100%|██████████| 2/2 [00:00<00:00, 1153.55it/s]


# Trainer Class

In [13]:
class Trainer:
    """Fine-tune a sequence-classification model and keep the best checkpoint.

    After every epoch the model is evaluated on the validation loader, either by
    loss (lower is better, the default) or by accuracy (higher is better, when
    ``evaluate_on_accuracy`` is true). Whenever the score improves, the tokenizer
    and model are written to ``save_dir`` and, if ``push_to_hub`` is true, also
    pushed to the Hugging Face hub.
    """

    def __init__(
        self,
        dataloader_workers: int,
        device: str,
        epochs: int,
        learning_rate: float,
        model: torch.nn.Module,
        tokenizer: "AutoTokenizer",
        pin_memory: bool,
        save_dir: str,
        train_batch_size: int,
        train_set: Dataset,
        valid_batch_size: int,
        valid_set: Dataset,
        evaluate_on_accuracy: bool = False,
        push_to_hub: bool = False
    ) -> None:
        """Build the data loaders, move the model to ``device`` and create the AdamW optimizer."""
        self.device = device
        self.epochs = epochs
        self.save_dir = save_dir
        self.train_batch_size = train_batch_size
        self.valid_batch_size = valid_batch_size
        self.train_loader = DataLoader(
            train_set,
            batch_size=train_batch_size,
            num_workers=dataloader_workers,
            pin_memory=pin_memory,
            shuffle=True
        )
        # The validation loader uses its own batch size (it was previously built
        # with train_batch_size by mistake).
        self.valid_loader = DataLoader(
            valid_set,
            batch_size=valid_batch_size,
            num_workers=dataloader_workers,
            pin_memory=pin_memory,
            shuffle=False
        )
        self.tokenizer = tokenizer
        self.model = model.to(self.device)
        self.optimizer = AdamW(self.model.parameters(), lr=learning_rate)
        self.train_loss = AverageMeter()
        self.evaluate_on_accuracy = evaluate_on_accuracy
        self.push_to_hub = push_to_hub
        # Start from the worst possible score for the selected metric.
        if evaluate_on_accuracy:
            self.best_valid_score = 0
        else:
            self.best_valid_score = float("inf")

    @staticmethod
    def _batch_size(batch: Mapping[str, torch.Tensor]) -> int:
        """Return the number of examples in a collated batch.

        The last batch of a loader can be smaller than the configured batch size,
        so meters must be weighted by the real size to give a per-example average.
        """
        return len(next(iter(batch.values())))

    def train(self) -> None:
        """Run all epochs, validating and checkpointing after each one."""
        for epoch in range(1, self.epochs+1):
            self.model.train()
            self.train_loss.reset()

            with tqdm(total=len(self.train_loader), unit="batches") as tepoch:
                tepoch.set_description(f"epoch {epoch}")
                for data in self.train_loader:
                    self.optimizer.zero_grad()
                    data = {key: value.to(self.device) for key, value in data.items()}
                    output = self.model(**data)
                    loss = output.loss
                    loss.backward()
                    self.optimizer.step()
                    self.train_loss.update(loss.item(), self._batch_size(data))
                    tepoch.set_postfix({"train_loss": self.train_loss.avg})
                    tepoch.update(1)

            if self.evaluate_on_accuracy:
                valid_accuracy = self.evaluate_accuracy(self.valid_loader)
                if valid_accuracy > self.best_valid_score:
                    print(
                        f"Validation accuracy improved from {self.best_valid_score:.4f} to {valid_accuracy:.4f}. Saving."
                    )
                    self.best_valid_score = valid_accuracy
                    self._save()
            else:
                valid_loss = self.evaluate(self.valid_loader)
                if valid_loss < self.best_valid_score:
                    print(
                        f"Validation loss decreased from {self.best_valid_score:.4f} to {valid_loss:.4f}. Saving.")
                    self.best_valid_score = valid_loss
                    self._save()

    @torch.no_grad()
    def evaluate(self, dataloader: DataLoader) -> float:
        """Return the per-example average loss of the model over ``dataloader``."""
        self.model.eval()
        eval_loss = AverageMeter()
        with tqdm(total=len(dataloader), unit="batches") as tepoch:
            tepoch.set_description("validation")
            for data in dataloader:
                data = {key: value.to(self.device) for key, value in data.items()}
                output = self.model(**data)
                loss = output.loss
                eval_loss.update(loss.item(), self._batch_size(data))
                tepoch.set_postfix({"valid_loss": eval_loss.avg})
                tepoch.update(1)
        return eval_loss.avg

    @torch.no_grad()
    def evaluate_accuracy(self, dataloader: DataLoader) -> float:
        """Return the accuracy of the model's argmax predictions over ``dataloader``."""
        self.model.eval()
        accuracy = AverageMeter()
        with tqdm(total=len(dataloader), unit="batches") as tepoch:
            tepoch.set_description("validation")
            for data in dataloader:
                data = {key: value.to(self.device) for key, value in data.items()}
                output = self.model(**data)
                preds = torch.argmax(output.logits, dim=1)
                score = accuracy_score(data["labels"].cpu(), preds.cpu())
                # Weighting each batch accuracy by its real size gives the exact
                # accuracy over the whole loader.
                accuracy.update(score, self._batch_size(data))
                tepoch.set_postfix({"valid_acc": accuracy.avg})
                tepoch.update(1)
        return accuracy.avg

    def _save(self) -> None:
        """Write tokenizer and model to ``save_dir``; also push them when ``push_to_hub`` is set."""
        self.tokenizer.save_pretrained(self.save_dir, push_to_hub=self.push_to_hub)
        self.model.save_pretrained(self.save_dir, push_to_hub=self.push_to_hub)
            

# Running weighted average, used for the training loss and the validation metrics
class AverageMeter(object):
    """Track the latest value and the weighted running average of a metric.

    Attributes are ``val`` (last value passed to ``update``), ``sum`` (weighted
    sum of all values), ``count`` (total weight) and ``avg`` (``sum / count``).
    """

    def __init__(self) -> None:
        self.reset()

    def reset(self) -> None:
        """Forget everything recorded so far."""
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val: float, n: int = 1) -> None:
        """Record ``val`` with weight ``n`` (e.g. a batch mean and the batch size)."""
        self.val = val
        self.sum += val * n
        self.count += n
        # A zero total weight (e.g. an empty batch recorded first) must not raise
        # ZeroDivisionError; the average is simply still undefined, reported as 0.
        self.avg = self.sum / self.count if self.count else 0

# Training

In [20]:
model = AutoModelForSequenceClassification.from_pretrained(qa_eval_model)
# Never hard-code access tokens in a notebook: anyone who can read the file (or its
# git history) can use them. The token is read from the HF_TOKEN environment variable;
# when it is not set, login() receives None and prompts for the token interactively.
# NOTE: a token that was ever committed in this cell must be revoked on the Hugging Face site.
huggingface_hub.login(token=os.environ.get("HF_TOKEN"))
trainer = Trainer(
    dataloader_workers=dataloader_workers,
    device=device,
    epochs=epochs,
    learning_rate=learning_rate,
    model=model,
    pin_memory=pin_memory,
    save_dir=save_dir,
    tokenizer=tokenizer,
    train_batch_size=train_batch_size,
    train_set=train_set,
    valid_batch_size=valid_batch_size,
    valid_set=valid_set,
    # Set to True to select the best checkpoint by validation accuracy instead of loss.
    evaluate_on_accuracy=False
)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid.
Your token has been saved to /Users/ehsanulkabir/.cache/huggingface/token
Login successful


In [17]:
# Run the full training loop; after each epoch the model is validated and the
# tokenizer/model are saved to `save_dir` whenever the validation score improves.
trainer.train()

epoch 1:   0%|          | 2/585 [00:08<39:08,  4.03s/batches, train_loss=0.681]


KeyboardInterrupt: 