In [None]:
# Example Code from PyTorch Lightning Official Website
# import torch
# from torch import nn
# from torch.nn import functional as F
# from torch.utils.data import DataLoader
# from torch.utils.data import random_split
# from torchvision.datasets import MNIST
# from torchvision import transforms
# import pytorch_lightning as pl

# class LitAutoEncoder(pl.LightningModule):
# 	def __init__(self):
# 		super().__init__()
# 		self.encoder = nn.Sequential(
#       nn.Linear(28 * 28, 64),
#       nn.ReLU(),
#       nn.Linear(64, 3))
# 		self.decoder = nn.Sequential(
#       nn.Linear(3, 64),
#       nn.ReLU(),
#       nn.Linear(64, 28 * 28))

# 	def forward(self, x):
# 		embedding = self.encoder(x)
# 		return embedding

# 	def configure_optimizers(self):
# 		optimizer = torch.optim.Adam(self.parameters(), lr=1e-3)
# 		return optimizer

# 	def training_step(self, train_batch, batch_idx):
# 		x, y = train_batch
# 		x = x.view(x.size(0), -1)
# 		z = self.encoder(x)    
# 		x_hat = self.decoder(z)
# 		loss = F.mse_loss(x_hat, x)
# 		self.log('train_loss', loss)
# 		return loss

# 	def validation_step(self, val_batch, batch_idx):
# 		x, y = val_batch
# 		x = x.view(x.size(0), -1)
# 		z = self.encoder(x)
# 		x_hat = self.decoder(z)
# 		loss = F.mse_loss(x_hat, x)
# 		self.log('val_loss', loss)

# # data
# dataset = MNIST('', train=True, download=True, transform=transforms.ToTensor())
# mnist_train, mnist_val = random_split(dataset, [55000, 5000])

# train_loader = DataLoader(mnist_train, batch_size=32)
# val_loader = DataLoader(mnist_val, batch_size=32)

# # model
# model = LitAutoEncoder()

# # training
# trainer = pl.Trainer(gpus=4, num_nodes=8, precision=16, limit_train_batches=0.5)
# trainer.fit(model, train_loader, val_loader)
    


In [32]:
from datetime import datetime
from typing import Optional

import datasets
import torch
import pytorch_lightning as pl
from pytorch_lightning import LightningModule, Trainer, seed_everything, LightningDataModule
from torch.utils.data import DataLoader
from transformers import (
    AdamW,
    AutoConfig,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    get_linear_schedule_with_warmup,
)

AVAIL_GPUS = min(1, torch.cuda.device_count())
AVAIL_GPUS

0

In [2]:
class GLUEDataModule(LightningDataModule):

    task_text_field_map = {
        "cola": ["sentence"],
        "sst2": ["sentence"],
        "mrpc": ["sentence1", "sentence2"],
        "qqp": ["question1", "question2"],
        "stsb": ["sentence1", "sentence2"],
        "mnli": ["premise", "hypothesis"],
        "qnli": ["question", "sentence"],
        "rte": ["sentence1", "sentence2"],
        "wnli": ["sentence1", "sentence2"],
        "ax": ["premise", "hypothesis"],
    }

    glue_task_num_labels = {
        "cola": 2,
        "sst2": 2,
        "mrpc": 2,
        "qqp": 2,
        "stsb": 1,
        "mnli": 3,
        "qnli": 2,
        "rte": 2,
        "wnli": 2,
        "ax": 3,
    }

    loader_columns = [
        "datasets_idx",
        "input_ids",
        "token_type_ids",
        "attention_mask",
        "start_positions",
        "end_positions",
        "labels",
    ]

    def __init__(
        self,
        model_name_or_path: str,
        task_name: str = "mrpc",
        max_seq_length: int = 128,
        train_batch_size: int = 32,
        eval_batch_size: int = 32,
        **kwargs,
    ):
        super().__init__()
        self.model_name_or_path = model_name_or_path
        self.task_name = task_name
        self.max_seq_length = max_seq_length
        self.train_batch_size = train_batch_size
        self.eval_batch_size = eval_batch_size

        self.text_fields = self.task_text_field_map[task_name]
        self.num_labels = self.glue_task_num_labels[task_name]
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name_or_path, use_fast=True)

    def setup(self, stage: str):
        self.dataset = datasets.load_dataset("glue", self.task_name)
        
        # for having different mapping according to different loader
        for split in self.dataset.keys():
            self.dataset[split] = self.dataset[split].map(
                self.convert_to_features,
                batched=True,
                remove_columns=["label"],
            )
            self.columns = [c for c in self.dataset[split].column_names if c in self.loader_columns]
            self.dataset[split].set_format(type="torch", columns=self.columns)

        self.eval_splits = [x for x in self.dataset.keys() if "validation" in x]

    def prepare_data(self):
        # for loading dataset
        datasets.load_dataset("glue", self.task_name)
        # for loading the tokenizer
        AutoTokenizer.from_pretrained(self.model_name_or_path, use_fast=True)

    def train_dataloader(self):
        return DataLoader(self.dataset["train"], batch_size=self.train_batch_size)

    def val_dataloader(self):
        if len(self.eval_splits) == 1:
            return DataLoader(self.dataset["validation"], batch_size=self.eval_batch_size)
        elif len(self.eval_splits) > 1:
            return [DataLoader(self.dataset[x], batch_size=self.eval_batch_size) for x in self.eval_splits]

    def test_dataloader(self):
        if len(self.eval_splits) == 1:
            return DataLoader(self.dataset["test"], batch_size=self.eval_batch_size)
        elif len(self.eval_splits) > 1:
            return [DataLoader(self.dataset[x], batch_size=self.eval_batch_size) for x in self.eval_splits]

    # the mapping function
    def convert_to_features(self, example_batch, indices=None):

        # Either encode single sentence or sentence pairs
        if len(self.text_fields) > 1:
            texts_or_text_pairs = list(zip(example_batch[self.text_fields[0]], example_batch[self.text_fields[1]]))
        else:
            texts_or_text_pairs = example_batch[self.text_fields[0]]

        # Tokenize the text/text pairs
        features = self.tokenizer.batch_encode_plus(
            texts_or_text_pairs, max_length=self.max_seq_length, pad_to_max_length=True, truncation=True
        )

        # Rename label to labels to make it easier to pass to model forward
        features["labels"] = example_batch["label"]

        return features

In [3]:
dm = GLUEDataModule("distilbert-base-uncased")
dm.prepare_data()
dm.setup("fit")
next(iter(dm.train_dataloader()))

Reusing dataset glue (/Users/daohuei/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


  0%|          | 0/3 [00:00<?, ?it/s]

Reusing dataset glue (/Users/daohuei/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


  0%|          | 0/3 [00:00<?, ?it/s]

Loading cached processed dataset at /Users/daohuei/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-e351be6b4ac3b7f1.arrow


  0%|          | 0/1 [00:00<?, ?ba/s]



  0%|          | 0/2 [00:00<?, ?ba/s]

{'input_ids': tensor([[  101,  2572,  3217,  ...,     0,     0,     0],
         [  101,  9805,  3540,  ...,     0,     0,     0],
         [  101,  2027,  2018,  ...,     0,     0,     0],
         ...,
         [  101,  1996,  2922,  ...,     0,     0,     0],
         [  101,  6202,  1999,  ...,     0,     0,     0],
         [  101, 16565,  2566,  ...,     0,     0,     0]]),
 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]),
 'labels': tensor([1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1,
         1, 1, 0, 0, 1, 1, 1, 0])}

In [4]:
from transformers import BartForConditionalGeneration, BartTokenizer

# model = BartForConditionalGeneration.from_pretrained("facebook/bart-large", forced_bos_token_id=0)
# tok = BartTokenizer.from_pretrained("facebook/bart-large")
example_english_phrase = "UN Chief Says There Is No <mask> in Syria"
# batch = tok(example_english_phrase, return_tensors="pt")
# generated_ids = model.generate(batch["input_ids"])
# assert tok.batch_decode(generated_ids, skip_special_tokens=True) == [
#     "UN Chief Says There Is No Plan to Stop Chemical Weapons in Syria"
# ]
class DSTBart(LightningModule):
    def __init__(
        self,
        # model_name_or_path: str,
        num_labels: int = 0,
        task_name: str = "dst",
        learning_rate: float = 2e-5,
        adam_epsilon: float = 1e-8,
        warmup_steps: int = 0,
        weight_decay: float = 0.0,
        train_batch_size: int = 32,
        eval_batch_size: int = 32,
        eval_splits: Optional[list] = None,
        **kwargs,
    ):
        super().__init__()

        self.save_hyperparameters()
        self.model_name_or_path = "facebook/bart-base"
        self.config = AutoConfig.from_pretrained(self.model_name_or_path)
        self.model = BartForConditionalGeneration.from_pretrained(self.model_name_or_path, forced_bos_token_id=0)
        # self.model = AutoModelForSequenceClassification.from_pretrained(model_name_or_path, config=self.config)
        # self.metric = datasets.load_metric(
        #     "glue", self.hparams.task_name, experiment_id=datetime.now().strftime("%d-%m-%Y_%H-%M-%S")
        # )

    
    def forward(self, **inputs):
        return self.model(**inputs)

    def training_step(self, batch, batch_idx):
        outputs = self(**batch)
        loss = outputs[0]
        return loss

    def validation_step(self, batch, batch_idx, dataloader_idx=0):
        outputs = self(**batch)
        val_loss, logits = outputs[:2]
        
        # need to embeded in post-processing code
        if self.hparams.num_labels >= 1:
            preds = torch.argmax(logits, axis=1)
        elif self.hparams.num_labels == 1:
            preds = logits.squeeze()

        labels = batch["labels"]

        return {"loss": val_loss, "preds": preds, "labels": labels}

    def validation_epoch_end(self, outputs):
        # need to embeded in relative slot accuracy calculation
        if self.hparams.task_name == "mnli":
            for i, output in enumerate(outputs):
                # matched or mismatched
                split = self.hparams.eval_splits[i].split("_")[-1]
                # evaluation
                preds = torch.cat([x["preds"] for x in output]).detach().cpu().numpy()
                labels = torch.cat([x["labels"] for x in output]).detach().cpu().numpy()
                loss = torch.stack([x["loss"] for x in output]).mean()
                self.log(f"val_loss_{split}", loss, prog_bar=True)
                split_metrics = {
                    f"{k}_{split}": v for k, v in self.metric.compute(predictions=preds, references=labels).items()
                }
                self.log_dict(split_metrics, prog_bar=True)
            return loss

        preds = torch.cat([x["preds"] for x in outputs]).detach().cpu().numpy()
        labels = torch.cat([x["labels"] for x in outputs]).detach().cpu().numpy()
        loss = torch.stack([x["loss"] for x in outputs]).mean()
        self.log("val_loss", loss, prog_bar=True)
        self.log_dict(self.metric.compute(predictions=preds, references=labels), prog_bar=True)
        return loss

    def setup(self, stage=None) -> None:
        # ignore if in training stage
        if stage != "fit":
            return
        # Get dataloader by calling it - train_dataloader() is called after setup() by default
        train_loader = self.trainer.datamodule.train_dataloader()

        # Calculate total steps
        tb_size = self.hparams.train_batch_size * max(1, self.trainer.gpus)
        ab_size = self.trainer.accumulate_grad_batches * float(self.trainer.max_epochs)
        self.total_steps = (len(train_loader.dataset) // tb_size) // ab_size


    def configure_optimizers(self):
        """Prepare optimizer and schedule (linear warmup and decay)"""
        model = self.model
        # customize: do not do weight decay at bias weight and weights in LayerNorm Module
        no_decay = ["bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
            {
                "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
                "weight_decay": self.hparams.weight_decay,
            },
            {
                "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
                "weight_decay": 0.0,
            },
        ]
        optimizer = AdamW(optimizer_grouped_parameters, lr=self.hparams.learning_rate, eps=self.hparams.adam_epsilon)

        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=self.hparams.warmup_steps,
            num_training_steps=self.total_steps,
        )
        scheduler = {"scheduler": scheduler, "interval": "step", "frequency": 1}
        return [optimizer], [scheduler]

class GLUETransformer(LightningModule):
    def __init__(
        self,
        model_name_or_path: str,
        num_labels: int,
        task_name: str,
        learning_rate: float = 2e-5,
        adam_epsilon: float = 1e-8,
        warmup_steps: int = 0,
        weight_decay: float = 0.0,
        train_batch_size: int = 32,
        eval_batch_size: int = 32,
        eval_splits: Optional[list] = None,
        **kwargs,
    ):
        super().__init__()

        self.save_hyperparameters()

        self.config = AutoConfig.from_pretrained(model_name_or_path, num_labels=num_labels)
        self.model = AutoModelForSequenceClassification.from_pretrained(model_name_or_path, config=self.config)
        self.metric = datasets.load_metric(
            "glue", self.hparams.task_name, experiment_id=datetime.now().strftime("%d-%m-%Y_%H-%M-%S")
        )

    def forward(self, **inputs):
        return self.model(**inputs)

    def training_step(self, batch, batch_idx):
        outputs = self(**batch)
        loss = outputs[0]
        return loss

    def validation_step(self, batch, batch_idx, dataloader_idx=0):
        outputs = self(**batch)
        val_loss, logits = outputs[:2]

        if self.hparams.num_labels >= 1:
            preds = torch.argmax(logits, axis=1)
        elif self.hparams.num_labels == 1:
            preds = logits.squeeze()

        labels = batch["labels"]

        return {"loss": val_loss, "preds": preds, "labels": labels}

    def validation_epoch_end(self, outputs):
        if self.hparams.task_name == "mnli":
            for i, output in enumerate(outputs):
                # matched or mismatched
                split = self.hparams.eval_splits[i].split("_")[-1]
                preds = torch.cat([x["preds"] for x in output]).detach().cpu().numpy()
                labels = torch.cat([x["labels"] for x in output]).detach().cpu().numpy()
                loss = torch.stack([x["loss"] for x in output]).mean()
                self.log(f"val_loss_{split}", loss, prog_bar=True)
                split_metrics = {
                    f"{k}_{split}": v for k, v in self.metric.compute(predictions=preds, references=labels).items()
                }
                self.log_dict(split_metrics, prog_bar=True)
            return loss

        preds = torch.cat([x["preds"] for x in outputs]).detach().cpu().numpy()
        labels = torch.cat([x["labels"] for x in outputs]).detach().cpu().numpy()
        loss = torch.stack([x["loss"] for x in outputs]).mean()
        self.log("val_loss", loss, prog_bar=True)
        self.log_dict(self.metric.compute(predictions=preds, references=labels), prog_bar=True)
        return loss

    def setup(self, stage=None) -> None:
        if stage != "fit":
            return
        # Get dataloader by calling it - train_dataloader() is called after setup() by default
        train_loader = self.trainer.datamodule.train_dataloader()

        # Calculate total steps
        tb_size = self.hparams.train_batch_size * max(1, self.trainer.gpus)
        ab_size = self.trainer.accumulate_grad_batches * float(self.trainer.max_epochs)
        self.total_steps = (len(train_loader.dataset) // tb_size) // ab_size

    def configure_optimizers(self):
        """Prepare optimizer and schedule (linear warmup and decay)"""
        model = self.model
        # customize: do not do weight decay at bias weight and weights in LayerNorm Module
        no_decay = ["bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
            {
                "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
                "weight_decay": self.hparams.weight_decay,
            },
            {
                "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
                "weight_decay": 0.0,
            },
        ]
        optimizer = AdamW(optimizer_grouped_parameters, lr=self.hparams.learning_rate, eps=self.hparams.adam_epsilon)

        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=self.hparams.warmup_steps,
            num_training_steps=self.total_steps,
        )
        scheduler = {"scheduler": scheduler, "interval": "step", "frequency": 1}
        return [optimizer], [scheduler]

In [5]:
seed_everything(42)

dm = GLUEDataModule(model_name_or_path="albert-base-v2", task_name="cola")
dm.setup("fit")
model = GLUETransformer(
    model_name_or_path="albert-base-v2",
    num_labels=dm.num_labels,
    eval_splits=dm.eval_splits,
    task_name=dm.task_name,
)

trainer = Trainer(max_epochs=1, gpus=AVAIL_GPUS)
trainer.fit(model, datamodule=dm)

Global seed set to 42
Reusing dataset glue (/Users/daohuei/.cache/huggingface/datasets/glue/cola/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


  0%|          | 0/3 [00:00<?, ?it/s]

Loading cached processed dataset at /Users/daohuei/.cache/huggingface/datasets/glue/cola/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-7126e8f9bcee5c6a.arrow


  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

Some weights of the model checkpoint at albert-base-v2 were not used when initializing AlbertForSequenceClassification: ['predictions.LayerNorm.weight', 'predictions.decoder.bias', 'predictions.bias', 'predictions.LayerNorm.bias', 'predictions.decoder.weight', 'predictions.dense.bias', 'predictions.dense.weight']
- This IS expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You sho

  0%|          | 0/3 [00:00<?, ?it/s]

Missing logger folder: /Users/daohuei/CodeNest/ucsc-nlp-244-dst/lightning_logs
Reusing dataset glue (/Users/daohuei/.cache/huggingface/datasets/glue/cola/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?ba/s]



  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

  rank_zero_deprecation(

  | Name  | Type                            | Params
----------------------------------------------------------
0 | model | AlbertForSequenceClassification | 11.7 M
----------------------------------------------------------
11.7 M    Trainable params
0         Non-trainable params
11.7 M    Total params
46.740    Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

  rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")


In [6]:
dm = GLUEDataModule(
    model_name_or_path="distilbert-base-cased",
    task_name="mnli",
)
dm.setup("fit")
model = GLUETransformer(
    model_name_or_path="distilbert-base-cased",
    num_labels=dm.num_labels,
    eval_splits=dm.eval_splits,
    task_name=dm.task_name,
)

trainer = Trainer(gpus=AVAIL_GPUS, progress_bar_refresh_rate=20)
trainer.validate(model, dm.val_dataloader())

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/411 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/208k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/426k [00:00<?, ?B/s]

Downloading and preparing dataset glue/mnli (download: 298.29 MiB, generated: 78.65 MiB, post-processed: Unknown size, total: 376.95 MiB) to /Users/daohuei/.cache/huggingface/datasets/glue/mnli/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad...


Downloading data:   0%|          | 0.00/313M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/392702 [00:00<?, ? examples/s]

Generating validation_matched split:   0%|          | 0/9815 [00:00<?, ? examples/s]

Generating validation_mismatched split:   0%|          | 0/9832 [00:00<?, ? examples/s]

Generating test_matched split:   0%|          | 0/9796 [00:00<?, ? examples/s]

Generating test_mismatched split:   0%|          | 0/9847 [00:00<?, ? examples/s]

Dataset glue downloaded and prepared to /Users/daohuei/.cache/huggingface/datasets/glue/mnli/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad. Subsequent calls will reuse this data.


  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/393 [00:00<?, ?ba/s]



  0%|          | 0/10 [00:00<?, ?ba/s]

  0%|          | 0/10 [00:00<?, ?ba/s]

  0%|          | 0/10 [00:00<?, ?ba/s]

  0%|          | 0/10 [00:00<?, ?ba/s]

Downloading:   0%|          | 0.00/251M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-cased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.weight', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.weight', 'classifier.bias', 'pre_classifier.wei

Validation: 0it [00:00, ?it/s]

────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
     Validate metric           DataLoader 0             DataLoader 1
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
    accuracy_matched        0.3175751268863678       0.3175751268863678
   accuracy_mismatched       0.318856805562973        0.318856805562973
    val_loss_matched        1.1064648628234863       1.1064648628234863
   val_loss_mismatched      1.1063889265060425       1.1063889265060425
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


[{'val_loss_matched': 1.1064648628234863,
  'accuracy_matched': 0.3175751268863678,
  'val_loss_mismatched': 1.1063889265060425,
  'accuracy_mismatched': 0.318856805562973},
 {'val_loss_matched': 1.1064648628234863,
  'accuracy_matched': 0.3175751268863678,
  'val_loss_mismatched': 1.1063889265060425,
  'accuracy_mismatched': 0.318856805562973}]

In [7]:
from pathlib import Path

from datasets import load_dataset, Split

from data import *

In [12]:
# load dialogue history and target belief

data_dir = Path('resources/bart/')

data_files = {
    Split.TRAIN: str((data_dir / 'train.history_belief').absolute()),
    Split.VALIDATION: str((data_dir / 'val.history_belief').absolute()),
    Split.TEST: str((data_dir / 'test.history_belief').absolute())
}

dataset = load_dataset('data/dataset/multiwoz_dataset.py', data_files=data_files)

Using custom data configuration default-bfc7feb1c1e93a63
Reusing dataset multi_woz_dataset (/Users/daohuei/.cache/huggingface/datasets/multi_woz_dataset/default-bfc7feb1c1e93a63/0.0.0/214b5f86ecf324bff982d075298ac2b6490bbca4895dda93d15c69bf8e8274e7)


  0%|          | 0/3 [00:00<?, ?it/s]

In [13]:
# flatten dialogues into turn-level, and also include the previous belief(dialogue state)
dataset = dataset.map(flatten_conversation, batched=True, remove_columns=dataset['train'].column_names)

Loading cached processed dataset at /Users/daohuei/.cache/huggingface/datasets/multi_woz_dataset/default-bfc7feb1c1e93a63/0.0.0/214b5f86ecf324bff982d075298ac2b6490bbca4895dda93d15c69bf8e8274e7/cache-d16f23eccc4b23c9.arrow
Loading cached processed dataset at /Users/daohuei/.cache/huggingface/datasets/multi_woz_dataset/default-bfc7feb1c1e93a63/0.0.0/214b5f86ecf324bff982d075298ac2b6490bbca4895dda93d15c69bf8e8274e7/cache-729a1d081f6da48b.arrow
Loading cached processed dataset at /Users/daohuei/.cache/huggingface/datasets/multi_woz_dataset/default-bfc7feb1c1e93a63/0.0.0/214b5f86ecf324bff982d075298ac2b6490bbca4895dda93d15c69bf8e8274e7/cache-68c0f4701c954fae.arrow


In [25]:
# mask specifically on the value that is having changes in the current state
masked_deltas = dataset.map(mask_delta_beliefs, remove_columns='turn')

Loading cached processed dataset at /Users/daohuei/.cache/huggingface/datasets/multi_woz_dataset/default-bfc7feb1c1e93a63/0.0.0/214b5f86ecf324bff982d075298ac2b6490bbca4895dda93d15c69bf8e8274e7/cache-61f9fa801989862d.arrow
Loading cached processed dataset at /Users/daohuei/.cache/huggingface/datasets/multi_woz_dataset/default-bfc7feb1c1e93a63/0.0.0/214b5f86ecf324bff982d075298ac2b6490bbca4895dda93d15c69bf8e8274e7/cache-4b08653818633b57.arrow
Loading cached processed dataset at /Users/daohuei/.cache/huggingface/datasets/multi_woz_dataset/default-bfc7feb1c1e93a63/0.0.0/214b5f86ecf324bff982d075298ac2b6490bbca4895dda93d15c69bf8e8274e7/cache-174b0d14d0ea69dd.arrow


In [26]:
# randomly mask 15% of the state value
random_masked_beliefs = dataset.map(lambda d: random_mask_beliefs(d, 0.15), remove_columns='turn')

  0%|          | 0/56778 [00:00<?, ?ex/s]

  0%|          | 0/7374 [00:00<?, ?ex/s]

  0%|          | 0/7372 [00:00<?, ?ex/s]

In [29]:
# masked out the entity value in the utterance that should be predict as belief value
masked_context_belief_entities = dataset.map(mask_context_belief_entities, remove_columns='turn')

  0%|          | 0/56778 [00:00<?, ?ex/s]

  0%|          | 0/7374 [00:00<?, ?ex/s]

  0%|          | 0/7372 [00:00<?, ?ex/s]

In [31]:
# randomly mask utterance tokens
random_masked_utterances = dataset.map(lambda d: random_mask_utterance(d, 0.15), remove_columns='turn')

  0%|          | 0/56778 [00:00<?, ?ex/s]

  0%|          | 0/7374 [00:00<?, ?ex/s]

  0%|          | 0/7372 [00:00<?, ?ex/s]

In [None]:
class DSTDataModule(LightningDataModule):

    # task_text_field_map = {
    #     "cola": ["sentence"],
    #     "sst2": ["sentence"],
    #     "mrpc": ["sentence1", "sentence2"],
    #     "qqp": ["question1", "question2"],
    #     "stsb": ["sentence1", "sentence2"],
    #     "mnli": ["premise", "hypothesis"],
    #     "qnli": ["question", "sentence"],
    #     "rte": ["sentence1", "sentence2"],
    #     "wnli": ["sentence1", "sentence2"],
    #     "ax": ["premise", "hypothesis"],
    # }

    # glue_task_num_labels = {
    #     "cola": 2,
    #     "sst2": 2,
    #     "mrpc": 2,
    #     "qqp": 2,
    #     "stsb": 1,
    #     "mnli": 3,
    #     "qnli": 2,
    #     "rte": 2,
    #     "wnli": 2,
    #     "ax": 3,
    # }

    loader_columns = [
        # "datasets_idx",
        # "input_ids",
        # "token_type_ids",
        # "attention_mask",
        # "start_positions",
        # "end_positions",
        # "labels",
    ]

    def __init__(
        self,
        model_name_or_path: str,
        task_name: str = "dst",
        max_seq_length: int = 128,
        train_batch_size: int = 32,
        eval_batch_size: int = 32,
        **kwargs,
    ):
        super().__init__()
        self.model_name_or_path = model_name_or_path
        self.task_name = task_name
        self.max_seq_length = max_seq_length
        self.train_batch_size = train_batch_size
        self.eval_batch_size = eval_batch_size

        # self.text_fields = self.task_text_field_map[task_name]
        # self.num_labels = self.glue_task_num_labels[task_name]
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name_or_path, use_fast=True)

    def setup(self, stage: str):
#         self.dataset = datasets.load_dataset("glue", self.task_name)
        
#         # for having different mapping according to different loader
#         for split in self.dataset.keys():
#             self.dataset[split] = self.dataset[split].map(
#                 self.convert_to_features,
#                 batched=True,
#                 remove_columns=["label"],
#             )
#             self.columns = [c for c in self.dataset[split].column_names if c in self.loader_columns]
#             self.dataset[split].set_format(type="torch", columns=self.columns)

#         self.eval_splits = [x for x in self.dataset.keys() if "validation" in x]

        # load dialogue history and target belief

        data_dir = Path('resources/bart/')

        data_files = {
            Split.TRAIN: str((data_dir / 'train.history_belief').absolute()),
            Split.VALIDATION: str((data_dir / 'val.history_belief').absolute()),
            Split.TEST: str((data_dir / 'test.history_belief').absolute())
        }

        self.dataset = load_dataset('data/dataset/multiwoz_dataset.py', data_files=data_files)

    # def prepare_data(self):
    #     # for loading dataset
    #     datasets.load_dataset("glue", self.task_name)
    #     # for loading the tokenizer
    #     AutoTokenizer.from_pretrained(self.model_name_or_path, use_fast=True)

    def train_dataloader(self):
        return DataLoader(self.dataset["train"], batch_size=self.train_batch_size)

    def val_dataloader(self):
        if len(self.eval_splits) == 1:
            return DataLoader(self.dataset["validation"], batch_size=self.eval_batch_size)
        elif len(self.eval_splits) > 1:
            return [DataLoader(self.dataset[x], batch_size=self.eval_batch_size) for x in self.eval_splits]

    def test_dataloader(self):
        if len(self.eval_splits) == 1:
            return DataLoader(self.dataset["test"], batch_size=self.eval_batch_size)
        elif len(self.eval_splits) > 1:
            return [DataLoader(self.dataset[x], batch_size=self.eval_batch_size) for x in self.eval_splits]

#     # the mapping function
#     def convert_to_features(self, example_batch, indices=None):

#         # Either encode single sentence or sentence pairs
#         if len(self.text_fields) > 1:
#             texts_or_text_pairs = list(zip(example_batch[self.text_fields[0]], example_batch[self.text_fields[1]]))
#         else:
#             texts_or_text_pairs = example_batch[self.text_fields[0]]

#         # Tokenize the text/text pairs
#         features = self.tokenizer.batch_encode_plus(
#             texts_or_text_pairs, max_length=self.max_seq_length, pad_to_max_length=True, truncation=True
#         )

#         # Rename label to labels to make it easier to pass to model forward
#         features["labels"] = example_batch["label"]

#         return features