In [1]:
from pathlib import Path

from datasets import load_dataset, Split, Dataset

from data.dataset.tokenize import tokenizer
from data.dataset.data_augmentations import (
    flatten_conversation,
    mask_delta_beliefs,
    random_mask_beliefs,
    random_mask_prev_beliefs,
    random_mask_both_beliefs,
)
from utils import print_stage

data_dir = Path("resources/bart/")

data_files = {
    Split.TRAIN: str((data_dir / "train.history_belief").absolute()),
    Split.VALIDATION: str((data_dir / "val.history_belief").absolute()),
    Split.TEST: str((data_dir / "test.history_belief").absolute()),
}

In [2]:
# tokenization
def tokenization(examples):
    tokenized_examples = tokenizer(
        examples["masked"],
    )
    tokenized_examples["labels"] = tokenizer(
        examples["target"],
    )["input_ids"]
    return tokenized_examples


In [3]:
dataset = load_dataset(
    "data/dataset/multiwoz_dataset.py", data_files=data_files
)
print_stage("Flattening Conversation")
dataset = dataset.map(
    flatten_conversation,
    batched=True,
    remove_columns=dataset["train"].column_names,
)



Downloading and preparing dataset multi_woz_dataset/default to /data/users/cting3/.cache/huggingface/datasets/multi_woz_dataset/default-fa91ae44cff1e0ff/0.0.0/82f2807f144539f2bf2f840f4aef1af4678190d8457d6ebd5510fcde01f3c0c9...


Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset multi_woz_dataset downloaded and prepared to /data/users/cting3/.cache/huggingface/datasets/multi_woz_dataset/default-fa91ae44cff1e0ff/0.0.0/82f2807f144539f2bf2f840f4aef1af4678190d8457d6ebd5510fcde01f3c0c9. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/9 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [4]:
from datasets import set_caching_enabled
set_caching_enabled(False)

  set_caching_enabled(False)


In [5]:
print_stage("Masking All Belief Values")
masked_beliefs_final = dataset.map(
    lambda d: random_mask_both_beliefs(d, 1), remove_columns="turn"
)
masked_beliefs_final = masked_beliefs_final.map(
    tokenization,
    batched=True,
    remove_columns=masked_beliefs_final["train"].column_names,
)
masked_beliefs_final["train"].to_json("resources/tokens/masked_beliefs_final_train_token.json")
masked_beliefs_final["validation"].to_json("resources/tokens/masked_beliefs_final_dev_token.json")
masked_beliefs_final["test"].to_json("resources/tokens/masked_beliefs_final_test_token.json")



  0%|          | 0/56778 [00:00<?, ?ex/s]

  0%|          | 0/7374 [00:00<?, ?ex/s]

  0%|          | 0/7372 [00:00<?, ?ex/s]

  0%|          | 0/57 [00:00<?, ?ba/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1040 > 1024). Running this sequence through the model will result in indexing errors


  0%|          | 0/8 [00:00<?, ?ba/s]

  0%|          | 0/8 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

28995371

In [9]:
print_stage("Masking Difference of Dialogue States")
masked_deltas = dataset["train"].map(
    mask_delta_beliefs, remove_columns="turn"
)
masked_deltas = masked_deltas.map(
    tokenization, remove_columns=masked_deltas.column_names, batched=True
)
masked_deltas.to_json("resources/tokens/masked_deltas_token.json")



  0%|          | 0/56778 [00:00<?, ?ex/s]

  0%|          | 0/57 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

230470271

In [10]:
print_stage("Masking Beliefs (Easy)")
random_masked_beliefs_easy = dataset["train"].map(
    lambda d: random_mask_beliefs(d, 0.25), remove_columns="turn"
)
random_masked_beliefs_easy = random_masked_beliefs_easy.map(
    tokenization,
    remove_columns=random_masked_beliefs_easy.column_names, batched=True
)
random_masked_beliefs_easy.to_json("resources/tokens/random_masked_beliefs_easy_token.json")



  0%|          | 0/56778 [00:00<?, ?ex/s]

  0%|          | 0/57 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

230167228

In [11]:
print_stage("Masking Beliefs (Hard)")
random_masked_beliefs_hard = dataset["train"].map(
    lambda d: random_mask_beliefs(d, 0.5), remove_columns="turn"
)
random_masked_beliefs_hard = random_masked_beliefs_hard.map(
    tokenization,
    remove_columns=random_masked_beliefs_hard.column_names, batched=True
)
random_masked_beliefs_hard.to_json("resources/tokens/random_masked_beliefs_hard_token.json")



  0%|          | 0/56778 [00:00<?, ?ex/s]

  0%|          | 0/57 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

228244429

In [12]:
print_stage("Masking Previous Beliefs (Easy)")
random_masked_prev_beliefs_easy = dataset["train"].map(
    lambda d: random_mask_prev_beliefs(d, 0.25), remove_columns="turn"
)
random_masked_prev_beliefs_easy = random_masked_prev_beliefs_easy.map(
    tokenization,
    remove_columns=random_masked_prev_beliefs_easy.column_names, batched=True
)
random_masked_prev_beliefs_easy.to_json("resources/tokens/random_masked_prev_beliefs_easy_token.json")



  0%|          | 0/56778 [00:00<?, ?ex/s]

  0%|          | 0/57 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

230273831

In [13]:
print_stage("Masking Beliefs (Hard)")
random_masked_prev_beliefs_hard = dataset["train"].map(
    lambda d: random_mask_prev_beliefs(d, 0.5), remove_columns="turn"
)
random_masked_prev_beliefs_hard = random_masked_prev_beliefs_hard.map(
    tokenization,
    remove_columns=random_masked_prev_beliefs_hard.column_names, batched=True
)
random_masked_prev_beliefs_hard.to_json("resources/tokens/random_masked_prev_beliefs_hard_token.json")



  0%|          | 0/56778 [00:00<?, ?ex/s]

  0%|          | 0/57 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

228456935

In [14]:
print_stage("Masking Both Beliefs (Hard)")
random_masked_both_beliefs_hard = dataset["train"].map(
    lambda d: random_mask_both_beliefs(d, 0.5), remove_columns="turn"
)
random_masked_both_beliefs_hard = random_masked_both_beliefs_hard.map(
    tokenization,
    remove_columns=random_masked_both_beliefs_hard.column_names, batched=True
)
random_masked_both_beliefs_hard.to_json("resources/tokens/random_masked_both_beliefs_hard_token.json")



  0%|          | 0/56778 [00:00<?, ?ex/s]

  0%|          | 0/57 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

224610866

In [5]:
# print_stage("Masking Beliefs (Medium)")
# random_masked_beliefs_medium = dataset["train"].map(
#     lambda d: random_mask_beliefs(d, 0.25), remove_columns="turn"
# )
# random_masked_beliefs_medium = random_masked_beliefs_medium.map(
#     tokenization,
#     remove_columns=random_masked_beliefs_medium.column_names, batched=True
# )
# random_masked_beliefs_medium.to_json("resources/tokens/random_masked_beliefs_medium_token.json")



  0%|          | 0/56778 [00:00<?, ?ex/s]

  0%|          | 0/57 [00:00<?, ?ba/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1047 > 1024). Running this sequence through the model will result in indexing errors


Creating json from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

230168624

In [9]:
# print_stage("Masking Utterances (Easy)")
# random_masked_utterances_easy = dataset["train"].map(
#     lambda d: random_mask_utterance(d, 0.15), remove_columns="turn"
# )
# random_masked_utterances_easy = random_masked_utterances_easy.map(
#     tokenization,
#     remove_columns=random_masked_utterances_easy.column_names, batched=True
# )
# random_masked_utterances_easy.to_json("resources/tokens/random_masked_utterances_easy_token.json")



  0%|          | 0/56778 [00:00<?, ?ex/s]

  0%|          | 0/57 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

232351647

In [10]:
# print_stage("Masking Belief Entities in the Context")
# masked_context_belief_entities = dataset["train"].map(
#     mask_context_belief_entities, remove_columns="turn"
# )
# masked_context_belief_entities = masked_context_belief_entities.map(
#     tokenization,
#     remove_columns=masked_context_belief_entities.column_names, batched=True
# )
# masked_context_belief_entities.to_json("resources/tokens/masked_context_belief_entities_token.json")



  0%|          | 0/56778 [00:00<?, ?ex/s]

  0%|          | 0/57 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

230703072

In [6]:
print_stage("Masking Beliefs (Super Hard)")
random_masked_beliefs_super_hard = dataset["train"].map(
    lambda d: random_mask_beliefs(d, 0.75), remove_columns="turn"
)
random_masked_beliefs_super_hard = random_masked_beliefs_super_hard.map(
    tokenization,
    remove_columns=random_masked_beliefs_super_hard.column_names, batched=True
)
random_masked_beliefs_super_hard.to_json("resources/tokens/random_masked_beliefs_super_hard_token.json")



  0%|          | 0/56778 [00:00<?, ?ex/s]

  0%|          | 0/57 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

226566978

In [12]:
print_stage("Masking Utterances (Hard)")
random_masked_utterances_hard = dataset["train"].map(
    lambda d: random_mask_utterance(d, 0.5), remove_columns="turn"
)
random_masked_utterances_hard = random_masked_utterances_hard.map(
    tokenization,
    batched=True,
    remove_columns=random_masked_utterances_hard.column_names,
)

random_masked_utterances_hard.to_json("resources/tokens/random_masked_utterances_hard_token.json")



  0%|          | 0/56778 [00:00<?, ?ex/s]

  0%|          | 0/57 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

223608721

In [13]:
sample = masked_beliefs_final["validation"]["input_ids"][0]
tokenizer.decode(sample)

'<s><s> <|context|> <|user|> i need to book a hotel in the east that has 4 stars. <|endofcontext|> <|previousbelief|> attraction area not mentioned, attraction name not mentioned, attraction type not mentioned, hospital department not mentioned, hotel area not mentioned, hotel book day not mentioned, hotel book people not mentioned, hotel book stay not mentioned, hotel internet not mentioned, hotel name not mentioned, hotel parking not mentioned, hotel pricerange not mentioned, hotel stars not mentioned, hotel type not mentioned, restaurant area not mentioned, restaurant book day not mentioned, restaurant book people not mentioned, restaurant book time not mentioned, restaurant food not mentioned, restaurant name not mentioned, restaurant pricerange not mentioned, taxi arriveby not mentioned, taxi departure not mentioned, taxi destination not mentioned, taxi leaveat not mentioned, train arriveby not mentioned, train book people not mentioned, train day not mentioned, train departure no

In [3]:
masked_deltas = load_dataset(
    "json", data_files="resources/tokens/masked_deltas_token.json"
)["train"]
random_masked_beliefs_easy = load_dataset(
    "json", data_files="resources/tokens/random_masked_beliefs_easy_token.json"
)["train"]
random_masked_utterances_easy = load_dataset(
    "json", data_files="resources/tokens/random_masked_utterances_easy_token.json"
)["train"]
masked_context_belief_entities = load_dataset(
    "json", data_files="resources/tokens/masked_context_belief_entities_token.json"
)["train"]
random_masked_beliefs_hard = load_dataset(
    "json", data_files="resources/tokens/random_masked_beliefs_hard_token.json"
)["train"]
random_masked_utterances_hard = load_dataset(
    "json", data_files="resources/tokens/random_masked_utterances_hard_token.json"
)["train"]



  0%|          | 0/1 [00:00<?, ?it/s]



  0%|          | 0/1 [00:00<?, ?it/s]



  0%|          | 0/1 [00:00<?, ?it/s]



  0%|          | 0/1 [00:00<?, ?it/s]



  0%|          | 0/1 [00:00<?, ?it/s]



  0%|          | 0/1 [00:00<?, ?it/s]

In [5]:
tokenizer.decode(masked_deltas["input_ids"][10])

'<s><s> <|context|> <|user|> hi, i am looking for a train that is going to cambridge and arriving there by 20:45, is there anything like that? <|endofcontext|> <|previousbelief|> attraction area not mentioned, attraction name not mentioned, attraction type not mentioned, hospital department not mentioned, hotel area not mentioned, hotel book day not mentioned, hotel book people not mentioned, hotel book stay not mentioned, hotel internet not mentioned, hotel name not mentioned, hotel parking not mentioned, hotel pricerange not mentioned, hotel stars not mentioned, hotel type not mentioned, restaurant area not mentioned, restaurant book day not mentioned, restaurant book people not mentioned, restaurant book time not mentioned, restaurant food not mentioned, restaurant name not mentioned, restaurant pricerange not mentioned, taxi arriveby not mentioned, taxi departure not mentioned, taxi destination not mentioned, taxi leaveat not mentioned, train arriveby not mentioned, train book peop

In [6]:
tokenizer.decode(random_masked_beliefs_easy["input_ids"][10])

'<s><s> <|context|> <|user|> hi, i am looking for a train that is going to cambridge and arriving there by 20:45, is there anything like that? <|endofcontext|> <|previousbelief|> attraction area not mentioned, attraction name not mentioned, attraction type not mentioned, hospital department not mentioned, hotel area not mentioned, hotel book day not mentioned, hotel book people not mentioned, hotel book stay not mentioned, hotel internet not mentioned, hotel name not mentioned, hotel parking not mentioned, hotel pricerange not mentioned, hotel stars not mentioned, hotel type not mentioned, restaurant area not mentioned, restaurant book day not mentioned, restaurant book people not mentioned, restaurant book time not mentioned, restaurant food not mentioned, restaurant name not mentioned, restaurant pricerange not mentioned, taxi arriveby not mentioned, taxi departure not mentioned, taxi destination not mentioned, taxi leaveat not mentioned, train arriveby not mentioned, train book peop

In [7]:
tokenizer.decode(random_masked_utterances_easy["input_ids"][10])

'<s><s> <|context|> <|user|> <mask> i am looking for a<mask> that is going<mask> cambridge and arriving there by 20:45, is there anything like that? <|endofcontext|> <|previousbelief|> attraction area not mentioned, attraction name not mentioned, attraction type not mentioned, hospital department not mentioned, hotel area not mentioned, hotel book day not mentioned, hotel book people not mentioned, hotel book stay not mentioned, hotel internet not mentioned, hotel name not mentioned, hotel parking not mentioned, hotel pricerange not mentioned, hotel stars not mentioned, hotel type not mentioned, restaurant area not mentioned, restaurant book day not mentioned, restaurant book people not mentioned, restaurant book time not mentioned, restaurant food not mentioned, restaurant name not mentioned, restaurant pricerange not mentioned, taxi arriveby not mentioned, taxi departure not mentioned, taxi destination not mentioned, taxi leaveat not mentioned, train arriveby not mentioned, train boo

In [8]:
tokenizer.decode(masked_context_belief_entities["input_ids"][10])

'<s><s> <|context|> <|user|> hi, i am looking for a train that is going to<mask> and arriving there by<mask> , is there anything like that? <|endofcontext|> <|previousbelief|> attraction area not mentioned, attraction name not mentioned, attraction type not mentioned, hospital department not mentioned, hotel area not mentioned, hotel book day not mentioned, hotel book people not mentioned, hotel book stay not mentioned, hotel internet not mentioned, hotel name not mentioned, hotel parking not mentioned, hotel pricerange not mentioned, hotel stars not mentioned, hotel type not mentioned, restaurant area not mentioned, restaurant book day not mentioned, restaurant book people not mentioned, restaurant book time not mentioned, restaurant food not mentioned, restaurant name not mentioned, restaurant pricerange not mentioned, taxi arriveby not mentioned, taxi departure not mentioned, taxi destination not mentioned, taxi leaveat not mentioned, train arriveby not mentioned, train book people 

In [None]:
masked_beliefs_final_train = load_dataset(
    "json", data_files="resources/tokens/masked_beliefs_final_train_token.json"
)["train"]
masked_beliefs_final_dev = load_dataset(
    "json", data_files="resources/tokens/masked_beliefs_final_dev_token.json"
)["train"]
masked_beliefs_final_test = load_dataset(
    "json", data_files="resources/tokens/masked_beliefs_final_test_token.json"
)["train"]

In [None]:
from transformers import (
    TrainingArguments,
    BartForConditionalGeneration,
    default_data_collator,
)

from trainer.curriculum_trainer import CurriculumTrainer
from data.dataset.tokenize import tokenizer

In [None]:
name = "dataset_test"
BATCH_SIZE = 2
EPOCHS = 10
curriculum_datasets = [
    masked_deltas["train"],
    random_masked_beliefs_easy["train"],
    random_masked_utterances_easy["train"],
    masked_context_belief_entities["train"],
    random_masked_beliefs_hard["train"],
    random_masked_utterances_hard["train"],
]

model = BartForConditionalGeneration.from_pretrained(
    "facebook/bart-base"
)#.to(device)
model.resize_token_embeddings(len(tokenizer))

# setup trainer
args = TrainingArguments(
    output_dir=f"checkpoints/{name}",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=EPOCHS,
    weight_decay=0.01,
    dataloader_num_workers=0,
    local_rank=-1,
    load_best_model_at_end=True,
    # resume_from_checkpoint=f"{name}/checkpoint-19000",
)

In [None]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer)
trainer = CurriculumTrainer(
    curriculum_datasets,
    model,
    args,
    train_dataset=masked_beliefs_final_train["train"],
    eval_dataset=masked_beliefs_final_dev["train"],
    data_collator=data_collator,
    # compute_metrics=test_compute_metrics
    # callbacks=[MyCallback],  # We can either pass the callback class this way or an instance of it (MyCallback())
)
trainer.curriculum_train()

In [None]:
tokenizer.encode("<mask>")

In [None]:
tokenizer.decode(30415)

In [None]:
data_collator()