In [20]:
import argparse
from collections import defaultdict

import numpy as np

import torch
from transformers import BertConfig, BertTokenizerFast, TrainingArguments, Trainer

from utils import seed_everything, empty_cuda_cache, compute_metrics
from modeling import JointBERT
from data_loader import LoadDataset
from data_tokenizer import TokenizeDataset

In [22]:
# Run configuration. Every numeric option declares its type so that a value
# supplied on a real command line is converted to a number instead of being
# left as a string (argparse only leaves the defaults untouched).
parser = argparse.ArgumentParser()
parser.add_argument('--task', type=str, default='snips',
                    help='sub-directory of ./data to train and evaluate on')
parser.add_argument('--epoch', type=int, default=30,
                    help='number of training epochs')
parser.add_argument('--lr', type=float, default=5e-5,
                    help='optimizer learning rate')
parser.add_argument('--batch', type=int, default=128,
                    help='per-device training batch size')
parser.add_argument('--seed', type=int, default=1234,
                    help='random seed passed to seed_everything')
# An explicit empty argument list makes argparse ignore sys.argv, so inside a
# notebook only the defaults above are used.
args = parser.parse_args(args=[])

TASK = args.task
EPOCH = args.epoch
LR = args.lr
BATCH_SIZE = args.batch
SEED = args.seed
print('============================================================')
print(f'TASK: {TASK}')
print(f'EPOCH: {EPOCH}')
print(f'LR: {LR}')
print(f'BATCH_SIZE: {BATCH_SIZE}')
print(f'SEED: {SEED}')

TASK: snips
EPOCH: 30
LR: 5e-05
BATCH_SIZE: 128
SEED: 1234


In [23]:
# Seed via the project helper from utils before any data or model is created.
# NOTE(review): which RNGs it covers (random / numpy / torch) is not visible here — confirm in utils.
seed_everything(SEED)

In [4]:
# Short alias for the project loader used by every line in this cell.
_load = LoadDataset.load_dataset

# Input sequences for each split.
seq_train = _load(f'./data/{TASK}/train/seq.in')
seq_dev = _load(f'./data/{TASK}/dev/seq.in')
seq_test = _load(f'./data/{TASK}/test/seq.in')

# Intent annotations for each split, followed by the intent label vocabulary.
intent_train = _load(f'./data/{TASK}/train/label')
intent_dev = _load(f'./data/{TASK}/dev/label')
intent_test = _load(f'./data/{TASK}/test/label')
intent_labels = _load(f'./data/{TASK}/intent_label_vocab')

# Slot annotations for each split (loaded with slot=True), followed by the
# slot label vocabulary.
slot_train = _load(f'./data/{TASK}/train/seq.out', slot=True)
slot_dev = _load(f'./data/{TASK}/dev/seq.out', slot=True)
slot_test = _load(f'./data/{TASK}/test/seq.out', slot=True)
slot_labels = _load(f'./data/{TASK}/slot_label_vocab')

# label -> index and index -> label tables. Because the forward tables are
# defaultdict(int), looking up a label that is not in the vocabulary yields 0.
intent_word2idx = defaultdict(int, ((label, idx) for idx, label in enumerate(intent_labels)))
intent_idx2word = dict(enumerate(intent_labels))

slot_word2idx = defaultdict(int, ((label, idx) for idx, label in enumerate(slot_labels)))
slot_idx2word = dict(enumerate(slot_labels))

In [5]:
# Tokenizer loaded from the same checkpoint name as the encoder below.
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")

# Checkpoint configuration, extended with the intent label tables built earlier.
model_config = BertConfig.from_pretrained(
    "bert-base-uncased",
    num_labels=len(intent_idx2word),
    problem_type="single_label_classification",
    id2label=intent_idx2word,
    label2id=intent_word2idx,
)

# Joint intent/slot model initialised from the pretrained checkpoint.
model = JointBERT.from_pretrained(
    "bert-base-uncased",
    config=model_config,
    intent_labels=intent_labels,
    slot_labels=slot_labels,
)

# Use the GPU when one is available, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
model.to(device);  # trailing ';' suppresses the notebook's repr of the model

Some weights of the model checkpoint at bert-base-uncased were not used when initializing JointBERT: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing JointBERT from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing JointBERT from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of JointBERT were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['slot_classifier.linear.weight', 

In [6]:
# Build one TokenizeDataset per split; every split shares the same label
# lookup tables and tokenizer, only the (sequence, intent, slot) inputs differ.
train_dataset, dev_dataset, test_dataset = (
    TokenizeDataset(split_seqs, split_intents, split_slots, intent_word2idx, slot_word2idx, tokenizer)
    for split_seqs, split_intents, split_slots in (
        (seq_train, intent_train, slot_train),
        (seq_dev, intent_dev, slot_dev),
        (seq_test, intent_test, slot_test),
    )
)

In [7]:
# Training configuration for the Hugging Face Trainer.
arguments = TrainingArguments(
    output_dir='checkpoints',
    do_train=True,
    do_eval=True,

    num_train_epochs=EPOCH,
    learning_rate=LR,

    # Checkpoint and evaluate once per epoch, keep at most two checkpoints on
    # disk, and reload the best-scoring checkpoint when training finishes.
    save_strategy="epoch",
    save_total_limit=2,
    evaluation_strategy="epoch",
    load_best_model_at_end=True,

    # Disable external experiment-tracking integrations.
    report_to='none',

    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=64,
    gradient_accumulation_steps=1,
    dataloader_num_workers=0,
    # Mixed precision is only requested when CUDA is present: the notebook
    # falls back to the CPU when no GPU is available, and fp16 AMP is not
    # accepted by TrainingArguments in that case.
    fp16=torch.cuda.is_available(),
)

# Train on the training split and evaluate on the dev split.
trainer = Trainer(
    model,
    arguments,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
)

Using cuda_amp half precision backend


In [8]:
# Call the project's empty_cuda_cache() helper before training starts
# (presumably releases cached GPU memory — confirm in utils), then fine-tune.
empty_cuda_cache()
trainer.train()

# Save the model under a directory name that records the task and the
# configured epoch count.
final_model_dir = f"checkpoints/{TASK}_ep{EPOCH}"
model.save_pretrained(final_model_dir)

***** Running training *****
  Num examples = 13084
  Num Epochs = 1
  Instantaneous batch size per device = 128
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 1
  Total optimization steps = 103
 99%|█████████▉| 102/103 [00:21<00:00,  4.91it/s]***** Running Evaluation *****
  Num examples = 700
  Batch size = 64

100%|██████████| 103/103 [00:22<00:00,  4.91it/s]Saving model checkpoint to checkpoints\checkpoint-103
Configuration saved in checkpoints\checkpoint-103\config.json


{'eval_loss': 0.8551413416862488, 'eval_runtime': 0.323, 'eval_samples_per_second': 2167.149, 'eval_steps_per_second': 34.055, 'epoch': 1.0}


Model weights saved in checkpoints\checkpoint-103\pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from checkpoints\checkpoint-103 (score: 0.8551413416862488).
100%|██████████| 103/103 [00:24<00:00,  4.19it/s]
Configuration saved in checkpoints/snips_ep1\config.json


{'train_runtime': 24.6002, 'train_samples_per_second': 531.865, 'train_steps_per_second': 4.187, 'train_loss': 2.2695216206670965, 'epoch': 1.0}


Model weights saved in checkpoints/snips_ep1\pytorch_model.bin


In [9]:
# Read the gold test annotations straight from disk: one intent label per
# line, and one whitespace-separated slot tag sequence per line.
with open(f'./data/{TASK}/test/label', 'r', encoding='utf-8') as intent_f, \
    open(f'./data/{TASK}/test/seq.out', 'r', encoding='utf-8') as slot_f:
    # Intents become a numpy array of stripped strings.
    intent_label_ids = np.array([raw_line.strip() for raw_line in intent_f])

    # Slots stay a plain list of token lists (rows may differ in length).
    slot_label_ids = [raw_line.strip().split() for raw_line in slot_f]


In [10]:
def predict(model, seqs, dataset=None):
    """Predict an intent label and a slot label sequence for every input.

    Args:
        model: Callable model exposing ``to`` and ``eval``. Calling it with the
            tokenizer output must return ``(loss, (intent_logits, slot_logits))``;
            only the first batch element of each logits tensor is read.
        seqs: Indexable collection of inputs accepted by the module-level
            ``tokenizer``.
        dataset: Tokenized dataset aligned index-by-index with ``seqs``. Item
            ``i`` must provide ``'slot_label_ids'``; positions whose value is
            ``-100`` are dropped from the slot predictions. Defaults to the
            module-level ``test_dataset``, which is what the original code
            always used.

    Returns:
        A tuple ``(intents, slots)``: a numpy array with one intent label per
        input, and a list holding one list of slot labels per input.

    Side effects:
        Moves ``model`` to the CPU and switches it to evaluation mode.
    """
    if dataset is None:
        dataset = test_dataset

    # The tokenizer output is never moved to another device, so the model is
    # brought to the CPU to match it. Both calls are loop-invariant.
    model.to('cpu')
    model.eval()

    pred_intent_ids = []
    pred_slot_ids = []

    for idx, seq in enumerate(seqs):
        input_seq = tokenizer(seq, return_tensors='pt')

        with torch.no_grad():
            _, (intent_logits, slot_logits) = model(**input_seq)

        # Intent: highest-scoring class of the single example in the batch.
        pred_intent_ids.append(intent_idx2word[intent_logits[0].argmax().item()])

        # Slot: keep only the positions whose reference label is not -100.
        # The reference labels are cut to the number of positions the model
        # actually produced logits for.
        num_positions = slot_logits[0].shape[0]
        reference_ids = np.array(dataset[idx]['slot_label_ids'][:num_positions])
        keep_mask = torch.as_tensor(reference_ids != -100)
        kept_slot_ids = slot_logits[0][keep_mask].argmax(dim=1)
        pred_slot_ids.append([slot_idx2word[slot_id.item()] for slot_id in kept_slot_ids])

    return np.array(pred_intent_ids), pred_slot_ids

# Run inference over the test sequences. Note that predict() moves `model` to
# the CPU as a side effect.
pred_intent_ids, pred_slot_ids = predict(model, seq_test)

In [11]:
# Score the predictions against the gold labels read from the test files,
# then print every entry of the returned mapping on its own line.
res = compute_metrics(
    pred_intent_ids,
    intent_label_ids,
    pred_slot_ids,
    slot_label_ids,
)
for k, v in res.items():
    print(f'============{k}: {v}')

