In [1]:
import torch
from datasets import load_dataset, concatenate_datasets, DatasetDict, ClassLabel
from transformers import AutoTokenizer, Trainer, TrainingArguments, AutoModelForSequenceClassification, BertForSequenceClassification, DataCollatorWithPadding

import math
import numpy as np
import matplotlib.pyplot as plt
import evaluate
import pandas as pd
import scipy


# hyperparameters from https://github.com/clinc/oos-eval/blob/master/hyperparameters.csv
learning_rate = 4.00e-05
warmup_proportion = 0.1
train_batch_size = 32
eval_batch_size = 32
num_train_epochs = 5
gradient_accumulation_steps = 1

huggingface_modelname = "bert-large-uncased"


language = 'en'
data_path = f'data/mtod/csv/{language}/'


dataset = load_dataset(data_path)
dataset = dataset.class_encode_column('labels')
# dataset = load_dataset("mtod", data_files=data_file_names)
dataset_full = concatenate_datasets([dataset["train"], dataset["test"], dataset["validation"]]).shuffle(seed=42)
train_size, cal_size, test_size = .6, .2, .2
train_test_set = dataset_full.train_test_split(train_size=train_size, stratify_by_column='labels', seed=42)
test_val_set = train_test_set["test"].train_test_split(train_size=test_size/(cal_size+test_size), stratify_by_column='labels', seed=42)
dataset = DatasetDict({
    'train': train_test_set['train'],
    'test': test_val_set['train'],
    'validation': test_val_set['test']
})

In [2]:

id2label = {i: name for i, name in enumerate(dataset["train"].features["labels"].names)}
label2id = {name: i for i, name in id2label.items()}

In [3]:
torch.mps.current_allocated_memory()

0

In [4]:
tokenizer = AutoTokenizer.from_pretrained(huggingface_modelname)

def tokenize_function(examples):
    return tokenizer(examples["utterance"], truncation=True)

tokenized_datasets = dataset.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# assert dataset["train"].features["labels"].names == dataset["test"].features["labels"].names 

Map:   0%|          | 0/25993 [00:00<?, ? examples/s]

Map:   0%|          | 0/8665 [00:00<?, ? examples/s]

Map:   0%|          | 0/8665 [00:00<?, ? examples/s]

In [5]:
model = AutoModelForSequenceClassification.from_pretrained(
    huggingface_modelname,
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id
    )

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-large-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
torch.mps.current_allocated_memory(), torch.mps.driver_allocated_memory()

(0, 393216)

In [7]:
id2label

{0: 'cancel_alarm',
 1: 'cancel_reminder',
 2: 'checkSunrise',
 3: 'checkSunset',
 4: 'find',
 5: 'modify_alarm',
 6: 'set_alarm',
 7: 'set_reminder',
 8: 'show_alarms',
 9: 'show_reminders',
 10: 'snooze_alarm',
 11: 'time_left_on_alarm'}

In [8]:
training_args = TrainingArguments(
    output_dir="test_trainer",
    learning_rate=learning_rate,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=train_batch_size,
    per_device_eval_batch_size=eval_batch_size,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    gradient_accumulation_steps=gradient_accumulation_steps,
    load_best_model_at_end=True,
    )

In [9]:
metric = evaluate.load("accuracy")

In [10]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [11]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [12]:
trainer.train()

  0%|          | 0/4065 [00:00<?, ?it/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'loss': 0.1889, 'learning_rate': 3.5079950799507995e-05, 'epoch': 0.62}


  0%|          | 0/271 [00:00<?, ?it/s]

{'eval_loss': 0.06356018036603928, 'eval_accuracy': 0.9870744373918061, 'eval_runtime': 71.8024, 'eval_samples_per_second': 120.678, 'eval_steps_per_second': 3.774, 'epoch': 1.0}
{'loss': 0.0514, 'learning_rate': 3.0159901599015993e-05, 'epoch': 1.23}
{'loss': 0.0353, 'learning_rate': 2.5239852398523988e-05, 'epoch': 1.85}


  0%|          | 0/271 [00:00<?, ?it/s]

{'eval_loss': 0.053416699171066284, 'eval_accuracy': 0.9894979803808425, 'eval_runtime': 62.4732, 'eval_samples_per_second': 138.7, 'eval_steps_per_second': 4.338, 'epoch': 2.0}
{'loss': 0.0311, 'learning_rate': 2.0319803198031983e-05, 'epoch': 2.46}


  0%|          | 0/271 [00:00<?, ?it/s]

{'eval_loss': 0.06136251986026764, 'eval_accuracy': 0.9849971148297749, 'eval_runtime': 63.4518, 'eval_samples_per_second': 136.56, 'eval_steps_per_second': 4.271, 'epoch': 3.0}
{'loss': 0.0241, 'learning_rate': 1.5399753997539978e-05, 'epoch': 3.08}
{'loss': 0.0173, 'learning_rate': 1.0479704797047971e-05, 'epoch': 3.69}


  0%|          | 0/271 [00:00<?, ?it/s]

{'eval_loss': 0.0503595806658268, 'eval_accuracy': 0.9905366416618581, 'eval_runtime': 35.0068, 'eval_samples_per_second': 247.523, 'eval_steps_per_second': 7.741, 'epoch': 4.0}
{'loss': 0.0132, 'learning_rate': 5.559655596555966e-06, 'epoch': 4.31}
{'loss': 0.0109, 'learning_rate': 6.396063960639606e-07, 'epoch': 4.92}


  0%|          | 0/271 [00:00<?, ?it/s]

{'eval_loss': 0.053456537425518036, 'eval_accuracy': 0.9899596076168494, 'eval_runtime': 34.6265, 'eval_samples_per_second': 250.242, 'eval_steps_per_second': 7.826, 'epoch': 5.0}
{'train_runtime': 3578.5591, 'train_samples_per_second': 36.318, 'train_steps_per_second': 1.136, 'train_loss': 0.045862746326685834, 'epoch': 5.0}


TrainOutput(global_step=4065, training_loss=0.045862746326685834, metrics={'train_runtime': 3578.5591, 'train_samples_per_second': 36.318, 'train_steps_per_second': 1.136, 'train_loss': 0.045862746326685834, 'epoch': 5.0})

In [13]:
pred_train = trainer.predict(tokenized_datasets["train"])
pred_val = trainer.predict(tokenized_datasets["validation"])
pred_test = trainer.predict(tokenized_datasets["test"])

  0%|          | 0/813 [00:00<?, ?it/s]

  0%|          | 0/271 [00:00<?, ?it/s]

  0%|          | 0/271 [00:00<?, ?it/s]

In [14]:
train_sm = scipy.special.softmax(pred_train.predictions, axis=1)
val_sm = scipy.special.softmax(pred_val.predictions, axis=1)
test_sm = scipy.special.softmax(pred_test.predictions, axis=1)

np.testing.assert_allclose(train_sm.sum(axis=1), 1.0, rtol=1e-6)
np.testing.assert_allclose(val_sm.sum(axis=1), 1.0, rtol=1e-6)
np.testing.assert_allclose(test_sm.sum(axis=1), 1.0, rtol=1e-6)

In [15]:
train_sm.shape, val_sm.shape, test_sm.shape

((25993, 12), (8665, 12), (8665, 12))

In [16]:
# train_sm_dict = get_score(train_sm, oos_label)
# val_sm_dict = get_score(val_sm, oos_label)
# test_sm_dict = get_score(test_sm, oos_label)
train_sm_dict = {
    f'label_{i}':  train_sm[:,i] for i in range(train_sm.shape[1])
}
val_sm_dict = {
    f'label_{i}':  val_sm[:,i] for i in range(val_sm.shape[1])
}
test_sm_dict = {
    f'label_{i}':  test_sm[:,i] for i in range(test_sm.shape[1])
}
train_frame = pd.DataFrame(
    {"text": dataset["train"]["utterance"],
     "hash": range(len(dataset["train"]["utterance"])),
     "label": dataset["train"]["labels"],
     **train_sm_dict
    }
)
val_frame = pd.DataFrame(
    {"text": dataset["validation"]["utterance"],
     "hash": range(len(dataset["validation"]["utterance"])),
     "label": dataset["validation"]["labels"],
     **val_sm_dict
    }
)
test_frame = pd.DataFrame(
    {"text": dataset["test"]["utterance"],
     "hash": range(len(dataset["test"]["utterance"])),
     "label": dataset["test"]["labels"],
     **test_sm_dict
    }
)

In [17]:
train_frame.to_csv('data/mtod/bert/train_predictions.csv', index=False)
val_frame.to_csv('data/mtod/bert/calibration_predictions.csv', index=False)
test_frame.to_csv('data/mtod/bert/test_predictions.csv', index=False)



In [18]:
pd.DataFrame({'0': id2label.values()}).to_csv('data/mtod/bert/labels.csv')


In [19]:
id2label

{0: 'cancel_alarm',
 1: 'cancel_reminder',
 2: 'checkSunrise',
 3: 'checkSunset',
 4: 'find',
 5: 'modify_alarm',
 6: 'set_alarm',
 7: 'set_reminder',
 8: 'show_alarms',
 9: 'show_reminders',
 10: 'snooze_alarm',
 11: 'time_left_on_alarm'}

In [20]:
test_frame['label'].value_counts()

label
4     3977
6     1380
7     1343
0      594
1      327
8      314
9      290
5      123
10     123
11     109
3       49
2       36
Name: count, dtype: int64

In [21]:
train_frame['label']

0        0
1        6
2        0
3        6
4        4
        ..
25988    4
25989    4
25990    6
25991    6
25992    4
Name: label, Length: 25993, dtype: int64