In [1]:
import torch
from datasets import load_dataset, DatasetDict
from transformers import AutoTokenizer, Trainer, TrainingArguments, AutoModelForSequenceClassification, DataCollatorWithPadding


import numpy as np
import matplotlib.pyplot as plt
import evaluate
import math
import pandas as pd
import scipy
dataset_full = load_dataset("nlu_evaluation_data").rename_column("label", "labels").remove_columns("scenario")

# hyperparameters from https://github.com/clinc/oos-eval/blob/master/hyperparameters.csv
learning_rate = 4.00e-05
warmup_proportion = 0.1
train_batch_size = 32
eval_batch_size = 32
num_train_epochs = 5
gradient_accumulation_steps = 1

huggingface_modelname = "bert-base-uncased"

In [2]:
dataset_full["train"].features

{'text': Value(dtype='string', id=None),
 'labels': ClassLabel(names=['alarm_query', 'alarm_remove', 'alarm_set', 'audio_volume_down', 'audio_volume_mute', 'audio_volume_other', 'audio_volume_up', 'calendar_query', 'calendar_remove', 'calendar_set', 'cooking_query', 'cooking_recipe', 'datetime_convert', 'datetime_query', 'email_addcontact', 'email_query', 'email_querycontact', 'email_sendemail', 'general_affirm', 'general_commandstop', 'general_confirm', 'general_dontcare', 'general_explain', 'general_greet', 'general_joke', 'general_negate', 'general_praise', 'general_quirky', 'general_repeat', 'iot_cleaning', 'iot_coffee', 'iot_hue_lightchange', 'iot_hue_lightdim', 'iot_hue_lightoff', 'iot_hue_lighton', 'iot_hue_lightup', 'iot_wemo_off', 'iot_wemo_on', 'lists_createoradd', 'lists_query', 'lists_remove', 'music_dislikeness', 'music_likeness', 'music_query', 'music_settings', 'news_query', 'play_audiobook', 'play_game', 'play_music', 'play_podcasts', 'play_radio', 'qa_currency', 'qa_de

In [3]:
train_size, cal_size, test_size = .6, .2, .2
train_test_set = dataset_full["train"].train_test_split(train_size=train_size, stratify_by_column='labels', seed=1)
test_val_set = train_test_set['test'].train_test_split(train_size=test_size / (cal_size+test_size), stratify_by_column="labels", seed=1)
dataset = DatasetDict({
    'train': train_test_set['train'],
    'test': test_val_set['train'],
    'calibration': test_val_set['test']
})

assert math.fabs(((dataset['train'].num_rows / dataset_full["train"].num_rows) - train_size)) < 1e-5
assert math.fabs(((dataset['test'].num_rows / dataset_full["train"].num_rows) - test_size)) < 1e-5
assert math.fabs(((dataset['calibration'].num_rows / dataset_full["train"].num_rows) - cal_size)) < 1e-5

In [4]:
tokenizer = AutoTokenizer.from_pretrained(huggingface_modelname)

def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True)

tokenized_datasets = dataset.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

assert dataset["train"].features["labels"].names == dataset["test"].features["labels"].names 
id2label = {i: name for i, name in enumerate(dataset["train"].features["labels"].names)}
label2id = {name: i for i, name in id2label.items()}

Map:   0%|          | 0/15429 [00:00<?, ? examples/s]

Map:   0%|          | 0/5143 [00:00<?, ? examples/s]

Map:   0%|          | 0/5143 [00:00<?, ? examples/s]

In [5]:
model = AutoModelForSequenceClassification.from_pretrained(
    huggingface_modelname,
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id
    )

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
torch.mps.current_allocated_memory(), torch.mps.driver_allocated_memory()

(0, 393216)

In [7]:
training_args = TrainingArguments(
    output_dir="test_trainer",
    learning_rate=learning_rate,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=train_batch_size,
    per_device_eval_batch_size=eval_batch_size,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    gradient_accumulation_steps=gradient_accumulation_steps,
    load_best_model_at_end=True,
    )
metric = evaluate.load("accuracy")
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"].shuffle(seed=42),
    eval_dataset=tokenized_datasets["test"].shuffle(seed=42),
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [8]:
trainer.train()

  0%|          | 0/2415 [00:00<?, ?it/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


  0%|          | 0/161 [00:00<?, ?it/s]

{'eval_loss': 0.6793674230575562, 'eval_accuracy': 0.8660314991250243, 'eval_runtime': 15.5823, 'eval_samples_per_second': 330.053, 'eval_steps_per_second': 10.332, 'epoch': 1.0}
{'loss': 1.6924, 'learning_rate': 3.17184265010352e-05, 'epoch': 1.04}


  0%|          | 0/161 [00:00<?, ?it/s]

{'eval_loss': 0.4588022828102112, 'eval_accuracy': 0.8922807699786117, 'eval_runtime': 13.2805, 'eval_samples_per_second': 387.259, 'eval_steps_per_second': 12.123, 'epoch': 2.0}
{'loss': 0.4771, 'learning_rate': 2.3436853002070396e-05, 'epoch': 2.07}


  0%|          | 0/161 [00:00<?, ?it/s]

{'eval_loss': 0.415767639875412, 'eval_accuracy': 0.9029749173634066, 'eval_runtime': 13.3284, 'eval_samples_per_second': 385.867, 'eval_steps_per_second': 12.079, 'epoch': 3.0}
{'loss': 0.2467, 'learning_rate': 1.5155279503105591e-05, 'epoch': 3.11}


  0%|          | 0/161 [00:00<?, ?it/s]

{'eval_loss': 0.43203067779541016, 'eval_accuracy': 0.9053081858837254, 'eval_runtime': 13.4565, 'eval_samples_per_second': 382.195, 'eval_steps_per_second': 11.964, 'epoch': 4.0}
{'loss': 0.1393, 'learning_rate': 6.873706004140788e-06, 'epoch': 4.14}


  0%|          | 0/161 [00:00<?, ?it/s]

{'eval_loss': 0.4229484498500824, 'eval_accuracy': 0.9066692591872448, 'eval_runtime': 13.464, 'eval_samples_per_second': 381.98, 'eval_steps_per_second': 11.958, 'epoch': 5.0}
{'train_runtime': 971.1529, 'train_samples_per_second': 79.437, 'train_steps_per_second': 2.487, 'train_loss': 0.5448588416689918, 'epoch': 5.0}


TrainOutput(global_step=2415, training_loss=0.5448588416689918, metrics={'train_runtime': 971.1529, 'train_samples_per_second': 79.437, 'train_steps_per_second': 2.487, 'train_loss': 0.5448588416689918, 'epoch': 5.0})

In [9]:
pred_train = trainer.predict(tokenized_datasets["train"])
pred_val = trainer.predict(tokenized_datasets["calibration"])
pred_test = trainer.predict(tokenized_datasets["test"])

  0%|          | 0/483 [00:00<?, ?it/s]

  0%|          | 0/161 [00:00<?, ?it/s]

  0%|          | 0/161 [00:00<?, ?it/s]

In [10]:
train_sm = scipy.special.softmax(pred_train.predictions, axis=1)
val_sm = scipy.special.softmax(pred_val.predictions, axis=1)
test_sm = scipy.special.softmax(pred_test.predictions, axis=1)

np.testing.assert_allclose(train_sm.sum(axis=1), 1.0, rtol=1e-6)
np.testing.assert_allclose(val_sm.sum(axis=1), 1.0, rtol=1e-6)
np.testing.assert_allclose(test_sm.sum(axis=1), 1.0, rtol=1e-6)

In [11]:
# Create dataframe with columns text, hash, label, label_0, ... label_150
def get_score(matrix, oos_label):
    result = {}
    for i in range(matrix.shape[1]):
        if i < oos_label:
            result[f'label_{i}'] = matrix[:, i]
        elif i > oos_label:
            result[f'label_{i+1}'] = matrix[:, i]
        elif i == oos_label:
            result[f'label_{i}'] = np.repeat(0.0, matrix.shape[0])# the oos label
            result[f'label_{i+1}'] = matrix[:, i]
    return result

# train_sm_dict = get_score(train_sm, oos_label)
# val_sm_dict = get_score(val_sm, oos_label)
# test_sm_dict = get_score(test_sm, oos_label)
train_sm_dict = {
    f'label_{i}':  train_sm[:,i] for i in range(train_sm.shape[1])
}
val_sm_dict = {
    f'label_{i}':  val_sm[:,i] for i in range(val_sm.shape[1])
}
test_sm_dict = {
    f'label_{i}':  test_sm[:,i] for i in range(test_sm.shape[1])
}
train_frame = pd.DataFrame(
    {"text": dataset["train"]["text"],
     "hash": range(len(dataset["train"]["text"])),
     "label": dataset["train"]["labels"],
     **train_sm_dict
    }
)
val_frame = pd.DataFrame(
    {"text": dataset["calibration"]["text"],
     "hash": range(len(dataset["calibration"]["text"])),
     "label": dataset["calibration"]["labels"],
     **val_sm_dict
    }
)
test_frame = pd.DataFrame(
    {"text": dataset["test"]["text"],
     "hash": range(len(dataset["test"]["text"])),
     "label": dataset["test"]["labels"],
     **test_sm_dict
    }
)

In [12]:
train_frame.to_csv('data/hwu64/bert/train_predictions.csv', index=False)
val_frame.to_csv('data/hwu64/bert/calibration_predictions.csv', index=False)
test_frame.to_csv('data/hwu64/bert/test_predictions.csv', index=False)

pd.DataFrame({'0': id2label.values()}).to_csv('data/hwu64/bert/labels.csv')


In [20]:
LABEL_QRY = 'datetime_query'
dataset['train'].filter(lambda x: x['labels'] == label2id[LABEL_QRY])['text']

Filter:   0%|          | 0/15429 [00:00<?, ? examples/s]

["give me new york's current time",
 'august fifteenth is what day of the week',
 'tell me the current time',
 'what is the time in the GMT time zone',
 'display date',
 'tell me time it is',
 'can you tell me what date it is',
 'what time is it in california',
 'present time in new york',
 'what day of the week is twenty first',
 'what is the current time in california',
 'what time is time',
 'hows the time in key largo',
 'what is the current eastern time',
 'what is the time now in san diego',
 'what time is it in new york',
 'what day of the week is fourteenth',
 'is today march sixth',
 'current time in london',
 'twenty second april day',
 'what time is it in los angeles',
 'just let me know the current date today',
 'show me minutes of current hour',
 'can you tell me what month it is',
 'tell me the date',
 'what day it is on twenty second april',
 'what was the date of two wednesdays back from today',
 'was christmas one thousand nine hundred and seventy two on a weekend',
 '

In [17]:
label2id

{'alarm_query': 0,
 'alarm_remove': 1,
 'alarm_set': 2,
 'audio_volume_down': 3,
 'audio_volume_mute': 4,
 'audio_volume_other': 5,
 'audio_volume_up': 6,
 'calendar_query': 7,
 'calendar_remove': 8,
 'calendar_set': 9,
 'cooking_query': 10,
 'cooking_recipe': 11,
 'datetime_convert': 12,
 'datetime_query': 13,
 'email_addcontact': 14,
 'email_query': 15,
 'email_querycontact': 16,
 'email_sendemail': 17,
 'general_affirm': 18,
 'general_commandstop': 19,
 'general_confirm': 20,
 'general_dontcare': 21,
 'general_explain': 22,
 'general_greet': 23,
 'general_joke': 24,
 'general_negate': 25,
 'general_praise': 26,
 'general_quirky': 27,
 'general_repeat': 28,
 'iot_cleaning': 29,
 'iot_coffee': 30,
 'iot_hue_lightchange': 31,
 'iot_hue_lightdim': 32,
 'iot_hue_lightoff': 33,
 'iot_hue_lighton': 34,
 'iot_hue_lightup': 35,
 'iot_wemo_off': 36,
 'iot_wemo_on': 37,
 'lists_createoradd': 38,
 'lists_query': 39,
 'lists_remove': 40,
 'music_dislikeness': 41,
 'music_likeness': 42,
 'music_q