In [27]:
import torch
from datasets import load_dataset, DatasetDict, concatenate_datasets
from transformers import AutoTokenizer, Trainer, TrainingArguments, AutoModelForSequenceClassification, DataCollatorWithPadding


import numpy as np
import matplotlib.pyplot as plt
import evaluate
import math
from collections import Counter
import pandas as pd
import scipy
dataset_full = load_dataset("tuetschek/atis").rename_column("intent", "labels").remove_columns(["slots", "id"])
dataset_full = concatenate_datasets([dataset_full['train'], dataset_full['test']])
dataset_full = dataset_full.class_encode_column('labels')

# hyperparameters from https://github.com/clinc/oos-eval/blob/master/hyperparameters.csv
learning_rate = 4.00e-05
warmup_proportion = 0.1
train_batch_size = 32
eval_batch_size = 32
num_train_epochs = 5
gradient_accumulation_steps = 1

huggingface_modelname = "bert-base-uncased"

In [31]:
train_size, cal_size, test_size = .6, .2, .2
# no stratificiation due to intents with a single exmaple
train_test_set = dataset_full.train_test_split(train_size=train_size, seed=42)
test_val_set = train_test_set['test'].train_test_split(train_size=test_size / (cal_size+test_size), seed=42)
dataset = DatasetDict({
    'train': train_test_set['train'],
    'test': test_val_set['train'],
    'calibration': test_val_set['test']
})

# assert math.fabs(((dataset['train'].num_rows / dataset_full["train"].num_rows) - train_size)) < 1e-5
# assert math.fabs(((dataset['test'].num_rows / dataset_full["train"].num_rows) - test_size)) < 1e-5
# assert math.fabs(((dataset['calibration'].num_rows / dataset_full["train"].num_rows) - cal_size)) < 1e-5

In [33]:
dataset

DatasetDict({
    train: Dataset({
        features: ['labels', 'text'],
        num_rows: 3522
    })
    test: Dataset({
        features: ['labels', 'text'],
        num_rows: 1174
    })
    calibration: Dataset({
        features: ['labels', 'text'],
        num_rows: 1175
    })
})

In [34]:
tokenizer = AutoTokenizer.from_pretrained(huggingface_modelname)

def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True)

tokenized_datasets = dataset.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

assert dataset["train"].features["labels"].names == dataset["test"].features["labels"].names 
id2label = {i: name for i, name in enumerate(dataset["train"].features["labels"].names)}
label2id = {name: i for i, name in id2label.items()}

Map:   0%|          | 0/3522 [00:00<?, ? examples/s]

Map:   0%|          | 0/1174 [00:00<?, ? examples/s]

Map:   0%|          | 0/1175 [00:00<?, ? examples/s]

In [35]:
model = AutoModelForSequenceClassification.from_pretrained(
    huggingface_modelname,
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id
    )

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [36]:
torch.mps.current_allocated_memory(), torch.mps.driver_allocated_memory()

(0, 393216)

In [37]:
training_args = TrainingArguments(
    output_dir="test_trainer",
    learning_rate=learning_rate,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=train_batch_size,
    per_device_eval_batch_size=eval_batch_size,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    gradient_accumulation_steps=gradient_accumulation_steps,
    load_best_model_at_end=True,
    )
metric = evaluate.load("accuracy")
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"].shuffle(seed=42),
    eval_dataset=tokenized_datasets["test"].shuffle(seed=42),
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [38]:
trainer.train()

  0%|          | 0/555 [00:00<?, ?it/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


  0%|          | 0/37 [00:00<?, ?it/s]

{'eval_loss': 0.42313531041145325, 'eval_accuracy': 0.919931856899489, 'eval_runtime': 2.5681, 'eval_samples_per_second': 457.154, 'eval_steps_per_second': 14.408, 'epoch': 1.0}


  0%|          | 0/37 [00:00<?, ?it/s]

{'eval_loss': 0.19348132610321045, 'eval_accuracy': 0.9582623509369677, 'eval_runtime': 2.0626, 'eval_samples_per_second': 569.178, 'eval_steps_per_second': 17.938, 'epoch': 2.0}


  0%|          | 0/37 [00:00<?, ?it/s]

{'eval_loss': 0.14185981452465057, 'eval_accuracy': 0.9735945485519591, 'eval_runtime': 2.115, 'eval_samples_per_second': 555.076, 'eval_steps_per_second': 17.494, 'epoch': 3.0}


  0%|          | 0/37 [00:00<?, ?it/s]

{'eval_loss': 0.12500126659870148, 'eval_accuracy': 0.9761499148211243, 'eval_runtime': 2.8562, 'eval_samples_per_second': 411.042, 'eval_steps_per_second': 12.954, 'epoch': 4.0}
{'loss': 0.3195, 'learning_rate': 3.9639639639639645e-06, 'epoch': 4.5}


  0%|          | 0/37 [00:00<?, ?it/s]

{'eval_loss': 0.11479911208152771, 'eval_accuracy': 0.9787052810902896, 'eval_runtime': 2.1074, 'eval_samples_per_second': 557.082, 'eval_steps_per_second': 17.557, 'epoch': 5.0}
{'train_runtime': 166.8068, 'train_samples_per_second': 105.571, 'train_steps_per_second': 3.327, 'train_loss': 0.29219723821760296, 'epoch': 5.0}


TrainOutput(global_step=555, training_loss=0.29219723821760296, metrics={'train_runtime': 166.8068, 'train_samples_per_second': 105.571, 'train_steps_per_second': 3.327, 'train_loss': 0.29219723821760296, 'epoch': 5.0})

In [39]:
pred_train = trainer.predict(tokenized_datasets["train"])
pred_val = trainer.predict(tokenized_datasets["calibration"])
pred_test = trainer.predict(tokenized_datasets["test"])

  0%|          | 0/111 [00:00<?, ?it/s]

  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/37 [00:00<?, ?it/s]

In [40]:
train_sm = scipy.special.softmax(pred_train.predictions, axis=1)
val_sm = scipy.special.softmax(pred_val.predictions, axis=1)
test_sm = scipy.special.softmax(pred_test.predictions, axis=1)

np.testing.assert_allclose(train_sm.sum(axis=1), 1.0, rtol=1e-6)
np.testing.assert_allclose(val_sm.sum(axis=1), 1.0, rtol=1e-6)
np.testing.assert_allclose(test_sm.sum(axis=1), 1.0, rtol=1e-6)

In [41]:
# train_sm_dict = get_score(train_sm, oos_label)
# val_sm_dict = get_score(val_sm, oos_label)
# test_sm_dict = get_score(test_sm, oos_label)
train_sm_dict = {
    f'label_{i}':  train_sm[:,i] for i in range(train_sm.shape[1])
}
val_sm_dict = {
    f'label_{i}':  val_sm[:,i] for i in range(val_sm.shape[1])
}
test_sm_dict = {
    f'label_{i}':  test_sm[:,i] for i in range(test_sm.shape[1])
}
train_frame = pd.DataFrame(
    {"text": dataset["train"]["text"],
     "hash": range(len(dataset["train"]["text"])),
     "label": dataset["train"]["labels"],
     **train_sm_dict
    }
)
val_frame = pd.DataFrame(
    {"text": dataset["calibration"]["text"],
     "hash": range(len(dataset["calibration"]["text"])),
     "label": dataset["calibration"]["labels"],
     **val_sm_dict
    }
)
test_frame = pd.DataFrame(
    {"text": dataset["test"]["text"],
     "hash": range(len(dataset["test"]["text"])),
     "label": dataset["test"]["labels"],
     **test_sm_dict
    }
)

In [43]:
train_frame.to_csv('data/atis/bert/train_predictions.csv', index=False)
val_frame.to_csv('data/atis/bert/calibration_predictions.csv', index=False)
test_frame.to_csv('data/atis/bert/test_predictions.csv', index=False)

pd.DataFrame({'0': id2label.values()}).to_csv('data/atis/bert/labels.csv')


In [48]:
train_sm.shape

(3522, 26)

In [50]:
len(id2label)

26