In [1]:
import torch
from datasets import load_dataset, concatenate_datasets, DatasetDict, ClassLabel
from transformers import AutoTokenizer, Trainer, TrainingArguments, AutoModelForSequenceClassification, BertForSequenceClassification, DataCollatorWithPadding

import math
import numpy as np
import matplotlib.pyplot as plt
import evaluate
import pandas as pd
import scipy


# hyperparameters from https://github.com/clinc/oos-eval/blob/master/hyperparameters.csv
learning_rate = 4.00e-05
warmup_proportion = 0.1
train_batch_size = 32
eval_batch_size = 32
num_train_epochs = 5
gradient_accumulation_steps = 1

huggingface_modelname = "bert-large-uncased"


language = 'en'
data_path = f'data/acid/'


dataset = load_dataset(data_path).rename_column('INTENT_NAME','labels').rename_column('UTTERANCES', 'utterance')
dataset_full = concatenate_datasets([dataset['train'], dataset['test']])


Dataset({
    features: ['labels', 'utterance'],
    num_rows: 22172
})

In [2]:
train_size, cal_size, test_size = .6, .2, .2
dataset_full = dataset_full.class_encode_column('labels')
train_test_set = dataset_full.train_test_split(train_size=train_size, stratify_by_column='labels', seed=1)
test_val_set = train_test_set['test'].train_test_split(train_size=test_size / (cal_size+test_size), stratify_by_column="labels", seed=1)
dataset = DatasetDict({
    'train': train_test_set['train'],
    'test': test_val_set['train'],
    'validation': test_val_set['test']
})

In [3]:

id2label = {i: name for i, name in enumerate(dataset["train"].features["labels"].names)}
label2id = {name: i for i, name in id2label.items()}

In [4]:
torch.mps.current_allocated_memory()

0

In [5]:
tokenizer = AutoTokenizer.from_pretrained(huggingface_modelname)

def tokenize_function(examples):
    return tokenizer(examples["utterance"], truncation=True)

tokenized_datasets = dataset.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# assert dataset["train"].features["labels"].names == dataset["test"].features["labels"].names 

In [6]:
model = AutoModelForSequenceClassification.from_pretrained(
    huggingface_modelname,
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id
    )

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-large-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
torch.mps.current_allocated_memory(), torch.mps.driver_allocated_memory()

(0, 393216)

In [8]:
id2label

{0: 'INFO_ADD_HOUSE',
 1: 'INFO_ADD_REMOVE_INSURED',
 2: 'INFO_ADD_REMOVE_VEHICLE',
 3: 'INFO_ADD_VEHICLE_PROPERTY_PAPERLESS_BILLING',
 4: 'INFO_AGENT_WRONG',
 5: 'INFO_AGT_NOT_RESPONDING',
 6: 'INFO_AMERICAN_STAR',
 7: 'INFO_AMT_DUE',
 8: 'INFO_AST_PURCHASE',
 9: 'INFO_AST_QUOTE',
 10: 'INFO_ATV_INS_EXPLAN',
 11: 'INFO_AUTO_COV_QUESTION',
 12: 'INFO_AUTO_INS_CANADA',
 13: 'INFO_AUTO_POLICY_CANT_SEE_IN_ACCT',
 14: 'INFO_AUTO_PYMT_CANCEL',
 15: 'INFO_AUTO_PYMT_MIN_BALANCE',
 16: 'INFO_AUTO_PYMT_SCHEDULE',
 17: 'INFO_BILLING_ACCT_NAME_EDIT',
 18: 'INFO_BILLING_ACCT_NUM',
 19: 'INFO_BILLING_DEPT_CONTACT',
 20: 'INFO_BILL_DUE_DATE',
 21: 'INFO_BOAT_COV_EXPLAN',
 22: 'INFO_BUSINESS_POLICY_CANT_SEE',
 23: 'INFO_CANCEL_CONFIRM',
 24: 'INFO_CANCEL_FEE',
 25: 'INFO_CANCEL_INS_POLICY',
 26: 'INFO_CANT_SEE_FARM_RANCH_POLICY',
 27: 'INFO_CANT_SEE_POLICY',
 28: 'INFO_CAREERS',
 29: 'INFO_CFR_QUESTION_GENERAL',
 30: 'INFO_CHANGE_AGENT',
 31: 'INFO_CHANGE_AUTOPAY_DATE',
 32: 'INFO_CHANGE_BANK_ACCT',


In [9]:
training_args = TrainingArguments(
    output_dir="test_trainer",
    learning_rate=learning_rate,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=train_batch_size,
    per_device_eval_batch_size=eval_batch_size,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    gradient_accumulation_steps=gradient_accumulation_steps,
    load_best_model_at_end=True,
    )

In [10]:
metric = evaluate.load("accuracy")

In [11]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [14]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [15]:
trainer.train()

  0%|          | 0/2080 [00:00<?, ?it/s]

  0%|          | 0/139 [00:00<?, ?it/s]

{'eval_loss': 0.37323564291000366, 'eval_accuracy': 0.9208568207440812, 'eval_runtime': 25.0116, 'eval_samples_per_second': 177.318, 'eval_steps_per_second': 5.557, 'epoch': 1.0}
{'loss': 0.2677, 'learning_rate': 3.0384615384615386e-05, 'epoch': 1.2}


  0%|          | 0/139 [00:00<?, ?it/s]

{'eval_loss': 0.3154609501361847, 'eval_accuracy': 0.9307779030439685, 'eval_runtime': 26.6169, 'eval_samples_per_second': 166.623, 'eval_steps_per_second': 5.222, 'epoch': 2.0}
{'loss': 0.101, 'learning_rate': 2.0769230769230772e-05, 'epoch': 2.4}


  0%|          | 0/139 [00:00<?, ?it/s]

{'eval_loss': 0.29833850264549255, 'eval_accuracy': 0.9377677564825254, 'eval_runtime': 24.8718, 'eval_samples_per_second': 178.315, 'eval_steps_per_second': 5.589, 'epoch': 3.0}
{'loss': 0.051, 'learning_rate': 1.1153846153846154e-05, 'epoch': 3.61}


  0%|          | 0/139 [00:00<?, ?it/s]

{'eval_loss': 0.2788759469985962, 'eval_accuracy': 0.9431792559188276, 'eval_runtime': 24.977, 'eval_samples_per_second': 177.563, 'eval_steps_per_second': 5.565, 'epoch': 4.0}
{'loss': 0.0222, 'learning_rate': 1.5384615384615387e-06, 'epoch': 4.81}


  0%|          | 0/139 [00:00<?, ?it/s]

{'eval_loss': 0.27623820304870605, 'eval_accuracy': 0.944532130777903, 'eval_runtime': 24.9941, 'eval_samples_per_second': 177.442, 'eval_steps_per_second': 5.561, 'epoch': 5.0}
{'train_runtime': 1714.726, 'train_samples_per_second': 38.79, 'train_steps_per_second': 1.213, 'train_loss': 0.10670332306852708, 'epoch': 5.0}


TrainOutput(global_step=2080, training_loss=0.10670332306852708, metrics={'train_runtime': 1714.726, 'train_samples_per_second': 38.79, 'train_steps_per_second': 1.213, 'train_loss': 0.10670332306852708, 'epoch': 5.0})

In [16]:
pred_train = trainer.predict(tokenized_datasets["train"])
pred_val = trainer.predict(tokenized_datasets["validation"])
pred_test = trainer.predict(tokenized_datasets["test"])

  0%|          | 0/416 [00:00<?, ?it/s]

  0%|          | 0/139 [00:00<?, ?it/s]

  0%|          | 0/139 [00:00<?, ?it/s]

In [17]:
train_sm = scipy.special.softmax(pred_train.predictions, axis=1)
val_sm = scipy.special.softmax(pred_val.predictions, axis=1)
test_sm = scipy.special.softmax(pred_test.predictions, axis=1)

np.testing.assert_allclose(train_sm.sum(axis=1), 1.0, rtol=1e-6)
np.testing.assert_allclose(val_sm.sum(axis=1), 1.0, rtol=1e-6)
np.testing.assert_allclose(test_sm.sum(axis=1), 1.0, rtol=1e-6)

In [18]:
train_sm.shape, val_sm.shape, test_sm.shape

((13303, 175), (4435, 175), (4434, 175))

In [19]:
# train_sm_dict = get_score(train_sm, oos_label)
# val_sm_dict = get_score(val_sm, oos_label)
# test_sm_dict = get_score(test_sm, oos_label)
train_sm_dict = {
    f'label_{i}':  train_sm[:,i] for i in range(train_sm.shape[1])
}
val_sm_dict = {
    f'label_{i}':  val_sm[:,i] for i in range(val_sm.shape[1])
}
test_sm_dict = {
    f'label_{i}':  test_sm[:,i] for i in range(test_sm.shape[1])
}
train_frame = pd.DataFrame(
    {"text": dataset["train"]["utterance"],
     "hash": range(len(dataset["train"]["utterance"])),
     "label": dataset["train"]["labels"],
     **train_sm_dict
    }
)
val_frame = pd.DataFrame(
    {"text": dataset["validation"]["utterance"],
     "hash": range(len(dataset["validation"]["utterance"])),
     "label": dataset["validation"]["labels"],
     **val_sm_dict
    }
)
test_frame = pd.DataFrame(
    {"text": dataset["test"]["utterance"],
     "hash": range(len(dataset["test"]["utterance"])),
     "label": dataset["test"]["labels"],
     **test_sm_dict
    }
)

In [25]:
train_frame.to_csv('data/acid/bert/train_predictions.csv', index=False)
val_frame.to_csv('data/acid/bert/calibration_predictions.csv', index=False)
test_frame.to_csv('data/acid/bert/test_predictions.csv', index=False)

pd.DataFrame({'0': id2label.values()}).to_csv('data/acid/bert/labels.csv')


In [21]:
id2label

{0: 'INFO_ADD_HOUSE',
 1: 'INFO_ADD_REMOVE_INSURED',
 2: 'INFO_ADD_REMOVE_VEHICLE',
 3: 'INFO_ADD_VEHICLE_PROPERTY_PAPERLESS_BILLING',
 4: 'INFO_AGENT_WRONG',
 5: 'INFO_AGT_NOT_RESPONDING',
 6: 'INFO_AMERICAN_STAR',
 7: 'INFO_AMT_DUE',
 8: 'INFO_AST_PURCHASE',
 9: 'INFO_AST_QUOTE',
 10: 'INFO_ATV_INS_EXPLAN',
 11: 'INFO_AUTO_COV_QUESTION',
 12: 'INFO_AUTO_INS_CANADA',
 13: 'INFO_AUTO_POLICY_CANT_SEE_IN_ACCT',
 14: 'INFO_AUTO_PYMT_CANCEL',
 15: 'INFO_AUTO_PYMT_MIN_BALANCE',
 16: 'INFO_AUTO_PYMT_SCHEDULE',
 17: 'INFO_BILLING_ACCT_NAME_EDIT',
 18: 'INFO_BILLING_ACCT_NUM',
 19: 'INFO_BILLING_DEPT_CONTACT',
 20: 'INFO_BILL_DUE_DATE',
 21: 'INFO_BOAT_COV_EXPLAN',
 22: 'INFO_BUSINESS_POLICY_CANT_SEE',
 23: 'INFO_CANCEL_CONFIRM',
 24: 'INFO_CANCEL_FEE',
 25: 'INFO_CANCEL_INS_POLICY',
 26: 'INFO_CANT_SEE_FARM_RANCH_POLICY',
 27: 'INFO_CANT_SEE_POLICY',
 28: 'INFO_CAREERS',
 29: 'INFO_CFR_QUESTION_GENERAL',
 30: 'INFO_CHANGE_AGENT',
 31: 'INFO_CHANGE_AUTOPAY_DATE',
 32: 'INFO_CHANGE_BANK_ACCT',


In [22]:
test_frame['label'].value_counts()

label
2      75
113    74
73     72
1      71
28     65
       ..
161     2
147     2
48      2
153     2
6       2
Name: count, Length: 175, dtype: int64

In [23]:
train_frame['label']

0         59
1        151
2        135
3        119
4        163
        ... 
13298    122
13299    107
13300     28
13301     62
13302    114
Name: label, Length: 13303, dtype: int64