## Imports

In [None]:
!pip install transformers
!pip install datasets
!pip install evaluate

In [2]:
import pandas as pd
import numpy as np
import evaluate
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, EvalPrediction
from datasets import Dataset, load_dataset, Features, ClassLabel, Value, DatasetDict, set_caching_enabled
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report
from matplotlib import pyplot as plt

In [None]:
set_caching_enabled(False)

## Define the paths and experiment parameters

In [4]:
project_path = "/content/drive/MyDrive/University/Winter 23 Courses/CSI 5180 - AI Virtual Assistants/Project"
data_path = f"{project_path}/data/augmented_dataset"
model_save_path = f"{project_path}/models/baseline_bert_augmented_data/saved_models"
log_directory = f"{project_path}/models/baseline_bert_augmented_data/logs"
plots_save_path = f"{project_path}/models/baseline_bert_augmented_data/plots"

In [5]:
# Parameters
model_name = "bert-large-uncased"
learning_rate = 4.00e-05
warmup_proportion = 0.1
train_batch_size = 32
num_train_epochs = 5
gradient_accumulation_steps = 1
num_labels = 151

## Load the train, val and test datasets into a Dataset object

In [7]:
train_data = pd.read_csv(f"{data_path}/augmented_train_data.csv")
val_data = pd.read_csv(f"{data_path}/original_val_data.csv")
test_data = pd.read_csv(f"{data_path}/original_test_data.csv")

class_names = sorted(train_data["label"].unique())
dataset_features = Features({'query': Value('string'), 'label': ClassLabel(num_classes=151, names=class_names)})

train_dataset = Dataset.from_pandas(train_data, features=dataset_features)
val_dataset = Dataset.from_pandas(val_data, features=dataset_features)
test_dataset = Dataset.from_pandas(test_data, features=dataset_features)

dataset = DatasetDict()
dataset['train'] = train_dataset
dataset['validation'] = val_dataset
dataset['test'] = test_dataset

### Add the inscope test data to the dataset object

In [8]:
oos_label = test_dataset.features['label'].str2int('oos')
oos_label

80

In [None]:
# Define a function to filter examples based on their label
def exclude_oos(example):
    return example["label"] != oos_label

# Filter the test dataset to exclude the label class
inscope_test_dataset = test_dataset.filter(exclude_oos)

In [10]:
dataset['inscope_test'] = inscope_test_dataset

## Explore the dataset

In [11]:
dataset

DatasetDict({
    train: Dataset({
        features: ['query', 'label'],
        num_rows: 30200
    })
    validation: Dataset({
        features: ['query', 'label'],
        num_rows: 3100
    })
    test: Dataset({
        features: ['query', 'label'],
        num_rows: 5500
    })
    inscope_test: Dataset({
        features: ['query', 'label'],
        num_rows: 4500
    })
})

In [12]:
dataset['train'][0]

{'query': 'can you check my reservations for mortons under david winters',
 'label': 26}

In [13]:
dataset['train'].features

{'query': Value(dtype='string', id=None),
 'label': ClassLabel(names=['accept_reservations', 'account_blocked', 'alarm', 'application_status', 'apr', 'are_you_a_bot', 'balance', 'bill_balance', 'bill_due', 'book_flight', 'book_hotel', 'calculator', 'calendar', 'calendar_update', 'calories', 'cancel', 'cancel_reservation', 'car_rental', 'card_declined', 'carry_on', 'change_accent', 'change_ai_name', 'change_language', 'change_speed', 'change_user_name', 'change_volume', 'confirm_reservation', 'cook_time', 'credit_limit', 'credit_limit_change', 'credit_score', 'current_location', 'damaged_card', 'date', 'definition', 'direct_deposit', 'directions', 'distance', 'do_you_have_pets', 'exchange_rate', 'expiration_date', 'find_phone', 'flight_status', 'flip_coin', 'food_last', 'freeze_account', 'fun_fact', 'gas', 'gas_type', 'goodbye', 'greeting', 'how_busy', 'how_old_are_you', 'improve_credit_score', 'income', 'ingredient_substitution', 'ingredients_list', 'insurance', 'insurance_change', 'in

## Tokenize the dataset

In [None]:
def tokenize_function(examples):
    return tokenizer(examples["query"], padding=True, truncation=True, return_tensors="pt")

tokenizer = AutoTokenizer.from_pretrained(model_name)

encoded_dataset = dataset.map(tokenize_function, batched=True)

In [15]:
example = encoded_dataset['train'][0]
print(example.keys())
print(example)

dict_keys(['query', 'label', 'input_ids', 'token_type_ids', 'attention_mask'])
{'query': 'can you check my reservations for mortons under david winters', 'label': 26, 'input_ids': [101, 2064, 2017, 4638, 2026, 17829, 2005, 11164, 2015, 2104, 2585, 12214, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}


In [16]:
tokenizer.decode(example['input_ids'])

'[CLS] can you check my reservations for mortons under david winters [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]'

## Set up the model and Training Arguments

In [None]:
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

Some weights of the model checkpoint at bert-large-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint a

In [None]:
args = TrainingArguments(
    output_dir = model_save_path,
    logging_dir = log_directory,
    logging_strategy='epoch',
    evaluation_strategy = "epoch",
    save_strategy = "no",
    learning_rate=learning_rate,
    per_device_train_batch_size=train_batch_size,
    per_device_eval_batch_size=train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    num_train_epochs=num_train_epochs,
)

## Define the compute metrics function to evaluate the model with the desired metrics

In [17]:
def multi_label_metrics(predictions, labels):
    y_true = labels
    y_pred = predictions.argmax(axis=1)

    precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average="weighted")
    accuracy = accuracy_score(y_true, y_pred)
    
    metrics = {'accuracy': accuracy,
               'precision': precision,
               'recall': recall,
               'f1': f1
               }

    return metrics

def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    result = multi_label_metrics(predictions=preds, labels=p.label_ids)

    return result

## Train the model

In [None]:
trainer = Trainer(
    model,
    args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [None]:
trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.2339,0.290064,0.937419,0.94455,0.937419,0.934175
2,0.0925,0.218643,0.95871,0.961477,0.95871,0.958006
3,0.0407,0.251314,0.955161,0.958834,0.955161,0.953996
4,0.0203,0.239919,0.958065,0.960829,0.958065,0.956817
5,0.0096,0.242539,0.959032,0.961598,0.959032,0.957786


TrainOutput(global_step=4720, training_loss=0.27941569736448385, metrics={'train_runtime': 3647.5108, 'train_samples_per_second': 41.398, 'train_steps_per_second': 1.294, 'total_flos': 1.0080502691465952e+16, 'train_loss': 0.27941569736448385, 'epoch': 5.0})

## Evaluate the model on the validation set

In [None]:
trainer.evaluate()

{'eval_loss': 0.24253851175308228,
 'eval_accuracy': 0.9590322580645161,
 'eval_precision': 0.9615979938796799,
 'eval_recall': 0.9590322580645161,
 'eval_f1': 0.9577855010205117,
 'eval_runtime': 18.4735,
 'eval_samples_per_second': 167.808,
 'eval_steps_per_second': 5.251,
 'epoch': 5.0}

## Evaluate the model on the test set

In [None]:
test_eval = trainer.predict(encoded_dataset["test"])

In [None]:
test_preds = test_eval.predictions.argmax(-1)
test_labels = test_eval.label_ids

report = classification_report(test_labels, test_preds, target_names=class_names)

print("Classification report:")
print(report)

Classification report:
                           precision    recall  f1-score   support

      accept_reservations       0.88      1.00      0.94        30
          account_blocked       0.77      0.90      0.83        30
                    alarm       0.94      1.00      0.97        30
       application_status       1.00      1.00      1.00        30
                      apr       0.88      1.00      0.94        30
            are_you_a_bot       0.94      1.00      0.97        30
                  balance       0.77      1.00      0.87        30
             bill_balance       0.84      0.90      0.87        30
                 bill_due       0.88      0.93      0.90        30
              book_flight       0.97      0.93      0.95        30
               book_hotel       0.88      1.00      0.94        30
               calculator       0.78      0.93      0.85        30
                 calendar       0.87      0.87      0.87        30
          calendar_update       0.85  

In [None]:
trainer.save_model(model_save_path)

## Load the saved model

In [18]:
model = AutoModelForSequenceClassification.from_pretrained(model_save_path)

In [19]:
trainer = Trainer(
    model=model,
)

## Evaluate the model on the inscope test set

In [20]:
inscope_test_eval = trainer.predict(encoded_dataset["inscope_test"])

In [26]:
test_preds = inscope_test_eval.predictions.argmax(-1)
test_labels = inscope_test_eval.label_ids

report = classification_report(test_labels, test_preds, target_names=class_names)

print("Inscope Classification report:")
print(report)

Inscope Classification report:
                           precision    recall  f1-score   support

      accept_reservations       0.91      1.00      0.95        30
          account_blocked       0.96      0.90      0.93        30
                    alarm       0.94      1.00      0.97        30
       application_status       1.00      1.00      1.00        30
                      apr       0.97      1.00      0.98        30
            are_you_a_bot       0.97      1.00      0.98        30
                  balance       0.91      1.00      0.95        30
             bill_balance       0.93      0.90      0.92        30
                 bill_due       0.93      0.93      0.93        30
              book_flight       1.00      0.93      0.97        30
               book_hotel       0.97      1.00      0.98        30
               calculator       0.97      0.93      0.95        30
                 calendar       0.93      0.87      0.90        30
          calendar_update     

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
