## Imports

In [None]:
!pip install transformers
!pip install datasets
!pip install evaluate

In [2]:
import pandas as pd
import numpy as np
import evaluate
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, EvalPrediction
from datasets import Dataset, load_dataset, Features, ClassLabel, Value, DatasetDict, set_caching_enabled
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report
from matplotlib import pyplot as plt

In [None]:
set_caching_enabled(False)

## Define the paths and experiment parameters

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [5]:
project_path = "/content/drive/MyDrive/University/Winter 23 Courses/CSI 5180 - AI Virtual Assistants/Project"
data_path = f"{project_path}/data/original_dataset"
model_save_path = f"{project_path}/models/bert_inscope_original_data/saved_models"
log_directory = f"{project_path}/models/bert_inscope_original_data/logs"
plots_save_path = f"{project_path}/models/bert_inscope_original_data/plots"

In [6]:
# Parameters
model_name = 'bert-large-uncased'
learning_rate = 4.00e-05
warmup_proportion = 0.1
train_batch_size = 32
num_train_epochs = 5
gradient_accumulation_steps = 1
num_labels = 150

## Load the train, val and test datasets into a Dataset object

In [7]:
train_data = pd.read_csv(f"{data_path}/original_train_data.csv")
val_data = pd.read_csv(f"{data_path}/original_val_data.csv")
test_data = pd.read_csv(f"{data_path}/original_test_data.csv")

train_data = train_data.loc[train_data['label'] != 'oos']
val_data = val_data.loc[val_data['label'] != 'oos']
test_data = test_data.loc[test_data['label'] != 'oos']

train_data.reset_index(drop=True, inplace=True)
val_data.reset_index(drop=True, inplace=True)
test_data.reset_index(drop=True, inplace=True)

In [8]:
class_names = sorted(train_data["label"].unique())
dataset_features = Features({'query': Value('string'), 'label': ClassLabel(num_classes=150, names=class_names)})

train_dataset = Dataset.from_pandas(train_data, features=dataset_features)
val_dataset = Dataset.from_pandas(val_data, features=dataset_features)
test_dataset = Dataset.from_pandas(test_data, features=dataset_features)

dataset = DatasetDict()
dataset['train'] = train_dataset
dataset['validation'] = val_dataset
dataset['test'] = test_dataset

## Explore the dataset

In [9]:
dataset

DatasetDict({
    train: Dataset({
        features: ['query', 'label'],
        num_rows: 15000
    })
    validation: Dataset({
        features: ['query', 'label'],
        num_rows: 3000
    })
    test: Dataset({
        features: ['query', 'label'],
        num_rows: 4500
    })
})

In [10]:
dataset['train'][0]

{'query': 'what products are on my shopping list', 'label': 111}

In [11]:
dataset['train'].features

{'query': Value(dtype='string', id=None),
 'label': ClassLabel(names=['accept_reservations', 'account_blocked', 'alarm', 'application_status', 'apr', 'are_you_a_bot', 'balance', 'bill_balance', 'bill_due', 'book_flight', 'book_hotel', 'calculator', 'calendar', 'calendar_update', 'calories', 'cancel', 'cancel_reservation', 'car_rental', 'card_declined', 'carry_on', 'change_accent', 'change_ai_name', 'change_language', 'change_speed', 'change_user_name', 'change_volume', 'confirm_reservation', 'cook_time', 'credit_limit', 'credit_limit_change', 'credit_score', 'current_location', 'damaged_card', 'date', 'definition', 'direct_deposit', 'directions', 'distance', 'do_you_have_pets', 'exchange_rate', 'expiration_date', 'find_phone', 'flight_status', 'flip_coin', 'food_last', 'freeze_account', 'fun_fact', 'gas', 'gas_type', 'goodbye', 'greeting', 'how_busy', 'how_old_are_you', 'improve_credit_score', 'income', 'ingredient_substitution', 'ingredients_list', 'insurance', 'insurance_change', 'in

## Tokenize the dataset

In [None]:
def tokenize_function(examples):
    return tokenizer(examples["query"], padding=True, truncation=True, return_tensors="pt")

tokenizer = AutoTokenizer.from_pretrained(model_name)

encoded_dataset = dataset.map(tokenize_function, batched=True)

In [13]:
example = encoded_dataset['train'][0]
print(example.keys())
print(example)

dict_keys(['query', 'label', 'input_ids', 'token_type_ids', 'attention_mask'])
{'query': 'what products are on my shopping list', 'label': 111, 'input_ids': [101, 2054, 3688, 2024, 2006, 2026, 6023, 2862, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}


In [14]:
tokenizer.decode(example['input_ids'])

'[CLS] what products are on my shopping list [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]'

## Set up the model and Training Arguments

In [15]:
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

Downloading pytorch_model.bin:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-large-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint a

In [16]:
args = TrainingArguments(
    output_dir = model_save_path,
    logging_dir = log_directory,
    logging_strategy='epoch',
    evaluation_strategy = "epoch",
    save_strategy = "no",
    learning_rate=learning_rate,
    per_device_train_batch_size=train_batch_size,
    per_device_eval_batch_size=train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    num_train_epochs=num_train_epochs,
)

## Define the compute metrics function to evaluate the model with the desired metrics

In [17]:
def multi_label_metrics(predictions, labels):
    y_true = labels
    y_pred = predictions.argmax(axis=1)

    precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average="weighted")
    accuracy = accuracy_score(y_true, y_pred)
    
    metrics = {'accuracy': accuracy,
               'precision': precision,
               'recall': recall,
               'f1': f1
               }

    return metrics

def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    result = multi_label_metrics(predictions=preds, labels=p.label_ids)

    return result

## Train the model

In [18]:
trainer = Trainer(
    model,
    args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [19]:
trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.9695,0.342975,0.943333,0.953837,0.943333,0.940344
2,0.1376,0.157622,0.969667,0.972065,0.969667,0.969375
3,0.0433,0.143766,0.972333,0.974025,0.972333,0.972131
4,0.0197,0.143025,0.973667,0.975355,0.973667,0.973441
5,0.0118,0.141126,0.974,0.975772,0.974,0.973807


TrainOutput(global_step=2345, training_loss=0.4363935549884463, metrics={'train_runtime': 1714.589, 'train_samples_per_second': 43.742, 'train_steps_per_second': 1.368, 'total_flos': 4491416481888384.0, 'train_loss': 0.4363935549884463, 'epoch': 5.0})

## Evaluate the model on the validation set

In [20]:
trainer.evaluate()

{'eval_loss': 0.14112591743469238,
 'eval_accuracy': 0.974,
 'eval_precision': 0.97577241192573,
 'eval_recall': 0.974,
 'eval_f1': 0.9738067085266238,
 'eval_runtime': 17.4672,
 'eval_samples_per_second': 171.751,
 'eval_steps_per_second': 5.382,
 'epoch': 5.0}

## Evaluate the model on the test set

In [21]:
test_eval = trainer.predict(encoded_dataset["test"])

In [22]:
test_preds = test_eval.predictions.argmax(-1)
test_labels = test_eval.label_ids

report = classification_report(test_labels, test_preds, target_names=class_names)

print("Classification report:")
print(report)

Classification report:
                           precision    recall  f1-score   support

      accept_reservations       0.94      0.97      0.95        30
          account_blocked       0.97      0.93      0.95        30
                    alarm       1.00      0.97      0.98        30
       application_status       1.00      1.00      1.00        30
                      apr       0.97      1.00      0.98        30
            are_you_a_bot       1.00      1.00      1.00        30
                  balance       0.97      1.00      0.98        30
             bill_balance       0.96      0.90      0.93        30
                 bill_due       0.85      0.97      0.91        30
              book_flight       1.00      0.97      0.98        30
               book_hotel       0.97      1.00      0.98        30
               calculator       0.93      0.93      0.93        30
                 calendar       0.96      0.90      0.93        30
          calendar_update       0.97  

In [23]:
trainer.save_model(model_save_path)