In [1]:
import pandas as pd
from tqdm import tqdm
import torch
from transformers import BertTokenizer, BertForSequenceClassification, TrainingArguments, Trainer, AdamW
from torch.utils.data import DataLoader, TensorDataset, RandomSampler, Dataset
from sklearn.preprocessing import MultiLabelBinarizer, normalize
from torch import cuda
import random
from sklearn.metrics import confusion_matrix, accuracy_score, precision_recall_fscore_support, classification_report, multilabel_confusion_matrix, ConfusionMatrixDisplay
import numpy as np
from sklearn.metrics import coverage_error, label_ranking_average_precision_score, label_ranking_loss
from datasets import load_dataset
import seaborn as sns
import matplotlib.pyplot as plt
import sys
import os

sys.path.append(os.path.abspath('../..'))
from util import generate_metrics_latex_table

In [2]:
def set_seed(seed_value):
    """Seed the random number generators used in this notebook.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (the default generator plus all CUDA devices), then switches cuDNN to
    deterministic mode with benchmarking disabled.

    Args:
        seed_value: Integer seed. ``None`` means "do not seed": the function
            returns immediately and leaves every generator and the cuDNN
            flags untouched.

    Returns:
        None.
    """
    # The None check has to come first: torch.manual_seed() needs an integer,
    # so checking after the seeding calls (as before) could never be reached
    # with seed_value=None.
    if seed_value is None:
        return

    random.seed(seed_value)
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    # for multi-GPU
    torch.cuda.manual_seed_all(seed_value)
    # Trade cuDNN autotuning for repeatable kernel selection.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

seed = 13
set_seed(seed)

In [3]:
# Load the 'multi_woz_v22' dataset via `datasets.load_dataset`.
# No config name is passed, so the library falls back to its default config.
dataset = load_dataset('multi_woz_v22')

No config specified, defaulting to: multi_woz_v22/v2.2_active_only
Found cached dataset multi_woz_v22 (/Users/pepe/.cache/huggingface/datasets/multi_woz_v22/v2.2_active_only/2.2.0/6719c8b21478299411a0c6fdb7137c3ebab2e6425129af831687fb7851c69eb5)


  0%|          | 0/3 [00:00<?, ?it/s]

In [4]:
# Choose the compute backend in order of preference: CUDA, then MPS, then CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
elif torch.backends.mps.is_available():
    device = torch.device('mps')
else:
    device = torch.device('cpu')
print(device)

mps


In [5]:
def process_service_list(service_list):
    """Reduce raw labels to the label set kept by this notebook.

    Labels starting with ``Restaurant``, ``Hotel`` or ``general`` are kept
    verbatim. Every other label, and an empty input, is represented by the
    single catch-all label ``'other'``.

    Args:
        service_list: Sized iterable of label strings (may be empty).

    Returns:
        Sorted list of the distinct resulting labels.
    """
    kept_prefixes = ('Restaurant', 'Hotel', 'general')
    mapped = {
        label if label.startswith(kept_prefixes) else 'other'
        for label in service_list
    }
    # The set is empty only when the input was empty.
    if len(mapped) == 0:
        mapped.add('other')
    return sorted(mapped)

In [6]:
def extract_utterance_and_act_types(dataset):
    """Collect user utterances together with their processed act-type labels.

    Args:
        dataset: pandas DataFrame with a ``turns`` column. Each cell is
            indexed with the keys ``'utterance'``, ``'speaker'`` and
            ``'dialogue_acts'``, which are walked in parallel (one entry per
            turn).

    Returns:
        ``(utterance_list, act_types)``: two lists of equal length. Entry
        ``k`` of ``act_types`` is the sorted label list produced by
        ``process_service_list`` for utterance ``k``.
    """
    utterance_list = []
    act_types = []

    # Iterate over the column itself rather than ``dataset.loc[i]`` for
    # ``i in range(len(dataset))``: label-based lookup only works when the
    # frame has a default 0..n-1 index.
    for turns in tqdm(dataset['turns']):
        for utterance, speaker, dialogue_act in zip(turns['utterance'], turns['speaker'], turns['dialogue_acts']):
            if speaker == 0:  # only the user's turns are kept
                act_type = dialogue_act['dialog_act']['act_type']
                utterance_list.append(utterance)
                act_types.append(process_service_list(act_type))

    return utterance_list, act_types

In [7]:
# Extract user utterances and their processed act-type labels from the training split.
utterance_list, act_types = extract_utterance_and_act_types(dataset['train'].to_pandas())

100%|██████████| 8437/8437 [00:00<00:00, 34168.42it/s]


In [8]:
# Distinct labels seen in the training split, in sorted order; this fixes the
# label order and the size of the classification head.
all_labels = sorted(set().union(*act_types))
num_dialog_acts = len(all_labels)
print(all_labels)

['Hotel-Inform', 'Hotel-Request', 'Restaurant-Inform', 'Restaurant-Request', 'general-bye', 'general-greet', 'general-thank', 'other']


In [8]:
# Binarizer with a fixed class list (all_labels). It is used to turn each list
# of labels into a binary indicator vector with one position per class; several
# positions can be set at once because an utterance may carry several labels.
mlb = MultiLabelBinarizer(classes=all_labels)


In [9]:
# True: load weights from the local 'model' directory (where the training cell
# saves them). False: start from the pre-trained 'bert-base-uncased' checkpoint.
LOAD_FINETUNED_MODEL = True

# The tokenizer always comes from the base checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

if LOAD_FINETUNED_MODEL:
    model_path = 'model'
else:
    model_path = 'bert-base-uncased'
model = BertForSequenceClassification.from_pretrained(model_path, num_labels=num_dialog_acts)


In [10]:
class DialogueActDataset(Dataset):
    """Torch dataset pairing utterances with binary multi-label vectors.

    Labels are binarized once in the constructor; utterances are tokenized
    lazily, one at a time, in ``__getitem__``.

    Args:
        tokenizer: Callable tokenizer used to encode each utterance.
        utterances: Indexable sequence of utterance strings.
        labels: Sequence of label lists, aligned with ``utterances``.
        max_length: Length every encoded utterance is padded or truncated to.
            Defaults to 128, the value previously hard-coded.
    """

    def __init__(self, tokenizer, utterances, labels, max_length=128):
        self.utterances = utterances
        self.tokenizer = tokenizer
        self.max_length = max_length
        # Binarize with the module-level ``mlb``. It is re-fitted for every
        # dataset instance; NOTE(review): this keeps the column order stable
        # only as long as ``mlb`` is built with an explicit ``classes`` list.
        self.labels = mlb.fit_transform(labels)

    def __len__(self):
        """Return the number of utterances."""
        return len(self.utterances)

    def __getitem__(self, idx):
        """Return the encoded utterance and its label vector at ``idx``.

        Returns:
            Dict with 1-D ``input_ids`` and ``attention_mask`` tensors and a
            float ``labels`` tensor.
        """
        # Call the tokenizer directly: ``encode_plus`` is deprecated in favour
        # of ``__call__``, which takes the same keyword arguments.
        encoding = self.tokenizer(
            self.utterances[idx],
            add_special_tokens=True,
            max_length=self.max_length,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            truncation=True,
            return_tensors='pt',
        )
        # Labels were already binarized in __init__; just pick the row.
        label = self.labels[idx]
        return {
            # return_tensors='pt' adds a leading batch axis; flatten drops it.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            # NOTE(review): float dtype is presumably what selects the
            # multi-label loss inside the model - confirm.
            'labels': torch.tensor(label, dtype=torch.float),
        }

In [11]:
def preprocess_dataset(dataset):
    """Build a ``DialogueActDataset`` for each of the three dataset splits.

    Args:
        dataset: Mapping from split name to an object exposing
            ``to_pandas()``; the keys ``'train'``, ``'validation'`` and
            ``'test'`` are read.

    Returns:
        Dict mapping each split name to its ``DialogueActDataset``, built with
        the module-level ``tokenizer``.
    """
    splits = {}
    for split_name in ('train', 'validation', 'test'):
        frame = dataset[split_name].to_pandas()
        split_utterances, split_acts = extract_utterance_and_act_types(frame)
        splits[split_name] = DialogueActDataset(tokenizer, split_utterances, split_acts)
    return splits

preprocessed_data = preprocess_dataset(dataset)

100%|██████████| 8437/8437 [00:00<00:00, 19192.15it/s]
100%|██████████| 1000/1000 [00:00<00:00, 15225.99it/s]
100%|██████████| 1000/1000 [00:00<00:00, 23902.85it/s]


In [12]:
# Define training arguments: 10 epochs, train batch size 16 per device,
# 500 warmup steps, weight decay 0.01; log, evaluate and save once per epoch.
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=10,
    per_device_train_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_strategy='epoch',
    evaluation_strategy='epoch',
    report_to='none',
    # Select the best checkpoint by lowest eval_loss and reload it when training ends.
    # NOTE(review): these three settings do not by themselves stop training early;
    # no early-stopping callback is passed to the Trainer below, so all epochs
    # presumably run - confirm whether early stopping was intended.
    load_best_model_at_end=True,
    greater_is_better=False,
    metric_for_best_model='eval_loss',
    # save_strategy must match evaluation_strategy when load_best_model_at_end is set
    save_total_limit=2,
    save_strategy='epoch',
)

# Create a Trainer that trains on the 'train' split and evaluates on 'validation'.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=preprocessed_data['train'],
    eval_dataset=preprocessed_data['validation'],
)


In [13]:
# Train the model and save it to the local 'model' directory.
# Skipped entirely when LOAD_FINETUNED_MODEL is True, i.e. when the weights
# were loaded from that directory instead of being trained here.
if not LOAD_FINETUNED_MODEL:
    # NOTE(review): presumably redundant, the Trainer likely sets train mode itself - confirm.
    model.train()
    trainer.train()
    trainer.save_model('model')

In [14]:
# Batches of 32 over the test split. shuffle=False keeps the samples in dataset
# order, so the collected predictions stay aligned with the data from run to run.
test_dataloader = DataLoader(preprocessed_data['test'], batch_size=32, shuffle=False)

In [15]:
# Run the model over the test split and collect raw logits and true labels.

# Place the model on the selected device and switch it to evaluation mode.
model.to(device)
model.eval()

predictions, true_labels = [], []

# Inference only: no gradients are needed.
with torch.no_grad():
    for batch in test_dataloader:
        batch = {name: tensor.to(device) for name, tensor in batch.items()}
        # The batch still contains 'labels', so they are forwarded to the model too.
        outputs = model(**batch)
        logits = outputs.logits
        # Move results to CPU and store them as one NumPy row per sample.
        predictions.extend(logits.detach().cpu().numpy())
        true_labels.extend(batch['labels'].detach().cpu().numpy())


In [16]:
# Apply a sigmoid to the logits (one independent probability per label, not a
# softmax) and threshold the result to obtain binary label predictions.
# `predictions` is a list of per-sample NumPy rows; stacking it with np.array
# first avoids building a tensor from a list of ndarrays element by element.
sigmoid_outputs = torch.sigmoid(torch.tensor(np.array(predictions)))
threshold = 0.5
binary_predictions = (sigmoid_outputs > threshold).numpy()  # Applying a threshold to get binary values


# Calculate the metrics (accuracy_score on multi-label input is exact-match accuracy)
accuracy = accuracy_score(true_labels, binary_predictions)
precision, recall, f1, _ = precision_recall_fscore_support(true_labels, binary_predictions, average='micro')

print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1: {f1}')


Accuracy: 0.861096039066739
Precision: 0.925725860904794
Recall: 0.8816720257234727
F1: 0.9031620553359685


  sigmoid_outputs = torch.sigmoid(torch.tensor(predictions))


In [19]:
# Apply a sigmoid to the logits (per-label probabilities, not a softmax) and
# threshold them to obtain binary label predictions.
prediction_probs = torch.sigmoid(torch.tensor(np.array(predictions)))
threshold = 0.5
# Threshold the probabilities computed in this cell. The previous version read
# `sigmoid_outputs`, a variable left over from an earlier cell.
binary_predictions = (prediction_probs > threshold).numpy()  # Applying a threshold to get binary values

In [20]:
# Generate a per-label classification report.
# zero_division=0 makes the handling of undefined precision/recall explicit:
# it reports the same 0 the default would, without emitting a warning.
report = classification_report(
    true_labels,
    binary_predictions,
    target_names=mlb.classes_,
    digits=3,
    zero_division=0,
)
print(report)

                    precision    recall  f1-score   support

      Hotel-Inform      0.877     0.909     0.893      1328
     Hotel-Request      0.795     0.425     0.554       292
 Restaurant-Inform      0.935     0.872     0.902      1323
Restaurant-Request      0.680     0.462     0.550       286
       general-bye      0.993     1.000     0.997       293
     general-greet      0.833     0.833     0.833         6
     general-thank      0.982     0.969     0.975       940
             other      0.942     0.916     0.929      3307

         micro avg      0.926     0.882     0.903      7775
         macro avg      0.880     0.798     0.829      7775
      weighted avg      0.921     0.882     0.898      7775
       samples avg      0.892     0.892     0.889      7775



  _warn_prf(average, modifier, msg_start, len(result))


In [161]:
# Hand the test-set labels, binary predictions and probabilities to the
# project helper, identifying the run as model 'BERT', task 1.
generate_metrics_latex_table(
    model_name='BERT',
    task_number=1,
    true_labels=true_labels,
    binary_predictions=binary_predictions,
    prediction_probs=prediction_probs,
    target_names=mlb.classes_,
)

  _warn_prf(average, modifier, msg_start, len(result))
