In [1]:
import pandas as pd
from tqdm import tqdm
import torch
from transformers import BertTokenizer, BertForSequenceClassification, TrainingArguments, Trainer
from transformers import AdamW
from torch.utils.data import DataLoader, TensorDataset, RandomSampler, Dataset
from sklearn.preprocessing import MultiLabelBinarizer
from torch import cuda
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report
import numpy as np




In [2]:
# Run it only once
from datasets import load_dataset
dataset = load_dataset("multi_woz_v22")

Downloading builder script:   0%|          | 0.00/3.10k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.77k [00:00<?, ?B/s]

Downloading and preparing dataset multi_woz_v22/v2.2_active_only (download: 263.78 MiB, generated: 49.33 MiB, post-processed: Unknown size, total: 313.11 MiB) to /root/.cache/huggingface/datasets/multi_woz_v22/v2.2_active_only/2.2.0/7452f16a8b502e97df5c04cc4ee5436464762fa93b1ce778dd14181e79d8b51a...
                

Downloading data files #3:   0%|          | 0/2 [00:00<?, ?obj/s]

Downloading data files #0:   0%|          | 0/2 [00:00<?, ?obj/s]

Downloading data files #1:   0%|          | 0/2 [00:00<?, ?obj/s]

Downloading data files #4:   0%|          | 0/2 [00:00<?, ?obj/s]

Downloading data files #12:   0%|          | 0/1 [00:00<?, ?obj/s]

Downloading data files #7:   0%|          | 0/1 [00:00<?, ?obj/s]

Downloading data files #13:   0%|          | 0/1 [00:00<?, ?obj/s]

Downloading data files #14:   0%|          | 0/1 [00:00<?, ?obj/s]

Downloading data files #8:   0%|          | 0/1 [00:00<?, ?obj/s]

Downloading data files #10:   0%|          | 0/1 [00:00<?, ?obj/s]

Downloading data files #6:   0%|          | 0/1 [00:00<?, ?obj/s]

Downloading data files #15:   0%|          | 0/1 [00:00<?, ?obj/s]

Downloading data files #2:   0%|          | 0/2 [00:00<?, ?obj/s]

Downloading data files #5:   0%|          | 0/2 [00:00<?, ?obj/s]

Downloading data files #11:   0%|          | 0/1 [00:00<?, ?obj/s]

Downloading data files #9:   0%|          | 0/1 [00:00<?, ?obj/s]

Downloading data:   0%|          | 0.00/215k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/467k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/457k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/439k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/444k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/465k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/454k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/508k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/509k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/452k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/452k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/439k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/440k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/448k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/453k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/439k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/4.46M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/445k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/449k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/432k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/459k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/22 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/8437 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Dataset multi_woz_v22 downloaded and prepared to /root/.cache/huggingface/datasets/multi_woz_v22/v2.2_active_only/2.2.0/7452f16a8b502e97df5c04cc4ee5436464762fa93b1ce778dd14181e79d8b51a. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [3]:
device = 'cuda' if cuda.is_available() else 'cpu'
print(device)

cuda


In [4]:
# Run it only once
dataset.save_to_disk("dataset.hf")

In [5]:
from datasets import DatasetDict
dataset = DatasetDict.load_from_disk("dataset.hf")

In [6]:
def process_service_list(service_list):
    services = set()
    if len(service_list) == 0:
        services.add('other')
    for service in service_list:
        if service.startswith('Restaurant'):
            services.add(service)
        elif service.startswith('Hotel'):
            services.add(service)
        elif service.startswith('general'):
            services.add(service)
        else:
            services.add('other')
    return list(services)

In [7]:
def extract_utterance_and_act_types(dataset):
    act_types = []
    utterance_list = []
    
    for i in tqdm(range(len(dataset))):
        turns = dataset.loc[i].turns
        # print(dataset.loc[i].turns['utterance'])
        # print([frame['service'] for frame in dataset.loc[i].turns['frames']])
        for utterance, speaker, dialogue_act in zip(turns['utterance'], turns['speaker'], turns['dialogue_acts']):
            if speaker == 0: # if it's the user's turn
                act_type = dialogue_act['dialog_act']['act_type']
                act_type = process_service_list(act_type)
                utterance_list.append(utterance)
                act_types.append(act_type)
    
                    
    return utterance_list, act_types

In [8]:
utterance_list, act_types = extract_utterance_and_act_types(dataset['train'].to_pandas())

100%|██████████| 8437/8437 [00:00<00:00, 12379.59it/s]


In [9]:
num_dialog_acts = len(set([act for act_list in act_types for act in act_list ]))

In [10]:
all_labels = set([act for act_list in act_types for act in act_list ])
print(all_labels)

{'Restaurant-Request', 'general-thank', 'general-greet', 'Hotel-Request', 'other', 'Restaurant-Inform', 'general-bye', 'Hotel-Inform'}


In [11]:
# This will help us to transform the labels into a one-hot encoded numeric array
mlb = MultiLabelBinarizer(classes=list(all_labels))


In [12]:
# Load the pre-trained BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=num_dialog_acts)

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
class DialogueActDataset(Dataset):
    def __init__(self, tokenizer, utterances, labels):
        self.utterances = utterances
        self.tokenizer = tokenizer
        # Fit the label binarizer and transform the labels into one-hot encoded format
        self.labels = mlb.fit_transform(labels)
        
    def __len__(self):
        return len(self.utterances)
    
    def __getitem__(self, idx):
        # Encode the utterance using the provided tokenizer
        encoding = self.tokenizer.encode_plus(
            self.utterances[idx],
            add_special_tokens=True,
            max_length=128,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            truncation=True,
            return_tensors='pt'
        )
        # Convert the list of strings into a one-hot encoded format
        label = self.labels[idx]  # This should now be a binary vector instead of a list of strings
        # Return the encoding and the label
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.float)
        }

In [14]:
def preprocess_dataset(dataset):
    dataset_types = ['train', 'validation', 'test']
    output = dict()
    for dataset_type in dataset_types:
        data = dataset[dataset_type]
        utterance_list, act_types = extract_utterance_and_act_types(data.to_pandas())
        preprocessed_data = DialogueActDataset(tokenizer, utterance_list, act_types)
        output[dataset_type] = preprocessed_data
    
    return output
        
preprocessed_data = preprocess_dataset(dataset)

100%|██████████| 8437/8437 [00:00<00:00, 12147.36it/s]
100%|██████████| 1000/1000 [00:00<00:00, 12149.73it/s]
100%|██████████| 1000/1000 [00:00<00:00, 12204.49it/s]


In [15]:
# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=10,
    per_device_train_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_strategy='epoch',
    save_strategy='no',
    evaluation_strategy="epoch",
    report_to='none',
)

# Create a Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=preprocessed_data['train'],
    eval_dataset=preprocessed_data['validation'],
)


In [16]:
# Train the model
model.train()
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.1048,0.066998
2,0.0667,0.064515
3,0.0593,0.063286
4,0.0523,0.068072
5,0.0454,0.070818
6,0.0384,0.075574
7,0.0316,0.087777
8,0.0252,0.095564
9,0.0199,0.107119
10,0.0154,0.114132


TrainOutput(global_step=35490, training_loss=0.04592261390976249, metrics={'train_runtime': 7822.9572, 'train_samples_per_second': 72.576, 'train_steps_per_second': 4.537, 'total_flos': 3.734799508488192e+16, 'train_loss': 0.04592261390976249, 'epoch': 10.0})

In [17]:
trainer.save_model("my_model")

In [18]:
test_dataloader = DataLoader(preprocessed_data['test'], batch_size=32, shuffle=False)


In [19]:
# Put the model in evaluation mode
model.eval()

# Move the model to the appropriate device (GPU or CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

predictions = []
true_labels = []

with torch.no_grad():
    for batch in test_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        logits = outputs.logits
        predictions.extend(logits.detach().cpu().numpy())
        true_labels.extend(batch['labels'].detach().cpu().numpy())


In [20]:
# Apply softmax to logits and then take the argmax to get the most likely label
sigmoid_outputs = torch.sigmoid(torch.tensor(predictions))
threshold = 0.5
binary_predictions = (sigmoid_outputs > threshold).numpy()  # Applying a threshold to get binary values


# Calculate the metrics
accuracy = accuracy_score(true_labels, binary_predictions)
precision, recall, f1, _ = precision_recall_fscore_support(true_labels, binary_predictions, average='micro')

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1: {f1}")


Accuracy: 0.8661150298426479
Precision: 0.8981601451153148
Recall: 0.8915755627009646
F1: 0.8948557413025237


  sigmoid_outputs = torch.sigmoid(torch.tensor(predictions))


In [21]:
# Generate a classification report
report = classification_report(true_labels, binary_predictions, target_names=mlb.classes_)
print(report)


                    precision    recall  f1-score   support

Restaurant-Request       0.52      0.51      0.51       286
     general-thank       0.98      0.97      0.98       940
     general-greet       1.00      0.83      0.91         6
     Hotel-Request       0.57      0.52      0.54       292
             other       0.93      0.92      0.92      3307
 Restaurant-Inform       0.88      0.91      0.90      1323
       general-bye       1.00      1.00      1.00       293
      Hotel-Inform       0.90      0.89      0.90      1328

         micro avg       0.90      0.89      0.89      7775
         macro avg       0.85      0.82      0.83      7775
      weighted avg       0.90      0.89      0.89      7775
       samples avg       0.90      0.90      0.90      7775



  _warn_prf(average, modifier, msg_start, len(result))
