In [24]:
import pandas as pd
from tqdm import tqdm
import torch
from transformers import BertTokenizer, BertForSequenceClassification, TrainingArguments, Trainer
from transformers import AdamW
from torch.utils.data import DataLoader, TensorDataset, RandomSampler, Dataset
from sklearn.preprocessing import MultiLabelBinarizer
from torch import cuda


In [3]:
# One-time download: uncomment the load_dataset call below on the first run, then comment it out again.
from datasets import load_dataset
#dataset = load_dataset("multi_woz_v22")

In [25]:
# Pick the GPU when PyTorch reports one as available, otherwise fall back to the CPU.
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(device)

cpu


In [4]:
# One-time step: uncomment to cache the downloaded dataset on disk as "dataset.hf".
#dataset.save_to_disk("dataset.hf")

In [5]:
from datasets import DatasetDict
# Reload the dataset splits from the on-disk copy (presumably the one written by
# the commented-out save_to_disk cell) instead of downloading them again.
DATASET_PATH = "dataset.hf"
dataset = DatasetDict.load_from_disk(DATASET_PATH)

In [6]:
def process_service_list(service_list):
    """Normalise a turn's act-type labels to the set this notebook models.

    Labels starting with ``'Restaurant'``, ``'Hotel'`` or ``'general'`` are
    kept unchanged; every other label is replaced by ``'other'``. An empty
    input yields ``['other']``.

    Args:
        service_list: sized iterable of label strings (a list or an array).

    Returns:
        A duplicate-free list of labels in sorted order. Sorting makes the
        result reproducible: converting a set of strings straight to a list
        gives an order that can change from one interpreter run to the next.
    """
    kept_prefixes = ('Restaurant', 'Hotel', 'general')
    services = {
        service if service.startswith(kept_prefixes) else 'other'
        for service in service_list
    }
    # Every input element contributes one entry, so an empty set means the
    # input was empty. Testing the set (not the input) avoids the ambiguous
    # truth value of array-like inputs.
    if not services:
        services.add('other')
    return sorted(services)

In [7]:
#train = preprocess_split(dataset, 'train')
#test = preprocess_split(dataset, 'test')
#val = preprocess_split(dataset, 'validation')

In [8]:
def extract_utterance_and_act_types(dataset):
    """Collect the user utterances of every dialogue with their act labels.

    Args:
        dataset: pandas DataFrame with a ``turns`` column. Each entry is read
            by key: ``turns['utterance']``, ``turns['speaker']`` and
            ``turns['dialogue_acts']`` are zipped together as parallel
            per-turn sequences.

    Returns:
        A tuple ``(utterance_list, act_types)`` of two parallel lists: the
        utterance text of each turn whose speaker id is 0, and the label list
        produced for it by ``process_service_list``.
    """
    utterance_list = []
    act_types = []

    # Walk the column itself rather than ``dataset.loc[i]`` for i in
    # range(len(dataset)): label-based lookup only works when the index is
    # exactly 0..n-1, which is not the case after filtering or shuffling.
    for turns in tqdm(dataset['turns']):
        per_turn = zip(turns['utterance'], turns['speaker'], turns['dialogue_acts'])
        for utterance, speaker, dialogue_act in per_turn:
            if speaker != 0:  # keep only the user's turns (speaker id 0)
                continue
            raw_act_types = dialogue_act['dialog_act']['act_type']
            utterance_list.append(utterance)
            act_types.append(process_service_list(raw_act_types))

    return utterance_list, act_types

In [9]:
# Convert the training split to a DataFrame, then flatten it into parallel
# lists of user utterances and their act labels.
train_df = dataset['train'].to_pandas()
utterance_list, act_types = extract_utterance_and_act_types(train_df)

100%|██████████| 8437/8437 [00:00<00:00, 14494.94it/s]


In [10]:
# Number of distinct act labels seen across all extracted turns.
num_dialog_acts = len({act for act_list in act_types for act in act_list})

In [11]:
# Gather every distinct act label that occurs in the extracted turns.
all_labels = set()
for act_list in act_types:
    all_labels.update(act_list)
print(all_labels)

{'Restaurant-Request', 'Hotel-Inform', 'general-bye', 'Hotel-Request', 'Restaurant-Inform', 'general-thank', 'general-greet', 'other'}


In [20]:
# Binarizer that turns each list of act labels into a fixed-width 0/1 indicator vector.
# The classes are sorted because iterating a set of strings is not stable across
# interpreter runs: list(all_labels) could permute the label columns (and so the
# meaning of each model output index) between kernel restarts.
mlb = MultiLabelBinarizer(classes=sorted(all_labels))


In [13]:
# Load the pre-trained BERT tokenizer and a sequence-classification model with
# one output per act label.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
# An utterance can carry several act labels at once, so request the multi-label
# objective explicitly rather than leaving the model to infer it from the label
# dtype on the first forward pass.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=num_dialog_acts,
    problem_type="multi_label_classification",
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [21]:
class DialogueActDataset(Dataset):
    """Torch dataset pairing tokenised utterances with binarised act labels.

    Args:
        tokenizer: object exposing ``encode_plus`` (here the BERT tokenizer).
        utterances: indexable sequence of utterance strings.
        labels: sequence, parallel to ``utterances``, holding the list of
            label names for each utterance.
        label_binarizer: optional object exposing ``fit_transform``. When
            omitted, the module-level ``mlb`` is used, as before.
        max_length: length every utterance is padded or truncated to.

    Raises:
        ValueError: if ``utterances`` and ``labels`` differ in length.
    """

    def __init__(self, tokenizer, utterances, labels, label_binarizer=None, max_length=128):
        if len(utterances) != len(labels):
            raise ValueError(
                f"utterances and labels must have the same length, "
                f"got {len(utterances)} and {len(labels)}"
            )
        self.utterances = utterances
        self.tokenizer = tokenizer
        self.max_length = max_length
        # Fall back to the notebook-level binarizer so existing three-argument
        # calls keep working; passing one in removes the hidden global dependency.
        binarizer = mlb if label_binarizer is None else label_binarizer
        # Expected to yield one binary indicator row per sample.
        self.labels = binarizer.fit_transform(labels)

    def __len__(self):
        """Return the number of utterances in the dataset."""
        return len(self.utterances)

    def __getitem__(self, idx):
        """Return the model inputs and float label vector for sample ``idx``."""
        encoding = self.tokenizer.encode_plus(
            self.utterances[idx],
            add_special_tokens=True,
            max_length=self.max_length,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            truncation=True,
            return_tensors='pt'
        )
        label = self.labels[idx]  # binary indicator row, not a list of strings
        return {
            # The 'pt' tensors are flattened to 1-D so each sample has no
            # leading batch axis of its own.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            # Float labels, as stored by the original implementation.
            'labels': torch.tensor(label, dtype=torch.float)
        }

In [22]:
# Wrap the extracted utterances and labels in the torch dataset used for training.
# NOTE: this rebinds `dataset`, which until now held the DatasetDict loaded from
# disk; re-running the extraction cell afterwards requires reloading it first.
dataset = DialogueActDataset(
    tokenizer=tokenizer,
    utterances=utterance_list,
    labels=act_types,
)


In [23]:
# Hyper-parameters for the fine-tuning run, gathered in one place.
training_config = {
    'output_dir': './results',
    'num_train_epochs': 3,
    'per_device_train_batch_size': 16,
    'warmup_steps': 500,
    'weight_decay': 0.01,
    'logging_dir': './logs',
}
training_args = TrainingArguments(**training_config)

# Bind the model, the arguments and the training data together.
trainer = Trainer(model=model, args=training_args, train_dataset=dataset)

# Start fine-tuning.
trainer.train()

  0%|          | 0/10647 [00:00<?, ?it/s]

KeyboardInterrupt: 