In [2]:
# !pip install seqeval

In [3]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForTokenClassification
from datasets import load_dataset
from torch import nn
from tqdm import tqdm
import numpy as np
import torch
from seqeval.metrics import classification_report, accuracy_score
import matplotlib.pyplot as plt

In [4]:
# --- Run configuration -------------------------------------------------------
# Name of the pretrained transformer checkpoint (presumably fed to the
# AutoTokenizer / AutoModelForTokenClassification loaders — confirm in later cells).
TRANSFORMER_MODEL_NAME = 'roberta-base'
# Suffix identifying this experiment variant; presumably appended to saved-model names — TODO confirm.
SAVE_MODEL_SUFFIX = 'with_intent'
# Last path component of the checkpoint name (strips any 'org/' prefix such as in 'org/model').
save_model_name = TRANSFORMER_MODEL_NAME.split('/')[-1]
# Training hyper-parameters (consumed outside this cell).
epochs = 10
batch_size = 16
learning_rate = 2e-5
# NOTE(review): the original note below recommends 0.99999 while the value in use is 0.999 — confirm which is intended.
class_weight_beta = 0.999 # 0.99999 (yes, 5 nines) should work ok, increase number of nines if you want stronger imbalance compensation
# Presumably the early-stopping patience, in epochs — TODO confirm against the training loop.
patience = 2
# Tag names to be ignored; look like BIO slot tags — how they are ignored is not visible in this cell.
ignored_tags = ['I-hotel-bookstay', 'I-hotel-stars']
# Presumably toggles feeding dialogue history to the model — TODO confirm.
use_history = True

In [5]:
def process_intent_list(intent_list):
    """Reduce raw intent labels to the in-scope ones plus a catch-all ``'other'``.

    Intents whose name starts with ``'Restaurant'``, ``'Hotel'`` or
    ``'general'`` are kept verbatim; every other intent is replaced by
    ``'other'``. An empty input yields ``['other']``.

    Args:
        intent_list: Iterable of intent-name strings.

    Returns:
        list[str]: The unique resulting labels, sorted so that the output is
        deterministic (iterating a set of strings directly gives an order
        that changes with the interpreter's hash seed).
    """
    kept_prefixes = ('Restaurant', 'Hotel', 'general')
    intents = {
        intent if intent.startswith(kept_prefixes) else 'other'
        for intent in intent_list
    }
    # The set is empty only when the input had no intents at all.
    if not intents:
        intents.add('other')
    return sorted(intents)

def process_service_list(service_list):
    """Reduce service names to ``'restaurant'``, ``'hotel'`` and ``'other'``.

    ``'restaurant'`` and ``'hotel'`` are kept; any other service name is
    mapped to ``'other'``. An empty input yields ``['other']``.

    Args:
        service_list: Iterable of service-name strings.

    Returns:
        list[str]: The unique resulting labels (at most three), sorted so
        that the output is deterministic across interpreter runs.
    """
    in_scope = ('restaurant', 'hotel')
    services = {
        str(service) if service in in_scope else 'other'
        for service in service_list
    }
    # An empty set means there were no services at all -> fall back to 'other'.
    return sorted(services) or ['other']

In [6]:
def _dialogue_uses_services(turns, services):
    """Return True if any frame in ``turns['frames']`` lists one of ``services``."""
    wanted = set(services)
    return any(not wanted.isdisjoint(frame['service']) for frame in turns['frames'])


def preprocess_split(dataset, split, services=('hotel', 'restaurant')):
    """Keep only the dialogues of one split that involve the given services.

    A dialogue is kept when at least one frame of at least one of its turns
    has a ``'service'`` entry contained in ``services``.

    Args:
        dataset: Mapping from split name to an object exposing ``to_pandas()``
            (here: the result of ``load_dataset``).
        split: Name of the split to process, e.g. ``'train'``.
        services: Service names that make a dialogue relevant. Defaults to
            hotel and restaurant.

    Returns:
        pandas.DataFrame: The matching rows, in their original order, with the
        same columns as the split and a fresh ``0..n-1`` index.
    """
    df = dataset[split].to_pandas()
    # Build the row mask in one pass instead of appending rows one at a time,
    # which re-allocates the frame on every insert.
    keep = df['turns'].map(
        lambda turns: _dialogue_uses_services(turns, services)
    ).astype(bool)
    return df.loc[keep].reset_index(drop=True)

In [9]:
# Load MultiWOZ 2.2 and run every split through preprocess_split.
dataset = load_dataset('multi_woz_v22')

train, val, test = (
    preprocess_split(dataset, split_name)
    for split_name in ('train', 'validation', 'test')
)

No config specified, defaulting to: multi_woz_v22/v2.2_active_only
Found cached dataset multi_woz_v22 (/home/adrian/.cache/huggingface/datasets/multi_woz_v22/v2.2_active_only/2.2.0/6719c8b21478299411a0c6fdb7137c3ebab2e6425129af831687fb7851c69eb5)


  0%|          | 0/3 [00:00<?, ?it/s]

In [18]:
# Sanity check: print one randomly chosen training dialogue as a transcript.
# No seed is set here, so a different dialogue is shown on every run.
pick_ind = np.random.choice(len(train))
pick = train.iloc[pick_ind]
print("INDEX: ", pick_ind)

# Speaker id 0 is rendered as the user; any other id as the bot.
dialogue = '\n'.join(
    ('BOT: ' if speaker != 0 else 'USR: ') + utterance
    for speaker, utterance in zip(pick.turns['speaker'], pick.turns['utterance'])
)
print(dialogue)

INDEX:  1840
USR: Hi! I'd like a hotel with a 4 star rating and free parking, please.
BOT: Were you looking for a hotel in a particular area?
USR: I'm trying to stay on the east, if possible.
BOT: I have 5 of them in the east. Three of them are cheap and two are moderate priced. Did you have a preference?
USR: The price doesn't matter. What do you recommend?
BOT: I would recommend the allenbell.
USR: Does it have free wifi?
BOT: Yes, it does have free wifi. Would you like me to book this for you?
USR: Not yet. What is their phone number?
BOT: Yes their telephone number is 01223210353.
USR: Great and what is their post code?
BOT: postcode, is cb13js, address is 517a coldham lane, internet and parking is included and price is cheap. Can I help with anything else?
USR: Would you also find me a hotel in the same area?
BOT: In the east, I have 6 different guesthouses and 1 hotel. Do you have a preference?
USR: No, that's OK. The Allenbell is great. What i need is information on boat attract