In [1]:
#Mount Google drive
#from google.colab import drive
#drive.mount('/content/drive')

In [2]:
#change current working directory
#%cd "/content/drive/Shareddrives/NLI"

In [3]:
#!pip install transformers
#!pip install accelerate
#!pip install datasets
#!python3 -m spacy download en_core_web_lg

In [4]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import torch
from torch.utils.data import DataLoader, TensorDataset, RandomSampler, Dataset
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import recall_score, precision_score, f1_score
from torch import cuda
import re
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from seqeval.metrics import classification_report
from itertools import product
import os
import joblib
import json
import math
import nltk
from datasets import load_dataset
from datasets import DatasetDict

# Fetch the NLTK 'punkt' data; nothing is downloaded again when it is already up to date.
nltk.download('punkt')

import spacy

# Shared spaCy pipeline used by parse() to lemmatize utterances.
# The en_core_web_lg model must already be installed in the environment.
nlp = spacy.load("en_core_web_lg")


[nltk_data] Downloading package punkt to /home/mister/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [5]:
# Directory holding the on-disk copy of the dataset (read with load_from_disk,
# written with save_to_disk).
DATASET_DIRECTORY = "./../dataset.hf/"
# JSON caches of the extracted [utterances, labels] pair, one file per split.
TRAIN_DATA_SAVE_PATH = "preprocessed_data_to_be_requested/train.json"
TEST_DATA_SAVE_PATH = "preprocessed_data_to_be_requested/test.json"
VALID_DATA_SAVE_PATH = "preprocessed_data_to_be_requested/valid.json"

# When True, the on-disk copy is ignored and the dataset is fetched with load_dataset().
ALWAYS_LOAD_DATASET = False
# When True, a dataset fetched with load_dataset() is written to DATASET_DIRECTORY.
SAVE_DATASET = True

In [6]:
import json

def save_variable_to_json(variable, file_path):
    """Serialize ``variable`` to ``file_path`` as JSON.

    Missing parent directories are created first, so saving into a cache
    folder that does not exist yet no longer fails after the expensive
    preprocessing has already been done.

    Args:
        variable: Any JSON-serializable object.
        file_path: Destination path; an existing file is overwritten.

    Raises:
        TypeError: If ``variable`` is not JSON-serializable.
        OSError: If the directory or file cannot be created.
    """
    parent_dir = os.path.dirname(file_path)
    if parent_dir:
        # A bare file name has no parent component; only create real folders.
        os.makedirs(parent_dir, exist_ok=True)
    with open(file_path, 'w') as file:
        json.dump(variable, file)

def load_variable_from_json(file_path):
    """Open ``file_path`` for reading and return the object decoded from its JSON content."""
    with open(file_path, 'r') as handle:
        return json.load(handle)

In [7]:
# Extracting only hotel and restaurant features from the dataset
def preprocess_split(dataset, split):
    """Keep only the dialogues of ``split`` that involve a hotel or restaurant.

    A dialogue is kept when at least one of its turns has a frame whose
    ``service`` list contains ``'hotel'`` or ``'restaurant'``.

    The matching rows are selected in a single step.  The previous version
    appended them one at a time to an empty frame, which is slow for
    thousands of dialogues and turned every column into ``object`` dtype.

    Args:
        dataset: Mapping from split name to an object exposing ``to_pandas()``.
        split: Name of the split to filter, e.g. ``'train'``.

    Returns:
        pandas.DataFrame with the same columns as the split, holding the kept
        dialogues in their original order and indexed 0..k-1.
    """
    target_services = {'hotel', 'restaurant'}
    df = dataset[split].to_pandas()

    def touches_target_service(turns):
        # Frames are addressed by utterance position, as in the original filter.
        frames = turns['frames']
        return any(
            target_services.intersection(frames[turn_id]['service'])
            for turn_id in range(len(turns['utterance']))
        )

    kept_positions = [
        position
        for position, turns in enumerate(df['turns'])
        if touches_target_service(turns)
    ]
    # reset_index keeps the 0..k-1 labels that callers rely on via .loc[i].
    return df.iloc[kept_positions].reset_index(drop=True)

In [8]:
def process_service_list(service_list):
    """Collapse service names to the labels 'restaurant', 'hotel' and 'other'.

    Any service that is neither 'restaurant' nor 'hotel' counts as 'other',
    and an empty input yields ['other'].  Each label appears at most once;
    the order of the returned list is not defined.
    """
    if len(service_list) == 0:
        return ['other']

    labels = {
        'restaurant' if name == 'restaurant'
        else 'hotel' if name == 'hotel'
        else 'other'
        for name in service_list
    }
    return list(labels)

def process_dialogue_act(service_list):
    """Keep act types of the tracked domains and fold every other one into 'other'.

    An act type is kept unchanged when it starts with Restaurant, Hotel,
    general or Booking (capitalised or lower-case first letter).  Anything
    else, and an empty input, contributes 'other'.  Each entry appears at
    most once; the order of the returned list is not defined.
    """
    kept_prefixes = (
        'Restaurant', 'restaurant',
        'Hotel', 'hotel',
        'general', 'General',
        'Booking', 'booking',
    )
    if len(service_list) == 0:
        return ['other']
    unique_acts = {
        act if act.startswith(kept_prefixes) else 'other'
        for act in service_list
    }
    return list(unique_acts)

In [9]:
def parse(sentence):
    """Return ``sentence`` with every token replaced by its lemma.

    The text is run through the module-level spaCy pipeline ``nlp`` and the
    lemmas of all tokens are joined with single spaces.  No token is
    filtered out, so stop words and punctuation stay in the result.
    """
    doc = nlp(sentence)
    lemmas = [token.lemma_ for token in doc]
    return " ".join(lemmas)
                

In [10]:
def get_act_type_to_slots(dialogue_act):
    """Group the Restaurant/Hotel slots of one turn by act type.

    Args:
        dialogue_act: Dict with a ``'span_info'`` entry (parallel lists
            ``act_type``, ``act_slot_name``, ``act_slot_value``) and a
            ``'dialog_act'`` entry (``act_type`` list plus ``act_slots``,
            each holding parallel ``slot_name`` / ``slot_value`` lists).

    Returns:
        Dict mapping act type to a list of ``(slot_name, slot_value)``
        tuples: first the slots listed in ``span_info``, then the
        ``dialog_act`` slots whose value is ``"?"``.
    """
    domain_prefixes = ('Restaurant', 'Hotel')
    slots_by_act = {}

    # Slots with a concrete value, taken from the span annotations.
    spans = dialogue_act['span_info']
    span_acts = spans['act_type']
    span_names = spans['act_slot_name']
    span_values = spans['act_slot_value']
    for name, act, value in zip(span_names, span_acts, span_values):
        if act.startswith(domain_prefixes):
            slots_by_act.setdefault(act, []).append((name, value))

    # Slots that are asked about: value "?" and a real slot name.
    acts = dialogue_act['dialog_act']
    for act, slot_group in zip(acts['act_type'], acts['act_slots']):
        if not act.startswith(domain_prefixes):
            continue
        for name, value in zip(slot_group['slot_name'], slot_group['slot_value']):
            if name != "none" and value == "?":
                slots_by_act.setdefault(act, []).append((name, value))

    return slots_by_act

In [11]:
def get_to_be_provided_overall(dialogue_act, current_booking_service):
    """List what a turn provides, as sorted ``domain-slot:value`` strings.

    For every act whose type starts with Restaurant, Hotel, Booking or
    general, each slot with a real name (not ``"none"``) and a concrete value
    (not ``"?"``) becomes ``domain-slot:value``.  An act that provides at
    least one such slot also contributes ``domain-availability:yes``.  A
    negative act (its type contains ``"-No"``, e.g. ``Hotel-NoOffer``)
    contributes ``domain-availability:no`` instead, even without slots.
    If a domain ends up with both markers, only ``availability:no`` is kept.

    Two defects of the earlier version are fixed here:
      * the domain was compared with ``"booking"`` before being lower-cased,
        so ``Booking-*`` acts were never mapped to the booked service;
      * negativity was tested with ``"-No" in dialogue_act``, which looks at
        the dict's keys and is always False; the act type is tested now.

    Args:
        dialogue_act: Dict whose ``'dialog_act'`` entry holds an ``act_type``
            list and a parallel ``act_slots`` list (each with parallel
            ``slot_name`` / ``slot_value`` lists).
        current_booking_service: List of services being booked.  When it has
            exactly one element, Booking acts are attributed to that service.

    Returns:
        Sorted list of unique strings.
    """
    acts = dialogue_act['dialog_act']
    act_types = acts['act_type']
    act_slots = acts['act_slots']

    provided = set()
    for act_type, act_slot in zip(act_types, act_slots):
        if not act_type.startswith(('Restaurant', 'Hotel', 'Booking', 'general')):
            continue

        # Lower-case before comparing so "Booking-..." is recognised.
        domain = act_type.split("-")[0].lower()
        if domain == "booking" and len(current_booking_service) == 1:
            # An unambiguous booking is attributed to the service being booked.
            domain = current_booking_service[0].lower()
        if domain not in ("hotel", "restaurant", "booking", "general"):
            continue

        # Slots that carry a concrete value (requests, marked "?", are skipped).
        slots = [
            domain + "-" + slot_name + ":" + slot_value
            for slot_name, slot_value in zip(act_slot['slot_name'], act_slot['slot_value'])
            if slot_name != "none" and slot_value != "?"
        ]

        if "-No" in act_type:
            provided.add("%s-availability:no" % (domain))
            provided.update(slots)
        elif slots:
            provided.add("%s-availability:yes" % (domain))
            provided.update(slots)

    # "no" wins over "yes" for the same domain.
    negative_markers = [entry for entry in provided if entry.endswith("availability:no")]
    for marker in negative_markers:
        provided.discard(marker[:-2] + "yes")

    return sorted(provided)


In [12]:
def get_to_be_requested(dialogue_act, current_booking_service):
    """Return the sorted, unique ``domain-slot`` names that a turn asks about.

    Only acts whose type starts with Restaurant, Hotel, Booking or general
    are considered.  A slot counts as requested when its name is not
    ``"none"`` and its value is ``"?"``.  Booking acts are attributed to the
    booked service when ``current_booking_service`` holds exactly one entry.

    Args:
        dialogue_act: Dict whose ``'dialog_act'`` entry holds an ``act_type``
            list and a parallel ``act_slots`` list.
        current_booking_service: List of services being booked.

    Returns:
        Sorted list of unique ``domain-slot`` strings (possibly empty).
    """
    acts = dialogue_act['dialog_act']
    act_types = acts['act_type']
    act_slots = acts['act_slots']

    tracked_prefixes = ('Restaurant', 'Hotel', 'Booking', 'general')
    tracked_domains = ("hotel", "restaurant", "booking", "general")

    requested = set()
    for act_type, slot_group in zip(act_types, act_slots):
        if not act_type.startswith(tracked_prefixes):
            continue

        domain = act_type.split("-")[0].lower()
        if domain == "booking" and len(current_booking_service) == 1:
            # Resolve an unambiguous booking to the service it refers to.
            domain = current_booking_service[0].lower()
        if domain not in tracked_domains:
            continue

        pairs = zip(slot_group['slot_name'], slot_group['slot_value'])
        requested.update(
            domain + "-" + name
            for name, value in pairs
            if name != "none" and value == "?"
        )

    return sorted(requested)

In [13]:
def concatenate_to_be_retrieved(prev_user_utterance, prev_user_act_type_to_slots, user_utterance, user_act_type_to_slots, user_booking_service):
    """Join two user turns and the booked services into one ' | '-separated string.

    Layout of the result::

        <earlier utterance> | <earlier acts> | <utterance> | <acts> | <services>

    where acts are rendered as ``act = slot : value , slot : value``, several
    acts are separated by `` ; `` and services by `` , ``.

    Args:
        prev_user_utterance: Text of the earlier user turn (may be empty).
        prev_user_act_type_to_slots: Dict act type -> list of (slot, value).
        user_utterance: Text of the latest user turn.
        user_act_type_to_slots: Dict act type -> list of (slot, value).
        user_booking_service: List of service names.

    Returns:
        The concatenated string.
    """
    def render_acts(act_type_to_slots):
        # "act = name : value , name : value" per act, acts joined by " ; ".
        rendered = []
        for act_type, slot_pairs in act_type_to_slots.items():
            slot_text = " , ".join(name + " : " + value for name, value in slot_pairs)
            rendered.append(act_type + " = " + slot_text)
        return " ; ".join(rendered)

    fields = [
        prev_user_utterance,
        render_acts(prev_user_act_type_to_slots),
        user_utterance,
        render_acts(user_act_type_to_slots),
        " , ".join(user_booking_service),
    ]
    return " | ".join(fields)

In [14]:
def extract_to_be_requested(dataset):
    """Build (input text, requested slots) pairs from agent turns that ask for something.

    An agent turn produces one example when
      * after ``process_dialogue_act`` at least one of its act types ends
        with ``'Request'``, and
      * the frame of the turn right before it lists ``'hotel'`` or
        ``'restaurant'`` among its services.

    The input text has six `` | ``-separated fields::

        utterance of turn j-3 | its acts | utterance of turn j-1 | its acts |
        booked services | what the agent turn provides

    Turn ``j-3`` is left empty when it does not exist.  The label is the
    output of ``get_to_be_requested`` for the agent turn, or ``['none']``
    when that is empty.

    Changes from the earlier version: the ``historical_slots`` list that was
    filled but never used is gone, and an agent turn at position 0 is skipped
    instead of silently reading the last turn through index ``-1``.
    NOTE(review): the original notes listed "historical slots" as a field,
    but it was never part of the emitted text -- confirm whether it should be.

    Args:
        dataset: DataFrame indexed 0..n-1 with a ``turns`` column, as
            returned by ``preprocess_split``.

    Returns:
        Tuple ``(training_list, target_list)`` of equal length: the input
        strings and, for each, the list of requested slot names.
    """
    agent_speaker = 1  # speaker codes used in the turns: 0 = user, 1 = agent
    training_list = []
    target_list = []

    for i in tqdm(range(len(dataset))):
        turns = dataset.loc[i].turns
        utterances = turns['utterance']
        dialogue_acts = turns['dialogue_acts']

        for j, (_utterance, speaker, dialogue_act) in enumerate(zip(utterances, turns['speaker'], dialogue_acts)):
            if speaker != agent_speaker:
                continue
            if j == 0:
                # No user turn precedes this one; j - 1 would wrap to the last turn.
                continue

            # The data is very unbalanced, so only agent turns that request
            # something in one of the tracked domains are kept.
            act_types = process_dialogue_act(dialogue_act['dialog_act']['act_type'])
            if not any(act_type.endswith('Request') for act_type in act_types):
                continue

            services = turns['frames'][j - 1]['service']
            user_booking_service = [service for service in services if service in ["hotel", "restaurant"]]
            if len(user_booking_service) == 0:
                continue

            # User turn before the previous one, when the dialogue is long enough.
            if j - 3 >= 0:
                prev_user_act_type_to_slots = get_act_type_to_slots(dialogue_acts[j - 3])
                prev_user_utterance = parse(utterances[j - 3])
            else:
                prev_user_utterance = ""
                prev_user_act_type_to_slots = {}

            # User turn that the agent is answering.
            user_act_type_to_slots = get_act_type_to_slots(dialogue_acts[j - 1])
            user_utterance = parse(utterances[j - 1])

            to_be_provided = get_to_be_provided_overall(dialogue_act, user_booking_service)

            historical_utterance = concatenate_to_be_retrieved(
                prev_user_utterance,
                prev_user_act_type_to_slots,
                user_utterance,
                user_act_type_to_slots,
                user_booking_service,
            )
            historical_utterance += " | "
            historical_utterance += " , ".join(to_be_provided)

            # Label: the slots the agent asks for; 'none' marks an empty request.
            targets = get_to_be_requested(dialogue_act, user_booking_service)
            if len(targets) == 0:
                targets.append('none')

            training_list.append(historical_utterance)
            target_list.append(targets)

    return training_list, target_list

In [15]:
# Load the preprocessed splits from their JSON caches; rebuild them from the
# dataset only when a cache is missing or unreadable.
try:
    print('Loading from json...')
    train_utterances, train_labels = load_variable_from_json(TRAIN_DATA_SAVE_PATH)
    test_utterances, test_labels = load_variable_from_json(TEST_DATA_SAVE_PATH)
    valid_utterances, valid_labels = load_variable_from_json(VALID_DATA_SAVE_PATH)
    print('Loaded from json')
except (OSError, ValueError, TypeError) as cache_error:
    # OSError: file missing or unreadable; ValueError: invalid JSON or a
    # payload that is not a 2-item list; TypeError: payload is not a sequence.
    # Anything else (e.g. KeyboardInterrupt) is no longer swallowed.
    print('Exception occured during loading from json')
    print('Reason:', repr(cache_error))
    print('Loading dataset...')

    dataset = None
    if not ALWAYS_LOAD_DATASET:
        try:
            dataset = DatasetDict.load_from_disk(DATASET_DIRECTORY)
        except Exception as disk_error:
            # Best effort: any failure to read the local copy falls back to a fresh fetch.
            print('Could not load dataset from disk:', repr(disk_error))

    if dataset is None:
        dataset = load_dataset("multi_woz_v22")

        if SAVE_DATASET:
            dataset.save_to_disk(DATASET_DIRECTORY)

    print('Preprocessing...')
    train = preprocess_split(dataset, 'train')
    val = preprocess_split(dataset, 'validation')
    test = preprocess_split(dataset, 'test')

    print('Extracting utterances and act types...')
    train_utterances, train_labels = extract_to_be_requested(train)
    test_utterances, test_labels = extract_to_be_requested(test)
    valid_utterances, valid_labels = extract_to_be_requested(val)

    print('Saving to json...')
    save_variable_to_json([train_utterances,train_labels] , TRAIN_DATA_SAVE_PATH)
    save_variable_to_json([test_utterances,test_labels], TEST_DATA_SAVE_PATH)
    save_variable_to_json([valid_utterances,valid_labels], VALID_DATA_SAVE_PATH)

    print('Saved to json')

Loading from json...
Exception occured during loading from json
Loading dataset...
Preprocessing...


Extracting utterances and act types...


100%|██████████| 6321/6321 [01:21<00:00, 77.82it/s] 
100%|██████████| 745/745 [00:10<00:00, 69.55it/s]
100%|██████████| 762/762 [00:10<00:00, 75.49it/s]

Saving to json...
Saved to json





In [19]:
# Show one training example labelled ['none'] (if any) and how common that label is.
none_count = train_labels.count(['none'])
if none_count:
    # index() raises ValueError when the label is absent, so look it up only
    # when at least one such example exists.
    print(train_utterances[train_labels.index(['none'])])
print("Number of 'none' labels:", none_count)
print("Total number of labels:", len(train_labels))

can I get the post code for Christ 's College ? |  | I be also look for a place to stay that have 4 star and in the centre of town . | Hotel-Inform = stars : 4 , area : centre | hotel | hotel-availability:yes , hotel-name:Alexander B&B , hotel-name:University Arms Hotel] , hotel-pricerange:?]
Number of 'none' labels: 4
Total number of labels: 7188


In [20]:
# Print the first 100 (input text, label) pairs of the training split,
# each followed by an empty line.
for utterance, act_type in zip(train_utterances[:100], train_labels[:100]):
    print(f"Utterance: {utterance}\nAct Type: {act_type}\n")


Utterance: sound good , could I get that phone number ? also , could you recommend I an expensive hotel ? | Hotel-Inform = pricerange : expensive , type : hotel ; Restaurant-Request = phone : ? | yes . can you book it for I ? |  | hotel | 
Act Type: ['hotel-bookday']

Utterance: Guten Tag , I be stay overnight in Cambridge and need a place to sleep . I need free parking and internet . |  | no , but I would really like to be on the south end of the city . do any of those fit the bill ? | Hotel-Inform = area : south | hotel , restaurant | 
Act Type: ['hotel-pricerange', 'restaurant-pricerange']

Utterance: no I do not care about the price . which one do you recommend ? | Hotel-Inform = pricerange : dontcare ; Restaurant-Inform = pricerange : dontcare | yes , book it for 4 people and 4 night start from tuesday . | Hotel-Inform = bookstay : 4 , bookpeople : 4 , bookday : tuesday | hotel | 
Act Type: ['hotel-bookday', 'hotel-bookstay']

Utterance: hi there ! can you give I some info on City

In [18]:
# Count the training examples whose label is exactly ['none'] and compare
# with the total number of examples.
none_count = train_labels.count(['none'])
print(f"Number of 'none' labels: {none_count}")
print(f"Total number of labels: {len(train_labels)}")


Number of 'none' labels: 4
Total number of labels: 7188
