In [45]:
#Mount Google drive
#from google.colab import drive
#drive.mount('/content/drive')

In [46]:
#change current working directory
#%cd "/content/drive/Shareddrives/NLI"

In [47]:
#!pip install transformers
#!pip install accelerate
#!pip install datasets
#!python3 -m spacy download en_core_web_lg

In [48]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import torch
from torch.utils.data import DataLoader, TensorDataset, RandomSampler, Dataset
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import recall_score, precision_score, f1_score
from torch import cuda
import re
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from seqeval.metrics import classification_report
from itertools import product
import os
import joblib
import json
import math
import nltk
from datasets import load_dataset
from datasets import DatasetDict

# Fetch the NLTK "punkt" data; the call reports "already up-to-date" when the
# package is present locally.
nltk.download('punkt')

import spacy

# Shared spaCy pipeline; `parse` uses it to lemmatize utterances.
# The "en_core_web_lg" model must be installed beforehand
# (python3 -m spacy download en_core_web_lg).
nlp = spacy.load("en_core_web_lg")


[nltk_data] Downloading package punkt to /home/mister/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [49]:
# Local copy of the Hugging Face dataset: read with DatasetDict.load_from_disk
# and written with save_to_disk.
DATASET_DIRECTORY = "./../dataset.hf/"
# JSON caches, one per split, each holding the pair [utterances, labels].
TRAIN_DATA_SAVE_PATH = "preprocessed_data_to_be_retrieved/train.json"
TEST_DATA_SAVE_PATH = "preprocessed_data_to_be_retrieved/test.json"
VALID_DATA_SAVE_PATH = "preprocessed_data_to_be_retrieved/valid.json"

# True: ignore the local dataset copy and download the dataset again.
# Only consulted after loading the JSON caches has failed.
ALWAYS_LOAD_DATASET = False
# True: store a freshly downloaded dataset in DATASET_DIRECTORY.
SAVE_DATASET = True

In [50]:
import json

def save_variable_to_json(variable, file_path):
    """Serialize `variable` to `file_path` as JSON.

    Missing parent directories are created first, so a save issued at the end
    of a long preprocessing run does not fail only because the output folder
    is absent.

    Args:
        variable: Any JSON-serializable object.
        file_path: Destination path; an existing file is overwritten.

    Raises:
        TypeError: If `variable` contains a value `json.dump` cannot encode.
        OSError: If the directory or the file cannot be created or written.
    """
    parent_dir = os.path.dirname(file_path)
    # dirname is '' for a bare file name; os.makedirs('') would raise.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    with open(file_path, 'w', encoding='utf-8') as file:
        json.dump(variable, file)

def load_variable_from_json(file_path):
    """Read the JSON document stored at `file_path` and return the decoded object."""
    with open(file_path, 'r') as handle:
        return json.load(handle)

In [51]:
# Extracting only hotel and restaurant features from the dataset
def preprocess_split(dataset, split):
    """Keep only the dialogues of `split` that involve a hotel or a restaurant.

    A dialogue is kept when at least one of its turns lists 'hotel' or
    'restaurant' among the services of its frame.

    The kept rows are selected in a single pass instead of being appended one
    by one to a growing DataFrame, which avoids quadratic copying and keeps
    the original column dtypes.

    Args:
        dataset: Mapping from split name to an object exposing `to_pandas()`.
        split: Name of the split to filter, e.g. 'train'.

    Returns:
        A DataFrame with the same columns as the split, restricted to the kept
        dialogues and re-indexed from 0.
    """
    target_services = {'hotel', 'restaurant'}
    df = dataset[split].to_pandas()

    kept_positions = []
    for position, turns in enumerate(df['turns']):
        frames = turns['frames']
        if any(target_services.intersection(frames[turn_id]['service'])
               for turn_id in range(len(turns['utterance']))):
            kept_positions.append(position)

    # Positional selection works whatever the index labels are; the fresh
    # 0..n-1 index is what callers rely on when they use `.loc[i]`.
    return df.iloc[kept_positions].reset_index(drop=True)

In [52]:
def process_service_list(service_list):
    """Collapse service names into the labels 'restaurant', 'hotel' and 'other'.

    Every name other than 'restaurant' or 'hotel' maps to 'other'; an empty
    input also yields 'other'.

    Returns:
        The distinct labels as a list, in no guaranteed order.
    """
    tracked = ('restaurant', 'hotel')
    labels = set()
    if len(service_list) == 0:
        labels.add('other')
    labels.update(name if name in tracked else 'other' for name in service_list)
    return list(labels)

def process_dialogue_act(service_list):
    """Keep Restaurant*, Hotel* and general* act names; map the rest to 'other'.

    An empty input yields ['other'].

    Returns:
        The distinct resulting names as a list, in no guaranteed order.
    """
    kept_prefixes = ('Restaurant', 'Hotel', 'general')
    acts = set()
    if len(service_list) == 0:
        acts.add('other')
    acts.update(
        name if name.startswith(kept_prefixes) else 'other'
        for name in service_list
    )
    return list(acts)

In [53]:
def parse(sentence):
    """Return `sentence` with every token replaced by its lemma.

    The text is processed by the module-level spaCy pipeline `nlp`; the lemma
    of each resulting token is joined with single spaces. No token is dropped
    (stop words are kept).
    """
    doc = nlp(sentence)
    lemmas = [token.lemma_ for token in doc]
    return " ".join(lemmas)
                

In [54]:
def get_act_type_to_slots(dialogue_act):
    """Group the Restaurant*/Hotel* slots of one turn by act type.

    Two sources are merged, in this order:
      1. `span_info`: every (slot name, slot value) pair it lists;
      2. `dialog_act`: slots whose value is "?" and whose name is not "none".

    Args:
        dialogue_act: Mapping with the keys 'span_info' (parallel lists
            'act_type', 'act_slot_name', 'act_slot_value') and 'dialog_act'
            (parallel lists 'act_type' and 'act_slots', each slot group
            holding parallel 'slot_name' / 'slot_value' lists).

    Returns:
        Dict mapping act type -> list of (slot_name, slot_value) tuples, in
        order of first appearance.
    """
    domain_prefixes = ('Restaurant', 'Hotel')
    slots_by_act = {}

    # Slots listed in the span annotations of the utterance.
    spans = dialogue_act['span_info']
    span_acts = spans['act_type']
    span_names = spans['act_slot_name']
    span_values = spans['act_slot_value']
    for name, act, value in zip(span_names, span_acts, span_values):
        if act.startswith(domain_prefixes):
            slots_by_act.setdefault(act, []).append((name, value))

    # Slots without a value in the utterance (marked with "?").
    acts = dialogue_act['dialog_act']['act_type']
    slot_groups = dialogue_act['dialog_act']['act_slots']
    for act, group in zip(acts, slot_groups):
        if not act.startswith(domain_prefixes):
            continue
        for name, value in zip(group['slot_name'], group['slot_value']):
            if name == "none" or value != "?":
                continue
            slots_by_act.setdefault(act, []).append((name, value))

    return slots_by_act

In [55]:
def get_to_be_retrieved(dialogue_act, current_booking_service):
    """List the slot labels carried with a value by a Restaurant*/Hotel* act.

    For every act whose type starts with 'Restaurant' or 'Hotel', each slot
    with a name other than "none" and a value other than "?" yields the label
    "<domain>-<slot_name>", where <domain> is the lower-cased part of the act
    type before the first "-". When an act yields at least one such slot, the
    label "<domain>-availability" is added as well, and "<domain>-choice" is
    dropped.

    Args:
        dialogue_act: Mapping whose 'dialog_act' entry holds the parallel
            lists 'act_type' and 'act_slots' (each slot group holding
            parallel 'slot_name' / 'slot_value' lists).
        current_booking_service: Currently unused; kept so existing callers
            keep working. NOTE(review): the previous code tried to map a
            "booking" domain to this service, but that branch could never run
            because only Restaurant*/Hotel* acts pass the filter. Booking-*
            acts are therefore ignored; confirm whether that is intended.

    Returns:
        Sorted list of distinct labels; empty when nothing qualifies.
    """
    act_types = dialogue_act['dialog_act']['act_type']
    act_slots = dialogue_act['dialog_act']['act_slots']

    to_be_retrieved = set()
    for act_type, act_slot in zip(act_types, act_slots):
        if not act_type.startswith(('Restaurant', 'Hotel')):
            continue
        domain = act_type.split("-")[0].lower()

        # Slots that come with an actual value (i.e. not a "?" request).
        slots = [
            domain + "-" + slot_name
            for slot_name, slot_value in zip(act_slot['slot_name'], act_slot['slot_value'])
            if slot_name != "none" and slot_value != "?"
        ]
        if not slots:
            continue

        to_be_retrieved.add(domain + "-availability")
        # NOTE(review): why "choice" is excluded is not documented; it
        # presumably only carries a result count. Confirm.
        choice_label = domain + "-choice"
        to_be_retrieved.update(label for label in slots if label != choice_label)

    return sorted(to_be_retrieved)


In [56]:
def concatenate_to_be_retrieved(prev_user_utterance, prev_user_act_type_to_slots, user_utterance, user_act_type_to_slots, user_booking_service):
    """Assemble the model input text from two user turns and the active services.

    Layout (fields separated by " | "):
        previous utterance | previous acts | current utterance | current acts | services

    Acts are rendered as "act = name : value , name : value" and joined with
    " ; "; services are joined with " , ".

    Args:
        prev_user_utterance: Text of the earlier user turn ("" when absent).
        prev_user_act_type_to_slots: Dict act type -> list of (name, value)
            pairs for the earlier turn.
        user_utterance: Text of the latest user turn.
        user_act_type_to_slots: Same structure, for the latest turn.
        user_booking_service: Iterable of service names.

    Returns:
        The concatenated string.
    """
    def render_acts(act_type_to_slots):
        # One "act = slots" chunk per act type, in dict order.
        return " ; ".join(
            act_type + " = " + " , ".join(name + " : " + value for name, value in slot_pairs)
            for act_type, slot_pairs in act_type_to_slots.items()
        )

    fields = [
        prev_user_utterance,
        render_acts(prev_user_act_type_to_slots),
        user_utterance,
        render_acts(user_act_type_to_slots),
        " , ".join(user_booking_service),
    ]
    return " | ".join(fields)

In [57]:
def extract_to_be_retrieved(dataset):
    """Build one (input text, target labels) sample per relevant agent turn.

    An agent turn is relevant when the frame of the user turn just before it
    lists 'hotel' and/or 'restaurant' among its services. For such a turn:
      * the input text is produced by `concatenate_to_be_retrieved` from the
        two latest user turns (indices j-3 and j-1; the older one is empty
        when it does not exist), their act/slot maps and the active services;
      * the targets are produced by `get_to_be_retrieved` from the agent's
        own dialogue act, or ['none'] when that list is empty.

    Each user utterance is lemmatized at most once per dialogue: a turn is
    first used as the "current" utterance and later as the "previous" one, so
    the `parse` result is cached instead of being recomputed.

    Args:
        dataset: DataFrame indexed 0..n-1 with a `turns` column holding the
            parallel sequences 'utterance', 'speaker', 'dialogue_acts' and
            'frames'.

    Returns:
        (training_list, target_list): two parallel lists, the input strings
        and the corresponding lists of target labels.
    """
    speaker_str = {'User': 0, 'Agent': 1}

    def lemmatized(utterances, cache, turn_index):
        # Lemmatize utterances[turn_index] once and reuse the result.
        if turn_index not in cache:
            cache[turn_index] = parse(utterances[turn_index])
        return cache[turn_index]

    target_list = []
    training_list = []

    for i in tqdm(range(len(dataset))):
        turns = dataset.loc[i].turns
        utterances = turns['utterance']
        dialogue_acts = turns['dialogue_acts']
        frames = turns['frames']
        parsed_cache = {}

        for j, (_agent_utterance, speaker, dialogue_act) in enumerate(zip(utterances, turns['speaker'], dialogue_acts)):
            if speaker != speaker_str['Agent']:
                continue
            # The user is expected to open the dialogue. An agent turn at
            # index 0 has no preceding user turn, and j - 1 would silently
            # wrap around to the last turn, so skip it.
            if j == 0:
                continue

            current_user = j - 1
            services = frames[current_user]['service']
            user_booking_service = [service for service in services if service in ["hotel", "restaurant"]]
            if len(user_booking_service) == 0:
                continue

            # Older user turn (two turns before the current user turn), if any.
            previous_user = j - 3
            if previous_user >= 0:
                prev_user_act_type_to_slots = get_act_type_to_slots(dialogue_acts[previous_user])
                prev_user_utterance = lemmatized(utterances, parsed_cache, previous_user)
            else:
                prev_user_utterance = ""
                prev_user_act_type_to_slots = {}

            # Latest user turn.
            user_act_type_to_slots = get_act_type_to_slots(dialogue_acts[current_user])
            user_utterance = lemmatized(utterances, parsed_cache, current_user)

            # History of 2 user turns; for each turn: utterance, acts and
            # slots (including the requested "?" slots).
            # Structure:
            #   utterance | act = name : value , name : value ; act = ... |
            #   utterance | act = name : value , name : value ; act = ... | services
            historical_utterance = concatenate_to_be_retrieved(
                prev_user_utterance,
                prev_user_act_type_to_slots,
                user_utterance,
                user_act_type_to_slots,
                user_booking_service,
            )

            targets = get_to_be_retrieved(dialogue_act, user_booking_service)

            # Turns with nothing to retrieve are kept and labelled 'none'.
            if len(targets) == 0:
                targets.append('none')
            training_list.append(historical_utterance)
            target_list.append(targets)

    return training_list, target_list

In [58]:
# Load the preprocessed samples from the JSON caches; when that fails, rebuild
# them from the MultiWOZ 2.2 dataset and write the caches.
try:
    print('Loading from json...')
    train_utterances, train_labels = load_variable_from_json(TRAIN_DATA_SAVE_PATH)
    test_utterances, test_labels = load_variable_from_json(TEST_DATA_SAVE_PATH)
    valid_utterances, valid_labels = load_variable_from_json(VALID_DATA_SAVE_PATH)
    print('Loaded from json')
except (OSError, ValueError, TypeError) as json_error:
    # OSError: cache file missing or unreadable; ValueError: invalid JSON or a
    # payload that is not a pair; TypeError: payload that cannot be unpacked.
    # Anything else (e.g. KeyboardInterrupt) is deliberately not swallowed.
    print('Exception occured during loading from json')
    print(f'  {type(json_error).__name__}: {json_error}')
    print('Loading dataset...')

    dataset = None
    if not ALWAYS_LOAD_DATASET:
        try:
            dataset = DatasetDict.load_from_disk(DATASET_DIRECTORY)
        except Exception as disk_error:
            # No usable local copy: fall back to downloading below.
            print(f'  Local dataset not loaded ({type(disk_error).__name__}: {disk_error})')

    if dataset is None:
        dataset = load_dataset("multi_woz_v22")

        if SAVE_DATASET:
            dataset.save_to_disk(DATASET_DIRECTORY)

    print('Preprocessing...')
    train = preprocess_split(dataset, 'train')
    val = preprocess_split(dataset, 'validation')
    test = preprocess_split(dataset, 'test')

    print('Extracting utterances and act types...')
    train_utterances, train_labels = extract_to_be_retrieved(train)
    test_utterances, test_labels = extract_to_be_retrieved(test)
    valid_utterances, valid_labels = extract_to_be_retrieved(val)

    print('Saving to json...')
    save_variable_to_json([train_utterances, train_labels], TRAIN_DATA_SAVE_PATH)
    save_variable_to_json([test_utterances, test_labels], TEST_DATA_SAVE_PATH)
    save_variable_to_json([valid_utterances, valid_labels], VALID_DATA_SAVE_PATH)

    print('Saved to json')

Loading from json...
Exception occured during loading from json
Loading dataset...
Preprocessing...
Extracting utterances and act types...


100%|██████████| 6321/6321 [06:26<00:00, 16.36it/s]
100%|██████████| 745/745 [00:45<00:00, 16.41it/s]
100%|██████████| 762/762 [00:49<00:00, 15.49it/s]

Saving to json...
Saved to json





In [59]:
# Preview the first 100 training samples: the input text, then its targets.
preview_count = min(100, len(train_utterances), len(train_labels))
for sample_index in range(preview_count):
    print("Utterance:", train_utterances[sample_index])
    print("Act Type:", train_labels[sample_index])
    print()


Utterance:  |  | I need a place to dine in the center that s expensive | Restaurant-Inform = area : centre , pricerange : expensive | restaurant , hotel
Act Type: ['restaurant-availability', 'restaurant-food']

Utterance: I need a place to dine in the center that s expensive | Restaurant-Inform = area : centre , pricerange : expensive | any sort of food would be fine , as long as it be a bit expensive . could I get the phone number for your recommendation ? | Restaurant-Request = food : ? | restaurant , hotel
Act Type: ['restaurant-area', 'restaurant-availability', 'restaurant-food', 'restaurant-name']

Utterance: any sort of food would be fine , as long as it be a bit expensive . could I get the phone number for your recommendation ? | Restaurant-Request = food : ? | sound good , could I get that phone number ? also , could you recommend I an expensive hotel ? | Hotel-Inform = pricerange : expensive , type : hotel ; Restaurant-Request = phone : ? | restaurant , hotel
Act Type: ['hotel

In [60]:
# Number of training samples whose only target is the placeholder 'none'.
none_labels_length = train_labels.count(['none'])
print("None labels length:", none_labels_length)
print("Total labels length:", len(train_labels))

None labels length: 11858
Total labels length: 28226
