In [54]:
#Mount Google drive
#from google.colab import drive
#drive.mount('/content/drive')

In [55]:
#change current working directory
#%cd "/content/drive/Shareddrives/NLI"

In [56]:
#!pip install transformers
#!pip install accelerate
#!pip install datasets
#!python3 -m spacy download en_core_web_lg

In [57]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import torch
from torch.utils.data import DataLoader, TensorDataset, RandomSampler, Dataset
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import recall_score, precision_score, f1_score
from torch import cuda
import re
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report
from itertools import product
import os
import joblib
import json
import math
import nltk
from datasets import load_dataset
from datasets import DatasetDict 

# Download NLTK's 'punkt' data package (NLTK skips it when already up to date).
nltk.download('punkt')

import spacy

# Module-level spaCy pipeline; parse() calls it to lemmatise utterances.
nlp = spacy.load("en_core_web_lg")


[nltk_data] Downloading package punkt to /home/mister/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [58]:
# Local directory used to load/save the on-disk copy of the dataset.
DATASET_DIRECTORY = "./../dataset.hf/"
# JSON caches holding [utterances, labels] for each split.
TRAIN_DATA_SAVE_PATH = "agent_utterances_act_types/train.json"
TEST_DATA_SAVE_PATH = "agent_utterances_act_types/test.json"
VALID_DATA_SAVE_PATH = "agent_utterances_act_types/valid.json"

# When True, skip the on-disk dataset copy and call load_dataset() instead.
ALWAYS_LOAD_DATASET = False
# When True, write the dataset obtained from load_dataset() to DATASET_DIRECTORY.
SAVE_DATASET = True

In [59]:
import json

def save_variable_to_json(variable, file_path):
    """Serialise ``variable`` as JSON into ``file_path``.

    Missing parent directories are created first, so a long computation is
    not lost to a ``FileNotFoundError`` at the very last step. The file is
    always written as UTF-8, independent of the platform default encoding.

    Args:
        variable: Any object accepted by ``json.dump``.
        file_path: Destination path; an existing file is overwritten.

    Raises:
        TypeError: If ``variable`` is not JSON serialisable.
        OSError: If the directory or the file cannot be created.
    """
    parent_dir = os.path.dirname(file_path)
    if parent_dir:
        # A bare file name has no parent component; nothing to create then.
        os.makedirs(parent_dir, exist_ok=True)

    with open(file_path, 'w', encoding='utf-8') as file:
        json.dump(variable, file)

def load_variable_from_json(file_path):
    """Read ``file_path`` and return the decoded JSON value.

    The file is decoded as UTF-8 explicitly so the result does not depend on
    the platform default encoding.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The Python object produced by ``json.load``.

    Raises:
        OSError: If the file cannot be opened (e.g. it does not exist).
        json.JSONDecodeError: If the content is not valid JSON.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        return json.load(file)

In [60]:
def process_agent_dialogue_act(service_list):
    """Collapse a turn's act types into the label set used by this notebook.

    Act types starting with ``Restaurant``/``restaurant``, ``Hotel``/``hotel``,
    ``Booking``/``booking`` or ``general`` are kept unchanged; every other act
    type is replaced by the single label ``'other'``. An empty input also
    yields ``['other']``.

    Args:
        service_list: Iterable of act-type strings for one turn.

    Returns:
        A list of unique labels in sorted order. Sorting makes the result
        reproducible: iterating a set of strings can give a different order
        on every interpreter run, and callers join this list into text.
    """
    kept_prefixes = (
        'Restaurant', 'restaurant',
        'Hotel', 'hotel',
        'booking', 'Booking',
        'general',
    )
    services = {
        service if service.startswith(kept_prefixes) else 'other'
        for service in service_list
    }
    if not services:
        # No act type at all is treated like an out-of-domain turn.
        services.add('other')
    return sorted(services)

In [61]:
def parse(sentence):
    """Return ``sentence`` with every token replaced by its lemma.

    The text is run through the module-level spaCy pipeline ``nlp`` and the
    lemmas of all resulting tokens are joined with single spaces. No token is
    filtered out.
    """
    doc = nlp(sentence)
    lemmas = (token.lemma_ for token in doc)
    return " ".join(lemmas)
                

In [62]:
def get_act_type_to_slots(dialogue_act):
    """Group the Restaurant/Hotel slots of one turn by act type.

    Two sources are merged, in this order:
      1. ``dialogue_act['span_info']`` - every (slot name, slot value) pair
         whose act type starts with ``Restaurant`` or ``Hotel``.
      2. ``dialogue_act['dialog_act']`` - only the slots whose value is ``"?"``
         and whose name is not ``"none"``, again for Restaurant/Hotel acts.

    Returns:
        dict mapping act type -> list of ``(slot_name, slot_value)`` tuples,
        with keys in order of first appearance.
    """
    tracked_domains = ('Restaurant', 'Hotel')
    slots_by_act = {}

    # Slots that carry a concrete value in the utterance.
    spans = dialogue_act['span_info']
    span_rows = zip(spans['act_type'], spans['act_slot_name'], spans['act_slot_value'])
    for act, name, value in span_rows:
        if act.startswith(tracked_domains):
            slots_by_act.setdefault(act, []).append((name, value))

    # Slots without a value in the utterance (marked with '?').
    acts = dialogue_act['dialog_act']
    for act, slot_group in zip(acts['act_type'], acts['act_slots']):
        if not act.startswith(tracked_domains):
            continue
        for name, value in zip(slot_group['slot_name'], slot_group['slot_value']):
            if name == "none" or value != "?":
                continue
            slots_by_act.setdefault(act, []).append((name, value))

    return slots_by_act

In [63]:
def get_to_be_retrieved(dialogue_act, current_booking_service):
    """List the domain slots that a turn's Restaurant/Hotel acts give values for.

    For every act type starting with ``Restaurant`` or ``Hotel`` the slots
    whose name is not ``"none"`` and whose value is not ``"?"`` are collected
    as ``"<domain>-<slot_name>"`` (domain lower-cased). If an act contributes
    at least one such slot, ``"<domain>-availability"`` is added as well and
    ``"<domain>-choice"`` is dropped.

    Args:
        dialogue_act: Turn annotation with ``['dialog_act']['act_type']`` and
            ``['dialog_act']['act_slots']`` (each entry exposing ``slot_name``
            and ``slot_value`` sequences).
        current_booking_service: Unused; kept so existing callers keep
            working. The earlier "booking" remapping that read it could never
            run, because only Restaurant/Hotel act types reach that point.

    Returns:
        Sorted list of unique slot identifiers.
    """
    acts = dialogue_act['dialog_act']

    retrieved = set()
    for act_type, act_slot in zip(acts['act_type'], acts['act_slots']):
        if not act_type.startswith(('Restaurant', 'Hotel')):
            continue

        domain = act_type.split("-")[0].lower()

        # Keep only the slots that carry a concrete value (not "?").
        slots = [
            domain + "-" + slot_name
            for slot_name, slot_value in zip(act_slot['slot_name'], act_slot['slot_value'])
            if slot_name != "none" and slot_value != "?"
        ]
        if not slots:
            continue

        # Availability is added even when "choice" was the only slot present.
        retrieved.add(domain + "-availability")
        choice_slot = domain + "-choice"
        retrieved.update(slot for slot in slots if slot != choice_slot)

    return sorted(retrieved)

In [64]:
def concatenate_to_be_retrieved(prev_user_utterance, prev_user_act_type_to_slots, user_utterance, user_act_type_to_slots, user_booking_service):
    """Build the ``" | "``-separated history string for one sample.

    Layout of the result::

        <prev_user_utterance> | <prev acts> | <user_utterance> | <acts> | <services>

    where each acts section is rendered as
    ``act = name : value , name : value ; act = ...`` and the services are
    joined with ``" , "``. Empty mappings render as an empty section.
    """

    def render_acts(act_type_to_slots):
        # One "act = slot : value , ..." chunk per act type, joined by " ; ".
        chunks = []
        for act_type, slot_pairs in act_type_to_slots.items():
            rendered_slots = " , ".join(
                slot_name + " : " + slot_value for slot_name, slot_value in slot_pairs
            )
            chunks.append(act_type + " = " + rendered_slots)
        return " ; ".join(chunks)

    sections = [
        prev_user_utterance,
        render_acts(prev_user_act_type_to_slots),
        user_utterance,
        render_acts(user_act_type_to_slots),
        " , ".join(user_booking_service),
    ]
    return " | ".join(sections)

In [65]:
def extract_agent_utterance_and_act_types(dataset):
    """Build one (history text, act-type labels) sample per agent turn.

    For every agent turn the history text is assembled as::

        prev_user_utterance | prev_user_acts | prev_agent_utterance | prev_agent_acts
            | prev_user_act_types | to_be_retrieved

    where the "previous user" turn is the one directly before the agent turn
    (index ``j - 1``) and the "previous agent" turn is the one before that
    (index ``j - 2``, empty for the agent's first reply at ``j == 1``).
    Utterances are lemmatised with ``parse``; act sections are rendered by
    ``concatenate_to_be_retrieved``.

    Args:
        dataset: pandas DataFrame with a ``turns`` column; each entry exposes
            ``utterance``, ``speaker`` and ``dialogue_acts`` sequences.

    Returns:
        ``(utterance_list, act_types)`` - two parallel lists: the history text
        and the processed act-type labels of each agent turn.
    """
    act_types = []
    utterance_list = []
    speaker_str = {'User': 0, 'Agent': 1}

    # Iterate the column directly: unlike `.loc[i]` over `range(len(...))`
    # this does not require a default 0..n-1 index and avoids building a row
    # Series for every dialogue.
    for turns in tqdm(dataset['turns'], total=len(dataset)):
        utterances = turns['utterance']
        all_dialogue_acts = turns['dialogue_acts']

        for j, (utterance, speaker, dialogue_act) in enumerate(zip(utterances, turns['speaker'], all_dialogue_acts)):
            if speaker != speaker_str['Agent']:
                continue  # samples are built for agent turns only

            if j == 1:
                # First agent reply: there is no earlier agent turn.
                prev_agent_utterance = ''
                prev_agent_acts_to_slot = {}
            else:
                # Previous agent utterance and acts (two turns back).
                prev_agent_utterance = parse(utterances[j - 2])
                prev_agent_acts_to_slot = get_act_type_to_slots(all_dialogue_acts[j - 2])

            # Previous user utterance and acts (the turn being answered).
            # NOTE(review): assumes an agent turn is never at index 0;
            # otherwise `j - 1` would wrap around to the last turn.
            prev_user_dialogue_act = all_dialogue_acts[j - 1]
            prev_user_utterance = parse(utterances[j - 1])
            prev_user_acts_to_slot = get_act_type_to_slots(prev_user_dialogue_act)

            user_booking_service = process_agent_dialogue_act(prev_user_dialogue_act['dialog_act']['act_type'])

            # Labels: processed act types of the current agent turn.
            current_act_type = process_agent_dialogue_act(dialogue_act['dialog_act']['act_type'])

            to_be_retrieved = get_to_be_retrieved(dialogue_act, current_act_type)

            # User sections come first, then the agent sections, then the
            # user's act types, then the slots to be retrieved.
            historical_utterance = concatenate_to_be_retrieved(prev_user_utterance, prev_user_acts_to_slot, prev_agent_utterance, prev_agent_acts_to_slot, user_booking_service)
            historical_utterance += " | " + " , ".join(to_be_retrieved)

            utterance_list.append(historical_utterance)
            act_types.append(current_act_type)

    return utterance_list, act_types

In [66]:
# Load the cached splits when possible; otherwise rebuild them from the
# dataset and refresh the JSON caches.
try:
    print('Loading from json...')
    train_utterances, train_labels = load_variable_from_json(TRAIN_DATA_SAVE_PATH)
    test_utterances, test_labels = load_variable_from_json(TEST_DATA_SAVE_PATH)
    valid_utterances, valid_labels = load_variable_from_json(VALID_DATA_SAVE_PATH)
    print('Loaded from json')
except Exception:
    # `Exception` (not a bare except) so Ctrl-C / kernel interrupts still stop the cell.
    print('Exception occured during loading from json')
    print('Loading dataset...')

    dataset = None
    if not ALWAYS_LOAD_DATASET:
        try:
            # Prefer the local copy saved by an earlier run.
            dataset = DatasetDict.load_from_disk(DATASET_DIRECTORY)
        except Exception:
            # No usable local copy: fall through to load_dataset() below.
            dataset = None

    if dataset is None:
        dataset = load_dataset("multi_woz_v22")

        if SAVE_DATASET:
            dataset.save_to_disk(DATASET_DIRECTORY)

    # Make sure the cache directories exist *before* the long extraction so a
    # missing folder cannot discard the results at save time.
    for save_path in (TRAIN_DATA_SAVE_PATH, TEST_DATA_SAVE_PATH, VALID_DATA_SAVE_PATH):
        save_dir = os.path.dirname(save_path)
        if save_dir:
            os.makedirs(save_dir, exist_ok=True)

    print('Extracting utterances and act types...')
    train_utterances, train_labels = extract_agent_utterance_and_act_types(dataset['train'].to_pandas())
    test_utterances, test_labels = extract_agent_utterance_and_act_types(dataset['test'].to_pandas())
    valid_utterances, valid_labels = extract_agent_utterance_and_act_types(dataset['validation'].to_pandas())

    print('Saving to json...')
    save_variable_to_json([train_utterances, train_labels], TRAIN_DATA_SAVE_PATH)
    save_variable_to_json([test_utterances, test_labels], TEST_DATA_SAVE_PATH)
    save_variable_to_json([valid_utterances, valid_labels], VALID_DATA_SAVE_PATH)

    print('Saved to json')

Loading from json...
Exception occured during loading from json
Loading dataset...
Extracting utterances and act types...


100%|██████████| 8437/8437 [10:34<00:00, 13.30it/s]
100%|██████████| 1000/1000 [01:22<00:00, 12.10it/s]
100%|██████████| 1000/1000 [01:21<00:00, 12.25it/s]


Saving to json...
Saved to json


In [67]:
# Preview up to five training samples together with their labels.
preview_count = min(5, len(train_utterances), len(train_labels))
for sample_idx in range(preview_count):
    print("Utterance:", train_utterances[sample_idx])
    print("Act Type:", train_labels[sample_idx], end="\n\n")


Utterance: I need a place to dine in the center that s expensive | Restaurant-Inform = area : centre , pricerange : expensive |  |  | Restaurant-Inform | restaurant-availability , restaurant-food
Act Type: ['Restaurant-Select', 'Restaurant-Inform']

Utterance: any sort of food would be fine , as long as it be a bit expensive . could I get the phone number for your recommendation ? | Restaurant-Request = food : ? | I have several option for you ; do you prefer african , asian , or british food ? | Restaurant-Inform = choice : several ; Restaurant-Select = food : African , food : Asian , food : British | Restaurant-Request | restaurant-area , restaurant-availability , restaurant-food , restaurant-name
Act Type: ['Restaurant-Inform']

Utterance: sound good , could I get that phone number ? also , could you recommend I an expensive hotel ? | Hotel-Inform = pricerange : expensive , type : hotel ; Restaurant-Request = phone : ? | there be an afrian place name Bedouin in the centre . how do t