In [1]:
#Mount Google drive
#from google.colab import drive
#drive.mount('/content/drive')

In [2]:
#change current working directory
#%cd "/content/drive/Shareddrives/NLI"

In [3]:
#!pip install transformers
#!pip install accelerate
#!pip install datasets
#!python3 -m spacy download en_core_web_lg

In [4]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import torch
from torch.utils.data import DataLoader, TensorDataset, RandomSampler, Dataset
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import recall_score, precision_score, f1_score
from torch import cuda
import re
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from seqeval.metrics import classification_report
from itertools import product
import os
import joblib
import json
import math
import nltk
from datasets import load_dataset
from datasets import DatasetDict

nltk.download('punkt')

import spacy

nlp = spacy.load("en_core_web_lg")


[nltk_data] Downloading package punkt to /home/mister/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [5]:
DATASET_DIRECTORY = "./../dataset.hf/"
DATA_SAVE_PATH = "cosine_similarity/utt+values.json"
VECTORS_SAVE_PATH = "cosine_similarity/vectors.json"

ALWAYS_LOAD_DATASET = False
SAVE_DATASET = True

In [6]:
import json

def save_variable_to_json(variable, file_path):
    with open(file_path, 'w') as file:
        json.dump(variable, file)

def load_variable_from_json(file_path):
    with open(file_path, 'r') as file:
        variable = json.load(file)
    return variable

In [7]:
# Extracting only hotel and restaurant features from the dataset
def preprocess_split(dataset, split):
    df = dataset[split].to_pandas()
    new_df = pd.DataFrame(columns = df.columns)
    for i in range(len(df)):
        # Taken from notebook, to know which lines to skip
        row = df.loc[i]
        if not any(set(row.turns['frames'][turn_id]['service']).intersection(['hotel', 'restaurant']) for turn_id,utt in enumerate(row.turns['utterance'])):
            continue

        new_df.loc[len(new_df)] = row
    return new_df

In [8]:
def process_service_list(service_list):
    services = set()
    if len(service_list) == 0:
        services.add('other')
    for service in service_list:
        if service == 'restaurant':
            services.add('restaurant')
        elif service == 'hotel':
            services.add('hotel')
        else:
            services.add('other')
        if len(services) == 3:
            break
    return list(services)

def process_dialogue_act(service_list):
    services = set()
    if len(service_list) == 0:
        services.add('other')
    for service in service_list:
        if service.startswith('Restaurant') or service.startswith('restaurant'):
            services.add(service)
        elif service.startswith('Hotel') or service.startswith('hotel'):
            services.add(service)
        elif service.startswith('general') or service.startswith('General'):
            services.add(service)
        elif service.startswith('Booking') or service.startswith('booking'):
            services.add(service)
        else:
            services.add('other')
    return list(services)

In [9]:
def parse(sentence):
    # Tokenize
    sentence = nlp(sentence)
    # Remove stop words
    sentence = " ".join([token.lemma_ for token in sentence])
    
    return sentence
                

In [10]:
def get_act_type_to_slots(dialogue_act):
    act_type_to_slots = {}

    # All the slots that are present in the current user utterance
    span_info = dialogue_act['span_info']
    act_types = span_info['act_type']
    act_slot_names = span_info['act_slot_name']
    act_slot_values = span_info['act_slot_value']

    for slot_name, act_type, slot_value in zip(act_slot_names, act_types , act_slot_values):
        if act_type.startswith('Restaurant') or act_type.startswith('Hotel'):
            if not act_type in act_type_to_slots:
                act_type_to_slots[act_type] = []
            act_type_to_slots[act_type].append(tuple([slot_name, slot_value]))

    # All the slots that are not present in the current user utterance (slots with '?')
    act_types = dialogue_act['dialog_act']['act_type']
    act_slots = dialogue_act['dialog_act']['act_slots']

    for act_type, act_slot in zip(act_types, act_slots):
        if act_type.startswith('Restaurant') or act_type.startswith('Hotel'):
            for slot_name, slot_value in zip(act_slot['slot_name'], act_slot['slot_value']):
                if slot_name != "none" and  slot_value == "?":
                    if not act_type in act_type_to_slots:
                        act_type_to_slots[act_type] = []
                    act_type_to_slots[act_type].append(tuple([slot_name, slot_value]))
    return act_type_to_slots

In [11]:
def get_to_be_provided_overall(dialogue_act, current_booking_service):
    
    act_types = dialogue_act['dialog_act']['act_type']
    act_slots = dialogue_act['dialog_act']['act_slots']

    to_be_provided_overall = []
    for act_type, act_slot in zip(act_types, act_slots):
        # If the domain is booking, we need to know which service is being booked
        
        if act_type.startswith('Restaurant') or act_type.startswith('Hotel') or act_type.startswith('Booking') or act_type.startswith('general'):
            domain = act_type.split("-")[0]
            if domain == "booking" and len(current_booking_service)==1:
                domain = current_booking_service[0]
            domain = domain.lower()

            if domain in ["hotel", "restaurant", "booking", "general"]:
                # Retrieve the slots that are in the form of "something : ?"
                slots = []
                for slot_name, slot_value in zip(act_slot['slot_name'], act_slot['slot_value']):
                    if slot_name != "none" and  slot_value != "?":
                        slots.append(domain + "-" + slot_name + ":" + slot_value)
            
                if slots and any((slot_name_value.split(":")[0] != domain + "-none" for slot_name_value in slots)) and not "-No" in dialogue_act:
                    to_be_provided = ["%s-availability:yes" % (domain)] + slots
                    to_be_provided_overall.extend(to_be_provided)
                elif "-No" in dialogue_act:
                    to_be_provided = ["%s-availability:no" % (domain)] + slots
                    to_be_provided_overall.extend(to_be_provided)
            
    to_be_provided_overall = sorted(list(set(to_be_provided_overall)))
    remove_avail_no_list = [elem for elem in to_be_provided_overall if elem.endswith("availability:no")]
    for remove_avail in remove_avail_no_list:
        remove_avail_yes = remove_avail[:-2]+"yes"
        while remove_avail_yes in to_be_provided_overall:
            del to_be_provided_overall[to_be_provided_overall.index(remove_avail_yes)]

    return to_be_provided_overall


In [12]:
def get_to_be_requested(dialogue_act, current_booking_service):

    act_types = dialogue_act['dialog_act']['act_type']
    act_slots = dialogue_act['dialog_act']['act_slots']

    to_be_requested = []
    for act_type, act_slot in zip(act_types, act_slots):
        # If the domain is booking, we need to know which service is being booked
        
        if act_type.startswith('Restaurant') or act_type.startswith('Hotel') or act_type.startswith('Booking') or act_type.startswith('general'):
            domain = act_type.split("-")[0].lower()
            if domain == "booking" and len(current_booking_service)==1:
                domain = current_booking_service[0]
            domain = domain.lower()
        
            if domain in ["hotel", "restaurant", "booking", "general"]:
                
                slots = []
                for slot_name, slot_value in zip(act_slot['slot_name'], act_slot['slot_value']):
                    if slot_name != "none" and  slot_value == "?":
                        slots.append(domain + "-" + slot_name)
                
                to_be_requested.extend(slots)
    to_be_requested = sorted(list(set(to_be_requested)))
    return to_be_requested

In [13]:
def concatenate_user_act_type_slots(user_act_type_to_slots):
    
    historical_utterance = ""
    for act_type in user_act_type_to_slots:
        historical_utterance += act_type + " = "
        for slot_name, slot_value in user_act_type_to_slots[act_type]:
            historical_utterance += slot_name + " : " + slot_value + " , "
        
        if len(user_act_type_to_slots[act_type]) > 0:
            # Remove last comma
            historical_utterance = historical_utterance[:-3]

        historical_utterance += " ; "

    if len(user_act_type_to_slots) > 0:
        # Remove last semicolon
        historical_utterance = historical_utterance[:-3]

    return historical_utterance

In [24]:
def extract_features(dataset):
    target_list = []
    training_list = []

    for i in tqdm(range(len(dataset))):
        turns = dataset.loc[i].turns

        speaker_str = {'User': 0, 'Agent': 1}

        for j, (utterance, speaker, dialogue_act) in enumerate(zip(turns['utterance'], turns['speaker'], turns['dialogue_acts'])):
            if speaker == speaker_str['Agent']:
                prev_user = j - 1
                services = turns['frames'][prev_user]['service']
                user_booking_service = [service for service in services if service in ["hotel", "restaurant"]]
                if len(user_booking_service) == 0:
                    continue

                # Target
                to_be_requested = get_to_be_requested(dialogue_act, user_booking_service)
                to_be_provided = get_to_be_provided_overall(dialogue_act, user_booking_service)
                
                to_be_requested = [request + " : ?" for request in to_be_requested]
                
                names_to_be_provided = []
                values_to_be_provided = []
                for provided in to_be_provided:
                    name = provided.split(':')[0]
                    value = provided.split(':')[1]
                    if name.endswith("availability"):
                        names_to_be_provided.append(provided)
                    
                    else:
                        names_to_be_provided.append(name)
                    values_to_be_provided.append(value)

                vector = to_be_requested + names_to_be_provided
                
                info = [utterance, values_to_be_provided]

                training_list.append(vector)
                target_list.append(info)

    return training_list, target_list

In [58]:
try:
    print('Loading from json...')
    utterances = load_variable_from_json(DATA_SAVE_PATH)
    vectors = load_variable_from_json(VECTORS_SAVE_PATH)
    print('Loaded from json')
except:
    print('Exception occured during loading from json')
    print('Loading dataset...')

    try:
        if ALWAYS_LOAD_DATASET:
            raise Exception('Load data always')
        # Run it only once
        dataset = DatasetDict.load_from_disk(DATASET_DIRECTORY)
        
    except:
        # Run it only once
        dataset = load_dataset("multi_woz_v22")

        if SAVE_DATASET:
            # Run it only once
            dataset.save_to_disk(DATASET_DIRECTORY)

    print('Preprocessing...')
    train = preprocess_split(dataset, 'train')
    val = preprocess_split(dataset, 'validation')
    test = preprocess_split(dataset, 'test')

    print('Extracting utterances and act types...')
    train_vectors, train_utterances= extract_features(train)
    test_vectors, test_utterances = extract_features(test)
    valid_vectors,  valid_utterances= extract_features(val)

    utterances = train_utterances + test_utterances + valid_utterances
    vectors = train_vectors + test_vectors + valid_vectors

    print('Saving to json...')
    save_variable_to_json(utterances , DATA_SAVE_PATH)
    save_variable_to_json(vectors, VECTORS_SAVE_PATH)
    
    print('Saved to json')

Loading from json...
Loaded from json


In [59]:
for utterance, act_type in zip(utterances[:100], vectors[:100]):
    print("Utterance:", utterance)
    print("Vector:", act_type)
    print()


Utterance: ['I have several options for you; do you prefer African, Asian, or British food?', ['yes', 'several', 'African', 'Asian', 'British']]
Vector: ['restaurant-availability:yes', 'restaurant-choice', 'restaurant-food', 'restaurant-food', 'restaurant-food']

Utterance: ['There is an Afrian place named Bedouin in the centre. How does that sound?', ['centre', 'yes', 'Afrian', 'Bedouin']]
Vector: ['restaurant-area', 'restaurant-availability:yes', 'restaurant-food', 'restaurant-name']

Utterance: ["Bedouin's phone is 01223367660. As far as hotels go, I recommend the University Arms Hotel in the center of town.", ['center of town', 'yes', 'the University Arms Hotel', 'yes', 'Bedouin', '01223367660']]
Vector: ['hotel-area', 'hotel-availability:yes', 'hotel-name', 'restaurant-availability:yes', 'restaurant-name', 'restaurant-phone']

Utterance: ['Sure, when would you like that reservation?', []]
Vector: ['hotel-bookday : ?']

Utterance: ['Your booking was successful. Your reference numbe

In [71]:
requested = sorted(set([act.split('-')[1][:-4] for vector in vectors for act in vector if act.endswith('?')]))
print(len(requested))
print(requested)

12
['area', 'bookday', 'bookpeople', 'bookstay', 'booktime', 'food', 'internet', 'name', 'parking', 'pricerange', 'stars', 'type']


In [75]:
provide = sorted(set([act.split('-')[1] for vector in vectors for act in vector if not act.endswith('?')]))
print(len(provide))
print(provide)

18
['address', 'area', 'availability:yes', 'bookday', 'bookpeople', 'bookstay', 'booktime', 'choice', 'food', 'internet', 'name', 'parking', 'phone', 'postcode', 'pricerange', 'ref', 'stars', 'type']


In [30]:
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F
import numpy as np
import os
import json

output_folder = "cosine_similarity/embeddings"
if not os.path.exists(output_folder):
    os.makedirs(output_folder)

#Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0] #First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)


# Sentences we want sentence embeddings for
#sentences = ['This is an example sentence', 'Each sentence is converted']
batch_size = 64
sentences = [" | ".join(vector) for vector in vectors]

# Load model from HuggingFace Hub
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-mpnet-base-v2')
model = AutoModel.from_pretrained('sentence-transformers/all-mpnet-base-v2')

for i in range(0,len(sentences),batch_size):
    # Tokenize sentences
    encoded_input = tokenizer(sentences[i:i+batch_size], padding=True, truncation=True, return_tensors='pt')

    # Compute token embeddings
    with torch.no_grad():
        model_output = model(**encoded_input)

    # Perform pooling
    sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

    # Normalize embeddings
    sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)

    np.save(f"{output_folder}/embeddings_with_requests_%d.npy" % (i), sentence_embeddings)
    #print("Sentence embeddings:")
    #print(sentence_embeddings)

In [31]:
files = os.listdir(output_folder)
files = sorted(files, key = lambda x: int(x[:-4].split("_")[-1]))
vectors = np.load(f"{output_folder}/{files[0]}")
#print(training_vectors.shape)
for filename in files[1:]:
    print(filename)
    vectors = np.concatenate((vectors, np.load(f"{output_folder}/{filename}")), axis=0)
print(vectors.shape)

embeddings_with_requests_64.npy
embeddings_with_requests_128.npy
embeddings_with_requests_192.npy
embeddings_with_requests_256.npy
embeddings_with_requests_320.npy
embeddings_with_requests_384.npy
embeddings_with_requests_448.npy
embeddings_with_requests_512.npy
embeddings_with_requests_576.npy
embeddings_with_requests_640.npy
embeddings_with_requests_704.npy
embeddings_with_requests_768.npy
embeddings_with_requests_832.npy
embeddings_with_requests_896.npy
embeddings_with_requests_960.npy
embeddings_with_requests_1024.npy
embeddings_with_requests_1088.npy
embeddings_with_requests_1152.npy
embeddings_with_requests_1216.npy
embeddings_with_requests_1280.npy
embeddings_with_requests_1344.npy
embeddings_with_requests_1408.npy
embeddings_with_requests_1472.npy
embeddings_with_requests_1536.npy
embeddings_with_requests_1600.npy
embeddings_with_requests_1664.npy
embeddings_with_requests_1728.npy
embeddings_with_requests_1792.npy
embeddings_with_requests_1856.npy
embeddings_with_requests_1920.

In [40]:
def get_embedding_vector(sentence):
    encoded_input = tokenizer(sentence, padding=True, truncation=True, return_tensors='pt')

    # Compute token embeddings
    with torch.no_grad():
        model_output = model(**encoded_input)

    # Perform pooling
    sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

    # Normalize embeddings
    sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)
    return sentence_embeddings[0].cpu().detach().numpy()


In [54]:
from scipy import spatial

utterances = load_variable_from_json(DATA_SAVE_PATH)
slots = load_variable_from_json(VECTORS_SAVE_PATH)

test_slots = ['restaurant-name']

sentence = " | ".join(test_slots)

test_vec = get_embedding_vector(sentence)

worddists_array_of_arrays = spatial.distance.cdist(np.array([test_vec]),np.array(vectors), metric = "cosine")
min_dist = np.argsort(worddists_array_of_arrays[0])[:10]

for min_dist_ex_i in min_dist:
    print(worddists_array_of_arrays[0][min_dist_ex_i], min_dist_ex_i, slots[min_dist_ex_i], test_slots)
    print('Agent response:', utterances[min_dist_ex_i])
    

0.07118623742431707 16828 ['restaurant-name : ?'] ['restaurant-name']
Agent response: ['Yes what is the name of the restaurant you are looking for?', []]
0.07118623742431707 8233 ['restaurant-name : ?'] ['restaurant-name']
Agent response: ['Ok. Is there a specific restaurant you are looking for?', []]
0.07118623742431707 34362 ['restaurant-name : ?'] ['restaurant-name']
Agent response: ['No problem, are you looking for anything specific?', []]
0.07118623742431707 17974 ['restaurant-name : ?'] ['restaurant-name']
Agent response: ["I didn't find any results matching your criteria. Would you like me to find another restaurant?", []]
0.07118623742431707 21891 ['restaurant-name : ?'] ['restaurant-name']
Agent response: ["I'm so sorry, that booking was unsuccessful. Would you like to try another restaurant, instead?", []]
0.07118623742431707 1960 ['restaurant-name : ?'] ['restaurant-name']
Agent response: ["I'm sorry could you confirm what restaurant you wanted to reserve a table for 2 at?",