In [2]:
#Mount Google drive
#from google.colab import drive
#drive.mount('/content/drive')

In [3]:
#change current working directory
#%cd "/content/drive/Shareddrives/NLI"

In [4]:
#!pip install transformers
#!pip install accelerate
#!pip install datasets
#!python3 -m spacy download en_core_web_lg

In [5]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import torch
from torch.utils.data import DataLoader, TensorDataset, RandomSampler, Dataset
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import recall_score, precision_score, f1_score
from torch import cuda
import re
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from seqeval.metrics import classification_report
from itertools import product
import os
import joblib
import json
import math
import nltk
from datasets import load_dataset
from datasets import DatasetDict

# Fetch the NLTK 'punkt' package (a no-op when it is already up to date).
# NOTE(review): no cell visible here calls NLTK directly — confirm it is still needed.
nltk.download('punkt')

import spacy

# Shared spaCy pipeline; parse() runs every utterance through it to obtain lemmas.
# Requires the "en_core_web_lg" model to be installed beforehand.
nlp = spacy.load("en_core_web_lg")


[nltk_data] Downloading package punkt to /home/mister/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [6]:
# Folder used by DatasetDict.load_from_disk / save_to_disk for the local dataset copy.
DATASET_DIRECTORY = "./../dataset.hf/"
# JSON caches holding the [utterances, labels] pair of each split.
TRAIN_DATA_SAVE_PATH = "utterances_?_tags/train.json"
TEST_DATA_SAVE_PATH = "utterances_?_tags/test.json"
VALID_DATA_SAVE_PATH = "utterances_?_tags/valid.json"

# Only consulted when the JSON caches cannot be read: True skips the copy in
# DATASET_DIRECTORY and calls load_dataset("multi_woz_v22") instead.
ALWAYS_LOAD_DATASET = False
# When the dataset is fetched with load_dataset, also save it to DATASET_DIRECTORY.
SAVE_DATASET = True

In [7]:
import json

def save_variable_to_json(variable, file_path):
    """Serialise ``variable`` as JSON into ``file_path``.

    The parent directory is created first when it does not exist, so a missing
    cache folder no longer makes the write fail after the data was computed.

    Args:
        variable: Any JSON-serialisable object.
        file_path: Destination path of the JSON file.

    Raises:
        TypeError: If ``variable`` is not JSON-serialisable.
        OSError: If the directory or the file cannot be created.
    """
    parent_dir = os.path.dirname(file_path)
    # dirname is '' for a bare file name; makedirs('') would raise.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    with open(file_path, 'w') as file:
        json.dump(variable, file)

def load_variable_from_json(file_path):
    """Return the object decoded from the JSON file at ``file_path``."""
    with open(file_path, 'r') as handle:
        return json.load(handle)

In [8]:
# Extracting only hotel and restaurant features from the dataset
def preprocess_split(dataset, split):
    """Keep only the dialogues of ``split`` that involve a hotel or a restaurant.

    A dialogue is kept when at least one of its turns has a frame whose
    ``service`` list contains 'hotel' or 'restaurant'.

    The rows are selected with a single boolean mask instead of appending to a
    DataFrame one row at a time, which re-allocated the frame on every kept row.

    Args:
        dataset: Mapping from split name to an object exposing ``to_pandas()``.
        split: Name of the split to filter (e.g. 'train').

    Returns:
        A DataFrame with the same columns as the split, containing the kept
        dialogues re-indexed from 0.
    """
    target_services = ['hotel', 'restaurant']

    def mentions_target_service(turns):
        frames = turns['frames']
        return any(
            set(frames[turn_id]['service']).intersection(target_services)
            for turn_id in range(len(turns['utterance']))
        )

    df = dataset[split].to_pandas()
    # An explicit boolean Series keeps the selection well-defined for an empty split.
    keep = pd.Series(
        [mentions_target_service(turns) for turns in df['turns']],
        index=df.index,
        dtype=bool,
    )
    return df.loc[keep].reset_index(drop=True)

In [9]:
def process_service_list(service_list):
    """Collapse a list of services to the labels 'restaurant', 'hotel' and 'other'.

    Every service other than 'restaurant' or 'hotel' becomes 'other'; an empty
    input yields ['other']. Duplicates are dropped and the labels are returned
    in first-seen order, so the result no longer depends on set iteration order
    (which varies between interpreter runs for strings).

    Args:
        service_list: Sequence of service names.

    Returns:
        List of distinct labels in order of first appearance.
    """
    labels = (
        service if service in ('restaurant', 'hotel') else 'other'
        for service in service_list
    )
    # dict keys de-duplicate while remembering insertion order.
    return list(dict.fromkeys(labels)) or ['other']

def process_dialogue_act(service_list):
    """Keep restaurant, hotel and general tags; map every other tag to 'other'.

    A tag is kept verbatim when it starts with 'Restaurant', 'restaurant',
    'Hotel', 'hotel' or 'general'. Anything else is replaced by 'other', and an
    empty input yields ['other']. Duplicates are dropped and the tags are
    returned in first-seen order: the result is joined into the model input
    text, so it must not depend on set iteration order, which varies between
    interpreter runs for strings.

    Args:
        service_list: Sequence of act-type or slot tag strings.

    Returns:
        List of distinct tags in order of first appearance.
    """
    kept_prefixes = ('Restaurant', 'restaurant', 'Hotel', 'hotel', 'general')
    tags = (
        tag if tag.startswith(kept_prefixes) else 'other'
        for tag in service_list
    )
    # dict keys de-duplicate while remembering insertion order.
    return list(dict.fromkeys(tags)) or ['other']

In [10]:
def parse(sentence):
    """Return ``sentence`` with every token replaced by its lemma.

    The text is run through the module-level spaCy pipeline ``nlp`` and the
    lemmas of all tokens are joined with single spaces. No token is dropped:
    stop words and punctuation are kept.
    """
    doc = nlp(sentence)
    lemmas = [token.lemma_ for token in doc]
    return " ".join(lemmas)
                

In [11]:
def get_act_type_to_slots(dialogue_act):
    """Group the restaurant/hotel slots of one turn by act type.

    Two sources are merged, in this order:
      1. ``span_info``: every (slot name, slot value) whose act type starts
         with 'Restaurant' or 'Hotel'.
      2. ``dialog_act``: for the same act types, the slots whose value is "?"
         and whose name is not "none".

    Returns:
        Dict mapping act type to a list of ``(slot_name, slot_value)`` tuples.
    """
    tracked_prefixes = ('Restaurant', 'Hotel')
    slots_by_act = {}

    # Slots listed in the turn's span annotations.
    spans = dialogue_act['span_info']
    span_rows = zip(spans['act_slot_name'], spans['act_type'], spans['act_slot_value'])
    for name, kind, value in span_rows:
        if kind.startswith(tracked_prefixes):
            slots_by_act.setdefault(kind, []).append((name, value))

    # Slots carrying the "?" placeholder instead of a value.
    acts = dialogue_act['dialog_act']
    for kind, slot_group in zip(acts['act_type'], acts['act_slots']):
        if not kind.startswith(tracked_prefixes):
            continue
        for name, value in zip(slot_group['slot_name'], slot_group['slot_value']):
            if name == "none" or value != "?":
                continue
            slots_by_act.setdefault(kind, []).append((name, value))

    return slots_by_act

In [12]:
def _span_slot_tags(dialogue_act):
    """Build a 'domain-slot' tag for every slot listed in the turn's ``span_info``."""
    span_info = dialogue_act['span_info']
    return [
        act_type.split('-')[0].lower() + "-" + slot_name
        for slot_name, act_type, _ in zip(
            span_info['act_slot_name'], span_info['act_type'], span_info['act_slot_value']
        )
    ]


def _requested_slot_tags(dialogue_act, with_value=False):
    """Build 'domain-slot' tags for the restaurant/hotel slots whose value is "?".

    With ``with_value=True`` each tag is suffixed with " : " and the slot value.
    """
    tags = []
    acts = dialogue_act['dialog_act']
    for act_type, act_slot in zip(acts['act_type'], acts['act_slots']):
        if not act_type.startswith(('Restaurant', 'Hotel')):
            continue
        domain = act_type.split('-')[0].lower()
        for slot_name, slot_value in zip(act_slot['slot_name'], act_slot['slot_value']):
            if slot_name != "none" and slot_value == "?":
                tag = domain + "-" + slot_name
                if with_value:
                    tag += " : " + slot_value
                tags.append(tag)
    return tags


def _turn_features(turns, index, include_requested):
    """Return (lemmatised utterance, act tags, slot tags) for the turn at ``index``."""
    dialogue_act = turns['dialogue_acts'][index]
    slots = _span_slot_tags(dialogue_act)
    if include_requested:
        slots += _requested_slot_tags(dialogue_act, with_value=True)
    return (
        parse(turns['utterance'][index]),
        process_dialogue_act(dialogue_act['dialog_act']['act_type']),
        process_dialogue_act(slots),
    )


def extract_question_tag(dataset):
    """Build (input text, requested-slot labels) pairs from user request turns.

    Only user turns with at least one act type ending in 'Request' (after
    ``process_dialogue_act``) produce an example. The input text is made of
    eight fields joined by " | ":

        previous user utterance | previous user acts | previous user slots |
        previous bot utterance  | previous bot acts  | previous bot slots  |
        current user utterance  | current request acts

    Act and slot lists are joined with " , ". Utterances are passed through
    ``parse``. Previous user slots also include the "?" slots, written as
    'domain-slot : ?'. The labels are the 'domain-slot' tags of the current
    turn's restaurant/hotel slots whose value is "?", or ['none'] when there
    are none.

    History fields are left empty when the corresponding turn does not exist:
    the previous user turn needs ``j >= 2`` and the previous bot turn
    ``j >= 1``. A negative index is never used, so history cannot wrap around
    to the end of the dialogue.

    Args:
        dataset: DataFrame with a ``turns`` column; each value provides the
            parallel sequences 'utterance', 'speaker' and 'dialogue_acts'.

    Returns:
        Tuple ``(training_list, target_list)`` of equal length.
    """
    user_speaker = 0  # speaker ids: 0 = User, 1 = Agent
    training_list = []
    target_list = []

    for turns in tqdm(dataset['turns'], total=len(dataset)):
        turn_iter = zip(turns['utterance'], turns['speaker'], turns['dialogue_acts'])
        for j, (utterance, speaker, dialogue_act) in enumerate(turn_iter):
            if speaker != user_speaker:
                continue

            current_act_type = [
                act_type
                for act_type in process_dialogue_act(dialogue_act['dialog_act']['act_type'])
                if act_type.endswith('Request')
            ]
            # Without a request act there is nothing being asked.
            if len(current_act_type) == 0:
                continue

            if j >= 2:
                prev_user_utterance, prev_user_acts, prev_user_slots = _turn_features(
                    turns, j - 2, include_requested=True
                )
            else:
                prev_user_utterance, prev_user_acts, prev_user_slots = '', [], []

            if j >= 1:
                prev_bot_utterance, prev_bot_acts, prev_bot_slots = _turn_features(
                    turns, j - 1, include_requested=False
                )
            else:
                prev_bot_utterance, prev_bot_acts, prev_bot_slots = '', [], []

            historical_utterance = " | ".join([
                prev_user_utterance,
                " , ".join(prev_user_acts),
                " , ".join(prev_user_slots),
                prev_bot_utterance,
                " , ".join(prev_bot_acts),
                " , ".join(prev_bot_slots),
                parse(utterance),
                " , ".join(current_act_type),
            ])

            targets = _requested_slot_tags(dialogue_act)
            # Turns that request nothing in the hotel/restaurant domains get a placeholder label.
            if len(targets) == 0:
                targets.append('none')

            training_list.append(historical_utterance)
            target_list.append(targets)

    return training_list, target_list

In [13]:
# Reuse the cached JSON splits when they can be read; otherwise rebuild them
# from the MultiWOZ dataset and refresh the cache.
try:
    print('Loading from json...')
    train_utterances, train_labels = load_variable_from_json(TRAIN_DATA_SAVE_PATH)
    test_utterances, test_labels = load_variable_from_json(TEST_DATA_SAVE_PATH)
    valid_utterances, valid_labels = load_variable_from_json(VALID_DATA_SAVE_PATH)
    print('Loaded from json')
except (OSError, ValueError, TypeError) as cache_error:
    # OSError: missing/unreadable file; ValueError: invalid JSON or wrong number
    # of items to unpack; TypeError: cached value is not a sequence.
    # Interrupts and programming errors are deliberately not caught here.
    print('Exception occured during loading from json')
    print('Reason:', repr(cache_error))
    print('Loading dataset...')

    dataset = None
    if not ALWAYS_LOAD_DATASET:
        try:
            dataset = DatasetDict.load_from_disk(DATASET_DIRECTORY)
        except Exception as disk_error:
            # Best effort: any failure of the local copy falls back to load_dataset.
            print('Could not load dataset from disk:', repr(disk_error))

    if dataset is None:
        dataset = load_dataset("multi_woz_v22")

        if SAVE_DATASET:
            dataset.save_to_disk(DATASET_DIRECTORY)

    print('Preprocessing...')
    train = preprocess_split(dataset, 'train')
    val = preprocess_split(dataset, 'validation')
    test = preprocess_split(dataset, 'test')

    print('Extracting utterances and act types...')
    train_utterances, train_labels = extract_question_tag(train)
    test_utterances, test_labels = extract_question_tag(test)
    valid_utterances, valid_labels = extract_question_tag(val)

    print('Saving to json...')
    save_variable_to_json([train_utterances, train_labels], TRAIN_DATA_SAVE_PATH)
    save_variable_to_json([test_utterances, test_labels], TEST_DATA_SAVE_PATH)
    save_variable_to_json([valid_utterances, valid_labels], VALID_DATA_SAVE_PATH)

    print('Saved to json')

Loading from json...
Loaded from json


In [23]:
# Preview the first ten training examples with their labels.
for utterance, act_type in zip(train_utterances[:10], train_labels[:10]):
    print(f"Utterance: {utterance}\nAct Type: {act_type}\n")

Utterance: I need a place to dine in the center that s expensive | Restaurant-Inform | restaurant-area , restaurant-pricerange | I have several option for you ; do you prefer african , asian , or british food ? | Restaurant-Inform , Restaurant-Select | restaurant-food , restaurant-choice | any sort of food would be fine , as long as it be a bit expensive . could I get the phone number for your recommendation ? | Restaurant-Request
Act Type: ['restaurant-food']

Utterance: any sort of food would be fine , as long as it be a bit expensive . could I get the phone number for your recommendation ? | Restaurant-Request | restaurant-food : ? | there be an afrian place name Bedouin in the centre . how do that sound ? | Restaurant-Inform | restaurant-food , restaurant-area , restaurant-name | sound good , could I get that phone number ? also , could you recommend I an expensive hotel ? | Restaurant-Request
Act Type: ['restaurant-phone']

Utterance: yeah , I need a restaurant in the west and wit

In [56]:
# Count the training examples whose label list is exactly ['none'].
none_labels_length = sum(1 for label in train_labels if label == ['none'])
print(f"None labels length: {none_labels_length}")
print(f"Total labels length: {len(train_labels)}")

None labels length: 0
Total labels length: 4800
