# Slot Filling

In [17]:
#Mount Google drive
#from google.colab import drive
#drive.mount('/content/drive')

#change current working directory
#%cd "/content/drive/Shareddrives/NLI"

In [18]:
#!pip install transformers
#!pip install seqeval
#!pip install datasets
#!python3 -m spacy download en_core_web_lg

In [19]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertForSequenceClassification, BertTokenizer, BertConfig
from transformers import RobertaForSequenceClassification, RobertaTokenizer, RobertaConfig
from transformers import XLNetForSequenceClassification, XLNetTokenizer, XLNetConfig
from transformers import XLMForSequenceClassification, XLMTokenizer, XLMConfig
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer, DistilBertConfig
from transformers import AlbertForSequenceClassification, AlbertTokenizer, AlbertConfig
from seqeval.metrics import classification_report
from datasets import load_dataset
from datasets import DatasetDict

import nltk
import matplotlib.pyplot as plt

from tqdm import tqdm
from torch import cuda
import re

nltk.download('punkt')

import spacy

nlp = spacy.load("en_core_web_lg")

[nltk_data] Downloading package punkt to /home/mister/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [20]:
DATASET_DIRECTORY = "./../dataset.hf/"
TRAIN_DATA_SAVE_PATH = "utterances_tags/train.json"
TEST_DATA_SAVE_PATH = "utterances_tags/test.json"
VALID_DATA_SAVE_PATH = "utterances_tags/valid.json"

ALWAYS_LOAD_DATASET = False
SAVE_DATASET = True

In [21]:
import json

def save_variable_to_json(variable, file_path):
    with open(file_path, 'w') as file:
        json.dump(variable, file)

def load_variable_from_json(file_path):
    with open(file_path, 'r') as file:
        variable = json.load(file)
    return variable

In [22]:

device = 'cuda' if cuda.is_available() else 'cpu'
print(device)

cpu


In [23]:
# Extracting only hotel and restaurant features from the dataset
def preprocess_split(dataset, split):
    df = dataset[split].to_pandas()
    new_df = pd.DataFrame(columns = df.columns)
    for i in range(len(df)):
        # Taken from notebook, to know which lines to skip
        row = df.loc[i]
        if not any(set(row.turns['frames'][turn_id]['service']).intersection(['hotel', 'restaurant']) for turn_id,utt in enumerate(row.turns['utterance'])):
            continue

        new_df.loc[len(new_df)] = row
    return new_df

In [24]:
def process_service_list(service_list):
    services = set()
    if len(service_list) == 0:
        services.add('other')
    for service in service_list:
        if service == 'restaurant':
            services.add('restaurant')
        elif service == 'hotel':
            services.add('hotel')
        else:
            services.add('other')
        if len(services) == 3:
            break
    return list(services)

def process_dialogue_act(service_list):
    services = set()
    if len(service_list) == 0:
        services.add('other')
    for service in service_list:
        if service.startswith('Restaurant'):
            services.add(service)
        elif service.startswith('Hotel'):
            services.add(service)
        elif service.startswith('general'):
            services.add(service)
        else:
            services.add('other')
    return list(services)

In [25]:
def parse(sentence):
    # Tokenize
    sentence = nlp(sentence)
    # Remove stop words
    sentence = " ".join([token.lemma_ for token in sentence])
    
    return sentence

In [26]:
def extract_token_bio_tags(dataset):
    utterances = [[]]
    tags = [[]]
    count = 0

    for i in tqdm(range(len(dataset))):
        turns = dataset.loc[i].turns
        for j, (utterance, speaker, dialogue_act, frames) in enumerate(zip(turns['utterance'], turns['speaker'], turns['dialogue_acts'], turns['frames'])):
            if speaker != 0:
                continue
            if 'other' in process_service_list(frames['service']):
                continue
            
            if j == 0:
                prev_user_acts = []
                prev_bot_acts = []
            else:
                # Retrieve the previous user utterance and acts
                prev_user_acts = turns['dialogue_acts'][j - 2]['dialog_act']['act_type']
                prev_user_acts = process_dialogue_act(prev_user_acts)

                # Retrieve the previous bot utterance and acts
                prev_bot_acts = turns['dialogue_acts'][j - 1]['dialog_act']['act_type']
                prev_bot_acts = process_dialogue_act(prev_bot_acts)

            current_user_acts = dialogue_act['dialog_act']['act_type']
            current_user_acts = process_dialogue_act(current_user_acts)

            span_info = dialogue_act['span_info']
            act_slot_names = span_info['act_slot_name']
            act_slot_values = span_info['act_slot_value']
            span_starts = span_info['span_start']
            span_ends = span_info['span_end']
            slots = {slot_name : {'start': start, 'end': end} for slot_name, start, end in zip(act_slot_names, span_starts, span_ends)}

            #Preprocess
            utterances.append([])
            tags.append([])
            count = count+1

            prev_user_acts_str = " , ".join(prev_user_acts)
            prev_bot_acts_str = " , ".join(prev_bot_acts)
            current_user_acts_str = " , ".join(current_user_acts)
            hystory = " | ".join([prev_user_acts_str, prev_bot_acts_str, current_user_acts_str])

            for word in hystory.split():
                utterances[count].append(word)
                tags[count].append("O")
            
            last_tag = 0
            for slot_name in slots:
                slot_start, slot_end = slots[slot_name]['start'], slots[slot_name]['end']
                tokens = parse(utterance[slot_start:slot_end]).split()
                #tokens = re.sub("[^a-zA-Z0-9]", " ", utterance[slot_start:slot_end]).lower().split()
                for word in parse(utterance[last_tag:slot_start]).split():
                #for word in re.sub("[^a-zA-Z0-9]", " ", utterance[last_tag:slot_start]).lower().split():
                    utterances[count].append(word)
                    tags[count].append("O")


                for j, token in enumerate(tokens):
                    bio_type = 'B-' if j == 0 else 'I-'    
                    utterances[count].append(token) 
                    tags[count].append(bio_type + slot_name)
                last_tag = slot_end

            #for word in re.sub("[^a-zA-Z0-9]", " ", utterance[last_tag:]).lower().split():
            for word in parse(utterance[last_tag:]).split():
                    utterances[count].append(word)
                    tags[count].append("O")
            


    return utterances, tags

In [27]:
try:
    print('Loading from json...')
    train_utterances, train_act_types = load_variable_from_json(TRAIN_DATA_SAVE_PATH)
    test_utterances, test_act_types = load_variable_from_json(TEST_DATA_SAVE_PATH)
    valid_utterances, valid_act_types = load_variable_from_json(VALID_DATA_SAVE_PATH)
    print('Loaded from json')
except:
    print('Exception occured during loading from json')
    print('Loading dataset...')

    try:
        if ALWAYS_LOAD_DATASET:
            raise Exception('Load data always')
        # Run it only once
        data = DatasetDict.load_from_disk(DATASET_DIRECTORY)
        
    except:
        # Run it only once
        dataset = load_dataset("multi_woz_v22")

        if SAVE_DATASET:
            # Run it only once
            dataset.save_to_disk(DATASET_DIRECTORY)

    print('Preprocessing...')
    train = preprocess_split(data, 'train')
    val = preprocess_split(data, 'validation')
    test = preprocess_split(data, 'test')

    print('Extracting utterances and bio tags...')
    train_words, train_tags = extract_token_bio_tags(train)
    valid_word, valid_tags = extract_token_bio_tags(val)
    test_word, test_tags = extract_token_bio_tags(test)
    
    print('Saving to json...')
    save_variable_to_json([train_words, train_tags] , TRAIN_DATA_SAVE_PATH)
    save_variable_to_json([test_word, test_tags], TEST_DATA_SAVE_PATH)
    save_variable_to_json([valid_word, valid_tags], VALID_DATA_SAVE_PATH)

    print('Saved to json')


Loading from json...
Exception occured during loading from json
Loading dataset...
Preprocessing...


Extracting utterances and bio tags...


100%|██████████| 6321/6321 [06:49<00:00, 15.44it/s]
100%|██████████| 762/762 [00:52<00:00, 14.42it/s]
100%|██████████| 745/745 [00:51<00:00, 14.34it/s]


Saving to json...
Saved to json


In [28]:
for word, tag in zip(train_words[:10], train_tags[:10]):
    print("Word:", word)
    print("Tag:", tag)
    print()

Word: []
Tag: []

Word: ['|', '|', 'Restaurant-Inform', 'I', 'need', 'a', 'place', 'to', 'dine', 'in', 'the', 'center', 'that', 's', 'expensive']
Tag: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-area', 'O', 'O', 'B-pricerange']

Word: ['Restaurant-Inform', '|', 'Restaurant-Select', ',', 'Restaurant-Inform', '|', 'Restaurant-Request', 'any', 'sort', 'of', 'food', 'would', 'be', 'fine', ',', 'as', 'long', 'as', 'it', 'be', 'a', 'bit', 'expensive', '.', 'could', 'I', 'get', 'the', 'phone', 'number', 'for', 'your', 'recommendation', '?']
Tag: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']

Word: ['Restaurant-Request', '|', 'Restaurant-Inform', '|', 'Hotel-Inform', ',', 'Restaurant-Request', 'sound', 'good', ',', 'could', 'I', 'get', 'that', 'phone', 'number', '?', 'also', ',', 'could', 'you', 'recommend', 'I', 'an', 'expensive', 'hotel', '?']
Tag: ['O',