In [1]:
#Mount Google drive
#from google.colab import drive
#drive.mount('/content/drive')

In [2]:
#change current working directory
#%cd "/content/drive/Shareddrives/NLI"

In [3]:
#!pip install transformers
#!pip install accelerate
#!pip install datasets
#!python3 -m spacy download en_core_web_lg

In [4]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import torch
from torch.utils.data import DataLoader, TensorDataset, RandomSampler, Dataset
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import recall_score, precision_score, f1_score
from torch import cuda
import re
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report
from itertools import product
import os
import joblib
import json
import math
import nltk

# Fetch NLTK's 'punkt' data package at import time.
# NOTE(review): no cell visible in this part of the notebook calls into nltk —
# confirm the download is still needed.
nltk.download('punkt')

import spacy

# Shared spaCy pipeline; `parse` below runs every utterance through it.
# Loading fails unless the `en_core_web_lg` model is installed (see the
# commented-out `spacy download` command in the setup cell above).
nlp = spacy.load("en_core_web_lg")


[nltk_data] Downloading package punkt to /home/mister/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Presumably the on-disk location of the raw dataset.
# NOTE(review): no cell visible in this part of the notebook reads this
# constant — confirm where (or whether) the dataset is loaded from it.
DATASET_DIRECTORY = "./../dataset.hf/"
# Per-split cache files. Each one stores a two-item JSON list
# [utterances, act_types]; the load cell further down reads them and rewrites
# them after re-extracting when reading fails.
TRAIN_DATA_SAVE_PATH = "utterances_act_types/train.json"
TEST_DATA_SAVE_PATH = "utterances_act_types/test.json"
VALID_DATA_SAVE_PATH = "utterances_act_types/valid.json"

In [6]:
import json

def save_variable_to_json(variable, file_path):
    """Write `variable` to `file_path` as JSON, replacing any existing file.

    Args:
        variable: Any JSON-serializable object.
        file_path: Destination path; it is opened in text write mode.
    """
    with open(file_path, mode='w') as json_file:
        json.dump(variable, fp=json_file)

def load_variable_from_json(file_path):
    """Read the JSON document stored at `file_path` and return the decoded object.

    Args:
        file_path: Path of the JSON file; it is opened in text read mode.

    Returns:
        The Python object produced by `json.load`.
    """
    with open(file_path, mode='r') as json_file:
        return json.load(json_file)

In [8]:
def process_dialogue_act(service_list):
    """Collapse dialogue-act types onto the label set used in this notebook.

    Act types that start with 'Restaurant', 'Hotel' or 'general' are kept
    unchanged; every other act type is replaced by the single label 'other'.
    An empty input also yields 'other'.

    Args:
        service_list: Iterable of act-type strings (e.g. 'Restaurant-Inform').

    Returns:
        List of unique labels in sorted string order. Sorting makes the result
        deterministic: iterating a set of strings directly depends on hash
        randomization, and this order ends up inside the history string built
        by the caller and in the cached JSON files.
    """
    kept_prefixes = ('Restaurant', 'Hotel', 'general')
    services = {
        service if service.startswith(kept_prefixes) else 'other'
        for service in service_list
    }
    if not services:
        # No act types at all: fall back to the catch-all label.
        services.add('other')
    return sorted(services)

In [9]:
def parse(sentence):
    """Lemmatize `sentence` with the shared spaCy pipeline `nlp`.

    Every token is kept (nothing is filtered out, stop words and punctuation
    included); each is replaced by its lemma and the lemmas are joined with
    single spaces.

    Args:
        sentence: Raw text to process.

    Returns:
        The space-joined lemmas as one string.
    """
    doc = nlp(sentence)
    lemmas = (token.lemma_ for token in doc)
    return " ".join(lemmas)
                

In [10]:
def _previous_turn_context(turns, index):
    """Return (lemmatized utterance, processed act types) for turn `index`.

    A negative `index` means the dialogue has no such earlier turn; an empty
    utterance and an empty act list are returned instead of letting the
    negative index wrap around to the end of the dialogue.
    """
    if index < 0:
        return '', []
    utterance = parse(turns['utterance'][index])
    acts = process_dialogue_act(turns['dialogue_acts'][index]['dialog_act']['act_type'])
    return utterance, acts


def extract_utterance_and_act_types(dataset):
    """Build one history-augmented input string and one label list per user turn.

    For every turn whose speaker id is 0 (treated as the user), the input
    string is the ' | '-joined sequence

        previous user utterance | previous user acts | previous bot utterance
        | previous bot acts | current user utterance

    where utterances are lemmatized with `parse`, act lists are reduced with
    `process_dialogue_act` and joined with ', ', and fields for turns that do
    not exist (start of the dialogue) are empty strings. The turn two
    positions back is taken as the previous user turn and the turn one
    position back as the previous bot turn, i.e. speakers are assumed to
    alternate.

    Args:
        dataset: pandas DataFrame with a `turns` column; each cell must
            support `['utterance']`, `['speaker']` and `['dialogue_acts']`
            lookups yielding parallel per-turn sequences.

    Returns:
        Tuple `(utterance_list, act_types)`: the input strings and, aligned
        with them, the processed act types of the current user turn.
    """
    act_types = []
    utterance_list = []

    for i in tqdm(range(len(dataset))):
        # `i` is a position, not an index label, so use positional access;
        # `.loc[i]` only works when the frame has a default RangeIndex.
        turns = dataset.iloc[i].turns
        # print(turns['utterance'])
        # print([frame['service'] for frame in turns['frames']])
        for j, (utterance, speaker, dialogue_act) in enumerate(zip(turns['utterance'], turns['speaker'], turns['dialogue_acts'])):
            if speaker != 0:  # not a user turn: only user turns become examples
                continue

            # History: previous user turn (two back) and previous bot turn
            # (one back); either is empty when it precedes the dialogue start.
            prev_user_utterance, prev_user_acts = _previous_turn_context(turns, j - 2)
            prev_bot_utterance, prev_bot_acts = _previous_turn_context(turns, j - 1)

            # Current user utterance and its (processed) act types.
            current_act_type = process_dialogue_act(dialogue_act['dialog_act']['act_type'])
            current_user_utterance = parse(utterance)

            historical_utterance = ' | '.join([
                prev_user_utterance,
                ', '.join(prev_user_acts),
                prev_bot_utterance,
                ', '.join(prev_bot_acts),
                current_user_utterance,
            ])

            utterance_list.append(historical_utterance)
            act_types.append(current_act_type)

    return utterance_list, act_types

In [15]:
# Load the cached (utterances, act types) pairs for all three splits. When a
# cache file is missing or unreadable, rebuild every split from the raw
# dataset and write the caches again.
try:
    print('Loading from json...')
    train_utterances, train_act_types = load_variable_from_json(TRAIN_DATA_SAVE_PATH)
    test_utterances, test_act_types = load_variable_from_json(TEST_DATA_SAVE_PATH)
    valid_utterances, valid_act_types = load_variable_from_json(VALID_DATA_SAVE_PATH)
    print('Loaded from json')
except (OSError, ValueError, TypeError) as load_error:
    # Only cache problems trigger the rebuild:
    #   OSError    - cache file missing or unreadable
    #   ValueError - malformed JSON (json.JSONDecodeError) or wrong item count
    #   TypeError  - top-level JSON value cannot be unpacked
    # Anything else (including KeyboardInterrupt) propagates instead of
    # silently starting a long re-extraction.
    print('Exception occured during loading from json')
    print(repr(load_error))

    print('Extracting utterances and act types...')
    # NOTE(review): `dataset` is not defined in any cell visible above this
    # one; on a fresh kernel with no cache this branch raises NameError.
    # Confirm where the dataset is loaded (DATASET_DIRECTORY looks intended).
    train_utterances, train_act_types = extract_utterance_and_act_types(dataset['train'].to_pandas())
    test_utterances, test_act_types = extract_utterance_and_act_types(dataset['test'].to_pandas())
    valid_utterances, valid_act_types = extract_utterance_and_act_types(dataset['validation'].to_pandas())

    print('Saving to json...')
    # Make sure the cache directories exist so the expensive extraction above
    # is not lost to a FileNotFoundError on the first save.
    for save_path in (TRAIN_DATA_SAVE_PATH, TEST_DATA_SAVE_PATH, VALID_DATA_SAVE_PATH):
        save_directory = os.path.dirname(save_path)
        if save_directory:
            os.makedirs(save_directory, exist_ok=True)
    save_variable_to_json([train_utterances,train_act_types] , TRAIN_DATA_SAVE_PATH)
    save_variable_to_json([test_utterances,test_act_types], TEST_DATA_SAVE_PATH)
    save_variable_to_json([valid_utterances,valid_act_types], VALID_DATA_SAVE_PATH)

    print('Saved to json')

Loading from json...
Loaded from json


In [16]:
# Sanity check: show the first five training inputs together with their labels.
for utterance, act_type in zip(train_utterances[:5], train_act_types[:5]):
    print(f"Utterance: {utterance}\nAct Type: {act_type}\n")


Utterance:  |  |  |  | I need a place to dine in the center that s expensive
Act Type: ['Restaurant-Inform']

Utterance: I need a place to dine in the center that s expensive | Restaurant-Inform | I have several option for you ; do you prefer african , asian , or british food ? | Restaurant-Select, Restaurant-Inform | any sort of food would be fine , as long as it be a bit expensive . could I get the phone number for your recommendation ?
Act Type: ['Restaurant-Request']

Utterance: any sort of food would be fine , as long as it be a bit expensive . could I get the phone number for your recommendation ? | Restaurant-Request | there be an afrian place name Bedouin in the centre . how do that sound ? | Restaurant-Inform | sound good , could I get that phone number ? also , could you recommend I an expensive hotel ?
Act Type: ['Hotel-Inform', 'Restaurant-Request']

Utterance: sound good , could I get that phone number ? also , could you recommend I an expensive hotel ? | Hotel-Inform, Res