In [1]:
# Make sure the raw data files are available by running the `dataset` target of
# the Makefile located in the parent directory. Judging by the captured output
# below, the target reports "already downloaded" when the files are present.
# NOTE(review): each `!` command runs in its own subshell, so the trailing
# `cd notebooks` should not affect the kernel's working directory — confirm
# before relying on it.
!cd .. && make dataset && cd notebooks

>>> Downloading and saving data files...
Data files already downloaded.
>>> OK.



In [2]:
import json
import warnings
from pathlib import Path

import modin.pandas as pd
import pandas
import plotly.io as pio
import pyarrow.feather as feather
from tqdm import tqdm

# --- Notebook configuration -------------------------------------------------

# Hide UserWarning / FutureWarning messages so that cell outputs stay readable.
for _ignored_category in (UserWarning, FutureWarning):
    warnings.filterwarnings("ignore", category=_ignored_category)

# Render plotly figures with the "notebook" renderer.
pio.renderers.default = "notebook"

# Route DataFrame `.plot` calls through plotly.
pd.options.plotting.backend = "plotly"

# Data locations, relative to the notebook's directory.
DATA_PATH = Path("../data")
FRAMES_JSON_PATH = DATA_PATH / "raw" / "frames.json"

In [3]:
# Load the raw JSON file into a DataFrame. Per the summary displayed below, the
# resulting columns are: user_id, turns, wizard_id, id and labels.
raw_data = pd.read_json(FRAMES_JSON_PATH)

# Summarise every column, including the non-numeric ones (include="all").
raw_data.describe(include="all")


the 'numpy' keyword is deprecated and will be removed in a future version. Please take steps to stop the use of 'numpy'



Unnamed: 0,user_id,turns,wizard_id,id,labels
count,1369,1369,1369,1369,1369
unique,11,1369,12,1369,16
top,U22K1SX9N,[{'text': 'I'd like to book a trip to Atlantis...,U21T9NMKM,e2c0fc6c-2134-4891-8353-ef16d8412c9a,"{'userSurveyRating': 5.0, 'wizardSurveyTaskSuc..."
freq,345,1,301,1,929


In [4]:
# Build one row per dialogue: its identifiers plus the two survey labels pulled
# out of the nested `labels` dict.
#
# `.assign()` returns a brand-new frame, so no column is ever written into a
# slice of `raw_data` (the original chained assignment could trigger pandas'
# SettingWithCopyWarning and is ambiguous under copy-on-write).
#
# Key access is deliberately strict: a dialogue whose `labels` dict lacks one of
# the two keys raises KeyError instead of being silently filled in.
dialogue_labels = raw_data["labels"]
frames = (
    raw_data[["id", "wizard_id", "user_id"]]
    .assign(
        userSurveyRating=[
            labels["userSurveyRating"] for labels in dialogue_labels
        ],
        wizardSurveyTaskSuccessful=[
            labels["wizardSurveyTaskSuccessful"] for labels in dialogue_labels
        ],
    )
    # NOTE(review): casting to "bool" turns a missing (None) value into False;
    # confirm that `wizardSurveyTaskSuccessful` is never null in the raw data.
    .astype({"userSurveyRating": "float", "wizardSurveyTaskSuccessful": "bool"})
)
frames.describe(include="all")

Unnamed: 0,id,wizard_id,user_id,userSurveyRating,wizardSurveyTaskSuccessful
count,1369,1369,1369,1366.0,1369
unique,1369,12,11,,2
top,e2c0fc6c-2134-4891-8353-ef16d8412c9a,U21T9NMKM,U22K1SX9N,,True
freq,1,301,345,,1287
mean,,,,4.573419,
std,,,,0.839596,
min,,,,1.0,
25%,,,,4.0,
50%,,,,5.0,
75%,,,,5.0,


In [14]:
# Location of the cached, flattened user-turn table.
TURNS_CSV_PATH = Path(DATA_PATH, "processed/turns.csv")


def extract_user_turn_records(dialogues):
    """Flatten dialogues into one record per non-wizard turn.

    For every dialogue, a running dict of "known facts" is maintained. Each
    turn whose ``author`` is not ``"wizard"`` yields one record containing:

    * ``text``       -- the turn's text;
    * ``old_<key>``  -- the known facts *before* the turn is processed;
    * ``new_<key>``  -- the known facts *after* merging the turn's labels.

    A fact's value is the ``val`` of the last entry listed for that key in the
    turn's ``labels["frames"][*]["info"]``, or ``None`` when that entry is
    flagged ``negated``. When several frames of a turn mention the same key,
    the last frame wins.

    Args:
        dialogues: iterable of dialogues, each an iterable of turn dicts with
            the keys ``author``, ``text`` and ``labels``.

    Returns:
        A list of dicts, one per non-wizard turn, in dialogue order.
    """
    records = []
    for dialogue in dialogues:
        # Facts are tracked per dialogue and never leak into the next one.
        known_facts = {}
        for frame in dialogue:
            if frame["author"] == "wizard":
                continue

            record = {"text": frame["text"]}

            # Snapshot the state before this turn's labels are merged in.
            record.update(
                {f"old_{key}": value for key, value in known_facts.items()}
            )

            known_facts.update(
                {
                    info_key: info[-1]["val"] if not info[-1]["negated"] else None
                    for f in frame["labels"]["frames"]
                    for info_key, info in f["info"].items()
                }
            )

            # Snapshot the state after the merge.
            record.update(
                {f"new_{key}": value for key, value in known_facts.items()}
            )

            records.append(record)
    return records


if TURNS_CSV_PATH.exists():
    # Reuse the cached table from a previous run.
    turns = pd.read_csv(TURNS_CSV_PATH)
else:
    # Build the frame once from the collected records. Growing a DataFrame row
    # by row with `DataFrame.append` is quadratic, and the method no longer
    # exists in pandas >= 2.0.
    turns = pd.DataFrame(extract_user_turn_records(tqdm(raw_data["turns"])))

    # Create the output directory if needed so that the first run cannot fail.
    TURNS_CSV_PATH.parent.mkdir(parents=True, exist_ok=True)
    turns.to_csv(TURNS_CSV_PATH, index=False)

turns.describe(include="all")

Unnamed: 0,text,new_intent,new_budget,new_dst_city,new_or_city,new_str_date,new_n_adults,old_intent,old_budget,old_dst_city,...,new_count_seat,old_count_seat,new_dst_city_ok,old_dst_city_ok,new_impl_anaphora,old_impl_anaphora,new_str_date_ok,new_end_date_ok,old_str_date_ok,old_end_date_ok
count,10407,9362,6229,9631,9620,7430,5570,8078,5255,8307,...,7,6,8,7,5,4,3,3,2,2
unique,9695,1,228,392,339,155,57,1,225,382,...,1,1,1,1,1,1,1,1,1,1
top,Thanks!,book,-1,Punta Cana,-1,-1,1,book,-1,-1,...,two,two,True,True,category,category,True,True,True,True
freq,73,9362,1704,283,174,655,2462,8078,1469,257,...,7,6,8,7,5,4,3,3,2,2


In [16]:
# Keep the turn text plus, for each tracked slot, its value before ("old") and
# after ("new") the turn -- ordered as old_<slot>, new_<slot> per slot.
tracked_slots = ("or_city", "dst_city", "str_date", "end_date", "budget")
slot_states = ("old", "new")
columns = [
    "text",
    *(f"{state}_{slot}" for slot in tracked_slots for state in slot_states),
]
data = turns[columns]

data

Unnamed: 0,text,old_or_city,new_or_city,old_dst_city,new_dst_city,old_str_date,new_str_date,old_end_date,new_end_date,old_budget,new_budget
0,I'd like to book a trip to Atlantis from Capri...,,Caprica,,Atlantis,,august 13,,,,1700.0
1,"Yes, how about going to Neverland from Caprica...",Caprica,Caprica,Atlantis,Neverland,august 13,august 13,,,1700.0,1900.0
2,I have no flexibility for dates... but I can l...,Caprica,Atlantis,Neverland,Atlantis,august 13,august 13,,,1900.0,1700.0
3,I suppose I'll speak with my husband to see if...,Atlantis,Atlantis,Atlantis,Atlantis,august 13,august 13,,,1700.0,1700.0
4,"Hello, I am looking to book a vacation from Go...",,Gotham City,,Mos Eisley,,,,,,2100.0
...,...,...,...,...,...,...,...,...,...,...,...
10402,"5 adults and 7 kids! Yup, the lot of us. We wa...",Tampa,Tampa,-1,-1,,,,,,32800.0
10403,Oh yes! Between September 12 and 26!,Tampa,Tampa,-1,-1,,september 12,,26,32800.0,32800.0
10404,"That sounds amazing, and it's within those dat...",Tampa,Tampa,-1,Queenstown,september 12,september 12,26,26,32800.0,32800.0
10405,"Ok perfect, book me!",Tampa,Tampa,Queenstown,Queenstown,september 12,september 12,26,25,32800.0,32800.0
