## Data Summary

In [1]:
from collections import defaultdict
from collections import Counter
import numpy as np

In [2]:
# Number of training answer lines seen per intent (the first '|'-separated field).
intents = defaultdict(int)
# For each attribute type, a Counter of how often each answer value occurs.
attr_dict = defaultdict(Counter)

In [3]:
# Tally intents and per-attribute answer values from the training answers.
# Each non-empty line is parsed as `intent|attr_type=value|attr_type=value...`.
with open('./Multiwoz/WOZ_train_ans.txt') as f:
    for line in f:
        line = line.strip()
        if not line:
            # A blank line carries no intent; counting it would add a bogus '' key.
            continue

        intent, *attrs = line.split('|')
        intents[intent] += 1

        for attr in attrs:
            # Split on the first '=' only so a value that itself contains '='
            # stays intact; a component without any '=' still raises ValueError.
            attr_type, ans = attr.split('=', 1)
            attr_dict[attr_type][ans] += 1

In [4]:
intents

defaultdict(int, {'find_hotel': 1609, 'find_restaurant': 2151})

In [5]:
attr_dict.keys()

dict_keys(['hotel-area', 'hotel-internet', 'hotel-parking', 'hotel-name', 'restaurant-food', 'restaurant-pricerange', 'restaurant-area', 'restaurant-name', 'hotel-pricerange', 'hotel-type', 'hotel-stars'])

In [6]:
attr_dict['hotel-area']

Counter({'centre': 62,
         'north': 122,
         'west': 45,
         'south': 34,
         'dontcare': 2,
         'east': 61})

In [8]:
attr_dict['hotel-internet']

Counter({'yes': 306, 'dontcare': 27, 'no': 16})

In [9]:
attr_dict['hotel-parking']

Counter({'yes': 287, 'no': 20, 'dontcare': 28})

In [10]:
attr_dict['hotel-pricerange']

Counter({'expensive': 75, 'cheap': 114, 'moderate': 139, 'dontcare': 1})

In [11]:
attr_dict['hotel-type']

Counter({'hotel': 261, 'guesthouse': 271, 'dontcare': 5})

In [12]:
attr_dict['hotel-stars']

Counter({'4': 210, '2': 22, '0': 37, '3': 64, '1': 11, '5': 10, 'dontcare': 1})

In [13]:
attr_dict['restaurant-area']

Counter({'centre': 419,
         'south': 159,
         'west': 142,
         'east': 133,
         'dontcare': 25,
         'north': 131})

In [16]:
attr_dict['restaurant-pricerange']

Counter({'cheap': 274, 'expensive': 433, 'moderate': 320, 'dontcare': 20})

In [17]:
attr_dict['restaurant-area']

Counter({'centre': 419,
         'south': 159,
         'west': 142,
         'east': 133,
         'dontcare': 25,
         'north': 131})

## Convert txt files into DataFrame-ready CSV files

In [18]:
# Hotel attribute columns, in the order they are appended to the CSV header.
hotel_attrs = [
    'hotel-area',
    'hotel-internet',
    'hotel-parking',
    'hotel-name',
    'hotel-pricerange',
    'hotel-type',
    'hotel-stars',
]

# Restaurant attribute columns, in the order they are appended to the CSV header.
restaurant_attrs = [
    'restaurant-food',
    'restaurant-pricerange',
    'restaurant-area',
    'restaurant-name',
]

In [19]:
# CSV column order: the two intent flag columns, the utterance text, then one
# column per hotel attribute followed by one per restaurant attribute.
HEADERS = ['find_hotel', 'find_restaurant', 'utts', *hotel_attrs, *restaurant_attrs]

In [20]:
import csv

In [21]:
def write_data_to_df(file_in_utt, file_in_ans, file_out):
    """Write paired utterances and answers to a CSV file with HEADERS as columns.

    Each answer line is parsed as ``intent|attr_type=value|attr_type=value...``.
    The output row stores the utterance under ``utts``, a ``1`` in the column
    named after the intent, and each attribute value in the column named after
    its attribute type. All remaining columns are left empty.

    Args:
        file_in_utt: path to a text file with one utterance per line.
        file_in_ans: path to a text file with one answer per line, read in
            lockstep with ``file_in_utt``.
        file_out: path of the CSV file to write (opened in ``'w'`` mode).

    Raises:
        ValueError: if an attribute component contains no ``=``, or if an
            intent / attribute type is not a column in ``HEADERS`` (raised by
            ``csv.DictWriter`` when the row is written).
    """
    # newline='' lets the csv module emit its own line terminators; without it
    # platforms that translate '\n' on write would produce '\r\r\n' row endings.
    with open(file_out, 'w', newline='') as fout:
        writer = csv.DictWriter(fout, fieldnames=HEADERS)
        writer.writeheader()

        with open(file_in_ans) as f_ans, open(file_in_utt) as f_utt:
            # zip stops at the shorter input, so surplus trailing lines in
            # either file are ignored.
            for ans_line, utt in zip(f_ans, f_utt):
                row = dict.fromkeys(HEADERS)
                row['utts'] = utt.strip()

                intent, *attrs = ans_line.strip().split('|')
                row[intent] = 1

                for attr in attrs:
                    # Split on the first '=' only so values containing '='
                    # are kept whole instead of breaking the unpacking.
                    attr_type, value = attr.split('=', 1)
                    row[attr_type] = value

                writer.writerow(row)

In [22]:
def write_test_df(file_in_utt, file_out):
    """Write the test utterances in file_in_utt to a CSV file with HEADERS as columns.

    Only the ``utts`` column is filled (with the whitespace-stripped line);
    every other column is left empty.

    Args:
        file_in_utt: path to a text file with one utterance per line.
        file_out: path of the CSV file to write (opened in ``'w'`` mode).
    """
    # newline='' lets the csv module emit its own line terminators; without it
    # platforms that translate '\n' on write would produce '\r\r\n' row endings.
    with open(file_out, 'w', newline='') as fout:
        writer = csv.DictWriter(fout, fieldnames=HEADERS)
        writer.writeheader()

        with open(file_in_utt) as f_utt:
            for utt in f_utt:
                row = dict.fromkeys(HEADERS)
                row['utts'] = utt.strip()
                writer.writerow(row)

In [23]:
# write_data_to_df('../Multiwoz/WOZ_train_utt.txt', '../Multiwoz/WOZ_train_ans.txt', '../data/dioData_train.csv')

In [24]:
# write_data_to_df('../Multiwoz/WOZ_dev_utt.txt', '../Multiwoz/WOZ_dev_ans.txt', '../data/dioData_dev.csv')

In [369]:
# write_test_df('../Multiwoz/WOZ_test_utt.txt', '../data/dioData_test.csv')