In [1]:
# Global variables
# Paths are relative to the notebook's working directory.
# Input: the frames corpus, loaded further down with pd.read_json.
frames_path = '../data/frames/frames.json'
# Outputs: the train / test utterance files written at the end of the
# notebook with DataFrame.to_json(orient='records').
utterances_train_path = '../data/utterances-train.json'
utterances_test_path = '../data/utterances-test.json'

In [2]:
# Imports
import json
import pandas as pd

In [3]:
# Initial data: load the raw frames file into a DataFrame and take a first
# look at its size and leading rows.
frames = pd.read_json(frames_path)

print(f'Shape: {frames.shape}')
display(frames.head())

Shape: (1369, 5)


Unnamed: 0,user_id,turns,wizard_id,id,labels
0,U22HTHYNP,[{'text': 'I'd like to book a trip to Atlantis...,U21DKG18C,e2c0fc6c-2134-4891-8353-ef16d8412c9a,"{'userSurveyRating': 4.0, 'wizardSurveyTaskSuc..."
1,U21E41CQP,"[{'text': 'Hello, I am looking to book a vacat...",U21DMV0KA,4a3bfa39-2c22-42c8-8694-32b4e34415e9,"{'userSurveyRating': 3.0, 'wizardSurveyTaskSuc..."
2,U21RP4FCY,[{'text': 'Hello there i am looking to go on a...,U21E0179B,6e67ed28-e94c-4fab-96b6-68569a92682f,"{'userSurveyRating': 2.0, 'wizardSurveyTaskSuc..."
3,U22HTHYNP,[{'text': 'Hi I'd like to go to Caprica from B...,U21DKG18C,5ae76e50-5b48-4166-9f6d-67aaabd7bcaa,"{'userSurveyRating': 5.0, 'wizardSurveyTaskSuc..."
4,U21E41CQP,"[{'text': 'Hello, I am looking to book a trip ...",U21DMV0KA,24603086-bb53-431e-a0d8-1dcc63518ba9,"{'userSurveyRating': 5.0, 'wizardSurveyTaskSuc..."


In [4]:
# Flatten every dialogue's 'turns' list into a single list, keeping only the
# turns whose author is the user.
turns = [
    turn
    for dialogue_turns in frames['turns']
    for turn in dialogue_turns
    if turn['author'] == 'user'
]

print('Turn number:', len(turns))

Turn number: 10407


In [8]:
# Annotated slot key -> entity category used in the exported utterances.
# Slots that are not listed here are ignored.
SLOT_TO_CATEGORY = {
    'dst_city': 'destination',
    'or_city': 'origin',
    'str_date': 'go_date',
    'end_date': 'back_date',
    'budget': 'budget',
}


def build_utterance(turn):
    """Convert one user turn into an utterance record.

    Parameters
    ----------
    turn : dict
        A turn providing ``turn['text']`` and ``turn['labels']['acts']``,
        where each act carries a list of ``args`` dicts with a ``key`` and,
        optionally, a ``val``.

    Returns
    -------
    dict
        ``{'intent', 'language', 'text', 'entities'}``. ``text`` is the
        lower-cased turn text. Each entity is
        ``{'category', 'offset', 'length'}`` where ``offset`` is the index of
        the first occurrence of the lower-cased slot value in ``text``, or
        ``-1`` when the value does not appear verbatim.
    """
    text = turn['text'].lower()
    entities = []
    seen_spans = set()

    for act in turn['labels']['acts']:
        for arg in act['args']:
            category = SLOT_TO_CATEGORY.get(arg.get('key'))
            value = arg.get('val')
            # Skip slots we do not export, and values that cannot be located
            # in the text: a missing / non-string value would crash on
            # .lower(), and an empty string would "match" at offset 0 with
            # length 0, producing a bogus entity that survives the -1 filter.
            if category is None or not isinstance(value, str) or not value:
                continue

            value = value.lower()
            span = (category, text.find(value), len(value))
            # The same slot can be repeated across several acts of one turn;
            # emit each identical label only once.
            if span in seen_spans:
                continue
            seen_spans.add(span)

            entities.append({
                'category': span[0],
                'offset': span[1],
                'length': span[2]
            })

    # Every exported utterance is labelled 'BookFlight', regardless of the
    # intent annotated on the turn.
    return {
        'intent': 'BookFlight',
        'language': 'en-us',
        'text': text,
        'entities': entities
    }


utterances = [build_utterance(turn) for turn in turns]

print('Utterances initial number:', len(utterances))

# Filter utterances: keep only those that carry at least one entity and whose
# entities were all located in the text (an offset of -1 means "not found").
utterances_filtered = [
    candidate
    for candidate in utterances
    if len(candidate['entities']) != 0
    and all(entity['offset'] != -1 for entity in candidate['entities'])
]

# One row per distinct text; the first occurrence wins.
utterances_filtered = pd.DataFrame(utterances_filtered).drop_duplicates('text')

print('Filtered utterances shape:', utterances_filtered.shape)
display(utterances_filtered.sample(5))


# Shuffle with a fixed seed so that the train/test split, and therefore the
# exported files, are identical on every Restart & Run All.
SPLIT_SEED = 42
TRAIN_FRACTION = 0.9

utt_shuffled = utterances_filtered.sample(frac=1, random_state=SPLIT_SEED)
train_size = int(len(utt_shuffled) * TRAIN_FRACTION)

# Positional slices: the first TRAIN_FRACTION of the shuffled rows go to
# train, the remainder to test.
utt_train = utt_shuffled.iloc[:train_size]
print('Train shape:', utt_train.shape)
utt_test = utt_shuffled.iloc[train_size:]
print('Test shape:', utt_test.shape)

# Sanity check: the two parts must partition the shuffled frame.
assert len(utt_train) + len(utt_test) == len(utt_shuffled)

utt_train.to_json(utterances_train_path, orient='records')
utt_test.to_json(utterances_test_path, orient='records')

Utterances initial number: 10407
Filtered utterances shape: (4138, 4)


Unnamed: 0,intent,language,text,entities
2567,BookFlight,en-us,i like. how about trips to belo horizonte,"[{'category': 'destination', 'offset': 27, 'le..."
210,BookFlight,en-us,is there anything to mannheim?,"[{'category': 'destination', 'offset': 21, 'le..."
630,BookFlight,en-us,what about alexandria to salvador?,"[{'category': 'destination', 'offset': 25, 'le..."
1581,BookFlight,en-us,anytime after this saturday,"[{'category': 'go_date', 'offset': 8, 'length'..."
850,BookFlight,en-us,termina,"[{'category': 'origin', 'offset': 0, 'length':..."


Train shape: (3724, 4)
Test shape: (414, 4)
