In [1]:
import json

# Load the JSON Lines dataset: every non-empty line holds one JSON object.
# The context manager closes the file handle deterministically instead of
# leaving it open until garbage collection.
with open('data/en-US.jsonl', 'r', encoding='utf-8') as jsonl_file:
    # Skip whitespace-only lines, which json.loads would reject.
    records = [json.loads(line) for line in jsonl_file if line.strip()]


In [2]:
# Distinct intent labels that occur in the dataset.
{record["intent"] for record in records}

{'alarm_query',
 'alarm_remove',
 'alarm_set',
 'audio_volume_down',
 'audio_volume_mute',
 'audio_volume_other',
 'audio_volume_up',
 'calendar_query',
 'calendar_remove',
 'calendar_set',
 'cooking_query',
 'cooking_recipe',
 'datetime_convert',
 'datetime_query',
 'email_addcontact',
 'email_query',
 'email_querycontact',
 'email_sendemail',
 'general_greet',
 'general_joke',
 'general_quirky',
 'iot_cleaning',
 'iot_coffee',
 'iot_hue_lightchange',
 'iot_hue_lightdim',
 'iot_hue_lightoff',
 'iot_hue_lighton',
 'iot_hue_lightup',
 'iot_wemo_off',
 'iot_wemo_on',
 'lists_createoradd',
 'lists_query',
 'lists_remove',
 'music_dislikeness',
 'music_likeness',
 'music_query',
 'music_settings',
 'news_query',
 'play_audiobook',
 'play_game',
 'play_music',
 'play_podcasts',
 'play_radio',
 'qa_currency',
 'qa_definition',
 'qa_factoid',
 'qa_maths',
 'qa_stock',
 'recommendation_events',
 'recommendation_locations',
 'recommendation_movies',
 'social_post',
 'social_query',
 'takeaway_o

In [3]:
INTENT = "recommendation_movies"

# Gather the raw utterance text of every record labelled with INTENT.
examples = []
for record in records:
    if record["intent"] == INTENT:
        examples.append(record["utt"])

print(len(examples))
examples[:5]

102


['should i watch this movie',
 'what film should i watch',
 'what is a good iranian film from two thousand and sixteen',
 'what is a good movie to see right now',
 'top ten movies']

In [4]:
# Intent labels selected for the rest of the notebook, one per line.
INTENTS = [
    "alarm_set",
    "audio_volume_down",
    "datetime_query",
    "general_greet",
    "iot_hue_lightoff",
    "weather_query",
    "recommendation_movies",
]
len(INTENTS)

7

In [5]:
# Bucket the records of each selected intent; keys follow the order of INTENTS.
intent2records = {name: [] for name in INTENTS}
for r in records:
    bucket = intent2records.get(r["intent"])
    # Records whose intent was not selected have no bucket and are ignored.
    if bucket is not None:
        bucket.append(r)

# Per-intent sizes: total, the 70% (rounded down) used for training, and the rest.
num_records = [len(group) for group in intent2records.values()]
num_train = [int(count * 0.7) for count in num_records]
num_test = [total - kept for total, kept in zip(num_records, num_train)]

num_records, num_train, num_test

([254, 71, 502, 28, 213, 855, 102],
 [177, 49, 351, 19, 149, 598, 71],
 [77, 22, 151, 9, 64, 257, 31])

In [6]:
# Map each intent to its training-set size.
# `num_train` is rebound here from a list to a dict. Guard the conversion so
# that re-running this cell on its own is harmless: without the guard, zipping
# INTENTS with the dict would pair each intent with a dict *key*, producing
# {intent: intent} instead of {intent: count}.
if not isinstance(num_train, dict):
    num_train = dict(zip(INTENTS, num_train))
num_train

{'alarm_set': 177,
 'audio_volume_down': 49,
 'datetime_query': 351,
 'general_greet': 19,
 'iot_hue_lightoff': 149,
 'weather_query': 598,
 'recommendation_movies': 71}

In [7]:
def reformat(entity_string):
    """Convert a ``"type : value"`` annotation into ``"[value](type)"`` form.

    Args:
        entity_string: Text of the form ``"<entity type> : <entity text>"``.

    Returns:
        ``"[<entity text>](<entity type>)"``, where both parts are stripped of
        surrounding whitespace and spaces inside the entity text are replaced
        by underscores.

    Raises:
        ValueError: If ``entity_string`` contains no ``":"`` separator.
    """
    # Split on the first colon only, so entity text that itself contains a
    # colon (e.g. "5:30 pm") no longer breaks the two-way unpacking.
    entity_type, entity = entity_string.split(":", 1)
    entity = entity.strip().replace(" ", "_")
    entity_type = entity_type.strip()
    return f"[{entity}]({entity_type})"

reformat("time : five am")

'[five_am](time)'

In [8]:
import re

def reformat_utt(annot_utt):
    """Rewrite every ``[type : value]`` annotation in an utterance.

    Each bracketed span matching the annotation pattern is replaced by the
    result of ``reformat`` applied to the text between the brackets. Text
    that does not match the pattern is returned unchanged.
    """
    def _swap(found):
        # Group 1 is the text between the brackets, i.e. "type : value".
        return reformat(found.group(1))

    return re.sub(r'\[([a-zA-Z0-9\'\._ ]*:[a-zA-Z0-9\'\._ ]*)\]', _swap, annot_utt)

reformat_utt("wake me up at [time : five am] [date : this week]")

'wake me up at [five_am](time) [this_week](date)'

In [9]:
# Apostrophes are accepted by the annotation pattern, so the doubled
# apostrophe in the input is carried through to the output unchanged.
reformat_utt("tell me [date : today''s] date")

"tell me [today''s](date) date"

In [10]:
# Preview the converted annotations of the first ten records.
[reformat_utt(record["annot_utt"]) for record in records[:10]]

['wake me up at [five_am](time) [this_week](date)',
 'wake me up at [nine_am](time) on [friday](date)',
 'set an alarm for [two_hours_from_now](time)',
 'quiet',
 'olly quiet',
 'stop',
 'olly pause for [ten_seconds](time)',
 'pause for [ten_seconds](time)',
 '[pink](color_type) is all we need',
 'make the lighting bit more [warm](color_type) here']

In [11]:
# For each intent, the first num_train[intent] converted utterances form the
# training set and the remaining ones form the test set (file order is kept).
train, test = {}, {}
for name, cutoff in num_train.items():
    converted = [reformat_utt(rec["annot_utt"]) for rec in intent2records[name]]
    train[name] = converted[:cutoff]
    test[name] = converted[cutoff:]
train

{'alarm_set': ['wake me up at [five_am](time) [this_week](date)',
  'wake me up at [nine_am](time) on [friday](date)',
  'set an alarm for [two_hours_from_now](time)',
  'set an alarm for [twelve](time)',
  'set an alarm [forty_minutes_from_now](time)',
  'set alarm for [eight](time) [every_weekday](general_frequency)',
  'set alarm at [ten_am](time)',
  'set alarm [tomorrow](date) at [six_am](time)',
  'set wake up [thursday](date) [seven_am](time)',
  'set an alarm at [six](time) in the [morning](timeofday)',
  'set an alarm for [four](time) in the [afternoon](timeofday)',
  'olly alert me at [three_p._m.](time) to go to the [concert](event_name)',
  'alert me at [three_p._m.](time) to go to the [concert](event_name)',
  'set an alarm for me at [eight_am](time)',
  'make an alarm for [four_p._m.](time)',
  'make me a wake up alarm for [eight_forty_five_am](time)',
  'set an alarm for [nine_am](time)',
  'set a wake up call for [ten_am](time)',
  'i need a reminder alarm at [one](time

In [12]:
# Inspect the test-split utterances of a single intent.
test["weather_query"]

['what is [home_town](place_name) weather',
 'how [hot](weather_descriptor) is [today](date)',
 'how [cold](weather_descriptor) is [today](date)',
 'what is the weather for [this_week](date)',
 'is it [rainy](weather_descriptor) in my area',
 "what's the weather now",
 'status of weather',
 'what was the weather for the [week](time)',
 "how is [this_week's](date) weather",
 'it seems [cold](weather_descriptor)',
 'do i need a [coat](weather_descriptor)',
 'what is the forecast for [saturday](date)',
 'do i need a [raincoat](weather_descriptor) [day_after](date)',
 'what will be the weather [day_after](date)',
 "what's the [temperature](weather_descriptor) outside",
 'is it [hot_or_cold](weather_descriptor) outside',
 'could you please tell me what is the weather predication for [this_week](date)',
 'please tell what would be the weather in [washington_d._c.](place_name) for [the_coming_week](time)',
 'please tell me what will be [rainy](weather_descriptor) day based on the weather fore

In [13]:
# Assemble the structure to be serialised: one entry per intent carrying
# that intent's training utterances.
nlu_examples = [
    {"intent": intent, "examples": ex}
    for intent, ex in train.items()
]
nlu_yaml = {"version": "3.1", "nlu": nlu_examples}
# nlu_yaml

In [14]:
import yaml
# This doesn't give quite the result we want, but it is close enough that a
# quick bit of hand editing gets the rest of the way there.
# sort_keys=False keeps mapping keys in insertion order ("version" before
# "nlu", "intent" before "examples") instead of PyYAML's default alphabetical
# ordering. The explicit encoding makes the written file independent of the
# platform's default locale.
with open('data/nlu.yml', 'w', encoding='utf-8') as f:
    yaml.dump(nlu_yaml, f, default_flow_style=False, sort_keys=False)