In [1]:
# Reload imported modules automatically before each cell runs, so edits to
# local .py files are picked up without restarting the kernel.
%load_ext autoreload  
%autoreload 2 
# Render matplotlib figures inline in the cell output.
%matplotlib inline

In [2]:
import torch 
from tqdm import tqdm
import json
from datasets import load_dataset
from sentence_transformers import CrossEncoder

In [3]:
# Load the E2E NLG dataset through the `datasets` library; the result is
# indexed by split name further down (e.g. 'train').
dataset = load_dataset('e2e_nlg')


Using custom data configuration default
Reusing dataset e2e_nlg (/Users/garylai/.cache/huggingface/datasets/e2e_nlg/default/0.0.0/bfeceb720929c2705bd227d1cfe5eaaab102a0bdac10dad618dac1e00c737430)
100%|██████████| 3/3 [00:00<00:00, 111.11it/s]


In [4]:
# Keep only the training split and show how many samples it has.
# Gotcha: this rebinds `dataset`, so the cell is not idempotent -- re-run the
# load cell first before executing this one a second time.
dataset = dataset['train']
len(dataset)

42061

In [39]:
# Peek at the last sample to see the record layout
# ('meaning_representation' and 'human_reference').
dataset[-1]

{'meaning_representation': 'name[Aromi], eatType[restaurant], food[English], area[city centre]',
 'human_reference': 'Aromi is an English restaurant in the city centre.'}

In [36]:
def parse(s, first, last):
    """Split a ``key<first>value<last>`` string into a ``(key, value)`` pair.

    Args:
        s: Text such as ``"name[The Vaults]"``.
        first: Opening delimiter (may be longer than one character).
        last: Closing delimiter, searched for after the opening delimiter.

    Returns:
        ``(key, value)`` where ``key`` is everything before the first
        occurrence of ``first`` and ``value`` is the text between ``first``
        and the next occurrence of ``last``. If either delimiter is missing,
        the sentinel pair ``("nan", "nan")`` is returned.
    """
    try:
        open_at = s.index(first)
        start = open_at + len(first)
        end = s.index(last, start)
    except ValueError:
        # str.index raises ValueError when a delimiter is absent.
        return "nan", "nan"
    # Slice the key at the delimiter's own index rather than `start - 1`,
    # which is only correct for single-character opening delimiters.
    return s[:open_at], s[start:end]

In [40]:
# Peek at the first sample; it is used as the running example below.
dataset[0]

{'meaning_representation': 'name[The Vaults], eatType[pub], priceRange[more than £30], customer rating[5 out of 5], near[Café Adriatic]',
 'human_reference': 'The Vaults pub near Café Adriatic has a 5 star rating.  Prices start at £30.'}

In [61]:
from collections import deque


def process_sentence(sample):
    """Mark the entity values of a sample inside its reference sentence.

    Args:
        sample: Mapping with a ``'human_reference'`` sentence and a
            ``'meaning_representation'`` string of comma-separated
            ``key[value]`` pairs.

    Returns:
        A dict with:
          * ``"input"``: the sentence with every occurrence of the ``name``,
            ``near`` and ``area`` values wrapped in ``<...>``;
          * ``"output"``: a deque of the entity keys that were found, ordered
            by where their value first appears in the marked-up sentence.
    """
    sentence = sample['human_reference']

    # Parse "key[value], key[value], ..." into a dict.
    data = {}
    for pair in sample['meaning_representation'].split(", "):
        key, value = parse(pair, "[", "]")
        data[key] = value

    # Only these attributes are treated as entities.
    entities = ["name", "near", "area"]
    found_entities = []

    # Build the input sentence by wrapping each entity value that occurs in it.
    processed_sentence = sentence
    for entity in entities:
        value = data.get(entity)
        # Skip missing or empty values: replacing "" would insert "<>"
        # between every character of the sentence.
        if value and value in processed_sentence:
            processed_sentence = processed_sentence.replace(value, "<" + value + ">")
            found_entities.append(entity)

    # Order entities by first occurrence. Comparing only against the earliest
    # position seen so far mis-orders three entities (e.g. positions 50, 10,
    # 30), so sort on the position instead; sorted() is stable, so ties keep
    # the order of `entities`.
    output = deque(
        sorted(found_entities, key=lambda entity: processed_sentence.find(data[entity]))
    )

    return {
        "input": processed_sentence,
        "output": output
    }

In [62]:
# Sanity-check the function on the first sample.
process_sentence(dataset[0])

{'input': '<The Vaults> pub near <Café Adriatic> has a 5 star rating.  Prices start at £30.',
 'output': deque(['name', 'near'])}

In [67]:
# Run the entity marking over the first 3000 training samples.
processed_sentences = [
    process_sentence(dataset[idx]) for idx in tqdm(range(3000))
]

100%|██████████| 3000/3000 [00:00<00:00, 17799.83it/s]


In [70]:
# Show only the first few results; displaying the whole list floods the
# notebook output.
processed_sentences[:5]

[{'input': '<The Vaults> pub near <Café Adriatic> has a 5 star rating.  Prices start at £30.',
  'output': deque(['name', 'near'])},
 {'input': 'Close to <Café Brazil>, <The Cambridge Blue> pub serves delicious Tuscan Beef for the cheap price of £10.50. Delicious Pub food.',
  'output': deque(['near', 'name'])},
 {'input': '<The Eagle> is a low rated coffee shop near <Burger King> and the <riverside> that is family friendly and is less than £20 for Japanese food.',
  'output': deque(['name', 'near', 'area'])},
 {'input': 'Located near <The Sorrento> is a French Theme eatery and coffee shop called <The Mill>, with a price range at £20-£25 it is in the <riverside> area.',
  'output': deque(['near', 'name', 'area'])},
 {'input': 'For luxurious French food, the <Loch Fyne> is located by the river next to <The Rice Boat>.',
  'output': deque(['name', 'near'])},
 {'input': "<Bibimbap House> is a moderately priced restaurant who's main cuisine is English food. You will find this local gem nea

In [50]:
# `processed_sentence` is only a local inside process_sentence(), so recompute
# it for the first sample instead of relying on a leftover kernel variable.
processed_sentence = process_sentence(dataset[0])['input']
processed_sentence

'<The Vaults> pub near <Café Adriatic> has a 5 star rating.  Prices start at £30.'

In [42]:
# `data` is only a local inside process_sentence(), so rebuild the attribute
# dict of the first sample here instead of relying on a leftover kernel
# variable.
data = dict(
    parse(pair, "[", "]")
    for pair in dataset[0]['meaning_representation'].split(", ")
)
data

{'name': 'The Vaults',
 'eatType': 'pub',
 'priceRange': 'more than £30',
 'customer rating': '5 out of 5',
 'near': 'Café Adriatic'}

In [14]:
# Task description that is serialised to JSON by the export step below:
# metadata, a natural-language definition, worked positive/negative examples,
# and the task instances.
task_json = {
    "Contributors": [
        "Gary Haizhi Lai"
    ],
    "Source": [
        "e2e"
    ],
    "Categories": [
        "Text Generation"
    ],
    "Definition": "In this task, we ask you to parse restaurant descriptions into a structured data table of key-value pairs. Here are the attributes (keys) and their examples values: \n name: The Eagle,... \n eatType: restaurant, coffee shop,... \n familyFriendly: Yes / No \n priceRange: cheap, expensive,... \n food: French, Italian,... \n near: market square,... \n area: riverside, city center, ... \n customerRating: 1 of 5 (low), 4 of 5 (high), ...  \n The output table may contain all or only some of the attributes but must not contain unlisted attributes. For the output to be considered correct, it also must parse all of the attributes existant in the input sentence; in other words, incomplete parsing would be considered incorrect.",
    "Positive Examples": [
        # NOTE(review): this example's output is a list of entity labels, not
        # the key[value] table the Definition describes -- confirm which task
        # format is intended.
        {
            "input": "<Aromi> is an English restaurant in the <city centre>.",
            "output": "restaurant name, location",
            "explanation": "The output correctly parses all the parseable attributes in the input, no more, no less."
        },
        {
            "input": "The Rice Boat is a cheap Indian restaurant in the center of the city near Express by Holiday Inn. It is family friendly and has the highest customer rating.",
            "output": "name[The Rice Boat], food[Indian], priceRange[cheap], customer rating[5 out of 5], area[city centre], familyFriendly[yes], near[Express by Holiday Inn]",
            "explanation": "The output data table contains all the correct attributes and values from the input. All the attributes are as listed in the instruction; there are no made-up attributes."
        },
    ],
    "Negative Examples": [
        {
            "input": "Blue Spice is a coffee shop located by the riverside, near Avalon. Its prices are over £30. Its customer ratings are 5 out of 5.",
            "output": "name[Blue Spice], eatType[pub], priceRange[more than £30], customer rating[5 out of 5], area[Boston], familyFriendly[yes], near[Avalon]",
            "explanation": "The values for some of the attributes are incorrect e.g. \"pub\" and \"Boston\"."
        },
        {
            "input": "The Waterman is an expensive family-friendly Japanese restaurant with average customer rating in riverside",
            "output": "name[The Waterman], customer rating[average], area[riverside], familyFriendly[yes]",
            "explanation": "While the output correctly parses various attributes, it leaves out some parseable attributes such as \"food[Japanese]\" and \"priceRange[expensive]\"", 
        },
        {
 "input": "The Rice Boat is a cheap Indian restaurant in the center of the city near Express by Holiday Inn. It is family friendly and has the highest customer rating.",
            "output": "name[The Rice Boat], type[cheap Indian restaurant], food[Indian], priceRange[cheap], customer rating[5 out of 5], area[city centre], familyFriendly[yes], near[Express by Holiday Inn]",
            "explanation": "While most of the attributes were parsed correctly, \"type[cheap Indian restaurant]\" is a made-up attribute and therefore the output is considered incorrect."
 }
    ],
    # NOTE(review): `Instances` is not defined in any cell visible in this
    # notebook; it must exist in the kernel before this cell runs -- confirm
    # where it is built.
    "Instances": Instances
}

# export
# Serialise before opening the file so a failure in json.dumps does not leave
# a truncated/empty output file behind.
final_json = json.dumps(task_json, indent=4, ensure_ascii=False)
# ensure_ascii=False keeps characters such as "£" and "é" verbatim, so write
# the file as UTF-8 explicitly instead of relying on the locale's default
# encoding.
with open('task950_e2e_text_generation.json', 'w', encoding='utf-8') as fp:
    print(final_json, file=fp)