In [1]:
%load_ext autoreload  
%autoreload 2 
%matplotlib inline

In [2]:
import torch 
from tqdm import tqdm
import json
from datasets import load_dataset
from sentence_transformers import CrossEncoder

In [3]:
# get instances
dataset = load_dataset('e2e_nlg')


Using custom data configuration default
Reusing dataset e2e_nlg (/Users/garylai/.cache/huggingface/datasets/e2e_nlg/default/0.0.0/bfeceb720929c2705bd227d1cfe5eaaab102a0bdac10dad618dac1e00c737430)
100%|██████████| 3/3 [00:00<00:00, 163.85it/s]


In [4]:
dataset = dataset['train']; len(dataset)

42061

In [5]:
def parse( s, first, last ):
    try:
        start = s.index( first ) + len( first )
        end = s.index( last, start )
        key, value = s[:start-1],s[start:end]
        return key, value
    except ValueError:
        return "nan", "nan"

In [6]:
def parse_representation(sample):
    # parse data
    pairs = sample['meaning_representation'].split(", ")
    data = {}
    for pair in pairs: 
        key, value = parse(pair, "[", "]")
        data[key] = value
    return data

In [7]:
def get_sentence_subject(sample):
    data = parse_representation(sample)
    return data['name']

In [8]:
def get_instance(sample):
    return {
        "input": sample['human_reference'],
        "output": get_sentence_subject(sample)
    }

In [9]:
dataset[0]

{'meaning_representation': 'name[The Vaults], eatType[pub], priceRange[more than £30], customer rating[5 out of 5], near[Café Adriatic]',
 'human_reference': 'The Vaults pub near Café Adriatic has a 5 star rating.  Prices start at £30.'}

In [35]:
def swap_sentence_entity(data, sentence):
    near, name = data["near"], data["name"]
    sentence = sentence.replace(near, "xxx")
    sentence = sentence.replace(name, "yyy")
    sentence = sentence.replace("xxx", name)
    sentence = sentence.replace("yyy", near)
    return sentence


In [36]:
# sample = dataset[0]
# sentence = sample['human_reference']
# data = parse_representation(sample)
# print("sentence: ", sentence)
# print("swap_sentence_entity: ", swap_sentence_entity(data, sentence))

sentence:  The Vaults pub near Café Adriatic has a 5 star rating.  Prices start at £30.
swap_sentence_entity:  Café Adriatic pub near The Vaults has a 5 star rating.  Prices start at £30.


In [43]:
Instances = []
unique_sentences = set()
unique_names = set()
for i in tqdm(range(len(dataset))):
    sample = dataset[i]
    sentence = sample['human_reference']
    # prevent duplicates
    if sentence in unique_sentences:
        continue
    else: 
        unique_sentences.add(sentence)

    # only get sentences with multiple named entities
    data = parse_representation(sample)
    # get sentences sufficiently long / complex 
    if len(sentence) > 0:
        if "name" not in data or "near" not in data:
            continue
        # if there's a duplicate, check if swapping "name" and "near" resolves it
        if data["name"] in unique_names and i % 1000 != 0:
            if data["near"] in unique_names: continue
            else:
                unique_names.add(data["near"])
                Instances.append(
                    {
                        "input": swap_sentence_entity(data, sentence), 
                        "output": data["near"]
                    }
                )
        else: 
            unique_names.add(data["name"])
            Instances.append(
                {
                    "input": sentence, 
                    "output": data["name"]
                }
            )


# print(Instances)
print(len(Instances))

100%|██████████| 42061/42061 [00:01<00:00, 21327.27it/s]

63





In [44]:
unique_nears

{'All Bar One',
 'Burger King',
 'Café Adriatic',
 'Café Brazil',
 'Café Rouge',
 'Clare Hall',
 'Raja Indian Cuisine',
 'Ranch',
 'The Rice Boat',
 'The Six Bells',
 'The Sorrento',
 'Yippee Noodle Bar'}

In [21]:
unique_names

{'Alimentum',
 'Aromi',
 'Bibimbap House',
 'Blue Spice',
 'Browns Cambridge',
 'Clowns',
 'Cocum',
 'Cotto',
 'Fitzbillies',
 'Giraffe',
 'Green Man',
 'Loch Fyne',
 'Midsummer House',
 'Strada',
 'Taste of Cambridge',
 'The Cambridge Blue',
 'The Cricketers',
 'The Dumpling Tree',
 'The Eagle',
 'The Golden Curry',
 'The Golden Palace',
 'The Mill',
 'The Olive Grove',
 'The Phoenix',
 'The Plough',
 'The Punter',
 'The Rice Boat',
 'The Twenty Two',
 'The Vaults',
 'The Waterman',
 'The Wrestlers',
 'Travellers Rest Beefeater',
 'Wildwood',
 'Zizzi'}

In [None]:
# from collections import deque
# def process_sentence(sample):
#     sentence = sample['human_reference']
#     # parse data
#     pairs = sample['meaning_representation'].split(", ")
#     data = {}
#     for pair in pairs: 
#         key, value = parse(pair, "[", "]")
#         data[key] = value

#     # check entities
#     entities = ["name", "near", "area"]
#     found_entities = []

#     # get input sentence (add <> around entities)
#     processed_sentence = sentence
#     for entity in entities: 
#         if entity in data and data[entity] in processed_sentence:
#             processed_sentence = processed_sentence.replace(data[entity], "<" + data[entity] + ">")
#             found_entities.append(entity)

#     # sort entities by position
#     sorted_entities = deque()
#     earlest_position = float('inf')
#     for entity in found_entities:
#         position = processed_sentence.find(data[entity])
#         if position < earlest_position:
#             earlest_position = position
#             sorted_entities.appendleft(entity)
#         else:
#             sorted_entities.append(entity)
    
#     # map entity to labels (output)
#     entity_to_label = {
#         "name": "venue",
#         "near": "venue",
#         "area": "area"
#     }
#     output = [entity_to_label[entity] for entity in sorted_entities]

#     return {
#         "input": processed_sentence,
#         "output": output
#     }

In [None]:
# processed_sentences = []
# for i in tqdm(range(3000)):
#     processed_sentences.append(process_sentence(dataset[i]))

In [None]:
task_json = {
    "Contributors": [
        "Gary Haizhi Lai"
    ],
    "Source": [
        "e2e"
    ],
    "Categories": [
        "Text Generation"
    ],
    "Definition": "In this task, we ask you to identify the named entity that is the subject of the sentence. Note that there could be multiple named entities in the sentence - you must correctly pick the one that is the sentence's subject.",
    "Positive Examples": [
        {
            "input": "The Eagle is an inexpensive coffee shop near Burger King and the river. It is family-friendly and serves pasta.",
            "output": "The Eagle",
            "explanation": "The correct named entity is identified as the subject of the sentence."
        },
        {
            "input": "There is a pub called Strada which serves Italian food right across the street from Yippee Noodle Bar and has a 5 out of 5 customer rating.",
            "output": "Strada",
            "explanation": "The correct named entity is identified as the subject of the sentence. Yippee Noodle Bar is also a named entity but it's not the subject."
        },
    ],
    "Negative Examples": [
        {
            "input": "Dig Inn, located near Panda Express, is a highly rated restaurant among students from Columbia University.",
            "output": "Panda Express",
            "explanation": "While Panda Express is a named entity, it is not the subject of the sentence."
        },
        {
            "input": "Korea BBQ 669 is an expensive, family-friendly Korean restaurant located near the Bistro Cafe",
            "output": "Bistro Cafe",
            "explanation": "Bistro Cafe is not the subject of the sentence", 
        },
    ],
    "Instances": Instances
}

# export
with open('task951_e2e_text_generation.json', 'w') as fp:
    final_json = json.dumps(task_json, indent=4, ensure_ascii=False)
    print(final_json, file=fp)