In [7]:
from datasets import load_dataset
# Load the XSum dataset through the Hugging Face `datasets` library
# (reused from the local cache when it has already been downloaded).
xsum = load_dataset("xsum")
# Display the (rows, columns) shape of every split.
xsum.shape

Using custom data configuration default
Reusing dataset xsum (/Users/anton164/.cache/huggingface/datasets/xsum/default/1.2.0/32c23220eadddb1149b16ed2e9430a05293768cfffbdfd151058697d4c11f934)
100%|██████████| 3/3 [00:00<00:00, 116.65it/s]


{'train': (204045, 3), 'validation': (11332, 3), 'test': (11334, 3)}

### Load dataset from paper

In [27]:
import json

# Read the paper's train and validation splits (JSON Lines: one JSON object
# per line) and report, for each file, how many records have at least one
# positive example and how many have at least one negative example.
paper_datasets = []  # [train records, validation records], in the order listed below
for dataset_name in ["./data/paper/train.jsonl", "./data/paper/val.jsonl"]:
    print(dataset_name)
    dataset = []
    total_size = 0
    with_positive_examples = 0
    with_negative_examples = 0
    # JSON text is UTF-8; pin the encoding so the read does not depend on the
    # platform's locale default.
    with open(dataset_name, "r", encoding="utf-8") as f:
        for line in f:
            # Skip blank lines (e.g. an extra empty line at the end of the
            # file), which json.loads would reject.
            if not line.strip():
                continue
            total_size += 1
            obj = json.loads(line)
            if len(obj["positive_examples"]) > 0:
                with_positive_examples += 1
            if len(obj["negative_examples"]) > 0:
                with_negative_examples += 1
            dataset.append(obj)
    paper_datasets.append(dataset)
    print("total size", total_size)
    print("with positive examples", with_positive_examples)
    print("with negative examples", with_negative_examples)
    print()

./data/paper/train.jsonl
total size 194436
with positive examples 194436
with negative examples 34262

./data/paper/val.jsonl
total size 11315
with positive examples 11315
with negative examples 10080



In [32]:
# paper_datasets[0] holds the records read from ./data/paper/train.jsonl
# (the first path in the loading loop).
train_set = paper_datasets[0]
# Inspect one raw record to see its fields.
train_set[0]

{'source_text': 'The problem is affecting people using the older versions of the PlayStation 3, called the "Fat" model.The problem isn\'t affecting the newer PS3 Slim systems that have been on sale since September last year.Sony have also said they are aiming to have the problem fixed shortly but is advising some users to avoid using their console for the time being."We hope to resolve this problem within the next 24 hours," a statement reads. "In the meantime, if you have a model other than the new slim PS3, we advise that you do not use your PS3 system, as doing so may result in errors in some functionality, such as recording obtained trophies, and not being able to restore certain data."We believe we have identified that this problem is being caused by a bug in the clock functionality incorporated in the system."The PlayStation Network is used by millions of people around the world.It allows users to play their friends at games like Fifa over the internet and also do things like dow

## Testing the assumption: every entity in a positive example is contained in the source document

In [35]:
from preprocessing.entity import extract_entities, entity_overlap

In [37]:
# Work with the first training record: its article text and its first
# positive example (treated here as the reference summary).
source, gt_summary = (
    train_set[0]["source_text"],
    train_set[0]["positive_examples"][0],
)

# Extract entities from the article first, then from the summary.
source_ents, gt_summary_ents = (
    extract_entities(text) for text in (source, gt_summary)
)

# Show the summary followed by the entities found in it.
print(gt_summary)
gt_summary_ents

Sony has told owners of older models of its PlayStation 3 console to stop using the machine because of a problem with the PlayStation Network.


[('ORG', 'Sony'),
 ('PRODUCT', 'PlayStation 3'),
 ('ORG', 'the PlayStation Network')]

In [42]:
# Show the article text followed by the entities found in it.
print(source)
# Reuse the entities already extracted from `source` when it was defined,
# instead of running entity extraction on the same text a second time.
source_ents

The problem is affecting people using the older versions of the PlayStation 3, called the "Fat" model.The problem isn't affecting the newer PS3 Slim systems that have been on sale since September last year.Sony have also said they are aiming to have the problem fixed shortly but is advising some users to avoid using their console for the time being."We hope to resolve this problem within the next 24 hours," a statement reads. "In the meantime, if you have a model other than the new slim PS3, we advise that you do not use your PS3 system, as doing so may result in errors in some functionality, such as recording obtained trophies, and not being able to restore certain data."We believe we have identified that this problem is being caused by a bug in the clock functionality incorporated in the system."The PlayStation Network is used by millions of people around the world.It allows users to play their friends at games like Fifa over the internet and also do things like download software or 

[('PRODUCT', 'the PlayStation 3'),
 ('PRODUCT', 'Fat'),
 ('ORG', 'PS3 Slim'),
 ('DATE', 'September last year'),
 ('ORG', 'Sony'),
 ('TIME', 'the next 24 hours'),
 ('PRODUCT', 'PS3'),
 ('PRODUCT', 'PS3'),
 ('ORG', 'The PlayStation Network'),
 ('CARDINAL', 'millions'),
 ('ORG', 'Fifa')]

In [45]:
# Re-display the summary's entities (presumably for comparison with the
# source entities listed above).
gt_summary_ents

[('ORG', 'Sony'),
 ('PRODUCT', 'PlayStation 3'),
 ('ORG', 'the PlayStation Network')]

In [None]:
import stanza
from stanza import Document
# Build the English Stanza pipeline once; later cells call it as nlp(text).
# NOTE(review): this uses the default processor set; if only named entities
# are needed downstream, a reduced set may be enough — confirm before changing.
nlp = stanza.Pipeline('en')

In [52]:
# Run the Stanza pipeline over the article and echo the text the resulting
# Document holds, read through the public `text` property rather than the
# private `_text` attribute.
src_doc = nlp(source)
src_doc.text

'The problem is affecting people using the older versions of the PlayStation 3, called the "Fat" model.The problem isn\'t affecting the newer PS3 Slim systems that have been on sale since September last year.Sony have also said they are aiming to have the problem fixed shortly but is advising some users to avoid using their console for the time being."We hope to resolve this problem within the next 24 hours," a statement reads. "In the meantime, if you have a model other than the new slim PS3, we advise that you do not use your PS3 system, as doing so may result in errors in some functionality, such as recording obtained trophies, and not being able to restore certain data."We believe we have identified that this problem is being caused by a bug in the clock functionality incorporated in the system."The PlayStation Network is used by millions of people around the world.It allows users to play their friends at games like Fifa over the internet and also do things like download software o

In [56]:
from preprocessing.make_entity_perturbations import make_perturbations

# Annotate the article and the reference summary with Stanza, then make sure
# each Document's entity list (`.ents`) is built before it is passed on.
src_doc = nlp(source)
src_doc.build_ents()

tgt_doc = nlp(gt_summary)
tgt_doc.build_ents()

# Generate perturbed (negative) versions of the summary from the summary's
# and the article's entities. The summary text is read through the public
# `text` property instead of the private `_text` attribute.
# NOTE(review): if make_perturbations samples randomly, seed it so this cell
# reproduces the same examples — confirm against its implementation.
neg_examples, changed_list = make_perturbations(
    target_text=tgt_doc.text,
    target_ents=tgt_doc.ents,
    source_ents=src_doc.ents,
    is_training_mode=True,
    max_perturbation_per_example=10,
)
neg_examples, changed_list

(['The PlayStation Network has told owners of older models of its PlayStation 3 console to stop using the machine because of a problem with the PlayStation Network.',
  'Fifa has told owners of older models of its PlayStation 3 console to stop using the machine because of a problem with the PlayStation Network.',
  'PS3 Slim has told owners of older models of its PlayStation 3 console to stop using the machine because of a problem with the PlayStation Network.',
  'Sony has told owners of older models of its Fat console to stop using the machine because of a problem with the PlayStation Network.',
  'Sony has told owners of older models of its PS3 console to stop using the machine because of a problem with the PlayStation Network.',
  'Sony has told owners of older models of its PlayStation 3 console to stop using the machine because of a problem with Sony.',
  'Sony has told owners of older models of its PlayStation 3 console to stop using the machine because of a problem with Fifa.',

In [57]:
# Display the stored record again (presumably to compare its examples with
# the perturbations generated above).
train_set[0]

{'source_text': 'The problem is affecting people using the older versions of the PlayStation 3, called the "Fat" model.The problem isn\'t affecting the newer PS3 Slim systems that have been on sale since September last year.Sony have also said they are aiming to have the problem fixed shortly but is advising some users to avoid using their console for the time being."We hope to resolve this problem within the next 24 hours," a statement reads. "In the meantime, if you have a model other than the new slim PS3, we advise that you do not use your PS3 system, as doing so may result in errors in some functionality, such as recording obtained trophies, and not being able to restore certain data."We believe we have identified that this problem is being caused by a bug in the clock functionality incorporated in the system."The PlayStation Network is used by millions of people around the world.It allows users to play their friends at games like Fifa over the internet and also do things like dow

In [19]:
import json

# Sanity check: count the raw lines in the validation file.
with open("./data/paper/val.jsonl", "r") as f:
    n = sum(1 for _ in f)
print(n)

11315
