In [1]:
#Filter XSUM DATASET
# If first time, use this.
# !python -m spacy download en_core_web_lg
import spacy
import re
from spacy.lang.en.stop_words import STOP_WORDS

"""
https://github.com/amazon-research/fact-check-summarization
"""

def entity_match(ent, source, level=2):
    """Return the parts of an entity mention that also occur in ``source``.

    Every comparison is a case-insensitive, literal substring search: the
    candidate is regex-escaped and no word boundaries are enforced, so a
    candidate also matches when it appears inside a longer word.

    Matching proceeds in two stages:

    1. Contiguous sub-phrases of ``ent`` made of at least two tokens are
       tried, longest first. As soon as one phrase length yields a match,
       every matching phrase of that length is returned.
    2. If no multi-token phrase matches, individual tokens are tried.
       Tokens whose lowercase form is in ``STOP_WORDS`` are skipped, except
       the exact token ``"US"``, which is always tried.
       NOTE(review): because the search ignores case and word boundaries,
       ``"US"`` matches any "us" inside ``source`` -- confirm this is intended.

    Args:
        ent: Entity text; it is split on whitespace into tokens.
        source: Text in which the entity (or parts of it) is searched.
        level: Accepted for call compatibility; this implementation does
            not read it.

    Returns:
        A list of the matched phrases/tokens (spelled as in ``ent``), or an
        empty list when nothing matches.
    """
    tokens = ent.split()

    # Stage 1: n-grams with at least two tokens, longest first.
    for size in range(len(tokens), 1, -1):
        hits = []
        for start in range(len(tokens) - size + 1):
            phrase = " ".join(tokens[start:start + size])
            if re.search(re.escape(phrase), source, re.IGNORECASE):
                hits.append(phrase)
        if hits:
            # Only the longest matching phrase length is reported.
            return hits

    # Stage 2: fall back to single, non-stop-word tokens.
    return [
        token
        for token in tokens
        if (token.lower() not in STOP_WORDS or token == "US")
        and re.search(re.escape(token), source, re.IGNORECASE)
    ]



def _load_pipeline():
    """Return the spaCy pipeline used for entity recognition.

    A module-level ``nlp`` object is reused when one exists; otherwise the
    ``en_core_web_lg`` model is loaded once and stored as ``nlp`` so later
    calls reuse it. This keeps ``select_example`` working on a fresh kernel
    where no cell has created ``nlp`` yet.
    """
    pipeline = globals().get("nlp")
    if pipeline is None:
        pipeline = spacy.load("en_core_web_lg")
        globals()["nlp"] = pipeline
    return pipeline


def select_example(intro, abstract, filter_level=2):
    """Decide whether a (document, summary) pair passes the entity check.

    The summary is run through the spaCy pipeline. For every recognised
    entity whose first token has one of the tracked types, ``entity_match``
    must find at least part of the entity text in ``intro``. The first
    tracked entity without a match rejects the example.

    Summaries that contain no tracked entities are accepted.

    Args:
        intro: Source document text the summary is checked against.
        abstract: Summary text whose entities are extracted.
        filter_level: Forwarded to ``entity_match`` as its ``level`` argument.

    Returns:
        True if every tracked entity in ``abstract`` is matched in ``intro``,
        otherwise False.
    """
    entities_to_track = {'PERSON', 'FAC', 'GPE', 'ORG', 'NORP', 'LOC', 'EVENT'}
    doc = _load_pipeline()(abstract)
    for e in doc.ents:
        # The entity's type is read from its first token.
        if e[0].ent_type_ not in entities_to_track:
            continue
        if not entity_match(e.text, intro, filter_level):
            return False
    return True

In [3]:
from datasets import load_dataset
# Fetch the XSum dataset; the first call downloads it into the local
# Hugging Face cache and later calls reuse that copy. The cells below read
# its 'train', 'validation' and 'test' splits.
xsum = load_dataset('xsum')

Using custom data configuration default


Downloading and preparing dataset xsum/default (download: 245.38 MiB, generated: 507.60 MiB, post-processed: Unknown size, total: 752.98 MiB) to /home/geoff/.cache/huggingface/datasets/xsum/default/1.2.0/32c23220eadddb1149b16ed2e9430a05293768cfffbdfd151058697d4c11f934...


  0%|          | 0/2 [00:00<?, ?it/s]

Downloading:   0%|          | 0.00/255M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.00M [00:00<?, ?B/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

Dataset xsum downloaded and prepared to /home/geoff/.cache/huggingface/datasets/xsum/default/1.2.0/32c23220eadddb1149b16ed2e9430a05293768cfffbdfd151058697d4c11f934. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [7]:
def check_entity_agreement(example, select_example=select_example):
    """Tag one record with the result of the entity-agreement check.

    Args:
        example: Mapping with a 'document' and a 'summary' entry; it is
            updated in place.
        select_example: Callable taking (document, summary) and returning
            the flag to store; bound as a default argument so the function
            carries its own reference to it.

    Returns:
        The same ``example`` with an added 'keep' entry.
    """
    document_text = example['document']
    summary_text = example['summary']
    example['keep'] = select_example(document_text, summary_text)
    return example
# NOTE: this was the biggest pain to parallelize, so each split is mapped
# with a plain `.map` call (no `num_proc` is passed).
# Each call runs `check_entity_agreement` on every record, which adds a
# 'keep' entry to it. Despite the `_filtered` suffix, no rows are dropped
# here; the results are the original splits plus the 'keep' flag.
xsum_train_filtered = xsum['train'].map(check_entity_agreement)
xsum_val_filtered = xsum['validation'].map(check_entity_agreement)
xsum_test_filtered = xsum['test'].map(check_entity_agreement)

  0%|          | 0/204045 [00:00<?, ?ex/s]

  0%|          | 0/11332 [00:00<?, ?ex/s]

  0%|          | 0/11334 [00:00<?, ?ex/s]

In [8]:
import os
# Make sure the local output root exists before any split is written.
os.makedirs('data', exist_ok=True)

# Write each annotated split (rows still carry the 'keep' flag) to its own
# directory, in train / val / test order.
for annotated_split, target_dir in (
    (xsum_train_filtered, "data/xsum_filtered/train"),
    (xsum_val_filtered, "data/xsum_filtered/val"),
    (xsum_test_filtered, "data/xsum_filtered/test"),
):
    annotated_split.save_to_disk(target_dir)

# Save to S3

In [2]:
import sagemaker

# Start with a default session so its default bucket can be looked up.
sess = sagemaker.Session()
# SageMaker session bucket -> used for uploading data, models and logs.
# SageMaker will automatically create this bucket if it does not exist.
# Set this to a bucket name to use that bucket instead of the default one.
sagemaker_session_bucket=None
if sagemaker_session_bucket is None and sess is not None:
    # Fall back to the session's default bucket when no bucket name is given.
    sagemaker_session_bucket = sess.default_bucket()

# Execution role of the current environment (its ARN is printed below).
role = sagemaker.get_execution_role()
# Re-create the session so it is explicitly bound to the bucket chosen above.
sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

# Echo the resolved configuration for a quick sanity check.
print(f"sagemaker role arn: {role}")
print(f"sagemaker bucket: {sess.default_bucket()}")
print(f"sagemaker session region: {sess.boto_region_name}")

sagemaker role arn: arn:aws:iam::448807757624:role/service-role/AmazonSageMaker-ExecutionRole-20211202T101582
sagemaker bucket: sagemaker-us-east-2-448807757624
sagemaker session region: us-east-2


In [4]:
import botocore
from datasets.filesystems import S3FileSystem
from datasets import load_from_disk

# Filesystem handle passed as `fs=` when the filtered splits are saved to S3.
s3 = S3FileSystem()

# Reload the annotated splits from disk and keep only the rows flagged with
# keep == True. The public `Dataset.filter` API is used instead of assigning
# to the private `_data` table, so the dataset's own bookkeeping stays
# consistent with the rows it actually holds.
# NOTE(review): only 'train' and 'test' are loaded here; the 'val' split
# written to data/xsum_filtered/val is not used -- confirm that is intended.
xsum_filtered_train = load_from_disk('data/xsum_filtered/train').filter(
    lambda example: example['keep']
)

xsum_filtered_test = load_from_disk('data/xsum_filtered/test').filter(
    lambda example: example['keep']
)

In [6]:
# Upload the filtered train and test splits to the session's S3 bucket.
# The upload is opt-in: flip `do_save` to True to actually write to S3.

do_save = False

if not do_save:
    print('Please set do_save to True if you really want to save.')
else:
    s3_prefix = 'datasets/xsum_filtered'

    # Train split -> s3://<default bucket>/datasets/xsum_filtered/train
    training_input_path = f's3://{sess.default_bucket()}/{s3_prefix}/train'
    xsum_filtered_train.save_to_disk(training_input_path, fs=s3)

    # Test split -> s3://<default bucket>/datasets/xsum_filtered/test
    test_input_path = f's3://{sess.default_bucket()}/{s3_prefix}/test'
    xsum_filtered_test.save_to_disk(test_input_path, fs=s3)

Please set do_save to True if you really want to save.
