# Load datasets
---

In [1]:
import datasets
from xsum_dataset import XsumDataset

In [2]:
xsum_data_raw = datasets.load_dataset("xsum")

Using custom data configuration default
Reusing dataset xsum (/home/wk247/.cache/huggingface/datasets/xsum/default/1.2.0/32c23220eadddb1149b16ed2e9430a05293768cfffbdfd151058697d4c11f934)


  0%|          | 0/3 [00:00<?, ?it/s]

In [3]:
# train/val/test data
# xsum_train_data = XsumDataset(xsum_data_raw["train"])
xsum_val_data = XsumDataset(xsum_data_raw["validation"])
xsum_test_data = XsumDataset(xsum_data_raw["test"])

# concat data
xsum_data_raw_cc = datasets.concatenate_datasets(
    [xsum_data_raw["train"], xsum_data_raw["validation"], xsum_data_raw["test"]]
    )
xsum_concat_data = XsumDataset(xsum_data_raw_cc)

## select a sample

In [4]:
from generate_xsum_summary import load_summarization_model_and_tokenizer, generate_summaries, generate_token_entropy_metadata
import random

random.seed(0)

### * one to be perturbed

In [5]:
# sample one bbcid
bbc_id = random.choice(list(xsum_test_data.data_by_id.keys()))
# or fix one: bbc_ids = ["33858956"]
bbc_id

'35616768'

In [6]:
# selected_data - dict with keys: (id, document, true_summary, (factuality_data, faithfulness_data))
selected_data = xsum_test_data.data_by_id[bbc_id]

# original_docs - documents to sumamrize
original_doc = selected_data["document"]
print("original doc to summarize:\n", original_doc)

original doc to summarize:
 The agreement, reached late on Friday after two days of talks in Brussels, gives the UK power to limit some EU migrants' benefits.
It also includes a treaty change so the UK is not bound to "ever closer union" with other EU member states, he said.
EU exit campaigners said the "hollow" deal offered only "very minor changes".
Mr Cameron is set to the announce the date of a referendum on whether Britain should remain in the EU after a cabinet meeting which is happening at 10:00 GMT - the referendum is widely expected to be on Thursday, 23 June.
Once the date is announced, ministers will be allowed to campaign for whichever side they want - one of Mr Cameron's closest political allies Michael Gove has already been named as supporting the Leave camp. Others, such as Iain Duncan Smith are expected to follow - but a question mark remains over which way London Mayor Boris Johnson will jump.
The key points of the deal are:
The prime minster had to make concessions to

### *one to perturb

In [4]:
import random

In [5]:
# sample one bbcid
ptb_id = random.choice(xsum_val_data.ids)
print("ptb_id:", ptb_id)

ptb_id: 29703870


In [42]:
# selected_data - dict with keys: (id, document, true_summary, (factuality_data, faithfulness_data))
ptb_selected_data = xsum_val_data.data_by_id[ptb_id]

# ptb_docs - documents to sumamrize
ptb_doc = ptb_selected_data["document"]
print("ptb_doc to sample ood sentences:\n", ptb_doc)

ptb_doc to sample ood sentences:
 Tait, who picked up a medal alongside Kate Hornsey in the women's pair, was diagnosed with cervical cancer in 2013.
Australian Rowing head coach Chris O'Brien said: "Sarah has been, and will continue to be, an inspiration to many Australian rowers.
"She was a strong role model to young athletes, and particularly influential with our emerging female athletes."
Tait and Hornsey finished behind Great Britain's Heather Stanning and Helen Glover in the 2012 pairs final, which was GB's first gold of London 2012.
Glover became friends with Tait following the final and told BBC World Service she was an "incredible personality".
She added: "To achieve so much in her too short life - two beautiful children, a husband, an Olympic silver medal, the list goes on.
"Above all that, she was an inspirational and truly lovely person that has paved the way for women in sport at home in Australia and all over the world."


### * insert sents

In [43]:
sample_sents_n = 2

In [44]:
ptb_sentences = random.choices(ptb_doc.split("\n"), k=sample_sents_n)
print(ptb_sentences)

['Australian Rowing head coach Chris O\'Brien said: "Sarah has been, and will continue to be, an inspiration to many Australian rowers.', 'She added: "To achieve so much in her too short life - two beautiful children, a husband, an Olympic silver medal, the list goes on.']


In [45]:
original_sents = original_doc.split("\n")
len(original_sents)

35

In [46]:
for ptb_sent in ptb_sentences:
    insert_idx = random.choice(range(len(original_sents)))
    print("insert_idx", insert_idx)
    original_sents.insert(insert_idx, ptb_sent)

insert_idx 32
insert_idx 20


In [47]:
len(original_sents)

37

In [48]:
ood_doc = u"\n".join(original_sents)

## generate summary from ood document

### load model and tokenizer

In [61]:
model_name = "facebook/bart-large-xsum"
model, tokenizer = load_summarization_model_and_tokenizer(model_name)

In [62]:
import time

In [71]:
beam_size = 100

In [72]:
# generate summaries and metadata
start = time.time()
gen_summaries, gen_metadata = generate_summaries(
    model,
    tokenizer,
    original_doc,
    num_beams=beam_size,
    return_generation_metadata=True
)
end = time.time()
print(f"time: {end - start}")

time: 3.0090742111206055


In [73]:
# generate summaries and metadata
start = time.time()
ood_summaries, ood_metadata = generate_summaries(
    model,
    tokenizer,
    ood_doc,
    num_beams=beam_size,
    return_generation_metadata=True
)
end = time.time()
print(f"time: {end - start}")

time: 2.862010955810547


In [74]:
gen_metadata

[[{'token_id': tensor(37703, device='cuda:0'),
   'token': 'Twenty',
   'entropy': 5.396252632141113,
   'beam_token_prob': 0.06844499707221985,
   'beam_idx': 0,
   'beam_top_probs': [{'token': 'He',
     'token_id': 894,
     'beam_token_prob': 0.0970643162727356},
    {'token': 'The', 'token_id': 133, 'beam_token_prob': 0.08087211847305298},
    {'token': 'It', 'token_id': 243, 'beam_token_prob': 0.0751330628991127}],
   'token_in_input': True},
  {'token_id': tensor(107, device='cuda:0'),
   'token': ' years',
   'entropy': 1.4511500597000122,
   'beam_token_prob': 0.8674792051315308,
   'beam_idx': 4,
   'beam_top_probs': [{'token': ' years',
     'token_id': 107,
     'beam_token_prob': 0.8674792051315308},
    {'token': '-', 'token_id': 12, 'beam_token_prob': 0.027631521224975586},
    {'token': ' two',
     'token_id': 80,
     'beam_token_prob': 0.0023367642425000668}],
   'token_in_input': True},
  {'token_id': tensor(536, device='cuda:0'),
   'token': ' ago',
   'entropy': 2

In [None]:
# generate token entropy metadata
gen_token_entropy_metadata = generate_token_entropy_metadata(bbc_ids, gen_metadata)

In [None]:
# selected_data - list of dicts with keys: (id, document, true_summary, (factuality_data, faithfulness_data))
selected_data = [xsum_test_data.data_by_id[x] for x in bbc_ids]

# original_docs - list of documents to sumamrize
original_docs = [x["document"] for x in selected_data]
print("original docs to summarize:\n", original_docs)

In [9]:
original_docs[0].split("\n")

["Sian O'Callaghan, 22, was last seen leaving Suju alone at about 0250 GMT on Saturday to walk half a mile home.",
 'About 400 people took part in the search of Savernake Forest near Marlborough which is due to resume at 1030 GMT on Wednesday.',
 'A £20,000 reward to help find her has been offered by an anonymous donor.',
 'Mobile records put her phone in the area of the 4,500-acre forest 32 minutes after she left the club.',
 'Police say the journey could only have been made by car.',
 "Miss O'Callaghan's friends and her boyfriend Kevin Reape joined the search along with local people, many of whom did not know the missing woman, following a police appeal for help.",
 'Volunteers searching the dense woodland were divided into groups which were then assigned to a specialist police officer.',
 'Mikey Jack, 19, who plays Sunday league football with Mr Reape, said: "A lot of people here don\'t really know Kevin or Sian but they just want to help any way they can."',
 'Another volunteer, co

# NER Analysis
---

In [5]:
import spacy
from spacy import displacy
NER = spacy.load("en_core_web_lg")

## 0-0) file save / load utils

In [6]:
import pickle
import os

In [7]:
def save_to_cache_dir(var, file_name):
    cache_path = "../cache"
    file_path = os.path.join(cache_path, file_name + ".pkl")
    with open(file_path, "wb") as f:
        pickle.dump(var, f, protocol=pickle.HIGHEST_PROTOCOL)
    print(f"saved to '{file_path}'")

In [8]:
def load_from_cache_dir(file_name):
    cache_path = "../cache"
    file_path = os.path.join(cache_path, file_name + ".pkl")
    with open(file_path, 'rb') as f:
        var = pickle.load(f)
    print(f"'{file_path}' loaded")
    return var

## 0-1) NER utils

In [9]:
# function to display basic entity info: 
def show_ents(doc):
    print(f"original doc: {doc.text}")
    if doc.ents: 
        for ent in doc.ents: 
            print(f"entity: {ent.text : >13} | start_char: {ent.start_char: 3} | end_char: {ent.end_char: 3} | label: {ent.label_} - {spacy.explain(ent.label_)}")
            # print(ent.label)  # entity type's hash value
            # print(ent.start)  # token span's start index position (word index)
            # print(ent.end)  # token span's stop index position (word index)
    else: print('No named entities found.')

## 0-2) load cached NER files

In [10]:
# # train
# train_doc_ents_list = load_from_cache_dir("train_doc_ents_list")
# train_sum_ents_list = load_from_cache_dir("train_sum_ents_list")

# # val
# val_doc_ner_list = load_from_cache_dir("val_doc_ner_list")
# val_sum_ner_list = load_from_cache_dir("val_sum_ner_list")
# val_doc_ents_list = load_from_cache_dir("val_doc_ents_list_pp")  # preprocessed
# val_sum_ents_list = load_from_cache_dir("val_sum_ents_list_pp")

# test
test_doc_ner_list = load_from_cache_dir("test_doc_ner_list")
test_sum_ner_list = load_from_cache_dir("test_sum_ner_list")
test_doc_ents_list = load_from_cache_dir("test_doc_ents_list_pp")  # preprocessed
test_sum_ents_list = load_from_cache_dir("test_sum_ents_list_pp")

# entities pool
concat_ent_pool_dict = load_from_cache_dir("concat_ent_pool_dict_pp")  # preprocessed
# test_ent_pool_dict = load_from_cache_dir("test_ent_pool_dict")

'../cache/test_doc_ner_list.pkl' loaded
'../cache/test_sum_ner_list.pkl' loaded
'../cache/test_doc_ents_list_pp.pkl' loaded
'../cache/test_sum_ents_list_pp.pkl' loaded
'../cache/concat_ent_pool_dict_pp.pkl' loaded


## todo
* case 1. no entites in true_summary - continue
    1) ner on summary doesn't recognize the entity correctly
* case 2. no exact overlap entities btw true_summary - document   
    1) check subset
        * Edinburgh (summary) - Edinburgh Sheriff Court (document)
        * (name) Usain Bolt (summary) - Usain (document) - should I replace?
    
* case 3. ex) 4 - four
* case 4. case

1. how to choose the entity?
    * pick the replacement entity that is not in the source document
2. how to replace it?





### other ideas
* replace the entites that are not in the true summary?

## 1) analyze test summaries

In [11]:
data_idx = 0

In [12]:
sum_ner = test_sum_ner_list[data_idx]
doc_ner = test_doc_ner_list[data_idx]

In [13]:
doc_ents_sorted = test_doc_ents_list[data_idx].most_common()
sum_ents_sorted = test_sum_ents_list[data_idx].most_common()

In [14]:
show_ents(sum_ner)

original doc: There is a "chronic" need for more housing for prison leavers in Wales, according to a charity.
entity:         Wales | start_char:  65 | end_char:  70 | label: GPE - Countries, cities, states


In [15]:
sum_ents_sorted

[(('Wales', 'GPE'), 1)]

In [16]:
doc_ents_sorted

[(('one', 'CARDINAL'), 4),
 (('Wales', 'GPE'), 2),
 (('Marc', 'PERSON'), 2),
 (('1,099', 'CARDINAL'), 1),
 (('2015-16', 'DATE'), 1),
 (('up to a year', 'DATE'), 1),
 (('The Welsh Government', 'ORG'), 1),
 (('the Housing Act', 'LAW'), 1),
 (('2015', 'DATE'), 1),
 (('Andrew Stevens', 'PERSON'), 1),
 (('six months to a year', 'DATE'), 1),
 (('six months', 'DATE'), 1),
 (('Stevens', 'PERSON'), 1),
 (('a hundred pounds', 'MONEY'), 1),
 (('830', 'CARDINAL'), 1),
 (('the year to March 2016', 'DATE'), 1),
 (('6,900', 'CARDINAL'), 1),
 (('50', 'DATE'), 1),
 (('the past 20 years', 'DATE'), 1),
 (('Tom Clarke', 'PERSON'), 1),
 (('Emmaus South Wales', 'GPE'), 1),
 (('Emmaus', 'ORG'), 1),
 (('Welsh Government', 'ORG'), 1),
 (('20,000', 'CARDINAL'), 1),
 (('the next five years', 'DATE'), 1)]

## 2) filter entities

* label list
    * CARDINAL: Numerals that do not fall under another type
    * DATE: Absolute or relative dates or periods
    * EVENT: Named hurricanes, battles, wars, sports events, etc.
    * FAC: Buildings, airports, highways, bridges, etc.
    * GPE: Countries, cities, states
    * LANGUAGE: Any named language
    * LAW: Named documents made into laws.
    * LOC: Non-GPE locations, mountain ranges, bodies of water
    * MONEY: Monetary values, including unit
    * NORP: Nationalities or religious or political groups
    * ORDINAL: "first", "second", etc.
    * ORG: Companies, agencies, institutions, etc.
    * PERCENT: Percentage, including "%"
    * PERSON: People, including fictional
    * PRODUCT: Objects, vehicles, foods, etc. (not services)
    * QUANTITY: Measurements, as of weight or distance
    * TIME: Times smaller than a day
    * WORK_OF_ART: Titles of books, songs, etc.

FILTER_LABELS = ["PERSON", "FAC", "GPE", "NORP", "LOC", "EVENT"]

In [11]:
FILTER_LABELS = ["PERSON", "FAC", "GPE", "NORP", "LOC", "EVENT", "LANGUAGE", "LAW", "ORG"]
# without numbers? "DATE", "PERCENT" 
# too many errors: "PRODUCT", "WORK_OF_ART"

In [12]:
ALL_LABELS = list(NER.get_pipe('ner').labels)

## 3) replace entities

In [13]:
from tqdm.notebook import tqdm

### 1) count samples to exclude

In [14]:
def count_no_overlap(xsum_data, doc_ents_list, sum_ents_list, filter_labels):
    # samples to exclude
    doc_overlap_count = 0
    sum_overlap_count = 0
    no_overlap_count = 0

    for data_idx, data in enumerate(tqdm(xsum_data.dataset)):
        document = data["document"]
        true_summary = data["true_summary"]

        # ner
        # doc_ner = doc_ner_list[data_idx]
        # sum_ner = sum_ner_list[data_idx]

        # entites
        doc_ents = doc_ents_list[data_idx]
        sum_ents = sum_ents_list[data_idx]

        # sort and filter
        doc_ents_filtered_sorted = [((ent, label), count) for ((ent, label), count) in doc_ents.most_common() 
                                    if label in filter_labels]
        sum_ents_filtered_sorted = [((ent, label), count) for ((ent, label), count) in sum_ents.most_common() 
                                    if label in filter_labels]

        overlap_flag = False
        # from document side
        for (ent, label), count in doc_ents_filtered_sorted:
            if ent in true_summary:  # overlap exists
                overlap_flag = True
                doc_overlap_count += 1
                break

        # if no overlap from document side, try from summary
        if overlap_flag == False:
            if len(sum_ents) == 0: # no entity in summary -> pass
                pass
            else:
                for (ent, label), count in sum_ents_filtered_sorted:
                    if ent in document:
                        overlap_flag = True
                        sum_overlap_count += 1
                        break

        if overlap_flag == False:
            no_overlap_count += 1 # need to exclude

    # check uncounted sample
    assert doc_overlap_count + sum_overlap_count + no_overlap_count == len(xsum_data.dataset)
    
    return doc_overlap_count, sum_overlap_count, no_overlap_count

In [15]:
test_overlap_counts = count_no_overlap(xsum_data=xsum_test_data, 
                                       doc_ents_list=test_doc_ents_list, 
                                       sum_ents_list=test_sum_ents_list, 
                                       filter_labels=FILTER_LABELS)

  0%|          | 0/11334 [00:00<?, ?it/s]

In [16]:
n_test = len(xsum_test_data.dataset)
print(f"overlap from document: {(test_overlap_counts[0]/n_test)*100:.1f}%")
print(f"overlap from summary: {(test_overlap_counts[1]/n_test)*100:.1f}%")
print(f"no overlap: {(test_overlap_counts[2]/n_test)*100:.1f}%")

overlap from document: 72.6%
overlap from summary: 3.6%
no overlap: 23.7%


### 2) print

In [17]:
# extract only necessary dicts
for label in ALL_LABELS:
    if label not in FILTER_LABELS:
        del concat_ent_pool_dict[label]

In [18]:
assert(len(concat_ent_pool_dict) == len(FILTER_LABELS))

In [19]:
# reduce the size to 5%
reduced_concat_ent_pool_dict = {}
for label in concat_ent_pool_dict.keys():
    ent_pool = concat_ent_pool_dict[label]
    reduced_ent_pool = {k:v for (k,v) in list(ent_pool.items())[:len(ent_pool) // 10]}
    reduced_concat_ent_pool_dict[label] = reduced_ent_pool

In [20]:
len(concat_ent_pool_dict["PERSON"])

358369

In [21]:
len(reduced_concat_ent_pool_dict["PERSON"])

35836

In [22]:
import random
random.seed(0)

In [25]:
chosen_ent_label_list = []
for data_idx, data in enumerate(tqdm(xsum_test_data.dataset)):
    print(f"\n============ data idx: {data_idx} ============")
    document = data["document"]
    true_summary = data["true_summary"]
    
    # ner
    # doc_ner = test_doc_ner_list[data_idx]
    # sum_ner = test_sum_ner_list[data_idx]
    
    # entites
    doc_ents = test_doc_ents_list[data_idx]
    sum_ents = test_sum_ents_list[data_idx]
    
    # sort and filter
    doc_ents_filtered_sorted = [((ent, label), count) for ((ent, label), count) in doc_ents.most_common() 
                                if label in FILTER_LABELS]
    sum_ents_filtered_sorted = [((ent, label), count) for ((ent, label), count) in sum_ents.most_common() 
                                if label in FILTER_LABELS]
    
    # if there is an overlap
    overlap_flag = False
    
    
    chosen_ent_label = None
    
    # from document side
#     print(f"* summary: {true_summary}")
#     print(f"* document ents: {doc_ents_filtered_sorted} \n")
    for (ent, label), count in doc_ents_filtered_sorted:
        if ent in true_summary:  # overlap exists
#             print("** overlap from document")
#             print(f"ent: {ent}, label: {label}, count_doc: {count}, count_sum: {true_summary.count(ent)}")
            overlap_flag = True
            chosen_ent_label = (ent, label)
            break
    
    # if no overlap from document side, try from summary
    if overlap_flag == False:
        if len(sum_ents) == 0: # no entity in summary -> pass
            pass
        else:
#             print(f"* summary ents: {sum_ents_filtered_sorted} \n")
            for (ent, label), count in sum_ents_filtered_sorted:
                if ent in document:
#                     print("** overlap from summary")
#                     print(f"ent: {ent}, label: {label}, count_sum: {count}, count_doc: {document.count(ent)}")
                    overlap_flag = True
                    chosen_ent_label = (ent, label)
                    break
    
    
    # check the chosen entity
    if overlap_flag == True:
        chosen_ent_label_list.append(chosen_ent_label)
        
        chosen_ent, chosen_label = chosen_ent_label
        print(f"* summary: {true_summary}")
        print(f"* chosen_ent: {chosen_ent}, label: {chosen_label}")
        
        
        # choose one
        ent_pool = list(reduced_concat_ent_pool_dict[chosen_label].keys())
        replacement = random.choice(ent_pool)
        print(f"* replacement: {replacement}")
        while len(replacement.split()) != len(chosen_ent.split()):  # also this should not be in doc or summary
            print("pick new replacement")
            replacement = random.choice(ent_pool)
            print(f"* replacement: {replacement}")
        
    else:
        print("****** NO OVERLAP ******")
        chosen_ent_label_list.append((None, None))

  0%|          | 0/11334 [00:00<?, ?it/s]


* summary: There is a "chronic" need for more housing for prison leavers in Wales, according to a charity.
* chosen_ent: Wales, label: GPE
* replacement: Qingdao

* summary: A man has appeared in court after firearms, ammunition and cash were seized by police in Edinburgh.
* chosen_ent: Edinburgh, label: GPE
* replacement: Dunblane

* summary: Four people accused of kidnapping and torturing a mentally disabled man in a "racially motivated" attack streamed on Facebook have been denied bail.
* chosen_ent: Facebook, label: ORG
* replacement: Stam

* summary: West Brom have appointed Nicky Hammond as technical director, ending his 20-year association with Reading.
* chosen_ent: West Brom, label: GPE
* replacement: the Irish Republic
pick new replacement
* replacement: Gymraeg
pick new replacement
* replacement: Northwich
pick new replacement
* replacement: Thailand
pick new replacement
* replacement: Lorient
pick new replacement
* replacement: Jadeja
pick new replacement
* replacement: Di

pick new replacement
* replacement: Frank McParland
pick new replacement
* replacement: Shaq Coulthirst
pick new replacement
* replacement: McCaw

* summary: Leicester's decision to sack Claudio Ranieri nine months after winning the Premier League made former Foxes striker Gary Lineker "shed a tear".
* chosen_ent: Leicester, label: GPE
* replacement: St Ann's
pick new replacement
* replacement: Hoffenheim

* summary: Rescuers are frantically trying to save about nine people located in the wreckage of a collapsed factory complex in the Bangladeshi capital Dhaka.
* chosen_ent: Dhaka, label: GPE
* replacement: Newport West
pick new replacement
* replacement: Warrington

* summary: Dundee United have signed their second goalkeeper in a week by bringing in Harry Lewis on loan from Southampton.
* chosen_ent: Lewis, label: PERSON
* replacement: Danny Kent
pick new replacement
* replacement: Tilley

* summary: Celtic cruised to a 3-0 win over Aberdeen at Hampden Park as Brendan Rodgers secured

* replacement: Muadhamiya

* summary: A 1969 Led Zeppelin session for the BBC that was thought to have been lost when archives were wiped has been recovered from a recording made by a fan.
* chosen_ent: BBC, label: ORG
* replacement: The Portadown Times
pick new replacement
* replacement: The Church
pick new replacement
* replacement: Aegon

* summary: As the Welsh government publishes plans to reintroduce Welsh taxes for the first time since the 13th century, BBC News looks at what life was like in Wales last time there was direct Welsh taxation.
* chosen_ent: Wales, label: GPE
* replacement: Mercer

****** NO OVERLAP ******

****** NO OVERLAP ******

* summary: Northern Ireland's new health minister has challenged politicians to accept change as he outlined his vision for the future of local health services.
* chosen_ent: Northern Ireland, label: GPE
* replacement: Taser
pick new replacement
* replacement: Weymouth
pick new replacement
* replacement: Frankfurt
pick new replacement
* 

* replacement: Withington

* summary: England made the right decision to omit some of their Premier League players in the European Under-21 Championship, says Football Association director of elite development Dan Ashworth.
* chosen_ent: England, label: GPE
* replacement: Balham

* summary: At his victory rally in New York US president-elect Donald Trump promised "great, great relationships" with other nations.
* chosen_ent: Trump, label: PERSON
* replacement: Neil Gorsuch
pick new replacement
* replacement: Van der Bellen
pick new replacement
* replacement: Bernard Matthews
pick new replacement
* replacement: Bowman

****** NO OVERLAP ******

* summary: Leicester director of rugby Richard Cockerill believes other clubs are offering big money to Manu Tuilagi.
* chosen_ent: Manu, label: PERSON
* replacement: Mark Griffin
pick new replacement
* replacement: Niall Booker
pick new replacement
* replacement: Tom Kohler-Cadmore
pick new replacement
* replacement: Isma Goncalves
pick new repl


****** NO OVERLAP ******

* summary: Both international governments and the world's biggest tech companies are in crisis following the leaking of documents that suggest the US government was able to access detailed records of individual smartphone and internet activity, via a scheme called Prism.
* chosen_ent: US, label: GPE
* replacement: Western Australia
pick new replacement
* replacement: North Waziristan
pick new replacement
* replacement: Dehradun

* summary: Rory McIlroy hopes to play in the WGC-HSBC Champions event in Shanghai despite suffering with food poisoning.
* chosen_ent: McIlroy, label: PERSON
* replacement: Tanaka

* summary: Ian Bell has stepped down as captain of Warwickshire in all formats of the game to focus on batting.
* chosen_ent: Bell, label: ORG
* replacement: Virgin Media
pick new replacement
* replacement: Liverpool Magistrates' Court
pick new replacement
* replacement: European Nato
pick new replacement
* replacement: Development Party
pick new replacemen

* chosen_ent: Turkish, label: NORP
* replacement: Norwegians

* summary: The Islamic State group's commander in the besieged Iraqi city of Falluja is among 70 militants killed in coalition air strikes, the US military says.
* chosen_ent: Iraqi, label: NORP
* replacement: Romans

* summary: Arsenal midfielder Santi Cazorla is to have an ankle operation that could rule him out of action for a further three months.
* chosen_ent: Arsenal, label: ORG
* replacement: the World Service
pick new replacement
* replacement: V9 Academy
pick new replacement
* replacement: Cox's Bazar
pick new replacement
* replacement: BBC social affairs
pick new replacement
* replacement: Cornwall Council
pick new replacement
* replacement: Edinburgh City Council
pick new replacement
* replacement: iPad

* summary: Trainer Jessica Harrington celebrated her first success in the Irish Grand National when favourite Our Duke won Monday's big race at Fairyhouse.
* chosen_ent: Harrington, label: PERSON
* replacement: Na

* replacement: Alan Sinclair
pick new replacement
* replacement: Craig Maclean
pick new replacement
* replacement: Abdelhak Nouri
pick new replacement
* replacement: Cibulkova

* summary: Security has been increased at France's interests abroad after a French satirical magazine published obscene cartoons of the Prophet Muhammad.
* chosen_ent: French, label: NORP
* replacement: Hawaiian

* summary: When Portugal was hit by an economic crisis in 2011, Magda Tilli and her husband Miguel realised that if they wanted to make a decent living they would have to set up their own business.
* chosen_ent: Portugal, label: GPE
* replacement: Alloa

****** NO OVERLAP ******

* summary: Zeid Ra'ad Al Hussein, high commissioner for human rights at the United Nations, has warned that a "Pandora's box" will be opened if Apple co-operates with the FBI.
* chosen_ent: Apple, label: ORG
* replacement: Rural College
pick new replacement
* replacement: The National Guard
pick new replacement
* replacement: P

* replacement: Stonehenge
pick new replacement
* replacement: Hampden Park
pick new replacement
* replacement: the Albert Pier

****** NO OVERLAP ******

* summary: Edinburgh's cultural festivals are officially kicking off, with thousands of events taking place.
* chosen_ent: Edinburgh, label: GPE
* replacement: Indonesia

****** NO OVERLAP ******

* summary: The entire senior management team of the Vatican bank is to be replaced as part of extensive reforms of the Catholic Church's central government.
* chosen_ent: Vatican, label: FAC
* replacement: Manchester Airport
pick new replacement
* replacement: Edleston Road
pick new replacement
* replacement: Kinloss Barracks
pick new replacement
* replacement: Cardwell Street
pick new replacement
* replacement: the Great Western Railway
pick new replacement
* replacement: Leicester Square
pick new replacement
* replacement: Shrews

* summary: Tony Blair is to make a rare speech to Labour activists as turmoil grows after a poll suggested lef

* chosen_ent: Android, label: ORG
* replacement: Reforma

****** NO OVERLAP ******

* summary: England put in a dominant display to win the fifth and final one-day international against West Indies by five wickets and take the series 3-2.
* chosen_ent: England, label: GPE
* replacement: Baldock Road
pick new replacement
* replacement: Mo

* summary: Kelly Sotherton says athletics chiefs should consider tweaking events rather than rewriting existing world records.
* chosen_ent: Sotherton, label: ORG
* replacement: Mullally

* summary: Hospitals with A&E departments around Bristol are back on black alert for the second time in a month amid "severe pressure" on services.
* chosen_ent: A&E, label: ORG
* replacement: Kermode

* summary: David Cameron has said the UK continues to recognise Chinese sovereignty over Tibet amid reports of a rift with Beijing over the issue.
* chosen_ent: Tibet, label: GPE
* replacement: Joselu

* summary: Craig McAllister's late goal saw Eastleigh end a run of 

* summary: Holders Arsenal must travel to Hull for an FA Cup fifth-round replay after failing to turn dominance into victory against the Championship leaders.
* chosen_ent: Arsenal, label: ORG
* replacement: Aardman

* summary: Huawei is suing its tech rival Samsung over claims that its patents have been infringed.
* chosen_ent: Huawei, label: ORG
* replacement: the European Endangered Species Programme
pick new replacement
* replacement: The Privy Council
pick new replacement
* replacement: Gujarat Lions
pick new replacement
* replacement: Etsy

* summary: The death of Saif al-Arab Gaddafi, if confirmed, is likely to have come as a consequence of Nato's increasingly aggressive tactics, undertaken by the alliance to shake up a stalemate in the conflict.
* chosen_ent: Nato, label: ORG
* replacement: the Speaker's Committee
pick new replacement
* replacement: Xerox

****** NO OVERLAP ******

* summary: A deal on the financial arrangements that will underpin Scotland's new devolution powe

* replacement: HMP Stocken
pick new replacement
* replacement: Drax

****** NO OVERLAP ******

* summary: The UK has seen the hottest July day on record, with temperatures hitting 36.7C (98F).
* chosen_ent: UK, label: GPE
* replacement: Caerphilly

* summary: The question has been raised - is 27-year-old Andreas Lubitz a mass murderer for bringing down a plane full of passengers, killing everyone on board?
* chosen_ent: Lubitz, label: PERSON
* replacement: Webb Simpson
pick new replacement
* replacement: Ms Eisenstadt
pick new replacement
* replacement: Erica Stoll
pick new replacement
* replacement: Emilia Papadopoulos
pick new replacement
* replacement: Roedd gan
pick new replacement
* replacement: Andrew Russell
pick new replacement
* replacement: Will Brown
pick new replacement
* replacement: Carole Walker
pick new replacement
* replacement: Rob Mullett
pick new replacement
* replacement: Paul Marshall
pick new replacement
* replacement: Jodie Kidd
pick new replacement
* replacemen

pick new replacement
* replacement: Liam Craig
pick new replacement
* replacement: Lena Wilson
pick new replacement
* replacement: Hwang Pyong-so
pick new replacement
* replacement: Chi

* summary: Steven Lawless has extended his contract with Partick Thistle for a further two years.
* chosen_ent: Steven, label: PERSON
* replacement: Domino

****** NO OVERLAP ******

****** NO OVERLAP ******

* summary: Yorkshire should be given its own "White Rose Parliament" with its own budget, MP David Blunkett has said.
* chosen_ent: Yorkshire, label: GPE
* replacement: Qatar

* summary: Gateshead and Guiseley extended their unbeaten runs with a 1-1 draw at the Gateshead International Stadium.
* chosen_ent: Guiseley, label: ORG
* replacement: Packer

* summary: Championship side Preston North End have signed Aston Villa forward Callum Robinson and Fulham midfielder Ben Pringle, both on three-year contracts.
* chosen_ent: Pringle, label: PERSON
* replacement: Josh Ruffels
pick new replacement
* rep

* replacement: Schengen

* summary: Renaming Cardiff Airport in memory of Princess Diana would boost international recognition, a former councillor has said.
* chosen_ent: Cardiff, label: GPE
* replacement: Ipswich Town
pick new replacement
* replacement: Sark

****** NO OVERLAP ******

* summary: Brian Cookson, the head of cycling's world governing body, says blame for British Cycling's failings should not rest solely with him.
* chosen_ent: Cookson, label: PERSON
* replacement: Cian Healy
pick new replacement
* replacement: Mann

* summary: As polling day fast approaches, the BBC's Scotland 2015 programme is holding its final election debate featuring politicians answering your questions.
* chosen_ent: Scotland, label: GPE
* replacement: Manor Park
pick new replacement
* replacement: Abyei

* summary: French Prime Minister Manuel Valls has attacked the British "caricature" of France.
* chosen_ent: France, label: GPE
* replacement: Crimea

* summary: Fees for Church of England wedding

pick new replacement
* replacement: Lin

* summary: Hampshire were relegated from County Championship Division One after suffering a six-wicket home defeat by Durham in the season's final game.
* chosen_ent: Hampshire, label: GPE
* replacement: Legoland

* summary: Boss Aitor Karanka says Middlesbrough are in no hurry to make further signings as they prepare to return to the Premier League.
* chosen_ent: Karanka, label: PERSON
* replacement: Sherie Ryder
pick new replacement
* replacement: Eoin

* summary: Belfast Harbour has submitted a planning application for what would be a third major waterfront office block.
* chosen_ent: Belfast, label: GPE
* replacement: Serbia

* summary: Six years ago, professional horse-rider Claire Lomas was told that she would never walk again but now she is attempting to walk more than 26 miles (42km) at Sunday's London Marathon thanks to a pair of "robot legs", which have transformed her life.
* chosen_ent: Claire, label: PERSON
* replacement: Harry Met 

****** NO OVERLAP ******

* summary: An appeal to raise money for Syrian refugees who have arrived in the north east of Scotland has been launched.
* chosen_ent: Scotland, label: GPE
* replacement: Umbria

****** NO OVERLAP ******

* summary: Forwards Chris Morgan and Alex Cheesman have signed new two-year contracts at the Cornish Pirates.
* chosen_ent: Cheesman, label: ORG
* replacement: Noble's Hospital
pick new replacement
* replacement: the Work and Pensions
pick new replacement
* replacement: England Hockey
pick new replacement
* replacement: the Justice Committee
pick new replacement
* replacement: WAM

* summary: Thousands of people have marched in opposition to plans to downgrade Stafford Hospital's services.
* chosen_ent: Stafford, label: GPE
* replacement: the West Bank
pick new replacement
* replacement: Kirkuk

****** NO OVERLAP ******

* summary: President Donald Trump's argument that the removal of Confederate statues is a slippery slope to changing history has recharged 

* replacement: North West England
pick new replacement
* replacement: Tourism Ireland

* summary: Seven fishermen owe Captain Radhika Menon their lives.
* chosen_ent: Radhika Menon, label: PERSON
* replacement: Rojas
pick new replacement
* replacement: Jens Janse

* summary: England's collapse on the final afternoon of the fifth Test is one of the worst I have seen - and I have witnessed a few.
* chosen_ent: England, label: GPE
* replacement: Washington DC
pick new replacement
* replacement: Konya

****** NO OVERLAP ******

* summary: Surrey captain Gareth Batty has signed a new contract which will keep him at the club until the end of 2017.
* chosen_ent: Surrey, label: GPE
* replacement: Kuwait City
pick new replacement
* replacement: Hamburg

* summary: Cameroon's Confederations Cup campaign is over after a 3-1 defeat by Germany in their final Group B game on Sunday.
* chosen_ent: Cameroon, label: GPE
* replacement: County

* summary: Increased spending will result in a "bigger" Roya

* summary: Brazil's interim President Michel Temer has called an emergency meeting of state security ministers after a gang rape of a teenage girl in Rio de Janeiro triggered wide condemnation.
* chosen_ent: Brazil, label: GPE
* replacement: Copacabana Beach
pick new replacement
* replacement: Melania

****** NO OVERLAP ******

* summary: Google is to open a new headquarters building in London which could see 3,000 new jobs created by 2020.
* chosen_ent: Google, label: ORG
* replacement: Dail

* summary: A 6,000-year-old "eco-home" has been discovered close to Stonehenge, archaeologists have revealed.
* chosen_ent: Stonehenge, label: LOC
* replacement: Rugeley

****** NO OVERLAP ******

* summary: Crystal Palace have made a £25m bid to sign Christian Benteke from Liverpool.
* chosen_ent: Liverpool, label: GPE
* replacement: Samarra

* summary: A 96-year-old woman is preparing to return to Scotland from Australia after a visa wrangle.
* chosen_ent: Australia, label: GPE
* replacement: C

* replacement: Piutau

* summary: Relatives of 24 rubber plantation workers killed by British troops almost 70 years ago in Malaya have lost an appeal for an official investigation.
* chosen_ent: British, label: NORP
* replacement: Londoners

* summary: West Ham will sign Norway midfielder Havard Nordtveit on a free transfer on 1 July after his contract with Borussia Monchengladbach runs out.
* chosen_ent: Norway, label: GPE
* replacement: Borneo

* summary: Promoters have blamed bad weather for the decision to cancel a concert by Green Day in Glasgow, only hours before it was due to begin.
* chosen_ent: Glasgow, label: GPE
* replacement: Cookstown

* summary: Too many schools in England break the rules on admissions arrangements, says the outgoing chief schools adjudicator.
* chosen_ent: England, label: GPE
* replacement: Berlusconi

* summary: Tottenham moved back to within two points of Premier League leaders Leicester after battling back from behind to beat a dogged Swansea side.
*


* summary: Sweden's Henrik Stenson won the Wyndham Championship to claim his first title since The Open at Royal Troon last year.
* chosen_ent: Stenson, label: PERSON
* replacement: Simon Cox
pick new replacement
* replacement: God

* summary: Five days after Germanwings flight 4U 9525 crashed in the French Alps killing all 150 on board, investigators say they have isolated DNA of 78 victims.
* chosen_ent: German, label: NORP
* replacement: Romans

* summary: Botswana is to deport controversial US pastor Steven Anderson after he said on a local radio that homosexuals should be "stoned to death".
* chosen_ent: Anderson, label: PERSON
* replacement: Nia Elen Davies
pick new replacement
* replacement: Stoltenberg

* summary: Wales full-back Liam Williams will miss Scarlets' Pro12 trip to Zebre with an ankle injury as the region refuse to admit defeat over his prospective move to Saracens.
* chosen_ent: Wales, label: GPE
* replacement: Deir al-Zour
pick new replacement
* replacement: Ivor

pick new replacement
* replacement: ACCC
pick new replacement
* replacement: the Stormont Assembly
pick new replacement
* replacement: Bradford University
pick new replacement
* replacement: Shorty
pick new replacement
* replacement: Wings Over Scotland
pick new replacement
* replacement: Cornwall Fire and Rescue Service
pick new replacement
* replacement: the Parole Board
pick new replacement
* replacement: Okaz
pick new replacement
* replacement: Ebanks-Landell
pick new replacement
* replacement: Pixel
pick new replacement
* replacement: the Colleges Scotland Employers' Association
pick new replacement
* replacement: Barclaycard
pick new replacement
* replacement: Commissioned
pick new replacement
* replacement: HMICS
pick new replacement
* replacement: The Globe and Mail

* summary: Hampshire spun their way to an eight-wicket victory over T20 Blast South Group leaders Glamorgan at Southampton.
* chosen_ent: Hampshire, label: GPE
* replacement: Lanark

* summary: A man has appeared i

* replacement: Carlos Tavares

* summary: The Badger Trust has launched a new legal challenge to the government's plans to cull badgers in England.
* chosen_ent: England, label: GPE
* replacement: Tamworth

****** NO OVERLAP ******

* summary: BHP Billiton and Vale have agreed a deadline of 30 June to consolidate and settle claims resulting from Brazil's Samarco dam disaster in 2015.
* chosen_ent: Brazil, label: GPE
* replacement: Edinburgh city
pick new replacement
* replacement: Haverfordwest

* summary: Boeing has shown off its "space bins" that can hold 50% more luggage than existing designs.
* chosen_ent: Boeing, label: ORG
* replacement: Diana, Princess of Wales
pick new replacement
* replacement: Coventry University
pick new replacement
* replacement: Assange

* summary: South Africa beat Sri Lanka by 206 runs thanks to a five-wicket haul in the first 14 overs on the final day of the first Test.
* chosen_ent: Sri Lanka, label: GPE
* replacement: Greenock
pick new replacement
* r

* chosen_ent: Egypt, label: GPE
* replacement: Southend United 0
pick new replacement
* replacement: Glenavon

****** NO OVERLAP ******

****** NO OVERLAP ******

* summary: The US Treasury Department has warned the European Commission about taking action against US companies over tax avoidance allegations.
* chosen_ent: US, label: GPE
* replacement: Coleshill

* summary: Eight of the women's team who won a bronze medal at London 2012 have been named in the Great Britain hockey squad for this summer's Olympics in Rio.
* chosen_ent: London, label: GPE
* replacement: Shenzhen

****** NO OVERLAP ******

* summary: Members of the public can once again digitally petition MPs on issues with the launch of a new website.
* chosen_ent: MPs, label: ORG
* replacement: Shea

* summary: "Maverick" is a word that seems to follow Craig Venter around.
* chosen_ent: Craig Venter, label: PERSON
* replacement: Wyre Davis

* summary: A hospital trust in Cornwall has declared a "black alert" for the second

pick new replacement
* replacement: the Fifa Confederations Cup
pick new replacement
* replacement: S6

* summary: US Republican presidential nominee Donald Trump has backtracked on a claim that he saw video footage of a US cash payment to Iran.
* chosen_ent: Trump, label: PERSON
* replacement: Ms McIlveen
pick new replacement
* replacement: Alex McLeish
pick new replacement
* replacement: Robert Atwell
pick new replacement
* replacement: Gbagbo

****** NO OVERLAP ******

* summary: The father-in-law of former DUP director of communications John Robinson runs two green energy boilers under a botched energy scheme.
* chosen_ent: Robinson, label: PERSON
* replacement: Ellie

* summary: The mayor of Paris has said she will sue Fox News for its inaccurate reporting about the city following the attack on the magazine Charlie Hebdo.
* chosen_ent: Paris, label: GPE
* replacement: Oxfordshire

* summary: UK interest rates have been held at 0.5% again by the Bank of England's Monetary Policy Co

****** NO OVERLAP ******

* summary: A flat pack robot designed by an Edinburgh-based entrepreneur is to be a feature of this year's creative industries festival XpoNorth.
* chosen_ent: XpoNorth, label: ORG
* replacement: TI

* summary: League Two side Cambridge United have been drawn at home to Manchester United in the fourth round of the FA Cup.
* chosen_ent: Manchester United, label: GPE
* replacement: London Midland

* summary: Gateshead are in discussions to offer new deals to JJ O'Donnell, George Smith, Liam Hogan and Gus Mafuta for next season's National League campaign.
* chosen_ent: O'Donnell, label: ORG
* replacement: Borussia Moenchengladbach
pick new replacement
* replacement: Farbrace

* summary: Manchester City could recall Yaya Toure and Nicolas Otamendi, both of whom dropped to the bench in Wednesday's defeat by Monaco.
* chosen_ent: Monaco, label: GPE
* replacement: Gloucester

* summary: National League club Barrow have announced the departure of manager Paul Cox afte

* replacement: Guaruja

* summary: A candlelit vigil has been held in Edinburgh to show support for the people of Nepal.
* chosen_ent: Nepal, label: GPE
* replacement: Shropshire

* summary: Dozens of farmers gathered outside the Scottish Parliament for a rally highlighting rural issues.
* chosen_ent: Scottish, label: NORP
* replacement: Somalian

* summary: I can only imagine what Alastair Cook has been going through during his and England's wretched run of form.
* chosen_ent: Cook, label: PERSON
* replacement: Nicki Minaj
pick new replacement
* replacement: Rob Cook
pick new replacement
* replacement: Jeb

* summary: Recent attacks by the Taliban in Afghanistan are not a sign it is making advances as the last foreign troops prepare to withdraw, the former head of the British army has said.
* chosen_ent: Afghanistan, label: GPE
* replacement: Malibu

****** NO OVERLAP ******

* summary: Lotus has been given a further two-week breathing space in insolvency proceedings brought by Revenu


****** NO OVERLAP ******

****** NO OVERLAP ******

* summary: Residents in Swansea are to be given their say on plans for more than 17,000 new homes across the city.
* chosen_ent: Swansea, label: GPE
* replacement: Ladywood

****** NO OVERLAP ******

* summary: Scotland's Josh Taylor has all the elements it takes to make it to the top in boxing, says trainer Shane McGuigan.
* chosen_ent: Taylor, label: PERSON
* replacement: Cherry

* summary: The kits used by humans 100,000 years ago to make paint have been found at the famous archaeological site of Blombos Cave in South Africa.
* chosen_ent: Blombos, label: GPE
* replacement: Worcester City
pick new replacement
* replacement: the United Arab Emirates
pick new replacement
* replacement: London Road
pick new replacement
* replacement: Corner

* summary: The Republic of Ireland's economy grew 1.5% in the second quarter of the year, figures show, and was up 7.7% on the April-to-June period in 2013.
* chosen_ent: Ireland, label: GPE
* re

pick new replacement
* replacement: Royal Free Hospital
pick new replacement
* replacement: Raikkonen
pick new replacement
* replacement: Amri
pick new replacement
* replacement: Nature
pick new replacement
* replacement: Irish News

****** NO OVERLAP ******

****** NO OVERLAP ******

****** NO OVERLAP ******

****** NO OVERLAP ******

****** NO OVERLAP ******

* summary: England prop Alex Corbisiero has been called up to the British and Irish Lions squad to provide cover for the injured Cian Healy.
* chosen_ent: England, label: GPE
* replacement: Findon

* summary: The scandal surrounding Malaysia's state development fund 1MDB has gripped the country for years.
* chosen_ent: Malaysia, label: GPE
* replacement: Salvador

* summary: About 350 homes are to be built in the Govanhill area of Glasgow under a £6.4m council initiative to help "turn the area around".
* chosen_ent: Govanhill, label: GPE
* replacement: San Bernadino
pick new replacement
* replacement: Anguilla

****** NO OVERLAP

* replacement: GDC

* summary: The two rivals for the centre-right French presidential nomination have clashed over the level of change they promise to bring, in a TV debate.
* chosen_ent: French, label: NORP
* replacement: Haqqani

****** NO OVERLAP ******

****** NO OVERLAP ******

* summary: Black bin bags could be collected every three weeks in Pembrokeshire as the council looks to meet "severe budget cuts" and performance targets.
* chosen_ent: Pembrokeshire, label: GPE
* replacement: Historic Scotland
pick new replacement
* replacement: Translink

****** NO OVERLAP ******

****** NO OVERLAP ******

****** NO OVERLAP ******

* summary: Top seeds Novak Djokovic and Serena Williams remain on course to make history as they both reached the semi-finals at the French Open.
* chosen_ent: Djokovic, label: PERSON
* replacement: Tony Travers
pick new replacement
* replacement: Steve Jobs'
pick new replacement
* replacement: Jonjo O'Neill
pick new replacement
* replacement: Jeffrey Lewis
pi

pick new replacement
* replacement: the Congress of South African Trade Unions
pick new replacement
* replacement: Â£1.25bn

* summary: A 21-year-old man has appeared in court in County Clare charged over the fatal stabbing of Karl Haugh in Kilkee at the weekend.
* chosen_ent: Kilkee, label: GPE
* replacement: Rock

* summary: Adam Johnson has been sacked by Sunderland after pleading guilty to one count of sexual activity with a child and one charge of grooming.
* chosen_ent: Johnson, label: PERSON
* replacement: Sheila Dixon
pick new replacement
* replacement: Dimbleby

****** NO OVERLAP ******

* summary: The chief executive of Barnet Council has left his role after a blunder led to some voters being turned away from polling stations on Thursday.
* chosen_ent: Barnet Council, label: ORG
* replacement: Nature Materials

* summary: A giant cake has been made in the shape of a Land Rover by an award-winning amateur baker from the West Midlands.
* chosen_ent: Land Rover, label: ORG
* rep

* replacement: Mr Wilson
pick new replacement
* replacement: Neil Harris'
pick new replacement
* replacement: Danny MacAskill
pick new replacement
* replacement: Rajapaksa

* summary: Display restrictions on cigarettes and tobacco products sold by small traders in Wales have come into force.
* chosen_ent: Wales, label: GPE
* replacement: Wealdstone

* summary: Burnley's Sean Dyche is in the top three Premier League managers of the year, says West Brom boss Tony Pulis.
* chosen_ent: Pulis, label: PERSON
* replacement: Hugo Boss
pick new replacement
* replacement: Ms Brennan
pick new replacement
* replacement: Bradley Hudson-Odoi
pick new replacement
* replacement: Kris Marshall
pick new replacement
* replacement: Barry McNamee
pick new replacement
* replacement: Ahmed Mohamed
pick new replacement
* replacement: Joe Crilly
pick new replacement
* replacement: Nellie

* summary: North Korea has conducted a new intercontinental ballistic missile test, South Korea and the Pentagon say.
* cho

* replacement: San Steffan
pick new replacement
* replacement: Bastille

* summary: Plans to create a huge lightning bolt memorial to David Bowie in south London have been scrapped after a crowdfunding campaign fell short of its target.
* chosen_ent: Bowie, label: PERSON
* replacement: Gary Locke's
pick new replacement
* replacement: Albert

* summary: A Leicestershire man has admitted killing his neighbour after kicking down his front door.
* chosen_ent: Leicestershire, label: GPE
* replacement: Treviso

* summary: Frankie Raymond scored a late winner as promotion-chasing Dagenham beat York in the National League at Victoria Road.
* chosen_ent: Dagenham, label: GPE
* replacement: Valparaiso

* summary: The X Factor is to stay on ITV for at least the next three years, despite declining viewing figures and the arrival of The Voice to the channel.
* chosen_ent: ITV, label: ORG
* replacement: the Cancer Drugs Fund
pick new replacement
* replacement: BBC Radio 4 Extra
pick new replacement


* replacement: Dursley

****** NO OVERLAP ******

* summary: US officials in California have been seeking security advice from French officials, as the San Francisco area prepares to host the Super Bowl.
* chosen_ent: San Francisco, label: GPE
* replacement: Bodmin
pick new replacement
* replacement: Arona
pick new replacement
* replacement: El Pais

* summary: Fabrice Muamba was "in effect dead" for 78 minutes following his on-field collapse, the Bolton Wanderers club doctor Jonathan Tobin has revealed.
* chosen_ent: Muamba, label: PERSON
* replacement: Noddy

****** NO OVERLAP ******

* summary: Nico Rosberg set the pace as world champions Mercedes continued their ominous form on the first day of the final Formula 1 pre-season test.
* chosen_ent: Rosberg, label: ORG
* replacement: the NHS Alliance
pick new replacement
* replacement: Pollok

* summary: Hans Ulrich Obrist, artistic director of the Serpentine Galleries in London, has been named the most powerful figure in the art world.

* replacement: Cumhuriyet

* summary: Brussels was abuzz on Monday.
* chosen_ent: Brussels, label: GPE
* replacement: AU

* summary: Sir Bradley Wiggins has defended claims made in his 2012 autobiography that he had never received injections.
* chosen_ent: Wiggins, label: PERSON
* replacement: John Whittingdale MP
pick new replacement
* replacement: Ryan Murray
pick new replacement
* replacement: Robert Rogers
pick new replacement
* replacement: Margaritis Schinas
pick new replacement
* replacement: Smad Place
pick new replacement
* replacement: a. Substitution
pick new replacement
* replacement: Steve Mounie
pick new replacement
* replacement: Zardari

* summary: More than 1,000 young spiders from a hybrid species have been released into the Suffolk broads.
* chosen_ent: Suffolk, label: GPE
* replacement: Dartford Road
pick new replacement
* replacement: Turin

* summary: Myanmar's National League for Democracy (NLD) has named its candidates to be president, confirming that its leader

* summary: Argentine President Cristina Fernandez and Amnesty International have called for justice after the violent death of a transgender activist.
* chosen_ent: Fernandez, label: PERSON
* replacement: Abertawe Bro Morgannwg
pick new replacement
* replacement: Djakadam

* summary: Worcester Warriors loose-head prop Ryan Bower has signed a new contract with the Premiership club.
* chosen_ent: Bower, label: ORG
* replacement: Podesta

* summary: The RSPCA is urging cat owners to be vigilant after suspected cat poisoning deaths in Powys.
* chosen_ent: RSPCA, label: ORG
* replacement: Trump

* summary: Ex-England youth international Benik Afobe will not play for DR Congo in next weekend's 2017 Africa Cup of Nations clash with Angola.
* chosen_ent: England, label: GPE
* replacement: Sapa

* summary: Channel 4 will shut down E4 on the day of the general election, in a bid to encourage more young people to vote.
* chosen_ent: E4, label: ORG
* replacement: Short Strand
pick new replacement



* summary: Canada Post will phase out home delivery in urban areas over the next five years as the postal service struggles to rein in persistent losses.
* chosen_ent: Canada Post, label: ORG
* replacement: Historical Investigations Unit
pick new replacement
* replacement: The Court of Cassation
pick new replacement
* replacement: Sevilla 2

* summary: The US president has urged Kenya to hold "visible" trials to tackle corruption, which he said could be the "biggest impediment" to further growth.
* chosen_ent: Kenya, label: GPE
* replacement: Malia

* summary: In the summer months high on the French Alps the sheep graze on rich pastures.
* chosen_ent: French, label: NORP
* replacement: Africans

* summary: James Corden has had a capital idea - to film three episodes of his late night US talk show on home turf.
* chosen_ent: US, label: GPE
* replacement: Havant

* summary: David Cameron is holding talks with German Chancellor Angela Merkel on Britain's EU reform aims, the situation in 

* chosen_ent: UK, label: GPE
* replacement: Airdrie

* summary: A four-year-old boy has died in an incident at a farm in Maguiresbridge, County Fermanagh.
* chosen_ent: Maguiresbridge, label: PERSON
* replacement: Charlie Broadway
pick new replacement
* replacement: Witold Waszczykowski
pick new replacement
* replacement: Hammerstein

* summary: South Africa's prosecutors have sought permission to appeal against athlete Oscar Pistorius' "shockingly light" sentence, court papers show.
* chosen_ent: Pistorius, label: PERSON
* replacement: Aled Brew
pick new replacement
* replacement: Jez George
pick new replacement
* replacement: Stephen Myler
pick new replacement
* replacement: Craig Noone
pick new replacement
* replacement: Javeed

* summary: The owners of Dunsfold Park aerodrome, which is home to the BBC's Top Gear, have lost their latest fight for unrestricted flying.
* chosen_ent: Dunsfold Park, label: GPE
* replacement: Panathinaikos
pick new replacement
* replacement: Greater Manc

* replacement: PPF
pick new replacement
* replacement: Sanjoy MajumderBBC News
pick new replacement
* replacement: Sharia
pick new replacement
* replacement: the Care Quality Commission's
pick new replacement
* replacement: Haram
pick new replacement
* replacement: RedState
pick new replacement
* replacement: Concord Rangers

* summary: Sussex's victory bid was frustrated as David Lloyd's unbeaten hundred enabled Glamorgan to bat out the final day.
* chosen_ent: Sussex, label: ORG
* replacement: Birch

* summary: The head of one of the UK's largest credit unions is to become chief executive of Airdrie Savings Bank.
* chosen_ent: Airdrie Savings Bank, label: ORG
* replacement: Phoenix Suns
pick new replacement
* replacement: Granada Studios
pick new replacement
* replacement: Invictus
pick new replacement
* replacement: P Durcan
pick new replacement
* replacement: Corriere Della Sera

****** NO OVERLAP ******

* summary: Jamaica's most senior drug tester says the country's recent rash o

****** NO OVERLAP ******

* summary: Poet Owen Sheers has won Wales Book of the Year at a ceremony in Caernarfon for his work about three young soldiers.
* chosen_ent: Wales, label: GPE
* replacement: Guinea-Bissau

* summary: England Lions batsmen Ben Duckett and Daniel Bell-Drummond could play for England within a year, according to former national coach Andy Flower.
* chosen_ent: Lions, label: ORG
* replacement: BBC Africa's
pick new replacement
* replacement: Orange

****** NO OVERLAP ******

****** NO OVERLAP ******

* summary: Formula 1 boss Bernie Ecclestone says there is "no reason" why this weekend's Bahrain Grand Prix should not go ahead.
* chosen_ent: Bahrain, label: GPE
* replacement: Lancaster

* summary: Downton Abbey star Michelle Dockery and actor Dominic West will star in a 30th anniversary revival of Les Liaisons Dangereuses at the Donmar Warehouse.
* chosen_ent: West, label: LOC
* replacement: Greek islands
pick new replacement
* replacement: Europa

* summary: Zamal


****** NO OVERLAP ******

* summary: The government has overturned a decision by Bristol City Council to refuse planning permission for a McDonald's drive-through.
* chosen_ent: McDonald, label: ORG
* replacement: Denali

* summary: Mental health patients are being placed into overstretched A&E departments as police crackdown on the number locked up in their cells, it has been claimed.
* chosen_ent: A&E, label: ORG
* replacement: the Crumlin Ardoyne Residents Association
pick new replacement
* replacement: Barclays Bank
pick new replacement
* replacement: Euroscepticism

* summary: Tiger Woods is back in upbeat mood and expecting to win major championships over the next decade.
* chosen_ent: Woods, label: ORG
* replacement: Vancouver Whitecaps
pick new replacement
* replacement: the US Centers for Disease Control and Prevention
pick new replacement
* replacement: BBC iPlayer Radio
pick new replacement
* replacement: the British Antarctic Survey
pick new replacement
* replacement: Jobc

* replacement: The Super Falcons
pick new replacement
* replacement: the University Hospital
pick new replacement
* replacement: Royal Mail
pick new replacement
* replacement: Llambias

* summary: There have been a number of attacks in the city of Brussels, the capital of Belgium.
* chosen_ent: Brussels, label: GPE
* replacement: Rio Ave
pick new replacement
* replacement: San Bernardino
pick new replacement
* replacement: Ferryhill

* summary: A former Russian army officer who is alleged to have fought for the Taliban in Afghanistan has appeared in court in the United States on terrorism charges.
* chosen_ent: Afghanistan, label: GPE
* replacement: Georgia

****** NO OVERLAP ******

* summary: Austin Macphee could be the only Scotsman at Euro 2016 after being part of the coaching set-up at Northern Ireland.
* chosen_ent: Macphee, label: PERSON
* replacement: Wijnaldum

****** NO OVERLAP ******

****** NO OVERLAP ******

* summary: Watford's Isaac Success will not make his Nigeria debu

In [None]:
chosen_ent_list

In [63]:
# chosen_ent_groupby_label
chosen_ent_groupby_label = {l:{} for l in FILTER_LABELS}

In [64]:
for l in FILTER_LABELS:
    print(l)
    print(len([ent for (ent, label) in chosen_ent_list if label == l]))

PERSON
2149
FAC
66
GPE
3357
NORP
602
LOC
158
EVENT
56
LANGUAGE
8
LAW
21
ORG
2226


In [None]:
# groupby entities by their names
for data_idx, (ent, label) in enumerate(chosen_ent_list):
    chosen_ent_groupby_label[label] = {"ent": ent,
                                       "doc_idcount

In [19]:
nouse_count

15

* 완전히 같은 entity 통일하기

In [32]:
exclude_count = 0
for data_idx, data in enumerate(tqdm(xsum_test_data.dataset[:100])):
#     print(f"============ data idx: {data_idx} ============")
    document = data["document"]
    true_summary = data["true_summary"]
    
    # ner
    doc_doc = NER(document.replace("\n", " "))  # removing newline changes the ner result
    doc_sum = NER(true_summary)
    
    doc_doc_list.append(doc_doc)
    doc_sum_list.append(doc_sum)
    
    # extract entities from document
    doc_ents_filtered = [(ent.text, ent.label_) for ent in doc_doc.ents if ent.label_ in LABELS]
    doc_ents_filtered_sorted = Counter(ents_filtered).most_common()
    
    # find overlapping entities from document
    ent_overlap_flag = False
    
    for (ent, label), count in doc_ents_filtered_sorted:
        if ent in true_summary:  # there is an overlap
#             print(f"replace ent: {ent}, label: {label}, count: {count}")
            ent_overlap_flag = True
            
    
    # find overlapping entities from summary
    if ent_overlap_flag == False:
#         print("**no overlap from document**")
        
        # extract entities from summary
        sum_ents_filtered = [(ent.text, ent.label_) for ent in doc_sum.ents if ent.label_ in LABELS]
        sum_ents_filtered_sorted = Counter(sum_ents_filtered).most_common()
#         print("summary entities", sum_ents_filtered_sorted)
        
        
        if len(sum_ents_filtered_sorted) == 0: # no entity in summary -> pass
            pass
        else: # entity found in summary
            for (ent, label), count in sum_ents_filtered_sorted:
                if ent in document:  # there is an overlap from the summary
#                     print("found summary entity in the document!")
#                     print(f"replace ent: {ent}, label: {label}, count: {count}")
                    ent_overlap_flag = True
    
    if ent_overlap_flag == False:
        exclude_count += 1 # need to exclude
    

  0%|          | 0/100 [00:00<?, ?it/s]

In [35]:
doc_doc_list[0].ents

(1,099,
 2015-16,
 up to a year,
 The Welsh Government,
 the Housing Act,
 Wales,
 2015,
 Andrew Stevens,
 six months to a year,
 six months,
 Stevens,
 one,
 a hundred pounds,
 830,
 one,
 the year to March 2016,
 6,900,
 Wales,
 Marc,
 50,
 the past 20 years,
 Marc,
 Tom Clarke,
 Emmaus South Wales,
 Emmaus,
 one,
 Welsh Government,
 one,
 20,000,
 the next five years)

In [33]:
exclude_count

35

In [None]:
xsum_test_data.dataset[idx]

In [None]:
sum_doc = NER(["true_summary"])

# example 1

In [None]:
import spacy
from spacy import displacy
NER = spacy.load("en_core_web_sm")

In [3]:
raw_text="The Indian Space Research Organisation or is the national space agency of India, \
    headquartered in Bengaluru. It operates under Department of Space which is directly \
    overseen by the Prime Minister of India while Chairman of ISRO acts as executive of DOS as well."

In [4]:
raw_text2 = "The Mars Orbiter Mission (MOM), informally known as Mangalyaan, \
    was launched into Earth orbit on 5 November 2013 by the Indian Space Research Organisation (ISRO) \
    and has entered Mars orbit on 24 September 2014. India thus became the first country to enter \
    Mars orbit on its first attempt. It was completed at a record low cost of $74 million."

In [5]:
text = NER(raw_text2)

In [6]:
for word in text.ents:
    print(word.text,word.label_)

The Mars Orbiter Mission (MOM PRODUCT
Mangalyaan PERSON
Earth LOC
5 November 2013 DATE
the Indian Space Research Organisation ORG
Mars LOC
24 September 2014 DATE
India GPE
first ORDINAL
Mars LOC
$74 million MONEY


In [115]:
# list of ner labels
for ner_label in NER.get_pipe('ner').labels:
    print(f"* {ner_label}: {spacy.explain(ner_label)}")

* CARDINAL: Numerals that do not fall under another type
* DATE: Absolute or relative dates or periods
* EVENT: Named hurricanes, battles, wars, sports events, etc.
* FAC: Buildings, airports, highways, bridges, etc.
* GPE: Countries, cities, states
* LANGUAGE: Any named language
* LAW: Named documents made into laws.
* LOC: Non-GPE locations, mountain ranges, bodies of water
* MONEY: Monetary values, including unit
* NORP: Nationalities or religious or political groups
* ORDINAL: "first", "second", etc.
* ORG: Companies, agencies, institutions, etc.
* PERCENT: Percentage, including "%"
* PERSON: People, including fictional
* PRODUCT: Objects, vehicles, foods, etc. (not services)
* QUANTITY: Measurements, as of weight or distance
* TIME: Times smaller than a day
* WORK_OF_ART: Titles of books, songs, etc.


In [8]:
displacy.render(text,style="ent",jupyter=True)

# another example

In [49]:
# function to display basic entity info: 
def show_ents(doc):
    print(f"original doc: {doc.text}")
    if doc.ents: 
        for ent in doc.ents: 
            print(f"entity: {ent.text : >13} | start_char: {ent.start_char: 3} | end_char: {ent.end_char: 3} | label: {ent.label_} - {spacy.explain(ent.label_)}")
            # print(ent.label)  # entity type's hash value
            # print(ent.start)  # token span's start index position (word index)
            # print(ent.end)  # token span's stop index position (word index)
    else: print('No named entities found.')

In [50]:
doc1 = NER("Apple is looking at buying U.K. startup for $1 billion") 
show_ents(doc1)

original doc: Apple is looking at buying U.K. startup for $1 billion
entity:         Apple | start_char:   0 | end_char:   5 | label: ORG - Companies, agencies, institutions, etc.
entity:          U.K. | start_char:  27 | end_char:  31 | label: GPE - Countries, cities, states
entity:    $1 billion | start_char:  44 | end_char:  54 | label: MONEY - Monetary values, including unit


## document level

In [51]:
doc = NER("San Francisco considers banning sidewalk delivery robots") 
# document level 
for e in doc.ents: 
    print(e.text, e.start_char, e.end_char, e.label_) 

# OR 
ents = [(e.text, e.start_char, e.end_char, e.label_) for e in doc.ents] 
print(ents)

San Francisco 0 13 GPE
[('San Francisco', 0, 13, 'GPE')]


## token level

In [52]:
# token level 
# doc[0], doc[1] ...will have tokens stored. 

ent_san = [doc[0].text, doc[0].ent_iob_, doc[0].ent_type_] 
ent_francisco = [doc[1].text, doc[1].ent_iob_, doc[1].ent_type_]
ent_considers = [doc[2].text, doc[2].ent_iob_, doc[2].ent_type_] 
print(ent_san) 
print(ent_francisco)
print(ent_considers)

# token.ent_iob indicates whether an entity starts continues or ends on the tag
# I - Token is inside an entity. 
# O - Token is outside an entity. 
# B - Token is the beginning of an entity.

['San', 'B', 'GPE']
['Francisco', 'I', 'GPE']
['considers', 'O', '']


## User-Defined Named Entity and Adding it to a Span

### Example 1

In [53]:
doc = NER(u'Tesla to build a U.K. factory for $6 million.')
show_ents(doc)

original doc: Tesla to build a U.K. factory for $6 million.
entity:          U.K. | start_char:  17 | end_char:  21 | label: GPE - Countries, cities, states
entity:    $6 million | start_char:  34 | end_char:  44 | label: MONEY - Monetary values, including unit


In [54]:
from spacy.tokens import Span

In [55]:
# get the hash value of the ORG entity label
ORG = doc.vocab.strings[u"ORG"]
print(ORG)

# create a span for the new entity
new_ent = Span(doc=doc, start=0, end=1, label=ORG)
print(new_ent)

# add the entity to the existing doc object
doc.ents = list(doc.ents) + [new_ent]
print(doc.ents)

383
Tesla
(Tesla, U.K., $6 million)


In [56]:
show_ents(doc)

original doc: Tesla to build a U.K. factory for $6 million.
entity:         Tesla | start_char:   0 | end_char:   5 | label: ORG - Companies, agencies, institutions, etc.
entity:          U.K. | start_char:  17 | end_char:  21 | label: GPE - Countries, cities, states
entity:    $6 million | start_char:  34 | end_char:  44 | label: MONEY - Monetary values, including unit


## Adding Named Entities to All Matching Spans

In [57]:
doc = NER(u'Our company plans to introduce a new vacuum cleaner. If successful, the vacuum cleaner will be our first product.') 
show_ents(doc) 

original doc: Our company plans to introduce a new vacuum cleaner. If successful, the vacuum cleaner will be our first product.
entity:         first | start_char:  99 | end_char:  104 | label: ORDINAL - "first", "second", etc.


In [58]:
# Import PhraseMatcher and create a matcher object: 
from spacy.matcher import PhraseMatcher 
matcher = PhraseMatcher(NER.vocab)

In [59]:
# Create the desired phrase patterns:
phrase_list = ['vacuum cleaner', 'vacuum-cleaner']
phrase_patterns = [NER(text) for text in phrase_list]
print(phrase_list)
print(phrase_patterns)

['vacuum cleaner', 'vacuum-cleaner']
[vacuum cleaner, vacuum-cleaner]


In [60]:
# Apply the patterns to our matcher object:
matcher.add('newproduct', None, *phrase_patterns)

In [61]:
# Apply the matcher to our Doc object:
matches = matcher(doc)
#See what matches occur: 
matches 

[(2689272359382549672, 7, 9), (2689272359382549672, 14, 16)]

In [62]:
# Here we create Spans from each match, and create named entities from them: 
from spacy.tokens import Span 
PROD = doc.vocab.strings[u'PRODUCT'] 
new_ents = [Span(doc, match[1], match[2],label=PROD) for match in matches]
new_ents

[vacuum cleaner, vacuum cleaner]

In [63]:
# match[1] contains the start index of the the token and match[2] the stop index (exclusive) of the token in the doc. 
doc.ents = list(doc.ents) + new_ents 
show_ents(doc)

original doc: Our company plans to introduce a new vacuum cleaner. If successful, the vacuum cleaner will be our first product.
entity: vacuum cleaner | start_char:  37 | end_char:  51 | label: PRODUCT - Objects, vehicles, foods, etc. (not services)
entity: vacuum cleaner | start_char:  72 | end_char:  86 | label: PRODUCT - Objects, vehicles, foods, etc. (not services)
entity:         first | start_char:  99 | end_char:  104 | label: ORDINAL - "first", "second", etc.


## counting entities

In [64]:
doc = NER(u"originally priced at $29.50, now it's marked down to five dollars")
show_ents(doc)

original doc: originally priced at $29.50, now it's marked down to five dollars
entity:         29.50 | start_char:  22 | end_char:  27 | label: MONEY - Monetary values, including unit
entity:  five dollars | start_char:  53 | end_char:  65 | label: MONEY - Monetary values, including unit


In [65]:
len([ent for ent in doc.ents if ent.label_ == "MONEY"])

2

## Visualizing NER

In [None]:
from spacy import displacy

In [72]:
doc = NER(u"Tesla to build a U.K. factory for $6 million. "
          u"originally priced at $29.50, now it's marked down to five dollars")
displacy.render(doc, style="ent", jupyter=True)


In [74]:
# line by line
for sent in doc.sents:
    displacy.render(sent, style="ent", jupyter=True)

In [79]:
# viewing specific entries
options = {"ents": ["MONEY"]}
displacy.render(doc, style="ent", jupyter=True, options = options)