# Load datasets
---

In [1]:
import datasets
from xsum_dataset import XsumDataset

In [2]:
xsum_data_raw = datasets.load_dataset("xsum")

Using custom data configuration default
Reusing dataset xsum (/home/wk247/.cache/huggingface/datasets/xsum/default/1.2.0/32c23220eadddb1149b16ed2e9430a05293768cfffbdfd151058697d4c11f934)


  0%|          | 0/3 [00:00<?, ?it/s]

In [3]:
# train/val/test data
xsum_train_data = XsumDataset(xsum_data_raw["train"])
xsum_val_data = XsumDataset(xsum_data_raw["validation"])
xsum_test_data = XsumDataset(xsum_data_raw["test"])

In [4]:
# concat data
xsum_data_raw_cc = datasets.concatenate_datasets(
    [xsum_data_raw["train"], xsum_data_raw["validation"], xsum_data_raw["test"]]
    )
xsum_concat_data = XsumDataset(xsum_data_raw_cc)

# NER Analysis
---

In [4]:
import spacy
from spacy import displacy
NER = spacy.load("en_core_web_trf")
cache_dir = "../cache_trf"

In [5]:
from ner_utils import *

In [6]:
# function to display basic entity info: 
def show_ents(doc):
    print(f"original doc: {doc.text}")
    if doc.ents: 
        for ent in doc.ents: 
            print(f"entity: {ent.text : >13} | start_char: {ent.start_char: 3} | end_char: {ent.end_char: 3} | label: {ent.label_} - {spacy.explain(ent.label_)}")
            # print(ent.label)  # entity type's hash value
            # print(ent.start)  # token span's start index position (word index)
            # print(ent.end)  # token span's stop index position (word index)
    else: print('No named entities found.')

## 0-2) load cached NER files

In [7]:
# # train
# train_doc_ents_list = load_from_cache_dir("train_doc_ents_list")
# train_sum_ents_list = load_from_cache_dir("train_sum_ents_list")

# # val
# val_doc_ner_list = load_from_cache_dir("val_doc_ner_list", cache_dir)
# val_sum_ner_list = load_from_cache_dir("val_sum_ner_list", cache_dir)
# val_doc_ents_list = load_from_cache_dir("val_doc_ents_list_no_dup", cache_dir)  # no duplicate
# val_sum_ents_list = load_from_cache_dir("val_sum_ents_list_no_dup", cache_dir)

# test
# test_doc_ner_list = load_from_cache_dir("test_doc_ner_list", cache_dir)
# test_sum_ner_list = load_from_cache_dir("test_sum_ner_list", cache_dir)
test_doc_ents_list = load_from_cache_dir("test_doc_ents_list_no_dup", cache_dir)  # no duplicate
test_sum_ents_list = load_from_cache_dir("test_sum_ents_list_no_dup", cache_dir)

# entities pool
# concat_ent_pool_dict = load_from_cache_dir("concat_ent_pool_dict", cache_dir)  # preprocessed
# test_ent_pool_dict = load_from_cache_dir("test_ent_pool_dict", cache_dir)
val_test_ent_pool_dict = load_from_cache_dir("val_test_ent_pool_dict", cache_dir)

'../cache_trf/test_doc_ents_list_no_dup.pkl' loaded
'../cache_trf/test_sum_ents_list_no_dup.pkl' loaded
'../cache_trf/val_test_ent_pool_dict.pkl' loaded


## todo
* case 1. no entites in true_summary - continue
    1) ner on summary doesn't recognize the entity correctly
* case 2. no exact overlap entities btw true_summary - document   
    1) check subset
        * Edinburgh (summary) - Edinburgh Sheriff Court (document)
        * (name) Usain Bolt (summary) - Usain (document) - should I replace?
    
* case 3. ex) 4 - four
* case 4. case

1. how to choose the entity?
    * pick the replacement entity that is not in the source document
2. how to replace it?





### other ideas
* replace the entites that are not in the true summary?

## 1) analyze test summaries

In [11]:
data_idx = 0

In [12]:
sum_ner = test_sum_ner_list[data_idx]
doc_ner = test_doc_ner_list[data_idx]

In [13]:
doc_ents_sorted = test_doc_ents_list[data_idx].most_common()
sum_ents_sorted = test_sum_ents_list[data_idx].most_common()

In [14]:
show_ents(sum_ner)

original doc: There is a "chronic" need for more housing for prison leavers in Wales, according to a charity.
entity:         Wales | start_char:  65 | end_char:  70 | label: GPE - Countries, cities, states


In [15]:
sum_ents_sorted

[(('Wales', 'GPE'), 1)]

In [16]:
doc_ents_sorted

[(('one', 'CARDINAL'), 4),
 (('Wales', 'GPE'), 2),
 (('Marc', 'PERSON'), 2),
 (('1,099', 'CARDINAL'), 1),
 (('2015-16', 'DATE'), 1),
 (('up to a year', 'DATE'), 1),
 (('The Welsh Government', 'ORG'), 1),
 (('the Housing Act', 'LAW'), 1),
 (('2015', 'DATE'), 1),
 (('Andrew Stevens', 'PERSON'), 1),
 (('six months to a year', 'DATE'), 1),
 (('six months', 'DATE'), 1),
 (('Stevens', 'PERSON'), 1),
 (('a hundred pounds', 'MONEY'), 1),
 (('830', 'CARDINAL'), 1),
 (('the year to March 2016', 'DATE'), 1),
 (('6,900', 'CARDINAL'), 1),
 (('50', 'DATE'), 1),
 (('the past 20 years', 'DATE'), 1),
 (('Tom Clarke', 'PERSON'), 1),
 (('Emmaus South Wales', 'GPE'), 1),
 (('Emmaus', 'ORG'), 1),
 (('Welsh Government', 'ORG'), 1),
 (('20,000', 'CARDINAL'), 1),
 (('the next five years', 'DATE'), 1)]

## 2) filter entities

* label list
    * CARDINAL: Numerals that do not fall under another type
    * DATE: Absolute or relative dates or periods
    * EVENT: Named hurricanes, battles, wars, sports events, etc.
    * FAC: Buildings, airports, highways, bridges, etc.
    * GPE: Countries, cities, states
    * LANGUAGE: Any named language
    * LAW: Named documents made into laws.
    * LOC: Non-GPE locations, mountain ranges, bodies of water
    * MONEY: Monetary values, including unit
    * NORP: Nationalities or religious or political groups
    * ORDINAL: "first", "second", etc.
    * ORG: Companies, agencies, institutions, etc.
    * PERCENT: Percentage, including "%"
    * PERSON: People, including fictional
    * PRODUCT: Objects, vehicles, foods, etc. (not services)
    * QUANTITY: Measurements, as of weight or distance
    * TIME: Times smaller than a day
    * WORK_OF_ART: Titles of books, songs, etc.

FILTER_LABELS = ["PERSON", "FAC", "GPE", "NORP", "LOC", "EVENT"]

In [8]:
FILTER_LABELS = ["PERSON", "FAC", "GPE", "NORP", "LOC", "EVENT", "LANGUAGE", "LAW", "ORG"]
# without numbers? "DATE", "PERCENT" 
# too many errors: "PRODUCT", "WORK_OF_ART"

In [9]:
ALL_LABELS = list(NER.get_pipe('ner').labels)

## 3) replace entities

In [10]:
from tqdm.notebook import tqdm

### 1) count samples to exclude

In [11]:
def count_no_overlap(xsum_data, doc_ents_list, sum_ents_list, filter_labels):
    # samples to exclude
    doc_overlap_count = 0
    sum_overlap_count = 0
    no_overlap_count = 0

    for data_idx, data in enumerate(tqdm(xsum_data.dataset)):
        document = data["document"]
        true_summary = data["true_summary"]

        # ner
        # doc_ner = doc_ner_list[data_idx]
        # sum_ner = sum_ner_list[data_idx]

        # entites
        doc_ents = doc_ents_list[data_idx]
        sum_ents = sum_ents_list[data_idx]

        # sort and filter
        doc_ents_filtered_sorted = [((ent, label), count) for ((ent, label), count) in doc_ents.most_common() 
                                    if label in filter_labels]
        sum_ents_filtered_sorted = [((ent, label), count) for ((ent, label), count) in sum_ents.most_common() 
                                    if label in filter_labels]

        overlap_flag = False
        # from document side
        for (ent, label), count in doc_ents_filtered_sorted:
            if ent in true_summary:  # overlap exists
                overlap_flag = True
                doc_overlap_count += 1
                break

        # if no overlap from document side, try from summary
        if overlap_flag == False:
            if len(sum_ents) == 0: # no entity in summary -> pass
                pass
            else:
                for (ent, label), count in sum_ents_filtered_sorted:
                    if ent in document:
                        overlap_flag = True
                        sum_overlap_count += 1
                        break

        if overlap_flag == False:
            no_overlap_count += 1 # need to exclude

    # check uncounted sample
    assert doc_overlap_count + sum_overlap_count + no_overlap_count == len(xsum_data.dataset)
    
    return doc_overlap_count, sum_overlap_count, no_overlap_count

In [12]:
test_overlap_counts = count_no_overlap(xsum_data=xsum_test_data, 
                                       doc_ents_list=test_doc_ents_list, 
                                       sum_ents_list=test_sum_ents_list, 
                                       filter_labels=FILTER_LABELS)

  0%|          | 0/11334 [00:00<?, ?it/s]

In [13]:
n_test = len(xsum_test_data.dataset)
print(f"overlap from document: {(test_overlap_counts[0]/n_test)*100:.1f}%")
print(f"overlap from summary: {(test_overlap_counts[1]/n_test)*100:.1f}%")
print(f"no overlap: {(test_overlap_counts[2]/n_test)*100:.1f}%")

overlap from document: 73.9%
overlap from summary: 3.2%
no overlap: 22.9%


### 2) print

In [14]:
ent_pool_dict = val_test_ent_pool_dict

In [15]:
# extract only necessary dicts
for label in ALL_LABELS:
    if label not in FILTER_LABELS:
        del ent_pool_dict[label]

In [16]:
assert(len(ent_pool_dict) == len(FILTER_LABELS))

In [17]:
# reduce the size to 20%
reduced_ent_pool_dict = {}
for label in ent_pool_dict.keys():
    ent_pool = ent_pool_dict[label]
    reduced_ent_pool = {k:v for (k,v) in list(ent_pool.items())[:len(ent_pool) // 10]}
    reduced_ent_pool_dict[label] = reduced_ent_pool

In [18]:
len(ent_pool_dict["PERSON"])

67832

In [19]:
len(reduced_ent_pool_dict["PERSON"])

6783

In [20]:
import random
random.seed(0)

In [21]:
def if_improper_replacement(chosen_ent, chosen_label, replace_ent, document, true_summary):
    # sample again if one of the below is true
    # 1. if label is person, replacement must have same # of words
    # 2. if replace entity is in the true document
    # 3. if replace entity is in the true summary
    # 4. if replace entity is a subset of chosen entity
    # 5. if chosen entity is a subset of replace entity
    
    violations = [chosen_label == "PERSON" and len(replace_ent.split()) != len(chosen_ent.split()),
                   replace_ent in document,
                   replace_ent in true_summary,
                   replace_ent in chosen_ent,
                   chosen_ent in replace_ent]
    
    # if violations are all false -> return false
    # if one of them are true -> return true
    return any(violations)

In [22]:
ood_list = []

In [23]:
replacement_info = []
for data_idx, data in enumerate(tqdm(xsum_test_data.dataset)):
    print(f"\n============ data idx: {data_idx} ============")
    original_id = data["id"]
    original_doc = data["document"]
    true_summary = data["true_summary"]
    
    # ner
    # doc_ner = test_doc_ner_list[data_idx]
    # sum_ner = test_sum_ner_list[data_idx]
    
    # entites
    doc_ents = test_doc_ents_list[data_idx]
    sum_ents = test_sum_ents_list[data_idx]
    
    # sort and filter
    doc_ents_filtered_sorted = [((ent, label), count) for ((ent, label), count) in doc_ents.most_common() 
                                if label in FILTER_LABELS]
    sum_ents_filtered_sorted = [((ent, label), count) for ((ent, label), count) in sum_ents.most_common() 
                                if label in FILTER_LABELS]
    
    # if there is an overlap
    overlap_flag = False
    
    
    chosen_ent = None
    chosen_label = None
    
    # from document side
#     print(f"* summary: {true_summary}")
#     print(f"* document ents: {doc_ents_filtered_sorted} \n")
    for (ent, label), count in doc_ents_filtered_sorted:
        if ent in true_summary:  # overlap exists
#             print("** overlap from document")
#             print(f"ent: {ent}, label: {label}, count_doc: {count}, count_sum: {true_summary.count(ent)}")
            overlap_flag = True
            chosen_ent, chosen_label = ent, label
            break
    
    # if no overlap from document side, try from summary
    if overlap_flag == False:
        if len(sum_ents) == 0: # no entity in summary -> pass
            pass
        else:
#             print(f"* summary ents: {sum_ents_filtered_sorted} \n")
            for (ent, label), count in sum_ents_filtered_sorted:
                if ent in original_doc:
#                     print("** overlap from summary")
#                     print(f"ent: {ent}, label: {label}, count_sum: {count}, count_doc: {original_doc.count(ent)}")
                    overlap_flag = True
                    chosen_ent, chosen_label = ent, label
                    break
    
    
    # check the chosen entity
    if overlap_flag == True:
        print(f"* summary: {true_summary}")
        print(f"* chosen_ent: {chosen_ent}, label: {chosen_label}")
        
        
        # choose one
        ent_pool = list(reduced_ent_pool_dict[chosen_label].keys())
        replace_ent = random.choice(ent_pool)
        print(f"* replace_ent: {replace_ent}")
        
        while if_improper_replacement(chosen_ent, chosen_label, replace_ent, original_doc, true_summary):
            print("pick new replacement")
            replace_ent = random.choice(ent_pool)
            print(f"* replace_ent: {replace_ent}")
        
        
        metadata_dict = {"chosen_ent": chosen_ent,
                         "replace_ent": replace_ent,
                         "label": chosen_label,}
        
        ood_doc = original_doc.replace(chosen_ent, replace_ent)
        ood_summary = true_summary.replace(chosen_ent, replace_ent)
        ood_list.append({"original_id": original_id,
                         "ood_doc": ood_doc,
                         "ood_true_summary": ood_summary,
                         "metadata": metadata_dict})
        
    else:
        print("****** NO OVERLAP ******")
        ood_list.append(None)

  0%|          | 0/11334 [00:00<?, ?it/s]


* summary: There is a "chronic" need for more housing for prison leavers in Wales, according to a charity.
* chosen_ent: Wales, label: GPE
* replace_ent: Maldives

* summary: A man has appeared in court after firearms, ammunition and cash were seized by police in Edinburgh.
* chosen_ent: Edinburgh, label: GPE
* replace_ent: Costa Rica

* summary: Four people accused of kidnapping and torturing a mentally disabled man in a "racially motivated" attack streamed on Facebook have been denied bail.
* chosen_ent: Facebook, label: ORG
* replace_ent: The Foreign and Commonwealth Office

* summary: West Brom have appointed Nicky Hammond as technical director, ending his 20-year association with Reading.
* chosen_ent: West Brom, label: ORG
* replace_ent: Staggies

****** NO OVERLAP ******

* summary: Since their impending merger was announced in January, there has been remarkably little comment about the huge proposed deal to combine Essilor and Luxottica.
* chosen_ent: Essilor, label: ORG
* rep


* summary: Plans by Apple and Google to do more to protect customers' privacy have made the FBI "very concerned".
* chosen_ent: Apple, label: ORG
* replace_ent: BBC Radio Bristol

* chosen_ent: Schmidt, label: PERSON
* replace_ent: Michael Eavis
pick new replacement
* replace_ent: Wahab Riaz
pick new replacement
* replace_ent: Jon Ashworth
pick new replacement
* replace_ent: Farry

* summary: Asian countries have taken top places in global school rankings for maths, science and reading, with England and Northern Ireland among high performers.
* chosen_ent: England, label: GPE
* replace_ent: Grangemouth

* summary: Jordan Spieth will begin the final day at the Masters with a one-shot lead but playing partner Rory McIlroy's bid faltered on day three at Augusta.
* chosen_ent: Spieth, label: PERSON
* replace_ent: Viktor Yanukovych
pick new replacement
* replace_ent: Nicolas Roche
pick new replacement
* replace_ent: Phelan

* summary: One of the big questions facing MPs after the summer re

pick new replacement
* replace_ent: McAteer
pick new replacement
* replace_ent: Sam Robson

* summary: A hang gliding pilot has died in an accident at a nature reserve near Chichester in West Sussex.
* chosen_ent: Sussex, label: GPE
* replace_ent: Down

* summary: Captain Sean Morrison says late goals have been "devastating" for Cardiff City's back four.
* chosen_ent: Cardiff, label: ORG
* replace_ent: NAO

* summary: Zambia coach Beston Chambeshi says his team is highly motivated for Wednesday's Fifa Under-20 World Cup tie against Germany.
* chosen_ent: Germany, label: GPE
* replace_ent: Guinea

* summary: The US government has imposed sanctions on 13 senior Venezuelan officials as pressure mounts on President Nicolás Maduro ahead of a controversial vote for a new constituent assembly.
* chosen_ent: US, label: GPE
* replace_ent: Mumbai

* summary: Ukraine's President Viktor Yanukovych and his Russian counterpart Vladimir Putin have held surprise talks on a "strategic partnership treat

* replace_ent: Freddie Gray's

* summary: New England Patriots quarterback Tom Brady says he is sorry the NFL "had to endure" the "deflate-gate" scandal, after his four-game ban was overturned.
* chosen_ent: NFL, label: ORG
* replace_ent: Radio 3

* summary: Japanese phone messaging app operator, Line, has said it plans to list its shares in both Tokyo and New York.
* chosen_ent: Line, label: ORG
* replace_ent: Foreign and Commonwealth Office

****** NO OVERLAP ******

****** NO OVERLAP ******

* summary: The head of the consular department at Russia's embassy in Greece has been found dead in his flat in Athens, police say.
* chosen_ent: Russia, label: GPE
* replace_ent: Devon

* summary: Renault will stop supplying engines to Formula 1 teams as soon as possible.
* chosen_ent: Renault, label: ORG
* replace_ent: Cisco

* summary: Belfast City Council has said a BBC relocation to a site beside the Ulster University campus would lead to a "comprehensive transformation" of that part of the

* replace_ent: Manu Tuilagi
pick new replacement
* replace_ent: Julian

****** NO OVERLAP ******

* summary: A former senior trader at Rabobank has pleaded guilty to interest rate rigging in the US.
* chosen_ent: US, label: GPE
* replace_ent: Stratford

* summary: Hull FC have signed 20-year-old centre Joe Arundel from Castleford Tigers on a four-year contract from 2013.
* chosen_ent: Tigers, label: ORG
* replace_ent: Spa

* summary: Ministers insist there is strong support for their seven-day NHS plans as experts challenge the policy.
* chosen_ent: NHS, label: ORG
* replace_ent: the Royal British Legion

* summary: Non Stanford will be surprised if Helen Jenkins is not in the British triathlon team for the Olympics after her victory in the Gold Coast World Series event.
* chosen_ent: Stanford, label: PERSON
* replace_ent: Jeb Bush
pick new replacement
* replace_ent: Chuka Umunna
pick new replacement
* replace_ent: Cesc Fabregas
pick new replacement
* replace_ent: Troy Brown
pick new r

pick new replacement
* replace_ent: Barclay
pick new replacement
* replace_ent: Darren Fletcher

****** NO OVERLAP ******

* summary: With John Boehner's resignation, Twitter is losing one of its favourite public figures, a politician famous for his tears and his perma-tanned complexion.
* chosen_ent: Boehner, label: PERSON
* replace_ent: Simon Coveney
pick new replacement
* replace_ent: Sam Davies
pick new replacement
* replace_ent: Diamond

* summary: It's remarkable, given the strength of the Indian expat community in California, that Narendra Modi's trip to Silicon Valley is the first by an Indian prime minister for more than three decades.
* chosen_ent: Modi, label: PERSON
* replace_ent: Dylan

* summary: Sports Direct chairman Keith Hellawell has lambasted critics of the sportswear firm, saying an "extreme political, union and media campaign" has damaged its reputation.
* chosen_ent: Sports Direct, label: ORG
* replace_ent: Poundland

* summary: Chinese authorities have arrested 

pick new replacement
* replace_ent: Adam Gemili
pick new replacement
* replace_ent: Diego Demme
pick new replacement
* replace_ent: Callum Morris
pick new replacement
* replace_ent: Poroshenko

* summary: There have been a few great goals, and plenty of late ones, but how is the expanded European Championship shaping up as a tournament to remember?
* chosen_ent: European Championship, label: EVENT
* replace_ent: Glasgow

****** NO OVERLAP ******

****** NO OVERLAP ******

****** NO OVERLAP ******

* summary: League Two side Luton Town have signed Charlton Athletic striker Joe Pigott on loan until the end of the season.
* chosen_ent: League Two, label: ORG
* replace_ent: Kings College London

* summary: Harry Potter and the Cursed Child writer Jack Thorne is joining new TV drama Electric Dreams: The World of Philip K Dick.
* chosen_ent: Dick, label: PERSON
* replace_ent: Lewis Morgan
pick new replacement
* replace_ent: Ollie Clarke
pick new replacement
* replace_ent: Jeb Bush
pick new r

* replace_ent: Bernie Sanders
pick new replacement
* replace_ent: Greg Clarke
pick new replacement
* replace_ent: Lu Kang
pick new replacement
* replace_ent: Darren Barr
pick new replacement
* replace_ent: George Bailey
pick new replacement
* replace_ent: Alan Solomons'
pick new replacement
* replace_ent: Dominika Cibulkova
pick new replacement
* replace_ent: Mulholland

* summary: Pharmaceutical firms Pfizer and Flynn Pharma have been accused by the UK's competition watchdog of charging "excessive and unfair" prices for an anti-epilepsy drug.
* chosen_ent: Pfizer, label: ORG
* replace_ent: Suzuki

* summary: Tanzania's Football Federation (TFF) president, Jamal Malinzi, has confirmed Zanzibar's fresh bid to become a member of Fifa.
* chosen_ent: Zanzibar, label: GPE
* replace_ent: Syria

* summary: Audi will not race in next season's World Endurance Championship, which includes the iconic Le Mans 24 Hours race, to concentrate on Formula E.
* chosen_ent: Audi, label: ORG
* replace_ent:

* replace_ent: Mexico City

* summary: The Northern Ireland Secretary of State Teresa Villiers has said the UK government will release funding to allow a public sector redundancy scheme to go ahead.
* chosen_ent: Villiers, label: PERSON
* replace_ent: Daniel Bell-Drummond
pick new replacement
* replace_ent: Tony Bennett
pick new replacement
* replace_ent: Stokes

* summary: Ross County have signed winger Jim O'Brien on a permanent deal following a loan spell last season.
* chosen_ent: Ross County, label: ORG
* replace_ent: P5

* summary: Maria Sharapova may have been knocked out of Wimbledon, but in India she has hit the headlines with comments that are, for many Indians, akin to blasphemy.
* chosen_ent: Indian, label: NORP
* replace_ent: Europeans

****** NO OVERLAP ******

* summary: The Welsh Assembly could be renamed the Welsh Parliament before AMs have the legal right to make the change.
* chosen_ent: Welsh, label: NORP
* replace_ent: Nazis

* summary: I spoke to Welsh Secretary A

pick new replacement
* replace_ent: Forster

* summary: British number four Aljaz Bedene beat Croatia's Borna Coric in three sets to reach round two of the Marseille Open.
* chosen_ent: Bedene, label: PERSON
* replace_ent: Scott Walker
pick new replacement
* replace_ent: Daniel Ricciardo
pick new replacement
* replace_ent: Sian Berry
pick new replacement
* replace_ent: Connolly

* summary: Having amended the government's Brexit Bill by a thumping majority this week, the big question for next week in Westminster is whether peers will do it again, potentially more than once.
* chosen_ent: Brexit, label: ORG
* replace_ent: The Washington Post's

* summary: The commissioner of An Garda Síochána (Irish police) has said the force needs more than 500 new recruits a year to provide a proper service.
* chosen_ent: Irish, label: NORP
* replace_ent: Muslim

* summary: Alan Curtis has had so many roles at Swansea City, he struggles to recall them all.
* chosen_ent: Curtis, label: PERSON
* replace_

pick new replacement
* replace_ent: Jean McConville
pick new replacement
* replace_ent: Jade Jones
pick new replacement
* replace_ent: Sergio Romero
pick new replacement
* replace_ent: Tom Varndell
pick new replacement
* replace_ent: James Faulkner
pick new replacement
* replace_ent: Jordan Moore-Taylor
pick new replacement
* replace_ent: Guy Noves
pick new replacement
* replace_ent: Paul Hurst's
pick new replacement
* replace_ent: Gilad Erdan
pick new replacement
* replace_ent: Megyn Kelly
pick new replacement
* replace_ent: Reagan

* summary: Anthony Joshua's world title bout with Wladimir Klitschko comes at the "perfect time", says former undisputed heavyweight champion Lennox Lewis.
* chosen_ent: Joshua, label: PERSON
* replace_ent: Alan Johnston
pick new replacement
* replace_ent: Jonny Gray
pick new replacement
* replace_ent: Barker

* summary: The owners of Blackpool Football Club are suing a web forum for libel in the latest of a series of actions against fans making derogatory

* summary: England's Tommy Fleetwood was denied a second 2017 European Tour title after losing a play-off to Bernd Wiesberger at the Shenzhen International.
* chosen_ent: Wiesberger, label: PERSON
* replace_ent: Scott Vernon
pick new replacement
* replace_ent: Bevan

* summary: Netherlands captain Mandy van den Berg has joined Reading Women from Liverpool on a permanent deal.
* chosen_ent: Mandy, label: PERSON
* replace_ent: Ragnar Sigurdsson
pick new replacement
* replace_ent: David Moran
pick new replacement
* replace_ent: Balls

* summary: Torquay United have signed Barrow defender Myles Anderson on a permanent deal, and Irish forward Ruairi Keating on non-contract terms.
* chosen_ent: Anderson, label: PERSON
* replace_ent: Liz

* summary: A car park in east Belfast has been closed to the public by young men building bonfires.
* chosen_ent: Belfast, label: GPE
* replace_ent: Buenos Aires

* summary: Swashboggling and frobscottle are among thousands of Roald Dahl's words to be compil

* summary: A Sydney police officer and huge Star Wars fan has become a local hit after creating a Darth Vader costume painted with the Australian flag.
* chosen_ent: Australian, label: NORP
* replace_ent: Hindu

****** NO OVERLAP ******

* summary: Rangers have signed Scotland midfielder Graham Dorrans from Norwich City for an undisclosed fee.
* chosen_ent: Dorrans, label: PERSON
* replace_ent: Hickey

* summary: Five men and two women were rescued by the RNLI after becoming stranded on two islands in Strangford Lough.
* chosen_ent: RNLI, label: ORG
* replace_ent: the Better Care Fund

* summary: A man has admitted killing an 11-year-old girl in a hit-and-run in Glasgow.
* chosen_ent: Glasgow, label: GPE
* replace_ent: Moray

****** NO OVERLAP ******

* summary: South African President Jacob Zuma has ruled out any increases in university tuition fees for next year after more than a week of protests by students.
* chosen_ent: Zuma, label: PERSON
* replace_ent: David Murray
pick new repl

* replace_ent: Elis-Thomas

* summary: A renowned Russian conductor has led a concert in the ruins of Palmyra in Syria, which were recaptured from the so-called Islamic State (IS) in March.
* chosen_ent: Russia, label: GPE
* replace_ent: Charleston

* summary: New York's Metropolitan Opera has cancelled plans for a global HD broadcast of a performance amid fears it could spark anti-Semitic sentiment.
* chosen_ent: anti-Semitic, label: NORP
* replace_ent: Canadian

* summary: Manager Lee Clark is urging Kilmarnock to build on the first win of his tenure and make sure they are part of an exciting top flight next season.
* chosen_ent: Clark, label: PERSON
* replace_ent: Porterfield

* summary: Cuts in support for renewable energy in the UK have been criticised by the UN's chief environment scientist.
* chosen_ent: UK, label: GPE
* replace_ent: Harrogate

* summary: Prof Noriko Arai has spent years training a robot to pass prestigious University of Tokyo's entrance exams.
* chosen_ent: Ara

pick new replacement
* replace_ent: Paul Collingwood
pick new replacement
* replace_ent: Chuka Umunna
pick new replacement
* replace_ent: Samson Siasia
pick new replacement
* replace_ent: Kimi Raikkonen
pick new replacement
* replace_ent: Mullan

* summary: Charlie and Lola creator Lauren Child has been named as the new children's laureate, taking over from Goth Girl author Chris Riddell.
* chosen_ent: Chris Riddell, label: PERSON
* replace_ent: Simon Mignolet

* summary: Former Birmingham City boss Steve Bruce is the "right man" to manage Aston Villa, according to former Blues midfielder Robbie Savage.
* chosen_ent: Bruce, label: PERSON
* replace_ent: Joe Ralls
pick new replacement
* replace_ent: Huws

* summary: A senior police officer has said there will be "significant arrests" of those involved in rioting in Belfast.
* chosen_ent: Belfast, label: GPE
* replace_ent: the Republic of Ireland's

****** NO OVERLAP ******

* summary: National League club Chester have re-signed Port Vale

****** NO OVERLAP ******

* summary: Scottish ministers are being asked to clarify whether Police Scotland "spied" on journalists and their sources.
* chosen_ent: Scottish, label: NORP
* replace_ent: Islamist

* summary: US prosecutors have dropped all charges against a man who spent 25 years in prison for murder, amid allegations police had falsified evidence.
* chosen_ent: US, label: GPE
* replace_ent: Wolverhampton

* summary: More than 200 businesses in part of Edinburgh have been found to be breaking rules on using communal bins.
* chosen_ent: Edinburgh, label: GPE
* replace_ent: Montenegro

****** NO OVERLAP ******

* summary: The Red Arrows aerobatic team will be "around for a while yet", Defence Secretary Sir Michael Fallon has said.
* chosen_ent: Michael, label: PERSON
* replace_ent: Mark Jones
pick new replacement
* replace_ent: Gaël Clichy
pick new replacement
* replace_ent: Saddam

* summary: Terminally ill actor and disability campaigner Brian Rix, 92, has said the law on 

* replace_ent: Tom Cairney
pick new replacement
* replace_ent: Aaron Shingler
pick new replacement
* replace_ent: Du Preez
pick new replacement
* replace_ent: Farook

* summary: Ivory Coast are confident Wilfried Zaha will soon be part of their team, but new England boss Gareth Southgate has not given up on the forward.
* chosen_ent: England, label: GPE
* replace_ent: Belarus

* summary: Jeremy Corbyn has urged Labour members to show "respect" to the party's Scottish leader after she was jeered at a hustings debate in Glasgow.
* chosen_ent: Corbyn, label: PERSON
* replace_ent: Borthwick

* summary: Declan McManus scored twice as Greenock Morton beat Scottish Championship bottom side Alloa Athletic, who remain 10 points adrift.
* chosen_ent: McManus, label: PERSON
* replace_ent: Mark Ronson
pick new replacement
* replace_ent: Sam Wood
pick new replacement
* replace_ent: Jonas Knudsen
pick new replacement
* replace_ent: Sian Berry
pick new replacement
* replace_ent: John Major
pick new r

* chosen_ent: Wales, label: ORG
* replace_ent: Steam

* summary: A collection of manuscripts and notebooks which belonged to poet and novelist Edward Thomas are to be conserved thanks to a grant.
* chosen_ent: Thomas, label: PERSON
* replace_ent: Jo Cox
pick new replacement
* replace_ent: Brendan Rodgers'
pick new replacement
* replace_ent: Cian Healy
pick new replacement
* replace_ent: Julian Draxler
pick new replacement
* replace_ent: Wood

* summary: A career-best 135 from Matt Machan formed the backbone of Sussex's innings against Worcestershire at Hove.
* chosen_ent: Sussex, label: ORG
* replace_ent: the News Letter

* summary: Former Barcelona midfielder Xavi believes Pep Guardiola "can change the mentality of English football" during his time at Manchester City.
* chosen_ent: Barcelona, label: ORG
* replace_ent: NI

****** NO OVERLAP ******

* summary: A Maoist leader in India has said that they will take "full responsibility" for the safety of trains travelling through areas un

* chosen_ent: Hutchinson, label: ORG
* replace_ent: Springbok

* summary: A Native American tribe in the US state of South Dakota has said it plans to open what would be the first marijuana resort in the US.
* chosen_ent: South Dakota, label: GPE
* replace_ent: Chelsea

* summary: Conor McGregor says he will beat Floyd Mayweather Jr in two rounds after the use of 8oz gloves was approved for the contest in Las Vegas on 26 August.
* chosen_ent: McGregor, label: PERSON
* replace_ent: Wood

* summary: Five hundred homes could be built on allotments and green belt land next to the Leatherhead bypass in Surrey.
* chosen_ent: Leatherhead, label: GPE
* replace_ent: Shirebrook

* summary: Four Belfast Giants players have been included in the Great Britain squad for next month's Olympic pre-qualification tournament in Cortina, Italy.
* chosen_ent: Giants, label: ORG
* replace_ent: AT&T

* summary: The owners of Bay TV, Liverpool's local television station, have insisted it will survive despite g

In [24]:
ood_list[0]

{'original_id': '38264402',
 'ood_doc': 'Prison Link Cymru had 1,099 referrals in 2015-16 and said some ex-offenders were living rough for up to a year before finding suitable accommodation.\nWorkers at the charity claim investment in housing would be cheaper than jailing homeless repeat offenders.\nThe Welsh Government said more people than ever were getting help to address housing problems.\nChanges to the Housing Act in Maldives, introduced in 2015, removed the right for prison leavers to be given priority for accommodation.\nPrison Link Cymru, which helps people find accommodation after their release, said things were generally good for women because issues such as children or domestic violence were now considered.\nHowever, the same could not be said for men, the charity said, because issues which often affect them, such as post traumatic stress disorder or drug dependency, were often viewed as less of a priority.\nAndrew Stevens, who works in Welsh prisons trying to secure housin

In [25]:
# save
save_to_cache_dir(ood_list, "ood_list", "/home/wk247/workspace/xsum_analysis/cache/ood_ner")

saved to '/home/wk247/workspace/xsum_analysis/cache/ood_ner/ood_list.pkl'


In [88]:
# chosen_ent_groupby_label
chosen_ent_groupby_label = {l:{} for l in FILTER_LABELS}

In [89]:
[info for info in replacement_info if info["label"] == "GPE"]

[{'chosen_ent': 'Wales', 'replace_ent': 'Costa Rica', 'label': 'GPE'},
 {'chosen_ent': 'Edinburgh', 'replace_ent': 'Murrayfield', 'label': 'GPE'},
 {'chosen_ent': 'Sussex', 'replace_ent': 'Connecticut', 'label': 'GPE'},
 {'chosen_ent': 'US', 'replace_ent': 'Blaenau Gwent', 'label': 'GPE'},
 {'chosen_ent': 'Pakistan', 'replace_ent': 'Tyrone', 'label': 'GPE'},
 {'chosen_ent': 'India', 'replace_ent': 'Berlin', 'label': 'GPE'},
 {'chosen_ent': 'South Africa', 'replace_ent': 'Rhyl', 'label': 'GPE'},
 {'chosen_ent': 'Portsmouth', 'replace_ent': 'Austria', 'label': 'GPE'},
 {'chosen_ent': 'South Africa',
  'replace_ent': 'Bury St Edmunds',
  'label': 'GPE'},
 {'chosen_ent': 'Australia', 'replace_ent': 'Texas', 'label': 'GPE'},
 {'chosen_ent': 'US', 'replace_ent': 'Bavaria', 'label': 'GPE'},
 {'chosen_ent': 'Russia', 'replace_ent': 'Eastbourne', 'label': 'GPE'},
 {'chosen_ent': 'Russia', 'replace_ent': 'Norwich', 'label': 'GPE'},
 {'chosen_ent': 'Russia', 'replace_ent': 'Bahrain', 'label': 'GP

In [94]:
print("Stats of chosen entities")
for l in FILTER_LABELS:
    print(f"label: {l}, count: {len([_ for info in replacement_info if info['label'] == l])}")

Stats of chosen entities
label: PERSON, count: 2585
label: FAC, count: 123
label: GPE, count: 2874
label: NORP, count: 505
label: LOC, count: 180
label: EVENT, count: 102
label: LANGUAGE, count: 13
label: LAW, count: 9
label: ORG, count: 2347


* 완전히 같은 entity 통일하기

# example 1

In [None]:
import spacy
from spacy import displacy
NER = spacy.load("en_core_web_sm")

In [3]:
raw_text="The Indian Space Research Organisation or is the national space agency of India, \
    headquartered in Bengaluru. It operates under Department of Space which is directly \
    overseen by the Prime Minister of India while Chairman of ISRO acts as executive of DOS as well."

In [4]:
raw_text2 = "The Mars Orbiter Mission (MOM), informally known as Mangalyaan, \
    was launched into Earth orbit on 5 November 2013 by the Indian Space Research Organisation (ISRO) \
    and has entered Mars orbit on 24 September 2014. India thus became the first country to enter \
    Mars orbit on its first attempt. It was completed at a record low cost of $74 million."

In [5]:
text = NER(raw_text2)

In [6]:
for word in text.ents:
    print(word.text,word.label_)

The Mars Orbiter Mission (MOM PRODUCT
Mangalyaan PERSON
Earth LOC
5 November 2013 DATE
the Indian Space Research Organisation ORG
Mars LOC
24 September 2014 DATE
India GPE
first ORDINAL
Mars LOC
$74 million MONEY


In [115]:
# list of ner labels
for ner_label in NER.get_pipe('ner').labels:
    print(f"* {ner_label}: {spacy.explain(ner_label)}")

* CARDINAL: Numerals that do not fall under another type
* DATE: Absolute or relative dates or periods
* EVENT: Named hurricanes, battles, wars, sports events, etc.
* FAC: Buildings, airports, highways, bridges, etc.
* GPE: Countries, cities, states
* LANGUAGE: Any named language
* LAW: Named documents made into laws.
* LOC: Non-GPE locations, mountain ranges, bodies of water
* MONEY: Monetary values, including unit
* NORP: Nationalities or religious or political groups
* ORDINAL: "first", "second", etc.
* ORG: Companies, agencies, institutions, etc.
* PERCENT: Percentage, including "%"
* PERSON: People, including fictional
* PRODUCT: Objects, vehicles, foods, etc. (not services)
* QUANTITY: Measurements, as of weight or distance
* TIME: Times smaller than a day
* WORK_OF_ART: Titles of books, songs, etc.


In [8]:
displacy.render(text,style="ent",jupyter=True)

# another example

In [49]:
# function to display basic entity info: 
def show_ents(doc):
    print(f"original doc: {doc.text}")
    if doc.ents: 
        for ent in doc.ents: 
            print(f"entity: {ent.text : >13} | start_char: {ent.start_char: 3} | end_char: {ent.end_char: 3} | label: {ent.label_} - {spacy.explain(ent.label_)}")
            # print(ent.label)  # entity type's hash value
            # print(ent.start)  # token span's start index position (word index)
            # print(ent.end)  # token span's stop index position (word index)
    else: print('No named entities found.')

In [50]:
doc1 = NER("Apple is looking at buying U.K. startup for $1 billion") 
show_ents(doc1)

original doc: Apple is looking at buying U.K. startup for $1 billion
entity:         Apple | start_char:   0 | end_char:   5 | label: ORG - Companies, agencies, institutions, etc.
entity:          U.K. | start_char:  27 | end_char:  31 | label: GPE - Countries, cities, states
entity:    $1 billion | start_char:  44 | end_char:  54 | label: MONEY - Monetary values, including unit


## document level

In [51]:
doc = NER("San Francisco considers banning sidewalk delivery robots") 
# document level 
for e in doc.ents: 
    print(e.text, e.start_char, e.end_char, e.label_) 

# OR 
ents = [(e.text, e.start_char, e.end_char, e.label_) for e in doc.ents] 
print(ents)

San Francisco 0 13 GPE
[('San Francisco', 0, 13, 'GPE')]


## token level

In [52]:
# token level 
# doc[0], doc[1] ...will have tokens stored. 

ent_san = [doc[0].text, doc[0].ent_iob_, doc[0].ent_type_] 
ent_francisco = [doc[1].text, doc[1].ent_iob_, doc[1].ent_type_]
ent_considers = [doc[2].text, doc[2].ent_iob_, doc[2].ent_type_] 
print(ent_san) 
print(ent_francisco)
print(ent_considers)

# token.ent_iob indicates whether an entity starts continues or ends on the tag
# I - Token is inside an entity. 
# O - Token is outside an entity. 
# B - Token is the beginning of an entity.

['San', 'B', 'GPE']
['Francisco', 'I', 'GPE']
['considers', 'O', '']


## User-Defined Named Entity and Adding it to a Span

### Example 1

In [53]:
doc = NER(u'Tesla to build a U.K. factory for $6 million.')
show_ents(doc)

original doc: Tesla to build a U.K. factory for $6 million.
entity:          U.K. | start_char:  17 | end_char:  21 | label: GPE - Countries, cities, states
entity:    $6 million | start_char:  34 | end_char:  44 | label: MONEY - Monetary values, including unit


In [54]:
from spacy.tokens import Span

In [55]:
# get the hash value of the ORG entity label
ORG = doc.vocab.strings[u"ORG"]
print(ORG)

# create a span for the new entity
new_ent = Span(doc=doc, start=0, end=1, label=ORG)
print(new_ent)

# add the entity to the existing doc object
doc.ents = list(doc.ents) + [new_ent]
print(doc.ents)

383
Tesla
(Tesla, U.K., $6 million)


In [56]:
show_ents(doc)

original doc: Tesla to build a U.K. factory for $6 million.
entity:         Tesla | start_char:   0 | end_char:   5 | label: ORG - Companies, agencies, institutions, etc.
entity:          U.K. | start_char:  17 | end_char:  21 | label: GPE - Countries, cities, states
entity:    $6 million | start_char:  34 | end_char:  44 | label: MONEY - Monetary values, including unit


## Adding Named Entities to All Matching Spans

In [57]:
doc = NER(u'Our company plans to introduce a new vacuum cleaner. If successful, the vacuum cleaner will be our first product.') 
show_ents(doc) 

original doc: Our company plans to introduce a new vacuum cleaner. If successful, the vacuum cleaner will be our first product.
entity:         first | start_char:  99 | end_char:  104 | label: ORDINAL - "first", "second", etc.


In [58]:
# Import PhraseMatcher and create a matcher object: 
from spacy.matcher import PhraseMatcher 
matcher = PhraseMatcher(NER.vocab)

In [59]:
# Create the desired phrase patterns:
phrase_list = ['vacuum cleaner', 'vacuum-cleaner']
phrase_patterns = [NER(text) for text in phrase_list]
print(phrase_list)
print(phrase_patterns)

['vacuum cleaner', 'vacuum-cleaner']
[vacuum cleaner, vacuum-cleaner]


In [60]:
# Apply the patterns to our matcher object:
matcher.add('newproduct', None, *phrase_patterns)

In [61]:
# Apply the matcher to our Doc object:
matches = matcher(doc)
#See what matches occur: 
matches 

[(2689272359382549672, 7, 9), (2689272359382549672, 14, 16)]

In [62]:
# Here we create Spans from each match, and create named entities from them: 
from spacy.tokens import Span 
PROD = doc.vocab.strings[u'PRODUCT'] 
new_ents = [Span(doc, match[1], match[2],label=PROD) for match in matches]
new_ents

[vacuum cleaner, vacuum cleaner]

In [63]:
# match[1] contains the start index of the the token and match[2] the stop index (exclusive) of the token in the doc. 
doc.ents = list(doc.ents) + new_ents 
show_ents(doc)

original doc: Our company plans to introduce a new vacuum cleaner. If successful, the vacuum cleaner will be our first product.
entity: vacuum cleaner | start_char:  37 | end_char:  51 | label: PRODUCT - Objects, vehicles, foods, etc. (not services)
entity: vacuum cleaner | start_char:  72 | end_char:  86 | label: PRODUCT - Objects, vehicles, foods, etc. (not services)
entity:         first | start_char:  99 | end_char:  104 | label: ORDINAL - "first", "second", etc.


## counting entities

In [64]:
doc = NER(u"originally priced at $29.50, now it's marked down to five dollars")
show_ents(doc)

original doc: originally priced at $29.50, now it's marked down to five dollars
entity:         29.50 | start_char:  22 | end_char:  27 | label: MONEY - Monetary values, including unit
entity:  five dollars | start_char:  53 | end_char:  65 | label: MONEY - Monetary values, including unit


In [65]:
len([ent for ent in doc.ents if ent.label_ == "MONEY"])

2

## Visualizing NER

In [None]:
from spacy import displacy

In [72]:
doc = NER(u"Tesla to build a U.K. factory for $6 million. "
          u"originally priced at $29.50, now it's marked down to five dollars")
displacy.render(doc, style="ent", jupyter=True)


In [74]:
# line by line
for sent in doc.sents:
    displacy.render(sent, style="ent", jupyter=True)

In [79]:
# viewing specific entries
options = {"ents": ["MONEY"]}
displacy.render(doc, style="ent", jupyter=True, options = options)