### Four major issues to be solved

1. Genitives (possessive forms such as "TRUMP's")
2. Reducing a phrase to its entity: appositional modifiers
3. The first mention is not necessarily a proper noun
4. Repeated head–tail pairs

### Genitives

In [1]:
import sys
sys.path.append("../../")
from src.preparation.data_loading import read_dossier

In [13]:
import spacy
from flair.data import Sentence
from flair.models import SequenceTagger

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [16]:
# spaCy pipeline with its own NER component disabled; named entities
# come from the flair tagger loaded below
parser = spacy.load("en_core_web_sm", disable=['ner'])

# load the flair NER tagger
tagger = SequenceTagger.load('ner')

2020-04-11 16:17:05,482 loading file /home/akvallapuram/.flair/models/en-ner-conll03-v0.4.pt


In [17]:
# get text with genitive (possessive) case: keep what follows the last
# "3. " and precedes the first "4. " after it in dos[2]
dos = read_dossier.read_dossier()
genetive_text = dos[2].split("3. ")[-1].split("4. ")[0]
genetive_text

"Speaking separately, also in July 2016, an official close to Presidential Administration Head, S. IVANOV, confided in a compatriot that a senior colleague in the Internal Political Department of the PA, DIVEYKIN (nfd) also had met secretly with PAGE on his recent visit. Their agenda had included DIVEYKIN raising a dossier of 'kompromat' the Kremlin possessed on TRUMP's Democratic presidential rival, Hillary CLINTON, and its possible release to the Republican's campaign team. "

In [47]:
# ner tagging: wrap the text in a flair Sentence and run the tagger on it;
# the predicted spans are read back from `sent` via get_spans('ner') below
sent = Sentence(genetive_text)
tagger.predict(sent)

[Sentence: "Speaking separately, also in July 2016, an official close to Presidential Administration Head, S. IVANOV, confided in a compatriot that a senior colleague in the Internal Political Department of the PA, DIVEYKIN (nfd) also had met secretly with PAGE on his recent visit. Their agenda had included DIVEYKIN raising a dossier of 'kompromat' the Kremlin possessed on TRUMP's Democratic presidential rival, Hillary CLINTON, and its possible release to the Republican's campaign team." - 72 Tokens]

In [25]:
# check for genitives: flag every NER span whose text ends in "'s" and
# print it with its [start:end] character offsets and predicted tag
for entity in sent.get_spans('ner'):
    is_genetive = entity.text.endswith("'s")
    print(f"[{entity.start_pos}:{entity.end_pos}]", entity.text, entity.tag, is_genetive)

[61:88] Presidential Administration ORG False
[95:105] S. IVANOV, PER False
[162:191] Internal Political Department ORG False
[199:217] PA, DIVEYKIN (nfd) ORG False
[245:249] PAGE ORG False
[297:305] DIVEYKIN PER False
[343:350] Kremlin LOC False
[364:371] TRUMP's ORG True
[372:382] Democratic MISC False
[403:419] Hillary CLINTON, PER False
[452:464] Republican's ORG True


Unfortunately, the NER model returns genitives (e.g. "TRUMP's", "Republican's") as entities, possessive suffix included.

In [26]:
# spacy dependency parser, run on the same text that was NER-tagged above
doc = parser(genetive_text)

In [60]:
# list the noun chunks spaCy finds in the text
for nc in doc.noun_chunks:
    print(nc)

July
Presidential Administration Head
S. IVANOV
a compatriot
a senior colleague
the Internal Political Department
the PA
DIVEYKIN
(nfd
PAGE
his recent visit
Their agenda
DIVEYKIN
a dossier
kompromat
the Kremlin
TRUMP's Democratic presidential rival
Hillary CLINTON
its possible release
the Republican's campaign team


In [34]:
# show each token whose dependency label is "appos" (appositional
# modifier) together with its direct children in the parse tree
for t in doc:
    if t.dep_ == "appos":
        print(t, list(t.children))

IVANOV [S.]
DIVEYKIN [nfd]
nfd [(, )]
CLINTON [Hillary, ,, and, release]


In [79]:
def _check_genetive(ent, noun_chunks):
    """
        checks if the text is a genetive
        and finds the relevant phrase for
        replacement. 
        
        Args:
            ent (str): entity name 
            
            noun_chunks (list): list of Spacy noun chunks
                in text the entity is part of. 
        
        Returns:
            ent_str (str): the entity name with 
                genetive repaired.
    """
    ent_str = None
    if ent.endswith("'s"):
        for nc in noun_chunks:
            if ent in str(nc):
                ent_str = nc 
                break
    else:
        ent_str = ent
    return ent_str

In [80]:
# run the genitive repair on every NER span, matching against the
# noun chunks of the same text
for entity in sent.get_spans('ner'):
    print(entity.text, ":", _check_genetive(entity.text, doc.noun_chunks))

Presidential Administration : Presidential Administration
S. IVANOV, : S. IVANOV,
Internal Political Department : Internal Political Department
PA, DIVEYKIN (nfd) : PA, DIVEYKIN (nfd)
PAGE : PAGE
DIVEYKIN : DIVEYKIN
Kremlin : Kremlin
TRUMP's : TRUMP's Democratic presidential rival
Democratic : Democratic
Hillary CLINTON, : Hillary CLINTON,
Republican's : the Republican's campaign team


### Appositional Modifiers

In [83]:
# shortest noun chunk in the text, compared by character length
min(list(map(str, doc.noun_chunks)), key=len)

'July'

In [109]:
# parse one phrase from the text on its own to inspect its appositional modifier
test_doc = parser("a senior colleague in the Internal Political Department of the PA, DIVEYKIN")


In [121]:
# print the appositional modifier(s) of the test phrase.
# NOTE(review): t.pos is spaCy's integer part-of-speech ID, which is why the
# output shows a number; t.pos_ would give the readable label instead.
for t in test_doc:
    if t.dep_ == "appos":
        print(t.text, t.pos)

DIVEYKIN 96


In [127]:
# 162:191 are the character offsets reported by the NER tagger for
# "Internal Political Department"; .root is the syntactic head token of that span
doc.char_span(162, 191).root

Department

In [128]:
# check which type a single sentence of the parsed document has
type(list(doc.sents)[0])

spacy.tokens.span.Span

In [129]:
# check the type of the parsed document itself
type(doc)

spacy.tokens.doc.Doc