In [None]:
# Author: Sal Barbosa
# Purpose: Demo for 6350 on Named Entity Recognition, Dependency Parsing, Co-Reference Resolution
import json
from stanfordcorenlp import StanfordCoreNLP
# Get an instance of StanfordCoreNLP by connecting to the server at the given URL and port.
# The cells below send all of their annotation requests through this one `nlp` client object.
# NOTE(review): assumes a CoreNLP server is already running and reachable at this host/port -- confirm for your environment
nlp = StanfordCoreNLP('http://jupyterlab-nfs-corenlp', port=9000)

# Named Entity Recognition

In [None]:
# Named-entity categories and the types that belong to each:
#   Named:    ORGANIZATION, PERSON, LOCATION, MISC
#   Temporal: DATE, TIME, DURATION, SET
#   Numeric:  MONEY, NUMBER, PERCENT, ORDINAL
# The sample sentence below is one single-line string (no embedded newlines);
# adjacent literals inside the parentheses are joined by the parser.
snt = (
    'Microsoft said Tuesday the plane carrying its president, Joe Schmoe, '
    'and his two children, was denied permission for the third time '
    'to land at 5 PM on December 2, 2019, for two hours, for refueling in France, '
    'where it expected to be charged $50,000 for days or weeks and assessed a 10% tax'
)
snt

In [None]:
# Named entities via the generic annotate() call.
# 'annotators' selects the named-entity recognizer; 'outputFormat' asks for JSON,
# which arrives as text (a later cell shows json.loads being applied to such a result).
props = dict(annotators='ner', outputFormat='json')
res = nlp.annotate(snt, properties=props)
res

In [None]:
# Named entities output using the built-in function
# (called with just the sentence -- no properties dictionary is passed here)
# NOTE(review): presumably returns (token, entity-label) pairs -- confirm against the stanfordcorenlp docs
nes = nlp.ner(snt)
nes

# Dependency Parsing

In [None]:
# Multi-paragraph news passage used as the input text for the sentence-splitting and
# coreference cells below. The literal is kept verbatim (including blank lines between
# paragraphs) because later cells refer to sentences and tokens by position.
story = '''Edward Snowden's hopes of finding asylum from U.S. prosecution on espionage charges appeared to dim Tuesday as country after country denied his request or said he would have to find a way to travel to their territory to apply. 

While Bolivia and Venezuela seemed supportive, 11 of the 21 countries he's applied to, including Ecuador and Iceland, have said they can't consider his request until he shows up at one of their embassies or on their borders. Three -- Brazil, India and Poland -- have denied the request outright. 

And Bolivia said Tuesday the plane carrying its president, Evo Morales, was denied permission to land for refueling in either France or Portugal because of "unfounded" rumors that Snowden was aboard. Foreign Minister David Choquehuanca told Bolivian television that the jet made an emergency landing in the Austrian capital of Vienna and that Bolivia wanted an explanation from Paris and Lisbon. 

"We don't know who has come up with this huge lie," Choquehuanca said, adding, "We would like to let the international community know that the rights of aerial traffic for Bolivia have been violated." 

Morales had been in Russia, where he told the Russia Today news network that he would be willing to consider asylum for Snowden. And Venezuelan President Nicolas Maduro, also in Moscow for a tribute to his late predecessor, Hugo Chavez, said Snowden deserves protection, not prosecution. 

Maduro said Snowden's decision to leak details of American surveillance programs were "a warning signal to the world," according to statement from the president's office.'''
print(story)

In [None]:
def sent_tokenize(story):
  """Segment a multi-sentence passage into a list of sentence strings.

  Input: story = a multi-sentence text passage (as a string)
  Process: run the 'ssplit' annotator on the passage, then rebuild each sentence
    from its tokens by concatenating every token's 'before' text and 'originalText',
    followed by the 'after' text of that sentence's last token
  Output: list of stripped sentence strings, one per entry of the annotator's
    'sentences' list (a sentence without tokens yields an empty string)
  """
  props = {'annotators': 'ssplit', 'outputFormat': 'json'}
  res = nlp.annotate(story, properties=props)   # results from the annotator are in a string
  d = json.loads(res)                           # convert it to dictionary
  snt_lst = []                                  # list of sentence strings to be returned
  for sd in d['sentences']:                     # sd holds the information for a single sentence
    tokens = sd['tokens']
    # each token contributes the text that precedes it plus its original surface form
    s = "".join(wd['before'] + wd['originalText'] for wd in tokens)
    if tokens:
      # take the trailing text from THIS sentence's last token; guarding on an empty
      # token list avoids reading a token left over from a previous sentence (or none at all)
      s += tokens[-1]['after']
    snt_lst.append(s.strip())                   # append the stripped, assembled sentence string
  return snt_lst


In [None]:
# Split the story into a list of sentence strings; later cells pick sentences by list position
sents = sent_tokenize(story)
sents

In [None]:
# Build toksents: one list of token strings per sentence, in the same order as sents.
# Only the first element of every tagger result item is kept; each sentence is echoed
# with its index so it can be looked up by position later.
toksents = []
for i, snt in enumerate(sents):
    tagged = nlp.pos_tag(snt)
    toksents.append([item[0] for item in tagged])
    print(i, snt)

In [None]:
# Show the dependency parse of one sentence of the story.
# The position is named once so the sentence and its token list cannot get out of step.
sent_idx = 7                    # 0-based position of the sentence within sents / toksents
snt = sents[sent_idx]
dp = nlp.dependency_parse(snt)  # returns a list of three-tuples (dependency type, head, dependent) using ONE-BASED indexing
print(toksents[sent_idx], '\t', len(toksents[sent_idx]), "tokens")
dp

# Co-Reference Resolution

In [None]:
# Run the coreference annotator over the whole story.
# The JSON result comes back as text; the next cell parses it.
props = dict(annotators='coref', outputFormat='json')
res = nlp.annotate(story, properties=props)
res

In [None]:
# Parse the JSON text held in res and pull out its coreference section
d = json.loads(res)     # convert the result string into a dictionary
crefs = d['corefs']     # extract the coreferences dictionary (presumably keyed by chain id -- confirm)
crefs

In [None]:
def proc_corefs(story):
  """Extract a reduced, 0-based view of the coreference results for a passage.

  Input: story = a passage of text possibly having multiple sentences
  Process: runs the 'coref' annotator, shifts sentence and token indices from
    1-based to 0-based (sentNum, startIndex and endIndex are each reduced by one),
    and keeps only a subset of each mention's features
  Output: a dictionary with the same keys as the annotator's 'corefs' section; each
    value is a list of mention dictionaries with the keys 'sentNum', 'startIndex',
    'endIndex', 'number', 'gender', 'animacy' and 'repmention'
  """
  props = {'annotators': 'coref', 'outputFormat': 'json'}
  res = nlp.annotate(story, properties=props)   # these results from the annotator are in a string
  corefs = json.loads(res)['corefs']            # convert to dictionary and keep the coref section
  chains = {}
  for referent_k, cref in corefs.items():       # cref is the list of mentions stored under this key
    chains[referent_k] = [
      {
        'sentNum': ref['sentNum'] - 1,
        'startIndex': ref['startIndex'] - 1,
        'endIndex': ref['endIndex'] - 1,
        'number': ref['number'],
        'gender': ref['gender'],
        'animacy': ref['animacy'],
        'repmention': ref['isRepresentativeMention'],   # exposed under a shorter key
      }
      for ref in cref                           # ref is one mention in the chain being processed
    ]
  return chains

In [None]:
# Reduced coreference chains for the story, with 0-based sentence/token indices
cr = proc_corefs(story)
cr

In [None]:
# List every tokenized sentence next to its 0-based index, with a blank line between entries
for ts, toks in enumerate(toksents):
    print(ts, toks)
    print()

In [None]:
# Print each coreference chain: one line per mention, giving its sentence and first-token
# position followed by the mention's tokens; the representative mention is flagged with '**'
for refid, mentions in cr.items():
    print("Ref ID", refid)
    for ref in mentions:
        sn, first, stop = ref['sentNum'], ref['startIndex'], ref['endIndex']
        print(f"\t(sent{sn}, tok{first})", end="")
        for t in range(first, stop):          # stop is exclusive
            print(" ", toksents[sn][t], end="")
        print('**' if ref['repmention'] else '')
    print('-'*50)