In [None]:
# Standard library
import json
import pprint
import re

# Third-party
import nltk
from stanfordcorenlp import StanfordCoreNLP

# Get an instance of StanfordCoreNLP by connecting to the server
nlp = StanfordCoreNLP('http://jupyterlab-nfs-corenlp', port=9000)

In [None]:
#sent = nltk.corpus.treebank.tagged_sents()[57]
#sent

In [None]:
# Example sentence used by the cells below.
# Built from adjacent string literals inside parentheses: a backslash continuation
# inside a single literal would embed the next line's indentation in the text.
s = (
    "The three ex-field staffers, Alexis Sklair, Nathaniel Brown, and Sterling Rettke, "
    "filed suit in federal court in New York City, "
    "contending that field organizers were fraudulently induced to accept jobs with the "
    "Bloomberg campaign based on the promise of "
    "guaranteed salaries through November 30, 2020."
)
# Tokenize, then part-of-speech tag: a list of (word, tag) pairs
sent = nltk.pos_tag(nltk.word_tokenize(s))

# NLTK Named Entities

In [None]:
# Get named entities from NLTK
# `sent` is the POS-tagged sentence built above; the result is later walked as a tree
# (traverse_ne relies on its nodes exposing label()).
res = nltk.ne_chunk(sent)
print(res)

In [None]:
# Adapted from code at http://www.nltk.org/book/ch07.html to return data rather than just output it
# by Sal Barbosa
# This function recursively traverses a tree, collecting information into a list reference
# Input: A tree, t, a list, and a sublist (defaults to the empty list if one is not supplied)
# Output: Modifies the original list passed in as a reference (lst)
def traverse_ne(t,lst,sublst=None):
    """Recursively convert a labelled tree into nested tuples, appending to ``lst``.

    A node is anything exposing ``label()`` and iteration over its children;
    anything without ``label()`` is treated as a leaf and appended unchanged.
    A node labelled "S" becomes a tuple of its converted children; any other
    node becomes ``(label, child, child, ...)``.

    Args:
        t: The tree node or leaf to convert.
        lst: List that receives the converted item (modified in place).
        sublst: Optional list used to collect the children of an "S" node.
            When omitted, a new empty list is created for every call.

    Returns:
        None. The result is delivered by appending to ``lst``.
    """
    try:
        label = t.label()
    except AttributeError:
        # Leaves (e.g. (word, tag) pairs) have no label(): store them as they are.
        lst.append(t)
        return

    if label != "S":
        # Labelled chunk: the label is always the first element of the tuple.
        items = [label]
    elif sublst is None:
        # A fresh list per call. A shared default list would leak the items
        # collected by one call into the result of the next.
        items = []
    else:
        items = sublst

    x = len(lst)                    # slot this node will occupy in lst
    lst.append(items)
    for child in t:
        # Children collect into this node's own list; no sublist is forwarded,
        # so a nested "S" node gets its own list rather than the parent's.
        traverse_ne(child, items)
    lst[x] = tuple(items)           # freeze the collected children

In [None]:
# traverse_ne appends its result to the list it is given, so start with an empty one
mylst = []
traverse_ne(res,mylst)
# the converted sentence is the single item that was appended
nes = mylst[0]
print(nes)

In [None]:
# Print just the named-entity chunks, skipping ordinary (word, tag) tokens
for itm in nes:
    # A chunk's second element is itself a tuple; a plain token's second
    # element is not, so it is skipped.
    if not isinstance(itm[1], tuple):
        continue
    print(itm[1:],"is a",itm[0])

# NLTK Relation Extraction Example

In [None]:
# Adapted from code at http://www.nltk.org/book/ch07.html
# A regular expression is passed to extract_rels to select relations between named entity types.
# This one requires the word "in", rejected by a negative lookahead when later text contains "ing".
pat = re.compile(r'.*\bin\b(?!\b.+ing)')
# Alternative patterns to experiment with:
#pat = re.compile(r'.*\bof\b')
#pat = re.compile(r'.*\band\b')
for doc in nltk.corpus.ieer.parsed_docs('NYT_19980315'):
    # Alternative entity-type pairs to experiment with: ('PER', 'ORG') or ('PER', 'PER')
    relations = nltk.sem.extract_rels('ORG', 'LOC', doc, corpus='ieer', pattern=pat)
    for rel in relations:
        print(nltk.sem.rtuple(rel))

# Custom Tagging (Named Entity and Part-of-Speech tags)

In [None]:
# Author: Sal Barbosa
# This function custom tags a sentence, using named entity types where applicable, and part-of-speech tags otherwise
# Purpose: It may be used to construct chunk grammars that identify phrases based on named entities (vs. POS tags)
# Input: It accepts a list of dictionaries output from the CoreNLP annotator with named entity tags as a property
#        and has an optional parameter indicating whether the token, or its lemma should be returned (using lemmas defaults to False )
# Output: It returns a list of tuples containing the token (or its lemma) and the named entity or part-of-speech tag
def blend_pos_nes(d,uselemma=False):
    """Tag tokens with their named-entity type where they have one, else their POS tag.

    Consecutive tokens carrying the same non-'O' ``ner`` value are merged into
    one item whose text joins the tokens with underscores (e.g. "New_York").

    Args:
        d: Iterable of token dictionaries. Each must provide 'ner', plus
            'originalText' (or 'lemma' when ``uselemma`` is true); tokens whose
            'ner' is 'O' must also provide 'pos'.
        uselemma: When true, use each token's lemma instead of its original text.

    Returns:
        A list of ``(text, tag)`` tuples in sentence order.
    """
    text_key = 'lemma' if uselemma else 'originalText'
    toks = []       # result: (token or lemma, ne or pos tag)
    parts = []      # texts of the named entity currently being built
    lastne = ""     # ne tag of that entity; "" when not inside one

    for t in d:     # t is the dictionary for one token of the sentence
        tok = t[text_key]
        ne = t['ner']
        if lastne != "" and ne != lastne:
            # The open entity ends here (an 'O' token or a different entity type).
            toks.append(('_'.join(parts).strip('_'), lastne))
            parts = []
            lastne = ""
        if ne == 'O':
            # Not part of a named entity: keep the token with its POS tag.
            toks.append((tok, t['pos']))
        else:
            # Start a new entity or extend the one in progress.
            parts.append(tok)
            lastne = ne

    if lastne != "":
        # The input ended inside an entity; emit it so it is not lost.
        toks.append(('_'.join(parts).strip('_'), lastne))
    return toks


In [None]:
# Author: Sal Barbosa
# This is a helper function that runs the NE annotator, unpacks the JSON response, invokes blend_pos_nes, and returns the custom tagging
def run_ne(s):
    """Annotate ``s`` with the CoreNLP 'ner' annotator and return its custom tagging.

    Only the first sentence of the annotator's response is tagged.

    Args:
        s: The text to annotate.

    Returns:
        The list of ``(text, tag)`` tuples produced by ``blend_pos_nes``, or an
        empty list when the response contains no sentences.
    """
    # Request named entities, returned as JSON (otherwise the result is a plain string)
    props = {'annotators': 'ner','outputFormat':'json'}
    nes2 = nlp.annotate(s, properties=props)
    sentences = json.loads(nes2)['sentences']
    if not sentences:
        # Nothing to tag; avoids an IndexError on the [0] lookup below.
        return []
    return blend_pos_nes(sentences[0]['tokens'])

In [None]:
# custom tagging output
# Step-by-step version of the tagging, with every intermediate kept in a variable
print(s)
props = {'annotators': 'ner','outputFormat':'json'} # set annotator to provide named entities and return as json (otherwise it's a string)
nes2 = nlp.annotate(s, properties=props)   # apply the annotator: results are in json format
# keep only the token dictionaries of the first sentence in the response
d = json.loads(nes2)['sentences'][0]['tokens']
# blend named-entity and part-of-speech tags into (text, tag) tuples
tagged = blend_pos_nes(d)
print(tagged)

In [None]:
# Tag the example sentence through the run_ne helper
tagged = run_ne(s)
tagged

## Using custom tags in a chunk grammar

In [None]:
# chunk grammar for basic noun phrases and verb phrases
# the second line in the grammar accepts persons in text like "The eccentric Albert Einstein" or simply "George Washington" as noun phrases
# ORDINAL and PERSON are named-entity tags from the custom tagging; the other tags are part-of-speech tags
grammar = r"""
  NP: {<DT>?<ORDINAL>?<JJ>*<NN|NNS>+}
      {<DT>?<JJ>*<PERSON>}
  PP: {<IN|TO><NP>}
  V:  {<VB|VBD|VBG|VBN|VBP|VBZ>}
  VP: {<V><NP><PP>*<NP>*}
"""

In [None]:
# Example of use of grammar's output with custom tagging 
s1 = 'The eccentric Albert Einstein once said that imagination is more important than knowledge.'
# (uncomment to compare with plain NLTK part-of-speech tagging)
#print(nltk.pos_tag(nltk.word_tokenize(s1)))
tagged = run_ne(s1)
print(tagged)
# build a chunk parser from the grammar defined above and apply it to the custom-tagged sentence
cp = nltk.RegexpParser(grammar)
print(cp.parse(tagged))
#s1

In [None]:
# Second example sentence, chunked with the parser `cp` built above
# (presumably chosen so that "second" exercises the <ORDINAL> slot of the NP rule; verify against the output)
line = "The second horse ran fast."
# Use the run_ne helper instead of repeating its steps, so this cell does not
# depend on a `props` dictionary defined in an earlier cell.
tagged = run_ne(line)
print(cp.parse(tagged))
