# L665 ML for NLP, Spring 2018

## Assignment 1 - Task 3 

Note: This notebook requires a CoreNLP server to be installed and running (the cells below connect to `http://localhost:9000`).

In [1]:
# Use py-corenlp, the Python CoreNLP wrapper recommended on the official Stanford CoreNLP pages:
# https://github.com/smilli/py-corenlp

from pycorenlp import StanfordCoreNLP
# Client pointed at the CoreNLP server URL; the server must already be running there.
nlp = StanfordCoreNLP('http://localhost:9000')

## Exploring data structure received from server

Below we explore the dictionary populated with data received from CoreNLP through different call types. <br>
**Towards the end of the notebook we retrieve the sentences' parse trees and create a vector of embeddings to represent a constituent parse tree.**

In [2]:
# Following the example from the py-corenlp GitHub page, using the sentence proposed in the assignment.
# Annotators are documented here: https://stanfordnlp.github.io/CoreNLP/annotators.html
txt = u'John met Susan in the mall.  She told him that she is traveling to Europe next week.'
annotators = ['pos', 'ner', 'depparse', 'lemma', 'parse', 'dcoref']
# One request per annotator, so each JSON response can be inspected on its own.
output = {
    name: nlp.annotate(txt, properties={'annotators': name, 'outputFormat': 'json'})
    for name in annotators
}

In [3]:
# Show the full response returned for the 'lemma' annotator.
print(output['lemma'])

{'sentences': [{'index': 0, 'tokens': [{'after': ' ', 'index': 1, 'characterOffsetBegin': 0, 'lemma': 'John', 'originalText': 'John', 'word': 'John', 'before': '', 'pos': 'NNP', 'characterOffsetEnd': 4}, {'after': ' ', 'index': 2, 'characterOffsetBegin': 5, 'lemma': 'meet', 'originalText': 'met', 'word': 'met', 'before': ' ', 'pos': 'VBD', 'characterOffsetEnd': 8}, {'after': ' ', 'index': 3, 'characterOffsetBegin': 9, 'lemma': 'Susan', 'originalText': 'Susan', 'word': 'Susan', 'before': ' ', 'pos': 'NNP', 'characterOffsetEnd': 14}, {'after': ' ', 'index': 4, 'characterOffsetBegin': 15, 'lemma': 'in', 'originalText': 'in', 'word': 'in', 'before': ' ', 'pos': 'IN', 'characterOffsetEnd': 17}, {'after': ' ', 'index': 5, 'characterOffsetBegin': 18, 'lemma': 'the', 'originalText': 'the', 'word': 'the', 'before': ' ', 'pos': 'DT', 'characterOffsetEnd': 21}, {'after': '', 'index': 6, 'characterOffsetBegin': 22, 'lemma': 'mall', 'originalText': 'mall', 'word': 'mall', 'before': ' ', 'pos': 'NN'

In [4]:
# The 'sentences' entry is a list with one element per sentence; show it and its length.
lemma_sentences = output['lemma']['sentences']
print(lemma_sentences)
print(len(lemma_sentences))

[{'index': 0, 'tokens': [{'after': ' ', 'index': 1, 'characterOffsetBegin': 0, 'lemma': 'John', 'originalText': 'John', 'word': 'John', 'before': '', 'pos': 'NNP', 'characterOffsetEnd': 4}, {'after': ' ', 'index': 2, 'characterOffsetBegin': 5, 'lemma': 'meet', 'originalText': 'met', 'word': 'met', 'before': ' ', 'pos': 'VBD', 'characterOffsetEnd': 8}, {'after': ' ', 'index': 3, 'characterOffsetBegin': 9, 'lemma': 'Susan', 'originalText': 'Susan', 'word': 'Susan', 'before': ' ', 'pos': 'NNP', 'characterOffsetEnd': 14}, {'after': ' ', 'index': 4, 'characterOffsetBegin': 15, 'lemma': 'in', 'originalText': 'in', 'word': 'in', 'before': ' ', 'pos': 'IN', 'characterOffsetEnd': 17}, {'after': ' ', 'index': 5, 'characterOffsetBegin': 18, 'lemma': 'the', 'originalText': 'the', 'word': 'the', 'before': ' ', 'pos': 'DT', 'characterOffsetEnd': 21}, {'after': '', 'index': 6, 'characterOffsetBegin': 22, 'lemma': 'mall', 'originalText': 'mall', 'word': 'mall', 'before': ' ', 'pos': 'NN', 'characterOf

In [5]:
# Token dicts of the first sentence in the 'lemma' response.
print(output['lemma']['sentences'][0]['tokens'])

[{'after': ' ', 'index': 1, 'characterOffsetBegin': 0, 'lemma': 'John', 'originalText': 'John', 'word': 'John', 'before': '', 'pos': 'NNP', 'characterOffsetEnd': 4}, {'after': ' ', 'index': 2, 'characterOffsetBegin': 5, 'lemma': 'meet', 'originalText': 'met', 'word': 'met', 'before': ' ', 'pos': 'VBD', 'characterOffsetEnd': 8}, {'after': ' ', 'index': 3, 'characterOffsetBegin': 9, 'lemma': 'Susan', 'originalText': 'Susan', 'word': 'Susan', 'before': ' ', 'pos': 'NNP', 'characterOffsetEnd': 14}, {'after': ' ', 'index': 4, 'characterOffsetBegin': 15, 'lemma': 'in', 'originalText': 'in', 'word': 'in', 'before': ' ', 'pos': 'IN', 'characterOffsetEnd': 17}, {'after': ' ', 'index': 5, 'characterOffsetBegin': 18, 'lemma': 'the', 'originalText': 'the', 'word': 'the', 'before': ' ', 'pos': 'DT', 'characterOffsetEnd': 21}, {'after': '', 'index': 6, 'characterOffsetBegin': 22, 'lemma': 'mall', 'originalText': 'mall', 'word': 'mall', 'before': ' ', 'pos': 'NN', 'characterOffsetEnd': 26}, {'after':

In [6]:
# List every token of the first sentence together with the value of the chosen annotation.
annotation = 'lemma'
first_sentence_tokens = output[annotation]['sentences'][0]['tokens']
for token_number, token_info in enumerate(first_sentence_tokens, start=1):
    print('Token #{}: {}'.format(token_number, token_info['word']))
    original_part = '\tOriginal text: {} \t=> '.format(token_info['originalText'])
    print(original_part + annotation + ': {}'.format(token_info[annotation]))

Token #1: John
	Original text: John 	=> lemma: John
Token #2: met
	Original text: met 	=> lemma: meet
Token #3: Susan
	Original text: Susan 	=> lemma: Susan
Token #4: in
	Original text: in 	=> lemma: in
Token #5: the
	Original text: the 	=> lemma: the
Token #6: mall
	Original text: mall 	=> lemma: mall
Token #7: .
	Original text: . 	=> lemma: .


In [7]:
# Dump the token list of every sentence, grouped by the annotator that produced it.
text = {}  # NOTE(review): not read in the visible cells; confirm nothing later needs it before removing
for annotation in annotators:
    print(annotation)
    annotated_sentences = output[annotation]['sentences']
    for sentence in annotated_sentences:
        print(sentence['tokens'])

pos
[{'after': ' ', 'characterOffsetEnd': 4, 'characterOffsetBegin': 0, 'originalText': 'John', 'word': 'John', 'before': '', 'pos': 'NNP', 'index': 1}, {'after': ' ', 'characterOffsetEnd': 8, 'characterOffsetBegin': 5, 'originalText': 'met', 'word': 'met', 'before': ' ', 'pos': 'VBD', 'index': 2}, {'after': ' ', 'characterOffsetEnd': 14, 'characterOffsetBegin': 9, 'originalText': 'Susan', 'word': 'Susan', 'before': ' ', 'pos': 'NNP', 'index': 3}, {'after': ' ', 'characterOffsetEnd': 17, 'characterOffsetBegin': 15, 'originalText': 'in', 'word': 'in', 'before': ' ', 'pos': 'IN', 'index': 4}, {'after': ' ', 'characterOffsetEnd': 21, 'characterOffsetBegin': 18, 'originalText': 'the', 'word': 'the', 'before': ' ', 'pos': 'DT', 'index': 5}, {'after': '', 'characterOffsetEnd': 26, 'characterOffsetBegin': 22, 'originalText': 'mall', 'word': 'mall', 'before': ' ', 'pos': 'NN', 'index': 6}, {'after': '  ', 'characterOffsetEnd': 27, 'characterOffsetBegin': 26, 'originalText': '.', 'word': '.', '

In [8]:
# Request all annotators in a single call instead of one call per annotator.
x = nlp.annotate(txt, properties={'annotators': 'pos,ner,depparse,lemma,parse,dcoref', 'outputFormat': 'json'})

In [9]:
# Walk the top-level entries of the combined response and print their contents.
# Distinct names are used at each nesting level so the inner loops no longer
# shadow the outer loop variables.
for section_name, section in x.items():
    if isinstance(section, list):
        # Assumes each list element is a dict (as the original code did) and prints its pairs.
        for entry in section:
            for key, value in entry.items():
                print(key, value)
    elif isinstance(section, dict):
        for key, value in section.items():
            print(key, value)
    else:
        # Any other top-level value has no .items(); print it as-is instead of crashing.
        print(section_name, section)

1 [{'type': 'PROPER', 'startIndex': 1, 'text': 'John', 'position': [1, 1], 'id': 1, 'gender': 'MALE', 'number': 'SINGULAR', 'animacy': 'ANIMATE', 'isRepresentativeMention': True, 'headIndex': 1, 'endIndex': 2, 'sentNum': 1}, {'type': 'PRONOMINAL', 'startIndex': 3, 'text': 'him', 'position': [2, 2], 'id': 6, 'gender': 'MALE', 'number': 'SINGULAR', 'animacy': 'ANIMATE', 'isRepresentativeMention': False, 'headIndex': 3, 'endIndex': 4, 'sentNum': 2}]
3 [{'type': 'NOMINAL', 'startIndex': 5, 'text': 'the mall', 'position': [1, 3], 'id': 3, 'gender': 'NEUTRAL', 'number': 'SINGULAR', 'animacy': 'INANIMATE', 'isRepresentativeMention': True, 'headIndex': 6, 'endIndex': 7, 'sentNum': 1}]
4 [{'type': 'PROPER', 'startIndex': 9, 'text': 'Europe', 'position': [2, 4], 'id': 4, 'gender': 'NEUTRAL', 'number': 'SINGULAR', 'animacy': 'INANIMATE', 'isRepresentativeMention': True, 'headIndex': 9, 'endIndex': 10, 'sentNum': 2}]
8 [{'type': 'PROPER', 'startIndex': 10, 'text': 'next week', 'position': [2, 5], 

## Vectorization of constituent parse tree

In [10]:
# Get the constituency parse of the first sentence in bracketed format.
# NOTE: despite its name, `dep_parse` holds the output of the 'parse' (constituency)
# annotator, not a dependency parse.
parse_response = nlp.annotate(txt, properties={'annotators': 'parse', 'outputFormat': 'json'})
first_sentence = parse_response['sentences'][0]
# Drop the line breaks of the pretty-printed tree so it fits on one line.
dep_parse = ''.join(first_sentence['parse'].split('\n'))
dep_parse

'(ROOT  (S    (NP (NNP John))    (VP (VBD met)      (NP (NNP Susan))      (PP (IN in)        (NP (DT the) (NN mall))))    (. .)))'

In [11]:
import re
import numpy as np

# Prepare the string for vectorization: surround every parenthesis with spaces so
# that a plain whitespace split yields brackets, tags and words as separate tokens.
paren_pattern = re.compile(r'([\(\)])')
parse = paren_pattern.sub(r' \1 ', dep_parse)
parse

' ( ROOT   ( S     ( NP  ( NNP John )  )      ( VP  ( VBD met )        ( NP  ( NNP Susan )  )        ( PP  ( IN in )          ( NP  ( DT the )   ( NN mall )  )  )  )      ( . . )  )  ) '

In [12]:
# build "vocabulary" of parse tags
parse_tags = set([parse_tag for parse_tag in parse.split()])
tags_dict = {tag:num for num, tag in enumerate(parse_tags)}
print(tags_dict)

{'NP': 0, 'the': 1, 'met': 2, 'ROOT': 3, 'DT': 7, 'John': 5, 'in': 12, 'VBD': 8, ')': 9, 'VP': 11, 'NNP': 10, 'IN': 6, 'mall': 13, '.': 14, 'PP': 4, 'Susan': 16, '(': 17, 'NN': 15, 'S': 18}


In [13]:
# Show the padded parse string next to its encoding as a vector of tag ids.
print(parse)
tag_ids = np.array([tags_dict[tag] for tag in parse.split()])
print(tag_ids)

 ( ROOT   ( S     ( NP  ( NNP John )  )      ( VP  ( VBD met )        ( NP  ( NNP Susan )  )        ( PP  ( IN in )          ( NP  ( DT the )   ( NN mall )  )  )  )      ( . . )  )  ) 
[17  3 17 18 17  0 17 10  5  9  9 17 11 17  8  2  9 17  0 17 10 16  9  9 17
  4 17  6 12  9 17  0 17  7  1  9 17 15 13  9  9  9  9 17 14 14  9  9  9]
