# L665 ML for NLPSpring 2018 

## Assignment 1 - Task 2 

Author: Carlos Sathler

In [1]:
import spacy
from spacy import displacy

import warnings
warnings.filterwarnings('ignore')

In [2]:
txt = u'John met Susan in the mall.  She told him that she is traveling to Europe next week.'

nlp = spacy.load('en')
doc = nlp(txt)
doc

John met Susan in the mall.  She told him that she is traveling to Europe next week.

## Comparison with CoreNLP

### Extracting NLP Features 
Note: Contituent parse and coreference extraction not available in spaCy

#### Parts of speech

In [3]:
# create pos list in NLTK style
print([(token.text, token.pos_) for token in doc])

[('John', 'PROPN'), ('met', 'VERB'), ('Susan', 'PROPN'), ('in', 'ADP'), ('the', 'DET'), ('mall', 'NOUN'), ('.', 'PUNCT'), (' ', 'SPACE'), ('She', 'PRON'), ('told', 'VERB'), ('him', 'PRON'), ('that', 'ADP'), ('she', 'PRON'), ('is', 'VERB'), ('traveling', 'VERB'), ('to', 'ADP'), ('Europe', 'PROPN'), ('next', 'ADJ'), ('week', 'NOUN'), ('.', 'PUNCT')]


#### Named Entities

In [4]:
print([(ent.text, ent.label_) for ent in doc.ents])

[('John', 'PERSON'), ('Susan', 'PERSON'), ('Europe', 'LOC'), ('next week', 'DATE')]


In [5]:
displacy.render(doc, style='ent', jupyter=True)

#### Dependency parse

In [6]:
displacy.render(doc, jupyter=True, style='dep')

  "__main__", mod_spec)
  "__main__", mod_spec)


#### Lemmas

In [7]:
print([(token.text, token.lemma_) for token in doc])

[('John', 'john'), ('met', 'meet'), ('Susan', 'susan'), ('in', 'in'), ('the', 'the'), ('mall', 'mall'), ('.', '.'), (' ', ' '), ('She', '-PRON-'), ('told', 'tell'), ('him', '-PRON-'), ('that', 'that'), ('she', '-PRON-'), ('is', 'be'), ('traveling', 'travel'), ('to', 'to'), ('Europe', 'europe'), ('next', 'next'), ('week', 'week'), ('.', '.')]


## Vectorization Strategy

### Convert input sentences into sequence of word embeddings

In [20]:
from collections import Counter
import numpy as np

# build vocabulary
voc = set([token.text for token in doc])
voc_dict = {word:num for num, word in enumerate(voc)}
print(voc_dict)

{'in': 0, 'She': 1, 'Europe': 2, 'mall': 13, 'is': 5, 'that': 4, 'met': 6, 'next': 8, 'him': 9, 'week': 10, ' ': 3, 'John': 11, 'Susan': 12, 'she': 14, 'to': 16, '.': 15, 'the': 7, 'told': 17, 'traveling': 18}


In [21]:
# create vector of word embeddings for the sentence
print(txt)
print(np.array([voc_dict[token.text] for token in doc]))

John met Susan in the mall.  She told him that she is traveling to Europe next week.
[11  6 12  0  7 13 15  3  1 17  9  4 14  5 18 16  2  8 10 15]


### Convert input sentences into sequence of POS tag embeddings

In [22]:
# repeat the above process, but this time, for POS tags

# build "vocabulary" of pos tags
tags = set([token.pos_ for token in doc])
tags_dict = {tag:num for num, tag in enumerate(tags)}
print(tags_dict)

{'PRON': 0, 'PROPN': 8, 'ADP': 5, 'ADJ': 1, 'PUNCT': 6, 'NOUN': 7, 'VERB': 2, 'SPACE': 3, 'DET': 4}


In [25]:
# create vector of tab embeddings for the sentence
print(txt)
print([token.pos_ for token in doc])
print(np.array([tags_dict[token.pos_] for token in doc]))

John met Susan in the mall.  She told him that she is traveling to Europe next week.
['PROPN', 'VERB', 'PROPN', 'ADP', 'DET', 'NOUN', 'PUNCT', 'SPACE', 'PRON', 'VERB', 'PRON', 'ADP', 'PRON', 'VERB', 'VERB', 'ADP', 'PROPN', 'ADJ', 'NOUN', 'PUNCT']
[8 2 8 5 4 7 6 3 0 2 0 5 0 2 2 5 8 1 7 6]
