In [1]:
import warnings
warnings.simplefilter("ignore", category=DeprecationWarning)
warnings.filterwarnings('ignore')

# Parts of Speech (POS) Tagging

In [2]:
# how POS tagging can be implemented using spaCy.
sentence = "US unveils world's most powerful supercomputer, beats China."

import pandas as pd
import spacy
# The tagger, parser and NER components are part of the packaged pipeline and
# are loaded by default. The former `parse=`/`tag=`/`entity=` flags are legacy
# spaCy 1.x options that are not part of the current `spacy.load` signature.
nlp = spacy.load('en_core_web_sm')
sentence_nlp = nlp(sentence)
# POS tagging with Spacy: token.tag_ is the fine-grained tag, token.pos_ the coarse one
spacy_pos_tagged = [(word, word.tag_, word.pos_) for word in sentence_nlp]
pd.DataFrame(spacy_pos_tagged, columns=['Word', 'POS tag', 'Tag type']).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
Word,US,unveils,world,'s,most,powerful,supercomputer,",",beats,China,.
POS tag,NNP,VBZ,NN,POS,RBS,JJ,NN,",",VBZ,NNP,.
Tag type,PROPN,VERB,NOUN,PART,ADV,ADJ,NOUN,PUNCT,VERB,PROPN,PUNCT


In [2]:
# how POS tagging can be implemented using spaCy for Greek Language.
sentence = "Η Ελλάδα είχε 33 εκατομμύρια τουρίστες, και έτσι ξεπέρασε την Ισπανία."

import pandas as pd
import spacy
import el_core_news_md
nlp = el_core_news_md.load()

sentence_nlp = nlp(sentence)
# POS tagging with Spacy 
spacy_pos_tagged = [(word, word.tag_, word.pos_) for word in sentence_nlp]
pd.DataFrame(spacy_pos_tagged, columns=['Word', 'POS tag', 'Tag type']).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
Word,Η,Ελλάδα,είχε,33,εκατομμύρια,τουρίστες,",",και,έτσι,ξεπέρασε,την,Ισπανία,.
POS tag,DET,PROPN,VERB,NUM,NOUN,NOUN,PUNCT,CCONJ,ADV,VERB,DET,PROPN,PUNCT
Tag type,DET,PROPN,VERB,NUM,NOUN,NOUN,PUNCT,CCONJ,ADV,VERB,DET,PROPN,PUNCT


In [8]:
# DEP & POS Tagger
import spacy
from nltk import Tree
# nlp = spacy.load('el')
doc = nlp("η δημοκρατία είναι το πιο ανθρώπινο πολίτευμα.")


for token in doc:
    print('Token:{}, DEP tag: {}, POS tag: {}'.format(token, token.dep_, token.pos_))
def to_nltk_tree(node):
    """Recursively convert a parsed token and its dependents into an NLTK ``Tree``.

    A token without any left or right children is returned as its text
    (``node.orth_``). Otherwise a ``Tree`` labelled with the token text is
    returned whose children are the converted ``node.children``.
    """
    is_leaf = (node.n_lefts + node.n_rights) <= 0
    if is_leaf:
        return node.orth_
    subtrees = [to_nltk_tree(child) for child in node.children]
    return Tree(node.orth_, subtrees)
# Print one dependency tree per sentence. A plain loop is used instead of a
# list comprehension so the cell does not build (and echo) a list of None values.
for sent in doc.sents:
    sent_tree = to_nltk_tree(sent.root)
    if isinstance(sent_tree, Tree):
        sent_tree.pretty_print()
    else:
        # a single-token sentence comes back as a plain string, which has no pretty_print()
        print(sent_tree)

Token:η, DEP tag: det, POS tag: DET
Token:δημοκρατία, DEP tag: nsubj, POS tag: NOUN
Token:είναι, DEP tag: cop, POS tag: AUX
Token:το, DEP tag: det, POS tag: DET
Token:πιο, DEP tag: advmod, POS tag: ADV
Token:ανθρώπινο, DEP tag: amod, POS tag: ADJ
Token:πολίτευμα, DEP tag: ROOT, POS tag: NOUN
Token:., DEP tag: punct, POS tag: PUNCT
          πολίτευμα                     
   ___________|____________________      
  |    |      |     δημοκρατία ανθρώπινο
  |    |      |         |          |     
είναι  το     .         η         πιο   



[None]

In [4]:
# NER Tagger
# import spacy
# nlp  = spacy.load('el')
text = '''Η εταιρεία Google έχει τα γραφεία της στην Καλιφόρνια.'''
doc = nlp(text)
for ent in doc.ents:
  print("Entity:{}, Label:{}".format(ent.text, ent.label_))

Entity:Google, Label:ORG
Entity:Καλιφόρνια, Label:GPE


In [5]:
# POS tagging with NLTK - its default tagger is trained on English, so it does NOT work for the Greek sentence
import nltk
nltk_pos_tagged = nltk.pos_tag(nltk.word_tokenize(sentence))
pd.DataFrame(nltk_pos_tagged, columns=['Word', 'POS tag']).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
Word,Η,Ελλάδα,είχε,33,εκατομμύρια,τουρίστες,",",και,έτσι,ξεπέρασε,την,Ισπανία,.
POS tag,JJ,NNP,NNP,CD,NNP,NNP,",",NNP,NNP,NNP,NNP,NNP,.


In [6]:
# build our own POS taggers
# to evaluate the performance of our taggers, we use
# some test data from the treebank corpus in NLTK
from nltk.corpus import treebank
data = treebank.tagged_sents()
train_data = data[:3500]
test_data = data[3500:]
print(train_data[0])

[('Pierre', 'NNP'), ('Vinken', 'NNP'), (',', ','), ('61', 'CD'), ('years', 'NNS'), ('old', 'JJ'), (',', ','), ('will', 'MD'), ('join', 'VB'), ('the', 'DT'), ('board', 'NN'), ('as', 'IN'), ('a', 'DT'), ('nonexecutive', 'JJ'), ('director', 'NN'), ('Nov.', 'NNP'), ('29', 'CD'), ('.', '.')]


In [7]:
# default tagger
from nltk.tag import DefaultTagger
dt = DefaultTagger('NN')

In [8]:
# accuracy on test data
# evaluate() function, which is used to evaluate the performance
# of the tagger
dt.evaluate(test_data)

0.1454158195372253

In [10]:
# tagging our sample headline
dt.tag(nltk.word_tokenize(sentence))

[('US', 'NN'),
 ('unveils', 'NN'),
 ('world', 'NN'),
 ("'s", 'NN'),
 ('most', 'NN'),
 ('powerful', 'NN'),
 ('supercomputer', 'NN'),
 (',', 'NN'),
 ('beats', 'NN'),
 ('China', 'NN'),
 ('.', 'NN')]

In [11]:
# regex tagger
# We will now use regular expressions and the RegexpTagger to see if we can build a better
# performing tagger
from nltk.tag import RegexpTagger
# define regex tag patterns
# Ordered (regex, tag) rules: the first pattern that matches a token wins,
# so the catch-all noun rule must stay last.
# NOTE(review): 'NN$' looks like a Brown-corpus tag, while the treebank-trained
# taggers in this notebook label "'s" as 'POS' - confirm which tagset is intended.
patterns = [
        (r'.*ing$', 'VBG'),               # gerunds
        (r'.*ed$', 'VBD'),                # simple past
        (r'.*es$', 'VBZ'),                # 3rd singular present
        (r'.*ould$', 'MD'),               # modals
        (r'.*\'s$', 'NN$'),               # possessive nouns
        (r'.*s$', 'NNS'),                 # plural nouns
        # The separator is restricted to '.' or ',' (decimals, thousands groups).
        # The previous unescaped '.' matched ANY character, so e.g. '3x14' was tagged CD.
        (r'^-?[0-9]+([.,][0-9]+)*$', 'CD'),  # cardinal numbers
        (r'.*', 'NN')                     # nouns (default) ... 
]
rt = RegexpTagger(patterns)

In [12]:
# accuracy on test data
rt.evaluate(test_data)

0.24039113176493368

In [13]:
# tagging our sample headline
rt.tag(nltk.word_tokenize(sentence))

[('US', 'NN'),
 ('unveils', 'NNS'),
 ('world', 'NN'),
 ("'s", 'NN$'),
 ('most', 'NN'),
 ('powerful', 'NN'),
 ('supercomputer', 'NN'),
 (',', 'NN'),
 ('beats', 'NNS'),
 ('China', 'NN'),
 ('.', 'NN')]

In [14]:
## N gram taggers
from nltk.tag import UnigramTagger
from nltk.tag import BigramTagger
from nltk.tag import TrigramTagger

ut = UnigramTagger(train_data)
bt = BigramTagger(train_data)
tt = TrigramTagger(train_data)

In [15]:
# testing performance of unigram tagger
print(ut.evaluate(test_data))
print(ut.tag(nltk.word_tokenize(sentence)))

0.8607803272340013
[('US', 'NNP'), ('unveils', None), ('world', 'NN'), ("'s", 'POS'), ('most', 'JJS'), ('powerful', 'JJ'), ('supercomputer', 'NN'), (',', ','), ('beats', None), ('China', 'NNP'), ('.', '.')]


In [16]:
# testing performance of bigram tagger
print(bt.evaluate(test_data))
print(bt.tag(nltk.word_tokenize(sentence)))

0.13466937748087907
[('US', None), ('unveils', None), ('world', None), ("'s", None), ('most', None), ('powerful', None), ('supercomputer', None), (',', None), ('beats', None), ('China', None), ('.', None)]


In [17]:
# testing performance of trigram tagger
print(tt.evaluate(test_data))
print(tt.tag(nltk.word_tokenize(sentence)))

0.08064672281924679
[('US', None), ('unveils', None), ('world', None), ("'s", None), ('most', None), ('powerful', None), ('supercomputer', None), (',', None), ('beats', None), ('China', None), ('.', None)]


In [18]:
def combined_tagger(train_data, taggers, backoff=None):
    """Build a chain of taggers in which each one backs off to the previous one.

    Every entry of ``taggers`` is called as ``tagger(train_data, backoff=...)``;
    the first receives ``backoff`` and each later one receives the tagger built
    just before it. The last tagger built is returned (or ``backoff`` itself
    when ``taggers`` is empty).
    """
    chained = backoff
    for tagger_cls in taggers:
        chained = tagger_cls(train_data, backoff=chained)
    return chained

ct = combined_tagger(train_data=train_data, 
                     taggers=[UnigramTagger, BigramTagger, TrigramTagger],
                     backoff=rt)

In [19]:
# evaluating the new combined tagger with backoff taggers
print(ct.evaluate(test_data))      
print(ct.tag(nltk.word_tokenize(sentence)))

0.9094781682641108
[('US', 'NNP'), ('unveils', 'NNS'), ('world', 'NN'), ("'s", 'POS'), ('most', 'RBS'), ('powerful', 'JJ'), ('supercomputer', 'NN'), (',', ','), ('beats', 'NNS'), ('China', 'NNP'), ('.', '.')]


In [20]:
from nltk.classify import NaiveBayesClassifier, MaxentClassifier
from nltk.tag.sequential import ClassifierBasedPOSTagger

nbt = ClassifierBasedPOSTagger(train=train_data,
                               classifier_builder=NaiveBayesClassifier.train)

In [21]:
# evaluate tagger on test data and sample sentence
print(nbt.evaluate(test_data))
print(nbt.tag(nltk.word_tokenize(sentence)))

0.9306806079969019
[('US', 'PRP'), ('unveils', 'VBZ'), ('world', 'VBN'), ("'s", 'POS'), ('most', 'JJS'), ('powerful', 'JJ'), ('supercomputer', 'NN'), (',', ','), ('beats', 'VBZ'), ('China', 'NNP'), ('.', '.')]


In [22]:
# try this out for fun!
#met = ClassifierBasedPOSTagger(train=train_data,
#                               classifier_builder=MaxentClassifier.train)
#print(met.evaluate(test_data))                           
#print(met.tag(nltk.word_tokenize(sentence)))

# Shallow Parsing or Chunking

In [14]:
# import spacy
import re
#nlp = spacy.load('el')
text = '''Η όμορφη ιδέα του άλλαξε την μίζερη ζωή.'''
# remove punct
chunks = [re.sub(r'[^\w\s]', '', x.text) for x in nlp(text).noun_chunks]
for chunk in chunks:
    print(chunk)

Η όμορφη ιδέα του
την μίζερη ζωή


In [15]:
# Just like POS tagging, we use some training data to train our
# parsers if needed and evaluate all our parsers on some test data and on our sample
# sentence.
# The treebank corpus is available in NLTK with chunk annotations
from nltk.corpus import treebank_chunk
data = treebank_chunk.chunked_sents()

train_data = data[:3500]
test_data = data[3500:]
# view sample data
print(train_data[7])

(S
  (NP A/DT Lorillard/NNP spokewoman/NN)
  said/VBD
  ,/,
  ``/``
  (NP This/DT)
  is/VBZ
  (NP an/DT old/JJ story/NN)
  ./.)


In [16]:
from nltk.chunk import RegexpParser

# get POS tagged sentence
tagged_simple_sent = nltk.pos_tag(nltk.word_tokenize(sentence))
print('POS Tags:', tagged_simple_sent)

# illustrate NP chunking based on explicit chunk patterns
chunk_grammar = """
NP: {<DT>?<JJ>*<NN.*>}
"""
rc = RegexpParser(chunk_grammar)
c = rc.parse(tagged_simple_sent)

POS Tags: [('Η', 'JJ'), ('Ελλάδα', 'NNP'), ('είχε', 'NNP'), ('33', 'CD'), ('εκατομμύρια', 'NNP'), ('τουρίστες', 'NNP'), (',', ','), ('και', 'NNP'), ('έτσι', 'NNP'), ('ξεπέρασε', 'NNP'), ('την', 'NNP'), ('Ισπανία', 'NNP'), ('.', '.')]


In [17]:
# print and view chunked sentence using chunking
print(c)
c

(S
  (NP Η/JJ Ελλάδα/NNP)
  (NP είχε/NNP)
  33/CD
  (NP εκατομμύρια/NNP)
  (NP τουρίστες/NNP)
  ,/,
  (NP και/NNP)
  (NP έτσι/NNP)
  (NP ξεπέρασε/NNP)
  (NP την/NNP)
  (NP Ισπανία/NNP)
  ./.)


The Ghostscript executable isn't found.
See http://web.mit.edu/ghostscript/www/Install.htm
If you're using a Mac, you can try installing
https://docs.brew.sh/Installation then `brew install ghostscript`


LookupError: 

Tree('S', [Tree('NP', [('Η', 'JJ'), ('Ελλάδα', 'NNP')]), Tree('NP', [('είχε', 'NNP')]), ('33', 'CD'), Tree('NP', [('εκατομμύρια', 'NNP')]), Tree('NP', [('τουρίστες', 'NNP')]), (',', ','), Tree('NP', [('και', 'NNP')]), Tree('NP', [('έτσι', 'NNP')]), Tree('NP', [('ξεπέρασε', 'NNP')]), Tree('NP', [('την', 'NNP')]), Tree('NP', [('Ισπανία', 'NNP')]), ('.', '.')])

In [18]:
# illustrate NP chunking based on explicit chink patterns
# Chinking is the reverse of chunking
# Chunk the whole sentence as one NP, then chink (cut out) verb/adjective/preposition runs.
# The tag list in the trailing comment is written with '/' because backslashes
# inside a normal (non-raw) string form invalid escape sequences such as '\V'.
chink_grammar = """
NP:
    {<.*>+}             # Chunk everything as NP
    }<VBZ|VBD|JJ|IN>+{  # Chink sequences of VBD/VBZ/JJ/IN
"""
rc = RegexpParser(chink_grammar)
c = rc.parse(tagged_simple_sent)

In [19]:
# print and view chunked sentence using chinking
print(c)
c

(S
  Η/JJ
  (NP
    Ελλάδα/NNP
    είχε/NNP
    33/CD
    εκατομμύρια/NNP
    τουρίστες/NNP
    ,/,
    και/NNP
    έτσι/NNP
    ξεπέρασε/NNP
    την/NNP
    Ισπανία/NNP
    ./.))


The Ghostscript executable isn't found.
See http://web.mit.edu/ghostscript/www/Install.htm
If you're using a Mac, you can try installing
https://docs.brew.sh/Installation then `brew install ghostscript`


LookupError: 

Tree('S', [('Η', 'JJ'), Tree('NP', [('Ελλάδα', 'NNP'), ('είχε', 'NNP'), ('33', 'CD'), ('εκατομμύρια', 'NNP'), ('τουρίστες', 'NNP'), (',', ','), ('και', 'NNP'), ('έτσι', 'NNP'), ('ξεπέρασε', 'NNP'), ('την', 'NNP'), ('Ισπανία', 'NNP'), ('.', '.')])])

In [20]:
# create a more generic shallow parser
grammar = """
NP: {<DT>?<JJ>?<NN.*>}  
ADJP: {<JJ>}
ADVP: {<RB.*>}
PP: {<IN>}      
VP: {<MD>?<VB.*>+}
"""
rc = RegexpParser(grammar)
c = rc.parse(tagged_simple_sent)

In [21]:
# print and view shallow parsed sample sentence
print(c)
c

(S
  (NP Η/JJ Ελλάδα/NNP)
  (NP είχε/NNP)
  33/CD
  (NP εκατομμύρια/NNP)
  (NP τουρίστες/NNP)
  ,/,
  (NP και/NNP)
  (NP έτσι/NNP)
  (NP ξεπέρασε/NNP)
  (NP την/NNP)
  (NP Ισπανία/NNP)
  ./.)


The Ghostscript executable isn't found.
See http://web.mit.edu/ghostscript/www/Install.htm
If you're using a Mac, you can try installing
https://docs.brew.sh/Installation then `brew install ghostscript`


LookupError: 

Tree('S', [Tree('NP', [('Η', 'JJ'), ('Ελλάδα', 'NNP')]), Tree('NP', [('είχε', 'NNP')]), ('33', 'CD'), Tree('NP', [('εκατομμύρια', 'NNP')]), Tree('NP', [('τουρίστες', 'NNP')]), (',', ','), Tree('NP', [('και', 'NNP')]), Tree('NP', [('έτσι', 'NNP')]), Tree('NP', [('ξεπέρασε', 'NNP')]), Tree('NP', [('την', 'NNP')]), Tree('NP', [('Ισπανία', 'NNP')]), ('.', '.')])

In [29]:
# Evaluate parser performance on test data
print(rc.evaluate(test_data))

ChunkParse score:
    IOB Accuracy:  46.1%%
    Precision:     19.9%%
    Recall:        43.3%%
    F-Measure:     27.3%%


In [30]:
from nltk.chunk.util import tree2conlltags, conlltags2tree
# look at a sample training tagged sentence
train_sent = train_data[7]
print(train_sent)

(S
  (NP A/DT Lorillard/NNP spokewoman/NN)
  said/VBD
  ,/,
  ``/``
  (NP This/DT)
  is/VBZ
  (NP an/DT old/JJ story/NN)
  ./.)


In [31]:
# get the (word, POS tag, Chunk tag) triples for each token
wtc = tree2conlltags(train_sent)
wtc

[('A', 'DT', 'B-NP'),
 ('Lorillard', 'NNP', 'I-NP'),
 ('spokewoman', 'NN', 'I-NP'),
 ('said', 'VBD', 'O'),
 (',', ',', 'O'),
 ('``', '``', 'O'),
 ('This', 'DT', 'B-NP'),
 ('is', 'VBZ', 'O'),
 ('an', 'DT', 'B-NP'),
 ('old', 'JJ', 'I-NP'),
 ('story', 'NN', 'I-NP'),
 ('.', '.', 'O')]

In [32]:
# get shallow parsed tree back from the WTC triples
tree = conlltags2tree(wtc)
print(tree)

(S
  (NP A/DT Lorillard/NNP spokewoman/NN)
  said/VBD
  ,/,
  ``/``
  (NP This/DT)
  is/VBZ
  (NP an/DT old/JJ story/NN)
  ./.)


In [33]:
def conll_tag_chunks(chunk_sents):
    """Turn chunked sentences into per-sentence lists of (POS tag, chunk tag) pairs.

    Each tree is first converted with ``tree2conlltags`` into
    (word, POS tag, chunk tag) triples; the word is then dropped from every triple.
    """
    conll_sents = []
    for chunk_tree in chunk_sents:
        conll_sents.append(tree2conlltags(chunk_tree))

    pos_chunk_sents = []
    for triples in conll_sents:
        pos_chunk_sents.append([(pos, chunk) for (_word, pos, chunk) in triples])
    return pos_chunk_sents
  
def combined_tagger(train_data, taggers, backoff=None):
    """Train the given tagger classes in order, chaining each to its predecessor.

    ``taggers`` is iterated once; each element is invoked as
    ``tagger(train_data, backoff=previous)`` where ``previous`` starts as the
    ``backoff`` argument. Returns the most recently built tagger, or the
    initial ``backoff`` if ``taggers`` yields nothing.
    """
    for build_tagger in taggers:
        # the tagger just built becomes the fallback for the next one
        backoff = build_tagger(train_data, backoff=backoff)
    return backoff

In [34]:
# NGramTagChunker is a shallow parser trained from chunked sentences.
# The constructor __init__() converts each training sentence into WTC
# (word, tag, chunk) triples, keeps the (POS tag, chunk tag) pairs, and uses them
# to train a BigramTagger with a UnigramTagger as its backoff tagger; the
# trained model is stored in self.chunk_tagger.
# We also define a parse() function to perform shallow parsing on new sentences.
from nltk.tag import UnigramTagger, BigramTagger
from nltk.chunk import ChunkParserI

class NGramTagChunker(ChunkParserI):
    """Shallow parser that predicts chunk tags from POS-tag sequences with n-gram taggers.

    The tagger chain is trained on (POS tag, chunk tag) pairs derived from
    chunked sentences and is kept in ``self.chunk_tagger``.
    """

    def __init__(self, train_sentences, 
                 tagger_classes=[UnigramTagger, BigramTagger]):
        """Train the backoff chain of ``tagger_classes`` on ``train_sentences``.

        ``train_sentences`` are chunk trees accepted by ``conll_tag_chunks``;
        ``tagger_classes`` are chained in order by ``combined_tagger``, so each
        tagger backs off to the one listed before it.
        """
        pos_chunk_pairs = conll_tag_chunks(train_sentences)
        self.chunk_tagger = combined_tagger(pos_chunk_pairs, tagger_classes)

    def parse(self, tagged_sentence):
        """Chunk a sentence given as (word, POS tag) pairs and return the chunk tree.

        Returns ``None`` for an empty sentence.
        """
        if not tagged_sentence:
            return None
        # the chunk tagger only sees the POS tags, never the words themselves
        pos_sequence = [pos for _word, pos in tagged_sentence]
        tagged_pos = self.chunk_tagger.tag(pos_sequence)
        triples = []
        for (word, pos), (_pos, chunk) in zip(tagged_sentence, tagged_pos):
            triples.append((word, pos, chunk))
        return conlltags2tree(triples)

In [35]:
# train the shallow parser
ntc = NGramTagChunker(train_data)

# test parser performance on test data
print(ntc.evaluate(test_data))

ChunkParse score:
    IOB Accuracy:  97.2%%
    Precision:     91.4%%
    Recall:        94.3%%
    F-Measure:     92.8%%


In [36]:
# parse our sample sentence
# `sentence` and `nlp` were rebound to the Greek example and the Greek model in
# an earlier cell. Restore the English headline and tag it with the English
# model so that a fresh top-to-bottom run feeds the chunker the fine-grained
# Penn Treebank tags (token.tag_) it was trained on.
sentence = "US unveils world's most powerful supercomputer, beats China."
nlp_en = spacy.load('en_core_web_sm')
sentence_nlp = nlp_en(sentence)
tagged_sentence = [(word.text, word.tag_) for word in sentence_nlp]
tree = ntc.parse(tagged_sentence)
print(tree)
tree

(S
  (NP US/NNP)
  unveils/VBZ
  (NP world/NN 's/POS most/RBS powerful/JJ supercomputer/NN)
  ,/,
  beats/VBZ
  (NP China/NNP)
  ./.)


The Ghostscript executable isn't found.
See http://web.mit.edu/ghostscript/www/Install.htm
If you're using a Mac, you can try installing
https://docs.brew.sh/Installation then `brew install ghostscript`


LookupError: 

Tree('S', [Tree('NP', [('US', 'NNP')]), ('unveils', 'VBZ'), Tree('NP', [('world', 'NN'), ("'s", 'POS'), ('most', 'RBS'), ('powerful', 'JJ'), ('supercomputer', 'NN')]), (',', ','), ('beats', 'VBZ'), Tree('NP', [('China', 'NNP')]), ('.', '.')])

In [37]:
# Let’s now train and evaluate our parser on the conll2000 corpus, which contains
# excerpts from The Wall Street Journal and is a much larger corpus
from nltk.corpus import conll2000
wsj_data = conll2000.chunked_sents()
train_wsj_data = wsj_data[:10000]
test_wsj_data = wsj_data[10000:]
# look at a sample sentence in the corpus
print(train_wsj_data[10])

(S
  (NP He/PRP)
  (VP reckons/VBZ)
  (NP the/DT current/JJ account/NN deficit/NN)
  (VP will/MD narrow/VB)
  (PP to/TO)
  (NP only/RB #/# 1.8/CD billion/CD)
  (PP in/IN)
  (NP September/NNP)
  ./.)


In [38]:
# train the shallow parser
tc = NGramTagChunker(train_wsj_data)

# test performance on the test data
print(tc.evaluate(test_wsj_data))

ChunkParse score:
    IOB Accuracy:  89.1%%
    Precision:     80.3%%
    Recall:        86.1%%
    F-Measure:     83.1%%


In [39]:
# parse our sample sentence
tree = tc.parse(tagged_sentence)
print(tree)
tree

(S
  (NP US/NNP)
  (VP unveils/VBZ)
  (NP world/NN)
  (NP 's/POS most/RBS powerful/JJ supercomputer/NN)
  ,/,
  (VP beats/VBZ)
  (NP China/NNP)
  ./.)


The Ghostscript executable isn't found.
See http://web.mit.edu/ghostscript/www/Install.htm
If you're using a Mac, you can try installing
https://docs.brew.sh/Installation then `brew install ghostscript`


LookupError: 

Tree('S', [Tree('NP', [('US', 'NNP')]), Tree('VP', [('unveils', 'VBZ')]), Tree('NP', [('world', 'NN')]), Tree('NP', [("'s", 'POS'), ('most', 'RBS'), ('powerful', 'JJ'), ('supercomputer', 'NN')]), (',', ','), Tree('VP', [('beats', 'VBZ')]), Tree('NP', [('China', 'NNP')]), ('.', '.')])

# Dependency Parsing

In [41]:
# we can build dependency parsers for parsing unstructured text!
# One record per token: [left children]<---token[dependency label]--->[right children]
dependency_pattern = '{left}<---{word}[{w_type}]--->{right}\n--------'
for token in sentence_nlp:
    print(
        dependency_pattern.format(
            word=token.orth_,
            w_type=token.dep_,
            left=[child.orth_ for child in token.lefts],
            right=[child.orth_ for child in token.rights],
        )
    )
# This output gives us each token and its dependency type.
# The verb “unveils” is the ROOT since it does not depend on any other token;
# “beats” is attached to it as a conjunct (conj).

[]<---US[nsubj]--->[]
--------
['US']<---unveils[ROOT]--->['supercomputer', ',', 'beats', '.']
--------
[]<---world[poss]--->["'s"]
--------
[]<---'s[case]--->[]
--------
[]<---most[advmod]--->[]
--------
['most']<---powerful[amod]--->[]
--------
['world', 'powerful']<---supercomputer[dobj]--->[]
--------
[]<---,[punct]--->[]
--------
[]<---beats[conj]--->['China']
--------
[]<---China[dobj]--->[]
--------
[]<---.[punct]--->[]
--------


In [42]:
from spacy import displacy

displacy.render(sentence_nlp, jupyter=True, 
                options={'distance': 110,
                         'arrow_stroke': 2,
                         'arrow_width': 8})

In [43]:
from nltk.parse.stanford import StanfordDependencyParser
# Raw strings keep the Windows backslash literal; in a normal string '\s' is an
# invalid escape sequence. The resulting path values are unchanged.
sdp = StanfordDependencyParser(path_to_jar=r'C:\stanford-parser-full-2015-04-20/stanford-parser.jar',
                               path_to_models_jar=r'C:\stanford-parser-full-2015-04-20/stanford-parser-3.5.2-models.jar')

# perform dependency parsing
result = list(sdp.raw_parse(sentence))[0]  

# generate annotated dependency parse tree
result

LookupError: Could not find stanford-parser\.jar jar file at C:\stanford-parser-full-2015-04-20/stanford-parser.jar

In [44]:
# generate dependency triples
[item for item in result.triples()]

NameError: name 'result' is not defined

In [45]:
# print simple dependency parse tree
dep_tree = result.tree()
print(dep_tree)

NameError: name 'result' is not defined

In [46]:
# visualize simple dependency parse tree
dep_tree

NameError: name 'dep_tree' is not defined

# Constituency Parsing

In [47]:
# We will be using NLTK and the Stanford Parser to generate parse trees since they are
# state-of-the-art and work very well.
# The Stanford Parser generally uses a PCFG (probabilistic context-free grammar)
# parser. A PCFG is a context-free grammar that associates a probability with each of its
# production rules

# set java path
import os
java_path = r'C:\Program Files\Java\jdk-13\bin\java.exe'
os.environ['JAVAHOME'] = java_path

# create parser object
from nltk.parse.stanford import StanfordParser
# Raw strings keep the Windows backslash literal; in a normal string '\s' is an
# invalid escape sequence. The resulting path values are unchanged.
scp = StanfordParser(path_to_jar=r'C:\stanford-parser-full-2015-04-20/stanford-parser.jar',
                     path_to_models_jar=r'C:\stanford-parser-full-2015-04-20/stanford-parser-3.5.2-models.jar')

# get parse tree                   
result = list(scp.raw_parse(sentence))[0]
# print the constituency parse tree
print(result)

LookupError: Could not find stanford-parser\.jar jar file at C:\stanford-parser-full-2015-04-20/stanford-parser.jar

In [49]:
# visualize the parse tree
from IPython.display import display
display(result)

LookupError: 

===========================================================================
NLTK was unable to find the gs file!
Use software specific configuration paramaters or set the PATH environment variable.
===========================================================================

Tree('ROOT', [Tree('SINV', [Tree('S', [Tree('NP', [Tree('NNP', ['US'])]), Tree('VP', [Tree('VBZ', ['unveils']), Tree('NP', [Tree('NP', [Tree('NN', ['world']), Tree('POS', ["'s"])]), Tree('ADJP', [Tree('RBS', ['most']), Tree('JJ', ['powerful'])]), Tree('NN', ['supercomputer'])])])]), Tree(',', [',']), Tree('VP', [Tree('VBZ', ['beats'])]), Tree('NP', [Tree('NNP', ['China'])]), Tree('.', ['.'])])])

In [50]:
import nltk
from nltk.grammar import Nonterminal
from nltk.corpus import treebank
# load and view training data
training_set = treebank.parsed_sents()
print(training_set[1])                

(S
  (NP-SBJ (NNP Mr.) (NNP Vinken))
  (VP
    (VBZ is)
    (NP-PRD
      (NP (NN chairman))
      (PP
        (IN of)
        (NP
          (NP (NNP Elsevier) (NNP N.V.))
          (, ,)
          (NP (DT the) (NNP Dutch) (VBG publishing) (NN group))))))
  (. .))


In [51]:
# extract the productions for all annotated training sentences
# Collect the distinct production rules used anywhere in the annotated sentences.
# NOTE(review): de-duplicating through a set drops how often each rule occurs,
# and a PCFG induced from this list estimates probabilities from those
# occurrences - confirm that discarding the counts is intended.
treebank_productions = list({
    production
    for sent in training_set
    for production in sent.productions()
})
# view some production rules
treebank_productions[0:10]

[NP -> NN NN SBAR,
 NNP -> 'Francisco',
 NP-SBJ -> DT ADJP JJ NNS,
 VB -> 'shoot',
 CD -> '1950s',
 NP -> NP NP-ADV ADVP,
 NN -> 'advent',
 VBN -> 'zoomed',
 NNS -> 'reporters',
 NNP -> 'Taccetta']

In [53]:
# add productions for each word, POS tag
# The lexical rule (tag -> 'word') is built directly instead of formatting a
# bracketed string and re-parsing it with Tree.fromstring: this yields the same
# production, avoids one parse per corpus token, and cannot be broken by
# tokens that contain parentheses or whitespace.
for word, tag in treebank.tagged_words():
    treebank_productions.append(nltk.grammar.Production(Nonterminal(tag), [word]))

# build the PCFG based grammar  
treebank_grammar = nltk.grammar.induce_pcfg(Nonterminal('S'), 
                                         treebank_productions)

In [54]:
# build the parser
viterbi_parser = nltk.ViterbiParser(treebank_grammar)

# get sample sentence tokens
tokens = nltk.word_tokenize(sentence)

# get parse tree for sample sentence
result = list(viterbi_parser.parse(tokens))

ValueError: Grammar does not cover some of the input words: "'unveils', 'beats'".

In [55]:
# get tokens and their POS tags and check it
tagged_sent = nltk.pos_tag(nltk.word_tokenize(sentence))
print(tagged_sent)

[('US', 'NNP'), ('unveils', 'JJ'), ('world', 'NN'), ("'s", 'POS'), ('most', 'RBS'), ('powerful', 'JJ'), ('supercomputer', 'NN'), (',', ','), ('beats', 'VBZ'), ('China', 'NNP'), ('.', '.')]


In [56]:
# extend productions for sample sentence tokens
# The lexical rule (tag -> 'word') is built directly rather than through
# Tree.fromstring, so arbitrary tokens (e.g. ones containing parentheses)
# cannot break the bracketed-string parse.
for word, tag in tagged_sent:
    treebank_productions.append(nltk.grammar.Production(Nonterminal(tag), [word]))

# rebuild grammar
treebank_grammar = nltk.grammar.induce_pcfg(Nonterminal('S'), 
                                         treebank_productions)                                         
# rebuild parser
viterbi_parser = nltk.ViterbiParser(treebank_grammar)
# get parse tree for sample sentence
result = list(viterbi_parser.parse(tokens))[0]

# print parse tree
print(result)

(S
  (NP-SBJ-2
    (NP (NNP US) (JJ unveils))
    (NP
      (NP (NN world) (POS 's))
      (JJS most)
      (JJ powerful)
      (NN supercomputer)))
  (, ,)
  (VP (VBZ beats) (NP-TTL (NNP China)))
  (. .)) (p=1.08755e-43)


In [57]:
# visualize parse tree
result  

LookupError: 

===========================================================================
NLTK was unable to find the gs file!
Use software specific configuration paramaters or set the PATH environment variable.
===========================================================================

ProbabilisticTree('S', [ProbabilisticTree('NP-SBJ-2', [ProbabilisticTree('NP', [ProbabilisticTree('NNP', ['US']) (p=0.0001876700760063808), ProbabilisticTree('JJ', ['unveils']) (p=7.448234768359899e-05)]) (p=9.381280436855506e-12), ProbabilisticTree('NP', [ProbabilisticTree('NP', [ProbabilisticTree('NN', ['world']) (p=0.0015871920502380787), ProbabilisticTree('POS', ["'s"]) (p=0.9230769230769231)]) (p=9.832888282321602e-07), ProbabilisticTree('JJS', ['most']) (p=0.21628498727735368), ProbabilisticTree('JJ', ['powerful']) (p=0.00044689408610159393), ProbabilisticTree('NN', ['supercomputer']) (p=0.00041405010006210753)]) (p=2.6410627098290427e-17)]) (p=4.0617294970865225e-30), ProbabilisticTree(',', [',']) (p=0.999693094629156), ProbabilisticTree('VP', [ProbabilisticTree('VBZ', ['beats']) (p=0.0002187705097352877), ProbabilisticTree('NP-TTL', [ProbabilisticTree('NNP', ['China']) (p=0.00243971098808295)]) (p=0.0002033092490069125)]) (p=2.0930855547830955e-11), ProbabilisticTree('.', ['.']