In [1]:
path = 'lab8/'

import pandas as pd
import re
import nltk
from nltk import CFG, BottomUpChartParser, BottomUpLeftCornerChartParser, LeftCornerChartParser
from nltk.metrics import jaccard_distance
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from scipy.stats import pearsonr
from IPython.display import display_html

from nltk.parse.corenlp import CoreNLPDependencyParser

# Lab 8: Parsing

For the eighth practical of the subject, the goal is to try some non-probabilistic parsers, and optionally probabilistic parsers as well. The **mandatory** statement is:

1. Consider the following sentence:
`Lazy cats play with mice.`
2. Expand the grammar of the example related to non-probabilistic chart parsers in order to subsume this new sentence.
3. Perform the constituency parsing using a BottomUpChartParser, a BottomUpLeftCornerChartParser and a LeftCornerChartParser.
4. For each one of them, provide the resulting tree, the number of edges and the list of explored edges.
5. Which parser is the most efficient for parsing the sentence?
6. Which edges are filtered out by each parser and why?

The **optional** statement, which we've also accomplished, is:

1. Read all pairs of sentences of the SMTeuroparl files of test set within the evaluation framework of the project.
2. Compute the Jaccard similarity of each pair using the dependency triples from CoreNLPDependencyParser.
3. Show the results. Do you think it could be relevant to use NEs to compute the similarity between two sentences? Justify the answer.

## Mandatory exercise: Non-probabilistic parsers

We add the words `"lazy"` (adjective, `Adj`), `"play"` (verb, `V`) and `"with"` (preposition, `PP`) in order to expand the grammar given so it satisfies the sentence `Lazy cats play with mice`. 

Reference: https://www.nltk.org/book/ch08.html

In [2]:
grammar = CFG.fromstring('''
  NP  -> NNS | JJ NNS | NP CC NP
  NNS -> "cats" | "dogs" | "mice" | NNS CC NNS 
  JJ  -> "big" | "small"
  CC  -> "and" | "or"
  PP -> "with"
  V -> "play"
  Adj -> "lazy"
  ''')
sent = ['lazy', 'cats', 'play', 'with', 'mice']

### BottomUpChartParser

In [3]:
parser = nltk.BottomUpChartParser(grammar,trace=1)

In [4]:
parse = parser.chart_parse(sent)

|.  lazy .  cats .  play .  with .  mice .|
|[-------]       .       .       .       .| [0:1] 'lazy'
|.       [-------]       .       .       .| [1:2] 'cats'
|.       .       [-------]       .       .| [2:3] 'play'
|.       .       .       [-------]       .| [3:4] 'with'
|.       .       .       .       [-------]| [4:5] 'mice'
|>       .       .       .       .       .| [0:0] Adj -> * 'lazy'
|[-------]       .       .       .       .| [0:1] Adj -> 'lazy' *
|.       >       .       .       .       .| [1:1] NNS -> * 'cats'
|.       [-------]       .       .       .| [1:2] NNS -> 'cats' *
|.       >       .       .       .       .| [1:1] NP -> * NNS
|.       >       .       .       .       .| [1:1] NNS -> * NNS CC NNS
|.       [-------]       .       .       .| [1:2] NP -> NNS *
|.       [------->       .       .       .| [1:2] NNS -> NNS * CC NNS
|.       >       .       .       .       .| [1:1] NP -> * NP CC NP
|.       [------->       .       .       .| [1:2] NP -> NP * CC NP
|.       

In [5]:
print("Number of edges:  " + str(parse.num_edges()))

Number of edges:  27


In [6]:
parse.edges()

[[Edge: [0:1] 'lazy'],
 [Edge: [1:2] 'cats'],
 [Edge: [2:3] 'play'],
 [Edge: [3:4] 'with'],
 [Edge: [4:5] 'mice'],
 [Edge: [0:0] Adj -> * 'lazy'],
 [Edge: [0:1] Adj -> 'lazy' *],
 [Edge: [1:1] NNS -> * 'cats'],
 [Edge: [1:2] NNS -> 'cats' *],
 [Edge: [1:1] NP -> * NNS],
 [Edge: [1:1] NNS -> * NNS CC NNS],
 [Edge: [1:2] NP -> NNS *],
 [Edge: [1:2] NNS -> NNS * CC NNS],
 [Edge: [1:1] NP -> * NP CC NP],
 [Edge: [1:2] NP -> NP * CC NP],
 [Edge: [2:2] V  -> * 'play'],
 [Edge: [2:3] V  -> 'play' *],
 [Edge: [3:3] PP -> * 'with'],
 [Edge: [3:4] PP -> 'with' *],
 [Edge: [4:4] NNS -> * 'mice'],
 [Edge: [4:5] NNS -> 'mice' *],
 [Edge: [4:4] NP -> * NNS],
 [Edge: [4:4] NNS -> * NNS CC NNS],
 [Edge: [4:5] NP -> NNS *],
 [Edge: [4:5] NNS -> NNS * CC NNS],
 [Edge: [4:4] NP -> * NP CC NP],
 [Edge: [4:5] NP -> NP * CC NP]]

### BottomUpLeftCornerChartParser

In [7]:
parser = nltk.BottomUpLeftCornerChartParser(grammar,trace=1)

In [8]:
parse = parser.chart_parse(sent)

|.  lazy .  cats .  play .  with .  mice .|
|[-------]       .       .       .       .| [0:1] 'lazy'
|.       [-------]       .       .       .| [1:2] 'cats'
|.       .       [-------]       .       .| [2:3] 'play'
|.       .       .       [-------]       .| [3:4] 'with'
|.       .       .       .       [-------]| [4:5] 'mice'
|[-------]       .       .       .       .| [0:1] Adj -> 'lazy' *
|.       [-------]       .       .       .| [1:2] NNS -> 'cats' *
|.       [-------]       .       .       .| [1:2] NP -> NNS *
|.       [------->       .       .       .| [1:2] NNS -> NNS * CC NNS
|.       [------->       .       .       .| [1:2] NP -> NP * CC NP
|.       .       [-------]       .       .| [2:3] V  -> 'play' *
|.       .       .       [-------]       .| [3:4] PP -> 'with' *
|.       .       .       .       [-------]| [4:5] NNS -> 'mice' *
|.       .       .       .       [-------]| [4:5] NP -> NNS *
|.       .       .       .       [------->| [4:5] NNS -> NNS * CC NNS
|.       .  

In [9]:
print("Number of edges:  " + str(parse.num_edges()))

Number of edges:  16


In [10]:
parse.edges()

[[Edge: [0:1] 'lazy'],
 [Edge: [1:2] 'cats'],
 [Edge: [2:3] 'play'],
 [Edge: [3:4] 'with'],
 [Edge: [4:5] 'mice'],
 [Edge: [0:1] Adj -> 'lazy' *],
 [Edge: [1:2] NNS -> 'cats' *],
 [Edge: [1:2] NP -> NNS *],
 [Edge: [1:2] NNS -> NNS * CC NNS],
 [Edge: [1:2] NP -> NP * CC NP],
 [Edge: [2:3] V  -> 'play' *],
 [Edge: [3:4] PP -> 'with' *],
 [Edge: [4:5] NNS -> 'mice' *],
 [Edge: [4:5] NP -> NNS *],
 [Edge: [4:5] NNS -> NNS * CC NNS],
 [Edge: [4:5] NP -> NP * CC NP]]

### LeftCornerChartParser

In [11]:
parser = nltk.LeftCornerChartParser(grammar,trace=1)

In [12]:
parse = parser.chart_parse(sent)

|.  lazy .  cats .  play .  with .  mice .|
|[-------]       .       .       .       .| [0:1] 'lazy'
|.       [-------]       .       .       .| [1:2] 'cats'
|.       .       [-------]       .       .| [2:3] 'play'
|.       .       .       [-------]       .| [3:4] 'with'
|.       .       .       .       [-------]| [4:5] 'mice'
|[-------]       .       .       .       .| [0:1] Adj -> 'lazy' *
|.       [-------]       .       .       .| [1:2] NNS -> 'cats' *
|.       [-------]       .       .       .| [1:2] NP -> NNS *
|.       .       [-------]       .       .| [2:3] V  -> 'play' *
|.       .       .       [-------]       .| [3:4] PP -> 'with' *
|.       .       .       .       [-------]| [4:5] NNS -> 'mice' *
|.       .       .       .       [-------]| [4:5] NP -> NNS *


In [13]:
print("Number of edges:  " + str(parse.num_edges()))

Number of edges:  12


In [14]:
parse.edges()

[[Edge: [0:1] 'lazy'],
 [Edge: [1:2] 'cats'],
 [Edge: [2:3] 'play'],
 [Edge: [3:4] 'with'],
 [Edge: [4:5] 'mice'],
 [Edge: [0:1] Adj -> 'lazy' *],
 [Edge: [1:2] NNS -> 'cats' *],
 [Edge: [1:2] NP -> NNS *],
 [Edge: [2:3] V  -> 'play' *],
 [Edge: [3:4] PP -> 'with' *],
 [Edge: [4:5] NNS -> 'mice' *],
 [Edge: [4:5] NP -> NNS *]]

### **Conclusions**

**Which parser is the most efficient for parsing the sentence?**

**Which edges are filtered out by each parser and why?**

## Optional exercise: Dependency parser

To use, we first need to download and run the CoreNLP server on `localhost:9000` by following the next few steps:

1. Download CoreNLP at https://stanfordnlp.github.io/CoreNLP/download.html
2. Unzip the files and run the following command in the that directory to start the server: `java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 15000`


In [15]:
parser = CoreNLPDependencyParser(url='http://localhost:9000/')

In [16]:
def apply_jaccard_distance(sentence1, sentence2):
    if len(sentence1.union(sentence2)) == 0:
        return 0
    else:
        return 5*(1 - jaccard_distance(sentence1, sentence2))

In [17]:
def data_reader(function_preprocess):
    dt = pd.read_csv(path + 'STS.input.SMTeuroparl.txt', sep='\t', header = None)
    dt[2] = dt.apply(lambda row: function_preprocess(row[0]), axis = 1)
    dt[3] = dt.apply(lambda row: function_preprocess(row[1]), axis = 1)
    dt['gs'] = pd.read_csv(path + 'STS.gs.SMTeuroparl.txt', sep='\t', header = None)
    dt['jac'] = dt.apply(lambda row: apply_jaccard_distance(row[2], row[3]), axis = 1)
    return dt

In [18]:
def apply_CoreNLDPependencyParser(sentence):
    parse, = parser.raw_parse(sentence)
    triples = []
    for governor, dep, dependent in parse.triples():
        triples.append((governor, dep, dependent))
    return set(triples)

In [19]:
dt = data_reader(apply_CoreNLDPependencyParser)

In [20]:
styler = dt.iloc[[373, 374]].style.set_table_attributes("style='display:inline'")
display_html(styler._repr_html_(), raw=True)

Unnamed: 0,0,1,2,3,gs,jac
373,Van Orden Report (A5-0241/2000),Van Orden report (A5-0241 / 2000),"{(('Report', 'NNP'), 'dep', ('A5', 'NN')), (('Report', 'NNP'), 'punct', ('(', '-LRB-')), (('A5', 'NN'), 'nummod', ('0241/2000', 'CD')), (('Report', 'NNP'), 'compound', ('Orden', 'NNP')), (('A5', 'NN'), 'punct', ('-', 'HYPH')), (('Report', 'NNP'), 'punct', (')', '-RRB-')), (('Report', 'NNP'), 'compound', ('Van', 'NNP'))}","{(('A5', 'NN'), 'nmod', ('2000', 'CD')), (('report', 'NN'), 'dep', ('A5', 'NN')), (('A5', 'NN'), 'nummod', ('0241', 'CD')), (('A5', 'NN'), 'punct', ('-', 'HYPH')), (('report', 'NN'), 'punct', ('(', '-LRB-')), (('2000', 'CD'), 'dep', ('/', 'SYM')), (('Orden', 'NNP'), 'compound', ('Van', 'NNP')), (('report', 'NN'), 'compound', ('Orden', 'NNP')), (('report', 'NN'), 'punct', (')', '-RRB-'))}",5.0,0.333333
374,The European Union has got to do something and do it quickly.,It suits that the European Union is implied and that it makesit rapidly.,"{(('Union', 'NNP'), 'compound', ('European', 'NNP')), (('got', 'VBN'), 'aux', ('has', 'VBZ')), (('Union', 'NNP'), 'det', ('The', 'DT')), (('do', 'VBP'), 'cc', ('and', 'CC')), (('got', 'VBN'), 'punct', ('.', '.')), (('got', 'VBN'), 'nsubj', ('Union', 'NNP')), (('do', 'VB'), 'mark', ('to', 'TO')), (('do', 'VBP'), 'obj', ('it', 'PRP')), (('do', 'VB'), 'obj', ('something', 'NN')), (('got', 'VBN'), 'xcomp', ('do', 'VB')), (('do', 'VBP'), 'advmod', ('quickly', 'RB')), (('got', 'VBN'), 'conj', ('do', 'VBP'))}","{(('Union', 'NNP'), 'compound', ('European', 'NNP')), (('rapidly', 'RB'), 'nsubj', ('it', 'PRP')), (('rapidly', 'RB'), 'cc', ('and', 'CC')), (('suits', 'VBZ'), 'nsubj', ('It', 'PRP')), (('implied', 'VBN'), 'aux:pass', ('is', 'VBZ')), (('implied', 'VBN'), 'nsubj:pass', ('Union', 'NNP')), (('suits', 'VBZ'), 'ccomp', ('implied', 'VBN')), (('implied', 'VBN'), 'conj', ('rapidly', 'RB')), (('Union', 'NNP'), 'det', ('the', 'DT')), (('implied', 'VBN'), 'mark', ('that', 'IN')), (('suits', 'VBZ'), 'punct', ('.', '.')), (('rapidly', 'RB'), 'dep', ('makesit', 'FW')), (('rapidly', 'RB'), 'mark', ('that', 'IN'))}",3.0,0.208333


In [21]:
pearsonr(dt['gs'], dt['jac'])[0]

0.3194108246321566

### **Conclusions**

**Do you think it could be relevant to use NEs to compute the similarity between two sentences?**