<h2><center>Test scripts for Benepar, SuPar, SentenceTransformers APIs</center></h2>

### Imports:

In [1]:
import nltk
from supar import Parser
from nltk.tree import Tree
import spacy; nlp = spacy.load('en_core_web_sm')

### Constituency Parsing using Benepar:

In [1]:
import benepar
from benepar.spacy_plugin import BeneparComponent
from sentence_transformers import SentenceTransformer, util

# Name of the pretrained Benepar English model; used both for the download
# and for building the spaCy component below.
benepar_ver = 'benepar_en2'
benepar.download(benepar_ver)

# Sentence-embedding model used by the similarity cells further down.
model = SentenceTransformer('distilbert-base-nli-stsb-mean-tokens')

# Register the constituency parser only once: `nlp` is created in the imports
# cell, so re-running this cell would otherwise try to add the same component
# to the pipeline a second time.
# NOTE(review): assumes BeneparComponent registers under the name 'benepar' - confirm.
if 'benepar' not in nlp.pipe_names:
    nlp.add_pipe(BeneparComponent(benepar_ver))

[nltk_data] Downloading package benepar_en2 to C:\Users\Denis
[nltk_data]     Logvinenko\AppData\Roaming\nltk_data...
[nltk_data]   Package benepar_en2 is already up-to-date!


In [17]:
# Two short sentences to push through the spaCy pipeline.
texts = ["The kitten is running through a gate", "A young cat sprints"]
docs = list(nlp.pipe(texts))

# Turn the bracketed parse string of each document's first sentence into an
# nltk Tree, then print the trees one after another.
trees = [
    Tree.fromstring(list(parsed.sents)[0]._.parse_string)
    for parsed in docs
]
for tree in trees:
    print(tree)

(S
  (NP (DT The) (NN kitten))
  (VP
    (VBZ is)
    (VP (VBG running) (PP (IN through) (NP (DT a) (NN gate))))))
(S (NP (DT A) (JJ young) (NN cat)) (VP (VBZ sprints)))


### Dependency Parsing using SuPar:

In [None]:
# Load the SuPar dependency parser registered under the name 'crfnp-dep-en'.
# NOTE(review): presumably fetches pretrained weights on first use - confirm.
parser = Parser.load('crfnp-dep-en')

In [13]:
# A passive and an active phrasing of the same event for the dependency parser.
text = ['The house was sold in time', 'They sold the house in time']

# Tokenise with spaCy so the parser is handed lists of token strings.
docs = list(nlp.pipe(text))
toks = [[token.text for token in doc] for doc in docs]
print(len(toks))

# Parse all token lists in one call and keep the per-sentence results.
dataset = parser.predict(toks, prob=True, verbose=False)
pars = dataset.sentences

for par in pars:
    print(par)

100%|####################################| 1/1 00:00<00:00, 32.38it/s

2
1	The	_	_	_	_	2	det	_	_
2	house	_	_	_	_	2	amod	_	_
3	was	_	_	_	_	4	auxpass	_	_
4	sold	_	_	_	_	4	ccomp	_	_
5	in	_	_	_	_	4	prep	_	_
6	time	_	_	_	_	5	pobj	_	_

1	They	_	_	_	_	2	nsubj	_	_
2	sold	_	_	_	_	2	dep	_	_
3	the	_	_	_	_	4	det	_	_
4	house	_	_	_	_	4	dep	_	_
5	in	_	_	_	_	2	prep	_	_
6	time	_	_	_	_	5	pobj	_	_






### Computing Phrase Similarity:

In [16]:
# Phrase lists to compare against each other. They are spelled out here so the
# cell runs on a fresh kernel (no other cell defines `nps`); the values match
# the phrase pairs shown in this cell's recorded output.
nps = (['The kitten'], ['A young cat', 'a mouse'])
sentences1, sentences2 = nps

# Compute embeddings for both lists
embeddings1 = model.encode(sentences1, convert_to_tensor=True)
embeddings2 = model.encode(sentences2, convert_to_tensor=True)

# Compute cosine similarities between every phrase in the first list and
# every phrase in the second
cosine_scores = util.pytorch_cos_sim(embeddings1, embeddings2)

# Output the pairs with their score
for i in range(cosine_scores.shape[0]):
    for j in range(cosine_scores.shape[1]):
        print(f'{sentences1[i]} \t\t {sentences2[j]} \t\t Score: {cosine_scores[i, j]:.4f}')

The kitten 		 A young cat 		 Score: 0.6921
The kitten 		 a mouse 		 Score: 0.2679


In [3]:
# Two phrasings of the same statement, each wrapped in a one-element list.
s1 = ['Amrozi accused his brother, whom he called "the witness", of deliberately distorting his evidence.']
s2 = ['Referring to him as only "the witness", Amrozi accused his brother of deliberately distorting his evidence.']

# Embed each list as a tensor.
embeddings1 = model.encode(s1, convert_to_tensor=True)
embeddings2 = model.encode(s2, convert_to_tensor=True)

# Cosine similarity between every s1 entry and every s2 entry.
cosine_scores = util.pytorch_cos_sim(embeddings1, embeddings2)

# Print each sentence pair next to its score.
for i, left in enumerate(s1):
    for j, right in enumerate(s2):
        print(f'{left} \t\t {right} \t\t Score: {cosine_scores[i, j]:.4f}')

Amrozi accused his brother, whom he called "the witness", of deliberately distorting his evidence. 		 Referring to him as only "the witness", Amrozi accused his brother of deliberately distorting his evidence. 		 Score: 0.9297
