In [1]:
import json
from wordnet import *
from copy import deepcopy
from pattern.en import pluralize, singularize

from Udep2Mono.binarization import BinaryDependencyTree
from Udep2Mono.polarization import PolarizationPipeline
from Udep2Mono.util import det_mark, det_type

2021-02-14 01:15:26 INFO: Loading these models for language: en (English):
| Processor | Package   |
-------------------------
| tokenize  | ewt       |
| pos       | ewt       |
| lemma     | ewt       |
| depparse  | ewt       |
| sentiment | sstplus   |
| ner       | ontonotes |

2021-02-14 01:15:26 INFO: Use device: gpu
2021-02-14 01:15:26 INFO: Loading: tokenize
2021-02-14 01:15:31 INFO: Loading: pos
2021-02-14 01:15:33 INFO: Loading: lemma
2021-02-14 01:15:33 INFO: Loading: depparse
2021-02-14 01:15:34 INFO: Loading: sentiment
2021-02-14 01:15:36 INFO: Loading: ner
2021-02-14 01:15:37 INFO: Done loading processors!


## Lexical Knowledge based Lexical Replacement

In [3]:
class LexicalGenerator:
    def __init__(self):
        self.deptree = None
        self.hypothesis = ""
        self.treeLog = []
        self.polarLog = []
        self.replacementLog = []
        self.key_tokens = [
            'NN','NNS','NNP','NNPS','VBD',
            'VBG','VBN','VBZ','VB']

        self.quantifiers = {}
        with open('quantifier.json', 'r') as quants:
             quantifier_data = json.load(quants)
             for quantifier in quantifier_data:
                 self.quantifiers[quantifier['word']] = quantifier

    def deptree_generate(self, tree):
        self.deptree = tree
        self.generate(self.deptree)

    def generate(self, tree):
        if tree is None:
            return
        if tree.pos is not None: 
            if tree.pos in self.key_tokens:
                backup = tree.val
                
                hyper = [] 
                hypo = [] 
                syn = [] 
                ant = []

                hyper, hypo, syn, ant = get_word_sets(
                    singularize(tree.val), tree.pos.lower())

                print(hyper)
                print(hypo)
                print(syn)

                for word in syn:
                    if word in self.hypothesis:
                        tree.val = word
                        self.treeLog.append(self.save_tree())
                        self.replacementLog.append(
                            "{} => {}".format(backup, word))
                tree.val = backup

                for word in ant:
                    if word in self.hypothesis:
                        tree.val = word
                        self.treeLog.append(self.save_tree())
                        self.replacementLog.append(
                            "{} => {}".format(backup, word))
                tree.val = backup

                if tree.mark == "+":                
                    for word in hyper:
                        if word in self.hypothesis:
                            tree.val = word
                            self.treeLog.append(self.save_tree())
                            self.replacementLog.append(
                            "{} => {}".format(backup, word))
                    tree.val = backup

                if tree.mark == "-":
                    for word in hypo:
                        if word in self.hypothesis:
                            tree.val = word
                            self.treeLog.append(self.save_tree())
                            self.replacementLog.append(
                            "{} => {}".format(backup, word))
                    tree.val = backup
            
        elif tree.val == "det":
            backup = tree.left.val
            backup_mark = tree.right.mark
            kb = self.quantifiers[tree.left.val.lower()]

            for word in kb["="]:
                tree.left.val = word
                detType = det_type(tree.left.val)
                if detType is None:
                    detType = "det:exist"
                tree.left.mark = det_mark[detType]
                self.treeLog.append(self.save_tree())
                self.replacementLog.append(
                    "{} => {}".format(backup, word))
            tree.left.val = backup
            tree.left.mark = backup_mark

            if tree.left.mark == "+":
                for word in kb["<"]:
                    if word in self.hypothesis:
                        tree.left.val = word
                        detType = det_type(tree.left.val)
                        if detType is None:
                            detType = "det:exist"
                        tree.left.mark = det_mark[detType]
                        self.treeLog.append(self.save_tree())
                        self.replacementLog.append(
                            "{} => {}".format(backup, word))
                tree.left.val = backup
                tree.left.mark = backup_mark
            
            if tree.left.mark == "-":
                for word in kb[">"]:
                    if word in self.hypothesis:
                        tree.val = word
                        if detType is None:
                            detType = "det:exist"
                        tree.left.mark = det_mark[detType]
                        self.treeLog.append(self.save_tree())
                        self.replacementLog.append(
                            "{} => {}".format(backup, word))
                tree.left.val = backup
                tree.left.mark = backup_mark
        
        if tree.left != "N":
            self.generate(tree.left)
        if tree.right != "N":
            self.generate(tree.right)

    def save_tree(self):
        return deepcopy(self.deptree)

In [4]:
sentences = ["Some flowers are beautiful", 
             "A cheetah is chasing its prey across a field", 
             "The person is dicing up a pepper which is orange",
             "A man is playing the guitar loudly",
             ]
hypotheses = ["Some flowering plant are beautiful",
              "A cheetah is quickly running behind its prey",
              "An orange pepper is being diced up by a person",
              "The adult is playing the guitar loudly",
              ]
pipeline = PolarizationPipeline(sentences, verbose=0, parser="stanza")

annotations = []
for sentence in sentences:
    annotation = pipeline.single_polarization(sentence)
    annotations.append(annotation)

print("\nPolarization Complete")

lexicalGenerator = LexicalGenerator()
for annotation, hypothesis in zip(annotations, hypotheses):
    print("================")
    print("Init Premise: ", annotation['original'])
    lexicalGenerator.hypothesis = hypothesis
    lexicalGenerator.deptree_generate(annotation['polarized_tree'])
    for record in lexicalGenerator.replacementLog:
        print(record) 


Polarization Complete
Init Premise:  Some flowers are beautiful
some => a
some => an
some => one
flowers => flower
flowers => flowering plant
Init Premise:  A cheetah is chasing its prey across a field
some => a
some => an
some => one
flowers => flower
flowers => flowering plant
a => some
a => an
a => one
cheetah => cheetah
a => some
a => an
a => one
s => s
prey => prey
is => be
is => be
Init Premise:  The person is dicing up a pepper which is orange
some => a
some => an
some => one
flowers => flower
flowers => flowering plant
a => some
a => an
a => one
cheetah => cheetah
a => some
a => an
a => one
s => s
prey => prey
is => be
is => be
the => a
the => an
the => one
person => person
is => be
is => be
a => some
a => an
a => one
is => be
is => be
pepper => pepper
dicing => dice
Init Premise:  A man is playing the guitar loudly
some => a
some => an
some => one
flowers => flower
flowers => flowering plant
a => some
a => an
a => one
cheetah => cheetah
a => some
a => an
a => one
s => s
prey 

## Word Embedding Based Lexical Replacement

In [6]:
import gensim.downloader as api
w2v_model = api.load("glove-wiki-gigaword-50")

In [17]:
w2v_model.most_similar('develop')

[('developing', 0.9081553220748901),
 ('improve', 0.8416248559951782),
 ('enhance', 0.8136858344078064),
 ('ways', 0.8106333613395691),
 ('developed', 0.8090812563896179),
 ('development', 0.8060052990913391),
 ('focus', 0.8003764152526855),
 ('enable', 0.7990560531616211),
 ('create', 0.7961971759796143),
 ('promote', 0.7936440110206604)]

In [27]:
w2v_model.similarity('virus', 'bacteria')

0.7095756

In [18]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
[w for w in ['Scientist', 'developed', 'a', 'new', 'anti', 'virus', 'vaccine'] if w not in stop_words]

['Scientist', 'developed', 'new', 'anti', 'virus', 'vaccine']