In [1]:
import stanza

# Configuration for the full Stanza pipeline (tokenize -> POS -> lemma ->
# dependency parse). Every processor loads a local "gum" model file from
# ./model/en/..., so those files must exist before this cell is run.
depparse_gum_config = {
    'lang': "en",
    'processors': "tokenize,pos,lemma,depparse",
    'tokenize_model_path': './model/en/tokenize/gum.pt',
    'pos_model_path': './model/en/pos/gum.pt',
    'depparse_model_path': './model/en/depparse/gum.pt',
    'lemma_model_path': './model/en/lemma/gum.pt',
    # presumably disables Stanza's own sentence splitting — TODO confirm against the Stanza docs
    'tokenize_no_ssplit': True,
    # NOTE(review): the stored run log below reports "Use device: cpu" despite this flag
    'use_gpu': True,
    'pos_batch_size': 2000
}

# Configuration for a tokenizer-only pipeline that reuses the same tokenize model.
token_config = {
    'lang': "en",
    'processors': "tokenize",
    'tokenize_model_path': './model/en/tokenize/gum.pt',
    'tokenize_no_ssplit': True,
    'use_gpu': False,
    # NOTE(review): no "pos" processor is listed above, so this setting looks unused — confirm
    'pos_batch_size': 3000
}

# Both pipelines are built once at module level; stanza_parse() reads
# `gum_depparse` as a global.
gum_depparse = stanza.Pipeline(**depparse_gum_config)
tokenizer = stanza.Pipeline(**token_config)
from nltk.tree import Tree
import os
from nltk.draw import TreeWidget
from nltk.draw.util import CanvasFrame
from IPython.display import Image, display
import json
import _pickle as pickle
import gensim.downloader as api

2021-06-23 20:25:24 INFO: Loading these models for language: en (English):
| Processor | Package                 |
---------------------------------------
| tokenize  | ./model/en...ize/gum.pt |
| pos       | ./model/en/pos/gum.pt   |
| lemma     | ./model/en/lemma/gum.pt |
| depparse  | ./model/en...rse/gum.pt |

2021-06-23 20:25:24 INFO: Use device: cpu
2021-06-23 20:25:24 INFO: Loading: tokenize
2021-06-23 20:25:24 INFO: Loading: pos
2021-06-23 20:25:25 INFO: Loading: lemma
2021-06-23 20:25:25 INFO: Loading: depparse
2021-06-23 20:25:26 INFO: Done loading processors!
2021-06-23 20:25:26 INFO: Loading these models for language: en (English):
| Processor | Package                 |
---------------------------------------
| tokenize  | ./model/en...ize/gum.pt |

2021-06-23 20:25:26 INFO: Use device: cpu
2021-06-23 20:25:26 INFO: Loading: tokenize
2021-06-23 20:25:26 INFO: Done loading processors!


In [2]:
def dependency_parse(sentence, parser="gum"):
    """Parse ``sentence`` by delegating to :func:`stanza_parse`.

    Returns the ``(parse_tree, postags, words)`` triple produced by
    ``stanza_parse`` unchanged.
    """
    parse_result = stanza_parse(sentence, parser=parser)
    return parse_result


def stanza_parse(sentence, parser="gum"):
    """Run the module-level ``gum_depparse`` pipeline and collect dependency triples.

    Args:
        sentence: raw text; a newline is appended before parsing.
        parser: accepted for interface compatibility but not consulted here;
            the GUM pipeline is always used (an "ewt" branch is disabled).

    Returns:
        ``(parse_tree, postags, words)`` where ``parse_tree`` is a list of
        ``[deprel, word_id, head]`` triples (punctuation skipped), ``postags``
        maps word text to ``(word_id, xpos)`` and ``words`` maps word id to
        ``(text, xpos)``.
    """
    tag_by_text = {}
    word_by_id = {}
    triples = []
    relations_by_head = {}
    relations_by_dependent = {}

    document = gum_depparse(sentence + "\n")
    # Disabled alternative parser:
    #     if parser == "ewt":
    #         parsed = ewt_depparse(sentence)

    for sent in document.sentences:
        for word in sent.words:
            triple = post_process(sent, word, tag_by_text, word_by_id)

            # post_process returns an empty list for punctuation.
            if len(triple) == 0:
                continue

            # Record the relation label under both its head and its dependent.
            relations_by_head.setdefault(triple[2], []).append(triple[0])
            relations_by_dependent.setdefault(triple[1], []).append(triple[0])

            triples.append(triple)

        # Relabel relations in place once the sentence's words are collected.
        enhance_parse(triples, relations_by_head,
                      relations_by_dependent, word_by_id)
    return triples, tag_by_text, word_by_id


# Relations whose presence under a noun marks it as a full noun phrase.
_NP_MODIFIER_RELS = frozenset(["amod", "compound", "compound:prt", "det",
                               "nummod", "appos", "advmod", "nmod", "nmod:poss"])
# Relations whose presence under a verb marks it as a full verb phrase.
_VP_COMPLEMENT_RELS = frozenset(["obj", "xcomp", "obl"])
# Hyphen-joined quantifier tokens that are always relabelled as "det".
_JOINED_QUANTIFIERS = ("at-most", "at-least", "more-than", "less-than")


def enhance_parse(tree, heads, deps, words):
    """Refine dependency labels in place with finer-grained variants.

    Args:
        tree: list of mutable ``[deprel, word_id, head]`` triples; the
            ``deprel`` slot of each triple may be rewritten.
        heads: maps a head id to the list of relation labels of its
            dependents. Words that head nothing have no entry.
        deps: maps a dependent id to its relation labels (unused here, kept
            for interface compatibility).
        words: maps word id to ``(text, xpos)``.

    Rewrites performed:
        * ``conj`` -> ``conj-sent`` / ``conj-adj`` / ``conj-n`` / ``conj-np``
          / ``conj-vb`` / ``conj-vp`` depending on the two conjuncts.
        * ``advcl`` -> ``advcl-sent`` when the first word is "if".
        * ``advmod`` -> ``advmod-sent`` for a sentence-initial "not".
        * ``case`` -> ``case-after`` when the marker follows its head.
        * any relation on a joined quantifier token -> ``det``.
    """
    for node in tree:
        if node[0] == "conj":
            # A conjunct without (non-punctuation) dependents has no entry in
            # `heads`; treat it as having no dependents instead of raising.
            left_rels = heads.get(node[1], ())
            right_rels = heads.get(node[2], ())

            if "nsubj" in left_rels and "nsubj" in right_rels:
                node[0] = "conj-sent"
            elif words[node[1]][1] == "JJ" and words[node[2]][1] == "JJ":
                node[0] = "conj-adj"
            elif "NN" in words[node[1]][1] and "NN" in words[node[2]][1]:
                node[0] = "conj-n"
                # Both nouns carry modifiers -> conjunction of noun phrases.
                if (_NP_MODIFIER_RELS.intersection(left_rels)
                        and _NP_MODIFIER_RELS.intersection(right_rels)):
                    node[0] = "conj-np"
            elif "VB" in words[node[1]][1] and "VB" in words[node[2]][1]:
                node[0] = "conj-vb"
                # Both verbs carry complements -> conjunction of verb phrases.
                if (_VP_COMPLEMENT_RELS.intersection(left_rels)
                        and _VP_COMPLEMENT_RELS.intersection(right_rels)):
                    node[0] = "conj-vp"

        if node[0] == "advcl":
            if words[1][0] == "if":
                node[0] = "advcl-sent"
        if node[0] == "advmod":
            if words[node[1]][0] == "not" and node[1] == 1:
                node[0] = "advmod-sent"
        if node[0] == "case" and node[1] - node[2] > 0:
            node[0] = "case-after"
        if words[node[1]][0] in _JOINED_QUANTIFIERS:
            node[0] = "det"


def post_process(sent, word, postag, words):
    """Register one parsed word and convert it to a dependency triple.

    The word is recorded in ``postag`` (text -> ``(id, xpos)``) and ``words``
    (id -> ``(text, xpos)``) the first time its id is seen. Punctuation yields
    an empty list; any other word yields ``[deprel, id, head]`` with the head
    replaced by the string ``"root"`` when it is not positive.
    """
    identifier = int(word.id)
    already_known = identifier in words
    if not already_known:
        postag[word.text] = (identifier, word.xpos)
        words[identifier] = (word.text, word.xpos)

    if word.deprel == "punct":
        return []

    if word.head > 0:
        governor = word.head
    else:
        governor = "root"
    return [word.deprel, identifier, governor]


def printTree(tree, tag, word):
    """Print one dependency triple as ``word / head / deprel``; skip the root.

    ``tree`` is a ``[deprel, word_id, head]`` triple and ``word`` maps ids to
    ``(text, xpos)``. ``tag`` is accepted but not used.
    """
    if tree[0] == "root":
        return
    dependent_text = word[tree[1]][0]
    head_text = word[tree[2]][0]
    line = f"word: {dependent_text}\thead: {head_text}\tdeprel: {tree[0]}"
    print(line)

In [3]:
from pqdict import pqdict

# Maps a polarity mark to its opposite: "+" and "-" swap, "=" stays "=".
# Used by BinaryDependencyTree.traverse for a two-part token containing "not".
# Note there is no entry for "0", the default mark of a new tree node.
negate_mark = {
    "+": "-",
    "-": "+",
    "=": "="
}

class BinaryDependencyTree:
    """Node of a binarized dependency tree.

    Internal nodes carry a relation label in ``val`` and two children; leaf
    nodes (``is_tree == False``) carry a word in ``val`` together with its word
    id (``id``) and POS tag (``pos``). ``mark`` holds a polarity mark
    (default ``"0"``).
    """

    def __init__(self, val, left, right, key, counter, id=None, pos=None):
        self.val = val
        self.parent = None
        self.left = left
        self.right = right
        self.mark = "0"
        self.id = id
        self.pos = pos
        self.key = key
        self.is_root = False
        self.is_tree = True
        self.length = 0
        self.leaves = pqdict({})
        self.counter = counter
        # Lookup of "(val, id)" strings -> replacement text, consulted in traverse().
        self.replaced = {}

    def sorted_leaves(self):
        """Collect every leaf below this node into ``self.leaves`` and return it.

        Keys are ``(word, pos, mark, word_id)`` tuples and the priority value
        is the word id.
        """
        self.traverse(self)
        return self.leaves

    def traverse(self, tree, multi_word=False):
        """Walk ``tree`` and register its leaves in ``self.leaves``.

        When ``multi_word`` is true, a leaf whose text was substituted through
        ``self.replaced`` and contains "-" is split into one entry per part,
        with fractional ids counting down from the leaf id.
        """
        if tree.is_tree:
            # Propagate the flag so nested leaves honour it as well.
            self.traverse(tree.left, multi_word)
            self.traverse(tree.right, multi_word)
            return

        lookup_key = str((tree.val, tree.id))
        was_replaced = lookup_key in self.replaced
        if was_replaced:
            tree.val = self.replaced[lookup_key]

        if "-" in tree.val and was_replaced and multi_word:
            parts = tree.val.split('-')
            parts.reverse()
            for offset, part in enumerate(parts):
                word_id = tree.id - offset * 0.1
                mark = tree.mark
                if part.lower() == "not" and len(parts) == 2:
                    # Unknown marks (e.g. the default "0") are left unchanged
                    # instead of raising KeyError.
                    mark = negate_mark.get(tree.mark, tree.mark)
                self.leaves[(part, tree.pos, mark, word_id)] = word_id
        else:
            self.leaves[(tree.val, tree.pos, tree.mark, tree.id)] = tree.id

    def copy(self):
        """Return a deep copy of the subtree rooted at this node.

        Copied children point at the new node as their parent; the copy of the
        top node keeps the original ``parent`` reference. ``leaves`` starts out
        empty in the copy, while ``replaced`` is shared by reference.
        """
        left = self.left.copy() if self.left is not None else None
        right = self.right.copy() if self.right is not None else None

        new_tree = BinaryDependencyTree(
            self.val, left, right, self.key, self.counter, self.id, self.pos)
        new_tree.mark = self.mark
        new_tree.parent = self.parent
        new_tree.is_tree = self.is_tree
        new_tree.is_root = self.is_root
        new_tree.length = self.length
        new_tree.replaced = self.replaced
        new_tree.leaves = pqdict({})

        # Re-parent the copied children onto the copy, not the original node.
        if left is not None:
            left.parent = new_tree
        if right is not None:
            right.parent = new_tree
        return new_tree

    def set_length(self, lth):
        """Store the sentence length on this node."""
        self.length = lth

    def set_root(self):
        """Flag this node as the root of the tree."""
        self.is_root = True

    def set_not_tree(self):
        """Flag this node as a leaf (word) node."""
        self.is_tree = False


# Priority of each dependency relation during binarization: Binarizer.compose
# sorts a head's dependents by this value and attaches the smallest one first.
hierarchy = {
    "conj-sent": 0,
    "advcl-sent": 1,
    "advmod-sent": 2,
    "case": 10,
    "case-after": 75,
    "mark": 10,
    "expl": 10,
    "discourse": 10,
    "nsubj": 20,
    "csubj": 20,
    "nsubj:pass": 20,
    "conj-vp": 25,
    "ccomp": 30,
    "advcl": 30,
    "advmod": 30,
    "nmod": 30,
    "nmod:tmod": 30,
    "nmod:npmod": 30,
    "nmod:poss": 30,
    "xcomp": 40,
    "aux": 40,
    "aux:pass": 40,
    "obj": 60,
    "iobj": 60,
    "obl": 50,
    "obl:tmod": 50,
    "obl:npmod": 50,
    "cop": 50,
    "acl": 60,
    "acl:relcl": 60,
    "appos": 60,
    "conj": 60,
    "conj-np": 60,
    "conj-adj": 60,
    "det": 55,
    "det:predet": 55,
    "cc": 70,
    "cc:preconj": 70,
    "nummod": 75,
    "fixed": 80,
    "compound": 80,
    "compound:prt": 80,
    "amod": 75,
    "conj-n": 90,
    "conj-vb": 90,
    "dep": 100,
    "flat": 100,
    "goeswith": 100,
    "parataxis": 100
}


class UnifiedCounter:
    """Small mutable record of counters and flags shared by tree nodes."""

    def __init__(self, initial_val=0):
        # Both counters start from the same initial value.
        self.addi_negates = self.unifies = initial_val
        # Boolean flags, all initially off.
        self.nsubjLeft = False
        self.expl = False
        self.willing_verb = False

    def add_negates(self):
        """Increase the additional-negation counter by one."""
        self.addi_negates = self.addi_negates + 1

    def add_unifies(self):
        """Increase the unification counter by one."""
        self.unifies = self.unifies + 1

    def is_unified_clause_subj(self):
        """Return ``nsubjLeft`` when ``unifies`` is odd, otherwise ``False``."""
        if self.unifies % 2 != 1:
            return False
        return self.nsubjLeft


class Binarizer:
    """Builds a ``BinaryDependencyTree`` from a flat dependency parse table.

    ``parse_table`` is a list of ``[deprel, word_id, head]`` triples,
    ``words`` maps word id to ``(text, xpos)`` and ``postag`` maps word text
    to ``(word_id, xpos)``. The parse table is consumed (entries are removed)
    while the tree is built.
    """

    # Priority for relation labels missing from `hierarchy`; equal to the
    # lowest priority already used there ("dep", "flat", ...).
    _FALLBACK_PRIORITY = 100

    def __init__(self, parse_table=None, postag=None, words=None):
        self.postag = postag
        self.parse_table = parse_table
        self.words = words
        self.id = 0
        self.counter = UnifiedCounter(0)
        self.replaced = {}

    def process_not(self, children):
        """Special-case an ``advmod`` first child followed by the word "not".

        When the first child's relation is ``advmod`` and the *second* child's
        word is "not", only the second child is returned; otherwise the list
        is returned unchanged.
        """
        if len(children) > 1:
            if children[0][0] == "advmod":
                if self.words[children[1][1]][0] == "not":
                    return [children[1]]
        return children

    def compose(self, head):
        """Recursively build the subtree for ``head``.

        Returns ``(tree, keys)`` where ``keys`` lists the ``key`` of every
        node created, left subtree first, then this node, then the right
        subtree.
        """
        children = [dep for dep in self.parse_table if dep[2] == head]
        # Relations not listed in `hierarchy` are attached last instead of
        # aborting the whole binarization with a KeyError.
        children.sort(
            key=lambda dep: hierarchy.get(dep[0], self._FALLBACK_PRIORITY))
        children = self.process_not(children)

        if len(children) == 0:
            # No dependents left for this head: emit a leaf for the word.
            word = self.words[head][0]
            tag = self.words[head][1]
            binary_tree = BinaryDependencyTree(
                word, None, None, self.id, self.counter, head, tag)
            binary_tree.replaced = self.replaced
            self.id += 1
            binary_tree.set_not_tree()
            return binary_tree, [binary_tree.key]

        top_dep = children[0]
        # Consume the relation so the recursive call on the same head moves on
        # to the next dependent.
        self.parse_table.remove(top_dep)

        left, left_rel = self.compose(top_dep[1])
        right, right_rel = self.compose(top_dep[2])

        # Collapse the refined labels back to their base relation.
        if "conj" in top_dep[0]:
            dep_rel = "conj"
        elif "case" in top_dep[0]:
            dep_rel = "case"
        elif "advcl" in top_dep[0]:
            dep_rel = "advcl"
        elif "advmod" in top_dep[0]:
            dep_rel = "advmod"
        else:
            dep_rel = top_dep[0]

        binary_tree = BinaryDependencyTree(
            dep_rel, left, right, self.id, self.counter)
        binary_tree.left.parent = binary_tree
        binary_tree.right.parent = binary_tree
        binary_tree.replaced = self.replaced

        left_rel.append(binary_tree.key)
        self.id += 1
        return binary_tree, left_rel + right_rel

    def binarization(self):
        """Binarize the current parse table starting from its ``root`` entry.

        Returns ``(tree, keys)``. Raises ``IndexError`` when the parse table
        has no ``root`` relation.
        """
        self.id = 0
        self.relation = []
        root = list(filter(lambda x: x[0] == "root", self.parse_table))[0][1]
        self.counter = UnifiedCounter(0)
        binary_tree, relation = self.compose(root)
        binary_tree.set_root()
        binary_tree.length = len(self.words)
        return binary_tree, relation

In [4]:
from pattern.en import conjugate
from nltk.tree import Tree
from nltk.draw import TreeWidget
from nltk.draw.util import CanvasFrame
from IPython.display import Image, display
import os
import subprocess
import json
import _pickle as pickle

# Display suffix for each polarity mark: "+" -> up arrow (U+2191),
# "-" -> down arrow (U+2193), "=" -> "=", and "0" (the default mark) -> nothing.
arrows = {
    "+": "\u2191",
    "-": "\u2193",
    "=": "=",
    "0": ""
}

# Numeric value of each displayed arrow: up = 1, down = -1, "=" = 0.
arrow2int = {
    "\u2191": 1,
    "\u2193": -1,
    "=": 0
}

def btree2list(binaryDepdency, verbose=0):
    """Convert a binary dependency tree into a nested list.

    An internal node becomes ``[label, left, right]`` and a leaf becomes
    ``[pos, label]``, where ``label`` is the node text followed by the arrow
    for its mark (hyphens in leaf text are shown as spaces). With
    ``verbose == 2`` the node key is appended to every label.
    """
    def to_list(node):
        is_leaf = not node.is_tree
        text = node.val.replace('-', ' ') if is_leaf else node.val
        label = text + arrows[node.mark]
        if verbose == 2:
            label += str(node.key)

        nested = [node.pos, label] if is_leaf else [label]
        for child in (node.left, node.right):
            if child is not None:
                nested.append(to_list(child))
        return nested

    return to_list(binaryDepdency)

def jupyter_draw_nltk_tree(tree):
    """Render an NLTK tree inline in the notebook.

    The tree is drawn on a canvas, written to ``./tree.ps``, converted to
    ``./tree.png`` with the external ``magick convert`` command and then
    displayed.
    """
    frame = CanvasFrame()
    widget = TreeWidget(frame.canvas(), tree)
    appearance = (
        ('node_font', 'arial 14 bold'),
        ('leaf_font', 'arial 14'),
        ('node_color', '#005990'),
        ('leaf_color', '#3F8F57'),
        ('line_color', '#175252'),
    )
    for option, value in appearance:
        widget[option] = value
    frame.add_widget(widget, 20, 20)
    frame.print_to_file('./tree.ps')
    frame.destroy()
    os.system('magick convert ./tree.ps ./tree.png')
    display(Image(filename='./tree.png'))
    

    
def jupyter_draw_rsyntax_tree(tree):
    """Render a bracketed tree with the external ``rsyntaxtree`` tool and display it.

    The tree text is passed as a single argument without going through a
    shell, so quotes, ``$`` or backticks inside the sentence can neither break
    nor inject into the command.

    Raises:
        subprocess.CalledProcessError: when ``rsyntaxtree`` exits with a
            non-zero status (previously the failure was ignored and a stale
            ``./syntree.png`` could be shown).
        FileNotFoundError: when the executable cannot be found.
    """
    import shutil

    font_size = '8'
    # Resolve through PATH (and PATHEXT on Windows) so script wrappers work.
    executable = shutil.which('rsyntaxtree') or 'rsyntaxtree'
    subprocess.run([executable, '-s', font_size, str(tree)], check=True)
    display(Image(filename='./syntree.png'))

In [5]:
#G = Ugraph()
# Dependency relation labels treated as noun modifiers.
# NOTE(review): "nmod:pass" does not appear in `hierarchy` (which has "nmod:poss") — confirm the spelling.
nounModifiers = {"det", "nummod", "amod","obl:tmod", "acl:relcl", "nmod", "nmod:pass",  "acl", "Prime","cc"}
# Dependency relation labels treated as verb modifiers.
verbModifiers = {"advmod","xcomp","advcl","mark", "case", "aux"}
nounCategories = {"compound"} 
# POS tags for inflected verb forms.
verbs = {"VBZ", "VBP", "VBD", "VBG"}
# POS tags that can be modified: nouns, pronouns, adjectives, adverbs, base verbs and the inflected forms above.
modified = {"NN", "PRP", "JJ", "VB","RB"}.union(verbs)
# Alias: `modifiers` is the same set object as `nounModifiers`, not a copy.
modifiers = nounModifiers
offFocus = {"expl"}
# Relations that carry sentence content (subject, object, copula, ...).
contents = {"nsubj","obj","cop","compound","conj","nsubj:pass","obl"}
cont_npos = {"nsubj":'nn', "obj": 'nn', "cop": 'vbz', "verb": 'vbz'}
# Lexical relations permitted for each polarity mark.
mark_toProp = {"+": {"hyponym","synonym"}, "-": {"hypernym","synonym"}, "=": {"synonym"}}
clause_prop = {"which", "that", "who"}
be_verbs = {"is", "am", "are", "be","was","were"}
# Numeric code -> name of an inference direction ("syntatic_variation" is spelled this way in the data).
directions = {0: "lexical", 1: "phrasal", 2: "syntatic_variation", 3: "implicative"}

In [6]:
# Maps multi-word (hyphen-joined) quantifiers and expressions to the single
# token that stands in for them. Each key appears exactly once: the original
# literal listed "no-longer" and "hardly-ever" twice, and the later entry
# silently replaced the earlier one. The values below are the ones that were
# effective at runtime.
quantifier_replacement = {
    "a-few": "some",
    "a-few of the": "some",
    "none-of-the": "no",
    "all-of-the": "all",
    "some-of-the": "some",
    "most-of-the": "most",
    "many-of-the": "many",
    "several-of-the": "several",
    "some-but-not-all": "some",
    "at-most": "no",
    "at-least": "some",
    "more-than": "some",
    "less-than": "no",
    # NOTE(review): an earlier duplicate entry mapped this key to "not" (and
    # "No-longer" below still maps to "Not"); the identity mapping is what the
    # original dict evaluated to — confirm which one is intended.
    "no-longer": "no-longer",
    "a-lot-of": "some",
    "lots-of": "some",
    "each of the": "each",
    "A-few": "Some",
    "A-few of the": "Some",
    "None-of-the": "No",
    "All-of-the": "All",
    "Some-of-the": "Some",
    "Most-of-the": "Most",
    "Many-of-the": "Many",
    "Several-of-the": "Several",
    "Some-but-not-all": "Some",
    "At-most": "No",
    "At-least": "Some",
    "More-than": "Some",
    "Less-than": "No",
    "No-longer": "Not",
    "A-lot-of": "Some",
    "Lots-of": "Some",
    "Each of the": "Each",
    "hardly-ever": "never",
    "Even-if": "If",
    "even-if": "if",
    "not-every": "every",
    "not-some": "some",
    "not-all": "all",
    "not-each": "each",
    "Not-every": "every",
    "Not-some": "some",
    "Not-all": "all",
    "Not-each": "each",

    # Expressions kept as they are (identity mappings).
    "after-all": "after-all",
    "out-of": "out-of",
}

In [16]:
class Cnode:
    """A node of the concept graph: one word plus its edges and modifiers.

    ``nexts`` is a *set* of successor nodes (``Cgraph.add_edge`` relies on
    this); every method of this class treats it as such.
    """

    def __init__(self,prop,word,npos, mark):
        self.nexts = set()          # successor nodes
        self.prop = prop            # relation/property label of this node
        self.isRoot = False
        self.isRelation = False
        self.modifiers = set()      # nodes modifying this one
        self.index = -2             # word index; -2 until assigned
        self.related = []
        self.word = word
        self.npos = npos            # POS tag
        self.mark = mark
        self.phrases = set()
        self.pair = -1              # pair number; -1 means "not paired"
        self.pairParts = dict()     # part type -> set of nodes
        self.start = -1
        self.end = -1
        self.nodes = set()
        self.cc = None
        self.aligned = []
        self.isComp = False
        self.parent = None
        self.explMain = False

    def add_Unode(self, node):
        """Attach ``node`` as a successor of this node.

        On the root node an ``"obl"`` child is relabelled ``"obj"`` first.
        The child's ``prop`` keeps the grouping information that used to be
        encoded as a dictionary key.
        """
        if self.isRoot and node.prop == "obl":
            node.prop = "obj"
        self.nexts.add(node)

    def addNode(self, node):
        """Placeholder hook called by ``Cgraph.add_node``; does nothing."""
        return

    def add_modifier(self, modifierNode):
        """Register ``modifierNode`` as a modifier of this node."""
        self.modifiers.add(modifierNode)

    def getText(self):
        """Placeholder; returns ``None``."""
        return

    def get_magicText(self):
        """Return the fixed placeholder text ``"(a_b)"``."""
        return "(a_b)"

    def get_magicTextOld(self):
        """Legacy bracketed rendering of this node and its successors.

        For the root, successors are grouped by their ``prop`` and rendered as
        ``((prop  (child) (child))...)``. For any other node the result is the
        word (spaces replaced by underscores), followed by the pair number
        when one is set, followed by one ``(child)`` group per successor.
        """
        if self.isRoot:
            grouped = {}
            for child in self.nexts:
                grouped.setdefault(child.prop, []).append(child)
            connected_info = ""
            for prop, children in grouped.items():
                component = "".join(
                    " (" + child.get_magicText() + ")" for child in children)
                connected_info += "(" + prop + " " + component + ")"
            return "(" + connected_info + ")"

        label = self.word.replace(' ', '_')
        if self.pair != -1:
            label += str(self.pair)
        connected_info = "".join(
            "(" + child.get_magicText() + ")"
            for child in self.nexts if child is not None)
        return label + connected_info

    def addNum(self,num):
        """Set the pair number of this node."""
        self.pair = num

    def addPart(self, newNode, type1):
        """Add ``newNode`` to the set of parts registered under ``type1``."""
        self.pairParts.setdefault(type1, set()).add(newNode)

    def getParts(self):
        """Return the set of ``"obj"`` parts (empty when none were added)."""
        return self.pairParts.get("obj", set())

    def addCC(self,node):
        """Remember ``node`` as this node's coordinating conjunction."""
        self.cc = node

In [214]:
class Cgraph:
    """Concept graph of one sentence.

    ``nodes`` is an ordered list of word nodes; ``concepts`` and ``relations``
    are sets of those nodes, and ``relDeps`` is a list of
    ``[deprel, from_node, to_node]`` edges.
    """

    def __init__(self, rootNode):
        self.root = rootNode
        self.root.isRoot = True
        self.concepts = set()
        self.relations = set()
        self.relDeps = []
        self.nodes = []
        self.contentSet = set()
        self.chunks = set()
        self.Pairs = dict()
        self.Pairs["nsubj"] = dict()
        self.Pairs["obj"] = dict()
        self.align_log = []
        self.expl = False
        self.passive = False

    def add_node(self,node):
        """Append ``node`` to the graph and notify the root node."""
        # `nodes` is a list, so the node is appended (the former `.add` call
        # raised AttributeError).
        self.nodes.append(node)
        self.root.addNode(node)

    def add_edge(self, node1, node2):
        """Add a directed edge ``node1 -> node2`` and set ``node2``'s parent."""
        node1.nexts.add(node2)
        node2.parent = node1

    def indexNodes(self):
        """Return the sentence as ``word[index]`` tokens joined by spaces."""
        tokens = ["{}[{}]".format(node.word, node.index) for node in self.nodes]
        return " ".join(tokens).strip()

    def print_text(self):
        """Return every edge as ``[from_word, to_word, from_index, to_index]``."""
        return [
            [node.word, successor.word, node.index, successor.index]
            for node in self.nodes
            for successor in node.nexts
        ]

    def print_content(self, set1,inword=True):
        """List the words of ``set1`` (or their indices when ``inword`` is False)."""
        if inword == False:
            return [item.index for item in set1]
        return [item.word for item in set1]

    def print_modifiers(self, mode = True):
        """Map every modified node to its modifiers.

        With ``mode`` true the keys are ``word + str(index)`` and the values
        are modifier words; otherwise keys and values are indices. Nodes
        without modifiers are omitted.
        """
        modifiers = dict()
        for node in self.nodes:
            if node.modifiers == set():
                continue
            if mode:
                key = node.word + str(node.index)
                modifiers[key] = [modifier.word for modifier in node.modifiers]
            else:
                modifiers[node.index] = [
                    modifier.index for modifier in node.modifiers]
        return modifiers

    def contains(self, word_assigned):
        """Return whether ``word_assigned`` is in the content set."""
        return word_assigned in self.contentSet

    def get_magicText(self):
        """Return the root node's magic text."""
        return self.root.get_magicText()

    def addPair(self, newNode, num,type1):
        """Placeholder; does nothing."""
        return

In [25]:
class Concept:
    """Thin wrapper that stores an arbitrary payload in ``data``."""

    def __init__(self, data):
        # Keep the payload exactly as given.
        setattr(self, "data", data)

In [221]:
class GraphFactoryPipeline:
    """Parses sentences and turns them into concept/relation graph scripts.

    A "script" is the plain dictionary returned by ``single_polarization``; it
    lists the sentence's words, which of them are concepts or relations, the
    edges between them and their modifiers.
    """

    def __init__(self, verbose=0, parser="gum"):
        self.parser = parser
        self.binarizer = Binarizer()
        # (sentence, error) pairs for sentences the parser failed on.
        self.exceptioned = []
        self.verbose = verbose

        self.concept_pos = ["NN", "NNS", "NNP", "NNPS", "PRP", "PRP$"]
        self.relation_pos = ["VB", "VBD", "VBG", "VBN",
                             "VBP", "VBZ", "TO", "IN"]
        # Relations whose dependent is recorded as a modifier of its head.
        # NOTE(review): "nmod:pass" is kept as found; "nmod:poss" may have
        # been intended — confirm.
        self.modifiers = {"det", "nummod", "amod","obl:tmod", "nmod", "nmod:pass", "acl", "Prime", "fixed",
                          "advmod", "aux", "parataxis", "ccomp"}

        self.modifier_relation = {
            "NN": ["amod", "nmod", "acl:relcl", "fixed", "compound", "det", "nmod:poss", "conj", "nummod"],
            "VB": ["advmod", "acl", "obl", "xcomp", "advcl", "obl:tmod", "parataxis", "obj","ccomp"]
        }
        # Relation -> which end is collected: 0 = dependent, 1 = head, 2 = both.
        self.concept_dep = {"nsubj": 0,"obj":0,"cop":1,"conj":1,"nsubj:pass":1, "obl":0}
        self.relation_dep = {"xcomp":2,"advcl":0,"mark":0, "case":0, "cop":0, "obj":1, "obl":1,"acl:relcl":1}
        self.concept = None
        self.dataSet = dict()
        self.dictionary = dict()
        self.numSentence = 0

    @staticmethod
    def _link(G, relation, source, target):
        """Add the edge ``source -> target`` and log it in ``G.relDeps``."""
        source.nexts.add(target)
        G.relDeps.append([relation, source, target])

    @staticmethod
    def _collect(bucket, nodes, dep, kind):
        """Add the dependent (0), the head (1) or both ends (2) of ``dep`` to ``bucket``."""
        if kind == 2:
            bucket.add(nodes[dep[1] - 1])
            bucket.add(nodes[dep[2] - 1])
        else:
            bucket.add(nodes[dep[kind + 1] - 1])

    def extract_concepts_relation(self, deps, G):
        """Fill ``G.concepts``, ``G.relations`` and the graph edges from ``deps``.

        ``deps`` is a list of ``[deprel, word_id, head]`` triples; word ids are
        1-based positions into ``G.nodes``. Edges run head -> dependent for
        ``obj``/``obl`` and dependent -> head for the other concept relations.
        A relation listed in both tables gets a single edge.
        """
        nodes = G.nodes

        for dep in deps:
            relation = dep[0]
            is_concept_dep = relation in self.concept_dep

            if is_concept_dep:
                if relation in ("obj", "obl"):
                    source, target = nodes[dep[2] - 1], nodes[dep[1] - 1]
                else:
                    source, target = nodes[dep[1] - 1], nodes[dep[2] - 1]
                self._link(G, relation, source, target)
                self._collect(G.concepts, nodes, dep,
                              self.concept_dep[relation])

            if relation in self.relation_dep:
                kind = self.relation_dep[relation]
                if not is_concept_dep:
                    if kind == 0:
                        source, target = nodes[dep[1] - 1], nodes[dep[2] - 1]
                    else:
                        source, target = nodes[dep[2] - 1], nodes[dep[1] - 1]
                    self._link(G, relation, source, target)
                self._collect(G.relations, nodes, dep, kind)

    def add_modifiers(self, deps, G):
        """Attach each modifier dependent to its head's ``modifiers`` set."""
        nodes = G.nodes
        for dep in deps:
            if dep[0] in self.modifiers:
                nodes[dep[2] - 1].modifiers.add(nodes[dep[1] - 1])

    def down_right(self, tree):
        """Return the right-most descendant of ``tree``."""
        while tree.right is not None:
            tree = tree.right
        return tree

    def down_left(self, tree):
        """Return the left-most descendant of ``tree``."""
        while tree.left is not None:
            tree = tree.left
        return tree

    def collect_modifiers(self, tree, sent_set, mod_type="NN"):
        """Collect ``{'head', 'mod'}`` records per relation into ``sent_set``.

        Walks the binary tree; for each internal node whose relation is of
        interest, the words of one side are joined as the modifier text and
        the right-most word of the other side is taken as the head.
        """
        if not tree.is_tree:
            return

        leaves = []
        if tree.val in ["mark", "case", "compound", "flat", "nmod"]:
            leaves.append(
                (list(tree.right.sorted_leaves().popkeys()),
                 self.down_right(tree.left).val)
            )
        if tree.val in self.modifier_relation[mod_type]:
            leaves.append(
                (list(tree.left.sorted_leaves().popkeys()),
                 self.down_right(tree.right).val)
            )

        for leave in leaves:
            # NOTE(review): `leave` is always a 2-tuple, so this length check
            # is always true; `len(leave[0])` may have been intended — confirm.
            if len(leave) > 0 and len(leave) < 10:
                head = leave[1]
                modifier = ' '.join([x[0] for x in leave[0]])
                sent_set.setdefault(tree.val, []).append(
                    {'head': head, 'mod': modifier})

        self.collect_modifiers(tree.left, sent_set, mod_type)
        self.collect_modifiers(tree.right, sent_set, mod_type)

    def unpack_verbComp(self, G):
        """Route ``obl``/``xcomp`` edges through their ``mark``/``case`` word.

        For an edge ``verb -> X`` where a marker ``M`` points at ``X``, the
        logged edge and the verb's successor set are changed to ``verb -> M``.
        """
        for rel in G.relDeps:
            if rel[0] not in ["obl", "xcomp"]:
                continue
            nodeTo = rel[2]
            for rel2 in G.relDeps:
                if rel2[2] == nodeTo and rel2[0] in ["mark", "case"]:
                    rel[2] = rel2[1]
                    # The old target may already have been removed.
                    rel[1].nexts.discard(nodeTo)
                    rel[1].nexts.add(rel2[1])

    def init_graph(self, parsed, sent):
        """Create a graph with one node per word of ``parsed[2]``.

        ``parsed[2]`` maps word id to ``(text, xpos)``; nodes are appended in
        the mapping's iteration order. ``sent`` is not used.
        """
        G = Cgraph(Cnode("Root", "Root", "Root", "Root"))
        for index in parsed[2].keys():
            node1 = Cnode("", parsed[2][index][0], parsed[2][index][1], "")
            node1.index = index
            G.nodes.append(node1)
        return G

    def run_binarization(self, parsed, sentence):
        """Binarize a parse and return ``(tree, keys)``.

        Note that the binarizer consumes ``parsed[0]`` (entries are removed
        from the list while the tree is built).
        """
        self.binarizer.parse_table = parsed[0]
        self.binarizer.postag = parsed[1]
        self.binarizer.words = parsed[2]

        if self.verbose == 2:
            print()
            print(parsed[0])
            print()
            print(parsed[1])
            print()
            print(self.binarizer.replaced)

        self.binarizer.replaced = {}
        binary_dep, relation = self.binarizer.binarization()
        if self.verbose == 2:
            self.postprocess(binary_dep)
        return binary_dep, relation

    def sent2tree(self, sentence):
        """Parse ``sentence`` and return ``(tree, parsed, concepts, relations)``.

        Concepts and relations are extracted before binarization because the
        binarizer consumes the dependency list.
        """
        parsed = dependency_parse(sentence, self.parser)
        G = self.init_graph(parsed, sentence)
        self.extract_concepts_relation(parsed[0], G)
        binary_dep, _ = self.run_binarization(parsed, sentence)
        return binary_dep, parsed, G.concepts, G.relations

    def single_polarization(self, sentence):
        """Build the graph script for one sentence.

        Periods and commas are stripped first. Returns ``None`` when the
        parser raises; the failure is recorded in ``self.exceptioned``.
        """
        sentence = sentence.replace(".", "").replace(",", "")
        try:
            parsed = dependency_parse(sentence, self.parser)
        except Exception as error:
            # Only ordinary errors are swallowed; KeyboardInterrupt still
            # stops a long generateData run.
            self.exceptioned.append((sentence, repr(error)))
            return None

        G = self.init_graph(parsed, sentence)
        self.extract_concepts_relation(parsed[0], G)
        self.add_modifiers(parsed[0], G)
        self.unpack_verbComp(G)

        return {
            'sentence': sentence,
            'indexed': G.indexNodes(),
            'concepts': G.print_content(G.concepts),
            'conceptIndexed': G.print_content(G.concepts, False),
            'relations': G.print_content(G.relations),
            'relationIndexed': G.print_content(G.relations, False),
            'relationLogs': G.print_text(),
            'modifiers': G.print_modifiers(),
            'modifiersIndexed': G.print_modifiers(False),
            'parsed': parsed[2]
        }

    def postprocess(self, tree, svg=False):
        """Return ``tree`` as an s-expression.

        With ``svg`` false the nested list is flattened into a bracketed
        string without commas or quotes; otherwise the nested list itself is
        returned.
        """
        sexpression = btree2list(tree, 0)
        if not svg:
            sexpression = '[%s]' % ', '.join(
                map(str, sexpression)).replace(",", " ").replace("'", "")
        return sexpression

    def initiateSetting(self):
        """Reset the accumulated data, script dictionary and sentence counter."""
        self.dataSet = dict()
        self.dictionary = dict()
        self.numSentence = 0

    def parseSentence(self, filename, num):
        """Read up to ``num`` distinct ``sentence1`` values from a JSON-lines file.

        Lines are read lazily and reading stops as soon as ``num`` distinct
        sentences were collected. Returns a set.
        """
        output = set()
        with open(filename) as snli:
            for line in snli:
                if len(output) >= num:
                    break
                output.add(json.loads(line)['sentence1'])
        return output

    def generateData(self, num, source="./snli_1.0_train.jsonl",
                     data_path="./relation_data.json",
                     script_path="./relation_script3.json"):
        """Generate graph scripts for up to ``num`` sentences and dump them as JSON.

        Args:
            num: maximum number of distinct sentences to read from ``source``.
            source: JSON-lines input with a ``sentence1`` field per line.
            data_path: receives ``self.dataSet`` (nothing in this class fills
                it, so it is written as an empty object).
            script_path: receives the scripts keyed by a running counter.

        Sentences whose parse fails are skipped.
        """
        data = self.parseSentence(source, num)
        self.initiateSetting()
        with open(data_path, 'w') as jsonFile, \
                open(script_path, "w") as jsonScript:
            for sent in data:
                singleDict = self.single_polarization(sent)
                if singleDict is not None:
                    self.dictionary[self.numSentence] = singleDict
                    self.numSentence += 1

            json.dump(self.dataSet, jsonFile)
            json.dump(self.dictionary, jsonScript, indent=2, separators = (', \n', ': '))

    def script_to_graph(self, script):
        """Rebuild a ``Cgraph`` from a script produced by ``single_polarization``.

        Indices are normalised with ``int`` so scripts reloaded from JSON
        (where dictionary keys become strings) are accepted as well.
        """
        words = {int(index): entry for index, entry in script['parsed'].items()}
        G = self.init_graph(["", "", words], script['sentence'])
        by_index = {node.index: node for node in G.nodes}

        for index in script['conceptIndexed']:
            G.concepts.add(by_index[int(index)])
        for index in script['relationIndexed']:
            node = by_index[int(index)]
            node.isRelation = True
            G.relations.add(node)
        for head, modifier_indices in script['modifiersIndexed'].items():
            for modifier in modifier_indices:
                by_index[int(head)].modifiers.add(by_index[int(modifier)])
        # Each log entry is [from_word, to_word, from_index, to_index].
        for log in script['relationLogs']:
            by_index[int(log[2])].nexts.add(by_index[int(log[3])])
        return G

    def to_nn_input(self, script):
        """Placeholder; returns ``None``."""
        return
       
         
        
        


In [222]:
# Single shared pipeline instance (default verbosity, "gum" parser) used by the cells below.
graph_factory = GraphFactoryPipeline()

In [114]:
import pprint
pp = pprint.PrettyPrinter(indent=2)  

# Demo: build the graph script for a short sentence and show its parts.
# single_polarization returns None when parsing fails, in which case the
# subscripts below would raise.
sentence = "I am sleeping on the bed"#"That store sales some beautiful flowers to attract customers."
graph = graph_factory.single_polarization(sentence)
#print(tree1.left.val)
#graph_factory.tree_toGraph(tree1)
#print(Tree.fromstring("(a(b))"))
#jupyter_draw_nltk_tree(Tree.fromstring("(a (b c))"))
print("\nOutput: \n")
print('Concepts: ', graph['concepts'])
print('Relations: ', graph['relations'])
print('Modifiers: ', graph['modifiers'])

[['I', 'sleeping'], ['sleeping', 'on'], ['on', 'bed']]

Output: 

Concepts:  ['bed', 'I']
Relations:  ['on', 'sleeping']
Modifiers:  {'sleeping3': ['am'], 'bed6': ['the']}


In [223]:
# Reads up to 100 distinct sentences from ./snli_1.0_train.jsonl and writes
# ./relation_data.json and ./relation_script3.json (see generateData).
graph_factory.generateData(100)

In [64]:
# Demo: sentence with an object and an oblique ("at a restaurant").
# NOTE(review): the stored output below also shows the raw parse and "2222",
# debug prints that are commented out in the current code.
sentence = "An older man is drinking orange juice at a restaurant"
graph = graph_factory.single_polarization(sentence)
print("\nOutput: \n")
print('Concept: ', graph['concepts'])
print('Relations: ', graph['relations'])
print('Modifiers: ', graph['modifiers'])

([['det', 1, 3], ['amod', 2, 3], ['nsubj', 3, 5], ['aux', 4, 5], ['root', 5, 'root'], ['amod', 6, 7], ['obj', 7, 5], ['case', 8, 10], ['det', 9, 10], ['obl', 10, 5]], {'An': (1, 'DT'), 'older': (2, 'JJR'), 'man': (3, 'NN'), 'is': (4, 'VBZ'), 'drinking': (5, 'VBG'), 'orange': (6, 'JJ'), 'juice': (7, 'NN'), 'at': (8, 'IN'), 'a': (9, 'DT'), 'restaurant': (10, 'NN')}, {1: ('An', 'DT'), 2: ('older', 'JJR'), 3: ('man', 'NN'), 4: ('is', 'VBZ'), 5: ('drinking', 'VBG'), 6: ('orange', 'JJ'), 7: ('juice', 'NN'), 8: ('at', 'IN'), 9: ('a', 'DT'), 10: ('restaurant', 'NN')})
2222
[['man', 'drinking'], ['drinking', 'juice'], ['drinking', 'at'], ['at', 'restaurant']]

Output: 

Concept:  ['juice', 'restaurant', 'man']
Relations:  ['at', 'drinking']
Modifiers:  {'man3': ['An', 'older'], 'drinking5': ['is'], 'juice7': ['orange'], 'restaurant10': ['a']}


In [188]:
# Demo: sentence with a possessive and an adverbial clause. In the stored
# output "sets" is tagged NNS and no concepts are extracted.
# Commas are stripped by single_polarization before parsing.
sentence = "Children's soccer game being played, while the sun sets in the background"
graph = graph_factory.single_polarization(sentence)
print("\nOutput: \n")
print('Concept: ', graph['concepts'])
print('Relations: ', graph['relations'])
print('Modifiers: ', graph['modifiers'])

([['nmod:poss', 1, 4], ['case-after', 2, 1], ['compound', 3, 4], ['root', 4, 'root'], ['aux:pass', 5, 6], ['acl', 6, 4], ['mark', 7, 10], ['det', 8, 10], ['compound', 9, 10], ['advcl', 10, 6], ['case', 11, 13], ['det', 12, 13], ['nmod', 13, 10]], {'Children': (1, 'NNS'), "'s": (2, 'POS'), 'soccer': (3, 'NN'), 'game': (4, 'NN'), 'being': (5, 'VBG'), 'played': (6, 'VBN'), 'while': (7, 'IN'), 'the': (12, 'DT'), 'sun': (9, 'NN'), 'sets': (10, 'NNS'), 'in': (11, 'IN'), 'background': (13, 'NN')}, {1: ('Children', 'NNS'), 2: ("'s", 'POS'), 3: ('soccer', 'NN'), 4: ('game', 'NN'), 5: ('being', 'VBG'), 6: ('played', 'VBN'), 7: ('while', 'IN'), 8: ('the', 'DT'), 9: ('sun', 'NN'), 10: ('sets', 'NNS'), 11: ('in', 'IN'), 12: ('the', 'DT'), 13: ('background', 'NN')})
10
6
13

Output: 

Concept:  []
Relations:  ['sets', 'in', 'while']
Modifiers:  {'game4': ['played'], 'sets10': ['the', 'background'], 'background13': ['the']}


In [15]:
import json
# Load the protoroles dataset and print the "protoroles" entry of its metadata.
# Note: this rebinds the module-level name `data` and prints a large structure.
with open("./data/protoroles.json") as j:
    data = json.load(j)
    print(data['metadata']['protoroles'])

{'awareness': {'value': {'datatype': 'int', 'categories': [0, 1, 2, 3, 4], 'ordered': True}, 'confidence': {'datatype': 'int', 'categories': [0, 1], 'ordered': False}, 'annotators': ['protoroles-annotator-0', 'protoroles-annotator-1', 'protoroles-annotator-10', 'protoroles-annotator-11', 'protoroles-annotator-12', 'protoroles-annotator-13', 'protoroles-annotator-14', 'protoroles-annotator-15', 'protoroles-annotator-16', 'protoroles-annotator-17', 'protoroles-annotator-18', 'protoroles-annotator-19', 'protoroles-annotator-2', 'protoroles-annotator-20', 'protoroles-annotator-21', 'protoroles-annotator-22', 'protoroles-annotator-23', 'protoroles-annotator-24', 'protoroles-annotator-25', 'protoroles-annotator-26', 'protoroles-annotator-27', 'protoroles-annotator-28', 'protoroles-annotator-29', 'protoroles-annotator-3', 'protoroles-annotator-30', 'protoroles-annotator-31', 'protoroles-annotator-32', 'protoroles-annotator-33', 'protoroles-annotator-34', 'protoroles-annotator-35', 'protoroles