In [81]:
import spacy
from queue import Queue
import re

# Optional GPU switch, left disabled: uncomment to make spaCy require GPU 0.
# spacy.require_gpu(0)

# Module-level spaCy pipeline (the "en_core_web_md" model). It is the parser
# that segment_verbs_and_objects calls on every clause it receives.
nlp = spacy.load("en_core_web_md")

In [92]:
# Clause delimiters: every coordinating conjunction (For, And, Nor, But, Or,
# Yet, So) preceded by a comma, first in the ", word" spelling and then in the
# ",word" spelling. segment_clauses joins these into its split pattern.
FANBOYS = [
    separator + conjunction
    for separator in (", ", ",")
    for conjunction in ("for", "and", "nor", "but", "or", "yet", "so")
]

class WordNode:
    """Mutable tree node that mirrors a single parsed token.

    The node copies the token's text, part-of-speech tag, position and
    dependency label at construction time and then keeps its own
    ``children`` / ``parent`` links, so the tree can be rearranged freely
    without affecting the object it was built from.
    """

    def __init__(self, word):
        """Snapshot ``word`` (needs ``text``, ``pos_``, ``i`` and ``dep_``)."""
        self.text, self.pos = word.text, word.pos_
        self.idx, self.lbl = word.i, word.dep_
        self.children = []
        self.parent = None
        # True once this node has been marked as the head of an object phrase.
        self.obj_root = False

    def add_child(self, child):
        """Append ``child`` to this node's children (duplicates are allowed)."""
        self.children.append(child)

    def remove_child(self, child):
        """Detach the first occurrence of ``child``; do nothing if absent."""
        try:
            self.children.remove(child)
        except ValueError:
            # Not a child of this node: removal is a silent no-op.
            pass

    def add_parent(self, parent):
        """Set (or replace) this node's parent link."""
        self.parent = parent

    def print(self):
        """Print a one-line summary: text, children's texts, object flag."""
        child_texts = [node.text for node in self.children]
        print("Node summary:", self.text, child_texts, self.obj_root)

    def is_children(self, word):
        """Return True when ``word`` is currently one of this node's children."""
        return word in self.children

    def set_object(self):
        """Mark this node as the head of an object phrase."""
        self.obj_root = True

def segment_verbs_and_objects(input_string):
    """Break one clause into simple "subject + verb + object" statements.

    The clause is parsed with the module-level ``nlp`` pipeline and every
    token is mirrored by a ``WordNode`` so the tree can be rewired freely.
    The root verb, plus every verb reachable from it through verb-to-verb
    child links, becomes an "action".  Noun, pronoun, proper-noun and
    adposition children of an action (other than its ``nsubj``), together
    with their ``conj`` / ``dobj`` / adposition descendants, become separate
    objects of that action.  An action that has no ``dobj`` or adposition
    child borrows the objects of the first action (in discovery order) that
    has any.  One statement is produced per (action, object) pair.

    Args:
        input_string: Text of a single clause.

    Returns:
        list[str]: The generated statements, each built from the selected
        tokens in sentence order and ending with a trailing space.  If the
        parse has no ROOT verb, or that verb has no ``nsubj`` child, the
        result is ``[input_string]``.

    Side effects:
        Prints every generated statement.
    """
    doc = nlp(input_string)

    # Mirror the parse as mutable nodes, indexed by token position.
    word_nodes = [WordNode(token) for token in doc]
    for token in doc:
        for child in token.children:
            word_nodes[token.i].add_child(word_nodes[child.i])

    def find_root_of_sentence(doc):
        """Return the last token that is both ROOT and a VERB, else None."""
        root_token = None
        for token in doc:
            if token.dep_ == "ROOT" and token.pos_ == "VERB":
                root_token = token
        return root_token

    root_token = find_root_of_sentence(doc)
    if root_token is None:
        return [input_string]

    def get_subject(root_token):
        """Return the first ``nsubj`` child of ``root_token``, else None."""
        for child in root_token.children:
            if child.dep_ == "nsubj":
                return child
        return None

    subject = get_subject(root_token)
    if subject is None:
        return [input_string]

    # Re-hang the root verb under its subject so that the subject phrase can
    # later be collected by walking from the verb's parent.
    word_nodes[root_token.i].add_parent(word_nodes[subject.i])
    word_nodes[root_token.i].remove_child(word_nodes[subject.i])
    word_nodes[subject.i].remove_child(word_nodes[root_token.i])
    verbs = [root_token]

    def parse_other_verbs(root_token):
        """Collect verbs below ``root_token`` and attach each to a subject.

        A verb child is detached from its governing verb and first given the
        main subject as parent; if it has its own ``nsubj`` child, the
        recursive call replaces that parent with the verb's own subject.
        """
        other_verbs = []
        for child in root_token.children:
            if child.pos_ == "VERB" and child != root_token:
                other_verbs.append(child)
                word_nodes[root_token.i].remove_child(word_nodes[child.i])
                word_nodes[child.i].add_parent(word_nodes[subject.i])
                other_verbs.extend(parse_other_verbs(child))
            if child.dep_ == "nsubj":
                word_nodes[root_token.i].add_parent(word_nodes[child.i])
                word_nodes[root_token.i].remove_child(word_nodes[child.i])
        return other_verbs

    verbs.extend(parse_other_verbs(root_token))

    # Verbs with neither a direct object nor an adposition child are
    # "dependent": they wait (FIFO) to borrow objects from another verb.
    dependent_verbs = Queue()
    for verb in verbs:
        has_own_target = any(
            child.dep_ == "dobj" or child.pos_ == "ADP" for child in verb.children
        )
        if not has_own_target:
            dependent_verbs.put(verb)

    # Token positions already expanded by find_obj_dfs (shared by all verbs).
    visited = set()

    def find_obj_dfs(verb_token, root_obj_token):
        """Lift conj/dobj/ADP descendants of an object up to the verb.

        Each such descendant is detached from ``root_obj_token``, attached
        directly to ``verb_token`` and flagged as an object head, so it later
        yields its own statement.  Coordinators and punctuation under the
        object are dropped.  Returns the lifted tokens.
        """
        visited.add(root_obj_token.i)
        object_nodes = []
        for child in root_obj_token.children:
            is_linked_object = (
                child.dep_ == "conj" or child.dep_ == "dobj" or child.pos_ == "ADP"
            )
            if is_linked_object and child.i not in visited:
                object_nodes.extend(find_obj_dfs(verb_token, child))
                word_nodes[verb_token.i].add_child(word_nodes[child.i])
                word_nodes[root_obj_token.i].remove_child(word_nodes[child.i])
                word_nodes[child.i].set_object()
                object_nodes.append(child)
            if child.dep_ == "cc" or child.dep_ == "punct":
                word_nodes[root_obj_token.i].remove_child(word_nodes[child.i])
        return object_nodes

    def get_all_objects(verb_token):
        """Flag and return every object token belonging to ``verb_token``.

        Also strips coordinator and punctuation children from the verb node.
        """
        object_nodes = []
        for child in verb_token.children:
            if child.pos_ in ("NOUN", "PRON", "PROPN", "ADP") and child.dep_ != "nsubj":
                object_nodes.extend(find_obj_dfs(verb_token, child))
                word_nodes[child.i].set_object()
                object_nodes.append(child)
            if child.dep_ == "cc" or child.dep_ == "punct":
                word_nodes[verb_token.i].remove_child(word_nodes[child.i])
        return object_nodes

    # The first verb that owns objects lends them to every dependent verb.
    for action in verbs:
        object_nodes = get_all_objects(action)
        if len(object_nodes) > 0:
            while not dependent_verbs.empty():
                dependent_node = word_nodes[dependent_verbs.get().i]
                for object_token in object_nodes:
                    object_node = word_nodes[object_token.i]
                    # A "dependent" verb may still own objects found above
                    # (e.g. a noun child that is not a dobj).  When it lends
                    # to itself, re-adding them would duplicate every
                    # statement, so attach only what is not already there.
                    if not dependent_node.is_children(object_node):
                        dependent_node.add_child(object_node)

    def dfs_traverse(root_node):
        """Return the token positions of ``root_node`` and its whole subtree."""
        seen_nodes = [root_node.idx]
        for child in root_node.children:
            seen_nodes.extend(dfs_traverse(child))
        return seen_nodes

    # One statement per (verb, object): subject phrase + verb + non-object
    # verb modifiers + that object's phrase, restored to sentence order.
    clauses = []
    for action in verbs:
        action_node = word_nodes[action.i]
        subjects = dfs_traverse(action_node.parent)

        verb_extras = []
        for child in action_node.children:
            if not child.obj_root:
                verb_extras.extend(dfs_traverse(child))

        for child in action_node.children:
            if child.obj_root:
                object_span = dfs_traverse(child)
                token_indices = sorted(subjects + verb_extras + object_span + [action.i])
                # Every word is followed by a space, including the last one.
                total_sentence_text = "".join(
                    word_nodes[idx].text + " " for idx in token_indices
                )
                clauses.append(total_sentence_text)
                print(total_sentence_text)

    return clauses

def segment_clauses(input_string):
    """Split a sentence at comma + coordinating conjunction and segment each part.

    The delimiters come from the module-level ``FANBOYS`` list.  A delimiter
    only counts when the conjunction is a whole word: without that check
    ", so" would also match the start of ", sometimes" (and ", or" the start
    of ", orange"), cutting a word in half.

    Args:
        input_string: Text of a single sentence.

    Returns:
        None.  Each resulting piece is passed to ``segment_verbs_and_objects``,
        which prints the statements it generates.
    """
    alternatives = '|'.join(map(re.escape, FANBOYS))
    # (?!\w): the conjunction must not be followed by another word character.
    pattern = '(?:' + alternatives + r')(?!\w)'

    clauses = re.split(pattern, input_string)

    for clause in clauses:
        segment_verbs_and_objects(clause)


def segment_sentences(input_string):
    """Cut the text at every "." and run clause segmentation on each piece.

    The split is purely textual: every period is a boundary, and the piece
    after a trailing period (an empty string) is forwarded as well.
    NOTE(review): periods inside abbreviations or decimals are therefore
    treated as sentence ends too.

    Args:
        input_string: Text containing one or more sentences.

    Returns:
        None.  Each piece is handed to ``segment_clauses``.
    """
    for fragment in input_string.split("."):
        segment_clauses(fragment)

In [94]:

if __name__ == "__main__":
    # Demo run: echo each sample sentence behind a red "Original:" label,
    # then print the statements it is segmented into.
    for test_input in (
        "He loves and hates the mountains and the beach",
        "The big brown fox loves guitar but smashes every guitar he sees",
        "I ran to the bench while he ran to the closet",
        "Sally and Mr Smith both love ice cream, popsicles and cake",
        "I love the product but not the price",
        "I bought this product, which broke in two months, so I threw it away",
    ):
        print("\033[31m" + "Original:" + "\033[0m", test_input)
        segment_sentences(test_input)


[31mOriginal:[0m He loves and hates the mountains and the beach
He loves the beach 
He loves the mountains 
He hates the mountains 
He hates the beach 
[31mOriginal:[0m The big brown fox loves guitar but smashes every guitar he sees
The big brown fox loves guitar 
The big brown fox smashes every guitar he sees 
[31mOriginal:[0m I ran to the bench while he ran to the closet
I ran to the bench 
while he ran to the closet 
[31mOriginal:[0m Sally and Mr Smith both love ice cream, popsicles and cake
Sally and Mr Smith both love ice cream 
Sally and Mr Smith both love cake 
Sally and Mr Smith both love popsicles 
[31mOriginal:[0m I love the product but not the price
I love the product 
I love not the price 
[31mOriginal:[0m I bought this product, which broke in two months, so I threw it away
I bought this product which broke in two months 
I threw it away 
