In [1]:
import spacy
from spacy import displacy
import benepar
from benepar.spacy_plugin import BeneparComponent

nlp = spacy.load('en_core_web_lg')

  from ._conv import register_converters as _register_converters


In [2]:
nlp.add_pipe(BeneparComponent("benepar_en"))

In [23]:
docs = nlp.pipe(['The cat ate the hat.'])

In [24]:
for doc in docs:
    doc = nlp.get_pipe('benepar')(doc)
    for sent in doc.sents:
        print('-----')
        print(sent.text)
        print(list(sent._.children))

-----
The cat ate the hat.
[The cat, ate the hat, .]


In [8]:
nlp.get_pipe('benepar')

<benepar.spacy_plugin.BeneparComponent at 0x7fea117387b8>

In [4]:
def parse(sent_text):
    doc = nlp(sent_text, disable=['ner', 'textcat'])
    for token in doc:
        print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_, list(token.children))
    sent = list(doc.sents)[0]
    print(sent._.parse_string)
    print(list(sent._.children))

In [51]:
parse('The black goat who had been eating since the morning slept in the grass.')
parse('He had arranged that the skipper should steer.')
parse('He and the white goat that ate the cheese went to the store.')

The the DET DT det []
black black ADJ JJ amod []
goat goat NOUN NN ROOT [The, black, eating, .]
who who NOUN WP nsubj []
had have VERB VBD aux []
been be VERB VBN aux []
eating eat VERB VBG relcl [who, had, been, slept]
since since ADP IN mark []
the the DET DT det []
morning morning NOUN NN nsubj [the]
slept sleep VERB VBD advcl [since, morning, in]
in in ADP IN prep [grass]
the the DET DT det []
grass grass NOUN NN pobj [the]
. . PUNCT . punct []
(S (NP (NP (DT The) (JJ black) (NN goat)) (SBAR (WHNP (WP who)) (S (VP (VBD had) (VP (VBN been) (VP (VBG eating) (PP (IN since) (NP (DT the) (NN morning))))))))) (VP (VBD slept) (PP (IN in) (NP (DT the) (NN grass)))) (. .))
[The black goat who had been eating since the morning, slept in the grass, .]
He -PRON- PRON PRP nsubj []
had have VERB VBD aux []
arranged arrange VERB VBN ROOT [He, had, steer, .]
that that ADP IN mark []
the the DET DT det []
skipper skipper NOUN NN nsubj [the]
should should VERB MD aux []
steer steer VERB VB ccomp [th

In [6]:
from collections import deque
from pattern.en import conjugate
import traceback

In [7]:
# These must match the conjugations of the `pattern` package.
PAST_CONJ = {
    '1sg': '1sgp',
    '2sg': '2sgp',
    '3sg': '3sgp',
    'pl': 'ppl',
    'part': 'ppart',
}

def contains_one_of(haystack, needles):
    for item in haystack:
        for needle in needles:
            if item == needle:
                return True
    return False

def conj_to_past(number_person):
    return PAST_CONJ.get(number_person, None)

def is_past(tok, number_person=None):
    """
    Is `verb` past tense, where `verb` has lemma `lemma`
    and number/person `number_person`?
    """
    if tok.tag_ == 'VBN' or tok.tag_ == 'VBD':
        return True
    if number_person:
        return conjugate(tok.lemma_, conj_to_past(number_person)) == tok.text
    return (
        conjugate(tok.lemma_, '1sgp') == tok.text
        or conjugate(tok.lemma_, '2sgp') == tok.text
        or conjugate(tok.lemma_, '3sgp') == tok.text
    )

def is_participle(span):
    """
    Expects a span of length one.
    """
    return contains_one_of(span._.labels, ('VBG', 'VBN'))

def find_nsubj(np):
    """
    Expects a span with label NP.
    """
    nsubj = None
    s = []
    s.append(np)
    while s:
        span = s.pop()
        if len(span) == 1 and (span[0].dep_ == 'nsubj' or span[0].dep_ == 'nsubjpass'):
            nsubj = span
        elif contains_one_of(span._.labels, ('S', 'ADVP', 'PP', 'ADJP', 'PRN', 'QP', 'RRC', 'X')):
            continue
        s.extend(span._.children)
    return nsubj

def number_person(token):
    """
    Expects a token with POS == 'NOUN'.
    """
    for tok in token.children:
        if tok.lower_ == 'and':
            return 'pl'
    if token.tag_ == 'NNPS' or token.tag_ == 'NNS' or token.lower_ == 'we' or token.lower_ == 'they':
        return 'pl'
    elif token.lower_ == 'i':
        return '1sg'
    elif token.lower_ == 'you':
        return '2sg'
    else:
        return '3sg'

def object_to_subject(np):
    subject = ''
    s = [np]
    while s:
        span = s.pop()
        if len(span) == 1 and span[0].tag_ == 'PRP':
            if span.text == 'him':
                subject += 'he' + span[0].whitespace_
            elif span.text == 'her':
                subject += 'she' + span[0].whitespace_
            elif span.text == 'them':
                subject += 'they' + span[0].whitespace_
            else:
                subject += span.text_with_ws
        elif contains_one_of(span._.labels, ('S', 'ADVP', 'PP', 'ADJP', 'PRN', 'QP', 'RRC', 'X')):
            continue
        elif len(span) == 1:
            subject += span.text_with_ws
        s.extend(reversed(list(span._.children)))
    return subject.strip()

In [8]:
from pattern.en import conjugate

def present_to_past(token):
    txt = token.text.lower()
    if conjugate(token.lemma_, '1sg') == txt:
        return conjugate(token.lemma_, '1sgp')
    elif conjugate(token.lemma_, '2sg') == txt:
        return conjugate(token.lemma_, '2sgp')
    elif conjugate(token.lemma_, '3sg') == txt:
        return conjugate(token.lemma_, '3sgp')
    elif conjugate(token.lemma_, 'pl') == txt:
        return conjugate(token.lemma_, 'ppl')
    elif conjugate(token.lemma_, 'part') == txt:
        return conjugate(token.lemma_, 'ppart')
    else:
        return token.text

def transform_past(sent_text):
    doc = nlp(sent_text)
    trans_text = []
    for sent in doc.sents:
        trans_text.append(transform_past_span(sent))
    return ' '.join(trans_text)

def transform_past_span(sent):
    tr_doc = []
    for tok in sent:
        if tok.tag_ == 'VBZ' or tok.tag_ == 'VBP':
            if tok.dep_ == 'aux':
                if tok.lemma_ == 'do':
                    # present emphatic
                    tr_doc.append('did')
                elif tok.lemma_ == 'be':
                    # present progressive
                    tr_doc.append(present_to_past(tok))
            else:
                # present
                tr_doc.append(present_to_past(tok))
        else:
            tr_doc.append(tok.text)
    return ' '.join(tr_doc)

In [202]:
def transform_present(text, echo=False):
    """
    It is not allowed to have sentences that conjunct verbs of different tenses
    without auxiliary verbs:
    
    * "We will go tomorrow and go today."
    
    This is because they are ambiguous without knowledge of adverbs like "tomorrow"
    and "today":
    
    "We will go and eat."
    
    Should eat be present or future tense? We assume future to avoid having to
    analyze the semantics of adverbial phrases.
    """
    doc = nlp(text)
    trans_text = []
    for sent in doc.sents:
        trans_text.append(Sentence(sent).transform_present())
        if echo:
            print(trans_text[-1])
    return ' '.join(trans_text)

class Verb(object):
    """
    This class if for those verbs which have the following dependency labels:
    
    VERB ROOT
    WHNP > WDT --relcl--> NN
    VERB --advcl--> NN
    VERB --advcl--> IN
    VERB --ccomp--> NN
    """
    VERB_DEPS = ['ROOT', 'conj', 'relcl', 'advcl', 'ccomp']

    AUXILIARY_MODALS = [
        'can', 'could', 'may', 'might', 'must', 'shall', 'should', 'will', 'would'
    ]
    
    def __init__(self, tok=None, clause=None, number_person='pl'):
        """
        If the verb comes from a fragment without a subject, then default conjugation is past plural.
        """
        self.is_parsed = False
        self.break_recursion = False
        self.tok = tok
        self.clause = clause
        self.aux = []
        self.nsubj = None
        self.do = False
        self.been = False
        self.number_person = number_person
        self.is_past = False
        self.is_future = False
        self.have_pres = False
        self.have_past = False
        self.is_modal = False
        self.is_participle = False
        if self.tok and self.clause:
            self.parse()

    def __str__(self):
        s = f"""
        is parsed: {self.is_parsed}
        verb: {self.tok.lower_}
        verb lemma: {self.tok.lemma_}
        verb dependency label: {self.tok.lemma_}
        verb constituent tag: {self.tok.tag_}"""
        for i, a in enumerate(self.aux):
            s += f"""
        aux {i}: {a.lower_}
        aux {i} lemma: {a.lemma_}
        aux {i} dependency label: {a.dep_}"""
        nsubj = self.nsubj.text if self.nsubj else None
        s += f"""
        subject: {self.nsubj}
        number_person: {self.number_person}
        has 'do': {self.do}
        has 'been': {self.been}
        is past tense: {self.is_past}
        is future tense: {self.is_future}
        is participle: {self.is_participle}
        has 'have' in present tense: {self.have_pres}
        has 'have' in past tense: {self.have_past}
        has modal: {self.is_modal}
"""
        return s
    
    def parse(self):
        if not self.tok or not self.clause:
            raise Exception(
                'Verb.parse_verb() requires the `tok` and `clause` '
                'variables to be set.')
        if self.break_recursion:
            raise Exception(
                'There is a circular dependency among conjunctive verbs.')
        # Reset variables.
        self.aux = []
        self.do = False
        self.been = False
        self.have_pres = False
        self.have_past = False
        self.is_modal = False
        self.is_past = False
        self.is_future = False
        # Check for previous verb joined by conjunction.
        self.break_recursion = True
        prev_verb = self.clause.prev_verb(self)
        self.break_recursion = False
        if prev_verb:
            self.nsubj = prev_verb.nsubj
            self.number_person = prev_verb.number_person
            for child in self.tok.children:
                if child.lemma_ == 'not':
                    self.not_token = child
                elif child.dep_ == 'aux' or child.dep_ == 'auxpass':
                    self.aux.append(child)
            if not self.aux:
                if prev_verb.do:
                    # Special exception: He did not walk but talked.
                    if self.tok.tag_ == 'VBD':
                        self.do = False
                        self.aux = []
                    else:
                        self.do = True
                        self.aux = prev_verb.aux
                else:
                    self.do = False
                    self.aux = prev_verb.aux
                self.been = prev_verb.been
                self.have_pres = prev_verb.have_pres
                self.have_past = prev_verb.have_past
                self.is_modal = prev_verb.is_modal
                self.is_past = prev_verb.is_past
                self.is_future = prev_verb.is_future
            else:
                self.is_past = self._is_past()
            # Calculate negation status.
            # Algo: calculate negation status of previous verb
            #       then calculate neg of current verb by
            #       searching for not/n't/but
            #
            # * "not"/"n't" have dep_ == 'neg'
            # * "but" has dep_ == 'cc' and is child of
            #   the first verb of conjunction
            #
            # Ex:
            #   did not talk or walk
            #   did not talk but walked and balked
            #   did talk but didn't walk or balk
            #   didn't talk but did walk and balk
        else:
            # Iterates children in order of appearance.
            for child in self.tok.children:
                if child.dep_ == 'nsubj' or child.dep_ == 'nsubjpass':
                    self.nsubj = child
                elif child.dep_ == 'aux' or child.dep_ == 'auxpass':
                    self.aux.append(child)
            for a in self.aux:
                if a.lemma_ == 'do':
                    self.do = True
                elif a.lower_ == 'been':
                    self.been = True
                elif a.lower_ == 'will':
                    self.is_future = True
                if (a.lower_ == 'have' or a.lower_ == 'has'):
                    self.have_pres = True
                if a.lower_ == 'had':
                    self.have_past = True
            self.is_modal = self.aux and self._is_aux_modal(self.aux[0])
            self.number_person = self._number_person(self.nsubj)
            self.is_past = self._is_past()
        self.is_participle = self._is_participle(self.tok)
        self.is_parsed = True

    def transform_to_present_str(self, tok):
        """
        Takes in a token. If the token is this verb or one of its auxiliaries,
        then it converts it to present tense.
        
        Assumes that `self` is a past tense verb.
        
        Rules:
        * past > present
        * had + ppart > has/have + ppart
        * had + been + ppart > has/have + been + ppart
        * was + ppart > is + ppart
        * did > does, had > has/have, was > is
        * would have + ppart > would + inf
        * would have + been + ppart> would be + ppart
        """
        if self.aux and tok == self.aux[0]:
            if tok.lemma_ == 'do':
                return conjugate('do', self.number_person) + tok.whitespace_
            elif self.is_modal:
                return tok.text_with_ws
            elif self.is_participle and tok.lower_ == 'has':
                return conjugate('have', self.number_person) + tok.whitespace_
            else:
                return conjugate(tok.lemma_, self.number_person) + tok.whitespace_
        elif tok in self.aux:
            if self.is_modal and not self.is_future and tok.lower_ == 'have':
                return ''
            elif self.is_modal and not self.is_future and tok.lower_ == 'been':
                return 'be' + tok.whitespace_
            else:
                return tok.text_with_ws
        elif tok == self.tok:
            if not self.aux:
                return conjugate(tok.lemma_, self.number_person) + tok.whitespace_
            elif self.is_modal and not self.been and not self.is_future:
                return tok.lemma_ + tok.whitespace_
            else:
                return tok.text_with_ws
        return None

    @classmethod
    def _is_aux_modal(cls, tok):
        return tok.lower_ in cls.AUXILIARY_MODALS

    @classmethod
    def _contains_one_of(cls, haystack, needles):
        for item in haystack:
            for needle in needles:
                if item == needle:
                    return True
        return False

    @classmethod
    def _number_person(cls, token):
        """
        Expects a token with POS == 'NOUN'.
        """
        # Default is plural.
        if not token:
            return 'pl'
        for tok in token.children:
            if tok.lower_ == 'and':
                return 'pl'
        if token.tag_ == 'NNPS' or token.tag_ == 'NNS' or token.lower_ == 'we' or token.lower_ == 'they':
            return 'pl'
        elif token.lower_ == 'i':
            return '1sg'
        elif token.lower_ == 'you':
            return '2sg'
        else:
            return '3sg'

    def _is_past(self):
        if self.is_future:
            return False
        if self.do:
            return self._is_past_tok(self.aux[0], self.number_person)
        else:
            if self.aux and self._is_past_tok(self.aux[0], self.number_person):
                return True
            elif self._is_past_tok(self.tok, self.number_person):
                return True
        return False
        
    @classmethod
    def _is_past_tok(cls, tok, number_person=None):
        """
        Is `verb` past tense, where `verb` has lemma `lemma`
        and number/person `number_person`?
        """
        PAST_CONJ = {
            '1sg': '1sgp',
            '2sg': '2sgp',
            '3sg': '3sgp',
            'pl': 'ppl',
            'part': 'ppart',
        }
        if tok.tag_ == 'VBN' or tok.tag_ == 'VBD':
            return True
        if number_person:
            return conjugate(tok.lemma_, PAST_CONJ[number_person]) == tok.text
        return (
            conjugate(tok.lemma_, '1sgp') == tok.text
            or conjugate(tok.lemma_, '2sgp') == tok.text
            or conjugate(tok.lemma_, '3sgp') == tok.text
        )

    @classmethod
    def _is_participle(cls, tok):
        """
        Expects a token.
        """
        return tok.tag_ == 'VBG' or tok.tag_ == 'VBN'


class Clause(object):
    def __init__(self, span):
        self.verbs = []
        self.span = span

    def prev_verb(self, verb):
        """
        If a verb is connected by a conjunction to a previous verb,
        then get that verb.
        """
        prev_verb = None
        if verb.tok.dep_ == 'conj':
            for tok in verb.tok.ancestors:
                if tok.dep_ in VERB_DEPS:
                    for v in self.verbs:
                        if v.tok == tok:
                            prev_verb = v
                            if not prev_verb.is_parsed:
                                prev_verb.parse()
                    break
        return prev_verb
    
    def parse_verbs(self):
        self.verbs = []
        first = True
        # Depth-first search to find verbs.
        s = []
        s.append(self.span)
        while s:
            span = s.pop()
            if 'S' in span._.labels or 'SBAR' in span._.labels:
                if first:
                    first = False
                else:
                    continue
            elif len(span) == 1 and span[0].dep_ in Verb.VERB_DEPS:
                self.verbs.append(Verb(span[0], self))
            s.extend(reversed(list(span._.children)))
        
    def transform_present(self):
        self.parse_verbs()
        past_verbs = [v for v in self.verbs if v.is_past]
        first = True
        new_text = ''
        #for v in verbs:
        #    print(str(v))
        # Depth-first search to transform past to present.
        s = []
        s.append(self.span)
        while s:
            span = s.pop()
            if len(span) == 1:
                span_is_past_verb = False
                if span[0].pos_ == 'VERB':
                    for v in past_verbs:
                        present = v.transform_to_present_str(span[0])
                        if present is not None:
                            new_text += present
                            span_is_past_verb = True
                            break
                if not span_is_past_verb:
                    new_text += span.text_with_ws
            elif 'S' in span._.labels or 'SBAR' in span._.labels:
                if first:
                    first = False
                    s.extend(reversed(list(span._.children)))
                else:
                    ws = ' ' if span.text_with_ws[-1] == ' ' else ''
                    new_text += Clause(span).transform_present() + ws
            else:
                s.extend(reversed(list(span._.children)))
        new_text = new_text.replace(' .', '.').replace(' ,', ',')
        if new_text and new_text[-1] == ' ':
            new_text = new_text[:-1]
        return new_text

class Sentence(object):
    def __init__(self, span):
        self.span = span
    
    def transform_present(self):
        if not self.span:
            raise Exception(
                'Sentence.transform_present() requires the sentence span to be set.')
        try:
            text = Clause(self.span).transform_present()
            if text:
                text = text[0].upper() + text[1:]
            return text
        except Exception as e:
            print('There was an error parsing the following sentence')
            print()
            print(self.span.text)
            print()
            print('with parse:')
            print()
            print(self.span._.parse_string)
            print()
            print(str(e) + '\n' + traceback.format_exc())
            return self.span.text

In [190]:
parse("The boy has been throwing the ball that he bought at the store.")


The the DET DT det []
boy boy NOUN NN nsubj [The]
has have VERB VBZ aux []
been be VERB VBN aux []
throwing throw VERB VBG ROOT [boy, has, been, ball, .]
the the DET DT det []
ball ball NOUN NN dobj [the, bought]
that that ADJ WDT dobj []
he -PRON- PRON PRP nsubj []
bought buy VERB VBD relcl [that, he, at]
at at ADP IN prep [store]
the the DET DT det []
store store NOUN NN pobj [the]
. . PUNCT . punct []
(S (NP (DT The) (NN boy)) (VP (VBZ has) (VP (VBN been) (VP (VBG throwing) (NP (NP (DT the) (NN ball)) (SBAR (WHNP (WDT that)) (S (NP (PRP he)) (VP (VBD bought) (PP (IN at) (NP (DT the) (NN store)))))))))) (. .))
[The boy, has been throwing the ball that he bought at the store, .]


In [203]:
def test_transform_present_preserve_present():
    assert transform_present("The boy has been throwing and catching.", True) == "The boy has been throwing and catching."
    assert transform_present("The boy would throw and catch the ball.", True) == "The boy would throw and catch the ball."
    assert transform_present("The boy would have a ball for Christmas.", True) == "The boy would have a ball for Christmas."
    assert transform_present('The boy has thrown and caught the ball.', True) == 'The boy has thrown and caught the ball.'
    assert transform_present("The boy hasn't thrown or caught the ball.", True) == "The boy hasn't thrown or caught the ball."
    assert transform_present("The boy is throwing and catching the ball and catching it.", True) == "The boy is throwing and catching the ball and catching it."
    assert transform_present("The ball is being thrown and caught by the boy.", True) == "The ball is being thrown and caught by the boy."
    assert transform_present("The ball is thrown and caught by the boy.", True) == "The ball is thrown and caught by the boy."
    
def test_transform_present_preserve_future():
    assert transform_present("The boy will throw the ball and hit it.", True) == "The boy will throw the ball and hit it."
    assert transform_present("The boy will have thrown the ball before catching it.", True) == "The boy will have thrown the ball before catching it."
    assert transform_present("The ball will have been thrown and caught by the boy.", True) == "The ball will have been thrown and caught by the boy."
    assert transform_present("The boy will be throwing and catching the ball.", True) == "The boy will be throwing and catching the ball."
    assert transform_present("The ball will be thrown and caught by the boy.", True) == "The ball will be thrown and caught by the boy."

def test_transform_present_conjunctions():
    assert transform_present("The boy threw the ball and hid.", True) == "The boy throws the ball and hides."
    assert transform_present("The boy is going and was going.", True) == "The boy is going and is going."
    assert transform_present("The boy did not walk but talked.", True) == "The boy does not walk but talks."
    assert transform_present("The boy walked but a mile talking with a friend.", True) == "The boy walks but a mile talking with a friend."
    assert transform_present("The boy walked not even a mile before stopping.", True) == "The boy walks not even a mile before stopping."

def test_transform_present_complex():
    assert transform_present("The boy has been throwing the ball that he bought at the store.", True) == "The boy has been throwing the ball that he buys at the store."
    assert transform_present("The boy had been throwing the ball when it hit a window.", True) == "The boy has been throwing the ball when it hits a window."
    assert transform_present("The boy has thrown the ball that hit a window.", True) == "The boy has thrown the ball that hits a window."
    assert transform_present("The boy threw the ball which hit a window.", True) == "The boy throws the ball which hits a window."
    assert transform_present("The boy threw the ball while the dog fetched it.", True) == "The boy throws the ball while the dog fetches it."
    assert transform_present("The boy had thrown the ball, and it flew high.", True) == "The boy has thrown the ball, and it flies high."
    assert transform_present("The boy dropped the ball while throwing it.", True) == "The boy drops the ball while throwing it."
    assert transform_present("The boy was throwing the ball, but it kept falling to the ground.", True) == "The boy is throwing the ball, but it keeps falling to the ground."
    assert transform_present("Did the boy throw the ball that broke the window?", True) == "Does the boy throw the ball that breaks the window?"
    #assert transform_present("The fallen ball was thrown by the boy.")
    
def test_transform_present_past_tenses():
    assert transform_present('The boy threw the ball.', True) == 'The boy throws the ball.'
    assert transform_present('The boy did throw the ball.', True) == 'The boy does throw the ball.'

    assert transform_present("The boy didn't throw the ball.", True) == "The boy doesn't throw the ball."
    assert transform_present("The boy did not throw the ball.", True) == "The boy does not throw the ball."


    assert transform_present('The boy had thrown the ball.', True) == 'The boy has thrown the ball.'
    assert transform_present('The boy was throwing the ball.', True) == 'The boy is throwing the ball.'
    assert transform_present('The boy had been throwing the ball.', True) == 'The boy has been throwing the ball.'

    assert transform_present("The boy hadn't thrown the ball.", True) == "The boy hasn't thrown the ball."
    assert transform_present("The boy wasn't throwing the ball.", True) == "The boy isn't throwing the ball."
    assert transform_present("The boy hadn't been throwing the ball.", True) == "The boy hasn't been throwing the ball."

    assert transform_present('The boy would have thrown the ball.', True) == 'The boy would throw the ball.'
    assert transform_present('The boy could have thrown the ball.', True) == 'The boy could throw the ball.'
    assert transform_present('The boy must have thrown the ball.', True) == 'The boy must throw the ball.'
    assert transform_present("The boy wouldn't have thrown the ball.", True) == "The boy wouldn't throw the ball."

    assert transform_present('The ball was thrown by the boy.', True) == 'The ball is thrown by the boy.'
    assert transform_present('The ball was being thrown by the boy.', True) == 'The ball is being thrown by the boy.'
    assert transform_present("The ball wasn't being thrown by the boy.", True) == "The ball isn't being thrown by the boy."
    assert transform_present('The ball would have been thrown by the boy.', True) == 'The ball would be thrown by the boy.'

In [204]:
test_transform_present_preserve_present()
test_transform_present_preserve_future()
#test_transform_present_conjunctions()
#test_transform_present_complex()
#test_transform_present_past_tenses()

The boy has been throwing and catching.
The boy would throw and catch the ball.
The boy would have a ball for Christmas.
The boy has thrown and caught the ball.
The boy hasn't thrown or caught the ball.
The boy is throwing and catching the ball and catching it.
The ball is being thrown and caught by the boy.
The ball is thrown and caught by the boy.
The boy will throw the ball and hit it.
The boy will have thrown the ball before catching it.
The ball will have been thrown and caught by the boy.
The boy will be throwing and catching the ball.
The ball will be thrown and caught by the boy.


In [47]:
s = "He had arranged that the skipper should steer while he and Miss Harris breakfasted."
parse(s)
doc = nlp(s)
sent = next(doc.sents)
print(transform_present_span(sent))

He -PRON- PRON PRP nsubj []
had have VERB VBD aux []
arranged arrange VERB VBN ROOT [He, had, steer, .]
that that ADP IN mark []
the the DET DT det []
skipper skipper NOUN NN nsubj [the]
should should VERB MD aux []
steer steer VERB VB ccomp [that, skipper, should, breakfasted]
while while ADP IN mark []
he -PRON- PRON PRP nsubj [and, Harris]
and and CCONJ CC cc []
Miss miss PROPN NNP compound []
Harris harris PROPN NNP conj [Miss]
breakfasted breakfast VERB VBD advcl [while, he]
. . PUNCT . punct []
(S (NP (PRP He)) (VP (VBD had) (VP (VBN arranged) (SBAR (SBAR (IN that) (S (NP (DT the) (NN skipper)) (VP (MD should) (VB steer)))) (SBAR (IN while) (S (NP (NP (PRP he)) (CC and) (NP (NNP Miss) (NNP Harris))) (VP (VBD breakfasted))))))) (. .))
[He, had arranged that the skipper should steer while he and Miss Harris breakfasted, .]

        verb: arranged
        verb lemma: arrange
        verb dependency label: arrange
        verb constituent tags: 
        aux 0: had
        aux 0 lemma

In [17]:
parse("The boy threw the ball and hid.")
print('*********')
parse("The boy is going and was going.")
print('*********')
parse("The boy did not walk but talked.")
print('*********')
parse("The boy walked but a mile talking with a friend.")
print('*********')
parse("The boy walked not even a mile before stopping.")

The the DET DT det []
boy boy NOUN NN nsubj [The]
threw throw VERB VBD ROOT [boy, ball, and, hid, .]
the the DET DT det []
ball ball NOUN NN dobj [the]
and and CCONJ CC cc []
hid hide VERB VBD conj []
. . PUNCT . punct []
(S (NP (DT The) (NN boy)) (VP (VP (VBD threw) (NP (DT the) (NN ball))) (CC and) (VP (VBD hid))) (. .))
[The boy, threw the ball and hid, .]
*********
The the DET DT det []
boy boy NOUN NN nsubj [The]
is be VERB VBZ aux []
going go VERB VBG ROOT [boy, is, and, going, .]
and and CCONJ CC cc []
was be VERB VBD aux []
going go VERB VBG conj [was]
. . PUNCT . punct []
(S (NP (DT The) (NN boy)) (VP (VP (VBZ is) (VP (VBG going))) (CC and) (VP (VBD was) (VP (VBG going)))) (. .))
[The boy, is going and was going, .]
*********
The the DET DT det []
boy boy NOUN NN nsubj [The]
did do VERB VBD aux []
not not ADV RB neg []
walk walk VERB VB ROOT [boy, did, not, but, talked, .]
but but CCONJ CC cc []
talked talk VERB VBD conj []
. . PUNCT . punct []
(S (NP (DT The) (NN boy)) (VP (V

In [13]:
# SBAR WHADVP, SBAR WHNP, SBAR S, S S, S S
parse("The boy had been throwing the ball when it hit a window.")
print('*********')
parse("The boy has thrown the ball that hit a window.")
print('*********')
parse("The boy threw the ball which hit a window.")
print('*********')
parse("The boy threw the ball while the dog fetched it.")
print('*********')
parse("The boy had thrown the ball, and it flew high.")
print('*********')
parse("The boy dropped the ball while throwing it.")
print('*********')
parse("The boy was throwing the ball, but it kept falling to the ground.")
print('*********')
parse("Did the boy throw the ball that broke the window?")
print('*********')
parse("The fallen ball was thrown by the boy.")


The the DET DT det []
boy boy NOUN NN nsubj [The]
had have VERB VBD aux []
been be VERB VBN aux []
throwing throw VERB VBG ROOT [boy, had, been, ball, hit, .]
the the DET DT det []
ball ball NOUN NN dobj [the]
when when ADV WRB advmod []
it -PRON- PRON PRP nsubj []
hit hit VERB VBD advcl [when, it, window]
a a DET DT det []
window window NOUN NN dobj [a]
. . PUNCT . punct []
(S (NP (DT The) (NN boy)) (VP (VBD had) (VP (VBN been) (VP (VBG throwing) (NP (DT the) (NN ball)) (SBAR (WHADVP (WRB when)) (S (NP (PRP it)) (VP (VBD hit) (NP (DT a) (NN window)))))))) (. .))
[The boy, had been throwing the ball when it hit a window, .]
*********
The the DET DT det []
boy boy NOUN NN nsubj [The]
has have VERB VBZ aux []
thrown throw VERB VBN ROOT [boy, has, ball, .]
the the DET DT det []
ball ball NOUN NN dobj [the, hit]
that that ADJ WDT nsubj []
hit hit VERB VBD relcl [that, window]
a a DET DT det []
window window NOUN NN dobj [a]
. . PUNCT . punct []
(S (NP (DT The) (NN boy)) (VP (VBZ has) (VP (

In [14]:
parse("The boy did not throw the ball.")
print('*********')
parse('The boy had been throwing the ball.')
print('*********')
parse("The boy wasn't throwing the ball.")
print('*********')
parse('The boy should have thrown the ball.')
print('*********')
parse('The ball was being thrown by the boy.')
print('*********')
parse('The ball would have been thrown by the boy.')

The the DET DT det []
boy boy NOUN NN nsubj [The]
did do VERB VBD aux []
not not ADV RB neg []
throw throw VERB VB ROOT [boy, did, not, ball, .]
the the DET DT det []
ball ball NOUN NN dobj [the]
. . PUNCT . punct []
(S (NP (DT The) (NN boy)) (VP (VBD did) (RB not) (VP (VB throw) (NP (DT the) (NN ball)))) (. .))
[The boy, did not throw the ball, .]
*********
The the DET DT det []
boy boy NOUN NN nsubj [The]
had have VERB VBD aux []
been be VERB VBN aux []
throwing throw VERB VBG ROOT [boy, had, been, ball, .]
the the DET DT det []
ball ball NOUN NN dobj [the]
. . PUNCT . punct []
(S (NP (DT The) (NN boy)) (VP (VBD had) (VP (VBN been) (VP (VBG throwing) (NP (DT the) (NN ball))))) (. .))
[The boy, had been throwing the ball, .]
*********
The the DET DT det []
boy boy NOUN NN nsubj [The]
was be VERB VBD aux []
n't not ADV RB neg []
throwing throw VERB VBG ROOT [boy, was, n't, ball, .]
the the DET DT det []
ball ball NOUN NN dobj [the]
. . PUNCT . punct []
(S (NP (DT The) (NN boy)) (VP (VB

In [18]:
# no aux: vbz, vbp, vbd --> 'be' with the same conjugation as verb + ppart
# The boy threw the ball --> The ball was thrown.
# The boy throws the ball --> The ball is thrown.

# 'will' aux: vbz, vbp --> 'be' + ppart
# The boy will throw the ball --> The ball will be thrown.

# 'do' conj aux --> 'be' with same conjugation as 'do'
# The boy did throw the ball --> The ball was thrown.
# The boy does throw the ball --> The ball is thrown.
# The boy didn't throw the ball --> The ball wasn't thrown.
# The boy doesn't throw the ball --> The ball isn't thrown.

# 'be' conj aux: gerund --> 'being' + ppart
# The boy was throwing the ball --> The ball was being thrown.
# The boy is throwing the ball --> The ball is being thrown.
# The boy will be throwing the ball --> The ball will be being thrown.

# 'have' conj aux: ppart --> 'been' + ppart
# The boy had thrown the ball --> The ball had been thrown.
# The boy has thrown the ball --> The ball has been thrown.

# 'will' + 'have' aux: ppart --> 'been' + ppart
# The boy will have thrown the ball --> The ball will have been thrown.

# 'have' conj + 'been' aux: vbg --> 'being' + ppart
# The boy had been throwing the ball --> The ball had been being thrown.
# The boy has been throwing the ball --> The ball has been being thrown.

# 'will' + 'have' + 'been' aux: vbg --> 'being' + ppart
# The boy will have been throwing the ball --> The ball will have been being thrown.

# if verb has a direct object

# except for 'do' aux:
# 1. conjugate 'be' in the tense of the verb with the person and number of dobj

# for 'do' aux:
# 1. conjugate 'be' in the tense of 'do' with the person and number of dobj

# for all
# 3. change verb to conjugated 'be' + past participle
# 4. replace subject with direct object

# traverse breadth first
# save subject noun phrase
# save list of aux
# save main verb
# if find direct object, then perform transform




DEBUG = False


def transform_passive(sent_text):
    doc = nlp(sent_text)
    trans_text = []
    for sent in doc.sents:
        trans_text.append(transform_passive_span(sent))
    return ' '.join(trans_text)

def transform_passive_span(sent):
    try:
        first = True
        subject = None
        aux = []
        verb = None
        dobj = None
        q = deque()
        q.append(sent)
        while q:
            span = q.popleft()
            if 'S' in span._.labels:
                if first:
                    first = False
                else:
                    continue
            elif not subject and 'NP' in span._.labels:
                subject = span
            elif len(span) == 1 and span[0].dep_ == 'aux':
                aux.append(span)
            elif len(span) == 1 and span[0].pos_ == 'VERB':
                verb = span
            elif len(span) == 1 and span[0].dep_ == 'dobj':
                dobj = span
                objekt = span
                while objekt._.parent and 'VP' not in objekt._.labels:
                    if 'NP' in objekt._.labels:
                        break
                    objekt = objekt._.parent            
                break
            elif contains_one_of(span._.labels, ('ADVP', 'PP', 'ADJP', 'PRN', 'QP', 'RRC', 'X')):
                continue
            q.extend(span._.children)
        if not subject or not dobj: return sent.text
        emphatic = aux and aux[0][0].lemma_ == 'do'
        nsubj = find_nsubj(subject)
        subj_conj = number_person(nsubj[0])
        past_tense = (
            is_past(aux[0][0], subj_conj) if emphatic
            else is_past(verb[0], subj_conj)
        )
        participle = False if emphatic else is_participle(verb)
        if participle:
            conj = 'ppart' if past_tense else 'part'
        else:
            conj = number_person(dobj[0])
            if past_tense:
                conj = conj_to_past(conj)
        if DEBUG:
            print([a.text for a in aux])
            print(nsubj.text, '|', dobj.text, '|', subject.text, '|', objekt.text, '|', verb.text)
            print(conj)
            print(past_tense)
            print(participle)
        new_text = ''
        s = []
        s.append(sent)
        while s:
            span = s.pop()
            if span == subject:
                new_text += object_to_subject(objekt) + subject[-1].whitespace_
                continue
            elif emphatic and span == aux[0]:
                pass
            elif span == verb:
                new_text += conjugate('be', conj) + ' '
                print(verb.text)
                print(len(verb[0].whitespace_))
                new_text += conjugate(verb[0].lemma_, 'ppart') + verb[0].whitespace_
            elif span == objekt:
                continue
            elif len(span) == 1:
                new_text += span.text_with_ws
            s.extend(reversed(list(span._.children)))
        return new_text.replace(' .', '.').replace(' ,', ',')
    except Exception as e:
        print('There was an error parsing the following sentence')
        print()
        print(sent.text)
        print()
        print('with parse:')
        print()
        print(sent._.parse_string)
        print()
        print(str(e) + '\n' + traceback.format_exc())
        return sent.text

In [149]:
doc = nlp('Soon after, the man greeted the "tall" woman.')
sent = next(doc.sents)
print(sent[1].text)
print(len(sent[1].whitespace_))
print(sent[6].text)
print(len(sent[6].whitespace_))
transform_passive('Soon after, the man greeted the "tall" woman.')

after
0
the
1
greeted
1


'Soon after, the "tall" woman was greeted.'

In [19]:
def find_token(sent, tok):
    s = [sent]
    while s:
        span = s.pop()
        if len(span) == 1 and span[0] == tok:
            return span
        s.extend(reversed(list(span._.children)))
    return None

In [20]:
def transform(text):
    doc = nlp(text)
    trans_text = []
    for sent in doc.sents:
        trans_text.append(transform_passive(transform_past_span(sent)))
    return '\n'.join(trans_text)

In [None]:
with open('text/text-for-transform.txt', 'r') as fi:
    with open('text/text-transformed.txt', 'w') as fo:
        for i, line in enumerate(fi):
            print(i, end='..', flush=True)
            fo.write(transform(line), flush=True)

In [127]:
with open('text/plot-summaries.txt', 'r') as fi:
    with open('text/plot-summaries-transformed.txt', 'w') as fo:
        i = 0
        for line in fi:
            i += 1
            print(i, end='..', flush=True)
            fo.write(transform(line), flush=True)

1..2..3..4..5..There was an error parsing the following sentence

Once their timelines converged " naturally " at the library — their first meeting in his chronology —

with parse:

(FRAG (SBAR (IN Once) (NP (PRP$ their) (NNS timelines)) (VP (VBD converged) (ADVP (`` ")) (ADVP (RB naturally)) ('' ") (PP (IN at) (NP (DT the) (NN library))))) ( —) (NP (NP (PRP$ their) (JJ first) (NN meeting)) (PP (IN in) (NP (PRP$ his) (NN chronology)))) ( —))

'NoneType' object is not subscriptable
Traceback (most recent call last):
  File "<ipython-input-126-ca10ab007446>", line 181, in transform_passive_span
    subj_conj = number_person(nsubj[0])
TypeError: 'NoneType' object is not subscriptable

6..7..8..9..10..11..12..13..14..15..16..17..18..19..20..21..22..23..

KeyboardInterrupt: 

In [136]:
# GENERAL
# [x] punctuation has spaces before it

# PAST TENSE
# [_] has known --> known
#     She has known him most of her life.
# [_] in a conjunction of verbs only the first is put into past tense
#     He shoots and wounds the dog.
# [_] "may" does not become "might"
#     He may be greeting her.
# [_] past tense transform preserves future; should change "will" to "would"
#     She will remember to help him when he arrives.
# [_] past tense of "leave" is conjugated as "leaved"
# [_] verb should not be past tense in statement
#     He requests she retreive the will.

# PASSIVE
# [_] broadest NP not used
#     He amassed a number of survival skills.
# [_] modals not turned to passive correctly
#     He may greet her. --> She may be greeted. (NOT She may is greeted.)
# [_] adverb should split auxilliary and primary verbs
#     He quickly took the bag. --> The bag was quickly taken.
# [_] conjunction of an intransitive and action verb uses object of action verb as subject of both
#     He wishes to help her and gives her a hand.
# [_] "number of things" is plural
#     He knew a number of things. --> A number of things were known. (NOT was known)

In [138]:
#parse('')
#transform_passive('When he leaves, where he goes, or how long his trips will last are all beyond his control.')
transform_past('She has known him most of her life.')

'She known him most of her life .'

In [10]:
# From https://github.com/julianbrooke/GutenTag/blob/master/GutenTag.py
# this class cleans away Project Gutenberg headers and footers, including copyright and transcriber notes
class TextCleaner:
    junk_indicators = ("project gutenberg"," etext"," e-text",
                            "http:","distributed proofreading",
                            "distributed\nproofreading", " online"
                            "html","utf-8","ascii","transcriber's note",
                            "scanner's note", "\\.net","\\.org","\\.com",
                            "\\.edu","www\\.", "electronic version",
                            " email","\\.uk","digitized", "\n\nproduced by",
                            "david reed", "\ntypographical errors corrected",
                            "\[note: there is a","etext editor's","u.s. copyright",
                            "\nerrata"," ebook"," e-book",
                            "author:     ","</pre>", "\[end of","internet archive")
                
    
    def clean_text(self,text):
        text = text.replace("\r\n","\n").replace("\r","\n") # normalize lines
        text = re.sub("\n[ *-]+\n","\n\n",text) # get rid of explicit section breaks
        #text = re.sub("\[Illustration:?[^\]]*\]","",text)
        text = re.sub("<<[^>]+>>","",text)
        text = re.sub("[^_]_______________________________________.*_________________________________[^_]","\n\n",text)
        lower_text = text.lower()
        all_junk_indicies = [0, len(text)]
        for junk_indicator in self.junk_indicators:
            all_junk_indicies.extend([m.start() for m in re.finditer(junk_indicator,lower_text)])
        all_junk_indicies.sort()
        best_points = None
        best_length = 0
        for i in range(len(all_junk_indicies) - 1):
            if all_junk_indicies[i+1] - all_junk_indicies[i] > best_length:
                best_points = [all_junk_indicies[i],all_junk_indicies[i+1]]
                best_length = all_junk_indicies[i+1] - all_junk_indicies[i]
        found = False
        best_length = float(best_length)
        if best_length < 5000: # too small for general method to work reliably
            m = re.search("end of [^\\n]*project gutenberg",lower_text)
            if m:
                best_points[1] = m.start()
                i = 0
                while all_junk_indicies[i] < best_points[1] - 100:
                    i += 1
                best_points[0] = all_junk_indicies[i-1]
            else:
                return ""

        i = 4
        while not found:
            looking_for = "\n"*i
            result = text.find(looking_for, best_points[0])
            if result != -1 and ((best_points[1] - result)/best_length > 0.98 or i == 1):
                found = True
            i -= 1

        return text[result:text.rfind("\n", 0, best_points[1])].strip()



In [None]:
# Process downloaded Gutenberg zip files.

import zipfile

with zipfile.ZipFile(...) as z:
    with z.open(...) as f:
        for line in f:
            print line

In [21]:
import os
import re

MIN_SENTENCE_CHAR_LENGTH = 5  # "I am." will qualify, but nothing shorter.

def preprocess(text, CHUNK_CHAR_LENGTH=10000):
    # Replace carraige return and tab characters.
    text = text.replace('\r\n', '\n').replace('\r', '\n').replace('\t', ' ')
    print('1')
    text = re.sub("[ ]*`", "'", text)
    print('2')
    # Remove play character prompts like 'WALTER: ...'
    text = re.sub('^[A-Z0-9’\'., ]+(?:\([^)]*\))?[:.][ ]*', '', text, flags=re.M)
    print('3')
    # Remove titles which are lines with all-caps.
    text = re.sub('^[A-Z0-9.,-?$#!;:\'’"_ ]+$', '', text, flags=re.M)
    print('4')
    # Remove divider lines like '* * * * *' and antiquated punctuation.
    text = re.sub('^[ ]*[-*=_].*[-*=_][ ]*$', '', re.sub(':--', ':', re.sub('&c', 'etc', text)), flags=re.M)
    print('5')
    # Remove leading spaces, then remove all line breaks except for empty lines.
    text = re.sub('\n[^\n]', ' ', re.sub('^[ ]+', '', text, flags=re.M))
    print('6')
    # Remove footnotes and references (can be nested one deep: [Note [Note in note]])
    text = re.sub('\[[^][]+\]', '', re.sub('\[[^][]+\]', '', re.sub('^\[[^ ]+\] .*$', '', text, flags=re.M)))
    print('7')
    texts = []
    remainder = text
    N = len(remainder)
    start = 0
    end = 0
    while start < N:
        end += CHUNK_CHAR_LENGTH
        while end < N and remainder[end] != '\n':
            end += 1
        # Remove all newlines, double spaces and underscores.
        texts.append(
            re.sub(' [ ]+', ' ', re.sub('\n', ' ', remainder[start:end])).
            replace('_', ''))
        start = end
    return texts

def make_dataset(text):
    print('processing text with spacy')
    doc = nlp(text)
    sents = list(doc.sents)
    print('doc has {} sentences'.format(len(sents)))
    data = []
    for sent in sents:
        if len(sent.text) < MIN_SENTENCE_CHAR_LENGTH:
            continue
        if len(data) % 20 == 0:
            print('{}..'.format(len(data)), end='', flush=True)
        data.append('< ' + sent.text)
        data.append('> ' + transform_present_span(sent))
    print('done')
    return '\n'.join(data)

In [24]:
from datetime import datetime

GUTENBERG_DIR = 'Gutenberg/txt'
BIG_FILE = 'gutenberg.txt'
OUTPUT_DIR = 'Gutenberg/out'

def preprocess_multiple_texts():
    i = 0
    for fn in os.listdir(GUTENBERG_DIR):
        if fn.endswith('.txt') and 'z2.txt' in fn:
            with open(os.path.join(GUTENBERG_DIR, fn), 'r') as inf:
                texts = preprocess(inf.read())
                for j, text in enumerate(texts):
                    with open(os.path.join(OUTPUT_DIR, f'{i}-{j}.txt'), 'w') as outf:
                        outf.write(make_dataset(text))

def preprocess_one_large_text(fn, chunk_size):
    with open(os.path.join(GUTENBERG_DIR, fn), 'r') as inf:
        texts = preprocess(inf.read(), chunk_size)
        for j, text in enumerate(texts):
            print(f'writing text {j}')
            with open(os.path.join(OUTPUT_DIR, f'{j}.txt'), 'w') as outf:
                outf.write(text)

def process_one_large_text(fn, chunk_size):
    with open(os.path.join(GUTENBERG_DIR, fn), 'r') as inf:
        texts = preprocess(inf.read(), chunk_size)
        for j, text in enumerate(texts):
            print(f'writing text {j}')
            with open(os.path.join(OUTPUT_DIR, f'{j}.txt'), 'w') as outf:
                outf.write(make_dataset(text))

def test_make_dataset_speed():
    nlp('Warm up the parser.')
    t1 = datetime.now()
    process_one_large_text('test-30k.txt', 30000)
    d = (datetime.now() - t1).seconds
    print(f'30k: {d} seconds')
    t1 = datetime.now()
    process_one_large_text('test-30k.txt', 10000)
    d = (datetime.now() - t1).seconds 
    print(f'3 x 10k: {d} seconds')

In [25]:
test_make_dataset_speed()

1
2
3
4
5
6
7
writing text 0
processing text with spacy
doc has 390 sentences
0..20..40..60..80..100..120..140..160..180..200..220..240..260..280..300..320..340..360..380..400..420..440..460..480..500..520..540..560..580..600..620..640..660..680..700..720..30k: 87 seconds
1
2
3
4
5
6
7
writing text 0
processing text with spacy
doc has 119 sentences
0..20..40..60..80..100..120..140..160..180..200..220..writing text 1
processing text with spacy
doc has 168 sentences
0..20..40..60..80..100..120..140..160..180..200..220..240..260..280..300..writing text 2
processing text with spacy
doc has 103 sentences
0..20..40..60..80..100..120..140..160..180..3 x 10k: 89 seconds
