In [1]:
import re
from collections import defaultdict

In [2]:
def get_verb_prefixes(sentences):
    """Yield every sentence prefix that ends on a VBZ/VBP-marked token.

    Each sentence is split on single spaces. For every resulting token in
    which the pattern ``VBZ|VBP`` is found, the tokens from the start of the
    sentence up to and including that token are re-joined with single spaces
    and yielded. A sentence with several such tokens yields several prefixes.

    Args:
        sentences: iterable of strings.

    Yields:
        str: one prefix per matching token, in order of occurrence.
    """
    verb_tag = re.compile("VBZ|VBP")
    for sentence in sentences:
        words = sentence.split(" ")
        # `end` is the exclusive slice bound, i.e. one past the current token.
        for end, word in enumerate(words, start=1):
            if verb_tag.search(word) is not None:
                yield " ".join(words[:end])

In [3]:
# relevant tags:
# NN: toy
# NNS: toys
# NNP: IBM
# NNPS: Carolinas
# PRP: I, you, he
# VBP: eat
# VBZ: eats

# constraint: all words should occur in training corpus [or we remove words that are not in the corpus?]
# constraint: VBP => exactly one NNS (or NNPS, or plural PRP)
# constraint: VBZ => exactly one NN (or NNP, or singular PRP)

# example 1
#plurality: VBP
#attractors: 1 (number of intervening nouns of opposite number)
#distance: 3 (number of words between subject and verb)

def analyze_sentences(sentences):
    """Yield agreement statistics for verb-final sentence prefixes.

    Each input string is expected to be a sequence of ``word|TAG`` tokens
    separated by spaces, ending in the verb under analysis. Only tags made
    up purely of upper-case letters are extracted, so punctuation tags and
    tags containing other characters (e.g. a trailing ``$``) are not counted.

    A prefix is skipped when:
      * no tag can be extracted from it,
      * it contains an NNPS, NNP, PRP or EX tag (ignored for now),
      * its final tag is neither VBP nor VBZ,
      * it does not contain exactly one noun whose number matches the verb
        (NNS for VBP, NN for VBZ), i.e. exactly one potential subject.

    Args:
        sentences: iterable of tagged prefix strings.

    Yields:
        tuple: ``(s, plurality, distance, nr_of_attractors)`` where
        ``plurality`` is the final verb tag, ``distance`` is the number of
        extracted tags strictly between the subject and the verb, and
        ``nr_of_attractors`` is the number of nouns of the opposite number
        occurring after the subject.
    """
    # Compiled once; the pattern itself is loop-invariant.
    pos_regex = re.compile(r"\|([A-Z]+)(?= |$)")
    ignored_tags = {"NNPS", "NNP", "PRP", "EX"}
    # verb tag -> (noun tag agreeing with the verb, noun tag of opposite number)
    noun_tags_for_verb = {"VBP": ("NNS", "NN"), "VBZ": ("NN", "NNS")}

    for s in sentences:
        tag_sequence = pos_regex.findall(s)
        if not tag_sequence:
            continue  # nothing to analyse; also avoids indexing an empty list
        if ignored_tags.intersection(tag_sequence):
            continue  # for now: ignore sentences with NNPS, NNP, PRP, EX
        plurality = tag_sequence[-1]
        if plurality not in noun_tags_for_verb:
            # Not verb-final: treating it as singular would give bogus
            # (possibly negative) distances.
            continue
        nn_same, nn_opposite = noun_tags_for_verb[plurality]
        if tag_sequence.count(nn_same) != 1:
            continue  # constraint: ensure exactly one potential subject
        subject_index = tag_sequence.index(nn_same)
        # The verb is the last tag, so the tags strictly between subject and
        # verb number len - subject_index - 2.
        distance = len(tag_sequence) - subject_index - 2
        # The slice starts at the subject itself, which never equals
        # nn_opposite, so only the following nouns are counted.
        nr_of_attractors = tag_sequence[subject_index:].count(nn_opposite)
        yield (s, plurality, distance, nr_of_attractors)


In [4]:
#(s, plurality, distance, nr_of_attractors)

def categorize_by_prop(sentence_tuples, prop_index):
    """Group tuples by the value found at position ``prop_index``.

    Args:
        sentence_tuples: iterable of indexable records, e.g. the
            ``(s, plurality, distance, nr_of_attractors)`` tuples.
        prop_index: index of the field to group by.

    Returns:
        defaultdict(list): maps each distinct field value to the list of
        records carrying it, in input order. Looking up a value that never
        occurred returns (and stores) an empty list.
    """
    propdict = defaultdict(list)
    for t in sentence_tuples:
        # Append in place; rebuilding the list with `+` on every insert
        # copies the whole bucket each time.
        propdict[t[prop_index]].append(t)
    return propdict

def categorize_by_plurality(sentence_tuples):
    """Group sentence tuples by field 1, the plurality slot of
    ``(s, plurality, distance, nr_of_attractors)``."""
    return categorize_by_prop(sentence_tuples, prop_index=1)

def categorize_by_distance(sentence_tuples):
    """Group sentence tuples by field 2, the distance slot of
    ``(s, plurality, distance, nr_of_attractors)``."""
    return categorize_by_prop(sentence_tuples, prop_index=2)

def categorize_by_attractor_count(sentence_tuples):
    """Group sentence tuples by field 3, the attractor-count slot of
    ``(s, plurality, distance, nr_of_attractors)``."""
    return categorize_by_prop(sentence_tuples, prop_index=3)


In [5]:
# Read the whole tagged corpus; the context manager closes the handle as soon
# as the content has been read.
with open("sec02-21.gold.tagged", "r") as file:
    content = file.read()
# Sentences are delimited by the tagged full stop ".|.".
sentences = content.split(".|.")
# Every prefix ending on a VBZ/VBP token, then only those that satisfy the
# subject constraints, paired with their statistics.
verb_prefixes = list(get_verb_prefixes(sentences))
analyzed_prefixes = list(analyze_sentences(verb_prefixes))


In [6]:
# Bucket the analysed prefixes by attractor count (each bucket is homogeneous
# in the number of attractors).
attractor_stats = categorize_by_attractor_count(
    sentence_tuples=analyzed_prefixes,
)
# Take the attractor-free bucket and bucket it again by subject-verb distance.
distance_stats = categorize_by_distance(
    sentence_tuples=attractor_stats[0],
)

In [7]:
# Display the attractor-free prefixes whose subject and verb are separated by
# 8 counted tags.
distance_stats[8]

[(' \nThat|DT burden|NN is|VBZ very|RB difficult|JJ ,|, if|IN not|RB impossible|JJ ,|, to|TO meet|VB ,|, says|VBZ',
  'VBZ',
  8,
  0),
 (' \nWhat|WP far|RB too|RB many|JJ people|NNS concerned|JJ about|IN education|NN either|CC fail|VB to|TO understand|VB or|CC choose|VB to|TO ignore|VB is|VBZ',
  'VBZ',
  8,
  0),
 (' \nThe|DT opposition|NN can|MD be|VB the|DT most|RBS hurt|VBN because|IN everyone|DT already|RB figures|VBZ',
  'VBZ',
  8,
  0),
 (" \nSell|VB stocks|NNS that|WDT are|VBP n't|RB doing|VBG well|RB now|RB ,|, and|CC that|DT do|VBP",
  'VBP',
  8,
  0)]

In [18]:
import spacy

# Load the pipeline registered under the name 'en' and parse a sample sentence.
nlp = spacy.load('en')
parsed_text = nlp(u"I thought it was the complete set")

# For every token print the token, its dependency id, its dependency label
# and the tokens of its subtree; tokens labelled "nsubj" are echoed once more
# as the subject.
for token in parsed_text:
    print(token, token.dep, token.dep_, list(token.subtree))
    if token.dep_ != "nsubj":
        continue
    subject = token.orth_
    print(subject)


I 426 nsubj [I]
I
thought 8206900633647566924 ROOT [I, thought, it, was, the, complete, set]
it 426 nsubj [it]
it
was 405 ccomp [it, was, the, complete, set]
the 412 det [the]
complete 399 amod [complete]
set 401 attr [the, complete, set]


In [10]:
# Display the parse of the sample sentence as nested dicts (word, lemma, POS
# tags, arc label and modifiers per node).
# NOTE(review): Doc.print_tree appears to be deprecated/removed in newer spaCy
# releases - confirm against the installed version before re-running.
parsed_text.print_tree()

[{'NE': '',
  'POS_coarse': 'VERB',
  'POS_fine': 'VBD',
  'arc': 'ROOT',
  'lemma': 'think',
  'modifiers': [{'NE': '',
    'POS_coarse': 'PRON',
    'POS_fine': 'PRP',
    'arc': 'nsubj',
    'lemma': '-PRON-',
    'modifiers': [],
    'word': 'I'},
   {'NE': '',
    'POS_coarse': 'VERB',
    'POS_fine': 'VBD',
    'arc': 'ccomp',
    'lemma': 'be',
    'modifiers': [{'NE': '',
      'POS_coarse': 'PRON',
      'POS_fine': 'PRP',
      'arc': 'nsubj',
      'lemma': '-PRON-',
      'modifiers': [],
      'word': 'it'},
     {'NE': '',
      'POS_coarse': 'NOUN',
      'POS_fine': 'NN',
      'arc': 'attr',
      'lemma': 'set',
      'modifiers': [{'NE': '',
        'POS_coarse': 'DET',
        'POS_fine': 'DT',
        'arc': 'det',
        'lemma': 'the',
        'modifiers': [],
        'word': 'the'},
       {'NE': '',
        'POS_coarse': 'ADJ',
        'POS_fine': 'JJ',
        'arc': 'amod',
        'lemma': 'complete',
        'modifiers': [],
        'word': 'complete'}],
 