In [18]:
import collections
import re
import math
import numpy as np

In [4]:
# Read tokens from Penn Treebank.
# The context manager closes the file handle as soon as the contents are read,
# instead of leaving it open for the lifetime of the kernel.
with open("sec02-21.gold.tagged", "r") as file:
    content = file.read()

# Whitespace-separated entries; the code below treats each as "word|TAG".
tokens = content.split()

# Find the 'n' most common words for each tag in pos_list in Penn Treebank
def most_common_words_for_tags(pos_list, n = 3, token_list = None):
    """Return the `n` most frequent purely alphabetic words for each POS tag.

    Each token is expected to look like ``word|TAG``. A token is counted when
    it ends in ``|TAG`` for one of the requested tags and starts with a run of
    ASCII letters immediately followed by ``|``; the letters are the word.

    Args:
        pos_list: Iterable of tag strings (e.g. ``['NN', 'PRP$']``). Tags are
            matched literally, so regex metacharacters in a tag are safe.
        n: Maximum number of words to return per tag.
        token_list: Optional iterable of ``word|TAG`` strings. When omitted,
            the module-level ``tokens`` list is used.

    Returns:
        Dict mapping every tag in `pos_list` to a list of at most `n` words,
        most frequent first.
    """
    pos_list = list(pos_list)
    if not pos_list:
        # An empty alternation would match any token ending in '|'.
        return {}
    if token_list is None:
        token_list = tokens

    # Escape each tag so that e.g. the '$' in 'PRP$' is not read as an anchor.
    tag_alternatives = "|".join(re.escape(pos) for pos in pos_list)
    # Raw strings keep '\|' a regex escape rather than an invalid string escape.
    pos_pattern = re.compile(rf"\|({tag_alternatives})$")
    word_pattern = re.compile(r"(^[a-zA-Z]+)\|")

    counters = {pos : collections.Counter() for pos in pos_list}
    for token in token_list:
        pos_match = pos_pattern.search(token)
        if pos_match is None:
            continue
        word_match = word_pattern.search(token)
        if word_match is None:
            continue
        counters[pos_match.group(1)][word_match.group(1)] += 1

    return {
        pos: [word for word, _ in counter.most_common(n)]
        for pos, counter in counters.items()
    }
    
# Collect the 100 most frequent words for the noun-like tags and the present tense verb tags
common_nouns_and_verbs = most_common_words_for_tags(
    ['NN', 'NNS', 'NNP', 'NNPS', 'PRP', 'VBP', 'VBZ'],
    n = 100,
)



In [5]:
# Print the frequent-word list for each tag of interest, one list per line.
for tag in ('NN', 'NNS', 'VBP', 'VBZ'):
    print(common_nouns_and_verbs[tag])

['company', 'year', 'market', 'share', 'stock', 'trading', 'president', 'business', 'quarter', 'government', 'time', 'week', 'price', 'group', 'interest', 'yesterday', 'industry', 'unit', 'money', 'month', 'rate', 'investment', 'state', 'chairman', 'income', 'profit', 'program', 'bank', 'firm', 'debt', 'part', 'plan', 'sale', 'issue', 'tax', 'way', 'vice', 'loss', 'growth', 'executive', 'day', 'bid', 'cash', 'revenue', 'officer', 'director', 'concern', 'board', 'computer', 'case', 'number', 'spokesman', 'agreement', 'stake', 'value', 'end', 'contract', 'system', 'bill', 'oil', 'offer', 'law', 'index', 'agency', 'insurance', 'dollar', 'maker', 'period', 'office', 'world', 'exchange', 'trade', 'capital', 'production', 'court', 'management', 'analyst', 'economy', 'today', 'work', 'policy', 'bond', 'country', 'increase', 'point', 'chief', 'earthquake', 'report', 'power', 'takeover', 'problem', 'home', 'news', 'plant', 'credit', 'fund', 'research', 'example', 'area', 'junk']
['years', 'shar

In [7]:
# Compose sentence prefixes with frequent words.
# The sentence prefixes are intended to test intervening nouns.
#
# Keys follow the Penn Treebank tagset:
#   NN  = singular noun,                 NNS = plural noun,
#   VBZ = 3rd-person-singular present verb ("looks"),
#   VBP = non-3rd-person-singular present verb ("look").
words = {
    "NN1" : "product",
    "NNS1" : "products",
    "NN2" : "company",
    "NNS2" : "companies",
    "VBZ1" : "looks",
    "VBP1" : "look",
    "VBZ2" : "produces",
    "VBP2" : "produce",
}

# Each entry is (sentence prefix, [singular option, plural option], correct option).
sentences = [
    # Prepositional phrase: the verb agrees with the head noun, not the noun after "of".
    (f"The {words['NN1']} of the {words['NN2']}", [words['VBZ1'], words['VBP1']], words['VBZ1']),
    (f"The {words['NNS1']} of the {words['NN2']}", [words['VBZ1'], words['VBP1']], words['VBP1']),
    (f"The {words['NN1']} of the {words['NNS2']}", [words['VBZ1'], words['VBP1']], words['VBZ1']),
    (f"The {words['NNS1']} of the {words['NNS2']}", [words['VBZ1'], words['VBP1']], words['VBP1']),

    # Relative clause, embedded verb: agrees with the noun inside the clause.
    (f"The {words['NN1']} that the {words['NN2']}", [words['VBZ2'], words['VBP2']], words['VBZ2']),
    (f"The {words['NNS1']} that the {words['NN2']}", [words['VBZ2'], words['VBP2']], words['VBZ2']),
    (f"The {words['NN1']} that the {words['NNS2']}", [words['VBZ2'], words['VBP2']], words['VBP2']),
    (f"The {words['NNS1']} that the {words['NNS2']}", [words['VBZ2'], words['VBP2']], words['VBP2']),

    # Relative clause, main verb: agrees with the head noun before "that".
    (f"The {words['NN1']} that the {words['NN2']} {words['VBZ2']}", [words['VBZ1'], words['VBP1']], words['VBZ1']),
    (f"The {words['NNS1']} that the {words['NN2']} {words['VBZ2']}", [words['VBZ1'], words['VBP1']], words['VBP1']),
    (f"The {words['NN1']} that the {words['NNS2']} {words['VBP2']}", [words['VBZ1'], words['VBP1']], words['VBZ1']),
    (f"The {words['NNS1']} that the {words['NNS2']} {words['VBP2']}", [words['VBZ1'], words['VBP1']], words['VBP1'])
]

print("(sentence, options, correct-option):")
sentences

(sentence, options, correct-option):


[('The product of the company', ['looks', 'look'], 'looks'),
 ('The products of the company', ['looks', 'look'], 'look'),
 ('The product of the companies', ['looks', 'look'], 'looks'),
 ('The products of the companies', ['looks', 'look'], 'look'),
 ('The product that the company', ['produces', 'produce'], 'produces'),
 ('The products that the company', ['produces', 'produce'], 'produces'),
 ('The product that the companies', ['produces', 'produce'], 'produce'),
 ('The products that the companies', ['produces', 'produce'], 'produce'),
 ('The product that the company produces', ['looks', 'look'], 'looks'),
 ('The products that the company produces', ['looks', 'look'], 'look'),
 ('The product that the companies produce', ['looks', 'look'], 'looks'),
 ('The products that the companies produce', ['looks', 'look'], 'look')]

In [None]:
# Compose sentence prefixes with frequent words.
# The sentence prefixes are intended to test long-distance subject-verb agreement.


In [50]:
# Compose sentence prefixes with frequent words.
# The sentence prefixes are intended to test long-distance subject-verb agreement in combination with intervening nouns.
