In [284]:
import collections
import re
import math
import torch
from torch.autograd import Variable
import torch.nn as nn
import pickle
import numpy as np
import matplotlib.pyplot as plt
np.random.seed(2834)
import matplotlib.ticker as mtick
import matplotlib as matplotlib
import difflib

In [223]:
# Load the trained language model. The map_location callback returns each storage
# unchanged, so the checkpoint is loaded onto the CPU.
# NOTE: torch.load and pickle.load execute arbitrary code while unpickling;
# only load 'model.pt' and 'dict' from a trusted source.
lm = torch.load('model.pt', map_location=lambda storage, loc: storage)
# Load dictionary word --> id (closed deterministically via the context manager)
with open('dict', 'rb') as dict_file:
    dictionary = pickle.load(dict_file)
softmax = nn.Softmax()
# set the maximum sequence length
max_seq_len = 50

def tokenise(sentence, dictionary, max_len=None):
    """Turn an already-tokenised sentence into a LongTensor of word ids.

    Args:
        sentence: string whose tokens are separated by single spaces
            (the sentence is NOT tokenised here, only split on ' ').
        dictionary: object exposing a ``word2idx`` mapping that contains
            an ``'<unk>'`` entry for out-of-vocabulary words.
        max_len: maximum number of tokens allowed; defaults to the
            notebook-level ``max_seq_len``.

    Returns:
        torch.LongTensor with one id per token.

    Raises:
        AssertionError: if the sentence has more than ``max_len`` tokens.
    """
    if max_len is None:
        max_len = max_seq_len

    words = sentence.split(' ')
    assert len(words) <= max_len, "sentence too long"

    word2idx = dictionary.word2idx
    ids = torch.LongTensor(len(words))
    for position, word in enumerate(words):
        try:
            ids[position] = word2idx[word]
        except KeyError:
            # Report the unknown word and fall back to <unk>. The previous
            # raw_input() pause is Python 2 only and raised NameError here.
            print( word)
            ids[position] = word2idx['<unk>']
    return ids

def get_words_and_probs(pos_list, token_source=None):
    """Compute, per POS tag, the relative frequency of every word carrying that tag.

    Tokens are expected in the form ``word|TAG``. Only tokens whose word part
    starts with a run of ASCII letters directly followed by ``|`` and whose
    final ``|TAG`` is one of ``pos_list`` are counted. Words are lower-cased.

    Args:
        pos_list: list of POS tags to collect words for.
        token_source: iterable of ``word|TAG`` strings; defaults to the
            notebook-level ``tokens`` list (which must already be loaded).

    Returns:
        dict mapping each tag to a ``collections.Counter`` whose values are
        the word's count divided by the total count for that tag.
    """
    if not pos_list:
        return {}
    if token_source is None:
        token_source = tokens

    # Escape the tags so that tags containing regex metacharacters (e.g. 'PRP$')
    # are matched literally; raw strings avoid the invalid "\|" escape.
    tag_alternatives = "|".join(re.escape(pos) for pos in pos_list)
    pos_pattern = re.compile(rf"\|({tag_alternatives})$")
    word_pattern = re.compile(r"(^[a-zA-Z]+)\|")

    counters = {pos: collections.Counter() for pos in pos_list}
    for token in token_source:
        pos_match = pos_pattern.search(token)
        word_match = word_pattern.search(token)
        if pos_match and word_match:
            counters[pos_match.group(1)][word_match.group(1).lower()] += 1

    # normalize counts to obtain probabilities (keys are unchanged, so
    # updating values while iterating is safe)
    for counter in counters.values():
        total = sum(counter.values())
        for word in counter:
            counter[word] = counter[word] / total
    return counters


def get_valid_words(model, dictionary, sentence, check_words):
    """Keep only the entries of ``check_words`` whose word is in the model vocabulary.

    The previous implementation ran the model on ``sentence`` and indexed the
    softmax output for every word, discarding the probability and using a bare
    ``except`` to detect failures. The only thing that decided the result was
    whether ``dictionary.word2idx[word]`` existed, so membership is now checked
    directly. This also avoids the removed ``volatile=True`` / ``.data[0]``
    APIs, which on current PyTorch made every lookup fail silently.

    Args:
        model: language model; only switched to evaluation mode (as before).
            May be ``None``.
        dictionary: object exposing a ``word2idx`` mapping.
        sentence: unused; kept so existing calls keep working.
        check_words: dict mapping word -> value (e.g. probability).

    Returns:
        dict with the (word, value) pairs of ``check_words`` whose word is a
        key of ``dictionary.word2idx``, in the original order.
    """
    # Turn on evaluation mode which disables dropout (kept because the original
    # call had this side effect on the model).
    if model is not None:
        model.eval()

    # NOTE(review): assumes every index in word2idx is < len(dictionary), i.e.
    # within the model's output layer — confirm against the Dictionary class.
    vocabulary = dictionary.word2idx
    return {word: prob for word, prob in check_words.items() if word in vocabulary}

In [214]:
# Read tokens from Penn Treebank; the context manager closes the file
# as soon as its content has been read.
with open("sec02-21.gold.tagged", "r") as file:
    content = file.read()
# Whitespace-separated entries of the tagged corpus
tokens = content.split()

# Find the 'n' most common words for tags in taglist in Penn Treebank
def most_common_words_for_tags(pos_list, n = 3, token_source=None):
    """Return the ``n`` most frequent words for each POS tag.

    Tokens are expected in the form ``word|TAG``. Only tokens whose word part
    starts with a run of ASCII letters directly followed by ``|`` and whose
    final ``|TAG`` is one of ``pos_list`` are counted. Case is preserved.

    Args:
        pos_list: list of POS tags to collect words for.
        n: number of words to keep per tag.
        token_source: iterable of ``word|TAG`` strings; defaults to the
            notebook-level ``tokens`` list.

    Returns:
        dict mapping each tag to a list of at most ``n`` words, most
        frequent first.
    """
    if not pos_list:
        return {}
    if token_source is None:
        token_source = tokens

    # Escape the tags so regex metacharacters in a tag (e.g. 'PRP$') match
    # literally; raw strings avoid the invalid "\|" escape sequence.
    tag_alternatives = "|".join(re.escape(pos) for pos in pos_list)
    pos_pattern = re.compile(rf"\|({tag_alternatives})$")
    word_pattern = re.compile(r"(^[a-zA-Z]+)\|")

    counters = {pos: collections.Counter() for pos in pos_list}
    for token in token_source:
        pos_match = pos_pattern.search(token)
        word_match = word_pattern.search(token)
        if pos_match and word_match:
            counters[pos_match.group(1)][word_match.group(1)] += 1

    return {pos: [word for word, _ in counter.most_common(n)]
            for pos, counter in counters.items()}
    
# The 100 most frequent words for each noun-like and present-tense verb tag
noun_and_verb_tags = ['NN', 'NNS', 'NNP', 'NNPS', 'PRP', 'VBP', 'VBZ']
common_nouns_and_verbs = most_common_words_for_tags(noun_and_verb_tags, n=100)



In [215]:
# Show the most common words for the singular/plural tags used below, one list per line
for tag in ('NN', 'NNS', 'VBP', 'VBZ'):
    print(common_nouns_and_verbs[tag])

['company', 'year', 'market', 'share', 'stock', 'trading', 'president', 'business', 'quarter', 'government', 'time', 'week', 'price', 'group', 'interest', 'yesterday', 'industry', 'unit', 'money', 'month', 'rate', 'investment', 'state', 'chairman', 'income', 'profit', 'program', 'bank', 'firm', 'debt', 'part', 'plan', 'sale', 'issue', 'tax', 'way', 'vice', 'loss', 'growth', 'executive', 'day', 'bid', 'cash', 'revenue', 'officer', 'director', 'concern', 'board', 'computer', 'case', 'number', 'spokesman', 'agreement', 'stake', 'value', 'end', 'contract', 'system', 'bill', 'oil', 'offer', 'law', 'index', 'agency', 'insurance', 'dollar', 'maker', 'period', 'office', 'world', 'exchange', 'trade', 'capital', 'production', 'court', 'management', 'analyst', 'economy', 'today', 'work', 'policy', 'bond', 'country', 'increase', 'point', 'chief', 'earthquake', 'report', 'power', 'takeover', 'problem', 'home', 'news', 'plant', 'credit', 'fund', 'research', 'example', 'area', 'junk']
['years', 'shar

In [205]:
# Compose sentence prefixes with frequent words.
# The sentence prefixes are intended to test intervening nouns.
# The four lists are index-aligned: NN[i] / NNS[i] and VBP[i] / VBZ[i] are
# two forms of the same word.

NN = (
    "company year market share stock system president business "
    "quarter government time week price group interest "
    "industry unit month rate investment state producer income "
    "program bank part plan sale issue tax way loss executive "
    "day bid data line hour plant concern"
).split()

NNS = (
    "companies years markets shares stocks systems presidents "
    "businesses quarters governments times weeks prices groups interests industries "
    "units months rates investments states producers incomes programs banks parts plans "
    "sales issues taxes ways losses executives days bids data lines hours plants concerns"
).split()

VBP = (
    "are have do say think want expect include ask "
    "make need know see get seem remain continue show buy "
    "feel go sell take use plan look tend hope argue give "
    "pay appear suggest fear find come offer contend agree provide"
).split()

VBZ = (
    "is has does says thinks wants expects includes asks makes "
    "needs knows sees gets seems remains continues shows buys feels goes sells "
    "takes uses plans looks tends hopes argues gives pays appears suggests fears "
    "finds comes offers contends agrees provides"
).split()
# Example words, keyed by tag. The VBZ entries hold the "-s" verb forms and the
# VBP entries the bare forms, matching the VBZ / VBP lists above (VBZ = 'is',
# 'has', ...; VBP = 'are', 'have', ...). The keys were previously swapped.
words = {
    "NN1" : "product",
    "NNS1" : "products",
    "NN2" : "company",
    "NNS2" : "companies",
    "VBZ1" : "looks",
    "VBP1" : "look",
    "VBZ2" : "produces",
    "VBP2" : "produce",
}
# Each entry is (sentence prefix, [candidate verbs], correct candidate).
sentences = [
    # main verb must agree with the first noun, across a prepositional phrase
    (f"The {words['NN1']} of the {words['NN2']}", [words['VBZ1'], words['VBP1']], words['VBZ1']),
    (f"The {words['NNS1']} of the {words['NN2']}", [words['VBZ1'], words['VBP1']], words['VBP1']),
    (f"The {words['NN1']} of the {words['NNS2']}", [words['VBZ1'], words['VBP1']], words['VBZ1']),
    (f"The {words['NNS1']} of the {words['NNS2']}", [words['VBZ1'], words['VBP1']], words['VBP1']),

    # verb inside the relative clause must agree with the second noun
    (f"The {words['NN1']} that the {words['NN2']}", [words['VBZ2'], words['VBP2']], words['VBZ2']),
    (f"The {words['NNS1']} that the {words['NN2']}", [words['VBZ2'], words['VBP2']], words['VBZ2']),
    (f"The {words['NN1']} that the {words['NNS2']}", [words['VBZ2'], words['VBP2']], words['VBP2']),
    (f"The {words['NNS1']} that the {words['NNS2']}", [words['VBZ2'], words['VBP2']], words['VBP2']),

    # main verb must agree with the first noun, across a complete relative clause
    (f"The {words['NN1']} that the {words['NN2']} {words['VBZ2']}", [words['VBZ1'], words['VBP1']], words['VBZ1']),
    (f"The {words['NNS1']} that the {words['NN2']} {words['VBZ2']}", [words['VBZ1'], words['VBP1']], words['VBP1']),
    (f"The {words['NN1']} that the {words['NNS2']} {words['VBP2']}", [words['VBZ1'], words['VBP1']], words['VBZ1']),
    (f"The {words['NNS1']} that the {words['NNS2']} {words['VBP2']}", [words['VBZ1'], words['VBP1']], words['VBP1'])
]

print("(sentence, options, correct-option):")
sentences

(sentence, options, correct-option):


[('The product of the company', ['looks', 'look'], 'looks'),
 ('The products of the company', ['looks', 'look'], 'look'),
 ('The product of the companies', ['looks', 'look'], 'looks'),
 ('The products of the companies', ['looks', 'look'], 'look'),
 ('The product that the company', ['produces', 'produce'], 'produces'),
 ('The products that the company', ['produces', 'produce'], 'produces'),
 ('The product that the companies', ['produces', 'produce'], 'produce'),
 ('The products that the companies', ['produces', 'produce'], 'produce'),
 ('The product that the company produces', ['looks', 'look'], 'looks'),
 ('The products that the company produces', ['looks', 'look'], 'look'),
 ('The product that the companies produce', ['looks', 'look'], 'looks'),
 ('The products that the companies produce', ['looks', 'look'], 'look')]

In [None]:
# Compose sentence prefixes with frequent words.
# The sentence prefixes are intended to test long distance.


In [50]:
# Compose sentence prefixes with frequent words.
# The sentence prefixes are intended to test long distance in combination with intervening nouns.


In [225]:
# Per-tag word probabilities (counts normalised within each tag) for nouns and present tense verbs
target_tags = ['NN', 'NNS', 'VBP', 'VBZ']
words_and_probs = get_words_and_probs(target_tags)

In [338]:
# Drop the nouns and verbs that are not in the language model's vocabulary
NN_dict = get_valid_words(lm, dictionary, 'this', words_and_probs['NN'])
VBZ_dict = get_valid_words(lm, dictionary, 'this', words_and_probs['VBZ'])
NNS_dict = get_valid_words(lm, dictionary, 'this', words_and_probs['NNS'])
VBP_dict = get_valid_words(lm, dictionary, 'this', words_and_probs['VBP'])

# Report, per tag, how many words survived the filter versus how many there were.
# (The NN total was previously printed under the label 'words in NNS:'.)
for tag, valid_words in (('NN', NN_dict), ('VBZ', VBZ_dict), ('NNS', NNS_dict), ('VBP', VBP_dict)):
    print(f'valid words in {tag}:', len(valid_words))
    print(f'words in {tag}:', len(words_and_probs[tag]))

valid words in NN: 3349
words in NNS: 7476
valid words in VBZ: 551
words in VBZ: 1195
valid words in NNS: 1456
words in NNS: 4076
valid words in VBP: 681
words in VBP: 928


In [339]:
def print_keys(tu):
    """Print the first element of every item in ``tu`` as one bracketed line.

    Each key is wrapped in single quotes and followed by " ,", and the whole
    line is enclosed in square brackets, e.g. ``['a' ,'b' ,]``.
    Nothing is returned.
    """
    quoted_keys = "".join("'" + item[0] + "'" + " ," for item in tu)
    print('[' + quoted_keys + ']')

def get_most_similar_pair(a1, a2):
    """Pair up keys from ``a1`` with keys from ``a2`` that contain them.

    Both arguments are sequences of items whose first element is a string.
    A key from ``a1`` is paired with a key from ``a2`` when it occurs as a
    substring of the ``a2`` key and is at most one character shorter.
    Pairs are produced in ``a1``-major order; a key may appear several times.

    Returns:
        (words1, words2): two equally long lists, where ``words1[k]`` comes
        from ``a1`` and ``words2[k]`` is its match from ``a2``.
    """
    matches = [
        (left[0], right[0])
        for left in a1
        for right in a2
        if left[0] in right[0] and len(right[0]) - 1 <= len(left[0])
    ]
    return [short for short, _ in matches], [long for _, long in matches]

In [370]:
print('Least frequent:')
# Match the 650 lowest-probability NN entries against the 650 lowest-probability
# NNS entries, then keep the first 40 matched pairs.
NN_least,NNS_least = get_most_similar_pair(sorted(NN_dict.items(), key=lambda x: x[1])[:650],
                                           sorted(NNS_dict.items(), key=lambda x: x[1])[:650])
NN_least = NN_least[:40]
NNS_least = NNS_least[:40]
# print_keys prints the line itself and returns None, so it must not be wrapped
# in print() (that emitted a stray "None").
print_keys(sorted(VBZ_dict.items(), key=lambda x: x[1])[:200])

Least frequent:
['commissions' ,'institutes' ,'speeds' ,'share' ,'trains' ,'strokes' ,'expenditures' ,'act' ,'drop' ,'face' ,'haunts' ,'bans' ,'tend' ,'parallels' ,'rubber' ,'proteins' ,'are' ,'point' ,'stages' ,'displays' ,'documents' ,'subordinates' ,'fights' ,'voices' ,'shape' ,'let' ,'make' ,'prints' ,'rallies' ,'account' ,'discounts' ,'lag' ,'finances' ,'hope' ,'hints' ,'flow' ,'towers' ,'estimate' ,'committees' ,'services' ,'profits' ,'searches' ,'file' ,'curb' ,'dreams' ,'guards' ,'value' ,'exercise' ,'checks' ,'die' ,'sums' ,'say' ,'brands' ,'burdens' ,'surge' ,'rose' ,'farms' ,'stores' ,'inspectors' ,'strips' ,'feel' ,'stage' ,'questions' ,'trap' ,'filters' ,'coaches' ,'trips' ,'pitches' ,'boost' ,'warrants' ,'interests' ,'islands' ,'curve' ,'pits' ,'bars' ,'remarks' ,'parties' ,'labels' ,'circles' ,'trucks' ,'swings' ,'comments' ,'positions' ,'bursts' ,'pockets' ,'subjects' ,'bills' ,'ducks' ,'reasons' ,'dogs' ,'appeals' ,'set' ,'resorts' ,'start' ,'want' ,'couples' ,'think' 

KeyError: 'expenditures'

8.45522955948254e-05

In [404]:
# Hand-written low-frequency noun pairs as (NN form, NNS form); the two lists
# derived below are index-aligned.
_least_frequent_pairs = [
    ('french', 'french'), ('japanese', 'japanese'), ('soviet', 'soviets'), ('mean', 'means'),
    ('physics', 'physics'), ('stands', 'stands'), ('links', 'links'), ('dynamic', 'dynamics'),
    ('subordinate', 'subordinates'), ('dig', 'digs'), ('municipal', 'municipals'),
    ('arteries', 'arteries'), ('make', 'makes'), ('permit', 'permits'), ('kronor', 'kronor'),
    ('likes', 'likes'), ('deliveries', 'deliveries'), ('new', 'news'), ('people', 'peoples'),
    ('win', 'wins'), ('backer', 'backers'), ('lie', 'lies'), ('due', 'due'),
    ('cooperative', 'cooperatives'), ('professional', 'professionals'),
    ('economics', 'economics'), ('pro', 'pros'), ('regret', 'regrets'), ('plain', 'plains'),
    ('attribute', 'attributes'), ('capitalist', 'capitalists'),
    ('pharmaceutical', 'pharmaceuticals'), ('liberal', 'liberals'), ('east', 'east'),
    ('rein', 'reins'), ('lire', 'lire'), ('suspect', 'suspects'), ('killing', 'killings'),
    ('elite', 'elite'), ('rand', 'rand'),
]

NNS_least = [nns_form for _, nns_form in _least_frequent_pairs]

NN_least = [nn_form for nn_form, _ in _least_frequent_pairs]

In [411]:
print("Word frequencies:")
# NNS words first, then a blank line, then the NN words; one "word value" pair per line
for section, (word_list, freq_table) in enumerate(((NNS_least, NNS_dict), (NN_least, NN_dict))):
    if section:
        print()
    for word in word_list:
        print(word, freq_table[word])

Word frequencies:
french 1.704593880507969e-05
japanese 0.00023864314327111565
soviets 0.00035796471490667347
means 0.0003068268984914344
physics 3.409187761015938e-05
stands 0.00011932157163555782
links 0.00022159720446603598
dynamics 3.409187761015938e-05
subordinates 8.522969402539845e-05
digs 5.113781641523907e-05
municipals 0.00027273502088127503
arteries 0.00010227563283047814
makes 6.818375522031876e-05
permits 8.522969402539845e-05
kronor 0.0002045512656609563
likes 1.704593880507969e-05
deliveries 0.00013636751044063752
news 3.409187761015938e-05
peoples 6.818375522031876e-05
wins 8.522969402539845e-05
backers 0.00011932157163555782
lies 1.704593880507969e-05
due 1.704593880507969e-05
cooperatives 0.0001704593880507969
professionals 0.0003409187761015938
economics 0.00027273502088127503
pros 0.0001875053268558766
regrets 3.409187761015938e-05
plains 1.704593880507969e-05
attributes 3.409187761015938e-05
capitalists 0.0001534134492457172
pharmaceuticals 0.0002045512656609563
li

In [221]:
# NOTE(review): NN_rand, NNS_rand, VBZ_rand, VBP_rand and random_nouns_and_verbs are
# not defined in any cell visible here — presumably left over from an earlier
# kernel state; confirm before relying on this cell after a restart.
# Each row prints: word, its probability, paired word, its probability.
print("Probabilities NN,NNS random:\n")
for nn_word, nns_word in zip(NN_rand, NNS_rand):
    print(nn_word, random_nouns_and_verbs['NN'][nn_word], nns_word, random_nouns_and_verbs['NNS'][nns_word])
print()
print("Probabilities VBP,VBZ random:\n")
# Columns are VBZ first, then VBP. The third column previously repeated the VBZ
# word instead of showing the VBP word whose probability is printed next to it.
for vbz_word, vbp_word in zip(VBZ_rand, VBP_rand):
    print(vbz_word, random_nouns_and_verbs['VBZ'][vbz_word], vbp_word, random_nouns_and_verbs['VBP'][vbp_word])


# The hand-picked lists are iterated over their own length rather than len(NN_rand).
print("Probabilities NN:\n")
for nn_word, nns_word in zip(NN, NNS):
    print(nn_word, random_nouns_and_verbs['NN'][nn_word], nns_word, random_nouns_and_verbs['NNS'][nns_word])
print()
print("Probabilities VBP:\n")
for vbz_word, vbp_word in zip(VBZ, VBP):
    print(vbz_word, random_nouns_and_verbs['VBZ'][vbz_word], vbp_word, random_nouns_and_verbs['VBP'][vbp_word])

Probabilities NN,NNS random:

review 0.00028101164191087917 reviews 0.0001704593880507969
stage 0.00022480931352870332 stages 0.00022159720446603598
disclosure 0.00019269369731031715 disclosures 0.00013636751044063752
role 0.0010036130068245685 roles 0.00011932157163555782
memory 0.00026495383380168607 memories 0.0001704593880507969
auto 0.0013327980730630268 autos 0.00013636751044063752
maker 0.0022882376555600163 makers 0.002318247677490838
year 0.0179124849458049 years 0.019892610585528
president 0.007105580088317944 presidents 0.00025568908207619537
executive 0.0028342031312725813 executives 0.003681922781897213
officer 0.002593336009634685 officers 0.0005795619193727095
approval 0.00111601766358892 approvals 0.0001704593880507969
manufacture 2.4086712163789643e-05 manufactures 0
share 0.01141710156563629 shares 0.019347140543765448
rate 0.004215174628663188 rates 0.008113866871217932
record 0.001421116017663589 records 0.000494332225347311
date 0.0005620232838217583 dates 0.000102

NameError: name 'VBS_dict' is not defined