In [1]:
from nltk.corpus import brown
import textacy.preprocessing as p
import numpy as np
from nltk.tokenize import sent_tokenize
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import dok_matrix
from tqdm import tqdm

In [2]:
def cos(x,y):
    numer = np.dot(x,y)
    denom = np.linalg.norm(x)*np.linalg.norm(y)
    return(numer/denom)

def compare(x,y):
    word1 = matrix[coocc.w2i[x]]
    word2 = matrix[coocc.w2i[y]]
    return(cos(word1, word2))

In [3]:
#sentences = list(" ".join(sentence) for sentence in brown.sents())

kids = open("cbt_train.txt", "r").read()
kids = sent_tokenize(kids)


In [4]:
#kids = sent_tokenize(kids)
#print(len(sentences))
print(len(kids))

264495


In [5]:
def clean_sentence(sent):
    clean = p.normalize.normalize_whitespace(sent.lower())
    clean = p.remove.remove_punctuation(clean)
    clean = p.replace.replace_numbers(clean)
    return(clean)

In [6]:
#for i in range(len(sentences)):
#    sentences[i] = ("<s> " + clean_sentence(sentences[i]) + " </s>").split()
    
for i in range(len(kids)):
    kids[i] = ("<s> " + clean_sentence(kids[i]) + " </s>").split()
    
    
for i in range(2,10):
    print(" ".join(kids[i]))

<s> once upon a time there reigned in pantouflia a king and a queen </s>
<s> with almost everything else to make them happy they wanted one thing they had no children </s>
<s> this vexed the king even more than the queen who was very clever and learned and who had hated dolls when she was a child </s>
<s> however she too in spite of all the books she read and all the pictures she painted would have been glad enough to be the mother of a little prince </s>
<s> the king was anxious to consult the fairies but the queen would not hear of such a thing </s>
<s> she did not believe in fairies she said that they had never existed and that she maintained though the history of the royal family was full of chapters about nothing else </s>
<s> well at long and at last they had a little boy who was generally regarded as the finest baby that had ever been seen </s>
<s> even her majesty herself remarked that though she could never believe all the courtiers told her yet he certainly was a fine child a

In [7]:
class CoOcc():
    def __init__(self, corpus, window=1):
        self.corpus = corpus
        self.window = window
        
        self.counts = {}
        
        print("Calculating vocabulary")
        for sentence in self.corpus:
            for word in sentence:
                if word not in self.counts:
                    self.counts[word] = {}
        
        self.w2i = dict([(w, i) for i, w in enumerate(self.counts.keys())])
        self.i2w = dict([(i, w) for i, w in enumerate(self.counts.keys())])
        
    def sent_counts(self, sentence):
        for i in range(len(sentence)):
                for j in range(-self.window, self.window+1):
                    if j!=0 and i+j >=0 and i+j < len(sentence):
                        if sentence[i+j] in self.counts[sentence[i]]:
                            self.counts[sentence[i]][sentence[i+j]] += 1
                        else:
                            self.counts[sentence[i]][sentence[i+j]] = 1
        
    def get_counts(self):
        print("Getting counts...")
        for sentence in tqdm(self.corpus):
            self.sent_counts(sentence)        
        mat = dok_matrix((len(coocc.counts), len(coocc.counts)), dtype=np.int8)
        print("Constructing matrix")
        for word, context in tqdm(coocc.counts.items()):
            for entry in context:
                mat[coocc.w2i[word], coocc.w2i[entry]] = coocc.counts[word][entry]
        self.matrix = mat.toarray()
        #print("Preparing pointwise similarity matrix")
        #self.similarities = cosine_similarity(self.matrix)
        
    def most_sim(self, word):
        max = float("-inf")
        to_return = None
        for cand in tqdm(self.counts):
            if cand != word:
                score = compare(word, cand)
                if score > max:
                    max = score
                    to_return = cand
        return(to_return)
    
    #def most_sim_alt(self, word):
    #    scores = np.dot(self.matrix, self.matrix[self.w2i[word]])
    #   return(self.i2w[np.argmax(scores)])
    
    def highest_count(self, word, n):
        counts = self.matrix[self.w2i[word]]
        highest_n = counts.argsort()[-n:][::-1]
        return([self.i2w[result] for result in highest_n])
        
        
#    def most_sim_mega_alt(self, word):
#        return(self.i2w(np.argmax(self.similarities[self.w2i[word]])))

In [22]:
coocc = CoOcc(kids, window=8)

Calculating vocabulary


In [23]:
coocc.get_counts()

  0%|          | 765/264495 [00:00<00:34, 7638.77it/s]

Getting counts...


100%|██████████| 264495/264495 [00:35<00:00, 7398.93it/s] 
  0%|          | 0/42256 [00:00<?, ?it/s]

Constructing matrix


100%|██████████| 42256/42256 [01:22<00:00, 514.78it/s] 


In [24]:
coocc.matrix.shape

(42256, 42256)

In [25]:
print(coocc.highest_count("boy", 15))
print(coocc.highest_count("girl", 15))


['from', 'do', 'very', 'they', 'did', 'he', 'been', 'like', 'good', 'about', 'old', 'to', 'were', 'poor', 'with']
['do', 'there', 'would', 'young', 'were', 'me', 'good', 'she', 'could', 'it', 'an', 'poor', 'no', 'about', 'from']


In [26]:
def overlap(word1, word2, n):
    highest_one =  coocc.highest_count(word1, n)
    highest_two =  coocc.highest_count(word2, n)
    olaps = len(set(highest_one).intersection(set(highest_two)))
    return(olaps/n)

In [27]:
syn_pairs = [("king", "reign"), ("dog", "ate"), ("king", "kingdom"), ("he", "ran"), ("for", "the"), ("pretty", "princess")]
par_pairs = [("boy", "girl"), ("man", "woman"), ("king", "queen"), ("the", "a"),  ("he", "she"), ("blue", "green")]

In [28]:
syn_score = 0
for pair in syn_pairs:
    syn_score += overlap(pair[0], pair[1], 200)
    
print("syn_score:", syn_score/len(syn_pairs))

par_score = 0
for pair in par_pairs:
    par_score += overlap(pair[0], pair[1], 50)

print("par_score:", par_score/len(par_pairs))
    

syn_score: 0.3558333333333334
par_score: 0.22333333333333336


In [29]:
del(coocc)