In [3]:
from nltk.corpus import brown
import textacy.preprocessing as p
import numpy as np
from nltk.tokenize import sent_tokenize
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import dok_matrix
from tqdm import tqdm_notebook

In [4]:
kids = open("cbt_train.txt", "r").read()
kids = sent_tokenize(kids)[:500]
print(len(kids))

FileNotFoundError: [Errno 2] No such file or directory: 'cbt_train.txt'

In [304]:
def clean_sentence(sent):
    clean = p.normalize.normalize_whitespace(sent.lower())
    clean = p.remove.remove_punctuation(clean)
    clean = p.replace.replace_numbers(clean)
    return(clean)

In [305]:
for i in range(len(kids)):
    kids[i] = ("<s> " + clean_sentence(kids[i]) + " </s>").split()
    
    
for i in range(2,10):
    print(" ".join(kids[i]))

<s> once upon a time there reigned in pantouflia a king and a queen </s>
<s> with almost everything else to make them happy they wanted one thing they had no children </s>
<s> this vexed the king even more than the queen who was very clever and learned and who had hated dolls when she was a child </s>
<s> however she too in spite of all the books she read and all the pictures she painted would have been glad enough to be the mother of a little prince </s>
<s> the king was anxious to consult the fairies but the queen would not hear of such a thing </s>
<s> she did not believe in fairies she said that they had never existed and that she maintained though the history of the royal family was full of chapters about nothing else </s>
<s> well at long and at last they had a little boy who was generally regarded as the finest baby that had ever been seen </s>
<s> even her majesty herself remarked that though she could never believe all the courtiers told her yet he certainly was a fine child a

In [293]:
class CoOcc():
    def __init__(self, corpus, window=1):
        self.corpus = corpus
        self.window = window
        
        self.counts = {}
        
        print("Calculating vocabulary")
        for sentence in self.corpus:
            for word in sentence:
                if word not in self.counts:
                    self.counts[word] = {}
        
        self.w2i = dict([(w, i) for i, w in enumerate(self.counts.keys())])
        self.i2w = dict([(i, w) for i, w in enumerate(self.counts.keys())])
        
    def sent_counts(self, sentence):
        for i in range(len(sentence)):
                for j in range(-self.window, self.window+1):
                    if j!=0 and i+j >=0 and i+j < len(sentence):
                        if sentence[i+j] in self.counts[sentence[i]]:
                            self.counts[sentence[i]][sentence[i+j]] += 1
                        else:
                            self.counts[sentence[i]][sentence[i+j]] = 1
        
    def get_counts(self):
        print("Getting counts...")
        for sentence in tqdm_notebook(self.corpus):
            self.sent_counts(sentence)        
        mat = dok_matrix((len(coocc.counts), len(coocc.counts)), dtype=np.int8)
        print("Constructing Count Matrix")
        for word, context in tqdm_notebook(coocc.counts.items()):
            for entry in context:
                mat[coocc.w2i[word], coocc.w2i[entry]] = coocc.counts[word][entry]
        self.X = mat.toarray()
        self.P = np.zeros(self.X.shape)
        
        print("Constructing Probability Matrix")
        for i in tqdm_notebook(range(len(self.X))):
            Xi = sum(self.X[i])
            self.P[i] = self.X[i] / Xi
            


In [306]:
coocc = CoOcc(kids, window=5)
coocc.get_counts()

Calculating vocabulary
Getting counts...


HBox(children=(IntProgress(value=0, max=500), HTML(value='')))

Constructing Count Matrix


HBox(children=(IntProgress(value=0, max=1741), HTML(value='')))

Constructing Probability Matrix


HBox(children=(IntProgress(value=0, max=1741), HTML(value='')))

In [307]:
import dynet as dy

In [308]:
class GloVe():
    def __init__(self, X, P, dims, w2i, i2w, x_max=40):
        self.X = X
        self.P = P
        self.dims = dims
        self.w2i = w2i
        self.i2w = i2w
        self.x_max = x_max

        self.model = dy.ParameterCollection()
        self.U = self.model.add_lookup_parameters((self.X.shape[0], self.dims))
        self.V = self.model.add_lookup_parameters((self.X.shape[0], self.dims))
        self.trainer = dy.RMSPropTrainer(self.model)
        
        self.embeddings = (self.U.value() + self.V.value()).T
        
        print("Before any training, similarity between 'a' and 'the' is:")
        print(self.sim("a", "the"), "\n\n")
        
    def sim(self, word1, word2):
        x = self.embeddings[self.w2i[word1]]
        y = self.embeddings[self.w2i[word2]]
        numer = np.dot(x,y)
        denom = np.linalg.norm(x)*np.linalg.norm(y)
        return(numer/denom)

        
    def f(self, value):
        if value.value() < self.x_max:
            return((value/self.x_max))
        else:
            return(1)
        
    def dist(self, u, v, Pij, Xij):
        perc = dy.scalarInput(Pij)
        count = dy.scalarInput(Xij)
        y_hat = dy.dot_product(u,v)
        y = dy.log(perc+1)
        weight = self.f(count)
        return(weight*dy.squared_distance(y_hat, y))
        
    def train(self, epochs=5):
        
        for epoch in range(1, epochs+1):
            running_loss = 0
            counter = 0
            for i in tqdm_notebook(range(len(self.X))):
                for j in range(len(self.X)):
                    dy.renew_cg()
                    loss = self.dist(self.U[i], self.V[j], self.P[i,j], self.X[i,j])
                    loss.backward()
                    self.trainer.update()
                    
                    running_loss += loss.value()
                    counter += 1
                    
                    
                    
            self.embeddings = (self.U.value() + self.V.value()).T            
            print("Average loss on epoch", epoch, "is:", running_loss / counter)
            print("After training for", epoch, "epoch(s), similarity between 'a' and 'the' is:")
            print(self.sim("a", "the"), "\n\n")
            
            
            
        
        

In [309]:
glove = GloVe(coocc.X, coocc.P, 100, coocc.w2i, coocc.i2w, x_max=10)

Before any training, similarity between 'a' and 'the' is:
-0.04562056451312254 




In [310]:
glove.train(epochs=5)

HBox(children=(IntProgress(value=0, max=1741), HTML(value='')))

Average loss on epoch 1 is: 6.384046977948952e-06
After training for 1 epoch(s), similarity between 'a' and 'the' is:
0.7164204169476123 




HBox(children=(IntProgress(value=0, max=1741), HTML(value='')))

Average loss on epoch 2 is: 2.764360925070929e-06
After training for 2 epoch(s), similarity between 'a' and 'the' is:
0.8009851195581361 




HBox(children=(IntProgress(value=0, max=1741), HTML(value='')))

Average loss on epoch 3 is: 1.373610769805463e-06
After training for 3 epoch(s), similarity between 'a' and 'the' is:
0.6052765714236821 




HBox(children=(IntProgress(value=0, max=1741), HTML(value='')))

Average loss on epoch 4 is: 7.775986109285375e-07
After training for 4 epoch(s), similarity between 'a' and 'the' is:
0.43938692732758394 




HBox(children=(IntProgress(value=0, max=1741), HTML(value='')))

Average loss on epoch 5 is: 4.126775673021754e-07
After training for 5 epoch(s), similarity between 'a' and 'the' is:
0.392413546203806 




In [312]:
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.manifold import *
from umap import UMAP
import plotly.express as px

In [313]:
def view_2D(method=PCA):
    model = method(n_components=2)
    points = model.fit_transform(glove.embeddings)
    temp = pd.DataFrame(points, columns=["x", "y"], index=glove.w2i.keys())
    temp["Word"] = glove.w2i.keys()
    fig = px.scatter(temp, x="x", y="y", text="Word")
    fig.update_traces(textposition="top center")
    fig.show()

In [314]:
view_2D(method=UMAP)