In [7]:
from collections import Counter
from zipfile import ZipFile
from tqdm import tqdm
import random as rn
import os
import requests
import os
import sys
from urllib.request import urlretrieve
import numpy as np
from nltk import word_tokenize

class GloveMatrix(object):
    """
    Downloads (when needed) and loads GloVe 6B word vectors.

    On construction the vectors text file is looked up under ``self.dest``.
    If it is missing, the zip archive is fetched from ``self.glove_url``
    (unless a file named ``self.file_name`` already exists) and extracted.
    The text file is then parsed into ``self.embedding_index``, a dict that
    maps each word to a float32 numpy array.
    """
    def __init__(self, embedding_dim=300):
        """
        Prepare the vectors file and load it into memory.

        Args:
            embedding_dim: dimensionality of the vectors file to load,
                used to build the file name ``glove.6B.<dim>d.txt``.
                Defaults to 300, the previously hard-coded value.
                NOTE(review): the glove.6B archive is expected to ship
                50/100/200/300-d files -- confirm before using other values.
        """
        self.glove_url = "http://nlp.stanford.edu/data/glove.6B.zip"
        self.file_name = "glove.6B.zip"
        self.dest = "glove.6B"
        self.EMBEDDING_DIM = embedding_dim
        # Single source of truth for the vectors file, so the existence
        # check and the loader can never disagree about the path.
        self.vectors_path = os.path.join(
            self.dest, "glove.6B.%dd.txt" % self.EMBEDDING_DIM)
        self.download_glove()
        self.load_matrix()
        print("Done")

    def download_glove(self):
        """
        Make sure the vectors text file exists on disk.

        Does nothing when the file is already there. Otherwise extracts the
        local zip archive, downloading it first if it is not present.
        """
        if os.path.exists(self.vectors_path):
            return
        if not os.path.exists(self.file_name):
            self.download_file(self.glove_url, self.file_name)
        self.unzip_file(self.file_name, self.dest)

    def load_matrix(self):
        """
        Parse the vectors file into ``self.embedding_index``.

        Each non-blank line is split on whitespace: the first field is the
        word, the remaining fields become a float32 array.

        Returns:
            The populated ``self.embedding_index`` dict.
        """
        print("Loading embedding matrix")
        self.embedding_index = {}
        # Explicit UTF-8 so loading does not depend on the platform locale;
        # iterating the handle avoids holding the whole file in memory twice.
        with open(self.vectors_path, "r", encoding="utf-8") as f:
            for line in f:
                values = line.split()
                if values:
                    self.embedding_index[values[0]] = np.asarray(
                        values[1:], dtype='float32')
        return self.embedding_index

    def get_index(self):
        """Return the word -> vector dict built by ``load_matrix``."""
        return self.embedding_index

    def unzip_file(self, file_name, dest):
        """Extract every member of the zip archive ``file_name`` into ``dest``."""
        print("Unzipping file...")
        # Context manager guarantees the archive handle is closed.
        with ZipFile(file_name) as archive:
            archive.extractall(dest)

    def download_file(self, url, file_name):
        """Download ``url`` to ``file_name``, reporting progress on stderr."""
        print("Downloading file...")
        # The hook is a method, so it must be reached through ``self``.
        urlretrieve(url, file_name, self.reporthook)

    def reporthook(self, blocknum, blocksize, totalsize):
        """
        Progress callback for ``urlretrieve``.

        Args:
            blocknum: number of blocks transferred so far.
            blocksize: size of one block in bytes.
            totalsize: total size in bytes; a value <= 0 is treated as unknown.
        """
        readsofar = blocknum * blocksize
        if totalsize > 0:
            # The final block is usually partial; clamp so the display
            # never goes past 100%.
            readsofar = min(readsofar, totalsize)
            percent = readsofar * 1e2 / totalsize
            s = "\r%5.1f%% %*d / %d" % (
                percent, len(str(totalsize)), readsofar, totalsize)
            sys.stderr.write(s)
            if readsofar >= totalsize: # download finished
                sys.stderr.write("\n")
        else: # total size is unknown
            sys.stderr.write("read %d\n" % (readsofar,))

class TextEmbedder(object):
    """
    Looks up word embeddings in the index held by a GloVe matrix object.
    """
    def __init__(self, glove_matrix):
        """Keep a reference to ``glove_matrix.embedding_index`` for lookups."""
        self.embedding_index = glove_matrix.embedding_index

    def get_any(self, word):
        """
        Return the vector stored for ``word``.

        Falls back to the entry stored under "unk" when ``word`` is not in
        the index; that fallback is None if "unk" is missing as well.
        """
        table = self.embedding_index
        fallback = table.get("unk")
        return table.get(word, fallback)

In [17]:
# Build the GloVe lookup table (the constructor downloads/unzips the archive
# first if the vectors file is missing), then wrap it in a TextEmbedder.
gm = GloveMatrix()
te = TextEmbedder(gm)

Loading embedding matrix
Done


In [12]:
# Get the embedding for a single word; get_any falls back to the "unk"
# entry for words that are not in the index.
te.get_any("hello")

array([ 0.26688  ,  0.39632  ,  0.6169   , -0.77451  , -0.1039   ,
        0.26697  ,  0.2788   ,  0.30992  ,  0.0054685, -0.085256 ,
        0.73602  , -0.098432 ,  0.5479   , -0.030305 ,  0.33479  ,
        0.14094  , -0.0070003,  0.32569  ,  0.22902  ,  0.46557  ,
       -0.19531  ,  0.37491  , -0.7139   , -0.51775  ,  0.77039  ,
        1.0881   , -0.66011  , -0.16234  ,  0.9119   ,  0.21046  ,
        0.047494 ,  1.0019   ,  1.1133   ,  0.70094  , -0.08696  ,
        0.47571  ,  0.1636   , -0.44469  ,  0.4469   , -0.93817  ,
        0.013101 ,  0.085964 , -0.67456  ,  0.49662  , -0.037827 ,
       -0.11038  , -0.28612  ,  0.074606 , -0.31527  , -0.093774 ,
       -0.57069  ,  0.66865  ,  0.45307  , -0.34154  , -0.7166   ,
       -0.75273  ,  0.075212 ,  0.57903  , -0.1191   , -0.11379  ,
       -0.10026  ,  0.71341  , -1.1574   , -0.74026  ,  0.40452  ,
        0.18023  ,  0.21449  ,  0.37638  ,  0.11239  , -0.53639  ,
       -0.025092 ,  0.31886  , -0.25013  , -0.63283  , -0.0118

In [18]:
# Get the tokens of a sentence: lowercase it, then split it with NLTK's
# word_tokenize.
# NOTE(review): lowercasing presumably matches an all-lowercase GloVe
# vocabulary -- confirm against the vectors file.
sent = "this is a sentence"
doc_clean = sent.lower()
tokens = word_tokenize(doc_clean) 
print(tokens)

['this', 'is', 'a', 'sentence']


In [15]:
# Getting the sentence embedding: look up one vector per token, then stack
# the vectors vertically with np.vstack (one row per token).
embedded_sentence = []
for w in tokens:
    embedded_sentence.append(te.get_any(w))
result = np.vstack(embedded_sentence)

In [16]:
# Display the list of per-token vectors (the stacked array is in `result`).
embedded_sentence

[array([-0.57058 ,  0.44183 ,  0.70102 , -0.41713 , -0.34058 ,  0.02339 ,
        -0.071537,  0.48177 , -0.013121,  0.16834 , -0.13389 ,  0.040626,
         0.15827 , -0.44342 , -0.019403, -0.009661, -0.046284,  0.093228,
        -0.27331 ,  0.2285  ,  0.33089 , -0.36474 ,  0.078741,  0.3585  ,
         0.44757 , -0.2299  ,  0.18077 , -0.6265  ,  0.053852, -0.29154 ,
        -0.4256  ,  0.62903 ,  0.14393 , -0.046004, -0.21007 ,  0.48879 ,
        -0.057698,  0.37431 , -0.030075, -0.34494 , -0.29702 ,  0.15095 ,
         0.28248 , -0.16578 ,  0.076131, -0.093016,  0.79365 , -0.60489 ,
        -0.18874 , -1.0173  ,  0.31962 , -0.16344 ,  0.54177 ,  1.1725  ,
        -0.47875 , -3.3842  , -0.081301, -0.3528  ,  1.8372  ,  0.44516 ,
        -0.52666 ,  0.99786 , -0.32178 ,  0.033462,  1.1783  , -0.072905,
         0.39737 ,  0.26166 ,  0.33111 , -0.35629 , -0.16558 , -0.44382 ,
        -0.14183 , -0.37976 ,  0.28994 , -0.029114, -0.35169 , -0.27694 ,
        -1.344   ,  0.19555 ,  0.16887