In [1]:
from collections import Counter
from zipfile import ZipFile
from tqdm import tqdm
import random as rn
import os
import requests
import os
import sys
from urllib.request import urlretrieve
import numpy as np
import nltk
from nltk import word_tokenize

class GloveMatrix(object):
    """
    Downloads and loads GloVe matrix.
    """
    def __init__(self):
        self.glove_url = "http://nlp.stanford.edu/data/glove.6B.zip"
        self.file_name = "glove.6B.zip"
        self.dest = "glove.6B"
        self.download_glove()
        embedding_index = self.load_matrix()
        self.EMBEDDING_DIM = 300
        print("Done")
        
    def download_glove(self):
        if not os.path.exists("glove.6B/glove.6B.300d.txt"):
            if os.path.exists(self.file_name):
                self.unzip_file(self.file_name, self.dest)
            else:
                urlretrieve(self.glove_url, self.file_name, self.reporthook)
                self.unzip_file(self.file_name, self.dest)
                
    def load_matrix(self):       
        print("Loading embedding matrix")
        self.embedding_index = {}
        with open('glove.6B/glove.6B.300d.txt', "r") as f:
            lines = f.read().split("\n")
            for line in lines:
                values = line.split()
                if len(values) > 0:
                    word = values[0]
                    coefs = np.asarray(values[1:], dtype='float32')
                    self.embedding_index[word] = coefs

    def get_index(self):
        return self.embedding_index    

    def unzip_file(self, file_name, dest):
        print("Unzipping file...")
        zipTest = ZipFile(file_name)
        zipTest.extractall(dest)

    def download_file(self, url, file_name):
        print("Downloading file...")
        urlretrieve(url, file_name, reporthook)

    def reporthook(self, blocknum, blocksize, totalsize):
        readsofar = blocknum * blocksize
        if totalsize > 0:
            percent = readsofar * 1e2 / totalsize
            s = "\r%5.1f%% %*d / %d" % (
                percent, len(str(totalsize)), readsofar, totalsize)
            sys.stderr.write(s)
            if readsofar >= totalsize: # near the end
                sys.stderr.write("\n")
        else: # total size is unknown
            sys.stderr.write("read %d\n" % (readsofar,))

class TextEmbedder(object):
    """
    TextEmbedder returning word embeddings, using given GloVe matrix.
    """
    def __init__(self, glove_matrix):
        self.embedding_index = glove_matrix.embedding_index

    def get_any(self,word):
         return self.embedding_index.get(word, self.embedding_index.get("unk"))

In [3]:
gm = GloveMatrix()
te = TextEmbedder(gm)

Loading embedding matrix
Done


In [4]:
# getting word embedding for one word
te.get_any("hello")

array([ -3.37119997e-01,  -2.16910005e-01,  -6.63649989e-03,
        -4.16249990e-01,  -1.25549996e+00,  -2.84659993e-02,
        -7.21949995e-01,  -5.28869987e-01,   7.20850006e-03,
         3.19970012e-01,   2.94250008e-02,  -1.32360002e-02,
         4.35110003e-01,   2.57160008e-01,   3.89950007e-01,
        -1.19680002e-01,   1.50350004e-01,   4.47620004e-01,
         2.84069985e-01,   4.93389994e-01,   6.28260016e-01,
         2.28880003e-01,  -4.03849989e-01,   2.73640007e-02,
         7.36790011e-03,   1.39950007e-01,   2.33459994e-01,
         6.81219995e-02,   4.84219998e-01,  -1.95780005e-02,
        -5.47510028e-01,  -5.49830019e-01,  -3.40909995e-02,
         8.00170004e-03,  -4.30649996e-01,  -1.89689994e-02,
        -8.56700018e-02,  -8.11230004e-01,  -2.10800007e-01,
         3.77840012e-01,  -3.50459993e-01,   1.36840001e-01,
        -5.56609988e-01,   1.68349996e-01,  -2.29519993e-01,
        -1.61840007e-01,   6.73449993e-01,  -4.65970010e-01,
        -3.18339989e-02,

In [5]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /homes/tam66/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [6]:
#getting tokens in a sentence
sent = "this is a sentence"
doc_clean = sent.lower()
tokens = word_tokenize(doc_clean) 
print(tokens)

['this', 'is', 'a', 'sentence']


In [7]:
# gettind sentence embedding
embedded_sentence = []
for w in tokens:
    embedded_sentence.append(te.get_any(w))
result = np.vstack(embedded_sentence)

In [8]:
embedded_sentence[0]

array([ -2.04370007e-01,   1.64309993e-01,   4.17939983e-02,
        -1.37079999e-01,  -2.97789991e-01,   3.34399998e-01,
        -6.99549988e-02,  -6.80359975e-02,   1.06040001e-01,
        -2.03369999e+00,   1.79769993e-01,  -7.74030015e-02,
        -1.95179999e-01,   1.83239996e-01,   3.00169997e-02,
        -5.47619984e-02,  -4.57249999e-01,  -2.45089997e-02,
         5.73870018e-02,  -3.48780006e-01,   3.96960005e-02,
         4.48260009e-01,  -5.84620014e-02,   4.11810011e-01,
        -3.54110003e-02,  -1.47220001e-01,   1.07400000e-01,
        -2.58960009e-01,  -1.16580002e-01,   1.98220000e-01,
         3.28500003e-01,   2.41769999e-01,  -5.71770012e-01,
        -5.64420000e-02,  -9.64370012e-01,   3.44819993e-01,
         5.46390004e-02,   2.38279998e-01,  -1.91389993e-01,
         3.08990002e-01,   2.80440003e-01,  -3.38140018e-02,
        -2.54359990e-01,   1.53729999e-02,   1.63409993e-01,
         2.63520002e-01,   1.58120006e-01,   3.20439994e-01,
        -2.30820000e-01,

In [9]:
te.get_any('this') == embedded_sentence[0]

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,

In [10]:
import sys
import csv
import string

In [14]:
import time

In [76]:
start_time = time.time()
end_time = time.time()
duration = end_time - start_time
print('It took %s to run' %(duration))

It took 5.14984130859375e-05 to run


In [52]:
start_time = time.time()

i = 0
with open('final.csv', mode='r',encoding = 'ISO-8859-1') as csv_sentences:
    sentences = csv.reader(csv_sentences)
    for sentence in sentences:
        print(sentence)
        embedded_sentence = []
        
        for word in sentence:
            embedded_sentence.append(te.get_any(word))
            result = np.vstack(embedded_sentence)
            
        #print (result)
        if i == 2:
            break
        i += 1

['anthony', 'gilbert', 'the', 'pen', 'name', 'of', 'lucy', 'beatrice', 'malleson', '15', 'february', '1899', '9', 'december', '1973', 'was', 'an', 'english', 'crime', 'writer', 'who', 'was', 'a', 'cousin', 'of', 'actorscreenwriter', 'miles', 'malleson']
['she', 'also', 'wrote', 'nongenre', 'fiction', 'as', 'anne', 'meredith', 'and', 'published', 'one', 'crime', 'novel', 'and', 'an', 'autobiography', 'threeapenny', '1940', 'under', 'the', 'meredith', 'name']
['she', 'published', '69', 'crime', 'novels', '51', 'of', 'which', 'featured', 'her', 'bestknown', 'character', 'arthur', 'crook']


## Write to CSV

In [65]:
start_time = time.time()

i = 0
results = []
with open('embeddings.csv', mode='w') as embeddings:
    writer = csv.writer(embeddings, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    
    with open('final.csv', mode='r',encoding = 'ISO-8859-1') as csv_sentences:
        sentences = csv.reader(csv_sentences)
        for sentence in sentences:
            embedded_sentence = []
            result = []
            for word in sentence:
                embedded_sentence.append(te.get_any(word))
            result = np.vstack(embedded_sentence)        
            
            results.append(result)
            writer.writerow(result)
            
            i += 1
            if i == 100:
                break
            
end_time = time.time()
duration = end_time - start_time
print('It took %s to run' %(duration))

It took 7.365271091461182 to run


## Save to List

In [66]:
start_time = time.time()

i = 0
results = []
with open('embeddings.csv', mode='w') as embeddings:
    writer = csv.writer(embeddings, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    
    with open('final.csv', mode='r',encoding = 'ISO-8859-1') as csv_sentences:
        sentences = csv.reader(csv_sentences)
        for sentence in sentences:
            embedded_sentence = []
            result = []
            for word in sentence:
                embedded_sentence.append(te.get_any(word))
            result = np.vstack(embedded_sentence)        
            
            results.append(result)
            #writer.writerow(result)
            
            i += 1
            if i == 100000:
                break
            
end_time = time.time()
duration = end_time - start_time
print('It took %s to run' %(duration))

It took 5.561968564987183 to run


In [75]:
len(results[5][1])

300

In [71]:
with open('embeddings.csv', mode='w') as embeddings:
    writer = csv.writer(embeddings, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    for result in results:
        writer.writerow(result)

KeyboardInterrupt: 

In [70]:
len(results[2])

14

In [58]:
i

0

In [49]:
sentence

'anthony,gilbert,the,pen,name,of,lucy,beatrice,malleson,15,february,1899,9,december,1973,was,an,english,crime,writer,who,was,a,cousin,of,actorscreenwriter,miles,malleson\n'

In [34]:
len(results)

1

In [41]:
len(word_list)

22

In [45]:
len(embedded_sentence[0])

300

In [43]:
len(result)

22

In [44]:
! ls

draft.csv	glove.6B	  out.txt	     wiki.bz2
embeddings.csv	glove.6B.zip	  process_csv.ipynb  wikiextractor
final.csv	out_from_git.csv  processing.py      word_embeddings.ipynb


In [43]:
with open('embeddings.csv', mode='r', encoding = 'ISO-8859-1') as embeddings:
    for e in embeddings:
        print(e)

"[ 0.0024875  -0.36866    -0.095369    0.31169     0.37415999  0.38005999

 -0.29764    -0.31081    -0.24623001 -0.28625     0.15069    -0.36017001

  0.18158001 -0.23804    -0.11554    -0.049825    0.34110001 -0.49958

 -0.098817    0.015255   -0.3538      0.068802   -0.084403   -0.49096999

  0.37847    -0.22909001  0.24044999 -0.74693    -0.002486   -0.63753003

  0.27823001  0.29922    -0.13404     0.3319     -1.23979998  0.40689

  0.12299     0.26181     0.25237    -0.54378003  0.14764    -0.089087

  0.079722    0.58728999  0.13708     0.33285001  0.24738    -0.21389

  0.087089    0.090408   -0.30857     0.057405   -0.091401    0.078659

  0.16536    -0.042078   -0.26497    -0.024146   -0.15006    -0.18134999

  0.036265   -0.14808001 -0.33155999  0.37784001 -0.02719    -0.57590002

  0.041909   -0.34103999 -0.012387   -0.62402999  0.063548   -0.37016001

 -0.24372999 -0.45411     0.0773     -0.33576     0.38936001  0.39689001

  0.15031999 -0.21217     0.41295999 -0.019889    

In [42]:
len(e)

71