In [1]:
from __future__ import absolute_import, division, unicode_literals


In [2]:
import io
import os
from collections import Counter, OrderedDict, defaultdict

import numpy as np
from tqdm import tqdm

import data

In [3]:
class OrderedCounter(Counter, OrderedDict):
  """A Counter whose keys are kept in the order they were first inserted."""

  def __repr__(self):
    # Show the contents through an OrderedDict so the element order is
    # visible in the printed form.
    class_name = self.__class__.__name__
    ordered_items = OrderedDict(self)
    return '%s(%r)' % (class_name, ordered_items)

  def __reduce__(self):
    # Pickle/copy support: rebuild the instance from an OrderedDict snapshot
    # of its contents.
    snapshot = OrderedDict(self)
    return self.__class__, (snapshot,)


class Vocabulary:
  """A vocabulary that assigns consecutive integer IDs to tokens.

  Attributes (all created in ``__init__``):
    freqs: OrderedCounter mapping token -> number of times it was counted.
    w2i:   dict mapping token -> integer ID.
    i2w:   list mapping integer ID -> token (the inverse of ``w2i``).
  """

  def __init__(self):
    self.freqs = OrderedCounter()
    self.w2i = {}
    self.i2w = []

  def count_token(self, t):
    """Record one occurrence of token ``t`` in the frequency table."""
    self.freqs[t] += 1

  def add_token(self, t):
    """Give token ``t`` the next free ID; do nothing if it already has one.

    Re-adding a known token used to overwrite its ``w2i`` entry and append a
    second copy to ``i2w``, after which the two mappings disagreed for every
    later token. Skipping known tokens keeps ``i2w[w2i[t]] == t`` true.
    """
    if t in self.w2i:
      return
    self.w2i[t] = len(self.i2w)
    self.i2w.append(t)

  def build(self, min_freq=0):
    """Assign IDs to the reserved tokens and to all counted tokens.

    IDs 0 and 1 go to "<unk>" and "<pad>". The counted tokens follow in
    order of decreasing frequency; tokens with equal frequency keep the
    order in which they were first counted (the sort is stable).

    Args:
      min_freq: minimum number of occurrences for a word to be included
                in the vocabulary.
    """
    self.add_token("<unk>")  # reserve 0 for <unk> (unknown words)
    self.add_token("<pad>")  # reserve 1 for <pad>

    tok_freq = sorted(self.freqs.items(), key=lambda x: x[1], reverse=True)
    for tok, freq in tok_freq:
      if freq >= min_freq:
        self.add_token(tok)

In [4]:
# Relative path of the GloVe vectors text file that the cells below read.
file_name = 'data/glove.840B.300d.txt'

In [5]:
# Read the whole file into memory as a list with one string per line.
# The encoding is pinned to UTF-8 (the file is expected to be UTF-8 text) so
# decoding does not depend on the platform's default locale.
with io.open(file_name, "r", encoding="utf-8") as f:
    lines = f.readlines()

In [6]:
# Number of lines that were read from the file.
len(lines)

2196017

In [7]:
# Keep the first raw line so its format can be inspected.
line = lines[0]

In [8]:
# Display the current value of `line`.
line

', -0.082752 0.67204 -0.14987 -0.064983 0.056491 0.40228 0.0027747 -0.3311 -0.30691 2.0817 0.031819 0.013643 0.30265 0.0071297 -0.5819 -0.2774 -0.062254 1.1451 -0.24232 0.1235 -0.12243 0.33152 -0.006162 -0.30541 -0.13057 -0.054601 0.037083 -0.070552 0.5893 -0.30385 0.2898 -0.14653 -0.27052 0.37161 0.32031 -0.29125 0.0052483 -0.13212 -0.052736 0.087349 -0.26668 -0.16897 0.015162 -0.0083746 -0.14871 0.23413 -0.20719 -0.091386 0.40075 -0.17223 0.18145 0.37586 -0.28682 0.37289 -0.16185 0.18008 0.3032 -0.13216 0.18352 0.095759 0.094916 0.008289 0.11761 0.34046 0.03677 -0.29077 0.058303 -0.027814 0.082941 0.1862 -0.031494 0.27985 -0.074412 -0.13762 -0.21866 0.18138 0.040855 -0.113 0.24107 0.3657 -0.27525 -0.05684 0.34872 0.011884 0.14517 -0.71395 0.48497 0.14807 0.62287 0.20599 0.58379 -0.13438 0.40207 0.18311 0.28021 -0.42349 -0.25626 0.17715 -0.54095 0.16596 -0.036058 0.08499 -0.64989 0.075549 -0.28831 0.40626 -0.2802 0.094062 0.32406 0.28437 -0.26341 0.11553 0.071918 -0.47215 -0.18366 -0.

In [None]:
# Scratch cell: a bare literal; it does not bind or change any variable.
1

In [None]:
# Always raises AssertionError.
# NOTE(review): presumably a deliberate stop so that "Run All" does not
# continue into the cells below - confirm, and remove before sharing.
assert False

In [8]:
# Preview: parse only the first 1000 lines to sanity-check the file format.
words, vectors = [], []

# Number of vector components, taken from the first line
# ("token v1 v2 ... vD" -> D = number of space-separated fields - 1).
dim = len(lines[0].rstrip("\n").split(" ")) - 1

for line in tqdm(lines[:1000]):
    # Split from the right on single spaces: the last `dim` fields are the
    # numbers, and whatever is left is the token, even if the token itself
    # contains whitespace (a plain .split() would break such a token apart).
    fields = line.rstrip("\n").rsplit(" ", dim)
    words.append(fields[0])
    vectors.append(fields[1:])

# Put vectors in a numpy array and add unk and pad
vectors = np.stack(vectors, axis=0).astype(np.float32)
# Both special rows are random. They are cast to float32 because
# standard_normal returns float64 and concatenating would otherwise upcast
# the whole matrix to float64.
vector_unk = np.random.standard_normal((1, vectors.shape[1])).astype(np.float32)
vector_pad = np.random.standard_normal((1, vectors.shape[1])).astype(np.float32)
vectors = np.concatenate((vector_unk, vector_pad, vectors))

In [18]:
def get_vocab_and_vectors(file_name, dim=None):
    """Load word vectors from a text file and build a matching Vocabulary.

    Every non-blank line is expected to hold a token followed by ``dim``
    space-separated numbers. Lines are parsed from the right, so a token
    that itself contains whitespace is kept intact.

    Args:
        file_name: path of the embedding text file (read as UTF-8).
        dim: number of vector components per line. When None, it is inferred
            from the first non-blank line as
            (number of space-separated fields - 1).

    Returns:
        A tuple ``(v, vectors, words)``:
          v:       Vocabulary built from the tokens; IDs 0 and 1 are
                   "<unk>" and "<pad>".
          vectors: float32 array of shape (len(words), dim). Rows 0 and 1
                   are random vectors for "<unk>" and "<pad>".
          words:   list of tokens; ``words[i]`` belongs to ``vectors[i]``.

    Raises:
        ValueError: if the file holds no vectors, if ``dim`` is smaller than
            1, or if a line does not have ``dim`` numeric fields.
    """
    reserved = ['<unk>', '<pad>']

    words, rows = [], []
    # Tokens that already own a row. Starting with the reserved tokens and
    # skipping repeats keeps row i of `vectors` aligned with vocabulary ID i:
    # Vocabulary sorts by frequency, so a repeated token would otherwise
    # shift every ID after it.
    seen = set(reserved)

    with io.open(file_name, 'r', encoding='utf-8') as f:
        for line_no, line in enumerate(tqdm(f), start=1):
            line = line.rstrip("\n")
            if not line.strip():
                continue  # ignore blank lines

            if dim is None:
                dim = len(line.split(" ")) - 1
            if dim < 1:
                raise ValueError(
                    "Cannot determine the vector size (dim=%r) at line %d of %r"
                    % (dim, line_no, file_name))

            # The last `dim` fields are the numbers; the remainder is the
            # token, whatever characters it contains.
            parts = line.rsplit(" ", dim)
            if len(parts) != dim + 1:
                raise ValueError(
                    "Line %d of %r has %d numeric fields, expected %d"
                    % (line_no, file_name, len(parts) - 1, dim))

            word = parts[0]
            if word in seen:
                continue
            seen.add(word)

            words.append(word)
            # Convert immediately: holding every number as a Python string
            # until the end costs far more memory than a float32 row.
            rows.append(np.asarray(parts[1:], dtype=np.float32))

    if not rows:
        raise ValueError("No word vectors found in %r" % (file_name,))

    # Put vectors in a numpy array and add unk and pad (both random).
    # The special rows are cast to float32 because standard_normal returns
    # float64, which would otherwise upcast the whole matrix on concatenate.
    vectors = np.stack(rows, axis=0)
    vector_unk = np.random.standard_normal((1, dim)).astype(np.float32)
    vector_pad = np.random.standard_normal((1, dim)).astype(np.float32)
    vectors = np.concatenate((vector_unk, vector_pad, vectors))

    # Create a vocabulary. Every token is counted exactly once, so build()
    # keeps the file order and the IDs line up with the rows of `vectors`.
    v = Vocabulary()
    for word in words:
        v.count_token(word)
    v.build()
    print("Vocabulary size:", len(v.w2i))

    words = reserved + words

    return v, vectors, words

# Load the full embedding file. Reuses `file_name` from the configuration
# cell above instead of repeating the path literal, so the two cannot drift.
v, vectors, words = get_vocab_and_vectors(file_name)

382418it [05:02, 5109.61it/s] 