# Word2Vec

In [1]:
import collections
import math
import os
import errno
import random
import zipfile

import numpy as np
from six.moves import urllib
from six.moves import xrange
import tensorflow as tf

In [2]:
# Local directory in which the downloaded corpus archive is stored.
data_dir = "word2vec_data/words/"
# Remote location of the text8 corpus zip archive.
data_url = "http://mattmahoney.net/dc/text8.zip"

In [3]:
def fetch_words_data(url=data_url, words_data=data_dir):
    """Download the corpus archive if needed and return its words as a list.

    Parameters
    ----------
    url : str
        Location of the zip archive to download when no local copy exists.
    words_data : str
        Directory in which the archive is stored under the name ``words.zip``.

    Returns
    -------
    list of str
        The whitespace-separated tokens of the first member of the archive.

    Raises
    ------
    UnicodeDecodeError
        If the archive member contains non-ASCII bytes.
    """
    # Create the target directory if it does not exist. This has to be the
    # `words_data` argument rather than the global `data_dir`; otherwise a
    # caller-supplied directory is never created and the download fails.
    os.makedirs(words_data, exist_ok=True)

    # Path to the local copy of the zip file.
    zip_path = os.path.join(words_data, "words.zip")

    # If the zip file isn't there, download it from the data url.
    if not os.path.exists(zip_path):
        # Download under a temporary name and rename only on success, so an
        # interrupted transfer cannot leave a truncated "words.zip" that
        # would pass the existence check on the next run.
        partial_path = zip_path + ".part"
        urllib.request.urlretrieve(url, partial_path)
        os.replace(partial_path, zip_path)

    # Now that the zip file is there, get the data from its first member.
    with zipfile.ZipFile(zip_path) as f:
        data = f.read(f.namelist()[0])

    # Return a list of all the words in the data source.
    return data.decode("ascii").split()

In [4]:
# Load the corpus (downloading the archive on first use) as a flat list of words.
words = fetch_words_data()

In [5]:
# Total number of word tokens in the corpus.
len(words)

17005207

In [6]:
# Inspect a 40-word window of the corpus as a list.
words[9000:9040]

['feelings',
 'and',
 'the',
 'auditory',
 'system',
 'of',
 'a',
 'person',
 'without',
 'autism',
 'often',
 'cannot',
 'sense',
 'the',
 'fluctuations',
 'what',
 'seems',
 'to',
 'non',
 'autistic',
 'people',
 'like',
 'a',
 'high',
 'pitched',
 'sing',
 'song',
 'or',
 'flat',
 'robot',
 'like',
 'voice',
 'is',
 'common',
 'in',
 'autistic',
 'children',
 'some',
 'autistic',
 'children']

In [11]:
# Show the same 40-word window as running text: each word is followed by a
# single space and no newline is emitted.
print("".join(f"{token} " for token in words[9000:9040]), end="")

feelings and the auditory system of a person without autism often cannot sense the fluctuations what seems to non autistic people like a high pitched sing song or flat robot like voice is common in autistic children some autistic children 

In [12]:
from collections import Counter

In [13]:
def create_counts(vocab_size=50000, word_list=None):
    """Build a frequency-ranked vocabulary and encode the corpus with it.

    Parameters
    ----------
    vocab_size : int
        Maximum number of distinct words kept, most frequent first.
    word_list : sequence of str, optional
        The corpus to encode. It is iterated twice, so it must be a
        re-iterable sequence rather than a one-shot iterator. When omitted,
        the notebook-level ``words`` list is used.

    Returns
    -------
    data : numpy.ndarray
        One integer code per word of the corpus, in corpus order.
    vocab : numpy.ndarray
        The kept words; ``vocab[code]`` is the word for ``code``.

    Notes
    -----
    Words that did not make it into the vocabulary are encoded as ``0``,
    which is also the code of the most frequent word.
    """
    if word_list is None:
        # Backward-compatible default: fall back to the corpus loaded into
        # the notebook-level `words` variable.
        word_list = words

    # (word, count) pairs sorted by descending count; only the words are kept.
    ranked = Counter(word_list).most_common(vocab_size)
    kept_words = [word for word, _ in ranked]
    vocab = np.array(kept_words)

    # A word's code is its rank in the frequency ordering.
    dictionary = {word: code for code, word in enumerate(kept_words)}

    # Out-of-vocabulary words fall back to code 0 (see Notes).
    data = np.array([dictionary.get(word, 0) for word in word_list])

    return data, vocab

In [14]:
# Encode the corpus with the default vocabulary size: `data` holds one integer
# code per word, `vocabulary` holds the kept words indexed by code.
data, vocabulary = create_counts()

In [15]:
# One code per corpus word, so this should match len(words).
data.shape

(17005207,)

In [16]:
# Number of distinct words kept in the vocabulary.
vocabulary.shape

(50000,)

In [17]:
# Integer code assigned to the word at corpus position 100.
data[100]

4186

In [18]:
# The word at corpus position 100, for comparison with its code in data[100].
words[100]

'interpretations'