In [1]:
import os
import sys
import tensorflow as tf
import zipfile

from six.moves.urllib.request import urlretrieve
from collections import Counter

In [2]:
# Working directories: the downloaded corpus goes under `top_data_folder`,
# run logs go under `logs_folder`.
top_data_folder = './data/text8/'
logs_folder = './logs'


def path_to(f):
    """Return the path of `f` joined onto the top-level data folder."""
    data_file_path = os.path.join(top_data_folder, f)
    return data_file_path


# Create each working directory only when it is not already present, so
# re-running this cell leaves existing folders (and their contents) alone.
if not os.path.exists(top_data_folder):
    os.makedirs(top_data_folder)

if not os.path.exists(logs_folder):
    os.makedirs(logs_folder)

In [3]:
class DownloadProgress:
    """Callable progress reporter, usable as the `reporthook` of `urlretrieve`.

    Each time the whole-number percentage changes, writes either the
    percentage (when it is a multiple of 5) or a single dot to stdout.
    """

    def __init__(self):
        # Last percentage that was written out; None until the first report.
        self.last_percent_reported = None

    def __call__(self, count, blockSize, totalSize):
        """Report progress after `count` blocks of `blockSize` bytes.

        Args:
            count: number of blocks transferred so far.
            blockSize: size of one block, in bytes.
            totalSize: total size of the download in bytes; a value <= 0
                means the size is unknown and nothing is reported.
        """
        if totalSize <= 0:
            # The hook can be called with an unknown total (-1); a percentage
            # is meaningless then, and 0 would divide by zero.
            return

        # The last block usually overshoots the total, so cap at 100.
        percent = min(100, count * blockSize * 100 // totalSize)

        if percent == self.last_percent_reported:
            return

        if percent % 5 == 0:
            sys.stdout.write("%s%%" % percent)
        else:
            sys.stdout.write(".")
        sys.stdout.flush()

        self.last_percent_reported = percent

In [4]:
base_url = 'http://mattmahoney.net/dc/'
zip_file = 'text8.zip'
zip_path = path_to(zip_file)

def download_zip():
    """Download the corpus archive to `zip_path` unless it is already there.

    The archive is first written to a temporary `.part` file next to the
    destination and only renamed to `zip_path` once the transfer finished.
    An interrupted or failed download therefore never leaves a truncated
    file at `zip_path` that the existence check would mistake for a
    complete one on the next run.
    """
    url = base_url + zip_file

    if os.path.exists(zip_path):
        return

    partial_path = zip_path + '.part'
    try:
        urlretrieve(url, partial_path, reporthook=DownloadProgress())
        os.rename(partial_path, zip_path)
    finally:
        # After a successful rename the partial file no longer exists; this
        # only removes leftovers from a failed or interrupted transfer.
        if os.path.exists(partial_path):
            os.remove(partial_path)

download_zip()

In [5]:
def read_data(zip_filename):
    """Read the first member of a zip archive and split it into words.

    Args:
        zip_filename: path of the zip archive to read.

    Returns:
        The whitespace-separated tokens of the first file in the archive,
        after passing its raw content through `tf.compat.as_str`.

    Raises:
        ValueError: if the archive contains no members.
    """
    with zipfile.ZipFile(zip_filename) as z:
        members = z.namelist()
        if not members:
            raise ValueError('%s contains no files' % zip_filename)
        content = z.read(members[0])
    return tf.compat.as_str(content).split()

# Pass the archive path itself; the function reads whatever file it is given.
words = read_data(zip_path)
print('Number of words: %d' % len(words))

Number of words: 17005207


In [6]:
class WordsDataset:
    """Integer encoding of a word sequence with a capped vocabulary.

    The `vocabulary_size - 1` most frequent words receive ids starting at 1,
    in order of decreasing frequency; every other word is encoded as 0.
    """

    def __init__(self, words, vocabulary_size):
        """Build the vocabulary and encode `words`.

        Args:
            words: iterable of hashable tokens. A one-shot iterator is
                materialised into a list first, because the tokens are
                traversed twice (counting, then encoding).
            vocabulary_size: total number of ids, including the shared
                id 0 used for out-of-vocabulary words.
        """
        # An iterator returns itself from iter(); it would be exhausted by
        # the counting pass and leave `numbers` empty, so store a list.
        if iter(words) is words:
            words = list(words)

        # The token sequence as given (or the materialised list).
        self.words = words
        # Frequency of every distinct token.
        self._counter = Counter(words)
        # (word, count) pairs for the words that get their own id.
        self._words_to_keep = self._counter.most_common(vocabulary_size - 1)

        # word -> id and id -> word; id 0 is reserved and appears in neither.
        self.dictionary = {}
        self.inverse_dictionary = {}
        for index, (word, _) in enumerate(self._words_to_keep, start=1):
            self.dictionary[word] = index
            self.inverse_dictionary[index] = word

        # `words` encoded as ids, with 0 for anything outside the vocabulary.
        self.numbers = [self.dictionary.get(w, 0) for w in words]

In [7]:
# Total number of word ids, including id 0 which WordsDataset shares among
# all words outside the most frequent ones.
VOCABULARY_SIZE = 50000

data = WordsDataset(words, vocabulary_size=VOCABULARY_SIZE)