In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import collections
import math
import os
import random
import zipfile

import numpy as np
from six.moves import urllib
from six.moves import xrange  # pylint: disable=redefined-builtin
import tensorflow as tf

# Step 1: Download the data.
# Base address; maybe_download() appends the requested file name to it to
# form the download URL.
url = 'http://mattmahoney.net/dc/'


def maybe_download(filename, expected_bytes):
  """Download a file if not present, and make sure it's the right size.

  Args:
    filename: Local file name. When no such file exists, the same name is
      appended to the module-level `url` to form the download address.
    expected_bytes: Size in bytes that the file on disk must have.

  Returns:
    The local file name of the verified file.

  Raises:
    Exception: If the size of the file on disk differs from `expected_bytes`.
  """
  if not os.path.exists(filename):
    # Percent-encode the name so characters that are not legal in a URL
    # (for example the spaces in 'Copy of ... .zip') do not break the request.
    # Names made only of URL-safe characters are left unchanged.
    remote_url = url + urllib.parse.quote(filename)
    filename, _ = urllib.request.urlretrieve(remote_url, filename)
  statinfo = os.stat(filename)
  # Dumps the full stat record of the file before checking its size.
  print(statinfo)
  if statinfo.st_size == expected_bytes:
    print('Found and verified', filename)
  else:
    # Show the actual size so it can be compared with expected_bytes.
    print(statinfo.st_size)
    raise Exception(
        'Failed to verify ' + filename + '. Can you get to it with a browser?')
  return filename

# Alternative input, currently disabled:
# filename = maybe_download('text8.zip', 31344016)
# Active input: fetched from `url` only if no local file with this name exists;
# 4300 is the byte size the file on disk is verified against.
filename = maybe_download('Copy of 001 ENLEAYA001.srt.txt.zip', 4300)


# Read the data into a list of strings.
def read_data(filename):
  """Return the whitespace-separated tokens of a zip archive's first member.

  Args:
    filename: Path of the zip file to open.

  Returns:
    A list of strings: the first archive member's content, passed through
    tf.compat.as_str and split on whitespace.
  """
  with zipfile.ZipFile(filename) as archive:
    # Only the first listed member is read; any others are ignored.
    first_member = archive.namelist()[0]
    raw_content = archive.read(first_member)
  return tf.compat.as_str(raw_content).split()



posix.stat_result(st_mode=33188, st_ino=8596354251, st_dev=16777220, st_nlink=1, st_uid=501, st_gid=20, st_size=4300, st_atime=1516560563, st_mtime=1516519897, st_ctime=1516519897)
Found and verified Copy of 001 ENLEAYA001.srt.txt.zip


In [3]:
# Load the corpus as a flat list of whitespace-separated tokens.
vocabulary = read_data(filename)
print('Data size', len(vocabulary))

# Step 2: Build the dictionary and replace rare words with UNK token.
# Upper bound on the number of dictionary entries, counting the 'UNK' slot;
# passed to build_dataset() as n_words.
vocabulary_size = 50000


def build_dataset(words, n_words):
  """Map a word sequence to integer ids over a frequency-capped vocabulary.

  Args:
    words: Sequence of word tokens.
    n_words: Vocabulary size limit; the 'UNK' entry plus the n_words - 1 most
      frequent words are kept.

  Returns:
    A tuple (data, count, dictionary, reversed_dictionary):
      data: list holding the id of every word in `words`, with 0 for words
        that are not in the dictionary.
      count: list starting with ['UNK', <number of words not in the
        dictionary>], followed by (word, frequency) tuples, most frequent
        first.
      dictionary: dict from word to id.
      reversed_dictionary: dict from id to word.
  """
  most_frequent = collections.Counter(words).most_common(n_words - 1)
  # The UNK entry is a list (not a tuple) so its tally can be filled in below.
  count = [['UNK', -1]] + most_frequent
  dictionary = {}
  for entry in count:
    # Ids are handed out in order of appearance in `count`.
    dictionary[entry[0]] = len(dictionary)
  data = []
  unk_count = 0
  for token in words:
    index = dictionary.get(token)
    if index is None:
      index = 0  # slot reserved for 'UNK'
      unk_count += 1
    data.append(index)
  count[0][1] = unk_count
  reversed_dictionary = {idx: word for word, idx in dictionary.items()}
  return data, count, dictionary, reversed_dictionary

# data: the corpus as integer ids; count: (word, frequency) entries with the
# UNK entry first; dictionary: word -> id; reverse_dictionary: id -> word.
data, count, dictionary, reverse_dictionary = build_dataset(vocabulary,
                                                            vocabulary_size)
del vocabulary  # Hint to reduce memory.
print('Most common words (+UNK)', count[:20])
# Show the first ten ids next to the words they decode to.
print('Sample data', data[:10], [reverse_dictionary[i] for i in data[:10]])

# NOTE(review): not read anywhere in this section; presumably a cursor into
# `data` for later batch generation. Confirm against the following cells.
data_index = 0

Data size 2073
Most common words (+UNK) [['UNK', 0], ('the', 89), ('and', 59), ('of', 41), ('to', 40), ('I', 37), ('they', 36), ('that', 31), ('with', 29), ('a', 25), ('you', 24), ('it', 23), ('Mm-hmm.', 23), ('be', 20), ('is', 20), ('um,', 20), ('have', 17), ('like,', 17), ('in', 16), ('because', 16)]
Sample data [137, 5, 24, 163, 136, 18, 295, 383, 10, 574] ['Okay,', 'I', 'can', 'see', 'two', 'in', 'here...', 'Can', 'you', 'read']
