In [1]:
import collections
import sys

import numpy as np
import tensorflow as tf

In [7]:
DATA_DIR = 'data'
alice = f'{DATA_DIR}/alice_in_wonderland.txt'

In [9]:
def get_words(file_name):
    with open(file_name) as f:
        all_lines = f.readlines()
    stripped_lines = [x.strip() for x in all_lines]
    words = []
    for line in stripped_lines:
        words.extend(line.split())
    return np.array(words)

In [13]:
def build_dict(words):
    most_common_words = collections.Counter(words).most_common()
    word2id = {word: id_ 
               for (id_, (word, _)) in enumerate(most_common_words)}
    id2word = {id_: word
               for (id_, (word, _)) in enumerate(most_common_words)}
    return most_common_words, word2id, id2word

In [16]:
words = get_words(alice)
words[:10]

array(['\ufeffProject', 'Gutenberg’s', 'Alice’s', 'Adventures', 'in',
       'Wonderland,', 'by', 'Lewis', 'Carroll', 'This'], dtype='<U50')

In [17]:
most_common_words, word2id, id2word = build_dict(words)
most_common_words_len = len(most_common_words)
most_common_words_len

6019

In [19]:
section_len = 20 # number of sequential (one-hot encoded) words to use

In [26]:
def get_input_output(words):
    input_values = []
    output_values = []
    n_sections = 0
    for i in range(len(words) - section_len):
        input_values.append(words[i:i + section_len])
        output_values.append(words[i + section_len])
        n_sections += 1
    one_hot_inputs = np.zeros(
        (n_sections, section_len, most_common_words_len))
    one_hot_outputs = np.zeros((n_sections, most_common_words_len))
    for s, section in enumerate(input_values):
        for w, word in enumerate(section):
            one_hot_inputs[s, w, word2id[word]] = 1.
        one_hot_outputs[s, word2id[output_values[s]]] = 1.
    return one_hot_inputs, one_hot_outputs

In [27]:
X_train, y_train = get_input_output(words)