### Text Preprocessing

In [2]:
import numpy as np
import random

with open('timemachine.txt', 'r') as f:
    lines = f.readlines()
    raw_dataset = ' '.join(' '.join(lines).lower().split())

print('number of characters: ', len(raw_dataset))
print(raw_dataset[0:70])

number of characters:  197509
project gutenberg's the time machine, by h. g. (herbert george) wells 


### Character Index

In [3]:
idx_to_char = list(set(raw_dataset))
char_to_idx = dict([(char, i) for i, char in enumerate(idx_to_char)])
vocab_size = len(char_to_idx)
print(char_to_idx)

{'?': 0, 'i': 1, 'x': 2, 'h': 3, 'w': 4, 'p': 5, 't': 6, '"': 7, '9': 8, '@': 9, '[': 10, 'd': 11, 'c': 12, '_': 13, "'": 14, '$': 15, 'n': 16, '8': 17, '.': 18, '4': 19, 'k': 20, ' ': 21, 'e': 22, 'b': 23, '6': 24, ',': 25, 'o': 26, '5': 27, '-': 28, 'f': 29, '2': 30, 'r': 31, 'u': 32, '0': 33, ']': 34, 'q': 35, 'a': 36, 'y': 37, ':': 38, 'l': 39, '(': 40, '/': 41, '%': 42, 's': 43, 'z': 44, '1': 45, ')': 46, '#': 47, ';': 48, '!': 49, '*': 50, '7': 51, 'v': 52, 'm': 53, 'g': 54, '3': 55, 'j': 56}


Converting it back to text

In [4]:
corpus_indices = [char_to_idx[char] for char in raw_dataset]
sample = corpus_indices[:20]
print('chars:', ''.join([idx_to_char[idx] for idx in sample]))
print('indices:', sample)

chars: project gutenberg's 
indices: [5, 31, 26, 56, 22, 12, 6, 21, 54, 32, 6, 22, 16, 23, 22, 31, 54, 14, 43, 21]


### Random Sampling

In [5]:
def data_iter_random(corpus_indices, batch_size, num_steps):
    # offset for the iterator over the data for uniform starts
    offset = int(random.uniform(0,num_steps))
    corpus_indices = corpus_indices[offset:]
    # subtract 1 extra since we need to account for the sequence length
    num_examples = ((len(corpus_indices) - 1) // num_steps) - 1
    # discard half empty batches
    num_batches = num_examples // batch_size
    example_indices = list(range(0, num_examples * num_steps, num_steps))
    random.shuffle(example_indices)
    
    # This returns a sequence of the length num_steps starting from pos.
    def _data(pos):
        return corpus_indices[pos: pos + num_steps]

    for i in range(0, batch_size * num_batches, batch_size):
        # batch_size indicates the random examples read each time.
        batch_indices = example_indices[i:(i+batch_size)]
        X = [_data(j) for j in batch_indices]
        Y = [_data(j + 1) for j in batch_indices]       

        yield np.array(X), np.array(Y)

### Example

Batch size 2 and time steps is 5 for a sequence of length 30.

In [6]:
my_seq = list(range(30))
for X, Y in data_iter_random(my_seq, batch_size=2, num_steps=5):
    print('X: ', X, '\nY:', Y)

X:  [[12 13 14 15 16]
 [ 2  3  4  5  6]] 
Y: [[13 14 15 16 17]
 [ 3  4  5  6  7]]
X:  [[ 7  8  9 10 11]
 [17 18 19 20 21]] 
Y: [[ 8  9 10 11 12]
 [18 19 20 21 22]]


### Sequential partitioning

Adjacent positioning of minibatches. This way we can retain the latent state between batches. 

In [7]:
def data_iter_consecutive(corpus_indices, batch_size, num_steps):
    # offset for the iterator over the data for uniform starts
    offset = int(random.uniform(0,num_steps))
    # slice out data - ignore num_steps and just wrap around
    num_indices = ((len(corpus_indices) - offset) // batch_size) * batch_size
    indices = np.array(corpus_indices[offset:(offset + num_indices)])
    indices = indices.reshape((batch_size,-1))
    # need to leave one last token since targets are shifted by 1
    num_epochs = ((num_indices // batch_size) - 1) // num_steps

    for i in range(0, num_epochs * num_steps, num_steps):
        X = indices[:,i:(i+num_steps)]
        Y = indices[:,(i+1):(i+1+num_steps)]
        yield X, Y

### Example partitioning

In [8]:
for X, Y in data_iter_consecutive(my_seq, batch_size=2, num_steps=6):
    print('X: ', X, '\nY:', Y)

X:  [[ 5  6  7  8  9 10]
 [17 18 19 20 21 22]] 
Y: [[ 6  7  8  9 10 11]
 [18 19 20 21 22 23]]
