In [1]:
import torch
import numpy as np
from torch import nn
import torch.nn.functional as F

In [2]:
with open('anna.txt','r') as f:
    text = f.read()

In [3]:
text[:100]

'Chapter 1\n\n\nHappy families are all alike; every unhappy family is unhappy in its own\nway.\n\nEverythin'

# Tokenization

In [4]:
chras = tuple(set(text))
int2char = dict(enumerate(chras))

char2int = {ch: ii for ii, ch in int2char.items()}

# encode the text
encoded = np.array([char2int[ch] for ch in text])



In [5]:
encoded[:100]


array([36, 12, 40, 82, 60, 17, 63, 13, 28, 64, 64, 64, 15, 40, 82, 82, 51,
       13, 23, 40, 75, 35, 39, 35, 17, 45, 13, 40, 63, 17, 13, 40, 39, 39,
       13, 40, 39, 35, 58, 17,  6, 13, 17,  2, 17, 63, 51, 13, 54, 14, 12,
       40, 82, 82, 51, 13, 23, 40, 75, 35, 39, 51, 13, 35, 45, 13, 54, 14,
       12, 40, 82, 82, 51, 13, 35, 14, 13, 35, 60, 45, 13, 25, 18, 14, 64,
       18, 40, 51, 65, 64, 64, 22,  2, 17, 63, 51, 60, 12, 35, 14])

In [6]:
def one_hot_encoder(arr, n_labels):
    one_hot = np.zeros((np.multiply(*arr.shape), n_labels), dtype=np.float32)

    one_hot[np.arange(one_hot.shape[0]), arr.flatten()] = 1.
    
    one_hot = one_hot.reshape((*arr.shape, n_labels))
    
    return one_hot

In [7]:
test_seq = np.array([(3, 5, 1)])
one_hot = one_hot_encoder(test_seq, 8)

print(one_hot)

[[[0. 0. 0. 1. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 1. 0. 0.]
  [0. 1. 0. 0. 0. 0. 0. 0.]]]


# Making training mini-batches

In [13]:
def get_batches(arr, batch_size, seq_length):
    '''Create a generator that returns batches of size
       batch_size x seq_length from arr.
       
       Arguments
       ---------
       arr: Array you want to make batches from
       batch_size: Batch size, the number of sequences per batch
       seq_length: Number of encoded chars in a sequence
    '''
    
    batch_size_total = batch_size * seq_length
    
    ## TODO: Get the number of batches we can make
    n_batches = len(arr)//batch_size_total
    
    ## TODO: Keep only enough characters to make full batches
    arr = arr[:n_batches * batch_size_total]
    
    ## TODO: Reshape into batch_size rows
    arr = arr.reshape((batch_size, -1)) 
    
    ## TODO: Iterate over the batches using a window of size seq_length
    for n in range(0, arr.shape[1], seq_length):
        # The features
        x = arr[:, n:n+seq_length]
        # The targets, shifted by one
        y = np.zeros_like(x)
        try:
            y[:, :-1], y[:, -1] = x[:, 1:], arr[:, n+seq_length]
        except IndexError:
            y[:, :-1], y[:, -1] = x[:, 1:], arr[:, 0]

        yield x, y

In [14]:
batches = get_batches(encoded,8, 50)
x, y= next(batches)

In [19]:
print('x\n', x[:, :10])
print('\ny\n', y[:10, :10])

x
 [[36 12 40 82 60 17 63 13 28 64]
 [45 25 14 13 60 12 40 60 13 40]
 [17 14 73 13 25 63 13 40 13 23]
 [45 13 60 12 17 13 29 12 35 17]
 [13 45 40 18 13 12 17 63 13 60]
 [29 54 45 45 35 25 14 13 40 14]
 [13 61 14 14 40 13 12 40 73 13]
 [46 70 39 25 14 45 58 51 65 13]]

y
 [[12 40 82 60 17 63 13 28 64 64]
 [25 14 13 60 12 40 60 13 40 60]
 [14 73 13 25 63 13 40 13 23 25]
 [13 60 12 17 13 29 12 35 17 23]
 [45 40 18 13 12 17 63 13 60 17]
 [54 45 45 35 25 14 13 40 14 73]
 [61 14 14 40 13 12 40 73 13 45]
 [70 39 25 14 45 58 51 65 13  9]]
