In [1]:
# Import TensorFlow >= 1.10 and enable eager execution
import tensorflow as tf

# Note: Once you enable eager execution, it cannot be disabled. 
tf.enable_eager_execution()

# Import other libraries
import numpy as np
import os
import re
import random
import time

In [2]:
text = open("./data/Jordan_Peterson_Corpus.txt").read()

In [3]:
print(len(text))

2478554


In [4]:
# unique contains all the unique characters in the file
unique = sorted(set(text))

print(unique)

['\n', ' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '<', '=', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', ']', '^', '_', '`', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '}', '~']


In [5]:
# creating a mapping from unique characters to indices
char2idx = {u:i for i, u in enumerate(unique)}
idx2char = {i:u for i, u in enumerate(unique)}

for pair in zip(char2idx, idx2char):
    print(pair)

('\n', 0)
(' ', 1)
('!', 2)
('"', 3)
('#', 4)
('$', 5)
('%', 6)
('&', 7)
("'", 8)
('(', 9)
(')', 10)
('*', 11)
('+', 12)
(',', 13)
('-', 14)
('.', 15)
('/', 16)
('0', 17)
('1', 18)
('2', 19)
('3', 20)
('4', 21)
('5', 22)
('6', 23)
('7', 24)
('8', 25)
('9', 26)
(':', 27)
(';', 28)
('<', 29)
('=', 30)
('?', 31)
('A', 32)
('B', 33)
('C', 34)
('D', 35)
('E', 36)
('F', 37)
('G', 38)
('H', 39)
('I', 40)
('J', 41)
('K', 42)
('L', 43)
('M', 44)
('N', 45)
('O', 46)
('P', 47)
('Q', 48)
('R', 49)
('S', 50)
('T', 51)
('U', 52)
('V', 53)
('W', 54)
('X', 55)
('Y', 56)
('Z', 57)
('[', 58)
(']', 59)
('^', 60)
('_', 61)
('`', 62)
('a', 63)
('b', 64)
('c', 65)
('d', 66)
('e', 67)
('f', 68)
('g', 69)
('h', 70)
('i', 71)
('j', 72)
('k', 73)
('l', 74)
('m', 75)
('n', 76)
('o', 77)
('p', 78)
('q', 79)
('r', 80)
('s', 81)
('t', 82)
('u', 83)
('v', 84)
('w', 85)
('x', 86)
('y', 87)
('z', 88)
('}', 89)
('~', 90)


In [14]:
# setting the maximum length sentence we want for a single input in characters
max_length = 100

# length of the vocabulary in chars
vocab_size = len(unique)

# the embedding dimension 
embedding_dim = 256

# number of RNN (here GRU) units
units = 1024

# batch size 
BATCH_SIZE = 64

# buffer size to shuffle our dataset
BUFFER_SIZE = 10000

In [15]:
input_text = []
target_text = []

# Get's 100 words at a time where inps is 0-100 for instance targ will be 1-101
for f in range(0, len(text)-max_length, max_length):
    inps = text[f:f+max_length]
    targ = text[f+1:f+1+max_length]

    # Replace the text in the given chunks with the index of the appropriate charachter
    input_text.append([char2idx[i] for i in inps])
    target_text.append([char2idx[t] for t in targ])

    
print (np.array(input_text).shape)
print (np.array(target_text).shape)

(24785, 100)
(24785, 100)


In [41]:
# -------------------------------------------------------------------------------------------------
#            Dataset is a CLASS in Tensorflow (from_tensor_slices allows us to take slices)
# -------------------------------------------------------------------------------------------------
#              * It creates a Dataset whose elements are slices of the given tensors *
#
# NOTE: If tensors contains a npy array, and eager execution is not enabled, the values will be embedded in the graph
#
# Arguments:
#    tensors: A nested structure of tensors, each having the same size in the 0th dimension.
#
# Returns:
#    Dataset: A Dataset
# -------------------------------------------------------------------------------------------------

# Create our dataset and shuffle accordingly (note that the input text and target text are what make up the dataset)
dataset = tf.data.Dataset.from_tensor_slices((input_text, target_text)).shuffle(BUFFER_SIZE)

# Break the dataset into batches (adds a dimension) and drop remiander if batches are not the proper size
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)

In [42]:
print(dataset)

<BatchDataset shapes: ((64, 100), (64, 100)), types: (tf.int32, tf.int32)>


In [28]:
class Model(tf.keras.Model):
    
    def __init__(self, vocab_size, embedding_dim, units, batch_size):
    
        super(Model, self).__init__()
        
        self.units = units
        self.batch_sz = batch_size

        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        
        if tf.test.is_gpu_available():
            self.gru = tf.keras.layers.CuDNNGRU(self.units, 
                                              return_sequences=True, 
                                              return_state=True, 
                                              recurrent_initializer='glorot_uniform')
        else:
            self.gru = tf.keras.layers.GRU(self.units, 
                                         return_sequences=True, 
                                         return_state=True, 
                                         recurrent_activation='sigmoid', 
                                         recurrent_initializer='glorot_uniform')

        self.fc = tf.keras.layers.Dense(vocab_size)

    def call(self, x, hidden):
        
        x = self.embedding(x)

        # output shape == (batch_size, max_length, hidden_size) 
        # states shape == (batch_size, hidden_size)

        # states variable to preserve the state of the model
        # this will be used to pass at every step to the model while training
        output, states = self.gru(x, initial_state=hidden)

        # reshaping the output so that we can pass it to the Dense layer
        # after reshaping the shape is (batch_size * max_length, hidden_size)
        output = tf.reshape(output, (-1, output.shape[2]))

        # The dense layer will output predictions for every time_steps(max_length)
        # output shape after the dense layer == (max_length * batch_size, vocab_size)
        x = self.fc(output)

        return x, states

In [48]:
model = Model(vocab_size, embedding_dim, units, BATCH_SIZE)

In [49]:
optimizer = tf.train.AdamOptimizer()

# using sparse_softmax_cross_entropy so that we don't have to create one-hot vectors
def loss_function(real, preds):
    return tf.losses.sparse_softmax_cross_entropy(labels=real, logits=preds)

In [50]:
checkpoint_dir = './training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                 model=model)

In [60]:
# Training step

EPOCHS = 25

for epoch in range(EPOCHS):
    start = time.time()
    
    # initializing the hidden state at the start of every epoch
    hidden = model.reset_states()
    
    for (batch, (inp, target)) in enumerate(dataset):
        with tf.GradientTape() as tape:
            # feeding the hidden state back into the model
            # This is the interesting step
            predictions, hidden = model(inp, hidden)

            # reshaping the target because that's how the 
            # loss function expects it
            target = tf.reshape(target, (-1,))
            loss = loss_function(target, predictions)
              
            grads = tape.gradient(loss, model.variables)
            optimizer.apply_gradients(zip(grads, model.variables))

            if batch % 100 == 0:
                print('Epoch {} Batch {} Loss {:.4f}'.format(epoch+1, batch, loss))
    
    # saving (checkpoint) the model every 5 epochs
    if (epoch + 1) % 5 == 0:
        checkpoint.save(file_prefix = checkpoint_prefix)

    print ('Epoch {} Loss {:.4f}'.format(epoch+1, loss))
    print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

Epoch 1 Batch 0 Loss 0.9135
Epoch 1 Batch 100 Loss 0.9669
Epoch 1 Batch 200 Loss 0.9719
Epoch 1 Batch 300 Loss 1.0085
Epoch 1 Loss 0.9956
Time taken for 1 epoch 15.225118398666382 sec

Epoch 2 Batch 0 Loss 0.8784
Epoch 2 Batch 100 Loss 0.9487
Epoch 2 Batch 200 Loss 0.9873
Epoch 2 Batch 300 Loss 1.0212
Epoch 2 Loss 1.0188
Time taken for 1 epoch 15.268008708953857 sec

Epoch 3 Batch 0 Loss 0.9151
Epoch 3 Batch 100 Loss 0.9587
Epoch 3 Batch 200 Loss 0.9879
Epoch 3 Batch 300 Loss 1.0265
Epoch 3 Loss 1.0021
Time taken for 1 epoch 15.327991724014282 sec

Epoch 4 Batch 0 Loss 0.9538
Epoch 4 Batch 100 Loss 0.9298
Epoch 4 Batch 200 Loss 0.9861
Epoch 4 Batch 300 Loss 1.0195
Epoch 4 Loss 1.0549
Time taken for 1 epoch 15.275188684463501 sec

Epoch 5 Batch 0 Loss 0.9390
Epoch 5 Batch 100 Loss 0.9764
Epoch 5 Batch 200 Loss 1.0219
Epoch 5 Batch 300 Loss 1.0369
Epoch 5 Loss 1.0156
Time taken for 1 epoch 15.68049669265747 sec

Epoch 6 Batch 0 Loss 0.9355
Epoch 6 Batch 100 Loss 0.9953
Epoch 6 Batch 200 

In [61]:
# restoring the latest checkpoint in checkpoint_dir
checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))

<tensorflow.python.training.checkpointable.util.CheckpointLoadStatus at 0x7fbf98153ac8>

In [62]:
# Evaluation step(generating text using the model learned)

# number of characters to generate
num_generate = 1000

# You can change the start string to experiment
start_string = 'q'
# converting our start string to numbers(vectorizing!) 
input_eval = [char2idx[s] for s in start_string]
input_eval = tf.expand_dims(input_eval, 0)

# empty string to store our results
text_generated = ''

# hidden state shape == (batch_size, number of rnn units); here batch size == 1
hidden = [tf.zeros((1, units))]
for i in range(num_generate):
    predictions, hidden = model(input_eval, hidden)

    # using argmax to predict the word returned by the model
    predicted_id = tf.argmax(predictions[-1]).numpy()
    
    # We pass the predicted word as the next input to the model
    # along with the previous hidden state
    input_eval = tf.expand_dims([predicted_id], 0)
    
    text_generated += idx2char[predicted_id]

print (start_string + text_generated)

quence for the relize house and the constical und the 
constical and not the constions and the somethe the servers of the in ad some a precion and the 
come the is and 































































































































































































































































































































































































































































































































































































































































































































































































































































