<a href="https://colab.research.google.com/github/ctclumak/Tensorflow-2-and-Keras-Deep-Learning/blob/master/NPL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

**Part 1: The data**
- import then main libraries
- importing text
- understanding the characters

In [2]:
%tensorflow_version 2.x
import tensorflow as tf
path_to_file = "shakespeare.txt"

TensorFlow 2.x selected.


In [0]:
text = open(path_to_file,'r').read()

In [4]:
print(text[:500])


                     1
  From fairest creatures we desire increase,
  That thereby beauty's rose might never die,
  But as the riper should by time decease,
  His tender heir might bear his memory:
  But thou contracted to thine own bright eyes,
  Feed'st thy light's flame with self-substantial fuel,
  Making a famine where abundance lies,
  Thy self thy foe, to thy sweet self too cruel:
  Thou that art now the world's fresh ornament,
  And only herald to the gaudy spring,
  Within thine own bu


In [0]:
#get all the unique characters of the text
vocab = sorted(set(text))

In [6]:
len(vocab)

84

**Part 2: Text Processing**
- Vectorize the text
- Create encoding dictionary

In [0]:
# characters to index code
 #for pair in enumerate(vocab):
  #print(pair)
  char_to_ind = {char:ind for ind, char in enumerate(vocab) }

In [8]:
char_to_ind["H"]

33

In [0]:
ind_to_char = np.array(vocab)

In [10]:
ind_to_char[33]

'H'

In [0]:
encoded_text = np.array([char_to_ind[c] for c in text])

In [12]:
encoded_text.shape

(5445609,)

In [0]:
# make a sample text
sample = text[:500]

In [14]:
sample

"\n                     1\n  From fairest creatures we desire increase,\n  That thereby beauty's rose might never die,\n  But as the riper should by time decease,\n  His tender heir might bear his memory:\n  But thou contracted to thine own bright eyes,\n  Feed'st thy light's flame with self-substantial fuel,\n  Making a famine where abundance lies,\n  Thy self thy foe, to thy sweet self too cruel:\n  Thou that art now the world's fresh ornament,\n  And only herald to the gaudy spring,\n  Within thine own bu"

In [15]:
encoded_text[:500]

array([ 0,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1, 12,  0,  1,  1, 31, 73, 70, 68,  1, 61, 56, 64,
       73, 60, 74, 75,  1, 58, 73, 60, 56, 75, 76, 73, 60, 74,  1, 78, 60,
        1, 59, 60, 74, 64, 73, 60,  1, 64, 69, 58, 73, 60, 56, 74, 60,  8,
        0,  1,  1, 45, 63, 56, 75,  1, 75, 63, 60, 73, 60, 57, 80,  1, 57,
       60, 56, 76, 75, 80,  5, 74,  1, 73, 70, 74, 60,  1, 68, 64, 62, 63,
       75,  1, 69, 60, 77, 60, 73,  1, 59, 64, 60,  8,  0,  1,  1, 27, 76,
       75,  1, 56, 74,  1, 75, 63, 60,  1, 73, 64, 71, 60, 73,  1, 74, 63,
       70, 76, 67, 59,  1, 57, 80,  1, 75, 64, 68, 60,  1, 59, 60, 58, 60,
       56, 74, 60,  8,  0,  1,  1, 33, 64, 74,  1, 75, 60, 69, 59, 60, 73,
        1, 63, 60, 64, 73,  1, 68, 64, 62, 63, 75,  1, 57, 60, 56, 73,  1,
       63, 64, 74,  1, 68, 60, 68, 70, 73, 80, 21,  0,  1,  1, 27, 76, 75,
        1, 75, 63, 70, 76,  1, 58, 70, 69, 75, 73, 56, 58, 75, 60, 59,  1,
       75, 70,  1, 75, 63

**Part 3: Creating Batches**
- Understand text sequences
- Use Tensorflow datasets to generate batches
- Shuffle batches

we need to make sure our training sequences are long enough that they'll actually be able to pick up the general structure of the text.So we should probably have at least three lines in order to try to understand that structure.

So if on average each line is around 40 characters and three lines is one one hundred and thirty three characters probably let's go ahead and choose our sequence length of one hundred and twenty characters.

In [0]:
seq_len = 120
total_num_seq = len(text) // (seq_len+1)

In [17]:
total_num_seq

45005

In [0]:
# create the training sequency
char_dataset = tf.data.Dataset.from_tensor_slices(encoded_text)

In [19]:
type(char_dataset)

tensorflow.python.data.ops.dataset_ops.TensorSliceDataset

In [0]:
#we use seq_len+1 because of 0 index
sequences = char_dataset.batch(seq_len+1, drop_remainder=True)

In [0]:
def create_seq_targets(seq):
  input_txt = seq[:-1]
  target_txt = seq[1:]
  return input_txt, target_txt

In [0]:
dataset = sequences.map(create_seq_targets)

In [23]:
for input_txt, target_txt in  dataset.take(1):
    print(input_txt.numpy())
    print(''.join(ind_to_char[input_txt.numpy()]))
    print('\n')
    print(target_txt.numpy())
    # There is an extra whitespace!
    print(''.join(ind_to_char[target_txt.numpy()]))

[ 0  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1 12  0
  1  1 31 73 70 68  1 61 56 64 73 60 74 75  1 58 73 60 56 75 76 73 60 74
  1 78 60  1 59 60 74 64 73 60  1 64 69 58 73 60 56 74 60  8  0  1  1 45
 63 56 75  1 75 63 60 73 60 57 80  1 57 60 56 76 75 80  5 74  1 73 70 74
 60  1 68 64 62 63 75  1 69 60 77 60 73  1 59 64 60  8  0  1  1 27 76 75]

                     1
  From fairest creatures we desire increase,
  That thereby beauty's rose might never die,
  But


[ 1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1 12  0  1
  1 31 73 70 68  1 61 56 64 73 60 74 75  1 58 73 60 56 75 76 73 60 74  1
 78 60  1 59 60 74 64 73 60  1 64 69 58 73 60 56 74 60  8  0  1  1 45 63
 56 75  1 75 63 60 73 60 57 80  1 57 60 56 76 75 80  5 74  1 73 70 74 60
  1 68 64 62 63 75  1 69 60 77 60 73  1 59 64 60  8  0  1  1 27 76 75  1]
                     1
  From fairest creatures we desire increase,
  That thereby beauty's rose might never die,
  But 


In [0]:
# create the training batches, choose batch_size
# set buffer_size, is to take 10000 of the elements and shuffle it 
batch_size = 128
buffer_size = 10000
dataset = dataset.shuffle(buffer_size).batch(batch_size, drop_remainder = True)

In [25]:
dataset

<BatchDataset shapes: ((128, 120), (128, 120)), types: (tf.int64, tf.int64)>

**Part 4: Creating the model**
- set up loss function
- Create model
  - Embedding
  - GRU
  - Dense

In [0]:
vocab_size = len(vocab)

In [27]:
vocab_size

84

In [0]:
embed_dim = 64

In [0]:
rnn_neurons = 1026 # sinle layer with lots neurons

In [0]:
#create a loss function
from tensorflow.keras.losses import sparse_categorical_crossentropy

In [0]:
def sparse_cat_loss(y_true,y_pred):
  return sparse_categorical_crossentropy(y_true,
                                         y_pred, 
                                         from_logits = True 
                                         )

In [0]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GRU,Dense

In [0]:
def create_model(vocab_size,embed_dim,rnn_neurons,batch_size):
  model = Sequential()
  #vocab_size is the input dimension, embed_dim is the output dimension
  model.add(Embedding(vocab_size, embed_dim, batch_input_shape=[batch_size,None]))
  #return_sequences and stateful are set to true to have the layer to 
  #return the last output or the full sequence and the
  #last state in addiontion to the output

  model.add(GRU(rnn_neurons,
                return_sequences=True,
                stateful = True,
                recurrent_initializer = "glorot_uniform"))
  # Dense is the different vocab_size,then we compile the model
  model.add(Dense(vocab_size))
  model.compile('adam', loss=sparse_cat_loss)

  return model

In [0]:
model = create_model(vocab_size=vocab_size,
                     embed_dim = embed_dim,
                     rnn_neurons=rnn_neurons,
                     batch_size=batch_size)

In [35]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (128, None, 64)           5376      
_________________________________________________________________
gru (GRU)                    (128, None, 1026)         3361176   
_________________________________________________________________
dense (Dense)                (128, None, 84)           86268     
Total params: 3,452,820
Trainable params: 3,452,820
Non-trainable params: 0
_________________________________________________________________


**Part 5: Training the model**

In [0]:
for input_example_batch, target_example_batch in dataset.take(1):

  # Predict off some random batch
  example_batch_predictions = model(input_example_batch)

In [37]:
example_batch_predictions.shape

TensorShape([128, 120, 84])

In [0]:
sampled_indices = tf.random.categorical(example_batch_predictions[0], num_samples=1)

In [0]:
sampled_indices= tf.squeeze(sampled_indices, axis = -1).numpy()

In [0]:
#ind_to_char[sampled_indices]

In [0]:
# train the mode;
epochs =  30
#model.fit(dataset, epochs = epochs)
model.save('shakespeare_gen.h5') 

In [0]:
#because the model takes long time to train, so we will load a pretrained model instead


from tensorflow.keras.models import load_model
model = create_model(vocab_size, embed_dim, rnn_neurons, batch_size=1)

model.load_weights('shakespeare_gen.h5')

model.build(tf.TensorShape([1, None]))

In [69]:
model.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (1, None, 64)             5376      
_________________________________________________________________
gru_4 (GRU)                  (1, None, 1026)           3361176   
_________________________________________________________________
dense_4 (Dense)              (1, None, 84)             86268     
Total params: 3,452,820
Trainable params: 3,452,820
Non-trainable params: 0
_________________________________________________________________


In [0]:
def generate_text(model,start_seed, gen_size = 500, temp=1.0):
  num_generate = gen_size
  input_eval = [char_to_ind [s] for s in start_seed]
  input_eval = tf.expand_dims(input_eval,0)

  text_generated = []
  temperature = temp

  model.reset_states()
  for i in range(num_generate):
    predictions = model(input_eval)
    predictions = tf.squeeze(predictions, 0)
    predictions = predictions/temperature
    predicted_id = tf.random.categorical(predictions, num_samples=1)[-1,0].numpy()
    
    input_eval = tf.expand_dims([predicted_id],0)
    text_generated.append(ind_to_char[predicted_id])

    return (start_seed + ''.join(text_generated))


In [71]:
print(generate_text(model, "JULIET", gen_size=1000))



JULIETH


flowere
