import relevant libraries

In [1]:
import numpy as np
import tensorflow as tf
import os
import glob

grab all the text files

the works of shakespeare and lovecraft can be found here:

[The complete works of Shakespeare](https://www.thecompleteworksofshakespeare.com/)

[Electronic Texts of H.P. Lovecraft’s Works](https://www.hplovecraft.com/writings/texts/)

In [2]:
all_txt = glob.glob(os.path.join(os.getcwd(),"stories/*.txt"))

In [3]:
all_txt

['/home/henning/work/playground_2/shakecraft/stories/hp_the_colour_out_of_space.txt',
 '/home/henning/work/playground_2/shakecraft/stories/shk_the_gentlemen_of_verona.txt',
 '/home/henning/work/playground_2/shakecraft/stories/hp_the_shadow_over_innsmouth.txt',
 '/home/henning/work/playground_2/shakecraft/stories/hp_what_the_moon_brings.txt',
 '/home/henning/work/playground_2/shakecraft/stories/hp_the_case_of_charles_dexter_ward.txt',
 '/home/henning/work/playground_2/shakecraft/stories/hp_the_battle_that_ended_the_century.txt',
 '/home/henning/work/playground_2/shakecraft/stories/hp_the_silver_key.txt',
 '/home/henning/work/playground_2/shakecraft/stories/shk_julius_casesar.txt',
 '/home/henning/work/playground_2/shakecraft/stories/hp_the_horror_in_the_burying_ground.txt',
 '/home/henning/work/playground_2/shakecraft/stories/hp_the_horror_in_the_museum.txt',
 '/home/henning/work/playground_2/shakecraft/stories/hp_the_secret_cave.txt',
 '/home/henning/work/playground_2/shakecraft/storie

join all the texts together

In [4]:
big_txt = []

for tmp_txt in all_txt:
    with open(tmp_txt,'r') as f:
        big_txt.append(f.read())

In [5]:
complete_txt = " ".join(big_txt)

In [6]:
vocab = sorted(set(complete_txt))

make it so that each character has a corresponding index

In [8]:
char_to_idx = {char:idx for idx,char in enumerate(vocab)}

In [9]:
idx_to_char = np.array(vocab)

In [10]:
encoded_txt = np.array([char_to_idx[c] for c in complete_txt])

grab some text to get a feel of how long each sequence should be

In [11]:
print(complete_txt[:500])

West of Arkham the hills rise wild, and there are valleys with deep woods that no axe has ever cut. There are dark narrow glens where the trees slope fantastically, and where thin brooklets trickle without ever having caught the glint of sunlight. On the gentler slopes there are farms, ancient and rocky, with squat, moss-coated cottages brooding eternally over old New England secrets in the lee of great ledges; but these are all vacant now, the wide chimneys crumbling and the shingled sides bulg


In [12]:
some_part = '''
West of Arkham the hills rise wild, and there are valleys with deep woods 
that no axe has ever cut. There are dark narrow glens where the trees slope 
fantastically, and where thin brooklets trickle without ever having caught 
the glint of sunlight.'''

In [13]:
len(some_part)

251

In [14]:
seq_len = 250
total_num_seq = len(complete_txt)//(seq_len+1)

In [15]:
total_num_seq

13538

make a dataset with an input and target text

In [16]:
char_dataset = tf.data.Dataset.from_tensor_slices(encoded_txt)

In [17]:
sequences = char_dataset.batch(seq_len+1,drop_remainder=True)

In [18]:
def create_seq_targets(seq):
    input_txt = seq[:-1] # woof woo
    target_txt = seq[1:] # oof woof
    return input_txt, target_txt

In [19]:
dataset = sequences.map(create_seq_targets)

grab a sequence from the dataset and see how the input and target looks like

In [20]:
for input_txt, target_txt in  dataset.take(1):
    print(input_txt.numpy())
    print(''.join(idx_to_char[input_txt.numpy()]))
    print('\n')
    print(target_txt.numpy())
    print(''.join(idx_to_char[target_txt.numpy()]))

[48 58 72 73  2 68 59  2 26 71 64 61 54 66  2 73 61 58  2 61 62 65 65 72
  2 71 62 72 58  2 76 62 65 57 10  2 54 67 57  2 73 61 58 71 58  2 54 71
 58  2 75 54 65 65 58 78 72  2 76 62 73 61  2 57 58 58 69  2 76 68 68 57
 72  2 73 61 54 73  2 67 68  2 54 77 58  2 61 54 72  2 58 75 58 71  2 56
 74 73 12  2 45 61 58 71 58  2 54 71 58  2 57 54 71 64  2 67 54 71 71 68
 76  2 60 65 58 67 72  2 76 61 58 71 58  2 73 61 58  2 73 71 58 58 72  2
 72 65 68 69 58  2 59 54 67 73 54 72 73 62 56 54 65 65 78 10  2 54 67 57
  2 76 61 58 71 58  2 73 61 62 67  2 55 71 68 68 64 65 58 73 72  2 73 71
 62 56 64 65 58  2 76 62 73 61 68 74 73  2 58 75 58 71  2 61 54 75 62 67
 60  2 56 54 74 60 61 73  2 73 61 58  2 60 65 62 67 73  2 68 59  2 72 74
 67 65 62 60 61 73 12  2 40 67]
West of Arkham the hills rise wild, and there are valleys with deep woods that no axe has ever cut. There are dark narrow glens where the trees slope fantastically, and where thin brooklets trickle without ever having caught the glint of 

shuffle the dataset 

In [21]:
batch_size = 256
buffer_size = 100000
dataset = dataset.shuffle(buffer_size).batch(batch_size,drop_remainder=True)

In [22]:
dataset

<BatchDataset element_spec=(TensorSpec(shape=(256, 250), dtype=tf.int64, name=None), TensorSpec(shape=(256, 250), dtype=tf.int64, name=None))>

set up the model parameters

load the necessary things

In [82]:
vocab_size = len(vocab)
embed_dim = 128
# embed_dim = 64
# rnn_neurons = 1024
rnn_neurons = 512

In [104]:
print("vocab_size = {}".format(vocab_size))
print("embed_dim = {}".format(embed_dim))
print("rnn_neurons = {}".format(rnn_neurons))
print("batch_size = {}".format(batch_size))
print("char_to_idx = {}".format(char_to_idx))
print("idx_to_char = {}".format([thing for thing in idx_to_char]))
with open("things_for_app.py",'w') as f:
    f.write("vocab_size = {}\n".format(vocab_size))
    f.write("embed_dim = {}\n".format(embed_dim))
    f.write("rnn_neurons = {}\n".format(rnn_neurons))
    f.write("batch_size = {}\n".format(batch_size))
    f.write("char_to_idx = {}\n".format(char_to_idx))
    f.write("idx_to_char = {}".format([thing for thing in idx_to_char]))

vocab_size = 107
embed_dim = 128
rnn_neurons = 512
batch_size = 256
char_to_idx = {'\t': 0, '\n': 1, ' ': 2, '!': 3, '#': 4, '$': 5, '&': 6, "'": 7, '(': 8, ')': 9, ',': 10, '-': 11, '.': 12, '0': 13, '1': 14, '2': 15, '3': 16, '4': 17, '5': 18, '6': 19, '7': 20, '8': 21, '9': 22, ':': 23, ';': 24, '?': 25, 'A': 26, 'B': 27, 'C': 28, 'D': 29, 'E': 30, 'F': 31, 'G': 32, 'H': 33, 'I': 34, 'J': 35, 'K': 36, 'L': 37, 'M': 38, 'N': 39, 'O': 40, 'P': 41, 'Q': 42, 'R': 43, 'S': 44, 'T': 45, 'U': 46, 'V': 47, 'W': 48, 'X': 49, 'Y': 50, 'Z': 51, '[': 52, ']': 53, 'a': 54, 'b': 55, 'c': 56, 'd': 57, 'e': 58, 'f': 59, 'g': 60, 'h': 61, 'i': 62, 'j': 63, 'k': 64, 'l': 65, 'm': 66, 'n': 67, 'o': 68, 'p': 69, 'q': 70, 'r': 71, 's': 72, 't': 73, 'u': 74, 'v': 75, 'w': 76, 'x': 77, 'y': 78, 'z': 79, '°': 80, '½': 81, 'Å': 82, 'Æ': 83, '×': 84, 'á': 85, 'ä': 86, 'æ': 87, 'è': 88, 'é': 89, 'ë': 90, 'ó': 91, 'ö': 92, 'ü': 93, 'Ο': 94, 'α': 95, 'δ': 96, 'ἶ': 97, '–': 98, '—': 99, '‘': 100, '’': 101, '“': 

In [25]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, GRU

In [26]:
from tensorflow.keras.losses import sparse_categorical_crossentropy

In [27]:
def sparse_cat_loss(y_true,y_pred):
    return sparse_categorical_crossentropy(y_true,y_pred,from_logits=True)

create the model 

In [27]:
# current version of model
# def create_model(vocab_size,embed_dim,rnn_neurons,batch_size):
#     model = Sequential()
#     model.add(Embedding(vocab_size,embed_dim,batch_input_shape=[batch_size,None]))
#     model.add(GRU(rnn_neurons,return_sequences=True,stateful=True,
#                  recurrent_initializer='glorot_uniform'))
#     model.add(GRU(rnn_neurons,return_sequences=True,stateful=True,
#                  recurrent_initializer='glorot_uniform'))
#     model.add(Dense(vocab_size))
#     model.compile(optimizer='adam',loss=sparse_cat_loss)
#     return model

In [83]:
# testing things
def create_model(vocab_size,embed_dim,rnn_neurons,batch_size):
    model = Sequential()
    model.add(Embedding(vocab_size,embed_dim,batch_input_shape=[batch_size,None]))
    model.add(GRU(rnn_neurons,return_sequences=True,stateful=True,
                 recurrent_initializer='glorot_uniform'))
    model.add(GRU(rnn_neurons,return_sequences=True,stateful=True,
                 recurrent_initializer='glorot_uniform'))
#     model.add(GRU(rnn_neurons,return_sequences=True,stateful=True,
#                  recurrent_initializer='glorot_uniform'))
    model.add(Dense(vocab_size))
    model.compile(optimizer='adam',loss=sparse_cat_loss)
    return model

In [93]:
model = create_model(vocab_size=vocab_size,
                    embed_dim=embed_dim,
                    rnn_neurons=rnn_neurons,
                    batch_size=batch_size)

In [94]:
model.summary()

Model: "sequential_10"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_10 (Embedding)    (256, None, 128)          13696     
                                                                 
 gru_28 (GRU)                (256, None, 512)          986112    
                                                                 
 gru_29 (GRU)                (256, None, 512)          1575936   
                                                                 
 dense_10 (Dense)            (256, None, 107)          54891     
                                                                 
Total params: 2,630,635
Trainable params: 2,630,635
Non-trainable params: 0
_________________________________________________________________


In [31]:
for input_example_batch, target_example_batch in dataset.take(1):

  # Predict off some random batch
  example_batch_predictions = model(input_example_batch)

  # Display the dimensions of the predictions
  print(example_batch_predictions.shape, " <=== (batch_size, sequence_length, vocab_size)")

(256, 250, 107)  <=== (batch_size, sequence_length, vocab_size)


In [32]:
example_batch_predictions

<tf.Tensor: shape=(256, 250, 107), dtype=float32, numpy=
array([[[-6.8995738e-05, -2.4763367e-04,  3.4560158e-04, ...,
         -3.4601726e-05,  4.9214705e-04,  2.2594002e-05],
        [ 1.5224703e-04, -9.4041054e-05,  1.3932060e-03, ...,
          1.3681899e-03,  1.5857676e-04, -1.4787825e-04],
        [ 1.1248779e-04,  1.9371079e-04,  1.3235305e-03, ...,
          1.7012912e-03, -1.9924046e-04,  8.7394757e-05],
        ...,
        [ 1.7407446e-03,  4.0036514e-03, -3.7174753e-03, ...,
          2.3854163e-04, -2.4515169e-04, -2.5167235e-03],
        [ 6.5773039e-04,  4.2542722e-03, -3.0742302e-03, ...,
          6.3339283e-04,  1.8422923e-04, -3.6075942e-03],
        [-2.0459795e-04,  3.9719967e-03, -2.8316854e-03, ...,
          4.7716580e-04,  2.1777186e-04, -4.0458245e-03]],

       [[ 1.7525585e-04,  5.2358001e-04, -5.8777892e-04, ...,
         -1.1985922e-04, -1.4024132e-03,  5.1511941e-04],
        [ 4.2646046e-05,  6.0496549e-04, -1.4293035e-03, ...,
         -8.0890267e-04, -

In [33]:
sampled_indices = tf.random.categorical(example_batch_predictions[0], num_samples=1)

In [34]:
sampled_indices

<tf.Tensor: shape=(250, 1), dtype=int64, numpy=
array([[ 88],
       [  8],
       [  1],
       [ 59],
       [ 29],
       [ 69],
       [ 84],
       [ 97],
       [102],
       [ 69],
       [ 71],
       [ 55],
       [ 21],
       [ 82],
       [ 40],
       [ 68],
       [ 62],
       [ 46],
       [ 38],
       [ 34],
       [ 97],
       [ 49],
       [ 34],
       [ 59],
       [ 44],
       [ 34],
       [ 62],
       [ 12],
       [ 48],
       [ 39],
       [ 21],
       [ 57],
       [ 79],
       [ 82],
       [  1],
       [ 61],
       [ 54],
       [ 43],
       [ 95],
       [ 40],
       [ 66],
       [ 94],
       [ 63],
       [ 78],
       [ 86],
       [ 62],
       [ 65],
       [102],
       [ 45],
       [ 95],
       [  3],
       [ 58],
       [ 81],
       [ 75],
       [ 12],
       [  7],
       [102],
       [  0],
       [ 27],
       [ 71],
       [ 31],
       [ 43],
       [100],
       [105],
       [ 51],
       [ 55],
       [ 34],
       [ 55],


In [35]:
sampled_indices = tf.squeeze(sampled_indices,axis=-1).numpy()

In [36]:
sampled_indices

array([ 88,   8,   1,  59,  29,  69,  84,  97, 102,  69,  71,  55,  21,
        82,  40,  68,  62,  46,  38,  34,  97,  49,  34,  59,  44,  34,
        62,  12,  48,  39,  21,  57,  79,  82,   1,  61,  54,  43,  95,
        40,  66,  94,  63,  78,  86,  62,  65, 102,  45,  95,   3,  58,
        81,  75,  12,   7, 102,   0,  27,  71,  31,  43, 100, 105,  51,
        55,  34,  55,  86,  39,  28,  25,  15, 106,  93,  58,   9,  21,
         1,  46,  21,  10, 102,  36,  60,  11,   0,  42,  88,   3,  41,
        50,  14, 105,  26,  46,  92,  29,  67,  42,  45, 103,  67,  73,
        61,  90,  84,  66,  67,  66,  45,  95,  10,  78,  43, 100,  88,
        38,  83,  72,  56,  48,  20,  53,  10,  38,  22,   3, 106,  36,
        43,  62,  27,  92,  72,  95,  62, 101,  85,  13,  40,  65,  47,
        43,  33,  60,  61,   8,  50,  22,  18,  85,  98,  43,  91,  62,
        32,  10,  42,  82,   4,  42,  18,  20,   5,  46,  86,  67,  49,
        99,  90,  72,  13,  64,  27,  19, 102,  98,  20,  75,  3

In [37]:
print("Given the input seq: \n")
print("".join(idx_to_char[input_example_batch[0]]))
print('\n')
print("Next Char Predictions: \n")
print("".join(idx_to_char[sampled_indices ]))

Given the input seq: 

n; and she could not but feel that he meant some vague, indefinable harm to Alfred. She did not like the Thibetans, either, and thought it very peculiar that Surama was able to talk with them. Alfred would not tell her who or what Surama was, but had


Next Char Predictions: 

è(
fDp×ἶ“prb8ÅOoiUMIἶXIfSIi.WN8dzÅ
haRαOmΟjyäil“Tα!e½v.'“	BrFR‘′ZbIbäNC?2￼üe)8
U8,“Kg-	Qè!PY1′AUöDnQT”nthë×mnmTα,yR‘èMÆscW7],M9!￼KRiBösαi’á0OlVRHgh(Y95á–RóiG,QÅ#Q57$UänX—ës0kB6“–7vM[4#h);tb.é,á bF4TsA7A?Q)èFGiPëaÅSLX￼F68JdWDr!jy	–“V‘JUA;‘aju?′Yα52w&•


In [95]:
epochs = 100

In [96]:
model.fit(dataset,epochs=epochs)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.callbacks.History at 0x7fb0d03acf10>

In [97]:
###model.save('shakecraft_gen2.h5')
model.save('test.h5')

In [98]:
from tensorflow.keras.models import load_model

In [99]:
model2 = create_model(vocab_size,embed_dim,rnn_neurons,batch_size=1)
# model2.load_weights('shakecraft_gen2.h5')
model2.load_weights('test.h5')
model2.build(tf.TensorShape([1,None]))

In [100]:
model2.summary()

Model: "sequential_11"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_11 (Embedding)    (1, None, 128)            13696     
                                                                 
 gru_30 (GRU)                (1, None, 512)            986112    
                                                                 
 gru_31 (GRU)                (1, None, 512)            1575936   
                                                                 
 dense_11 (Dense)            (1, None, 107)            54891     
                                                                 
Total params: 2,630,635
Trainable params: 2,630,635
Non-trainable params: 0
_________________________________________________________________


In [101]:
def generate_text(model, start_seed,gen_size=100,temp=1.0):
  '''
  model: Trained Model to Generate Text
  start_seed: Intial Seed text in string form
  gen_size: Number of characters to generate

  Basic idea behind this function is to take in some seed text, format it so
  that it is in the correct shape for our network, then loop the sequence as
  we keep adding our own predicted characters.
  '''

  # Number of characters to generate
  num_generate = gen_size

  # Vecotrizing starting seed text
  input_eval = [char_to_idx[s] for s in start_seed]

  # Expand to match batch format shape
  input_eval = tf.expand_dims(input_eval, 0)

  # Empty list to hold resulting generated text
  text_generated = []

  # Temperature effects randomness in our resulting text
  # The term is derived from entropy/thermodynamics.
  # The temperature is used to affect probability of next characters.
  # Higher temperature ==> lesss surprising/ more expected
  # Lower temperature ==> more surprising / less expected
 
  temperature = temp

  # Here batch size == 1
  model.reset_states()

  for i in range(num_generate):

      # Generate Predictions
      predictions = model(input_eval)

      # Remove the batch shape dimension
      predictions = tf.squeeze(predictions, 0)

      # Use a categorircal disitribution to select the next character
      predictions = predictions / temperature
      predicted_id = tf.random.categorical(predictions, num_samples=1)[-1,0].numpy()

      # Pass the predicted charracter for the next input
      input_eval = tf.expand_dims([predicted_id], 0)

      # Transform back to character letter
      text_generated.append(idx_to_char[predicted_id])

  return (start_seed + ''.join(text_generated))

In [103]:
print(generate_text(model2,"cat over there.",gen_size=2000,temp=1))

cat over there. Another poly ground built himself upon it, but stopp'd
would prance of meet and ne'er liveller.
Hasty words, and what she shall shives books;
And must of your power--with twelve spine.

SILVIA

Sir on your arm?

EMILIA

O, faithful 'aive; further, or pity her with women,
Where 't still already stand
Or else is too great a man:
A thankffir throat in the can pass,
But yet no other from your nequent contend
As truth, he calls me not enter a self, I get concerned up to get them spring: but I must put most of their life to babe!”
     Bruce—seeing, he had never continued to congratu the Court of Azathoth and It night. I myself only a white wall could bring the fish-during land.
     The youth had counting to recall the evil country home, nor any special moul. The doctor and the faithful chanthis came over Inss that late in all the minutes of the lower Stampers’ blood-tried to examine the fact that whippoorward thereof could see away with this accursed structure, I seemed com

In [47]:
vocab_size

107