In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

with open("dkm.txt") as f:
    text = f.read()

chars = list(set(text))

i_to_c = {i: c for i, c in enumerate(chars)}
c_to_i = {c: i for i, c in enumerate(chars)}

encoded = [c_to_i[c] for c in text]
sequences = []
next_char = []

In [2]:
maxlen = 30
step = 5

def preprocess(text):
    
    text = re.sub("[^A-Za-z0-9 ]", "", text)
    
    encoded = [c_to_i[c] for c in text]
    for i in range(0, len(encoded) - maxlen, step):
        sequences.append(encoded[i : i + maxlen])
        next_char.append(encoded[i + maxlen])


    for i in range(0, len(encoded) - maxlen, step):
        sequences.append(encoded[i : i + maxlen])
        next_char.append(encoded[i + maxlen])
    X = np.zeros((len(sequences), maxlen, len(chars)), dtype=np.bool)
    y = np.zeros((len(sequences), len(chars)), dtype=np.bool)

    for i, seq in enumerate(sequences):
        for t, char in enumerate(seq):
            X[i, t, char] = 1
        y[i, next_char[i]] = 1

    return X, y

In [3]:
import numpy as np
import re

X, y = preprocess(text)

X.shape

(112202, 30, 89)

In [5]:
from tensorflow.keras.callbacks import LambdaCallback
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding

max_features = 20000

model = Sequential()
model.add(LSTM(len(chars), input_shape=(maxlen, len(chars))))
model.add(Dense(len(chars), activation="softmax"))

model.compile(loss="categorical_crossentropy", optimizer="adam")

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 89)                63724     
_________________________________________________________________
dense (Dense)                (None, 89)                8010      
Total params: 71,734
Trainable params: 71,734
Non-trainable params: 0
_________________________________________________________________


In [6]:
def sample(preds):
    # helper function to sample an index from a probability array
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / 1
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

def on_epoch_end(epoch, _):
    # Function invoked at end of each epoch. Prints generated text.
    
    print()
    print('----- Generating text after Epoch: %d' % epoch)
    
    start_index = random.randint(0, len(text) - maxlen - 1)
    
    generated = ''
    
    sentence = text[start_index: start_index + maxlen]
    generated += sentence
    
    print('----- Generating with seed: "' + sentence + '"')
    sys.stdout.write(generated)
    
    for i in range(400):
        x_pred = np.zeros((1, maxlen, len(chars)))
        for t, char in enumerate(sentence):
            x_pred[0, t, c_to_i[char]] = 1
            
        preds = model.predict(x_pred, verbose=0)[0]
        next_index = sample(preds)
        next_char = i_to_c[next_index]
        
        sentence = sentence[1:] + next_char
        
        sys.stdout.write(next_char)
        sys.stdout.flush()
    print()


print_callback = LambdaCallback(on_epoch_end=on_epoch_end)

In [None]:
import random, sys

model.fit(X, y, batch_size=128, epochs=3, callbacks=[print_callback],)

Epoch 1/3
----- Generating text after Epoch: 0
----- Generating with seed: "We're out of our minds â¨And "
We're out of our minds â¨And Slofersely At Wiseig Till torech your hak dard nougnt and lawd the af im in Wis ther cruttitthe ming a sard the Bon hor Sfatiny Hally syral I never Yar I gum sayl ot dous and chean thein watl: hontis shing Cwakr soo BlisS otay brwesr Sreponge the porreeg Jock Drocthen sall Thane Chourdt of ardred illicht llly comw wale tish eane wack you of rat ame the ith Lwa ca chong yom hay noo mpall tou ness I
Epoch 2/3