In [5]:
#Implementing character-level LSTM text generation

#downloading text corpus
import keras 
import numpy as np

# Download (and locally cache) the Nietzsche corpus, then load it lower-cased.
path = keras.utils.get_file('nietzsche.txt',origin='https://s3.amazonaws.com/text-datasets/nietzsche.txt')
# Read through a context manager so the file handle is closed, and with an explicit
# encoding so the character count does not depend on the platform default.
# NOTE(review): the corpus is assumed to be UTF-8 - confirm against the hosted file.
with open(path, encoding="utf-8") as corpus_file:
    text = corpus_file.read().lower()
print("corpus length :",len(text))

# Cut the corpus into overlapping fixed-length windows; each window's target is the
# single character that immediately follows it.
maxlen = 60  # number of characters in every input window
step = 3     # a new window starts every `step` characters
sentences = []   # input windows, each `maxlen` characters long
next_chars = []  # target character for the window at the same position
for i in range(0, len(text) - maxlen, step):
    sentences.append(text[i:i + maxlen])
    next_chars.append(text[i + maxlen])
print("number of sequences = ",len(sentences))

chars = sorted(set(text))  # vocabulary: unique characters in a stable order
print("unique characters ",len(chars))
# Map each character to its position in `chars`; enumerate avoids a list scan per character.
char_indices = {char: index for index, char in enumerate(chars)}

print("Vectorization....")
# One-hot encode inputs as (num_windows, maxlen, vocab_size) and targets as
# (num_windows, vocab_size). The builtin `bool` is used as dtype because the
# `np.bool` alias is not available in recent NumPy releases.
x = np.zeros((len(sentences), maxlen, len(chars)), dtype=bool)
y = np.zeros((len(sentences), len(chars)), dtype=bool)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        x[i, t, char_indices[char]] = 1  # mark the character seen at time step t
    y[i, char_indices[next_chars[i]]] = 1  # mark the character to be predicted

Using TensorFlow backend.


corpus length : 600901
number of sequences =  200281
unique characters  59
Vectorization....


In [6]:
#single LSTM layer
from keras import layers

# Network: one 128-unit LSTM that reads a (maxlen, vocabulary-size) one-hot window,
# followed by a softmax over the vocabulary that scores every candidate next character.
model = keras.models.Sequential([
    layers.LSTM(128, input_shape=(maxlen, len(chars))),
    layers.Dense(len(chars), activation="softmax"),
])

In [7]:
# RMSprop with a step size of 0.01. The value is passed positionally because the
# keyword was renamed from `lr` to `learning_rate` across Keras releases, while the
# step size has remained the first positional parameter.
optimizer = keras.optimizers.RMSprop(0.01)
# Targets are one-hot vectors over the vocabulary, hence categorical cross-entropy.
model.compile(loss="categorical_crossentropy", optimizer=optimizer)

In [8]:
#reweigh the original probability distribution
def sample(preds,temperature = 1.0):
    """Draw one index from `preds` after reweighting it by `temperature`.

    The distribution is reweighted as softmax(log(preds) / temperature): values
    below 1 sharpen it towards the most likely entry, values above 1 flatten it.

    Args:
        preds: 1-D array-like of non-negative probabilities.
        temperature: positive reweighting factor.

    Returns:
        The sampled index as a Python int.

    Raises:
        ValueError: if `temperature` is not strictly positive.
    """
    if temperature <= 0:
        raise ValueError("temperature must be > 0")
    # float64 keeps the renormalised probabilities accurate enough for multinomial's sum check
    preds = np.asarray(preds).astype("float64")
    with np.errstate(divide="ignore"):  # log(0) -> -inf simply means "never sampled"
        logits = np.log(preds) / temperature
    # Shifting by the maximum leaves the softmax unchanged but prevents every term
    # from underflowing to zero at low temperatures.
    exp_preds = np.exp(logits - np.max(logits))
    preds = exp_preds / np.sum(exp_preds)
    # One multinomial experiment yields a one-hot row; its argmax is the drawn index.
    probabilities = np.random.multinomial(1, preds, 1)  # (num_experiments, distribution, output_size)
    return int(np.argmax(probabilities))

In [9]:
#text generation loop
import random 
import sys

epochs = 60
batch_size = 128

# Alternate between one epoch of training and a round of text generation so the
# quality of the samples can be followed as training progresses.
for epoch in range(1, epochs + 1):  # inclusive upper bound: runs exactly `epochs` epochs
    print("epoch number : ",epoch)
    model.fit(x, y, batch_size=batch_size, epochs=1)  # one pass over the data

    # Pick a random window of `maxlen` characters from the corpus as the seed.
    start_indx = random.randint(0, len(text) - maxlen - 1)
    seed_text = text[start_indx:start_indx + maxlen]
    print("----Generating with seed: " + seed_text)

    for temperature in [0.2, 0.5, 1.0, 1.2]:
        print("-------temperature : " + str(temperature))
        generated_text = seed_text  # every temperature starts from the same seed
        sys.stdout.write(generated_text)

        for i in range(400):  # generate 400 characters after the seed
            # One-hot encode the current window as a batch containing a single sequence.
            sampled = np.zeros((1, maxlen, len(chars)))
            for t, char in enumerate(generated_text):
                sampled[0, t, char_indices[char]] = 1

            preds = model.predict(sampled, verbose=0)[0]  # distribution for the single sequence
            next_index = sample(preds, temperature)
            next_char = chars[next_index]  # map the sampled index back to its character

            # Slide the window: drop the oldest character so the model input stays
            # exactly `maxlen` characters long.
            generated_text = generated_text[1:] + next_char
            sys.stdout.write(next_char)
        sys.stdout.write("\n")
        

epoch number :  1
Epoch 1/1

KeyboardInterrupt: 

In [None]:
# Show whatever text the generation cell last left in `generated_text`.
print(generated_text)