In [2]:
import numpy
import sys
import nltk
nltk.download('stopwords')
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM
from keras.utils import np_utils
from keras.callbacks import ModelCheckpoint

[nltk_data] Downloading package stopwords to C:\Users\Amit kumar
[nltk_data]     mishra\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Using TensorFlow backend.


In [3]:
#load file
file=open("frankenstein.txt").read()

In [4]:
#tokenization
#standardization
def tokenize_words(input):
  input=input.lower()
  tokenizer=RegexpTokenizer(r'\w+')
  tokens=tokenizer.tokenize(input)
  filtered=filter(lambda token: token not in stopwords.words('english'), tokens)
  return "".join(filtered)
processed_inputs=tokenize_words(file)

In [5]:
#chars to numbers
chars=sorted(list(set(processed_inputs)))
char_to_num=dict((c,i) for i, c in enumerate(chars))

In [6]:
#check if words to chars or chars to num has worked
input_len=len(processed_inputs)
vocab_len=len(chars)
print('Total number of characters:', input_len)
print('Total vocab:', vocab_len)

Total number of characters: 241880
Total vocab: 40


In [7]:
#seq length
seq_length=100
x_data=[]
y_data=[]

In [8]:
#loop through the sequence
for i in range(0,input_len - seq_length,1):
  in_seq=processed_inputs[i:i + seq_length]
  out_seq=processed_inputs[i+seq_length]
  x_data.append([char_to_num[char] for char in in_seq])
  y_data.append(char_to_num[out_seq])

In [9]:
n_patterns=len(x_data)
print('Total Patterns:',n_patterns)

Total Patterns: 241780


In [10]:
#convert input sequence to np array and so on
x=numpy.reshape(x_data,(n_patterns,seq_length,1))
x=x/float(vocab_len)

In [11]:
#one-hot encoding
y=np_utils.to_categorical(y_data)

In [12]:
#creating the model
model=Sequential()
model.add(LSTM(256,input_shape=(x.shape[1],x.shape[2]),return_sequences=True))
model.add(Dropout(0,2))
model.add(LSTM(256,return_sequences=True))
model.add(Dropout(0,2))
model.add(LSTM(128))
model.add(Dropout(0,2))
model.add(Dense(y.shape[1],activation='softmax'))

In [13]:
#compile the model
model.compile(loss='categorical_crossentropy',optimizer='adam')

In [14]:
#saving weigths
filepath='model_weights_saved.hdf5'
checkpoint=ModelCheckpoint(filepath,monitor='loss',verbose=1,save_best_only=True,mode='min')
desired_callbacks=[checkpoint]

In [15]:
#fit model and let it train
model.fit(x,y,epochs=4,batch_size=256,callbacks=desired_callbacks)

Epoch 1/4

Epoch 00001: loss improved from inf to 2.91929, saving model to model_weights_saved.hdf5
Epoch 2/4

Epoch 00002: loss improved from 2.91929 to 2.91391, saving model to model_weights_saved.hdf5
Epoch 3/4

Epoch 00003: loss did not improve from 2.91391
Epoch 4/4

Epoch 00004: loss did not improve from 2.91391


<keras.callbacks.callbacks.History at 0x20e355c5b08>

In [21]:
#recompile model with the saved weights
filename="model_weights_saved.hdf5"
model.load_weights(filename)
model.compile(loss="categorical_crossentropy",optimizer='adam')

In [22]:
#output of the model back into characters
num_to_char=dict((i,c) for i,c in enumerate(chars))

In [28]:
#random seed to help generate
start=numpy.random.randint(0,len(x_data) - 1)
pattern=x_data[start]
print('Random Seed: ')
print("\"", ''.join([num_to_char[value] for value in pattern]), "\"")

Random Seed: 
" ttearpiecesogreletgotellpapaboyneverseefathermustcomehideousmonsterletgopapasyndicfrankensteinpunish "


In [29]:
#generate the text
for i in range(100):
  x=numpy.reshape(pattern,(1,len(pattern),1))
  x=x/float(vocab_len)
  prediction=model.predict(x,verbose=0)
  index=numpy.argmax(prediction)
  result=num_to_char[index]
  seq_in=[num_to_char[value] for value in pattern]
  sys.stdout.write(result)
  pattern.append(index)
  pattern=pattern[1:len(pattern)]

eeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee