# Text Generation using LSTM
---

### Initial Setup
---

In [0]:
# Install a Drive FUSE wrapper.
# https://github.com/astrada/google-drive-ocamlfuse
!apt-get install -y -qq software-properties-common python-software-properties module-init-tools
!add-apt-repository -y ppa:alessandro-strada/ppa 2>&1 > /dev/null
!apt-get update -qq 2>&1 > /dev/null
!apt-get -y install -qq google-drive-ocamlfuse fuse

E: Package 'python-software-properties' has no installation candidate
Selecting previously unselected package google-drive-ocamlfuse.
(Reading database ... 131331 files and directories currently installed.)
Preparing to unpack .../google-drive-ocamlfuse_0.7.6-0ubuntu1~ubuntu18.04.1_amd64.deb ...
Unpacking google-drive-ocamlfuse (0.7.6-0ubuntu1~ubuntu18.04.1) ...
Setting up google-drive-ocamlfuse (0.7.6-0ubuntu1~ubuntu18.04.1) ...
Processing triggers for man-db (2.8.3-2ubuntu0.1) ...


In [0]:
# Generate auth tokens for Colab
# `google.colab` is only importable inside a Colab runtime.
from google.colab import auth
# Authenticate the current user; presumably this is what makes the
# application-default credentials read by the next cell available - TODO confirm.
auth.authenticate_user()

In [0]:
# Generate creds for the Drive FUSE library.
from oauth2client.client import GoogleCredentials
# Application-default credentials; their client id/secret are passed to
# google-drive-ocamlfuse below.
creds = GoogleCredentials.get_application_default()
import getpass
# First headless invocation with stdin redirected from /dev/null: `grep URL`
# keeps only the output line containing the authorisation URL to open.
# NOTE(review): the client secret is passed as a command-line argument here and
# below - confirm that is acceptable for this environment.
!google-drive-ocamlfuse -headless -id={creds.client_id} -secret={creds.client_secret} < /dev/null 2>&1 | grep URL
# Read the verification code obtained from that URL without echoing it.
vcode = getpass.getpass()
# Second headless invocation: the verification code is piped in on stdin.
# NOTE(review): {vcode} is interpolated unquoted into a shell command line, so
# shell metacharacters in the pasted value would be interpreted by the shell.
!echo {vcode} | google-drive-ocamlfuse -headless -id={creds.client_id} -secret={creds.client_secret}

Please, open the following URL in a web browser: https://accounts.google.com/o/oauth2/auth?client_id=32555940559.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive&response_type=code&access_type=offline&approval_prompt=force
··········
Please, open the following URL in a web browser: https://accounts.google.com/o/oauth2/auth?client_id=32555940559.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive&response_type=code&access_type=offline&approval_prompt=force
Please enter the verification code: Access token retrieved correctly.


In [0]:
# Create a directory and mount Google Drive using that directory.
# `-p` makes mkdir succeed even if `drive` already exists (e.g. on a re-run).
!mkdir -p drive
# Mount Drive at ./drive; later cells read and write files under this folder.
!google-drive-ocamlfuse drive

In [0]:
# Base folder inside the mounted Drive: the text corpus is read from here and
# model checkpoints are written beneath it. Paths are built by plain string
# concatenation, so the trailing slash is required.
default_path = 'drive/LSTM_TextGen/'

### LSTM Model Creation
---

In [0]:
import string
import math
import re
import numpy as np
import pandas as pd
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils

# load the text corpus and convert it to lowercase
filename = "wonderland.txt"
# A context manager guarantees the file handle is closed once the text is read
# (the bare open(...).read() form leaves closing to the garbage collector).
with open(default_path + filename) as text_file:
    orig_text = text_file.read().lower()

# remove punctuation (only the ASCII characters listed in string.punctuation)
regex = re.compile('[%s]' % re.escape(string.punctuation))
raw_text = regex.sub('', orig_text)

# create mapping of unique chars to integers
# The first character in sorted order is excluded from the vocabulary -
# presumably the newline, which needs no id because the text is later split
# into lines; TODO confirm against the actual corpus.
chars = sorted(set(raw_text))[1:]
char_to_int = {c: i for i, c in enumerate(chars)}
int_to_char = {i: c for i, c in enumerate(chars)}

# summarize the loaded data
n_chars = len(raw_text)
n_vocab = len(chars)
print("Total Characters: ", n_chars)
print("Total Vocab: ", n_vocab)


Total Characters:  136087
Total Vocab:  29


In [0]:
# Keep only the lines that contain at least one non-whitespace character.
lines = [text_line for text_line in raw_text.splitlines() if text_line.strip()]

# Character count of every kept line, followed by summary statistics.
sent_len_list = list(map(len, lines))
max_sent_len, min_sent_len = max(sent_len_list), min(sent_len_list)
total_sent_len = sum(sent_len_list)
avg_sent_len = math.floor(total_sent_len / len(sent_len_list))
# Most frequent line length (each distinct length is scored by its count).
common_sent_len = max(set(sent_len_list), key=sent_len_list.count)

print(f"Maximum length of sentence - {max_sent_len}")
print(f"Minimum length of sentence - {min_sent_len}")
print(f"Average length of sentences - {avg_sent_len}")
print(f"Common length of sentences - {common_sent_len}")

# Each line as a list of its individual characters.
inp_sequences = list(map(list, lines))

Maximum length of sentence - 73
Minimum length of sentence - 2
Average length of sentences - 53
Common length of sentences - 67


In [0]:
# Encode every line as a list of integer character ids via char_to_int.
data = [[char_to_int[symbol] for symbol in text_line] for text_line in lines]
#n_patterns = len(data)
#print("Total Patterns: ", n_patterns)

In [0]:
from keras.preprocessing.sequence import pad_sequences

# Sliding-window length and the (initially empty) sample containers.
seq_length = 100
dataX, dataY = [], []

# Pad every encoded line to the length of the longest line.
# NOTE(review): pad_sequences pads with 0 by default, and 0 is also the id that
# char_to_int assigns to the first vocabulary character, so padding cannot be
# told apart from that character downstream - confirm this is intended.
data_padded = pad_sequences(data, maxlen=max_sent_len)

# Flatten the padded 2-D array row by row into one long list of ids.
flat_list = list(data_padded.ravel())
input_len = len(flat_list)

In [0]:
# Build the training samples with a sliding window over the flattened ids:
#   input  = seq_length consecutive ids starting at position i
#   target = the single id that immediately follows that window
# The ids are already integers, so no further conversion is needed.
#
# Both lists are rebuilt from scratch (not appended to), so re-running this
# cell cannot duplicate samples that an earlier run already added.
window_starts = range(0, input_len - seq_length)
dataX = [flat_list[i:i + seq_length] for i in window_starts]
dataY = [flat_list[i + seq_length] for i in window_starts]

n_patterns = len(dataX)

In [0]:
# Shape the samples as [samples, time steps, features] and scale the ids by
# the vocabulary size so every input value lies in [0, 1).
X = np.reshape(dataX, (n_patterns, seq_length, 1)) / float(n_vocab)

# One-hot encode the next-character targets.
y = np_utils.to_categorical(dataY)

# Two stacked LSTM layers (dropout after the first) feeding a softmax layer
# with one unit per target class.
model = Sequential([
    LSTM(256, input_shape=X.shape[1:], return_sequences=True, dropout=0.1, recurrent_dropout=0.1),
    Dropout(0.1),
    LSTM(256),
    Dense(y.shape[1], activation='softmax'),
])
model.compile(loss='categorical_crossentropy', optimizer='adam')

# Save a checkpoint whenever the training loss reaches a new minimum; the
# epoch number and loss are embedded in the file name.
filepath = default_path + "Models/Model5/weights-improvement-{epoch:02d}-{loss:.4f}-bigger.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
callbacks_list = [checkpoint]

# Train the network.
model.fit(X, y, epochs=100, batch_size=512, callbacks=callbacks_list)

W0727 10:13:32.494488 140176973526912 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0727 10:13:32.522053 140176973526912 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0727 10:13:32.529314 140176973526912 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W0727 10:13:32.807087 140176973526912 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:133: The name tf.placeholder_with_default is deprecated. Please use tf.compat.v1.placeholder_with_default instead.

W0727 10:13:32.818583 

Epoch 1/100

Epoch 00001: loss improved from inf to 2.31433, saving model to drive/LSTM_TextGen/Models/Model5/weights-improvement-01-2.3143-bigger.hdf5
Epoch 2/100

Epoch 00002: loss improved from 2.31433 to 2.10710, saving model to drive/LSTM_TextGen/Models/Model5/weights-improvement-02-2.1071-bigger.hdf5
Epoch 3/100

Epoch 00003: loss improved from 2.10710 to 1.94609, saving model to drive/LSTM_TextGen/Models/Model5/weights-improvement-03-1.9461-bigger.hdf5
Epoch 4/100

Epoch 00004: loss improved from 1.94609 to 1.84005, saving model to drive/LSTM_TextGen/Models/Model5/weights-improvement-04-1.8400-bigger.hdf5
Epoch 5/100

Epoch 00005: loss improved from 1.84005 to 1.75966, saving model to drive/LSTM_TextGen/Models/Model5/weights-improvement-05-1.7597-bigger.hdf5
Epoch 6/100

Epoch 00006: loss improved from 1.75966 to 1.69448, saving model to drive/LSTM_TextGen/Models/Model5/weights-improvement-06-1.6945-bigger.hdf5
Epoch 7/100

Epoch 00007: loss improved from 1.69448 to 1.67238, sav

<keras.callbacks.History at 0x7f7d2dddbe10>

#### Testing the model
----

In [0]:
# Pick a random training window to seed the generator.
# np.random.randint excludes its upper bound, so len(dataX) (not len(dataX)-1)
# is required for the last pattern to be selectable; it also keeps the call
# valid when there is only a single pattern.
start = np.random.randint(0, len(dataX))
pattern = list(dataX[start])
print("Seed:")
print("\"", ''.join([int_to_char[value] for value in pattern]), "\"")

# Generate 500 characters greedily: predict the next id, emit its character,
# then slide the window one position forward.
generated_chars = []
for step in range(500):
    # Same shaping and scaling as the training inputs.
    x = np.reshape(pattern, (1, len(pattern), 1))
    x = x / float(n_vocab)
    prediction = model.predict(x, verbose=0)
    # Most probable class; a plain int keeps the window's appended ids uniform.
    index = int(np.argmax(prediction))
    generated_chars.append(int_to_char[index])
    pattern.append(index)
    pattern = pattern[1:]
result = ''.join(generated_chars)
print("\nDone.")
print(result)

Seed:
"  moment           lets go on with the game the queen said to alice and alice was    too much frighte "

Done.
ned toletced inm tuattely and ie to ofat the hidged                                                                                                                                                                                                                                                                                                                                                                                                                                                                 


### Generate Text Using the Model Created Above
---

#### Load saved model to generate text
----

In [0]:
import string
import math
import re
import numpy as np
import pandas as pd
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils

# load the text corpus and convert it to lowercase
filename = "wonderland.txt"
# A context manager guarantees the file handle is closed once the text is read
# (the bare open(...).read() form leaves closing to the garbage collector).
with open(default_path + filename) as text_file:
    orig_text = text_file.read().lower()

# remove punctuation (only the ASCII characters listed in string.punctuation)
regex = re.compile('[%s]' % re.escape(string.punctuation))
raw_text = regex.sub('', orig_text)

# create mapping of unique chars to integers
# The first character in sorted order is excluded from the vocabulary -
# presumably the newline, which needs no id because the text is later split
# into lines; TODO confirm against the actual corpus.
chars = sorted(set(raw_text))[1:]
char_to_int = {c: i for i, c in enumerate(chars)}
int_to_char = {i: c for i, c in enumerate(chars)}

# summarize the loaded data
n_chars = len(raw_text)
n_vocab = len(chars)
print("Total Characters: ", n_chars)
print("Total Vocab: ", n_vocab)

# Keep only the lines that contain at least one non-whitespace character.
lines = [text_line for text_line in raw_text.splitlines() if text_line.strip()]

# Character count of every kept line, followed by summary statistics.
sent_len_list = list(map(len, lines))
max_sent_len, min_sent_len = max(sent_len_list), min(sent_len_list)
total_sent_len = sum(sent_len_list)
avg_sent_len = math.floor(total_sent_len / len(sent_len_list))
# Most frequent line length (each distinct length is scored by its count).
common_sent_len = max(set(sent_len_list), key=sent_len_list.count)

print(f"Maximum length of sentence - {max_sent_len}")
print(f"Minimum length of sentence - {min_sent_len}")
print(f"Average length of sentences - {avg_sent_len}")
print(f"Common length of sentences - {common_sent_len}")

# Each line as a list of its individual characters.
inp_sequences = list(map(list, lines))

# Encode every line as a list of integer character ids via char_to_int.
data = [[char_to_int[symbol] for symbol in text_line] for text_line in lines]

from keras.preprocessing.sequence import pad_sequences

# Sliding-window length and the (initially empty) sample containers.
seq_length = 100
dataX, dataY = [], []

# Pad every encoded line to the length of the longest line.
# NOTE(review): pad_sequences pads with 0 by default, and 0 is also the id that
# char_to_int assigns to the first vocabulary character, so padding cannot be
# told apart from that character downstream - confirm this is intended.
data_padded = pad_sequences(data, maxlen=max_sent_len)

# Flatten the padded 2-D array row by row into one long list of ids.
flat_list = list(data_padded.ravel())
input_len = len(flat_list)

# Slide a window of seq_length ids over the flattened sequence:
#   input  = the seq_length ids starting at position `begin`
#   target = the single id that immediately follows that window
window_starts = range(0, input_len - seq_length, 1)
dataX = [flat_list[begin:begin + seq_length] for begin in window_starts]
dataY = [flat_list[begin + seq_length] for begin in window_starts]

n_patterns = len(dataX)
# Shape the samples as [samples, time steps, features] and scale the ids by
# the vocabulary size so every input value lies in [0, 1).
X = np.reshape(dataX, (n_patterns, seq_length, 1)) / float(n_vocab)

# One-hot encode the next-character targets.
y = np_utils.to_categorical(dataY)

# Same layer stack as in the training cell (the weights loaded afterwards must
# match it): two stacked LSTM layers, dropout after the first, softmax output.
model = Sequential([
    LSTM(256, input_shape=X.shape[1:], return_sequences=True, dropout=0.1, recurrent_dropout=0.1),
    Dropout(0.1),
    LSTM(256),
    Dense(y.shape[1], activation='softmax'),
])

# load the network weights
# The path is built from default_path - the same base folder the training cell
# writes its checkpoints under - rather than repeating the Drive folder
# literal, so the two cannot drift apart.
# Note: `filename` held the corpus file name earlier in this cell and is
# rebound to the weights path here.
filename = default_path + "Models/Model5/" + "weights-improvement-100-0.7746-bigger.hdf5" #"weights-improvement-47-1.2219-bigger.hdf5"
model.load_weights(filename)
model.compile(loss='categorical_crossentropy', optimizer='adam')

Using TensorFlow backend.


Total Characters:  136087
Total Vocab:  29
Maximum length of sentence - 73
Minimum length of sentence - 2
Average length of sentences - 53
Common length of sentences - 67


W0727 16:11:19.572988 140031781689216 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0727 16:11:19.609420 140031781689216 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0727 16:11:19.615979 140031781689216 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W0727 16:11:19.809154 140031781689216 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:133: The name tf.placeholder_with_default is deprecated. Please use tf.compat.v1.placeholder_with_default instead.

W0727 16:11:19.820449 

#### Generate text from the loaded model
----

In [0]:
# Generate 10 samples of 500 characters each, every one seeded with a randomly
# chosen training window.
gen_sents = []
for sents in range(10):
    # np.random.randint excludes its upper bound, so len(dataX) (not
    # len(dataX)-1) is required for the last pattern to be selectable.
    start = np.random.randint(0, len(dataX))
    pattern = list(dataX[start])
    print("Seed:")
    print("\"", ''.join([int_to_char[value] for value in pattern]), "\"")

    # Greedy decoding: predict the next id, emit its character, then slide the
    # window one position forward.
    generated_chars = []
    for step in range(500):
        # Same shaping and scaling as the training inputs.
        x = np.reshape(pattern, (1, len(pattern), 1))
        x = x / float(n_vocab)
        prediction = model.predict(x, verbose=0)
        # Most probable class; a plain int keeps the appended ids uniform.
        index = int(np.argmax(prediction))
        generated_chars.append(int_to_char[index])
        pattern.append(index)
        pattern = pattern[1:]
    result = ''.join(generated_chars)
    gen_sents.append(result)

print("Generated Text:")
print("\n".join(gen_sents))

Seed:
"            a cat may look at a king said alice ive read that in some book                            "
Seed:
" ith you mind      now the poor little thing sobbed again or grunted it was impossible                "
Seed:
"  a pleasure in all their simple joys                  remembering her own childlife and the happy su "
Seed:
" t them free                                                       exactly as we were                 "
Seed:
" ake out    at all what had become of it so after hunting all about for it he was    obliged to write "
Seed:
" er if i shall ever see you        any more and here poor alice began to cry again for she felt very  "
Seed:
" he fan and the pair of white kid gloves    and she very goodnaturedly began hunting about for them b "
Seed:
"  the accident of   the goldfish kept running in her head and she had a vague sort of idea     that t "
Seed:
"              as far out to sea as you can                                     swim after them scream "
S

**Generated Text**:

                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    
ite hind                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    
 the felt a little turpe said tooe toder and when sut                                                                                                                                                                                                                                                                                                                                                                                                                                                               
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    
nice ceganed                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        
ume beot ieard intoed to the wholg alice help of tuch alice whosg                                                                                                                                                                                                                                                                                                                                                                                                                                                   
 anice repeame                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      
wand tathar inm secd alice as helg wo hit so sfad