In [2]:
# Source text format. For example, if the text below is the input sequence, we feed it to the model as shown in the
# table that follows. We are going to use the davincicode.txt file as the input, which yields more than 100K training sequences
data = 'Jack and Jill went up the hill To fetch a pail of water Jack fell down and broke his crown'

## We will feed the model the text line by line, as growing sequences of words, and generate the next word as output

            X,						y

        _, _, _, _, _, Jack, 				and


        _, _, _, _, Jack, and 				Jill

        _, _, _, Jack, and, Jill,			went

        _, _, Jack, and, Jill, went,		up

        _, Jack, and, Jill, went, up,		the

        Jack, and, Jill, went, up, the,		hill

In [3]:
from keras.preprocessing.text import Tokenizer

In [4]:
import numpy as np
import re

In [5]:
# Read the whole book into memory. The context manager closes the file handle
# as soon as the text has been read instead of leaving it open.
with open('../input/davincicodetxt/davincicode.txt', encoding='UTF-8') as book_file:
    book_text = book_file.read()

In [6]:
len(book_text)

842670

In [7]:
# Split the raw text on full stops; every piece is treated as one "line".
sequences = [segment for segment in book_text.split('.')]

In [8]:
len(sequences)

12293

In [9]:
sequences[1:10]

['',
 '',
 ' AGAIN',
 ' MORE THAN EVER',
 ' \n\n\n\nAcknowledgments \n\nFirst and foremost, to my friend and editor, Jason Kaufman, for working so hard on this project and \nfor truly understanding what this book is all about',
 ' And to the incomparable Heide Lange — tireless \nchampion of The Da Vinci Code, agent extraordinaire, and trusted friend',
 ' \n\nI cannot fully express my gratitude to the exceptional team at Doubleday, for their generosity, faith, \nand superb guidance',
 ' Thank you especially to Bill Thomas and Steve Rubin, who believed in this \nbook from the start',
 " My thanks also to the initial core of early in-house supporters, headed by \nMichael Palgon, Suzanne Herz, Janelle Moburg, Jackie Everly, and Adrienne Sparks, as well as to \nthe talented people of Doubleday's sales force"]

In [10]:
def clean_str(string):
  """
  String cleaning before vectorization.

  Removes lines matched by the URL pattern below, replaces every character
  that is not an ASCII letter with a space, lower-cases the text and collapses
  runs of whitespace into single spaces.

  Args:
    string: the raw text of one line.

  Returns:
    The cleaned, space-joined text when at least two words remain,
    'NA' when fewer than two words remain, and '' when `string` is not
    text the regex functions accept (for example None).
  """
  try:
    # NOTE(review): this pattern only matches lines starting with the literal
    # text "http://<>" or "https://<>" -- confirm whether "<>" is intended.
    string = re.sub(r'^https?:\/\/<>.*[\r\n]*', '', string, flags=re.MULTILINE)
    string = re.sub(r"[^A-Za-z]", " ", string)
  except TypeError:
    # Non-text input is the only failure handled here; anything else
    # (including KeyboardInterrupt) is allowed to propagate.
    return ""

  # split() without arguments already drops empty strings and surrounding
  # whitespace, so no further filtering of the words is needed.
  words = string.lower().split()
  if len(words) > 1:
    return " ".join(words)
  return 'NA'

In [11]:
cleaned_seq=list() 

In [12]:
# Rebuild the list from scratch so that re-running this cell does not append
# a second copy of every cleaned line.
cleaned_seq = [clean_str(line) for line in sequences]
    

In [13]:
cleaned_seq[1:10]

['NA',
 'NA',
 'NA',
 'more than ever',
 'acknowledgments first and foremost to my friend and editor jason kaufman for working so hard on this project and for truly understanding what this book is all about',
 'and to the incomparable heide lange tireless champion of the da vinci code agent extraordinaire and trusted friend',
 'i cannot fully express my gratitude to the exceptional team at doubleday for their generosity faith and superb guidance',
 'thank you especially to bill thomas and steve rubin who believed in this book from the start',
 'my thanks also to the initial core of early in house supporters headed by michael palgon suzanne herz janelle moburg jackie everly and adrienne sparks as well as to the talented people of doubleday s sales force']

In [14]:
len(cleaned_seq)

12293

In [15]:
# Keep only the lines that clean_str did not mark as 'NA' (fewer than two words).
cleaned_seq2 = [text for text in cleaned_seq if text != 'NA']



In [16]:
len(cleaned_seq2)

11241

In [17]:
cleaned_seq2[1:10]

['more than ever',
 'acknowledgments first and foremost to my friend and editor jason kaufman for working so hard on this project and for truly understanding what this book is all about',
 'and to the incomparable heide lange tireless champion of the da vinci code agent extraordinaire and trusted friend',
 'i cannot fully express my gratitude to the exceptional team at doubleday for their generosity faith and superb guidance',
 'thank you especially to bill thomas and steve rubin who believed in this book from the start',
 'my thanks also to the initial core of early in house supporters headed by michael palgon suzanne herz janelle moburg jackie everly and adrienne sparks as well as to the talented people of doubleday s sales force',
 'for their generous assistance in the research of the book i would like to acknowledge the louvre museum the french ministry of culture project gutenberg bibliotheque nationale the gnostic society library the department of paintings study and documentatio

In [18]:
token=Tokenizer()

In [19]:
token.fit_on_texts(cleaned_seq2)

In [20]:
#token.word_index

In [21]:
# determine the vocabulary size
vocab_size = len(token.word_index) + 1
print('Vocabulary Size: %d' % vocab_size)

Vocabulary Size: 11186


In [22]:
encoded=token.texts_to_sequences(cleaned_seq2)

In [23]:
encoded[1]

[76, 99, 305]

In [24]:
#encoded

In [25]:
len(encoded)

11241

# We will create line based sequences 

In [26]:
sequence_list=list()

In [27]:
# For every encoded line emit each prefix of length >= 2: the leading words
# are the context and the final word is the one to predict.
# Assigning a fresh list keeps the cell idempotent -- re-running it no longer
# appends a duplicate of every sequence.
sequence_list = [line[:end] for line in encoded for end in range(2, len(line) + 1)]
        
        

In [28]:
print('Total Sequences_list:',len(sequence_list))

Total Sequences_list: 131120


In [29]:
sequence_list[1:10]

[[1, 132, 135],
 [1, 132, 135, 286],
 [1, 132, 135, 286, 4673],
 [1, 132, 135, 286, 4673, 2332],
 [1, 132, 135, 286, 4673, 2332, 24],
 [1, 132, 135, 286, 4673, 2332, 24, 4674],
 [76, 99],
 [76, 99, 305],
 [6387, 157]]

Next, we need to pad the sequences using pad_sequences() in Keras. Before that, we need to find the maximum sequence length so that we can pad all the sequences to the same length.
    
# Pad Sequence

In [30]:
# The longest prefix sequence sets the width every sequence is padded to.
max_length = max(map(len, sequence_list))
print('maximum sequence length is', max_length)

maximum sequence length is 90


In [31]:
from keras.preprocessing.sequence import pad_sequences

In [32]:
sequence_padded_list=pad_sequences(sequence_list,maxlen=max_length,padding='pre')

In [33]:
sequence_padded_list[1:10]

array([[   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    1,
         132,  135],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    

Next, we need to split the data to input and output elements

# Split the data to input and output

In [34]:

sequence_padded_list=np.array(sequence_padded_list)



In [35]:
sequence_padded_list.shape

(131120, 90)

In [36]:
# Every column except the last is the context fed to the model (input);
# the last column is the word to predict (output).
X = sequence_padded_list[:, :-1]
y = sequence_padded_list[:, -1]

In [37]:
X[1:10]

array([[   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    1,
         132],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    

In [38]:
X.shape

(131120, 89)

In [39]:
y[1:10]

array([ 135,  286, 4673, 2332,   24, 4674,   99,  305,  157], dtype=int32)

In [40]:
y.shape

(131120,)

In [41]:
from keras.utils import to_categorical


In [42]:
vocab_size

11186

In [43]:
y_cat=to_categorical(y,num_classes=vocab_size)

In [44]:
y_cat[1:10]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [45]:
y_cat.shape

(131120, 11186)

Now we need to train the model on the 131120 training examples, each of length max_length-1.

First, we will create an embedding layer, which as usual is defined by three values:

vocab_size,

a 40-dimensional vector for each word,

(max_length-1) as the sequence length; the -1 is because we have already split off the output y.


In [46]:
import tensorflow as tf

# Detect the TPU cluster, connect this runtime to it and initialise the TPU
# system. These calls require a TPU-enabled runtime.
tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
tf.config.experimental_connect_to_cluster(tpu)
tf.tpu.experimental.initialize_tpu_system(tpu)

# Instantiate a distribution strategy for the resolved TPU.
# NOTE(review): the model-building cell never enters tpu_strategy.scope(),
# so the model is not created under this strategy.
tpu_strategy = tf.distribute.experimental.TPUStrategy(tpu)



In [47]:
from keras.models import Sequential
from keras.layers import Dense, Dropout,Embedding,LSTM
from keras.optimizers import RMSprop

We are going to use stacked LSTM with an additional LSTM layer 

In [48]:
# Stacked-LSTM next-word model:
#   Embedding (40-dim vectors, max_length-1 tokens per example)
#   -> LSTM(300) returning the full sequence so a second LSTM can consume it
#   -> LSTM(200) returning only its final output
#   -> softmax over the whole vocabulary.
# instantiating the model in the strategy scope creates the model on the TPU
#with tpu_strategy.scope():
model=Sequential()
model.add(Embedding(vocab_size,40,input_length=max_length-1))
model.add(LSTM(300,return_sequences=True))
model.add(LSTM(200))
model.add(Dense(vocab_size,activation='softmax'))
# Adam with its default settings is the optimizer used for training. No
# separately constructed optimizer object is created here, because one that is
# not passed to compile() has no effect on training.
model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])


In [49]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 89, 40)            447440    
_________________________________________________________________
lstm (LSTM)                  (None, 89, 300)           409200    
_________________________________________________________________
lstm_1 (LSTM)                (None, 200)               400800    
_________________________________________________________________
dense (Dense)                (None, 11186)             2248386   
Total params: 3,505,826
Trainable params: 3,505,826
Non-trainable params: 0
_________________________________________________________________


# Compile and fit the model

In [50]:
# Train on all padded sequences: 150 epochs, mini-batches of 50, one summary
# line per epoch (verbose=2). No validation data or callbacks are passed, so
# the accuracy reported in the log is training accuracy only.
model.fit(X,y_cat,epochs=150,batch_size=50,verbose=2)

Epoch 1/150
2623/2623 - 65s - loss: 6.9346 - accuracy: 0.0779
Epoch 2/150
2623/2623 - 65s - loss: 6.2754 - accuracy: 0.1018
Epoch 3/150
2623/2623 - 65s - loss: 5.8804 - accuracy: 0.1217
Epoch 4/150
2623/2623 - 65s - loss: 5.5962 - accuracy: 0.1338
Epoch 5/150
2623/2623 - 65s - loss: 5.3559 - accuracy: 0.1437
Epoch 6/150
2623/2623 - 65s - loss: 5.1307 - accuracy: 0.1536
Epoch 7/150
2623/2623 - 65s - loss: 4.9116 - accuracy: 0.1620
Epoch 8/150
2623/2623 - 64s - loss: 4.6975 - accuracy: 0.1709
Epoch 9/150
2623/2623 - 65s - loss: 4.4971 - accuracy: 0.1797
Epoch 10/150
2623/2623 - 65s - loss: 4.3076 - accuracy: 0.1921
Epoch 11/150
2623/2623 - 65s - loss: 4.1315 - accuracy: 0.2072
Epoch 12/150
2623/2623 - 65s - loss: 3.9713 - accuracy: 0.2231
Epoch 13/150
2623/2623 - 65s - loss: 3.8264 - accuracy: 0.2397
Epoch 14/150
2623/2623 - 64s - loss: 3.6926 - accuracy: 0.2560
Epoch 15/150
2623/2623 - 65s - loss: 3.5691 - accuracy: 0.2728
Epoch 16/150
2623/2623 - 65s - loss: 3.4559 - accuracy: 0.2880
E

Epoch 131/150
2623/2623 - 65s - loss: 0.5729 - accuracy: 0.8570
Epoch 132/150
2623/2623 - 64s - loss: 0.5770 - accuracy: 0.8559
Epoch 133/150
2623/2623 - 64s - loss: 0.5668 - accuracy: 0.8590
Epoch 134/150
2623/2623 - 65s - loss: 0.5728 - accuracy: 0.8566
Epoch 135/150
2623/2623 - 65s - loss: 0.5707 - accuracy: 0.8573
Epoch 136/150
2623/2623 - 64s - loss: 0.5745 - accuracy: 0.8561
Epoch 137/150
2623/2623 - 65s - loss: 0.5662 - accuracy: 0.8587
Epoch 138/150
2623/2623 - 64s - loss: 0.5712 - accuracy: 0.8570
Epoch 139/150
2623/2623 - 65s - loss: 0.5665 - accuracy: 0.8580
Epoch 140/150
2623/2623 - 64s - loss: 0.5691 - accuracy: 0.8570
Epoch 141/150
2623/2623 - 64s - loss: 0.5671 - accuracy: 0.8578
Epoch 142/150
2623/2623 - 64s - loss: 0.5700 - accuracy: 0.8561
Epoch 143/150
2623/2623 - 64s - loss: 0.5570 - accuracy: 0.8601
Epoch 144/150
2623/2623 - 65s - loss: 0.5652 - accuracy: 0.8575
Epoch 145/150
2623/2623 - 64s - loss: 0.5680 - accuracy: 0.8557
Epoch 146/150
2623/2623 - 64s - loss: 0.

<tensorflow.python.keras.callbacks.History at 0x7f8373ee5e10>

Note: As we can see in the training log above, accuracy does not improve much after about 100 epochs.
So it is always a best practice to use **'Early Stopping'**. This helps avoid overfitting.

In [51]:
model.save('keras_next_word_model_StakckedLSTM.h5')

In [52]:
import keras

In [53]:
model1=keras.models.load_model('./keras_next_word_model_StakckedLSTM.h5')



Now, let's test the model. We will generate a sequence using the function below.

In [54]:
# Generate sequence from language model

def generate_seq(model,token,max_length,seed_text,n_words):
    """
    Greedily extend `seed_text` with words predicted by the language model.

    Args:
        model: trained model whose `predict` returns, for each input row,
            one score per vocabulary index.
        token: fitted tokenizer providing `texts_to_sequences` and `word_index`.
        max_length: number of tokens the model expects per input row; the
            encoded text is pre-padded (or truncated) to this width.
        seed_text: text to start from.
        n_words: maximum number of words to append.

    Returns:
        `seed_text` followed by the generated words, separated by single
        spaces. Generation stops early if the model predicts an index that
        has no word in `token.word_index` (such as 0, the padding value).
    """
    # Build the index -> word lookup once instead of scanning the whole
    # vocabulary for every generated word.
    index_to_word = {index: word for word, index in token.word_index.items()}
    in_text = seed_text

    # generate at most the number of words given by n_words
    for _ in range(n_words):
        # encode the text as integers
        encoded_text = token.texts_to_sequences([in_text])[0]
        # pre-pad the same way as was done before training
        encoded_text = pad_sequences([encoded_text], maxlen=max_length, padding='pre')
        # Take the highest-scoring vocabulary index. This replaces
        # `predict_classes`, which newer Keras releases no longer provide.
        scores = model.predict(encoded_text, verbose=0)
        predicted_index = int(np.argmax(scores, axis=-1)[0])

        # map the index back to its word; an unknown index would otherwise
        # leave the output word undefined or stale from the previous step
        out_word = index_to_word.get(predicted_index)
        if out_word is None:
            break

        # append to input so the next prediction sees the new word
        in_text += ' ' + out_word

    return in_text

In [56]:
print(generate_seq(model1,token,max_length-1,'Langdon',10))

Langdon had no idea how to respond about rosslyn s tomb


In [57]:
print(generate_seq(model1,token,max_length-1,'Professor',10))

hostess is supposed to be with respect that very sorry for
