In [None]:
import numpy as np

#Step 1 Reading Text Files

In [None]:
def read_txt(file_path, encoding='utf-8'):
  '''
  Read a whole text file into a single string.

  Input :
    file_path : path to a .txt file
    encoding  : text encoding used to decode the file (default 'utf-8');
                passing it explicitly keeps the result independent of the
                platform's default locale encoding
  Output : the complete file contents as one str
  '''
  with open(file_path, encoding=encoding) as f:
    return f.read()

In [None]:
ex_txt_file_path = '/content/drive/My Drive/melville-moby_dick.txt'
#these are chapters of a book

In [None]:
# read_txt(file_path=ex_txt_file_path)

#Step 2 Tokenizing and Cleaning the data with spacy

In [None]:
import spacy

In [None]:
# Load the small English pipeline by its full package name: the 'en' shortcut
# only resolves on spaCy 2.x and raises an error on spaCy 3+.
# The parser, tagger and ner components are disabled because only tokenization
# is needed here, which saves time.
nlp = spacy.load('en_core_web_sm',disable=['parser','tagger','ner'])

In [None]:
nlp.max_length = 1198623
# maximum number of characters spaCy will accept in a single text
# increase this number if the complete book is longer than the limit

In [None]:
#grabbing the text tokens using list comprehension
def separate_punc(doc_text):
  '''
  Text cleaning function.

  Tokenizes doc_text with the module-level spaCy pipeline `nlp` and returns the
  lower-cased token texts, dropping every token whose text occurs as a
  substring of the punctuation/whitespace filter string below.
  '''
  unwanted = '\n\n \n\n\n!"-#$%&()--.*+,-/:;<=>?@[\\]^_`{|}~\t\n '
  kept_words = []
  for tok in nlp(doc_text):
    raw = tok.text
    if raw in unwanted:
      continue
    kept_words.append(raw.lower())
  return kept_words

In [None]:
d = read_txt('/content/drive/My Drive/moby_dick_four_chapters.txt')

In [None]:
# print(d)

In [None]:
tokens = separate_punc(d)

In [None]:
print(tokens[10:20])
print(f'No. of words {len(tokens)}')

['precisely', 'having', 'little', 'or', 'no', 'money', 'in', 'my', 'purse', 'and']
No. of words 11338


In [None]:
# so the model will take 25 words as input and predict the 26th word

In [None]:
# each training sample is a window of 25 input words plus the 1 word to predict
train_len = 25+1

# Slide the window one token at a time over the whole text. Windows start at
# every index from 0 to len(tokens)-train_len inclusive, so the very last token
# of the text is also used as a prediction target (the previous loop bound
# stopped one window short). Texts shorter than train_len give an empty list.
text_sequences = [
  tokens[start:start+train_len]
  for start in range(len(tokens)-train_len+1)
]
#for this to make sense look at the cell below
# A->B->C->D->E
# B->C->D->E->F

In [None]:
print(text_sequences[0])
print(text_sequences[1])
# so these are sequences of 26 words
# same as 
# A->B->C->D->E
# B->C->D->E->F

['call', 'me', 'ishmael', 'some', 'years', 'ago', 'never', 'mind', 'how', 'long', 'precisely', 'having', 'little', 'or', 'no', 'money', 'in', 'my', 'purse', 'and', 'nothing', 'particular', 'to', 'interest', 'me', 'on']
['me', 'ishmael', 'some', 'years', 'ago', 'never', 'mind', 'how', 'long', 'precisely', 'having', 'little', 'or', 'no', 'money', 'in', 'my', 'purse', 'and', 'nothing', 'particular', 'to', 'interest', 'me', 'on', 'shore']


In [None]:
from keras.preprocessing.text import Tokenizer

In [None]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text_sequences)

In [None]:
sequences = tokenizer.texts_to_sequences(text_sequences)

In [None]:
# texts_to_sequences replaces every word with the integer index the tokenizer assigned to it
# e.g. A->1, B->2, etc.
print(sequences[0])
print(text_sequences[0])
print(sequences[1])
print(text_sequences[1])

[956, 14, 263, 51, 261, 408, 87, 219, 129, 111, 954, 260, 50, 43, 38, 315, 7, 23, 546, 3, 150, 259, 6, 2712, 14, 24]
['call', 'me', 'ishmael', 'some', 'years', 'ago', 'never', 'mind', 'how', 'long', 'precisely', 'having', 'little', 'or', 'no', 'money', 'in', 'my', 'purse', 'and', 'nothing', 'particular', 'to', 'interest', 'me', 'on']
[14, 263, 51, 261, 408, 87, 219, 129, 111, 954, 260, 50, 43, 38, 315, 7, 23, 546, 3, 150, 259, 6, 2712, 14, 24, 957]
['me', 'ishmael', 'some', 'years', 'ago', 'never', 'mind', 'how', 'long', 'precisely', 'having', 'little', 'or', 'no', 'money', 'in', 'my', 'purse', 'and', 'nothing', 'particular', 'to', 'interest', 'me', 'on', 'shore']


In [None]:
#these are the tokens assigned to each word
for i in sequences[0]:
  print(f"{i} : {tokenizer.index_word[i]}")

956 : call
14 : me
263 : ishmael
51 : some
261 : years
408 : ago
87 : never
219 : mind
129 : how
111 : long
954 : precisely
260 : having
50 : little
43 : or
38 : no
315 : money
7 : in
23 : my
546 : purse
3 : and
150 : nothing
259 : particular
6 : to
2712 : interest
14 : me
24 : on


In [None]:
sequences = np.array(sequences)
# print(sequences)
#note the shift

In [None]:
from keras.utils import to_categorical

In [None]:
X = sequences[:,:-1] # picking every word of each sequence except the last one

In [None]:
y = sequences[:,-1] # last words

In [None]:
#lets check the no. of unique words
vocab_size = len(tokenizer.word_counts)

In [None]:
print(y)

[  24  957    5 ...    2 2717   26]


In [None]:
#converting to categorical variables
y = to_categorical(y,num_classes = vocab_size+1)

In [None]:
# print(y)
#one hot encoding

In [None]:
X.shape
# rows = training sequences (11312 in the run shown below), columns = the 25 input words of each sequence

(11312, 25)

In [None]:
from keras.models import Sequential
from keras.layers import Dense,LSTM,Embedding

In [None]:
def create_model(vocab_size,seq_len):
  '''
  Build and compile the next-word prediction network.

  vocab_size : size of the embedding table and number of softmax output classes
  seq_len    : number of word ids per input sample; also used as the size of
               each embedding vector

  Prints the model summary and returns the compiled model.
  '''
  layer_stack = [
    Embedding(vocab_size,seq_len,input_length=seq_len),
    LSTM(75,return_sequences=True),  # full sequence output feeds the next LSTM
    LSTM(50),
    Dense(50,activation='relu'),
    Dense(vocab_size,activation='softmax'),  # one probability per word index
  ]
  network = Sequential(layer_stack)

  network.compile(optimizer='adam',loss='categorical_crossentropy',metrics=['accuracy'])
  network.summary()
  return network

In [None]:
model = create_model(vocab_size+1,25)

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 25, 25)            67950     
_________________________________________________________________
lstm (LSTM)                  (None, 25, 75)            30300     
_________________________________________________________________
lstm_1 (LSTM)                (None, 50)                25200     
_________________________________________________________________
dense (Dense)                (None, 50)                2550      
_________________________________________________________________
dense_1 (Dense)              (None, 2718)              138618    
Total params: 264,618
Trainable params: 264,618
Non-trainable params: 0
_________________________________________________________________


In [None]:
from pickle import dump,load

In [None]:
model.fit(X,y,batch_size=128,epochs=500,verbose=0) #put verbose=1 to view progress

<tensorflow.python.keras.callbacks.History at 0x7f89681bf1d0>

In [None]:
model.save('book_model_2.h5')

In [None]:
# Use a context manager so the pickle file is flushed and closed right away
# instead of leaving the handle open until it is garbage collected.
with open('my_simpletokenizer','wb') as tokenizer_file:
  dump(tokenizer,tokenizer_file)

In [None]:
from keras.preprocessing.sequence import pad_sequences

In [None]:
def generate_text(model,tokenizer,seq_len,seed_text,num_gen_words):
  '''
  Generate num_gen_words new words that continue seed_text.

  model         : trained model whose predict() returns one score per word
                  index for every input row
  tokenizer     : fitted tokenizer providing texts_to_sequences and index_word
  seq_len       : number of word ids the model expects per input row
  seed_text     : space-separated words used as the starting context
  num_gen_words : how many words to generate

  Returns the generated words joined by single spaces (seed_text not included).
  '''
  output_text = []

  input_text = seed_text
  for _ in range(num_gen_words):
    # encode the running text (seed + everything generated so far) as word ids
    encoded_text = tokenizer.texts_to_sequences([input_text])[0]

    # keep only the last seq_len ids; this also does the padding just in case
    # the text sequence is shorter than seq_len words
    pad_encoded = pad_sequences([encoded_text],maxlen=seq_len,truncating='pre')

    # predict_classes was removed from recent TensorFlow/Keras releases; the
    # argmax over the predicted class probabilities gives the same word index
    pred_probs = model.predict(pad_encoded,verbose=0)
    pred_word_ind = int(np.argmax(pred_probs,axis=-1)[0])

    pred_word = tokenizer.index_word[pred_word_ind]

    # feed the prediction back in as context for the next step
    input_text += ' '+pred_word

    output_text.append(pred_word)

  return ' '.join(output_text)

In [None]:
import random
random.seed(101)
# randrange excludes its upper bound, so the result is always a valid index;
# randint(0, len(...)) could return len(...) itself and raise an IndexError
# when used to index text_sequences.
random_pick = random.randrange(len(text_sequences))

In [None]:
random_seed_text = text_sequences[random_pick]

In [None]:
seed_text = ' '.join(random_seed_text)
seed_text

"thought i to myself the man 's a human being just as i am he has just as much reason to fear me as i have"

In [None]:
num_of_words_to_generate = 20

In [None]:
seed_text +' '+ generate_text(model,tokenizer,25,seed_text=seed_text,num_gen_words=num_of_words_to_generate)

"thought i to myself the man 's a human being just as i am he has just as much reason to fear me as i have to be afraid of him that there as if it were complied of mortal once nantucket save the frost whom"

In [None]:
# THE END