<a href="https://colab.research.google.com/github/ferrari-leo/training/blob/main/nlp_with_python/06_deep_learning/LSTM_text_generation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [42]:
!pip install keras-preprocessing

Collecting keras-preprocessing
  Downloading Keras_Preprocessing-1.1.2-py2.py3-none-any.whl (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.6/42.6 kB[0m [31m657.1 kB/s[0m eta [36m0:00:00[0m
Installing collected packages: keras-preprocessing
Successfully installed keras-preprocessing-1.1.2


In [43]:
import spacy
import os
import random
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical, pad_sequences
from keras.models import Sequential, load_model
from keras.layers import Dense, LSTM, Embedding
from pickle import dump, load

In [2]:
# Google Drive folder used for the saved model and tokenizer files.
# NOTE(review): assumes Drive is already mounted at /content/drive; no mount
# call is visible in this notebook.
folder_path = '/content/drive/MyDrive/NLP Udemy'

Process, clean, and tokenize text

In [3]:
def read_file(filepath, encoding='utf-8'):
  """Return the entire contents of a text file as one string.

  Args:
    filepath: Path of the file to read.
    encoding: Text encoding used to decode the file. Defaults to UTF-8 so the
      result does not depend on the platform's locale encoding.

  Returns:
    The decoded file contents.
  """
  with open(filepath, encoding=encoding) as f:
    return f.read()

In [4]:
def separate_punc(doc_text):
  """Tokenize `doc_text` with the global spaCy pipeline `nlp` and return the
  lower-cased token texts, dropping punctuation and whitespace tokens.

  A token is dropped when every one of its characters belongs to the
  punctuation/whitespace set below. The previous substring test
  (`token.text not in '<string>'`) only removed tokens that happened to appear
  verbatim inside the string, so runs such as '...' or '!!' slipped through.

  Args:
    doc_text: Raw text to tokenize.

  Returns:
    A list of lower-cased token strings.
  """
  # Same characters as the original filter string, used as a set of characters.
  drop_chars = set('\n\n \n\n\n!"-#$%&()--.*+,-/:;<=>?@[\\]^_`{|}~\t\n ')
  return [
      token.text.lower()
      for token in nlp(doc_text)
      # all() is True for an empty string, so empty tokens are still dropped.
      if not all(ch in drop_chars for ch in token.text)
  ]

In [5]:
# Load spaCy's small English pipeline with the parser, tagger and NER
# components disabled; separate_punc only reads each token's text.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'tagger', 'ner'])

In [6]:
# Set the pipeline's max_length attribute before passing the corpus to nlp().
# NOTE(review): 1198623 is a magic number — presumably the character count of
# the full novel rather than the four-chapter excerpt; confirm.
nlp.max_length = 1198623

In [7]:
# Build the corpus path from folder_path (as the save/load cells do) so the
# Drive location is configured in exactly one place.
d = read_file(os.path.join(folder_path, 'moby_dick_four_chapters.txt'))

In [8]:
# Turn the raw corpus into a flat list of lower-cased tokens.
tokens = separate_punc(d)



In [9]:
# Number of tokens kept after cleaning.
len(tokens)

11338

In [10]:
# 25 words --> network predicts word 26

In [11]:
train_len = 25 + 1  # 25 context words + the 1 word the network must predict

# Slide a window of train_len tokens over the corpus, one token at a time.
# The range's upper bound is len(tokens) + 1 so that the final window, the one
# ending on the last token, is included (range(train_len, len(tokens)) stops
# one window short).
text_sequences = [
    tokens[end - train_len:end]
    for end in range(train_len, len(tokens) + 1)
]

In [12]:
# Fit the word -> integer index vocabulary on the token windows.
# NOTE: the windows overlap, so word_counts tallies a corpus token once per
# window it appears in, not once per occurrence in the text.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text_sequences)

In [13]:
# Convert every token window into the corresponding list of word indices.
sequences = tokenizer.texts_to_sequences(text_sequences)

In [14]:
# Show how each integer index in the first window maps back to its word
# (printed as "index : word", one pair per line).
for i in sequences[0]:
  print(f'{i} : {tokenizer.index_word[i]}')

956 : call
14 : me
263 : ishmael
51 : some
261 : years
408 : ago
87 : never
219 : mind
129 : how
111 : long
954 : precisely
260 : having
50 : little
43 : or
38 : no
314 : money
7 : in
23 : my
546 : purse
3 : and
150 : nothing
259 : particular
6 : to
2713 : interest
14 : me
24 : on


In [15]:
# Frequency table collected by fit_on_texts. The tokenizer was fitted on
# overlapping windows, so these counts are larger than the number of times
# each word actually occurs in the corpus.
tokenizer.word_counts

OrderedDict([('call', 27),
             ('me', 2471),
             ('ishmael', 133),
             ('some', 758),
             ('years', 135),
             ('ago', 84),
             ('never', 449),
             ('mind', 164),
             ('how', 321),
             ('long', 374),
             ('precisely', 37),
             ('having', 142),
             ('little', 767),
             ('or', 950),
             ('no', 1003),
             ('money', 120),
             ('in', 5647),
             ('my', 1786),
             ('purse', 71),
             ('and', 9646),
             ('nothing', 281),
             ('particular', 152),
             ('to', 6497),
             ('interest', 24),
             ('on', 1716),
             ('shore', 26),
             ('i', 7150),
             ('thought', 676),
             ('would', 702),
             ('sail', 104),
             ('about', 1014),
             ('a', 10377),
             ('see', 416),
             ('the', 15540),
             ('watery', 26),
  

In [16]:
# Number of distinct words the tokenizer has seen.
vocabulary_size = len(tokenizer.word_counts)
vocabulary_size

2718

In [17]:
# Convert the list of index windows to a NumPy array so it can be sliced
# column-wise into inputs and targets.
sequences = np.array(sequences)

Split data into inputs and targets, create the model, and fit it

In [18]:
# Inputs: every column of each window except the last one.
X = sequences[:,:-1]
# Targets: the last column of each window, i.e. the word to predict.
y = sequences[:,-1]

In [19]:
# One-hot encode the targets. The width is vocabulary_size + 1 because Keras'
# Tokenizer reserves index 0 and numbers words from 1, so the largest word
# index equals vocabulary_size.
y = to_categorical(y, num_classes=vocabulary_size+1)

In [20]:
# Number of word indices per input sample (columns of X).
seq_len = X.shape[1]

In [21]:
def create_model(vocabulary_size, seq_len):
  """Build, compile and summarize the stacked-LSTM next-word classifier.

  Architecture: Embedding -> LSTM (returning sequences) -> LSTM -> Dense(relu)
  -> Dense(softmax).

  Args:
    vocabulary_size: Size of the embedding's input dimension and of the
      softmax output, i.e. the number of classes. It must be larger than the
      highest word index fed to the model.
    seq_len: Number of word indices per input sample. It is also used as the
      embedding width, and 2 * seq_len as the LSTM/Dense width.

  Returns:
    The compiled `Sequential` model. `model.summary()` is printed as a side
    effect.
  """
  hidden_units = 2 * seq_len

  model = Sequential()
  model.add(Embedding(vocabulary_size, seq_len, input_length=seq_len))
  # The first LSTM returns the full sequence so the second LSTM can consume it.
  model.add(LSTM(hidden_units, return_sequences=True))
  model.add(LSTM(hidden_units))
  model.add(Dense(hidden_units, activation='relu'))
  model.add(Dense(vocabulary_size, activation='softmax'))

  model.compile(
      loss='categorical_crossentropy',
      optimizer='adam',
      # `metrics` is documented as a list; a bare string is not accepted by
      # every Keras version.
      metrics=['accuracy']
      )

  model.summary()

  return model

In [22]:
# Pass vocabulary_size+1 so the embedding and softmax layers have the same
# number of classes as the one-hot encoded targets in y.
model = create_model(vocabulary_size+1, seq_len)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 25, 25)            67975     
                                                                 
 lstm (LSTM)                 (None, 25, 50)            15200     
                                                                 
 lstm_1 (LSTM)               (None, 50)                20200     
                                                                 
 dense (Dense)               (None, 50)                2550      
                                                                 
 dense_1 (Dense)             (None, 2719)              138669    
                                                                 
Total params: 244,594
Trainable params: 244,594
Non-trainable params: 0
_________________________________________________________________


In [23]:
# Train on all windows for 10 epochs in batches of 128; no validation data is
# passed.
model.fit(X,y,batch_size=128,epochs=10,verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7a6de349a5c0>

In [24]:
# Persist the trained model and the fitted tokenizer in the Drive folder.
model.save(os.path.join(folder_path,'mobydick_lstm.h5'))
# A context manager guarantees the pickle file is flushed and closed; passing
# a bare open(...) to dump() leaves the handle open.
with open(os.path.join(folder_path,'mobydick_tokenizer'), 'wb') as tokenizer_file:
  dump(tokenizer, tokenizer_file)

In [37]:
def generate_text(model, tokenizer, seq_len, seed_text, num_gen_words):
  """Greedily generate words that continue `seed_text`.

  On every step the running text is encoded with `tokenizer`, cut or
  left-padded to `seq_len` indices, and fed to `model`; the most probable word
  is appended to the running text and to the result.

  Args:
    model: Object whose `predict` returns one row of class probabilities per
      input row.
    tokenizer: Fitted tokenizer providing `texts_to_sequences` and
      `index_word`.
    seq_len: Number of word indices the model expects per sample.
    seed_text: Space-separated text that starts the generation.
    num_gen_words: Number of words to generate.

  Returns:
    The generated words joined by single spaces (the seed is not included).
  """
  output_text = []
  input_text = seed_text

  for _ in range(num_gen_words):

    encoded_text = tokenizer.texts_to_sequences([input_text])[0]
    # truncating='pre' keeps the most recent seq_len indices.
    pad_encoded = pad_sequences([encoded_text], maxlen = seq_len, truncating='pre')

    # Probabilities for the single sample in the batch.
    probs = model.predict(pad_encoded,verbose=0)[0]
    # Class 0 is the padding index and has no entry in tokenizer.index_word,
    # so it is excluded from the argmax; picking it would raise KeyError.
    pred_word_ind = int(probs[1:].argmax()) + 1
    pred_word = tokenizer.index_word[pred_word_ind]

    input_text += ' '+pred_word
    output_text.append(pred_word)

  return ' '.join(output_text)

In [31]:
random.seed(101)
# random.randint includes both endpoints, so the upper bound must be the last
# valid index; drawing len(text_sequences) itself would raise IndexError when
# used to index text_sequences.
random_pick = random.randint(0,len(text_sequences) - 1)

In [32]:
# The randomly selected token window that will seed the generation.
random_seed_text = text_sequences[random_pick]

In [46]:
# Join the window's tokens into one space-separated string, the form that
# generate_text takes as seed_text.
seed_text = ' '.join(random_seed_text)

In [47]:
# Display the seed text.
seed_text

"thought i to myself the man 's a human being just as i am he has just as much reason to fear me as i have"

In [38]:
# Generate 25 words with the model trained above for 10 epochs.
generate_text(model, tokenizer, seq_len, seed_text = seed_text, num_gen_words=25)

'the room of the room of the room of the room of the room of the room of the room of the room of the'

In [40]:
# Replace `model` with a previously saved model from the Drive folder.
# NOTE(review): presumably trained for many more epochs than the 10 used
# above, and on windows of the same seq_len — confirm.
model = load_model(os.path.join(folder_path,'epochBIG.h5'))

In [44]:
# Load the tokenizer that belongs to the saved model. The context manager
# closes the file handle, which a bare open(...) passed to load() never does.
# NOTE: pickle can execute arbitrary code while loading; only unpickle files
# you created yourself.
with open(os.path.join(folder_path,'epochBIG'), 'rb') as tokenizer_file:
  tokenizer = load(tokenizer_file)

In [45]:
# Generate 25 words again, now with the loaded model and tokenizer; seq_len is
# still the value computed from this notebook's X.
generate_text(model, tokenizer, seq_len, seed_text = seed_text, num_gen_words=25)

"to be seen there was no bad olfactories my own letter was cheerily listening over his hearers who 's more can go have a wearing"