# Text Generation

In [1]:
def read_file(filepath):
    with open(filepath) as f:
        str_text = f.read()
    return str_text

In [3]:
import spacy

2023-12-28 19:49:05.087927: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [13]:
nlp = spacy.load('en_core_web_lg', disable=['parser', 'lemmatizer', 'tagger', 'ner'])

In [14]:
nlp.max_length = 1198623

In [15]:
def separate_punc(doc_text):
    return [token.text.lower() for token in nlp(doc_text) if token.text not in '\n\n \n\n\n!"-#$%&()--.*+,-/:;<=>?@[\\]^_`{|}~\t\n ']

In [16]:
d = read_file('../06-Deep-Learning/moby_dick_four_chapters.txt')

In [17]:
tokens = separate_punc(d)

In [19]:
len(tokens)

11338

In [20]:
# passing 25 words --> network predicts #26

In [21]:
train_len = 25 + 1

In [22]:
text_sequences = []

for i in range(train_len, len(tokens)):
    seq = tokens[i - train_len: i]

    text_sequences.append(seq)

In [23]:
from keras.preprocessing.text import Tokenizer

In [24]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text_sequences)

In [25]:
sequences = tokenizer.texts_to_sequences(text_sequences)

In [27]:
vocabulary_size = len(tokenizer.word_counts)

In [28]:
vocabulary_size

2718

In [29]:
import numpy as np

In [30]:
sequences = np.array(sequences)
sequences.shape

(11312, 26)

In [31]:
from keras.utils import to_categorical

In [34]:
X = sequences[:, :-1]

In [35]:
y = sequences[:,-1]

In [36]:
y = to_categorical(y, num_classes=vocabulary_size + 1)

In [37]:
seq_len = X.shape[1]

In [38]:
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding

In [39]:
def create_model(vocab_size, seq_len):
    model = Sequential()
    model.add(Embedding(vocab_size, seq_len, input_length=seq_len))
    model.add(LSTM(50, return_sequences=True))
    model.add(LSTM(50))
    model.add(Dense(50, activation='relu'))

    model.add(Dense(vocab_size, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    model.summary()

    return model

In [43]:
model = create_model(vocabulary_size+1, seq_len)

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 25, 25)            67975     
                                                                 
 lstm_2 (LSTM)               (None, 25, 50)            15200     
                                                                 
 lstm_3 (LSTM)               (None, 50)                20200     
                                                                 
 dense_2 (Dense)             (None, 50)                2550      
                                                                 
 dense_3 (Dense)             (None, 2719)              138669    
                                                                 
Total params: 244,594
Trainable params: 244,594
Non-trainable params: 0
_________________________________________________________________


In [41]:
from pickle import dump, load

In [45]:
model.fit(X, y, batch_size=128, epochs=100, verbose=1)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.callbacks.History at 0x7fdbdae7c8b0>

In [47]:
model.save('my_moby_dick_model.h5')

In [46]:
dump(tokenizer, open('my_simple_tokenizer', 'wb'))

In [49]:
from keras_preprocessing.sequence import pad_sequences

In [None]:
model.predict

In [61]:
def generate_text(model, tokenizer, seq_len, seed_text, num_gen_words):
    output_text = []
    input_text = seed_text
    for i in range(num_gen_words):
        encoded_text = tokenizer.texts_to_sequences([input_text])[0]
        pad_encoded = pad_sequences([encoded_text], maxlen=seq_len, truncating='pre')
        pred_word_ind = np.argmax(model.predict(pad_encoded, verbose=0))
        pred_word = tokenizer.index_word[pred_word_ind]
        input_text += ' ' + pred_word
        output_text.append(pred_word)
    return ' '.join(output_text)

In [51]:
import random
random.seed(101)
random_pick = random.randint(0, len(text_sequences))

In [55]:
random_seed_text = text_sequences[random_pick]
' '.join(random_seed_text)

"thought i to myself the man 's a human being just as i am he has just as much reason to fear me as i have"

In [62]:
output = generate_text(model, tokenizer, seq_len, random_seed_text, 25)
' '.join(output)

'b e e n   n o t   n o t   n o t   n o t   n o t   n o t   n o t   n o t   n o t   n o t   n o t   n o t   n o t   n o t   n o t   n o t   n o t   n o t   n o t   n o t   n o t   n o t   n o t   n o t'