In [1]:
import numpy as np
import pandas as pd
from keras.layers import Dense, LSTM, Flatten, Dropout, RNN, Embedding
from keras.models import Sequential
#from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences

In [2]:
#read in text

text = (open("wonderland.txt", encoding="utf8").read()).lower()

#print(text)

In [3]:
#text tokenization 
from tensorflow.keras.preprocessing.text import Tokenizer, text_to_word_sequence

res = text_to_word_sequence(text)


words = set(res)
vocab_size = len(words)

print(vocab_size)
#print(res)

3037


In [4]:
#shape data into input/output ngrams
x=[]
y=[]
sequence_len = 4

for word in range(len(res)-sequence_len):
    x.append(res[word:word+sequence_len])
    y.append(res[word+sequence_len])
    
#print(x)
#print(y)

In [5]:
#text/character mapping using one hot encoding or vectorization
from keras.layers import TextVectorization

vectorize_layer = TextVectorization(max_tokens=vocab_size,
                                    split='whitespace',
                                    output_mode='int',
                                    output_sequence_length=sequence_len)

vectorize_layer.adapt(res) 

#print(vectorize_layer.vocabulary_size())
#print(vectorize_layer.get_vocabulary())

In [6]:
x_encoded = []

for seq in x:
    #convert each sequence represesented as a list into a string
    seq = " ".join(seq)
    
    encoded = vectorize_layer(seq)
    #print(seq)
    #print(encoded)
    
    x_encoded.append(encoded)

In [7]:
#one hot encode target
from keras.utils import to_categorical

y_modified = []

for word in y:
    try:
        y_modified.append(vectorize_layer.get_vocabulary().index(word))
    except Exception as e:
        y_modified.append(vectorize_layer.get_vocabulary().index("[UNK]"))

y_modified = to_categorical(y_modified)

print(len(y_modified[1]))
print(len(y_modified))
print(len(x))

"""
#manually map words to int
word_to_int_map = {word:ind for ind,word in enumerate(words)}
int_to_word_map = {ind:word for ind,word in enumerate(words)}

y_modified = []

for word in y:
    y_modified.append(word_to_int_map[word])

y_modified = to_categorical(y_modified)

print(len(y_modified[1]))
print(len(y_modified))
print(len(x))
"""

3037
27571
27571


'\n#manually map words to int\nword_to_int_map = {word:ind for ind,word in enumerate(words)}\nint_to_word_map = {ind:word for ind,word in enumerate(words)}\n\ny_modified = []\n\nfor word in y:\n    y_modified.append(word_to_int_map[word])\n\ny_modified = to_categorical(y_modified)\n\nprint(len(y_modified[1]))\nprint(len(y_modified))\nprint(len(x))\n'

In [8]:
#create embedding layer
embedding_dim = 10
embedding_layer = Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=4)



In [9]:
#create model

#convert list of tensorflow objects to numpy array 
x_encoded = np.array(x_encoded)

#embedding_layer(x_encoded)

model = Sequential()

model.add(embedding_layer)

model.add(LSTM(100, input_shape=(x_encoded.shape[1], embedding_dim), return_sequences=True))

model.add(Dropout(0.2))

model.add(LSTM(100, return_sequences=True))

model.add(Dropout(0.2))

model.add(LSTM(100))

model.add(Dropout(0.2))

model.add(Dense(y_modified.shape[1], activation="softmax"))

model.compile(loss="categorical_crossentropy", optimizer="adam")

model.fit(x_encoded, y_modified, epochs=10, batch_size=10)

  super().__init__(**kwargs)


Epoch 1/10
[1m2758/2758[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m51s[0m 15ms/step - loss: 6.5063
Epoch 2/10
[1m2758/2758[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 15ms/step - loss: 6.0051
Epoch 3/10
[1m2758/2758[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m84s[0m 16ms/step - loss: 5.7915
Epoch 4/10
[1m2758/2758[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m79s[0m 15ms/step - loss: 5.5376
Epoch 5/10
[1m2758/2758[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 12ms/step - loss: 5.3789
Epoch 6/10
[1m2758/2758[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 14ms/step - loss: 5.1923
Epoch 7/10
[1m2758/2758[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 14ms/step - loss: 5.0170
Epoch 8/10
[1m2758/2758[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 14ms/step - loss: 4.8806
Epoch 9/10
[1m2758/2758[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 14ms/step - loss: 4.7521
Epoch 10/10
[1m2758/2758[0m [32m━━━━━━━━━━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x17b2b3612d0>

In [14]:
#test by generating random text
import random 

test_len=5

generated = []

#generate random seed to begin testing
x = np.array(random.choice([num for num,text in enumerate(vectorize_layer.get_vocabulary())])).reshape(-1, 1)

for word in range(test_len):
    #reshape prediction so model excepts a single input value
    x_padded = pad_sequences(x, maxlen=4)
    
    preds = list(model.predict(x_padded, verbose=0)[0])
    
    index = preds.index(max(preds))
    
    #retreive predicted word
    generated.append(vectorize_layer.get_vocabulary()[index])
    
    #re-initialize next word that goes into the model
    x=index
    x=np.array(index).reshape(-1,1)   
    
#print generated text
print(generated)

['use', 'to', 'marmalade”', 'of', 'a']
