# Next word prediction with long short-term memory

In [43]:
# import necessary libraries #
# tensorflow imports
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense,Input
# text preprocessing imports 
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
# regular expression for text preprocessing normalization
import regex as re
# imports for exploratory data analysis and loading dataset
import numpy as np

In [30]:
# load the text dataset
file_path= "C:\\recommender systems\\pizza.txt"
with open(file_path, 'r') as file:
    text = file.read()

# text preprocessing

In [34]:
# remove delimiters like('.', '?', and '!') from the sentences using regex
# use a list comprehension to extract all the sentences
sentences = [sentence.strip() for sentence in re.split(
        r'(?<=[.!?])\s+', text) if sentence.strip()]

In [52]:
# tokenize the text data
tokenizer = Tokenizer(num_words= 686)
tokenizer.fit_on_texts(sentences)
total_words = len(tokenizer.word_index)

In [53]:
# Create input sequences
input_seq = []
for line in sentences:
    tokens = tokenizer.texts_to_sequences([line])[0]
    for word in range(1, len(tokens)):
        n_gram_seq = tokens[:word+1]
        input_seq.append(n_gram_seq)

In [54]:
# Pad sequences and split into predictors and label
max_sequence_len = max([len(seq) for seq in input_seq])
input_seq = np.array(pad_sequences(
    input_seq, maxlen=max_sequence_len, padding='post'))
X, y = input_seq[:, :-1], input_seq[:, -1]


In [55]:
# Convert target data to one-hot encoding
y = tf.keras.utils.to_categorical(y, num_classes=total_words)

# model preparation and training

In [56]:
# define the model
# use an LSTM (Long Short-Term Memory) layer with 8 units
#A Dense layer with softmax activation
# set vocab_size and embedding dimension
vocab_size = 686  
embedding_dim = 128
# develop the mdoel
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim))
model.add(LSTM(8))
model.add(Dense(total_words, activation='softmax'))

In [57]:
# get a summary of the model
model.summary()

In [58]:
# compile the model
model.compile(loss='categorical_crossentropy',
              optimizer='adam', metrics=['accuracy'])

In [59]:
# train the model
model.fit(X, y, epochs=1)

[1m51/51[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 27ms/step - accuracy: 0.4976 - loss: 6.2646


<keras.src.callbacks.history.History at 0x1f8c165a9d0>

In [65]:
# generate predictions for the next word
text = "Pizza have different "
next_word = 1
# Assume seed_text is initialized and you are generating predictions
for _ in range(next_word):
    tokens = tokenizer.texts_to_sequences([text])[0]  # Generate token list
    tokens = pad_sequences([tokens], maxlen=max_sequence_len-1, padding='post')
    predicted_probs = model.predict(tokens)
    predicted_index = np.argmax(predicted_probs)
    
    if predicted_index > 0:  # Check to avoid the padding index
        predicted_word = tokenizer.index_word[predicted_index]
        text += " " + predicted_word
    else:
        print("Predicted index is padding")

print(text)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 89ms/step
Predicted index is padding
Pizza have different 
