<a href="https://colab.research.google.com/github/g-e-mm/LSTM_Predictor/blob/main/LSTM_Next_Line_Predictor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Approach to the problem

1. Load the libraries and data
2. Clean the data - remove commas, periods etc.
3. Tokenize the data
4. Convert to sequence
5. Input sequence and Output sequence
6. Create sequential model
7. LSTM Layers
8. Compile and fit the model
9. Evaluate using inputs

# Load the libraries and data

Data source: https://raw.githubusercontent.com/insaid2018/DeepLearning/master/Data/republic_clean.txt?_sm_au_=iVV10f0f2kPt2J07


In [None]:
import string
import urllib
import urllib.request
from pickle import load
from random import randint

from numpy import array
from tensorflow.keras.layers import Dense, LSTM, GRU, Embedding, Dropout
from tensorflow.keras.models import load_model
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
# Kept after the tensorflow.keras imports: this line rebinds Dropout, as it did originally.
from keras.layers import Bidirectional, Dropout

Loading data using urllib

In [None]:
# Download the corpus text. The response is used as a context manager so the
# connection is closed even if reading or decoding raises.
with urllib.request.urlopen('https://raw.githubusercontent.com/insaid2018/DeepLearning/master/Data/republic_clean.txt?_sm_au_=iVV10f0f2kPt2J07') as response:
    doc = response.read().decode('utf8')

In [None]:
# Preview the first 200 characters of the downloaded text.
print(doc[:200])

# Cleaning the Data

In [None]:
def clean_doc(doc):
    """Convert raw text into a list of lower-case, purely alphabetic words.

    Args:
        doc: The text to clean, as a single string.

    Returns:
        A list of words in their original order. Punctuation is stripped
        from every word, and any word that is not entirely alphabetic after
        stripping (including words that become empty) is dropped.
    """
    # Treat '--' as a word separator before splitting on whitespace.
    raw_words = doc.replace('--', ' ').split()
    # Translation table that deletes every ASCII punctuation character.
    strip_punctuation = str.maketrans('', '', string.punctuation)
    stripped_words = (raw.translate(strip_punctuation) for raw in raw_words)
    # Filter on the stripped word first, then lower-case the survivors.
    return [kept.lower() for kept in stripped_words if kept.isalpha()]

In [None]:
# Run the cleaning step on the downloaded text to get the list of words.
tokens = clean_doc(doc)

In [None]:
# Sanity check of the cleaned corpus: a small sample, the total number of
# words, and the number of distinct words.
print("the first five tokens are: ",tokens[:5])
print("Total no. of tokens: ",len(tokens))
print("Total no. of unique tokens: ",len(set(tokens)))

In [None]:
# Spot check: print how often the word 'book' occurs.
# Nothing is printed (and `count` is not assigned) when the word never occurs.
if 'book' in tokens:
    count= tokens.count('book')
    print(count)

# Tokenizing and Sequencing the data

In [None]:
# Build overlapping lines of `length` words each; consecutive lines are
# shifted by one token.
length = 50 + 1
# The range ends at len(tokens) + 1 so that the final window, the one ending
# on the very last token, is included too (it was previously left out).
sequence = [' '.join(tokens[end - length:end]) for end in range(length, len(tokens) + 1)]
print(len(sequence))
print(sequence[:10])

Save the sequences to a file as a backup, then load them back from that file.

In [None]:
#save lines of text to a file - one entry per line
def save_doc(lines,filename):
    """Write `lines` to `filename`, joined by newline characters.

    Args:
        lines: Iterable of strings; each one becomes a line of the file.
        filename: Path of the file to write. Existing content is replaced.

    No trailing newline is written after the last entry.
    """
    data = '\n'.join(lines)
    # 'with' closes the handle even if the write raises.
    with open(filename,'w') as file:
        file.write(data)

#save sequences to file (written to the current working directory)
out_filename = "rep_sequences.txt"
save_doc(sequence,out_filename)

In [None]:
#load doc into memory
def load_doc(filename):
    """Return the entire content of the text file `filename` as one string.

    Args:
        filename: Path of the file to read.

    Returns:
        The file content as a single string.
    """
    # 'with' closes the handle even if the read raises.
    with open(filename,'r') as file:
        return file.read()

#load the saved sequences back and split them into one string per line
# NOTE(review): this rebinds `doc`, which until now held the raw downloaded
# text, so re-running the cleaning cell afterwards would clean the sequence
# file instead - consider a separate name.
in_filename = 'rep_sequences.txt'
doc = load_doc(in_filename)
lines = doc.split('\n')

Tokenize the lines and convert them to integer sequences.

In [None]:
# Fit the vocabulary on the text lines, then encode every line as a list of
# integer word indices.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(lines)
sequence = tokenizer.texts_to_sequences(lines)
# NOTE(review): the lines come from fixed-size windows, so padding is
# presumably a no-op apart from returning an array - confirm.
sequence = pad_sequences(sequence)

In [None]:
# Display the tokenizer's word -> integer index mapping.
# NOTE(review): this shows the whole vocabulary; consider displaying only a few entries.
tokenizer.word_index

In [None]:
# Convert the encoded sequences with numpy's array() so they can be sliced by column below.
sequences = array(sequence)

In [None]:
# Display the encoded sequences array.
sequences

Split each sequence into the input `x` (every word except the last) and the target `y` (the last word).

In [None]:
# Inputs are all columns but the last; the target is the last column of each row.
x,y = sequences[:,:-1],sequences[:,-1]
print(x.shape)
print(y.shape)

# Creating the sequential model

In [None]:
# Vocabulary size used by the embedding and output layers: number of known words plus one.
# NOTE(review): the +1 presumably leaves room for index 0, which the tokenizer
# does not appear to assign to any word - confirm against the Keras docs.
vocab_size = len(tokenizer.word_index)+1
print(vocab_size)

In [None]:
# One-hot encode the target word. The encoding is computed from the integer
# target column of `sequences` rather than from `y` itself, so re-running this
# cell cannot one-hot encode an array that is already one-hot encoded.
y = to_categorical(sequences[:, -1], num_classes=vocab_size)
y.shape

In [None]:
# Layer stack: embedding -> dropout -> LSTM -> dense -> dropout -> LSTM -> dense -> dropout -> softmax.
model = Sequential()
# Embed each word index as a 10-dimensional vector; inputs are 50 words long.
model.add(Embedding(vocab_size,10,input_length=50))

# First Dropout layer to reduce overfitting
model.add(Dropout(0.2))

# First LSTM layer; return_sequences=True keeps an output for every time step
# so that the second LSTM layer below still receives a sequence
model.add(LSTM(100, return_sequences=True))

# First Dense layer with ReLU activation for non-linearity
model.add(Dense(100, activation='relu'))

# Second Dropout layer to reduce overfitting
model.add(Dropout(0.2))

# Second LSTM layer; without return_sequences it emits a single output per sample
model.add(LSTM(100))

# Second Dense layer with ReLU activation for non-linearity
model.add(Dense(100, activation='relu'))

# Third Dropout layer to reduce overfitting
model.add(Dropout(0.2))

# Output layer: one softmax probability per vocabulary entry for the next word
model.add(Dense(vocab_size, activation='softmax'))

# summary() prints the table itself and returns None, so wrapping it in
# print() only added a stray "None" line to the output.
model.summary()

In [None]:
# Configure training: categorical cross-entropy loss (targets are one-hot encoded),
# the Adam optimizer, and accuracy reported as a metric.
model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])

In [None]:
# Train for 100 epochs in batches of 256 samples; the object returned by fit() is kept in `hist`.
# NOTE(review): no random seed is set anywhere in the notebook, so results will presumably vary between runs.
hist = model.fit(x,y,batch_size=256,epochs=100)

In [None]:
# generate a sequence from a language model
def generate_seq(model, tokenizer, max_length, seed_text, n_words):
    """Extend `seed_text` by `n_words` words predicted greedily by `model`.

    Args:
        model: Object whose `predict(encoded, verbose=0)` returns one score
            per vocabulary index for the encoded input.
        tokenizer: Object providing `texts_to_sequences` and a `word_index`
            mapping of word -> integer index.
        max_length: Length the encoded text is padded or truncated to
            (keeping the most recent words) before prediction.
        seed_text: Text to start from.
        n_words: Number of words to append.

    Returns:
        The seed text followed by the generated words, separated by single
        spaces. When the predicted index has no word in the vocabulary, an
        empty word is appended (i.e. only the separating space is added).
    """
    # Build the index -> word lookup once, instead of scanning the whole
    # vocabulary (and recomputing argmax) for every generated word.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}
    in_text = seed_text
    # generate a fixed number of words
    for _ in range(n_words):
        # encode the text as integers
        encoded = tokenizer.texts_to_sequences([in_text])[0]
        # pad / truncate at the front so only the latest max_length words are kept
        encoded = pad_sequences([encoded], maxlen=max_length, truncating='pre')
        # predict a score for each vocabulary index
        yhat = model.predict(encoded, verbose=0)
        # greedy choice: index with the highest score
        predicted_index = int(yhat.argmax())
        out_word = index_to_word.get(predicted_index, '')
        # append to input so the next prediction sees the new word
        in_text += ' ' + out_word
    return in_text

# evaluate model: generate 10 words after the seed sentence, with the input
# padded/truncated to 50 words
print(generate_seq(model, tokenizer, 50, 'I went down yesterday to the Piraeus with Glaucon the son of Ariston,', 10))


the expected output:

I went down yesterday to the Piraeus with Glaucon the son of Ariston,

that I might offer up my prayers to the goddess (Bendis, the Thracian

Artemis.); <br> as such the

In [None]:
!pip install gradio

In [None]:
import gradio as gr

def generate_text(seed_text):
  """Callback for the web UI: extend `seed_text` by 10 generated words.

  Delegates to generate_seq with the notebook's `model` and `tokenizer`
  and an input length of 50 words.
  """
  return generate_seq(model, tokenizer, max_length=50, seed_text=seed_text, n_words=10)

# Wrap generate_text in a simple web interface: one text box in, generated text out.
iface = gr.Interface(
    fn=generate_text,
    inputs=gr.Textbox(lines=2, placeholder="Enter seed text here..."), # gr.Textbox replaces the older gr.inputs.Textbox
    outputs="text",
    title="Text Generation with LSTM",
    description="Generate text based on a seed text using a trained LSTM model."
)

# Start the interface.
iface.launch()