<a href="https://colab.research.google.com/github/gowtamyreddy/NLP/blob/main/LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
import numpy as np

In [4]:
#Function to load dataset
def load_data(file_path):
    with open(file_path, 'r',encoding='utf-8') as f:
      text = f.read()
    return text

#Load Harry Potter book text
file_path ='/content/sample_data/01 Harry Potter and the Sorcerers Stone.txt'
text = load_data(file_path).lower()

#Tokenize the text
tokenizer = Tokenizer(oov_token='<oov>')
tokenizer.fit_on_texts([text])
total_words = len(tokenizer.word_index) + 1

#Convert text into Sequences
input_sequences = []
tokens = tokenizer.texts_to_sequences([text])[0]
seq_length = 100

for i in range(seq_length, len(tokens)):
    input_sequences.append(tokens[i-seq_length:i+1])

#Padding sequences and split into iput and labels(x and y)
input_sequences = np.array(pad_sequences(input_sequences, maxlen=seq_length+1))
x, y = input_sequences[:,:-1],input_sequences[:,-1]
y = tf.keras.utils.to_categorical(y, num_classes=total_words)#one hot encoding



In [5]:
#LSTM Model
model = Sequential([
    Embedding(input_dim=total_words, output_dim=100, input_length=seq_length),
    LSTM(256, return_sequences=True),
    LSTM(256),
    Dense(total_words,activation='softmax')

])

#Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

#Train the model
model.fit(x, y, epochs=20, batch_size = 128)






Epoch 1/20
[1m633/633[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1260s[0m 2s/step - accuracy: 0.0444 - loss: 7.0586
Epoch 2/20
[1m633/633[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1271s[0m 2s/step - accuracy: 0.0526 - loss: 6.3805
Epoch 3/20
[1m633/633[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1277s[0m 2s/step - accuracy: 0.0784 - loss: 6.0739
Epoch 4/20
[1m633/633[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1287s[0m 2s/step - accuracy: 0.1023 - loss: 5.7593
Epoch 5/20
[1m633/633[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1276s[0m 2s/step - accuracy: 0.1130 - loss: 5.5373
Epoch 6/20
[1m633/633[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1275s[0m 2s/step - accuracy: 0.1179 - loss: 5.3834
Epoch 7/20
[1m633/633[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1229s[0m 2s/step - accuracy: 0.1295 - loss: 5.2791
Epoch 8/20
[1m633/633[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1291s[0m 2s/step - accuracy: 0.1424 - loss: 4.9896
Epoch 9/20
[1m633/633[

<keras.src.callbacks.history.History at 0x7ed492989590>

In [15]:
#Function to generate Text
def generate_text(seed_text, next_words=40,temperature = 1.0):
  for _ in range(next_words):
    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    token_list = pad_sequences([token_list],maxlen = seq_length,padding = 'pre')

    predicted_probs = model.predict(token_list,verbose = 0)[0]
    predicted_probs = np.asarray(predicted_probs)/temperature #Adjust parameters
    predicted_probs = np.exp(predicted_probs)/np.sum(np.exp(predicted_probs))
    predicted_index = np.random.choice(range(len(predicted_probs)),p=predicted_probs)

    output_word = tokenizer.index_word.get(predicted_index, "")
    seed_text += " "+ output_word

  return seed_text

#Generate text
print(generate_text("harry at hogwarts",next_words = 50, temperature = 0.7))

harry at hogwarts narrowly fantastic touching…” weasleys cheers lunchtime possible below doom reflection er mist gasps favorite apart won’t poison report worked mahogany breathe “charlie master snowball ancient postcard dreadlocks long delivered lunged captain likes dormitories involved countercurses thumpin’ quaffle wander cobbled stutter “anyone “blown smiled supply parents…they seamus’s cloth archway peeves’s swearing
