<a href="https://colab.research.google.com/github/chandutr/Deep-Learning-Projects/blob/main/rnn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

In [None]:
# Toy training corpus: five short sentences that share the same opening words,
# so the model has repeated prefixes to learn next-word patterns from.
corpus = [
    "I like to eat pizza",
    "I like to eat pasta",
    "I like to eat Biriyani",
    "I love to eat Burger",
    "I love to play cricket",
]

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer

# Learn the word -> integer-index vocabulary from every sentence in the corpus.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)

In [None]:
# Sanity check: encode the first corpus sentence ("I like to eat pizza") as word indices.
tokenizer.texts_to_sequences(corpus[:1])

[[1, 4, 2, 3, 6]]

In [None]:
# Convert each sentence into input-output pairs: every prefix of length >= 2
# becomes one training sequence whose last token is the word to predict.
sequences = [
    token_ids[:end]
    for token_ids in tokenizer.texts_to_sequences(corpus)
    for end in range(2, len(token_ids) + 1)
]
sequences

[[1, 4],
 [1, 4, 2],
 [1, 4, 2, 3],
 [1, 4, 2, 3, 6],
 [1, 4],
 [1, 4, 2],
 [1, 4, 2, 3],
 [1, 4, 2, 3, 7],
 [1, 4],
 [1, 4, 2],
 [1, 4, 2, 3],
 [1, 4, 2, 3, 8],
 [1, 5],
 [1, 5, 2],
 [1, 5, 2, 3],
 [1, 5, 2, 3, 9],
 [1, 5],
 [1, 5, 2],
 [1, 5, 2, 10],
 [1, 5, 2, 10, 11]]

In [None]:
# Left-pad every sequence with zeros up to the length of the longest one,
# so the label (last token) always sits in the final column.
max_seq_len = len(max(sequences, key=len))
sequences = pad_sequences(
    sequences,
    maxlen=max_seq_len,
    padding='pre',
)
sequences[:5]

array([[0, 0, 0, 1, 4],
       [0, 0, 1, 4, 2],
       [0, 1, 4, 2, 3],
       [1, 4, 2, 3, 6],
       [0, 0, 0, 1, 4]], dtype=int32)

In [None]:
# Length of the longest training sequence, label included; the model input is one token shorter.
max_seq_len

5

In [None]:
# Split each padded row into the context (all but the last token) and the
# label (the last token).
sequences = np.array(sequences)
X = sequences[:, :-1]
y = sequences[:, -1]
X[:5], y[:5]

(array([[0, 0, 0, 1],
        [0, 0, 1, 4],
        [0, 1, 4, 2],
        [1, 4, 2, 3],
        [0, 0, 0, 1]], dtype=int32),
 array([4, 2, 3, 6, 4], dtype=int32))

In [None]:
#one hot encode the labels
# Tokenizer indices start at 1, so one extra class is reserved for the padding index 0.
vocab_size=len(tokenizer.word_index)
# Encode straight from the label column of `sequences` instead of from `y`
# itself: re-running this cell then yields the same (samples, classes) array
# rather than one-hot-encoding labels that are already one-hot.
y=to_categorical(sequences[:,-1],num_classes=vocab_size+1)
y[:5]

array([[0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.]])

In [None]:
# Number of distinct words the tokenizer learned (the padding index 0 is not counted).
vocab_size

11

In [None]:
#build rnn model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding,Input,SimpleRNN,Dense
from tensorflow.keras.utils import set_random_seed

# Seed Python, NumPy and TensorFlow so weight initialisation and training are
# repeatable across "Restart & Run All".
set_random_seed(42)

model=Sequential([
    # Declare the input shape with an explicit Input layer; Embedding's
    # `input_length` argument is deprecated in Keras 3.
    Input(shape=(max_seq_len-1,)),
    # vocab_size+1 rows: word indices start at 1 and index 0 is the padding token.
    Embedding(input_dim=vocab_size+1, output_dim=10),
    SimpleRNN(50),
    # One softmax score per word index (padding index included).
    Dense(vocab_size+1,activation='softmax')
])



In [None]:
# Labels are one-hot vectors, hence categorical cross-entropy.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
model.fit(X, y, epochs=200)

Epoch 1/200
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step - accuracy: 0.0500 - loss: 2.4818
Epoch 2/200
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step - accuracy: 0.3500 - loss: 2.4650
Epoch 3/200
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 59ms/step - accuracy: 0.3000 - loss: 2.4479
Epoch 4/200
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 59ms/step - accuracy: 0.4500 - loss: 2.4304
Epoch 5/200
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 56ms/step - accuracy: 0.4500 - loss: 2.4123
Epoch 6/200
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 61ms/step - accuracy: 0.4500 - loss: 2.3934
Epoch 7/200
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 58ms/step - accuracy: 0.4500 - loss: 2.3734
Epoch 8/200
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step - accuracy: 0.4500 - loss: 2.3524
Epoch 9/200
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m

<keras.src.callbacks.history.History at 0x7b4da1370190>

In [None]:
#prediction function
def predict_next_word(model,tokenizer,text,max_seq_len):
  """Predict the word most likely to follow `text`.

  Args:
    model: Trained model whose `predict` returns one row of per-index scores
      for a batch holding a single padded sequence.
    tokenizer: Fitted Keras `Tokenizer`; encodes `text` and maps the predicted
      index back to a word through its `index_word` lookup.
    text: Seed text to continue.
    max_seq_len: Length of the padded training sequences including the label
      position; the model input is padded to `max_seq_len-1` tokens.

  Returns:
    The predicted word, or "" when `text` contains no in-vocabulary word or
    the predicted index has no word attached (e.g. the padding index 0).
  """
  tokens=tokenizer.texts_to_sequences([text])[0]
  if not tokens:
    # Only padding would reach the model, so any prediction would be noise.
    return ""
  padded=pad_sequences([tokens], maxlen=max_seq_len-1,padding='pre')
  predicted=model.predict(padded)
  # The batch holds exactly one sequence: take the argmax over that row
  # instead of over the flattened 2-D output.
  next_index=int(np.argmax(predicted[0]))
  # index_word is the tokenizer's own index -> word table; no need to scan word_index.
  return tokenizer.index_word.get(next_index,"")

In [None]:
# Show the word -> index mapping the tokenizer learned from the corpus.
tokenizer.word_index.items()

dict_items([('i', 1), ('to', 2), ('eat', 3), ('like', 4), ('love', 5), ('pizza', 6), ('pasta', 7), ('biriyani', 8), ('burger', 9), ('play', 10), ('cricket', 11)])

In [None]:
# Smoke-test the trained model on a prefix that appears in the corpus.
test_input = "I like to eat"
predicted_word = predict_next_word(
    model=model,
    tokenizer=tokenizer,
    text=test_input,
    max_seq_len=max_seq_len,
)
print(f"{test_input} -> {predicted_word}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
I like to eat -> pizza
