In [41]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding,LSTM,Dropout,Dense
from tensorflow.keras.callbacks import EarlyStopping,ModelCheckpoint

In [42]:
def prepare_data(text_corpus,sequence_length):
    """Turn a newline-separated corpus into next-word training arrays.

    One ``Tokenizer`` is fitted on the whole corpus. Each non-blank line is
    converted to token ids and expanded into n-grams: for every position
    ``i >= 1`` the n-gram holds up to ``sequence_length`` preceding tokens
    followed by token ``i``. The n-grams are left-padded to
    ``sequence_length + 1`` columns; the last column is one-hot encoded as the
    target and the remaining columns are the predictors.

    Args:
        text_corpus: Training text as a single string; lines are split on
            newline characters.
        sequence_length: Maximum number of context tokens per training sample.

    Returns:
        ``(predictors, target, mytokenizer, max_sequence_len, total_words)``
        where ``predictors`` is the padded context matrix, ``target`` the
        one-hot next-word matrix, ``mytokenizer`` the fitted tokenizer,
        ``max_sequence_len`` equals ``sequence_length + 1`` and ``total_words``
        is ``len(word_index) + 1``.
    """
    tokenizer = Tokenizer()
    # fit_on_texts expects a list of texts, so the corpus is wrapped in a list.
    tokenizer.fit_on_texts([text_corpus])

    # One extra slot on top of the vocabulary: id 0 is left for the padding value.
    vocab_size = len(tokenizer.word_index) + 1
    window = sequence_length + 1

    ngrams = []
    for raw_line in text_corpus.split('\n'):
        if not raw_line.strip():
            continue  # blank lines contribute no samples
        ids = tokenizer.texts_to_sequences([raw_line])[0]
        # A line with fewer than two tokens yields an empty range, hence no samples.
        ngrams.extend(
            ids[max(0, stop - sequence_length):stop + 1]
            for stop in range(1, len(ids))
        )

    padded = pad_sequences(ngrams, maxlen=window, padding='pre')

    # Last column is the word to predict; everything before it is the context.
    features = padded[:, :-1]
    labels = tf.keras.utils.to_categorical(padded[:, -1], num_classes=vocab_size)

    print("Total sequences created", len(ngrams))
    print("Total_vocabulary size", vocab_size)
    return features, labels, tokenizer, window, vocab_size



In [43]:
def build_small_model(total_words,max_sequence_len,embedding_dim=32,lstm_units=32,dropout_rate=0.1,learning_rate=0.001):
    """Build and compile a small Embedding -> LSTM -> Dropout -> Dense classifier.

    The network reads ``max_sequence_len - 1`` token ids and outputs a softmax
    distribution over ``total_words`` classes, so it can be trained against the
    one-hot next-word targets with categorical cross-entropy.

    Args:
        total_words: Vocabulary size; used as the embedding input dimension
            and as the width of the softmax output layer.
        max_sequence_len: Length of a full training n-gram (context + target);
            the model input is one token shorter.
        embedding_dim: Size of each learned word vector.
        lstm_units: Number of units in the LSTM layer.
        dropout_rate: Fraction of LSTM outputs dropped during training.
        learning_rate: Learning rate passed to the Adam optimizer.

    Returns:
        The compiled ``Sequential`` model. Its summary is printed as a side
        effect.
    """
    model = Sequential()

    # Declare the input shape with an explicit Input instead of Embedding's
    # ``input_length`` argument: Keras 3 deprecates and ignores that argument,
    # which would leave the model unbuilt when summary() is printed below.
    model.add(tf.keras.Input(shape=(max_sequence_len - 1,)))

    # The embedding is a trainable lookup table mapping each token id to a
    # dense vector of size ``embedding_dim``.
    model.add(Embedding(total_words, embedding_dim))
    model.add(LSTM(lstm_units))
    model.add(Dropout(dropout_rate))

    # One probability per vocabulary entry; compared against the one-hot target.
    model.add(Dense(total_words, activation='softmax'))

    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    model.summary()

    return model

In [44]:
def train_small_model(model,predictors,target,epochs=50,batch_size=8,patience=10):
    """Fit ``model`` on the prepared samples with early stopping on training loss.

    Args:
        model: A compiled model exposing a Keras-style ``fit`` method.
        predictors: Input samples; must support ``len()``.
        target: Training targets aligned with ``predictors``.
        epochs: Maximum number of passes over the data.
        batch_size: Upper bound on the number of samples per gradient update;
            capped at the number of available samples.
        patience: Number of epochs without improvement in training loss
            before early stopping ends training.

    Returns:
        ``(history, model)``: the object returned by ``model.fit`` and the
        same model instance that was passed in.

    Raises:
        ValueError: If ``predictors`` is empty or ``batch_size`` is below 1.
    """
    sample_count = len(predictors)
    if sample_count == 0:
        # Without this guard the batch size would become 0 and fit() would fail obscurely.
        raise ValueError("predictors is empty; there is nothing to train on")
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # Never ask for a batch larger than the (tiny) data set.
    effective_batch_size = min(batch_size, sample_count)

    # No validation split is used, so early stopping watches the training loss
    # and rolls back to the best weights seen.
    callbacks = [
        EarlyStopping(
            monitor='loss',
            patience=patience,
            restore_best_weights=True,
            verbose=1,
        )
    ]
    history = model.fit(
        x=predictors,
        y=target,
        epochs=epochs,
        batch_size=effective_batch_size,
        callbacks=callbacks,
        verbose=1,
    )
    return history, model

In [45]:
# Prediction
def predict_next_word(trained_model,mytokenizer,seed_text,max_sequence_len,top_k=None,verbose=False):
    """Rank the most probable next words for ``seed_text``.

    The seed is tokenized, trimmed to its last ``max_sequence_len - 1`` tokens,
    left-padded to that length and passed to ``trained_model.predict``. The
    highest-scoring vocabulary indices are then mapped back to words.

    Args:
        trained_model: Model exposing ``predict``; its output row is indexed
            by token id.
        mytokenizer: Fitted tokenizer providing ``texts_to_sequences`` and
            ``word_index``.
        seed_text: Text whose continuation should be predicted.
        max_sequence_len: Full n-gram length used in training (context + 1).
        top_k: Number of top-scoring indices to consider. ``None`` keeps the
            historical behaviour of using ``max_sequence_len - 1``.
        verbose: When true, print the raw probability vector, the selected
            indices and the index-to-word map for debugging.

    Returns:
        ``(predicted_words, predicted_prob)``: parallel lists ordered from most
        to least probable. Indices with no vocabulary entry (such as the
        padding id 0) are skipped, so the lists may be shorter than ``top_k``.

    Raises:
        ValueError: If ``top_k`` is given and is smaller than 1.
    """
    context_len = max_sequence_len - 1
    if top_k is None:
        # Legacy default: as many candidates as there are context positions.
        top_k = context_len
    elif top_k < 1:
        raise ValueError("top_k must be a positive integer")

    token_list = mytokenizer.texts_to_sequences([seed_text])[0]
    if len(token_list) > context_len:
        # Only the most recent tokens fit in the model's input window.
        token_list = token_list[-context_len:]

    padded_sequence = pad_sequences([token_list], maxlen=context_len, padding='pre')
    probabilities = trained_model.predict(padded_sequence)[0]

    # argsort is ascending: take the tail and reverse it for best-first order.
    top_indices = probabilities.argsort()[-top_k:][::-1]
    word_map = {v: k for k, v in mytokenizer.word_index.items()}

    if verbose:
        print(probabilities)
        print(top_indices)
        print(word_map)

    predicted_words = []
    predicted_prob = []
    for idx in top_indices:
        if idx in word_map:
            predicted_words.append(word_map[idx])
            predicted_prob.append(probabilities[idx])
    return predicted_words, predicted_prob





In [46]:
text='''Hi my name is Dona
I love my country.I'm proud of it'''

# Seed Python, NumPy and TensorFlow so that weight initialisation, dropout and
# batch shuffling repeat across "Restart & Run All" runs; without it the
# training log and the predictions below differ on every execution.
SEED=42
tf.keras.utils.set_random_seed(SEED)

# Preprocessing: build padded context windows and one-hot next-word targets.
predictors,target,mytokenizer,max_sequence_len,total_words=prepare_data(text, sequence_length=3)
print(predictors,target,mytokenizer,max_sequence_len,total_words)
print(f'len:{len(predictors)}')

# Model creation
model=build_small_model(total_words,max_sequence_len)

# Model fitting
history,trained_model=train_small_model(model,predictors,target,epochs=50)

# Model prediction: ask for the most likely continuations of a short seed.

seed_text="my name is"
next_words,predicted_probability=predict_next_word(trained_model,mytokenizer,seed_text,max_sequence_len)
print(f'seed_text is {seed_text}')
print(f'next word prediction:{next_words}')
print(f'probability:{predicted_probability}')

Total sequences created 11
Total_vocabulary size 13
[[ 0  0  2]
 [ 0  2  1]
 [ 2  1  3]
 [ 1  3  4]
 [ 0  0  6]
 [ 0  6  7]
 [ 6  7  1]
 [ 7  1  8]
 [ 1  8  9]
 [ 8  9 10]
 [ 9 10 11]] [[0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]] <keras.src.legacy.preprocessing.text.Tokenizer object at 0x7e7a36065310> 4 13
len:11


Epoch 1/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 30ms/step - accuracy: 0.0000e+00 - loss: 2.5654
Epoch 2/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step - accuracy: 0.1629 - loss: 2.5600
Epoch 3/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step - accuracy: 0.1629 - loss: 2.5561
Epoch 4/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step - accuracy: 0.2652 - loss: 2.5523
Epoch 5/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step - accuracy: 0.1212 - loss: 2.5492    
Epoch 6/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step - accuracy: 0.3674 - loss: 2.5456
Epoch 7/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step - accuracy: 0.3674 - loss: 2.5378
Epoch 8/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step - accuracy: 0.3068 - loss: 2.5375
Epoch 9/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37