In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Load the quotes CSV into a DataFrame and preview its first rows.
# NOTE(review): the path is an absolute Colab location; adjust it when running elsewhere.
df=pd.read_csv("/content/qoute_dataset.csv")
df.head()

Unnamed: 0,quote,Author
0,“The world as we have created it is a process ...,Albert Einstein
1,"“It is our choices, Harry, that show what we t...",J.K. Rowling
2,“There are only two ways to live your life. On...,Albert Einstein
3,"“The person, be it gentleman or lady, who has ...",Jane Austen
4,"“Imperfection is beauty, madness is genius and...",Marilyn Monroe


In [None]:
# Show the complete text of the first quote (head() abbreviates long strings with "...").
df['quote'][0]

'“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”'

In [None]:
# (rows, columns) of the loaded DataFrame.
df.shape

(3038, 2)

In [None]:
# Select the quote column as a Series; the text preprocessing steps operate on it.
quotes=df['quote']
quotes.head()

Unnamed: 0,quote
0,“The world as we have created it is a process ...
1,"“It is our choices, Harry, that show what we t..."
2,“There are only two ways to live your life. On...
3,"“The person, be it gentleman or lady, who has ..."
4,"“Imperfection is beauty, madness is genius and..."


Converting to lowercase

In [None]:
import string  # supplies string.punctuation for the punctuation-removal step
# Lowercase every quote so that words differing only in case share one vocabulary entry.
quotes=quotes.str.lower()

In [None]:
# Preview the lowercased quotes.
quotes.head()

Unnamed: 0,quote
0,“the world as we have created it is a process ...
1,"“it is our choices, harry, that show what we t..."
2,“there are only two ways to live your life. on...
3,"“the person, be it gentleman or lady, who has ..."
4,"“imperfection is beauty, madness is genius and..."


Removing punctuation

In [None]:
# Delete (not replace) every punctuation character from each quote.
# string.punctuation only contains ASCII symbols, so the typographic quotes and
# apostrophes used in this dataset are added explicitly; otherwise tokens such as
# "“the" and "thinking”" remain in the vocabulary as words distinct from
# "the" and "thinking".
chars_to_delete = string.punctuation + "“”‘’"
translator = str.maketrans('', '', chars_to_delete)  # third argument = characters mapped to None (removed)
quotes = quotes.apply(lambda x: x.translate(translator))

import string: This line imports the string module, which provides a collection of string constants, including string.punctuation.

translator = str.maketrans(' ', ' ', string.punctuation):

str.maketrans() is a method used to create a translation table (a mapping of characters). It takes three arguments:
The first argument (' ') specifies characters to be replaced. In this case, a space is mapped to a space, meaning spaces remain unchanged.
The second argument (' ') specifies the replacement characters. Here, a space replaces a space.
The third argument (string.punctuation) specifies characters to be deleted. string.punctuation contains all common punctuation symbols (e.g., !, ", #, $, %, &, ', (, ), *, +, ,, -, ., /, :, ;, <, =, >, ?, @, [, \, ], ^, _, `, {, |, }, ~). By mapping these to None (implicitly, as the third argument in maketrans), these characters will be removed from the string during translation.
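So, this line creates a translation table that removes every character listed in string.punctuation while keeping spaces as they are. Note that string.punctuation contains only ASCII symbols, so typographic characters such as the curly quotes “ and ” are not covered by it and have to be added to the deletion set separately if they should be removed as well.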
quotes = quotes.apply(lambda x: x.translate(translator)):

This line applies a function to each element (each quote string) in the quotes pandas Series.
lambda x: x.translate(translator) is an anonymous function that takes a string x (representing an individual quote).
x.translate(translator) uses the translator table created in the previous step to process the string x. It removes all characters defined in string.punctuation from each quote.

In [None]:
# Preview the quotes after the punctuation-removal step.
quotes.head()

Unnamed: 0,quote
0,“the world as we have created it is a process ...
1,“it is our choices harry that show what we tru...
2,“there are only two ways to live your life one...
3,“the person be it gentleman or lady who has no...
4,“imperfection is beauty madness is genius and ...


Tokenisation

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer

In [None]:
vocab_size=10000 # vocabulary cap, set manually: on a very large corpus an uncapped vocabulary would need far more GPU memory and compute

tokenizer= Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(quotes)  # build the word -> integer id index from the cleaned quotes

In [None]:
word_index=tokenizer.word_index # dict mapping each word seen in the corpus to its integer id
print(len(word_index))  # number of distinct words found
list(word_index.items())[:10]  # first ten (word, id) pairs

8978


[('the', 1),
 ('you', 2),
 ('to', 3),
 ('and', 4),
 ('a', 5),
 ('i', 6),
 ('is', 7),
 ('of', 8),
 ('that', 9),
 ('it', 10)]

In [None]:
sequence=tokenizer.texts_to_sequences(quotes) # convert every quote into a list of integer word ids using the fitted tokenizer

In [None]:
# First cleaned quote, shown for comparison with its id sequence.
quotes[0]

'“the world as we have created it is a process of our thinking it cannot be changed without changing our thinking”'

In [None]:
sequence[0]
# Each integer is the id of the token at that position, e.g. 'world'=62.
# In the recorded output the first id is 713 rather than 1 (the id of 'the'),
# because the opening “ is still attached to the first word of the quote.

[713,
 62,
 29,
 19,
 16,
 946,
 10,
 7,
 5,
 1156,
 8,
 70,
 293,
 10,
 145,
 12,
 809,
 104,
 752,
 70,
 2461]

In [None]:
# Print the first three cleaned quotes in full.
for i in range(3):
  print(quotes[i])

“the world as we have created it is a process of our thinking it cannot be changed without changing our thinking”
“it is our choices harry that show what we truly are far more than our abilities”
“there are only two ways to live your life one is as though nothing is a miracle the other is as though everything is a miracle”


In [None]:
# Print the id sequences of the same first three quotes.
for i in range(3):
  print(sequence[i])

[713, 62, 29, 19, 16, 946, 10, 7, 5, 1156, 8, 70, 293, 10, 145, 12, 809, 104, 752, 70, 2461]
[947, 7, 70, 871, 373, 9, 433, 21, 19, 465, 14, 294, 52, 54, 70, 3676]
[1337, 14, 53, 201, 714, 3, 81, 15, 36, 37, 7, 29, 329, 93, 7, 5, 1157, 1, 101, 7, 29, 329, 126, 7, 5, 3677]


In [None]:
# Build next-word training pairs: every proper prefix of a tokenised quote is
# an input, and the token that directly follows that prefix is its target.
X, y = [], []

for token_ids in sequence:
  for cut in range(1, len(token_ids)):
    X.append(token_ids[:cut])  # all tokens before position `cut`
    y.append(token_ids[cut])   # the token the model should predict next

for seq in sequence:

This loop iterates through each complete numerical sequence (representing a quote) that you previously generated from your tokenizer. Each seq variable in this loop is one of your tokenized quotes, like [713, 62, 29, ..., 2461].
for i in range(1, len(seq)):

Inside the first loop, this nested loop iterates through the current seq starting from the second element (index 1) up to, but not including, the length of the sequence. This is crucial because it creates progressively longer input sequences and their immediate next word as the target.
input_seq = seq[:i]

In each iteration of the inner loop, input_seq is created as a slice of the current seq. It takes all elements from the beginning of seq up to the current index i (exclusive). So, if seq is [A, B, C, D]:
When i is 1, input_seq will be [A].
When i is 2, input_seq will be [A, B].
When i is 3, input_seq will be [A, B, C].
output_seq = seq[i]

output_seq is the single element at the current index i within the seq. This is the 'next word' that the model will try to predict given input_seq.
When i is 1, output_seq will be B.
When i is 2, output_seq will be C.
When i is 3, output_seq will be D.
X.append(input_seq)

The generated input_seq (e.g., [A], [A, B], [A, B, C]) is added to the X list.
y.append(output_seq)

The corresponding output_seq (e.g., B, C, D) is added to the y list.

In [None]:
# Preview only the first few input prefixes; X holds tens of thousands of
# lists, so displaying it whole floods the notebook output.
X[:5]

[[713],
 [713, 62],
 [713, 62, 29],
 [713, 62, 29, 19],
 [713, 62, 29, 19, 16],
 [713, 62, 29, 19, 16, 946],
 [713, 62, 29, 19, 16, 946, 10],
 [713, 62, 29, 19, 16, 946, 10, 7],
 [713, 62, 29, 19, 16, 946, 10, 7, 5],
 [713, 62, 29, 19, 16, 946, 10, 7, 5, 1156],
 [713, 62, 29, 19, 16, 946, 10, 7, 5, 1156, 8],
 [713, 62, 29, 19, 16, 946, 10, 7, 5, 1156, 8, 70],
 [713, 62, 29, 19, 16, 946, 10, 7, 5, 1156, 8, 70, 293],
 [713, 62, 29, 19, 16, 946, 10, 7, 5, 1156, 8, 70, 293, 10],
 [713, 62, 29, 19, 16, 946, 10, 7, 5, 1156, 8, 70, 293, 10, 145],
 [713, 62, 29, 19, 16, 946, 10, 7, 5, 1156, 8, 70, 293, 10, 145, 12],
 [713, 62, 29, 19, 16, 946, 10, 7, 5, 1156, 8, 70, 293, 10, 145, 12, 809],
 [713, 62, 29, 19, 16, 946, 10, 7, 5, 1156, 8, 70, 293, 10, 145, 12, 809, 104],
 [713,
  62,
  29,
  19,
  16,
  946,
  10,
  7,
  5,
  1156,
  8,
  70,
  293,
  10,
  145,
  12,
  809,
  104,
  752],
 [713,
  62,
  29,
  19,
  16,
  946,
  10,
  7,
  5,
  1156,
  8,
  70,
  293,
  10,
  145,
  12,
  809,
  

In [None]:
# Preview only the first targets instead of dumping the entire list.
y[:20]

[62,
 29,
 19,
 16,
 946,
 10,
 7,
 5,
 1156,
 8,
 70,
 293,
 10,
 145,
 12,
 809,
 104,
 752,
 70,
 2461,
 7,
 70,
 871,
 373,
 9,
 433,
 21,
 19,
 465,
 14,
 294,
 52,
 54,
 70,
 3676,
 14,
 53,
 201,
 714,
 3,
 81,
 15,
 36,
 37,
 7,
 29,
 329,
 93,
 7,
 5,
 1157,
 1,
 101,
 7,
 29,
 329,
 126,
 7,
 5,
 3677,
 116,
 12,
 10,
 2462,
 32,
 1043,
 30,
 82,
 13,
 601,
 11,
 5,
 74,
 1338,
 119,
 12,
 2463,
 3678,
 7,
 313,
 753,
 7,
 638,
 4,
 43,
 144,
 3,
 12,
 682,
 1339,
 54,
 682,
 3680,
 13,
 3,
 202,
 5,
 90,
 8,
 434,
 279,
 202,
 5,
 90,
 8,
 3682,
 7,
 144,
 3,
 12,
 1340,
 17,
 21,
 2,
 14,
 54,
 3,
 12,
 175,
 17,
 21,
 2,
 14,
 3683,
 16,
 13,
 1341,
 191,
 51,
 415,
 2464,
 714,
 9,
 363,
 3684,
 180,
 7,
 39,
 5,
 810,
 1342,
 2,
 46,
 50,
 59,
 322,
 10,
 7,
 168,
 43,
 11,
 639,
 3685,
 111,
 104,
 1045,
 7,
 39,
 2,
 50,
 3686,
 36,
 7,
 21,
 2,
 65,
 10,
 47,
 181,
 21,
 96,
 130,
 3,
 754,
 58,
 123,
 43,
 5,
 1939,
 174,
 18,
 1,
 74,
 208,
 7,
 2,
 94,
 3,
 466,
 5

In [None]:
# Number of training inputs (prefixes) generated.
len(X)

85271

In [None]:
# Number of targets; the loop appends one target per input, so this matches len(X).
len(y)

85271

Applying Padding

As we can see above, the input sequences produced by the loop all have different lengths. The model needs inputs of one fixed length, so we apply padding to make every sequence the same length.

In [None]:
max_len=max(len(x) for x in X)  # length of the longest input prefix; used as the padded width
print(max_len)

745


In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Left-pad every prefix with zeros up to max_len so all inputs share one length;
# 'pre' padding keeps the most recent words at the end of each row.
X_padded=pad_sequences(X, maxlen=max_len, padding='pre')

In [None]:
# Inspect the padded matrix (NumPy abbreviates the display of large arrays).
X_padded

array([[   0,    0,    0, ...,    0,    0,  713],
       [   0,    0,    0, ...,    0,  713,   62],
       [   0,    0,    0, ...,  713,   62,   29],
       ...,
       [   0,    0,    0, ...,    9,   19, 1125],
       [   0,    0,    0, ...,   19, 1125,    3],
       [   0,    0,    0, ..., 1125,    3,  169]], dtype=int32)

In [None]:
# Convert the list of integer targets to a NumPy array.
y=np.array(y)

In [None]:
# (number of samples, padded sequence length)
X_padded.shape

(85271, 745)

In [None]:
# One integer target per sample.
y.shape

(85271,)

One hot encoding on y

In [None]:
# from  tensorflow.keras.utils import to_categorical
# y_one_hot= to_categorical(y, num_classes= vocab_size) #vocab size is 10000
# We will use sparse_categorical_crossentropy, so no need for one-hot encoding of y

In [None]:
#y_one_hot.shape

Embeddings

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Flatten, Embedding, Input, LSTM, SimpleRNN

In [None]:
embeddding_dim=50  # size of each word-embedding vector (spelling kept: the model cells reference this exact name)
rnn_units=128 # number of hidden units in the recurrent layer

RNN Model

In [None]:
# Next-word model: token ids -> embeddings -> SimpleRNN -> softmax over the vocabulary.
rnn_model=Sequential()

# Declare the input length with an explicit Input layer. Embedding's
# `input_length` argument is deprecated in Keras 3 (it only emits a warning),
# whereas an Input layer fixes the sequence length and builds the model up front.
rnn_model.add(Input(shape=(max_len,)))

rnn_model.add(
    Embedding(input_dim=vocab_size, output_dim=embeddding_dim)
)

rnn_model.add(SimpleRNN(units=rnn_units, activation='tanh'))

rnn_model.add(Dense(units=vocab_size, activation='softmax')) # output layer: one probability per vocabulary id for the next word



In [None]:
# Targets are integer word ids, hence the sparse variant of categorical cross-entropy.
rnn_model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# Print the layer-by-layer summary of the RNN model.
rnn_model.summary()

LSTM Model

In [None]:
# Same layer stack as the RNN model, with an LSTM as the recurrent layer:
# token ids -> embeddings -> LSTM -> softmax over the vocabulary.
lstm_model=Sequential([
    Embedding(input_dim=vocab_size, output_dim=embeddding_dim),
    LSTM(units=rnn_units, activation='tanh'),
    Dense(units=vocab_size, activation='softmax'),  # one probability per vocabulary id
])

In [None]:
# Same loss, optimizer and metric as the RNN model; targets are integer word ids.
lstm_model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# Print the layer-by-layer summary of the LSTM model.
lstm_model.summary()

In [None]:
epochs=10  # number of passes over the training data for the RNN run
batch_size=128  # samples per gradient update, shared by both fit calls

In [None]:
# Train the RNN on the padded prefixes, holding out 10% of the samples for validation.
history_rnn=rnn_model.fit(X_padded, y, epochs=epochs, batch_size=batch_size, validation_split=0.1)

Epoch 1/10
[1m600/600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 68ms/step - accuracy: 0.0340 - loss: 7.0387 - val_accuracy: 0.0583 - val_loss: 6.5642
Epoch 2/10
[1m600/600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 65ms/step - accuracy: 0.0679 - loss: 6.1783 - val_accuracy: 0.0904 - val_loss: 6.3724
Epoch 3/10
[1m600/600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 65ms/step - accuracy: 0.0977 - loss: 5.7691 - val_accuracy: 0.0998 - val_loss: 6.3203
Epoch 4/10
[1m600/600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 64ms/step - accuracy: 0.1175 - loss: 5.4567 - val_accuracy: 0.1035 - val_loss: 6.2800
Epoch 5/10
[1m600/600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 64ms/step - accuracy: 0.1287 - loss: 5.2193 - val_accuracy: 0.1065 - val_loss: 6.3289
Epoch 6/10
[1m600/600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 64ms/step - accuracy: 0.1420 - loss: 4.9644 - val_accuracy: 0.1110 - val_loss: 6.3790
Epoch 7/10
[1m6

In [None]:
from tensorflow.keras.callbacks import EarlyStopping

# Define EarlyStopping callback.
# It only takes effect when it is passed to fit() through callbacks=[...].
early_stopping = EarlyStopping(
    monitor='val_loss',  # Monitor validation loss
    patience=3,          # Number of epochs with no improvement after which training will be stopped
    restore_best_weights=True # Restore model weights from the epoch with the best value of the monitored quantity
)

In [None]:
# Train the LSTM model
history_lstm = lstm_model.fit(
    X_padded, # Padded input sequences
    y, # Original integer target labels (next words)
    epochs=100, # Number of times to iterate over the entire dataset
    batch_size=batch_size, # Number of samples per gradient update
    validation_split=0.1, # Hold out 10% of the data for validation during training
)

Epoch 1/100
[1m600/600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 50ms/step - accuracy: 0.1188 - loss: 5.4731 - val_accuracy: 0.1057 - val_loss: 6.4649
Epoch 2/100
[1m600/600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 49ms/step - accuracy: 0.1296 - loss: 5.2874 - val_accuracy: 0.1087 - val_loss: 6.5117
Epoch 3/100
[1m600/600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 48ms/step - accuracy: 0.1376 - loss: 5.1363 - val_accuracy: 0.1101 - val_loss: 6.5453
Epoch 4/100
[1m600/600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 50ms/step - accuracy: 0.1424 - loss: 4.9912 - val_accuracy: 0.1116 - val_loss: 6.6061
Epoch 5/100
[1m600/600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 48ms/step - accuracy: 0.1487 - loss: 4.8607 - val_accuracy: 0.1130 - val_loss: 6.6754
Epoch 6/100
[1m600/600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 50ms/step - accuracy: 0.1549 - loss: 4.7174 - val_accuracy: 0.1134 - val_loss: 6.7198
Epoch 7/10