## **1. Download Dataset**

In [1]:
!wget --no-check-certificate https://storage.googleapis.com/protonx-cloud-storage/data.txt
data = open('data.txt').read()

--2025-04-13 01:07:36--  https://storage.googleapis.com/protonx-cloud-storage/data.txt
Resolving storage.googleapis.com (storage.googleapis.com)... 74.125.20.207, 108.177.98.207, 74.125.197.207, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|74.125.20.207|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 93578 (91K) [text/plain]
Saving to: ‘data.txt’


2025-04-13 01:07:37 (91.9 MB/s) - ‘data.txt’ saved [93578/93578]



## **2. Import Libraries**

In [2]:
import numpy as np
import tensorflow.keras.utils as ku

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense
from tensorflow.keras.optimizers import Adam

## **3. Data Preprocessing**

In [3]:
# Lower-case the raw text, then cut it into one string per line
corpus = data.lower().split(sep="\n")

In [4]:
# Peek at the first ten lines of the lower-cased corpus
corpus[:10]

['from fairest creatures we desire increase,',
 "that thereby beauty's rose might never die,",
 'but as the riper should by time decease,',
 'his tender heir might bear his memory:',
 'but thou, contracted to thine own bright eyes,',
 "feed'st thy light'st flame with self-substantial fuel,",
 'making a famine where abundance lies,',
 'thyself thy foe, to thy sweet self too cruel.',
 "thou that art now the world's fresh ornament",
 'and only herald to the gaudy spring,']

### **3.1. Build Vocabulary**

In [5]:
# Build the word -> integer index vocabulary from every line of the corpus
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)

In [6]:
# Inspect the learned word -> index mapping (indices start at 1)
tokenizer.word_index

{'and': 1,
 'the': 2,
 'to': 3,
 'of': 4,
 'my': 5,
 'i': 6,
 'in': 7,
 'that': 8,
 'thy': 9,
 'thou': 10,
 'with': 11,
 'for': 12,
 'is': 13,
 'love': 14,
 'not': 15,
 'but': 16,
 'a': 17,
 'me': 18,
 'thee': 19,
 'so': 20,
 'be': 21,
 'as': 22,
 'all': 23,
 'you': 24,
 'his': 25,
 'which': 26,
 'when': 27,
 'it': 28,
 'this': 29,
 'by': 30,
 'your': 31,
 'doth': 32,
 'do': 33,
 'from': 34,
 'on': 35,
 'or': 36,
 'no': 37,
 'then': 38,
 'have': 39,
 'what': 40,
 'are': 41,
 'if': 42,
 'more': 43,
 'mine': 44,
 'their': 45,
 'shall': 46,
 'sweet': 47,
 'time': 48,
 'will': 49,
 'they': 50,
 'beauty': 51,
 'nor': 52,
 'eyes': 53,
 'art': 54,
 'her': 55,
 'heart': 56,
 'yet': 57,
 'o': 58,
 'than': 59,
 'can': 60,
 'should': 61,
 'thine': 62,
 'now': 63,
 'where': 64,
 'make': 65,
 'one': 66,
 'hath': 67,
 'he': 68,
 'fair': 69,
 'still': 70,
 'how': 71,
 'eye': 72,
 'him': 73,
 'like': 74,
 'true': 75,
 'see': 76,
 'am': 77,
 'she': 78,
 'those': 79,
 'though': 80,
 'being': 81,
 'some'

In [7]:
# Vocabulary size used by the model: word_index starts at 1, so add 1 to account
# for index 0, which is reserved for padding and is absent from word_index
total_words = len(tokenizer.word_index) + 1

In [8]:
# Vocabulary size including the padding index
total_words

3211

### **3.2. Generate N-gram Sequences**

Split each line into token prefixes of increasing length (the first 2 tokens, the first 3 tokens, and so on up to the full line); each prefix becomes one training sample

In [9]:
# Encode every corpus line as a list of token indices in a single call
encoded_lines = tokenizer.texts_to_sequences(corpus)

# For each encoded line emit all of its prefixes that hold at least 2 tokens:
# the first 2 tokens, the first 3, ... up to the complete line.
# Lines with fewer than 2 tokens contribute nothing.
input_sequences = [
    token_ids[:prefix_len]
    for token_ids in encoded_lines
    for prefix_len in range(2, len(token_ids) + 1)
]

In [10]:
# First ten prefix samples, still as lists of token indices
input_sequences[:10]

[[34, 417],
 [34, 417, 877],
 [34, 417, 877, 166],
 [34, 417, 877, 166, 213],
 [34, 417, 877, 166, 213, 517],
 [8, 878],
 [8, 878, 134],
 [8, 878, 134, 351],
 [8, 878, 134, 351, 102],
 [8, 878, 134, 351, 102, 156]]

In [11]:
# Sanity check: decode the first two-token sample back into words
tokenizer.sequences_to_texts([[34, 417]])

['from fairest']

In [12]:
# Sanity check: decode the three-token sample that extends the previous one
tokenizer.sequences_to_texts([[34, 417, 877]])

['from fairest creatures']

Display training samples as text

In [13]:
# Decode the first ten samples in one call (one string per sample) and print each
for sample_text in tokenizer.sequences_to_texts(input_sequences[:10]):
  print(sample_text)

from fairest
from fairest creatures
from fairest creatures we
from fairest creatures we desire
from fairest creatures we desire increase
that thereby
that thereby beauty's
that thereby beauty's rose
that thereby beauty's rose might
that thereby beauty's rose might never


### **3.3. Split Features and Labels**

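To generate text from left to right, every training sample is zero-padded at the **beginning** (`padding='pre'`) up to a common length. Padding on the left keeps the word to be predicted in the last position of every row, so it can be split off as the label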

In [14]:
# Length of the longest sample; all other samples are padded up to it
max_sequence_len = max(map(len, input_sequences))

# Left-pad every sample with zeros so all rows share the same length,
# then store the result as a NumPy array
padded_sequences = pad_sequences(
    input_sequences,
    maxlen=max_sequence_len,
    padding='pre',
)
input_sequences = np.array(padded_sequences)

In [15]:
# Common row length after padding (the last column will become the label)
max_sequence_len

11

In [16]:
# First ten samples after left-padding with zeros
input_sequences[:10]

array([[  0,   0,   0,   0,   0,   0,   0,   0,   0,  34, 417],
       [  0,   0,   0,   0,   0,   0,   0,   0,  34, 417, 877],
       [  0,   0,   0,   0,   0,   0,   0,  34, 417, 877, 166],
       [  0,   0,   0,   0,   0,   0,  34, 417, 877, 166, 213],
       [  0,   0,   0,   0,   0,  34, 417, 877, 166, 213, 517],
       [  0,   0,   0,   0,   0,   0,   0,   0,   0,   8, 878],
       [  0,   0,   0,   0,   0,   0,   0,   0,   8, 878, 134],
       [  0,   0,   0,   0,   0,   0,   0,   8, 878, 134, 351],
       [  0,   0,   0,   0,   0,   0,   8, 878, 134, 351, 102],
       [  0,   0,   0,   0,   0,   8, 878, 134, 351, 102, 156]],
      dtype=int32)

Split the training samples into features and labels

In [17]:
# Features: every column except the last one (the context words)
predictors = input_sequences[:, :-1]
# Label: the last column (the word that follows the context)
label = input_sequences[:, -1]

In [18]:
# First ten feature rows (padded context, one column shorter than the samples)
predictors[:10]

array([[  0,   0,   0,   0,   0,   0,   0,   0,   0,  34],
       [  0,   0,   0,   0,   0,   0,   0,   0,  34, 417],
       [  0,   0,   0,   0,   0,   0,   0,  34, 417, 877],
       [  0,   0,   0,   0,   0,   0,  34, 417, 877, 166],
       [  0,   0,   0,   0,   0,  34, 417, 877, 166, 213],
       [  0,   0,   0,   0,   0,   0,   0,   0,   0,   8],
       [  0,   0,   0,   0,   0,   0,   0,   0,   8, 878],
       [  0,   0,   0,   0,   0,   0,   0,   8, 878, 134],
       [  0,   0,   0,   0,   0,   0,   8, 878, 134, 351],
       [  0,   0,   0,   0,   0,   8, 878, 134, 351, 102]], dtype=int32)

In [19]:
# First ten labels: the token index of the word following each context
label[:10]

array([417, 877, 166, 213, 517, 878, 134, 351, 102, 156], dtype=int32)

In [20]:
# Show the first ten (context ---> next word) pairs as text
for i in range(10):
  # Decode the padded context row back into words
  context_text = " ".join(tokenizer.sequences_to_texts([predictors[i]]))
  # Decode the single label index; it is wrapped in a list to form a one-token sequence
  target_text = " ".join(tokenizer.sequences_to_texts([[label[i]]]))
  print("{} ---> {}".format(context_text, target_text))

from ---> fairest
from fairest ---> creatures
from fairest creatures ---> we
from fairest creatures we ---> desire
from fairest creatures we desire ---> increase
that ---> thereby
that thereby ---> beauty's
that thereby beauty's ---> rose
that thereby beauty's rose ---> might
that thereby beauty's rose might ---> never


Convert each label into a one-hot vector based on the vocabulary size

In [21]:
# One-hot encode the next-word ids over the whole vocabulary.
# The ids are read from the last column of input_sequences (the same data `label`
# was split from) rather than from `label` itself, which this line overwrites.
# That keeps the cell idempotent: re-running it no longer tries to one-hot encode
# an array that is already one-hot.
label = ku.to_categorical(input_sequences[:, -1], num_classes=total_words).astype("float32")

In [22]:
# Inspect the one-hot label matrix (one row per sample, one column per vocabulary index)
label

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [23]:
# Sanity check: a one-hot row must sum to exactly 1
sum(label[0])

np.float32(1.0)

## **4. Model Training**

In [24]:
# Seed the Python, NumPy and backend random generators before any weights are
# created, so weight initialisation and the subsequent training are repeatable
# under Restart & Run All (on the same software/hardware stack).
SEED = 42
ku.set_random_seed(SEED)

# Next-word classifier:
#   token ids -> 100-dimensional embeddings -> 128-unit SimpleRNN
#   -> softmax over all total_words vocabulary indices
model = Sequential()
model.add(Embedding(total_words, 100))
model.add(SimpleRNN(128))
model.add(Dense(total_words, activation='softmax'))
# categorical_crossentropy matches the one-hot encoded labels
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [25]:
# Train on the full dataset for 130 epochs; no validation data is passed, so the
# accuracy reported per epoch is training accuracy only
history = model.fit(predictors, label, epochs=130, verbose=1)

Epoch 1/130
[1m484/484[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 24ms/step - accuracy: 0.0206 - loss: 7.0694
Epoch 2/130
[1m484/484[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 17ms/step - accuracy: 0.0358 - loss: 6.3432
Epoch 3/130
[1m484/484[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 17ms/step - accuracy: 0.0575 - loss: 5.9540
Epoch 4/130
[1m484/484[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 14ms/step - accuracy: 0.0714 - loss: 5.5557
Epoch 5/130
[1m484/484[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 16ms/step - accuracy: 0.0900 - loss: 5.1119
Epoch 6/130
[1m484/484[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 17ms/step - accuracy: 0.1267 - loss: 4.7142
Epoch 7/130
[1m484/484[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 14ms/step - accuracy: 0.1748 - loss: 4.3150
Epoch 8/130
[1m484/484[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 14ms/step - accuracy: 0.2428 - loss: 3.9486
Epoch 9/130
[1m484/4

In [26]:
# Print the layer-by-layer summary of the trained model
model.summary()

## **5. Text Generation**

Seed text

In [27]:
# Seed phrase that the generation loop below extends one word at a time
test_seq = 'despite of wrinkles'

Starting from the seed text, iteratively predict and append the next word until the specified length is reached

In [28]:
# Number of words to append to the seed text
next_words = 10

# Work on a copy so the seed in `test_seq` stays untouched. Appending to `test_seq`
# directly made the cell non-idempotent: every re-run continued from the previous
# output instead of starting again from the seed.
generated_text = test_seq

for _ in range(next_words):
  # Convert the text generated so far into a list of token indices
  token_list = tokenizer.texts_to_sequences([generated_text])[0]

  # Pad on the left to the model's input length: max_sequence_len - 1, because the
  # last column of every training row was split off as the label
  token_list = pad_sequences([token_list], maxlen=max_sequence_len - 1, padding='pre')

  # Predict the probability distribution of the next word over the vocabulary
  predicted = model.predict(token_list, verbose=0)

  # Greedy decoding: pick the most probable index (plain int for the dict lookup)
  predicted_id = int(np.argmax(predicted[0]))

  # Stop when the prediction has no word attached (index 0 is padding and is not in
  # index_word) or when an explicit end marker is produced.
  # NOTE(review): no '<end>' token is added anywhere in the visible pipeline, so the
  # second condition presumably never fires — confirm before relying on it.
  output_word = tokenizer.index_word.get(predicted_id)
  if output_word is None or output_word == '<end>':
    break

  generated_text += " " + output_word
  print(generated_text)

despite of wrinkles this
despite of wrinkles this thy
despite of wrinkles this thy golden
despite of wrinkles this thy golden time
despite of wrinkles this thy golden time my
despite of wrinkles this thy golden time my deeds
despite of wrinkles this thy golden time my deeds to
despite of wrinkles this thy golden time my deeds to shame
despite of wrinkles this thy golden time my deeds to shame deny
despite of wrinkles this thy golden time my deeds to shame deny silent
