In [16]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
import tensorflow_text as text
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, BatchNormalization, Embedding , Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
import requests
import pickle
import re
import sentencepiece as spm
import os

In [17]:
# Fetch the Tiny Shakespeare corpus from the char-rnn repository.
url = 'https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt'
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
request = requests.get(url, timeout=30)
# Fail here on an HTTP error instead of later saving an error page as the corpus.
request.raise_for_status()

In [18]:
# Display the response object; its repr shows the HTTP status code of the download.
request

<Response [200]>

In [19]:
# Persist the downloaded corpus to disk as UTF-8 text.
with open('Shakespear.txt', mode='w', encoding='utf-8') as corpus_file:
    corpus_file.write(request.text)

In [20]:

# Load the saved corpus back into memory as a single string.
# NOTE: binding `text` here shadows the `tensorflow_text` module that the
# import cell aliased as `text`.
with open('Shakespear.txt', encoding='utf-8') as corpus_file:
    text = corpus_file.read()

In [21]:
# Report the corpus size in characters.
corpus_length = len(text)
print("length of text : " , corpus_length)

length of text :  1115394


In [22]:
# Optional corpus truncation. None keeps the whole text (same result as the
# former `text[:]`); set an integer to keep only the first MAX_CHARS characters.
# Slicing from the start is idempotent, so re-running this cell is safe.
MAX_CHARS = None
text = text[:MAX_CHARS]

In [23]:

# Character-level vocabulary: every distinct character in the corpus, sorted
# so that the id assignment is deterministic.
chars = sorted(set(text))
vocab_size = len(chars)

# Two lookup tables: id -> character and character -> id.
idx_to_char = dict(enumerate(chars))
char_to_idx = {symbol: index for index, symbol in idx_to_char.items()}

print(f"Vocabulary size: {vocab_size}")
print(f"Characters: {''.join(chars[:50])}...")

Vocabulary size: 65
Characters: 
 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijk...


In [24]:
# Map every character of the corpus to its integer id.
encoded_text = [char_to_idx[ch] for ch in text]

seq_length = 100
if len(encoded_text) <= seq_length:
    raise ValueError(
        f"Corpus has {len(encoded_text)} characters; need more than "
        f"seq_length={seq_length} to build any training sequence."
    )

# Build (input window, next character) pairs without a Python-level loop:
# window i is encoded_text[i:i + seq_length] and its target is
# encoded_text[i + seq_length]. The last window has no following character,
# so it is dropped with [:-1]. `.copy()` materialises independent, writable
# arrays, matching what np.array() on a list of lists used to produce.
encoded_array = np.array(encoded_text)
X = np.lib.stride_tricks.sliding_window_view(encoded_array, seq_length)[:-1].copy()
y = encoded_array[seq_length:].copy()
assert len(X) == len(y), "every input window must have exactly one target"

print(f"Total sequences: {len(X):,}")

# Positional 90/10 split: the first 90% of windows train, the rest validate.
# Windows advance one character at a time, so validation windows right at the
# boundary share characters with the last training windows.
split_idx = int(0.9 * len(X))
X_train, X_val = X[:split_idx], X[split_idx:]
y_train, y_val = y[:split_idx], y[split_idx:]

print(f"Training sequences: {len(X_train):,}")
print(f"Validation sequences: {len(X_val):,}")

Total sequences: 1,115,294
Training sequences: 1,003,764
Validation sequences: 111,530


In [25]:
# Next-character model: embed each of the seq_length input ids, run two
# stacked bidirectional LSTMs over the window, and predict a probability
# distribution over the vocabulary for the character that follows it.
#
# The input shape is declared with an explicit Input instead of the
# deprecated Embedding(input_length=...) argument; layers, sizes and
# parameter count are unchanged.
# NOTE(review): per the Keras LSTM documentation a non-zero recurrent_dropout
# rules out the cuDNN fast path — confirm the training speed is acceptable.
model = Sequential([
        tf.keras.Input(shape=(seq_length,)),
        Embedding(vocab_size, 256),
        Bidirectional(LSTM(512, return_sequences=True, dropout=0.2, recurrent_dropout=0.2)),
        Bidirectional(LSTM(512, dropout=0.2, recurrent_dropout=0.2)),
        Dense(vocab_size, activation='softmax')
])



In [26]:
# Push one all-zero batch of shape (1, seq_length) through the model so that
# its weights exist before compile()/summary() are called. The output is
# discarded.
dummy_input = np.zeros(shape=(1, seq_length))
_ = model(dummy_input)

In [27]:
# Adam optimizer; targets are integer character ids, hence the sparse
# categorical cross-entropy loss.
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)
model.compile(
    optimizer=optimizer,
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

In [28]:
# Layer-by-layer summary followed by the overall parameter count.
total_params = model.count_params()

model.summary()
print(f'\nTotal parameters: {total_params:,}')


Total parameters: 9,528,641


In [29]:
# All three callbacks watch the validation loss.

# Write the model to disk whenever val_loss reaches a new best.
checkpoint = ModelCheckpoint(
    filepath='best_shakespeare_model.keras',
    monitor='val_loss',
    verbose=1,
    save_best_only=True,
)

# Stop after 3 epochs without improvement and roll back to the best weights.
early_stop = EarlyStopping(
    monitor='val_loss',
    verbose=1,
    patience=3,
    restore_best_weights=True,
)

# Scale the learning rate by 0.7 after 2 stagnant epochs, never below 1e-7.
lr_reduce = ReduceLROnPlateau(
    monitor='val_loss',
    verbose=1,
    patience=2,
    factor=0.7,
    min_lr=1e-7,
)



In [None]:
# Train for up to `epochs` epochs; early stopping may end the run sooner.
epochs = 50
batch = 128

history = model.fit(
    X_train,
    y_train,
    batch_size=batch,
    epochs=epochs,
    validation_data=(X_val, y_val),
    callbacks=[checkpoint, early_stop, lr_reduce],
    verbose=1,
)

In [30]:
# Bundle the character <-> id lookup tables and the vocabulary size, then
# pickle them to tokenizer.pkl.
tokenizer_data = dict(
    char_to_idx=char_to_idx,
    idx_to_char=idx_to_char,
    vocab_size=vocab_size,
)

with open('tokenizer.pkl', mode='wb') as tokenizer_file:
    pickle.dump(tokenizer_data, tokenizer_file)

In [None]:
# Training curves: loss on the left panel, accuracy on the right, each with
# its training and validation series taken from the fit() history.
fig, (loss_ax, acc_ax) = plt.subplots(1, 2, figsize=(12, 4))

# (axes, key in history.history, human-readable label) for each panel.
panels = [
    (loss_ax, 'loss', 'Loss'),
    (acc_ax, 'accuracy', 'Accuracy'),
]

for ax, metric, label in panels:
    ax.plot(history.history[metric], label=f'Training {label}')
    ax.plot(history.history[f'val_{metric}'], label=f'Validation {label}')
    ax.set_title(f'Model {label}')
    ax.set_xlabel('Epoch')
    ax.set_ylabel(label)
    ax.legend()

# Keep the two panels' labels from overlapping, and end with show() so the
# cell renders the figure rather than the repr of the last legend object.
fig.tight_layout()
plt.show()
