In [1]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter

2025-03-01 18:10:36.012204: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-03-01 18:10:36.030948: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-03-01 18:10:36.044543: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-03-01 18:10:36.048029: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-03-01 18:10:36.059425: I tensorflow/core/platform/cpu_feature_guar

In [3]:
with open('anna.txt', 'r', encoding='utf-8') as file:
    text = file.read()

words = text.split()
word_counts = Counter(words)

vocab = ['<UNK>'] + list(word_counts.keys())
vocab_size = len(vocab)
word_to_int = {word: i for i, word in enumerate(vocab)}
int_to_word = {i: word for i, word in enumerate(vocab)}

# SEQUENCE_LENGTH = 64
SEQUENCE_LENGTH = 32
samples = [words[i:i+SEQUENCE_LENGTH+1] for i in range(len(words)-SEQUENCE_LENGTH)]

split_idx = int(0.8 * len(samples))
train_samples = samples[:split_idx]
val_samples = samples[split_idx:]

def prepare_datasets(samples):
    input_sequences = []
    target_sequences = []
    for sample in samples:
        input_seq = [word_to_int.get(word, 0) for word in sample[:-1]]
        target_seq = [word_to_int.get(word, 0) for word in sample[1:]]
        input_sequences.append(input_seq)
        target_sequences.append(target_seq)
    return tf.data.Dataset.from_tensor_slices((input_sequences, target_sequences))

BATCH_SIZE = 16
train_dataset = prepare_datasets(train_samples).shuffle(1000).batch(BATCH_SIZE, drop_remainder=True)
val_dataset = prepare_datasets(val_samples).batch(BATCH_SIZE, drop_remainder=True)

def load_pretrained_embeddings(embedding_dim=100):
    embeddings_index = {}
    with open('glove.6B.100d.txt', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs

    embedding_matrix = np.zeros((vocab_size, embedding_dim))
    unknown_words = []
    
    for i, word in enumerate(vocab):
        if i == 0:  # Leave <UNK> as zeros
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
        else:
            unknown_words.append(word)
    
    print(f"Number of unknown words: {len(unknown_words)}")
    return embedding_matrix

embedding_matrix = load_pretrained_embeddings()

class EnhancedTextGenerator(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, hidden_size):
        super().__init__()
        self.embedding = tf.keras.layers.Embedding(
            vocab_size,
            embedding_dim,
            embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
            # trainable=True
            trainable=False
        )
        self.lstm1 = tf.keras.layers.Bidirectional(
            tf.keras.layers.LSTM(hidden_size, return_sequences=True, dropout=0.2)
        )
        self.lstm2 = tf.keras.layers.Bidirectional(
            tf.keras.layers.LSTM(hidden_size, return_sequences=True, dropout=0.2)
        )
        self.dense = tf.keras.layers.TimeDistributed(
            tf.keras.layers.Dense(vocab_size, activation='softmax')
        )

    def call(self, inputs):
        x = self.embedding(inputs)
        x = self.lstm1(x)
        # x = self.lstm2(x)
        return self.dense(x)

embedding_dim = 100
hidden_size = 128  
learning_rate = 0.001
epochs = 4

model = EnhancedTextGenerator(vocab_size, embedding_dim, hidden_size)
optimizer = tf.keras.optimizers.Adam(
    learning_rate=learning_rate,
    clipnorm=1.0  
)
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=optimizer,
    metrics=['accuracy']
)


early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True
)


model_checkpoint = tf.keras.callbacks.ModelCheckpoint(
    'enhanced_text_generator.keras',  
    monitor='val_loss',
    save_best_only=True, 
    save_weights_only=False, 
    mode='min',  
    verbose=1
)

history = model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=epochs,
    callbacks=[early_stopping, model_checkpoint]
)

def generate_text(model, start_string, num_words=100, temperature=1.0):
    words = start_string.split()
    generated_words = []
    current_window = words[-SEQUENCE_LENGTH:] 
    
    for _ in range(num_words):
        input_seq = [word_to_int.get(word, 0) for word in current_window]
        if len(input_seq) < SEQUENCE_LENGTH:
            input_seq = [0] * (SEQUENCE_LENGTH - len(input_seq)) + input_seq
        elif len(input_seq) > SEQUENCE_LENGTH:
            input_seq = input_seq[-SEQUENCE_LENGTH:]
        
        input_tensor = tf.expand_dims(input_seq, 0)
        
        predictions = model(input_tensor)
        logits = predictions[0, -1, :]
        
        scaled_logits = logits / temperature
        probabilities = tf.nn.softmax(scaled_logits).numpy()
        predicted_id = np.random.choice(len(probabilities), p=probabilities)
        predicted_word = int_to_word[predicted_id]
        generated_words.append(predicted_word)
        current_window = current_window[1:] + [predicted_word]
    
    return ' '.join(words + generated_words)

I0000 00:00:1740845451.393091   56207 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
I0000 00:00:1740845451.427436   56207 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
I0000 00:00:1740845451.428475   56207 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
I0000 00:00:1740845451.430364   56207 cuda_executor.cc:1015] successful NUMA node read from SysFS ha

Number of unknown words: 19019
Epoch 1/4


2025-03-01 18:11:13.810860: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:531] Loaded cuDNN version 8907


[1m17644/17644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 71ms/step - accuracy: 0.6517 - loss: 2.3847
Epoch 1: val_loss improved from inf to 3.61218, saving model to enhanced_text_generator.keras
[1m17644/17644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1392s[0m 79ms/step - accuracy: 0.6517 - loss: 2.3846 - val_accuracy: 0.6073 - val_loss: 3.6122
Epoch 2/4
[1m17644/17644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 72ms/step - accuracy: 0.8685 - loss: 0.8159
Epoch 2: val_loss improved from 3.61218 to 3.46729, saving model to enhanced_text_generator.keras
[1m17644/17644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1398s[0m 79ms/step - accuracy: 0.8685 - loss: 0.8159 - val_accuracy: 0.6288 - val_loss: 3.4673
Epoch 3/4
[1m17644/17644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 72ms/step - accuracy: 0.8931 - loss: 0.5876
Epoch 3: val_loss improved from 3.46729 to 3.45921, saving model to enhanced_text_generator.keras
[1m17644/17644[0m [32m━━━━━

In [7]:
def predict_and_generate_text(model, start_string, num_words=100, temperature=1.0):
    
    words = start_string.split()
    generated_words = []
    current_window = words[-SEQUENCE_LENGTH:]  
    for _ in range(num_words):
        input_seq = [word_to_int.get(word, 0) for word in current_window]
        if len(input_seq) < SEQUENCE_LENGTH:
            input_seq = [0] * (SEQUENCE_LENGTH - len(input_seq)) + input_seq
        elif len(input_seq) > SEQUENCE_LENGTH:
            input_seq = input_seq[-SEQUENCE_LENGTH:]
        input_tensor = tf.expand_dims(input_seq, 0)

        predictions = model(input_tensor)
        logits = predictions[0, -1, :]  
        scaled_logits = logits / temperature
        probabilities = tf.nn.softmax(scaled_logits).numpy()

        predicted_id = np.random.choice(len(probabilities), p=probabilities)
        predicted_word = int_to_word[predicted_id]

        generated_words.append(predicted_word)
        current_window = current_window[1:] + [predicted_word]

    generated_text = ' '.join(words + generated_words)
    return generated_text

In [10]:
start_string = "she won't forgive me"
generated_text = predict_and_generate_text(
    model, 
    start_string, 
    num_words=50, 
    temperature=0.7
)

print(generated_text)

she won't forgive me supper?" Vronsky--I "_Quos cultivation, Worst delightful! bargain secondarily "might Lidia pushing weapon, loved, veteran. Venden refinement kisses. me? laughed, brows. sinned serious. ground usual sacrifice. savages. funny. doll height blaming, vexed "Almost." aristocrat. chilliness. met? freezing couple--a Lent plaything; Fanny, lot!" buttons, senses, avoid. expressions, swishing Called thoroughly, woman--the daydreams,
