# Nietzsche Next
- description: Autocomplete app trained on Nietzsche's works. The goal is to propose completions for the next word based on the wider context and the characters typed so far. Uses an RNN for character-level (sequence-to-sequence) prediction. Updated for TPU usage, parallel data processing, and optimized training on Google Colab.



In [None]:
import tensorflow as tf

# Connect to the Colab TPU and build a distribution strategy for it.
# `strategy` is used later (`with strategy.scope():`), so it must be defined
# on every path, including when no TPU runtime is attached.
try:
    resolver = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(resolver)
    tf.tpu.experimental.initialize_tpu_system(resolver)
    strategy = tf.distribute.TPUStrategy(resolver)
    print("TPU successfully initialized.")
except ValueError:
    # No TPU available: fall back to the default (no-op) strategy so the
    # rest of the notebook still runs on CPU/GPU instead of raising NameError.
    strategy = tf.distribute.get_strategy()
    print("TPU not found. Check runtime settings.")
    print("Falling back to the default distribution strategy (CPU/GPU).")


TPU successfully initialized.


In [None]:
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding, Bidirectional, TimeDistributed, Dropout
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

import re
import unicodedata
import os
from multiprocessing import Pool, cpu_count

# Colab-only: mount Google Drive so that the data and model paths under
# /content/drive/MyDrive used by the following cells resolve.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pickle

# Directory on the mounted Drive that holds the preprocessed artifacts.
DATA_DIR = "/content/drive/MyDrive/autocomplete/data"

# load character mappings
# NOTE: pickle.load can execute arbitrary code embedded in the file; only load
# mapping files that were produced by your own preprocessing run.
with open(f"{DATA_DIR}/char_mappings.pkl", "rb") as f:
    mappings = pickle.load(f)

char_to_index = mappings['char_to_index']
index_to_char = mappings['index_to_char']
vocab_size = mappings['vocab_size']

# load cleaned training and testing data
# The encoding is given explicitly so the text is decoded the same way on every
# machine instead of depending on the platform default (assumes the files were
# written as UTF-8 -- confirm against the preprocessing notebook).
with open(f"{DATA_DIR}/nietzsche_train_clean.txt", "r", encoding="utf-8") as f:
    nietzsche_train_clean = f.read()

with open(f"{DATA_DIR}/nietzsche_test_clean.txt", "r", encoding="utf-8") as f:
    nietzsche_test_clean = f.read()

# Quick sanity check of what was loaded.
print(f"Vocabulary size: {vocab_size}")
print(f"Sample training data: {nietzsche_train_clean[:100]}")
print(f"Sample testing data: {nietzsche_test_clean[:100]}")


Vocabulary size: 60
Sample training data:  PREFACE   This book belongs to the most rare of men. Perhaps not one of them is yet alive. It is po
Sample testing data: on the church borrowed the fact from Paul.--The God that Paul invented for himself, a God who "reduc


In [None]:
import numpy as np

# load numpy arrays
train_sequences_np = np.load("/content/drive/MyDrive/autocomplete/data/train_sequences.npy")
train_labels_np = np.load("/content/drive/MyDrive/autocomplete/data/train_labels.npy")
test_sequences_np = np.load("/content/drive/MyDrive/autocomplete/data/test_sequences.npy")
test_labels_np = np.load("/content/drive/MyDrive/autocomplete/data/test_labels.npy")

# Keep the data as int32 NumPy arrays instead of converting everything to
# nested Python lists: lists of this size cost far more memory and are very
# slow to turn back into tensors. int32 is the dtype TensorFlow would infer
# from lists of Python ints, so the resulting dataset element types are the same.
# The arrays still support len(), indexing and iteration like the lists did.
train_sequences = train_sequences_np.astype(np.int32, copy=False)
train_labels = train_labels_np.astype(np.int32, copy=False)
test_sequences = test_sequences_np.astype(np.int32, copy=False)
test_labels = test_labels_np.astype(np.int32, copy=False)

# Only the first rows are converted to lists, purely for readable output.
print(f"First training sequence: {train_sequences[0].tolist()}")
print(f"First training label: {train_labels[0].tolist()}")
print(f"First testing sequence: {test_sequences[0].tolist()}")
print(f"First testing label: {test_labels[0].tolist()}")


First training sequence: [0, 23, 25, 12, 13, 8, 10, 12, 0, 0, 0, 27, 41, 42, 52, 0, 35, 48, 48, 44, 0, 35, 38, 45, 48, 47, 40, 52, 0, 53, 48, 0, 53, 41, 38, 0, 46, 48, 52, 53, 0, 51, 34, 51, 38, 0, 48, 39, 0, 46, 38, 47, 6, 0, 23, 38, 51, 41, 34, 49, 52, 0, 47, 48, 53, 0, 48, 47, 38, 0, 48, 39, 0, 53, 41, 38, 46, 0, 42, 52]
First training label: [23, 25, 12, 13, 8, 10, 12, 0, 0, 0, 27, 41, 42, 52, 0, 35, 48, 48, 44, 0, 35, 38, 45, 48, 47, 40, 52, 0, 53, 48, 0, 53, 41, 38, 0, 46, 48, 52, 53, 0, 51, 34, 51, 38, 0, 48, 39, 0, 46, 38, 47, 6, 0, 23, 38, 51, 41, 34, 49, 52, 0, 47, 48, 53, 0, 48, 47, 38, 0, 48, 39, 0, 53, 41, 38, 46, 0, 42, 52, 0]
First testing sequence: [48, 47, 0, 53, 41, 38, 0, 36, 41, 54, 51, 36, 41, 0, 35, 48, 51, 51, 48, 56, 38, 37, 0, 53, 41, 38, 0, 39, 34, 36, 53, 0, 39, 51, 48, 46, 0, 23, 34, 54, 45, 6, 5, 5, 27, 41, 38, 0, 14, 48, 37, 0, 53, 41, 34, 53, 0, 23, 34, 54, 45, 0, 42, 47, 55, 38, 47, 53, 38, 37, 0, 39, 48, 51, 0, 41, 42, 46, 52, 38]
First testing label: [

In [None]:
import tensorflow as tf
from tensorflow.keras.utils import to_categorical
import numpy as np

# Number of character indices per input window; the model's Embedding layer is
# built with this as its input length.
sequence_length = 80
# Not referenced in the visible cells; presumably the stride used when the
# sequences were generated -- verify against the preprocessing notebook.
step = 1

# Pair every input window with its label window, one dataset element per pair.
train_pairs = (train_sequences, train_labels)
test_pairs = (test_sequences, test_labels)
train_data = tf.data.Dataset.from_tensor_slices(train_pairs)
test_data = tf.data.Dataset.from_tensor_slices(test_pairs)

# define one-hot encoding function for TPU
def one_hot_encode(x, y):
    """Return the inputs unchanged and the labels one-hot encoded.

    Args:
        x: Input character indices; passed through as a tensor.
        y: Label character indices; expanded along a new last axis of size
            ``vocab_size`` (read from the notebook's global scope).

    Returns:
        Tuple ``(inputs, one_hot_labels)``.
    """
    inputs = tf.convert_to_tensor(x)
    # TPU-friendly one-hot encoding
    targets = tf.one_hot(y, depth=vocab_size)
    return inputs, targets


# batch, shuffle, and optimize dataset
batch_size = 2048  # TPUs handle larger batch sizes more effectively

# One-hot encoding is applied after batching so it runs once per batch rather
# than once per sample. num_parallel_calls lets tf.data encode several batches
# concurrently instead of one at a time, and prefetch overlaps input
# preparation with the training step.
# drop_remainder=True keeps every batch the same shape.
train_data = (train_data
              .shuffle(10000)
              .batch(batch_size, drop_remainder=True)
              .map(one_hot_encode, num_parallel_calls=tf.data.AUTOTUNE)
              .prefetch(tf.data.AUTOTUNE))
test_data = (test_data
             .batch(batch_size, drop_remainder=True)
             .map(one_hot_encode, num_parallel_calls=tf.data.AUTOTUNE)
             .prefetch(tf.data.AUTOTUNE))


In [None]:
# Hyperparameters
embedding_dim = 50   # size of the learned character embedding
dropout_rate = 0.1   # input dropout of each LSTM layer (0.5 was tried previously)
n_epochs = 250       # upper bound; EarlyStopping normally ends training sooner

# Everything that creates variables (layers, optimizer, compile) must happen
# inside the strategy scope so the variables are placed on the strategy's devices.
with strategy.scope():
    model = Sequential([
        Embedding(vocab_size, embedding_dim, input_length=sequence_length),
        LSTM(256, return_sequences=True, dropout=dropout_rate),
        LSTM(256, return_sequences=True, dropout=dropout_rate),
        LSTM(256, return_sequences=True, dropout=dropout_rate),
        # One softmax over the vocabulary per time step (sequence-to-sequence).
        TimeDistributed(Dense(vocab_size, activation='softmax'))
    ])

    optimizer = RMSprop(learning_rate=0.0001)
    model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])

# The learning-rate scheduler needs a shorter patience than early stopping.
# With both at 5, the rate was normally only halved on the same epoch that
# training stopped, so the reduced rate was never used.
callbacks = [
    EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True),
    ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=2, min_lr=1e-6)
]

# NOTE(review): the test split is used as validation data, so early stopping and
# best-weight restoration are tuned on it; keep a separate hold-out set if an
# unbiased final score is needed.
# fit() creates no new variables and therefore does not need the strategy scope.
history = model.fit(
    train_data,
    validation_data=test_data,
    epochs=n_epochs,
    callbacks=callbacks,
    verbose=1
)


Epoch 1/250
Epoch 2/250
Epoch 3/250
Epoch 4/250
Epoch 5/250
Epoch 6/250
Epoch 7/250
Epoch 8/250
Epoch 9/250
Epoch 10/250
Epoch 11/250
Epoch 12/250
Epoch 13/250
Epoch 14/250
Epoch 15/250
Epoch 16/250
Epoch 17/250
Epoch 18/250
Epoch 19/250
Epoch 20/250
Epoch 21/250
Epoch 22/250
Epoch 23/250
Epoch 24/250
Epoch 25/250
Epoch 26/250
Epoch 27/250
Epoch 28/250
Epoch 29/250
Epoch 30/250
Epoch 31/250
Epoch 32/250
Epoch 33/250
Epoch 34/250
Epoch 35/250
Epoch 36/250
Epoch 37/250
Epoch 38/250
Epoch 39/250
Epoch 40/250
Epoch 41/250
Epoch 42/250
Epoch 43/250
Epoch 44/250
Epoch 45/250
Epoch 46/250
Epoch 47/250
Epoch 48/250
Epoch 49/250
Epoch 50/250
Epoch 51/250
Epoch 52/250
Epoch 53/250
Epoch 54/250
Epoch 55/250
Epoch 56/250
Epoch 57/250
Epoch 58/250
Epoch 59/250
Epoch 60/250
Epoch 61/250
Epoch 62/250
Epoch 63/250
Epoch 64/250
Epoch 65/250
Epoch 66/250
Epoch 67/250
Epoch 68/250
Epoch 69/250
Epoch 70/250
Epoch 71/250
Epoch 72/250
Epoch 73/250
Epoch 74/250
Epoch 75/250
Epoch 76/250
Epoch 77/250
Epoch 78

In [None]:
# Persist the trained model twice: weights only (HDF5) and the full model in
# TensorFlow SavedModel format.
weights_path = '/content/drive/MyDrive/nietzsche_model_weights_trained_v04_1.h5'
saved_model_dir = '/content/drive/MyDrive/nietzsche_model_v04_1_tf'

model.save_weights(weights_path)
model.save(saved_model_dir, save_format='tf')


In [None]:
import numpy as np

def autocomplete_text(model, seed_text, char_to_index, index_to_char, sequence_length=sequence_length, num_predictions=100):
    """Greedily extend ``seed_text`` character by character up to the next space.

    The model is fed a fixed-width window of ``sequence_length`` characters.
    Seeds shorter than the window are left-padded with spaces; seeds longer than
    the window are cut down to their last ``sequence_length`` characters, since
    the model input cannot hold more. Padding and truncation only affect the
    model input: the returned string always starts with the original seed.

    Args:
        model: Object with ``predict(array, verbose=0)`` returning per-position
            character probabilities indexable as ``[0, -1]``.
        seed_text: Text typed so far.
        char_to_index: Mapping from character to integer index.
        index_to_char: Mapping from integer index to character.
        sequence_length: Width of the model's input window.
        num_predictions: Maximum number of characters to generate.

    Returns:
        ``seed_text`` followed by the generated characters. Generation stops
        after the first predicted space (which is included) or after
        ``num_predictions`` characters.

    Raises:
        KeyError: If a character inside the model window is missing from
            ``char_to_index``.
    """
    # Fixed-width context: keep the most recent characters, left-pad with spaces.
    context = seed_text[-sequence_length:].rjust(sequence_length)
    input_sequence = [char_to_index[char] for char in context]

    # generate characters one by one
    output_text = seed_text
    for _ in range(num_predictions):
        # reshape to match model input shape (1, sequence_length)
        input_array = np.reshape(input_sequence, (1, sequence_length))

        predicted_probs = model.predict(input_array, verbose=0)
        # Greedy decoding: take the most likely character at the last position.
        predicted_char_index = int(np.argmax(predicted_probs[0, -1]))
        predicted_char = index_to_char[predicted_char_index]
        output_text += predicted_char

        # Slide the window: append the prediction and drop the oldest character.
        input_sequence.append(predicted_char_index)
        input_sequence = input_sequence[1:]

        # A space marks the end of the suggested word.
        if predicted_char == ' ':
            break

    return output_text

# example: the seed ends with a space, so the model proposes the next whole word
seed_text = 'God is '
completion = autocomplete_text(
    model=model,
    seed_text=seed_text,
    char_to_index=char_to_index,
    index_to_char=index_to_char,
)
print("Autocomplete suggestion:", completion)


Autocomplete suggestion:                                                                          God is the 


In [None]:
# one character of the next word typed: the model completes that word
seed_text = 'God is d'
completion = autocomplete_text(
    model=model,
    seed_text=seed_text,
    char_to_index=char_to_index,
    index_to_char=index_to_char,
)
print("Autocomplete suggestion:", completion)


Autocomplete suggestion:                                                                         God is decadence 


In [None]:
# two characters of the next word typed
seed_text = 'God is de'
completion = autocomplete_text(
    model=model,
    seed_text=seed_text,
    char_to_index=char_to_index,
    index_to_char=index_to_char,
)
print("Autocomplete suggestion:", completion)


Autocomplete suggestion:                                                                        God is decadence 


In [None]:
# three characters of the next word typed
seed_text = 'God is dea'
completion = autocomplete_text(
    model=model,
    seed_text=seed_text,
    char_to_index=char_to_index,
    index_to_char=index_to_char,
)
print("Autocomplete suggestion:", completion)


Autocomplete suggestion:                                                                       God is death 


In [None]:
# last sentences of "human" (source of the seed texts in the cells below):
# In the same manner I have viewed the saints of India who occupy an intermediate station between the christian saints and the Greek philosophers
# and hence are not to be regarded as a pure type. Knowledge and science--as far as they existed--and superiority to the rest of mankind by logical
# discipline and training of the intellectual powers were insisted upon by the Buddhists as essential to sanctity, just as they were denounced by
# the christian world as the indications of sinfulness.

In [None]:
# longer context ending with a space: the model proposes the next whole word
seed_text = 'the christian world as the indications of '
completion = autocomplete_text(
    model=model,
    seed_text=seed_text,
    char_to_index=char_to_index,
    index_to_char=index_to_char,
)
print("Autocomplete suggestion:", completion)

Autocomplete suggestion:                                       the christian world as the indications of the 


In [None]:
# same context with the first character of the next word typed
seed_text = 'the christian world as the indications of s'
completion = autocomplete_text(
    model=model,
    seed_text=seed_text,
    char_to_index=char_to_index,
    index_to_char=index_to_char,
)
print("Autocomplete suggestion:", completion)

Autocomplete suggestion:                                      the christian world as the indications of such 


In [None]:
# same context with two characters of the next word typed
seed_text = 'the christian world as the indications of si'
completion = autocomplete_text(
    model=model,
    seed_text=seed_text,
    char_to_index=char_to_index,
    index_to_char=index_to_char,
)
print("Autocomplete suggestion:", completion)

Autocomplete suggestion:                                     the christian world as the indications of significance 


In [None]:
# same context with four characters of the next word typed
seed_text = 'the christian world as the indications of sinf'
completion = autocomplete_text(
    model=model,
    seed_text=seed_text,
    char_to_index=char_to_index,
    index_to_char=index_to_char,
)
print("Autocomplete suggestion:", completion)

Autocomplete suggestion:                                   the christian world as the indications of sinful 


In [None]:
# same context with seven characters of the next word typed
seed_text = 'the christian world as the indications of sinfuln'
completion = autocomplete_text(
    model=model,
    seed_text=seed_text,
    char_to_index=char_to_index,
    index_to_char=index_to_char,
)
print("Autocomplete suggestion:", completion)

Autocomplete suggestion:                                the christian world as the indications of sinfulness 
