# Nietzsche Next
- description: Autocomplete app trained on Nietzsche's works. The goal is to propose completions for the next word based on the wider context and the characters typed so far. Uses an RNN for character-based (sequence-to-sequence) prediction. Updated for TPU usage, parallel data processing, and optimized training on Google Colab.



In [None]:
import tensorflow as tf

try:
    # Locate the TPU attached to this runtime, connect to it, initialise it
    # and build a distribution strategy on top of it.
    resolver = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(resolver)
    tf.tpu.experimental.initialize_tpu_system(resolver)
    strategy = tf.distribute.TPUStrategy(resolver)
    print("TPU successfully initialized.")
except ValueError:
    # Without a fallback, `strategy` would stay undefined on runtimes that
    # have no TPU and any later `strategy.scope()` would raise NameError.
    # The default strategy keeps the rest of the notebook runnable on CPU/GPU.
    strategy = tf.distribute.get_strategy()
    print("TPU not found. Check runtime settings.")


TPU not found. Check runtime settings.


In [None]:
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding, Bidirectional, TimeDistributed, Dropout
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

import re
import unicodedata
import os
from multiprocessing import Pool, cpu_count

from google.colab import drive
# Mount Google Drive at /content/drive so the Drive paths used below resolve.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# 1. Load and preprocess data

folder_path = '/content/drive/MyDrive/nietzsche_texts'  # Update this path for your Colab Drive location

# Collect the per-file pieces in lists and join once at the end instead of
# repeatedly extending two ever-growing strings.
train_parts = []
test_parts = []

# os.listdir() returns entries in arbitrary order; sorting makes the order in
# which the works are concatenated (and therefore the saved data) reproducible.
for filename in sorted(os.listdir(folder_path)):
    # NOTE(review): 'nietzsche_zarathustra_clean.txt' is skipped on purpose by
    # the original code; presumably it is held out or already cleaned - confirm.
    if filename.endswith(".txt") and filename != 'nietzsche_zarathustra_clean.txt':
        file_path = os.path.join(folder_path, filename)

        # Explicit encoding so non-ASCII characters decode the same way on
        # every platform instead of depending on the locale default.
        with open(file_path, 'r', encoding='utf-8') as file:
            tmp_file = file.read() + "\n"

        # First two thirds of every work go to training, the rest to testing.
        twothirds = len(tmp_file) * 2 // 3
        train_parts.append(tmp_file[:twothirds])
        test_parts.append(tmp_file[twothirds:])

nietzsche_works_train = "".join(train_parts)
nietzsche_works_test = "".join(test_parts)

In [None]:
def simplify_text(text):
    """Reduce raw text to a small ASCII alphabet for character-level modelling.

    Steps, in order:
      1. Strip diacritics (NFD-decompose, then drop combining marks).
      2. Map typographic quotes and dashes, ';' and ':', ligatures and
         lowercase Greek letters to ASCII; drop the section sign.
      3. Turn every whitespace character into one space (a CRLF pair counts
         as a single whitespace).
      4. Drop every character outside ``A-Za-z .,'"!?-`` (this also removes
         digits).

    Args:
        text: The string to clean.

    Returns:
        The cleaned string. Consecutive spaces are NOT collapsed.
    """
    decomposed = unicodedata.normalize("NFD", text)
    text = ''.join(ch for ch in decomposed if not unicodedata.combining(ch))

    # One-pass character translation. No replacement output is itself a key,
    # so a single pass is equivalent to applying the replacements one by one.
    # Accented Greek letters need no entries: the accent was already stripped
    # above, leaving the bare letter.
    translation = str.maketrans({
        # punctuation, symbols and ligatures
        '’': "'", '‘': "'", '“': '"', '”': '"', '—': '-',
        '–': '-', ';': ',', ':': ',', '§': '', 'Æ': 'AE', 'æ': 'ae',
        'Œ': 'OE', 'œ': 'oe',
        # lowercase Greek -> Latin
        'α': 'a', 'β': 'b', 'γ': 'g', 'δ': 'd', 'ε': 'e',
        'ζ': 'z', 'η': 'h', 'θ': 'th', 'ι': 'i', 'κ': 'k',
        'λ': 'l', 'μ': 'm', 'ν': 'n', 'ξ': 'x', 'ο': 'o', 'π': 'p',
        'ρ': 'r', 'ς': 's', 'σ': 's', 'τ': 't', 'υ': 'y', 'φ': 'f',
        'χ': 'ch', 'ψ': 'ps', 'ω': 'o',
    })
    text = text.translate(translation)

    # Every kind of whitespace (newline, tab, non-breaking space, ...) becomes
    # a plain space; otherwise the filter below would delete it and glue the
    # neighbouring words together.
    text = re.sub(r'\r\n|\s', ' ', text)

    # Keep only the allowed alphabet; digits fall outside it and vanish here.
    return re.sub(r'[^A-Za-z .,\'"!?-]', '', text)

# Run the identical clean-up over the raw training and test text.
nietzsche_train_clean = simplify_text(text=nietzsche_works_train)
nietzsche_test_clean = simplify_text(text=nietzsche_works_test)


In [None]:
import tensorflow as tf
from tensorflow.keras.utils import to_categorical
import numpy as np

# create character mappings
# The vocabulary covers BOTH splits: a character that occurs only in the test
# text would otherwise be missing from char_to_index and raise KeyError when
# the test text is encoded. When the test text holds no extra characters the
# result is identical to building the vocabulary from the training text alone.
chars = sorted(set(nietzsche_train_clean) | set(nietzsche_test_clean))
char_to_index = {c: i for i, c in enumerate(chars)}
index_to_char = {i: c for i, c in enumerate(chars)}
vocab_size = len(chars)

# Window length (in characters) and stride between consecutive windows.
sequence_length = 80
step = 1

# convert text to sequences of indices
def text_to_sequences(text, sequence_length, step, mapping=None):
    """Cut ``text`` into overlapping index windows with next-character labels.

    Args:
        text: String to encode; every character must be a key of ``mapping``.
        sequence_length: Number of characters per window.
        step: Offset between the starts of consecutive windows.
        mapping: Optional character -> index dict. When omitted, the
            notebook-level ``char_to_index`` is used.

    Returns:
        ``(inputs, labels)``: two equally long lists of lists of ints.
        ``labels[k]`` is ``inputs[k]`` shifted one character ahead. Both are
        empty when ``text`` is not longer than ``sequence_length``.

    Raises:
        KeyError: If ``text`` contains a character absent from ``mapping``.
    """
    if mapping is None:
        # Fall back to the vocabulary built earlier in the notebook.
        mapping = char_to_index

    encoded = [mapping[c] for c in text]

    # Stop one window early so the shifted label window still fits in the text.
    starts = range(0, len(encoded) - sequence_length, step)
    inputs = [encoded[i: i + sequence_length] for i in starts]
    labels = [encoded[i + 1: i + sequence_length + 1] for i in starts]
    return inputs, labels

# Window both cleaned splits with the same length and stride.
train_sequences, train_labels = text_to_sequences(
    text=nietzsche_train_clean,
    sequence_length=sequence_length,
    step=step,
)
test_sequences, test_labels = text_to_sequences(
    text=nietzsche_test_clean,
    sequence_length=sequence_length,
    step=step,
)

#train_data = tf.data.Dataset.from_tensor_slices((train_sequences, train_labels))
#test_data = tf.data.Dataset.from_tensor_slices((test_sequences, test_labels))
#
## define one-hot encoding function for TPU
#def one_hot_encode(x, y):
#    x = tf.convert_to_tensor(x)
#    y = tf.one_hot(y, depth=vocab_size)  # TPU-friendly one-hot encoding
#    return x, y
#
#
## batch, shuffle, and optimize dataset
#batch_size = 2048  # TPUs handle larger batch sizes more effectively
#train_data = (train_data
#              .shuffle(10000)
#              .batch(batch_size, drop_remainder=True)
#              .map(one_hot_encode)
#              .prefetch(tf.data.AUTOTUNE))
#test_data = (test_data
#             .batch(batch_size, drop_remainder=True)
#             .map(one_hot_encode)
#             .prefetch(tf.data.AUTOTUNE))
#

In [None]:
import pickle

# All artifacts go into one Drive folder. Create it up front so the first
# open() does not fail with FileNotFoundError on a Drive that lacks it.
data_dir = "/content/drive/MyDrive/autocomplete/data"
os.makedirs(data_dir, exist_ok=True)

# Save character mappings
with open(os.path.join(data_dir, "char_mappings.pkl"), "wb") as f:
    pickle.dump({'char_to_index': char_to_index, 'index_to_char': index_to_char, 'vocab_size': vocab_size}, f)

# Save clean training and testing data
with open(os.path.join(data_dir, "nietzsche_train_clean.txt"), "w", encoding="utf-8") as f:
    f.write(nietzsche_train_clean)

with open(os.path.join(data_dir, "nietzsche_test_clean.txt"), "w", encoding="utf-8") as f:
    f.write(nietzsche_test_clean)

# Save processed train and test sequences
# Convert the Python lists of index windows to int32 arrays before writing.
train_sequences_np = np.array(train_sequences, dtype=np.int32)
train_labels_np = np.array(train_labels, dtype=np.int32)
test_sequences_np = np.array(test_sequences, dtype=np.int32)
test_labels_np = np.array(test_labels, dtype=np.int32)

np.save(os.path.join(data_dir, "train_sequences.npy"), train_sequences_np)
np.save(os.path.join(data_dir, "train_labels.npy"), train_labels_np)
np.save(os.path.join(data_dir, "test_sequences.npy"), test_sequences_np)
np.save(os.path.join(data_dir, "test_labels.npy"), test_labels_np)

# Note: the train and test tf.data Datasets themselves are not saved in this cell.
# train_data.element_spec, test_data.element_spec  # Confirm element specs

# The tf.data.Dataset objects are not persisted by this notebook; recreate them later from the saved sequences and labels.


In [None]:
# Sanity check: np.array(...) turned the list of training windows into a NumPy array.
type(train_sequences_np)

numpy.ndarray

In [None]:
# First 10 character indices of the first training input window.
train_sequences[0][0:10]

[0, 23, 25, 12, 13, 8, 10, 12, 0, 0]

In [None]:
# First 10 indices of the matching label window; labels are the inputs
# shifted one character ahead, so this should equal train_sequences[0][1:11].
train_labels[0][0:10]

[23, 25, 12, 13, 8, 10, 12, 0, 0, 0]

In [None]:
# First 10 character indices of the first test input window.
test_sequences[0][0:10]

[48, 47, 0, 53, 41, 38, 0, 36, 41, 54]

In [None]:
# First 10 indices of the matching test label window (inputs shifted by one
# character), so this should equal test_sequences[0][1:11].
test_labels[0][0:10]

[47, 0, 53, 41, 38, 0, 36, 41, 54, 51]

In [None]:
# Only the last expression of a cell is displayed, so the output shows the
# window length; the number of training windows is recorded in the trailing
# comment of the first line.
len(train_sequences) # 1180569
len(train_sequences[0]) # 80

80