In [None]:
# loss function huber
# optimizer adam or its variants
# activation hidden relu to get non-linearity
# activation final softmax to get probabilities
# argmax to get final prediction

In [None]:
# Load the raw corpus. The encoding is passed explicitly because the book contains
# accented Italian characters and open() otherwise falls back to a platform-dependent
# default (assumes the file is stored as UTF-8 — confirm if decoding fails).
with open('./notebooks/il-piccolo-principe.txt', 'r', encoding='utf-8') as file:
  text = file.read()


In [None]:
import re
import unicodedata

def simplify_italian_text(text):
  """Reduce text to lowercase ASCII letters separated by single spaces.

  Accented characters are decomposed (NFKD) and stripped of their diacritics,
  so 'è' becomes 'e'. Everything that is not a letter a-z (punctuation, digits,
  underscores, any kind of whitespace) acts as a word separator.

  Args:
    text: The raw input string.

  Returns:
    The simplified string, with no leading or trailing space.
  """
  # Decompose accented characters and drop whatever has no ASCII equivalent.
  text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('ascii')
  # Lowercase after normalizing: compatibility decomposition can itself
  # produce capital letters (e.g. the trademark sign becomes 'TM').
  text = text.lower()
  # Collapse every run of non-letters into a single space.
  return re.sub(r'[^a-z]+', ' ', text).strip()

In [None]:
# Normalize the corpus, then show its size, a short preview and the resulting
# alphabet rather than dumping the whole book into the cell output.
text = simplify_italian_text(text)
print(len(text), 'characters')
print(text[:500])
# Sorted so the alphabet is printed in a stable order.
print(sorted(set(text)))

In [None]:
import numpy as np

def string_to_int_array(text):
  """Encode a string as an array of integer character codes.

  Codes are assigned by sorting the distinct characters of text, so the
  mapping is deterministic for a given input.

  Args:
    text: The string to encode.

  Returns:
    A tuple (int_array, dictionary_size, char_to_int, int_to_char):
      int_array: 1-D integer NumPy array with one code per character of text.
      dictionary_size: Number of distinct characters in text.
      char_to_int: Callable mapping a character to its code (KeyError if unknown).
      int_to_char: Callable mapping a code back to its character (KeyError if unknown).
  """
  unique_chars = sorted(set(text))
  dictionary_size = len(unique_chars)
  char_to_int = {char: i for i, char in enumerate(unique_chars)}
  int_to_char = dict(enumerate(unique_chars))
  # Force an integer dtype: for empty text NumPy would otherwise infer float64,
  # which is not a valid dtype for indexing.
  int_array = np.array([char_to_int[char] for char in text], dtype=int)
  return int_array, dictionary_size, lambda x: char_to_int[x], lambda x: int_to_char[x]

In [None]:
# Encode the corpus and verify that both lookups round-trip at every position.
data, dictionary_size, char_to_int, int_to_char = string_to_int_array(text)
for i, char in enumerate(text):
  assert char_to_int(char) == data[i]
  assert int_to_char(data[i]) == char

In [None]:
import numpy as np

def one_hot_encode_array(int_array, num_classes):
  """Turn a 1-D array of class indices into a matrix of one-hot rows.

  Row k of the result is all zeros except for a 1 in column int_array[k].
  The result has shape (len(int_array), num_classes).
  """
  row_count = int_array.shape[0]
  encoded = np.zeros((row_count, num_classes))
  rows = np.arange(row_count)
  # Pair each row with its class index and set that single cell.
  encoded[rows, int_array] = 1
  return encoded

def one_hot_decode_array(one_hot_array):
  """Recover class indices from one-hot rows: the position of each row's maximum."""
  class_indices = np.argmax(one_hot_array, axis=1)
  return class_indices

def one_hot_encode(int, num_classes):
  """Build a single one-hot vector of length num_classes with a 1 at index `int`.

  Note: the first parameter shadows the built-in `int` inside this function;
  the name is kept so existing callers are unaffected.
  """
  vector = np.zeros(num_classes)
  vector[int] = 1
  return vector

def one_hot_decode(one_hot):
  """Return the index of the largest entry of a one-hot (or score) vector."""
  index = np.argmax(one_hot)
  return index

In [None]:
# Number of consecutive characters that make up one model input.
window_size = 100

def one_hot_to_x(data_one_hot):
  """Cut a one-hot sequence into every contiguous window of window_size rows.

  Window k holds rows k .. k + window_size - 1, so an input with n rows yields
  n - window_size + 1 windows stacked along a new leading axis. The input must
  have at least window_size rows (enforced by an assert).
  """
  row_count = data_one_hot.shape[0]
  assert row_count >= window_size
  # Index matrix: entry (k, j) is k + j, i.e. the j-th row of window k.
  starts = np.arange(row_count - window_size + 1)[:, np.newaxis]
  offsets = np.arange(window_size)[np.newaxis, :]
  return data_one_hot[starts + offsets]

# One-hot encode the whole corpus: one row per character.
data_one_hot = one_hot_encode_array(data, dictionary_size)
assert len(data_one_hot) == len(data)

# Inputs: every window of window_size characters that still has a successor,
# hence the last row is left out before windowing.
x = one_hot_to_x(data_one_hot[:-1])
assert len(x) == len(data_one_hot) - window_size

# Targets: the character immediately following each input window.
y = data_one_hot[window_size:]
assert len(y) == len(x)


In [None]:
import tensorflow as tf
import numpy as np

# Next-character classifier: each input is a window of one-hot characters and the
# output is a probability distribution over the dictionary.
model = tf.keras.Sequential([
  # Declare the input explicitly instead of passing input_shape to the first layer.
  tf.keras.Input(shape=(window_size, dictionary_size)),
  # Dense layers act on the last axis only, so these three are applied to each
  # position of the window independently.
  tf.keras.layers.Dense(2048, activation='relu'),
  tf.keras.layers.Dense(1024, activation='relu'),
  tf.keras.layers.Dense(512, activation='relu'),
  # Merge all positions into one feature vector before the classification head.
  tf.keras.layers.Flatten(),
  tf.keras.layers.Dense(dictionary_size, activation='softmax')
])

# The targets are one-hot vectors and the output is a softmax, i.e. this is
# multi-class classification, for which categorical cross-entropy is the matching
# loss. Huber (used previously) is a regression loss and yields only weak gradients
# on probabilities bounded in [0, 1].
model.compile(loss='categorical_crossentropy', optimizer='adam')

# validation_split holds out the last 20% of the samples (taken before shuffling).
# The returned History object is kept so the loss curves can be inspected later.
history = model.fit(x, y, batch_size=128, epochs=5, validation_split=0.2)


In [None]:
def predict_next_one_hot(one_hot_sequence):
  """Predict the character that follows a one-hot encoded sequence.

  Only the last window_size rows are fed to the model; the most probable class
  (argmax of the predicted distribution) is returned as a one-hot vector of
  length dictionary_size.

  Args:
    one_hot_sequence: One-hot array with at least window_size rows.

  Returns:
    1-D one-hot array for the predicted next character.
  """
  sequence_length = one_hot_sequence.shape[0]
  assert sequence_length >= window_size
  # Slice the final window directly instead of materializing every window of the
  # sequence just to keep the last one.
  last_window = one_hot_sequence[sequence_length - window_size:]
  # The model expects a batch axis; verbose=0 suppresses the progress bar that
  # Keras would otherwise print on every single call.
  probabilities = model.predict(last_window[np.newaxis, ...], verbose=0)[0]
  return one_hot_encode(np.argmax(probabilities), dictionary_size)

def generate_text(starting_text, num_chars):
  """Extend starting_text by num_chars greedily predicted characters.

  Each new character is predicted from the sequence generated so far and then
  appended to it, so later predictions see earlier ones.

  Args:
    starting_text: Seed string; every character must be known to char_to_int,
      and it must be at least window_size characters long for prediction to run.
    num_chars: Number of characters to generate; values below 1 generate nothing.

  Returns:
    The seed text followed by the generated characters.
  """
  num_chars = max(num_chars, 0)
  # dtype=int keeps an empty seed usable as an index array.
  seed_ints = np.array([char_to_int(char) for char in starting_text], dtype=int)
  seed_length = seed_ints.shape[0]
  # Preallocate the whole sequence once instead of re-copying it with np.append
  # on every generated character.
  one_hot_sequence = np.zeros((seed_length + num_chars, dictionary_size))
  one_hot_sequence[:seed_length] = one_hot_encode_array(seed_ints, dictionary_size)
  for end in range(seed_length, seed_length + num_chars):
    # Predict from everything produced so far and store the result in place.
    one_hot_sequence[end] = predict_next_one_hot(one_hot_sequence[:end])
  return ''.join(int_to_char(one_hot_decode(one_hot)) for one_hot in one_hot_sequence)

In [None]:
# Left-pad the prompt with spaces up to one full window, then generate 1000 more characters.
seed_text = 'per sommo piacere'.rjust(window_size)
p1 = generate_text(seed_text, 1000)
print(p1)