In [None]:
!pip install tensorflow_datasets
!pip install -q -U keras-tuner
!pip install -q pyyaml h5py

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import tensorflow_datasets as tfds
import tensorboard
from tensorflow.python.keras.callbacks import TensorBoard
import urllib
import os
import datetime
import re
import nltk
import IPython
import kerastuner as kt
import tensorflow.keras.activations as activations
import tensorflow.keras.losses as losses
import tensorflow.keras.optimizers as optimizers
from gensim.models import Word2Vec
import numpy as np

In [None]:
# Timestamp (day, month, year - hour, minute) taken when this cell runs.
# It is embedded in the tuner project names and the checkpoint/model paths below.
SESSION_ID = f"{datetime.datetime.now():%d%m%Y-%H%M}"

List the names of the files that should be processed.

In [None]:
# One plain-text file per author. The position of a file in this list is used
# as the integer class label for its sentences.
FILE_NAMES = [
    'pg_kant.txt',
    'pg_nietzsch.txt',
    'pg_platon.txt',
    'pg_rousseau.txt',
]

# Preparation / normalization

### Loading files with Keras

In [None]:
# `import urllib` alone does not guarantee that the `parse` submodule is loaded,
# so import it explicitly before using `urllib.parse.quote`.
import urllib.parse

prefix = 'file://'
processed_path = '/content/drive/My Drive/RUAK/input/processed/'
# Percent-encode the path (it contains a space) so it can be used in a file:// URL.
url = urllib.parse.quote(processed_path)

# Fetch every input file through Keras; `get_file` returns the local path of
# the fetched copy.
text_dir = None
for file_name in FILE_NAMES:
  text_dir = tf.keras.utils.get_file(file_name, origin=prefix+url+file_name)

if text_dir is None:
  raise ValueError("FILE_NAMES is empty - there is nothing to load.")

# Directory that holds the fetched copies; later cells read the files from here.
parent_dir = os.path.dirname(text_dir)

# Dataset management

Create datasets - a separate one for each text

In [None]:
nltk.download('punkt')

def labeler(example, index):
  """Pair a text example with its author index, cast to int64.

  Returns:
    A tuple `(example, label)` where `label` is `index` as a tf.int64 tensor.
  """
  label = tf.cast(index, tf.int64)
  return example, label

def to_sentences(text):
  """Split `text` into sentences using NLTK's sentence tokenizer for German."""
  sentences = nltk.sent_tokenize(text, language='german')
  return sentences


# One labeled tf.data.Dataset per input file; the label is the file's index
# in FILE_NAMES.
labeled_data_sets = []

for index, file_name in enumerate(FILE_NAMES):
  path = os.path.join(parent_dir, file_name)

  # Read the file as text. Reading it as bytes and calling str() on the result
  # produces the repr "b'...'" (escaped newlines, escaped non-ASCII bytes),
  # which corrupts sentence splitting and umlauts.
  # NOTE(review): assumes the processed files are UTF-8 encoded - confirm.
  with open(path, encoding='utf-8', errors='replace') as file:
    text = file.read()

  # Some cleanup for short sentences: keep only sentences that contain a space
  # and are longer than 20 characters.
  # (`' ' in sentence == False` is a chained comparison and is always False,
  # so the "no space" filter never fired.)
  sentences = [
      sentence for sentence in to_sentences(text)
      if ' ' in sentence and len(sentence) > 20
  ]

  # An explicit string dtype keeps the dataset well-typed even if no sentence
  # survives the filter.
  dataset = tf.data.Dataset.from_tensor_slices(
      tf.constant(sentences, dtype=tf.string))

  labeled_dataset = dataset.map(lambda ex: labeler(ex, index))
  labeled_data_sets.append(labeled_dataset)

  print(f"Created dataset for {file_name} with index: {index}.")

Define some values. If this is only used for hyperparameter tuning, the first case should be used.

In [None]:
# Buffer size passed to every Dataset.shuffle() call in this notebook.
buffer_size = 87710
# Number of sentences per padded batch.
batch_size = 40

Combine the labeled datasets into a single dataset

In [None]:
# Chain the per-author datasets one after another, starting from the first.
all_labeled_data = labeled_data_sets[0]
for position in range(1, len(labeled_data_sets)):
  all_labeled_data = all_labeled_data.concatenate(labeled_data_sets[position])

# Shuffle once; reshuffle_each_iteration=False keeps the same order on every
# pass, which the later skip/take train-test split relies on.
all_labeled_data = all_labeled_data.shuffle(
    buffer_size=buffer_size, reshuffle_each_iteration=False)


# Tokenization

In [None]:
# TODO: Change tokenizer due to deprecation.
tokenizer = tfds.deprecated.text.Tokenizer(alphanum_only=True)

vocabulary_set = set()
sentences_count = 0

# Single pass over all sentences: count them and collect every token
# (tokens are kept exactly as produced by the tokenizer, no case folding).
for sentence_tensor, _ in all_labeled_data:
  vocabulary_set.update(tokenizer.tokenize(sentence_tensor.numpy()))
  sentences_count += 1

vocab_size = len(vocabulary_set)
print(f'{sentences_count} sentences from {len(FILE_NAMES)} authors.')
print(f'{vocab_size} unique vocabularies.')

# Encoding

In [None]:
# TODO: Change encoder due to deprecation.
encoder = tfds.deprecated.text.TokenTextEncoder(
    vocabulary_set, lowercase=False, strip_vocab=True)


def encode_text(text_tensor, label):
  """Encode one eager text tensor into token ids; the label passes through."""
  return encoder.encode(text_tensor.numpy()), label


def encode_map(text, label):
  """Dataset.map-compatible wrapper around `encode_text`.

  The Python-side encoder is run through `tf.py_function`; afterwards the
  static shapes are set explicitly (variable-length id vector, scalar label).
  """
  encoded_text, label = tf.py_function(
      encode_text, inp=[text, label], Tout=(tf.int64, tf.int64))

  encoded_text.set_shape([None])
  label.set_shape([])
  return encoded_text, label


all_encoded_dataset = all_labeled_data.map(encode_map)

Check encoding process

In [None]:
# Print one raw sentence from the labeled dataset ...
for raw_sentence, _label in all_labeled_data.take(1):
  print(raw_sentence.numpy())

# ... and one encoded sentence together with its decoded form.
for encoded_sentence, _label in all_encoded_dataset.take(1):
  token_ids = encoded_sentence.numpy()
  print(token_ids)
  print(encoder.decode(token_ids))

# Splitting

Create train and test data for the fitting process.

In [None]:
# The first 20 % of the (already shuffled) sentences form the test set,
# the remainder is used for training.
take_size = int(sentences_count * 0.2)

train_data = (
    all_encoded_dataset
    .skip(take_size)
    .shuffle(buffer_size)
    .padded_batch(batch_size)
)

test_data = (
    all_encoded_dataset
    .take(take_size)
    .shuffle(buffer_size)
    .padded_batch(batch_size)
)

Check batching process

In [None]:
# Print the labels and then the padded token ids of one training batch.
for token_batch, label_batch in train_data.take(1):
  print(label_batch)
  print(token_batch)

# Hyperparameter tuning

### Set up the test model

Hyperparameter Tuning

In [None]:
def model_builder(hp):
  """Build and compile the tunable author-classification model.

  Searched hyperparameters: `units` (dense layers), `lstm_units`,
  `embedding_dims`, `dropout` and `learning_rate`.

  Args:
    hp: hyperparameter object handed in by the tuner; only its `Int` and
      `Choice` methods are used.

  Returns:
    A compiled `tf.keras.Sequential` model that outputs one logit per entry
    in FILE_NAMES.
  """

  hp_units = hp.Int('units', min_value = 256, max_value = 512, step = 128)
  hp_lstm_units = hp.Int('lstm_units', min_value = 256, max_value = 512, step = 128)
  hp_embedding_dims = hp.Choice('embedding_dims', values = [300])
  hp_dropout = hp.Choice('dropout', values = [0.0, 0.1])
  hp_learning_rate = hp.Choice('learning_rate', values = [0.01, 0.001])

  hypermodel = tf.keras.Sequential([
    # The embedding width comes from `embedding_dims`. It was previously tied
    # to `units`, which left the `embedding_dims` hyperparameter without effect.
    tf.keras.layers.Embedding(encoder.vocab_size + 1, hp_embedding_dims),
    tf.keras.layers.Bidirectional(layers.LSTM(hp_lstm_units, return_sequences=True)),
    tf.keras.layers.Dropout(hp_dropout),
    tf.keras.layers.Bidirectional(layers.LSTM(hp_lstm_units, return_sequences=True)),
    tf.keras.layers.Dropout(hp_dropout),
    tf.keras.layers.Bidirectional(layers.LSTM(hp_lstm_units)),
    tf.keras.layers.Dropout(hp_dropout),
    tf.keras.layers.Dense(hp_units, activation=activations.relu),
    tf.keras.layers.Dense(hp_units, activation=activations.relu),
    # No activation: the loss below is configured with from_logits=True.
    tf.keras.layers.Dense(len(FILE_NAMES))
  ])
  hypermodel.compile(optimizer=optimizers.Adamax(learning_rate = hp_learning_rate),
              loss=losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])

  return hypermodel

### Run the tuner

The result is the set of optimal hyperparameters found by each tuner: `best_hps_hyperband` and `best_hps_random_search`.

In [None]:
class ClearTrainingOutput(tf.keras.callbacks.Callback):
  """Callback that clears the notebook cell output whenever a training run ends."""
  def on_train_end(self, logs=None):
    IPython.display.clear_output(wait = True)

hyperband_tuner = kt.Hyperband(model_builder,
                     objective='val_accuracy',
                     max_epochs=35,
                     factor=3,
                     directory='/content/drive/My Drive/RUAK/output/hp_tuning', # This path may need to be changed.
                     project_name=f'Hyperband_{SESSION_ID}',
                     overwrite=True)

hyperband_tuner.search(train_data, epochs =1, validation_data = test_data, callbacks = [ClearTrainingOutput()])

best_hps_hyperband = hyperband_tuner.get_best_hyperparameters(1)[0]

# Report the values of the best trial. The undefined name `best_hps` used here
# before raised a NameError; the result of this tuner is `best_hps_hyperband`.
print(f"""
Optimal values:
- number of units in densely-connected layers {best_hps_hyperband.get('units')}
- number of units in lstm {best_hps_hyperband.get('lstm_units')}
- embedding dim {best_hps_hyperband.get('embedding_dims')} 
- learning rate {best_hps_hyperband.get('learning_rate')}
- dropout rate {best_hps_hyperband.get('dropout')}
""")

In [None]:
# Print the tuner's summary of the Hyperband search results.
hyperband_tuner.results_summary()

In [None]:
# Second search strategy over the same `model_builder`: random search with
# 5 trials, each executed 3 times.
random_search_tuner = kt.RandomSearch(model_builder,
                        objective='val_accuracy',
                        max_trials=5, executions_per_trial=3,
                        directory='/content/drive/My Drive/RUAK/output/hp_tuning', # This path may need to be changed.
                        project_name=f'RandomSearch_{SESSION_ID}',
                        overwrite=True)

random_search_tuner.search(train_data, epochs =2, validation_data = test_data, callbacks = [ClearTrainingOutput()])

best_hps_random_search = random_search_tuner.get_best_hyperparameters(1)[0]

# Report the values of the best trial. The undefined name `best_hps` used here
# before raised a NameError; the result of this tuner is `best_hps_random_search`.
print(f"""
Optimal values:
- number of units in densely-connected layers {best_hps_random_search.get('units')}
- number of units in lstm {best_hps_random_search.get('lstm_units')}
- embedding dim {best_hps_random_search.get('embedding_dims')} 
- learning rate {best_hps_random_search.get('learning_rate')}
- dropout rate {best_hps_random_search.get('dropout')}
""")

In [None]:
# Print the tuner's summary of the random-search results.
random_search_tuner.results_summary()

# TensorBoard preparations

In [None]:
%load_ext tensorboard

In [None]:
# Each run logs into its own sub-directory of ./logs, named after the time
# at which this cell was executed.
run_stamp = datetime.datetime.now().strftime("%d.%m.%Y - %H:%M:%S")
log_dir = os.path.join("logs", run_stamp)

tensorboard_callback = tf.keras.callbacks.TensorBoard(
    log_dir=log_dir, histogram_freq=1)

# Model preparation

### Prepare callbacks

In [None]:
# Weights-only checkpoints are written to a per-session directory on Drive.
checkpoint_path = f"/content/drive/My Drive/RUAK/output/training_checkpoints/{SESSION_ID}/cp.ckpt" # This path may need to be changed.
checkpoint_dir = os.path.dirname(checkpoint_path)

# NOTE(review): `model_path` is reassigned in the Word2Vec cell further down and
# the model is saved under a different hard-coded file name, so this value looks
# unused - confirm before relying on it.
model_path = f"/content/drive/My Drive/RUAK/output/models/model-{SESSION_ID}.h5" # This path may need to be changed.

# Callback that saves only the model weights to `checkpoint_path`, with verbose logging.
cp_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_path,
                                                 save_weights_only=True,
                                                 verbose=1)

class Callbacks(tf.keras.callbacks.Callback):
  """Stops training as soon as the validation loss drops below 0.015."""

  def on_epoch_end(self, epoch, logs=None):
    """Check `val_loss` after each epoch and request a stop below the threshold.

    Args:
      epoch: index of the epoch that just finished (unused).
      logs: metrics dict of the finished epoch; may be None or lack 'val_loss'.
    """
    # Avoid a shared mutable default and a `None < float` TypeError when no
    # validation loss was reported for this epoch.
    val_loss = (logs or {}).get('val_loss')
    if val_loss is not None and val_loss < 0.015:
      print("\nTraining val. loss reached 0.015.") # TODO: This should be an early stopping callback from Tensorflow.
      self.model.stop_training = True

epoch_callbacks = Callbacks()

### Set the hyperparameters

In [None]:
# Output width of the embedding layer; the Word2Vec model loaded below is
# described as having 700 dimensions, which this value has to match.
EMBEDDING_DIMS = 700
# Units of each (bidirectional) LSTM layer.
NUM_LSTM_UNITS = 256
# Units of each hidden dense layer.
NUM_UNITS = 512
# Dropout rate applied after every LSTM layer.
DROPOUT = 0.1
# One output unit per author file.
OUTPUT = len(FILE_NAMES)
LEARNING_RATE = 0.01
OPTIMIZER = optimizers.Adamax(learning_rate=LEARNING_RATE)
# Number of training epochs passed to model.fit.
EPOCHS = 1

### Set up and compile the model

Load the Word2Vec model (700 dimensions, 100 epochs, window size 7) that provides the weights for the embedding layer.

In [None]:
model_path = os.path.abspath("/content/drive/My Drive/RUAK/output/embedding/w2v/") # This path may need to be changed.

def get_embedding_matrix(model_name):
    """Load a saved Word2Vec model and return its vectors as a 2-D array.

    Args:
        model_name: file name of the saved model inside `model_path`.

    Returns:
        A float array of shape (vocabulary size, vector size); row i holds the
        vector of the i-th word in the Word2Vec model's own index order.

    NOTE(review): rows follow the Word2Vec index order, while the datasets are
    encoded with `encoder`, which builds its ids from a separate vocabulary.
    Confirm that both index spaces agree before using this matrix as
    embedding weights.
    """
    model = Word2Vec.load(os.path.join(model_path, model_name))
    keyed_vectors = model.wv

    # gensim 4 renamed `index2word` to `index_to_key` and removed `wv.vocab`;
    # support both API generations.
    words = getattr(keyed_vectors, 'index_to_key', None)
    if words is None:
        words = keyed_vectors.index2word

    embedding_matrix = np.zeros((len(words), model.vector_size))
    for i, word in enumerate(words):
        embedding_matrix[i] = keyed_vectors[word]

    print(f"Embedding_matrix shape: {embedding_matrix.shape}")
    return embedding_matrix

embedding_matrix = get_embedding_matrix('full_700_iter100_win7_8.model') # Model is created by Word2Vec.

Build and compile the model

In [None]:
# Three stacked bidirectional LSTM layers on top of a frozen, pre-trained
# embedding, followed by two dense layers and a softmax output per author.
model = keras.Sequential([
    layers.Embedding(len(embedding_matrix), EMBEDDING_DIMS, weights=[
                     embedding_matrix], trainable=False),
    layers.Bidirectional(layers.LSTM(
        NUM_LSTM_UNITS, return_sequences=True)),
    layers.Dropout(DROPOUT),
    layers.Bidirectional(layers.LSTM(
        NUM_LSTM_UNITS, return_sequences=True)),
    layers.Dropout(DROPOUT),
    layers.Bidirectional(layers.LSTM(NUM_LSTM_UNITS)),
    layers.Dropout(DROPOUT),
    layers.Dense(NUM_UNITS, activation=activations.relu),
    layers.Dense(NUM_UNITS, activation=activations.relu),
    layers.Dense(OUTPUT, activation=activations.softmax)
])

# The output layer already applies softmax, so the loss must treat the model
# output as probabilities (from_logits=False). With from_logits=True the
# softmax would effectively be applied twice.
model.compile(optimizer=OPTIMIZER, loss=losses.SparseCategoricalCrossentropy(from_logits=False),
              metrics=['accuracy'])
model.summary()

# Model training

In [None]:
 # Include the TensorBoard callback so the `logs` directory opened in the TensorBoard analysis section is actually written.
 model.fit(train_data, epochs=EPOCHS, validation_data=test_data, callbacks=[epoch_callbacks, cp_callback, tensorboard_callback])

Save the model

In [None]:
# Persist the whole trained model as a single HDF5 file on Drive.
model.save('/content/drive/My Drive/RUAK/output/models/phil_model.h5') # This path may need to be changed.

# Loading

### Load model

In [None]:
# Replace the in-memory `model` with the one stored in the HDF5 file written by the save cell.
model = tf.keras.models.load_model('/content/drive/My Drive/RUAK/output/models/phil_model.h5') # This path may need to be changed.

### Load stored weights

In [None]:
# Use the same per-session directory the ModelCheckpoint callback writes to.
# The previous path referenced an undefined name `ID` and omitted the
# `output/` folder, so no checkpoint could be found.
checkpoint_path = f"/content/drive/My Drive/RUAK/output/training_checkpoints/{SESSION_ID}/cp.ckpt" # This path may need to be changed.
checkpoint_dir = os.path.dirname(checkpoint_path)
latest = tf.train.latest_checkpoint(checkpoint_dir)
# latest_checkpoint returns None when the directory holds no checkpoint;
# fail with a clear message instead of passing None to load_weights.
if latest is None:
  raise FileNotFoundError(f"No checkpoint found in {checkpoint_dir}.")
model.load_weights(latest)

# Evaluate

In [None]:
# The model is compiled with a single metric ('accuracy'), so evaluate()
# yields the loss followed by the accuracy.
test_loss, test_acc = model.evaluate(test_data)

print(f'Test Loss: {test_loss}')
print(f'Test Accuracy: {test_acc}')

Get index of author 

In [None]:
# Sentence that is classified by `sample_predict` in the next cell.
sample_sentence_text = "This is a test."

In [None]:
import operator

def pad_to_size(vec, size):
  """Right-pad the list `vec` with zeros, in place, up to length `size`.

  A list that already has `size` or more elements is left unchanged.
  The same list object is returned.
  """
  vec.extend(0 for _ in range(size - len(vec)))
  return vec

def sample_predict(sample_text, pad):
  """Encode `sample_text` and return the model's prediction for it.

  Args:
    sample_text: the sentence to classify.
    pad: if truthy, the encoded sentence is zero-padded to `batch_size`
      entries before prediction.

  Returns:
    The output of `model.predict` for a batch containing this one sentence.
  """
  encoded_sample_text = encoder.encode(sample_text)

  if pad:
    # `BATCH_SIZE` is not defined anywhere in this notebook (NameError);
    # the constant is called `batch_size`.
    encoded_sample_text = pad_to_size(encoded_sample_text, batch_size)

  encoded_sample_text = tf.cast(encoded_sample_text, tf.float32)
  # Add a leading batch dimension of 1.
  predictions = model.predict(tf.expand_dims(encoded_sample_text, 0))

  return predictions

# Predict the same sentence with and without zero-padding and show both results.
predictions_padding = sample_predict(sample_sentence_text, pad=True)
predictions = sample_predict(sample_sentence_text, pad=False)

print('With padding:', predictions_padding, '\n',
      'Without padding:', predictions, sep='\n')

# TensorBoard analysis

In [None]:
%tensorboard --logdir logs

In [None]:
!rm -rf ./logs/