In [1]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
import datetime, os
SESSION_ID = datetime.datetime.now().strftime("%d%m%Y-%H%M")

In [0]:
FILE_NAMES = [
    'pg_kant.txt', 'pg_nietzsch.txt', 'pg_platon.txt', 'pg_rousseau.txt']

# Preparation / normalization

### Loading files with Keras

In [4]:
import tensorflow as tf
from tensorflow.keras import layers
import urllib
import os

prefix = 'file://'
processed_path = '/content/drive/My Drive/RUAK/input/processed/'
url = urllib.parse.quote(processed_path)

for file_name in FILE_NAMES:
  text_dir = tf.keras.utils.get_file(file_name, origin=prefix+url+file_name)

parent_dir = os.path.dirname(text_dir)

Downloading data from file:///content/drive/My%20Drive/RUAK/input/processed/pg_kant.txt
Downloading data from file:///content/drive/My%20Drive/RUAK/input/processed/pg_nietzsch.txt
Downloading data from file:///content/drive/My%20Drive/RUAK/input/processed/pg_platon.txt
Downloading data from file:///content/drive/My%20Drive/RUAK/input/processed/pg_rousseau.txt


# Dataset

Create datasets - a seperate one for each text

In [5]:
import re
import os
import tensorflow as tf
import nltk

nltk.download('punkt')

def labeler(example, index):
  return example, tf.cast(index, tf.int64)

def to_sentences(text):
  return nltk.sent_tokenize(text, language='german')


labeled_data_sets = []

for index, file_name in enumerate(FILE_NAMES):
  path = os.path.join(parent_dir, file_name)
  tensor = tf.io.read_file(path)

  tensors = []

  with open(path, 'rb') as file: 
    text = file.read()
    text = str(text)
    sentences = to_sentences(text)

    for sentence in sentences:
      if ' ' in sentence == False:
        continue
      if len(sentence) <=20:
        continue
      tensors.append(tf.constant(sentence))

    dataset = tf.data.Dataset.from_tensor_slices(tensors)

    labeled_dataset = dataset.map(lambda ex: labeler(ex, index))
    labeled_data_sets.append(labeled_dataset)

    print(f"Created dataset for {file_name} with index: {index}.")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
Created dataset for pg_kant.txt with index: 0.
Created dataset for pg_nietzsch.txt with index: 1.
Created dataset for pg_platon.txt with index: 2.
Created dataset for pg_rousseau.txt with index: 3.


Define some values. If this in only used for hyperparameter tuning. The fist case should be used.

In [0]:
BUFFER_SIZE = 87710
# BATCH_SIZE = 60
BATCH_SIZE = 200
TAKE_SIZE = 3000

Combine the labeled datasets into a single dataset

In [0]:
all_labeled_data = labeled_data_sets[0]
for labeled_dataset in labeled_data_sets[1:]:
  all_labeled_data = all_labeled_data.concatenate(labeled_dataset)

all_labeled_data = all_labeled_data.shuffle(
    BUFFER_SIZE, reshuffle_each_iteration=False)


# Tokenization

In [8]:
import tensorflow_datasets as tfds

tokenizer = tfds.features.text.Tokenizer(alphanum_only=True)

vocabulary_set = set()
sentences_count = 0

for sentence_tensor, _ in all_labeled_data:
  sentences_count += 1
  some_tokens = tokenizer.tokenize(sentence_tensor.numpy())
  lower_tokens = []
  for token in some_tokens:
    lower_tokens.append(token)

  vocabulary_set.update(lower_tokens)

vocab_size = len(vocabulary_set)
print(f'{sentences_count} sentences from {len(FILE_NAMES)} authors.')
print(f'{vocab_size} unique vocabularies.')

87710 sentences from 4 authors.
87351 unique vocabularies.


# Encoding

In [0]:
encoder = tfds.features.text.TokenTextEncoder(vocabulary_set, lowercase=False, strip_vocab=True)

def encode_text(text_tensor, label):
  encoded_text = encoder.encode(text_tensor.numpy())
  return encoded_text, label

def encode_map_fn(text, label):
  encoded_text, label = tf.py_function(encode_text, 
                                       inp=[text, label], 
                                       Tout=(tf.int64, tf.int64))

  encoded_text.set_shape([None])
  label.set_shape([])
  return encoded_text, label

all_encoded_dataset = all_labeled_data.map(encode_map_fn)

Check encoding process

In [0]:
for sentence, index in all_labeled_data.take(1):
  print(sentence.numpy())

for encoded_sentence, index in all_encoded_dataset.take(1):
  print(encoded_sentence.numpy())
  decode_sample_text = encoder.decode(encoded_sentence.numpy())
  print(decode_sample_text)

# Splitting

Create train and test data for the fitting proccess.

In [0]:
train_data = all_encoded_dataset.skip(TAKE_SIZE)
train_data = train_data.shuffle(BUFFER_SIZE)

train_data = train_data.padded_batch(BATCH_SIZE)

test_data = all_encoded_dataset.take(TAKE_SIZE) 
test_data = test_data.shuffle(BUFFER_SIZE)

test_data = test_data.padded_batch(BATCH_SIZE) 

Check batching process

In [11]:
for batch, i in train_data.take(1):
  print(i)
  print(batch)

tf.Tensor(
[1 2 2 1 2 2 3 0 2 1 3 2 1 3 1 2 0 1 2 1 2 1 2 2 3 2 3 2 1 3 3 0 2 2 3 2 0
 2 1 1 1 1 2 3 3 3 2 2 0 2 1 2 0 2 1 0 0 1 2 1 3 0 1 3 2 1 0 1 1 1 1 1 3 1
 0 0 2 1 0 1 0 3 3 1 0 0 3 1 3 2 1 1 2 2 2 3 1 2 2 1 3 3 1 2 2 1 3 0 1 1 2
 0 3 1 2 3 1 2 1 3 3 1 3 2 1 2 1 0 0 0 3 1 3 1 0 1 2 2 3 2 1 3 3 3 1 3 1 1
 2 1 2 0 1 1 3 2 1 3 1 2 2 2 3 2 0 2 1 3 3 2 0 3 3 2 1 2 2 2 3 0 2 2 1 2 2
 1 1 2 2 2 0 1 1 2 1 1 3 1 0 1], shape=(200,), dtype=int64)
tf.Tensor(
[[63364   108 26890 ...     0     0     0]
 [77389 69512 31448 ...     0     0     0]
 [40375 69512 86643 ...     0     0     0]
 ...
 [12894 20365 26890 ...     0     0     0]
 [58307 18932 34104 ...     0     0     0]
 [80762 33995 28153 ...     0     0     0]], shape=(200, 133), dtype=int64)


# Hyperparameter tuning

### setup the test model

Hyperparameter Tuning

In [12]:
!pip install -q -U keras-tuner

[?25l[K     |██████                          | 10kB 23.6MB/s eta 0:00:01[K     |████████████                    | 20kB 2.1MB/s eta 0:00:01[K     |██████████████████              | 30kB 2.8MB/s eta 0:00:01[K     |████████████████████████        | 40kB 3.0MB/s eta 0:00:01[K     |██████████████████████████████  | 51kB 2.5MB/s eta 0:00:01[K     |████████████████████████████████| 61kB 2.1MB/s 
[?25h  Building wheel for keras-tuner (setup.py) ... [?25l[?25hdone
  Building wheel for terminaltables (setup.py) ... [?25l[?25hdone


In [0]:
import IPython
import kerastuner as kt
from tensorflow import keras

In [0]:
def model_builder(hp):

  hp_units = hp.Int('units', min_value = 256, max_value = 512, step = 128)
  hp_lstm_units = hp.Int('lstm_units', min_value = 256, max_value = 512, step = 128)
  hp_embedding_dims = hp.Choice('embedding_dims', values = [300])
  hp_dropout = hp.Choice('dropout', values = [0.0, 0.1])
  hp_learning_rate = hp.Choice('learning_rate', values = [0.01, 0.001])

  hypermodel = tf.keras.Sequential([
    tf.keras.layers.Embedding(encoder.vocab_size + 1, hp_units),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(hp_lstm_units, return_sequences=True)),
    tf.keras.layers.Dropout(hp_dropout),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(hp_lstm_units, return_sequences=True)),
    tf.keras.layers.Dropout(hp_dropout),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(hp_lstm_units)),
    tf.keras.layers.Dropout(hp_dropout),
    tf.keras.layers.Dense(hp_units, activation=tf.keras.activations.relu),
    tf.keras.layers.Dense(hp_units, activation=tf.keras.activations.relu),
    tf.keras.layers.Dense(len(FILE_NAMES))
  ])
  hypermodel.compile(optimizer=keras.optimizers.Adamax(learning_rate = hp_learning_rate),
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])
  
  return hypermodel

### Run the tuner

The result are the optimal hyperparameters: `best_hps`.

In [15]:
class ClearTrainingOutput(tf.keras.callbacks.Callback):
  def on_train_end(*args, **kwargs):
    IPython.display.clear_output(wait = True)

tuner = kt.Hyperband(model_builder,
                     objective='val_accuracy', 
                     max_epochs=35,
                     factor=3,
                     directory='/content/drive/My Drive/RUAK/output/hp_tuning',
                     project_name=f'Hyperband_{SESSION_ID}',
                     overwrite=True)

tuner.search(train_data, epochs =1, validation_data = test_data, callbacks = [ClearTrainingOutput()])

best_hps = tuner.get_best_hyperparameters(1)[0]

print(f"""
Optimal values:
- number of units in densely-connected layers {best_hps.get('units')}
- number of units in lstm {best_hps.get('lstm_units')}
- embedding dim {best_hps.get('embedding_dims')} 
- learning rate {best_hps.get('learning_rate')}
- dropout rate {best_hps.get('dropout')}
""")

Epoch 1/2
Epoch 2/2

InternalError: ignored

In [0]:
tuner.results_summary()

In [0]:
tuner = kt.RandomSearch(model_builder,
                        objective='val_accuracy',
                        max_trials=5, executions_per_trial=3,
                        directory='/content/drive/My Drive/RUAK/output/hp_tuning',
                        project_name=f'RandomSearch_{SESSION_ID}',
                        overwrite=True)

tuner.search(train_data, epochs =2, validation_data = test_data, callbacks = [ClearTrainingOutput()])

best_hps = tuner.get_best_hyperparameters(1)[0]

print(f"""
Optimal values:
- number of units in densely-connected layers {best_hps.get('units')}
- number of units in lstm {best_hps.get('lstm_units')}
- embedding dim {best_hps.get('embedding_dims')} 
- learning rate {best_hps.get('learning_rate')}
- dropout rate {best_hps.get('dropout')}
""")

In [0]:
tuner.results_summary()

# TensorBoard preparations

In [0]:
%load_ext tensorboard

In [0]:
import tensorboard
import datetime, os
from tensorflow.python.keras.callbacks import TensorBoard

log_dir = os.path.join("logs", datetime.datetime.now().strftime("%d.%m.%Y - %H:%M:%S"))

tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

# Prepare model

### Prepare callbacks

In [0]:
!pip install -q pyyaml h5py

Store the weights.

In [0]:
checkpoint_path = f"/content/drive/My Drive/RUAK/output/training_checkpoints/{SESSION_ID}/cp.ckpt"
checkpoint_dir = os.path.dirname(checkpoint_path)

model_path = f"/content/drive/My Drive/RUAK/output/models/model-{SESSION_ID}.h5"

cp_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_path,
                                                 save_weights_only=True,
                                                 verbose=1)

class Callbacks(tf.keras.callbacks.Callback):
  def on_epoch_end(self, epoch, logs={}):
    if(logs.get('val_loss')<0.015):
      print("\nTraining val. loss reached 0.015.")
      self.model.stop_training = True

epoch_callbacks = Callbacks()

### Set the hyperparameters

In [0]:
EMBEDDING_DIMS = 300
NUM_LSTM_UNITS = 256
NUM_UNITS = 256
DROPOUT = 0.1
OUTPUT = len(FILE_NAMES)
LEARNING_RATE = 0.0001
OPTIMIZER = tf.keras.optimizers.Adamax(learning_rate=LEARNING_RATE)
EPOCHS = 30

### Setup and complie the model

In [0]:
import os
import tensorflow as tf
import tensorflow.keras.layers as layers
import tensorflow.keras.activations as activations
import tensorflow.keras.losses as losses
import tensorflow.keras as keras
import tensorflow.keras.optimizers as optimizers
from gensim.models import Word2Vec
import numpy as np

MODEL_PATH = os.path.abspath("/content/drive/My Drive/RUAK/output/word_embedding/w2v/")

def embedding_matrix(model_name):
    model = Word2Vec.load(f'{MODEL_PATH}/{model_name}')
    embedding_matrix = np.zeros((len(model.wv.vocab), model.vector_size))
    for i in range(len(model.wv.vocab)):
        embedding_vector = model.wv[model.wv.index2word[i]]
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    print(f"Embedding_matrix shape: {embedding_matrix.shape}")
    return embedding_matrix

embedding_matrix = embedding_matrix('full_700_iter100_win7_8.model')

model = keras.Sequential([
    # layers.Embedding(len(embedding_matrix), EMBEDDING_DIMS, weights=[
    #                  embedding_matrix], trainable=False),
    layers.Embedding(vocab_size + 1, EMBEDDING_DIMS),
    layers.Bidirectional(layers.LSTM(
        NUM_LSTM_UNITS, return_sequences=True)),
    layers.Dropout(DROPOUT),
    layers.Bidirectional(layers.LSTM(
        NUM_LSTM_UNITS, return_sequences=True)),
    layers.Dropout(DROPOUT),
    layers.Bidirectional(layers.LSTM(NUM_LSTM_UNITS)),
    layers.Dropout(DROPOUT),
    layers.Dense(NUM_UNITS, activation=activations.relu),
    layers.Dense(NUM_UNITS, activation=activations.relu),
    layers.Dense(OUTPUT)
])

model.compile(optimizer=OPTIMIZER, loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), metrics=['accuracy'])
model.summary()

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


Embedding_matrix shape: (132919, 700)
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 300)         26205600  
_________________________________________________________________
bidirectional (Bidirectional (None, None, 1024)        3330048   
_________________________________________________________________
dropout (Dropout)            (None, None, 1024)        0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, None, 1024)        6295552   
_________________________________________________________________
dropout_1 (Dropout)          (None, None, 1024)        0         
_________________________________________________________________
bidirectional_2 (Bidirection (None, 1024)              6295552   
_________________________________________________________________
dropout_2 (Dropout

# Train the model

In [0]:
 model.fit(train_data, epochs=EPOCHS, validation_data=test_data, callbacks=[epoch_callbacks, cp_callback])

Epoch 1/60
Epoch 00001: saving model to /content/drive/My Drive/RUAK/output/training_checkpoints/13062020-0807/cp.ckpt
Epoch 2/60
Epoch 00002: saving model to /content/drive/My Drive/RUAK/output/training_checkpoints/13062020-0807/cp.ckpt
Epoch 3/60
Epoch 00003: saving model to /content/drive/My Drive/RUAK/output/training_checkpoints/13062020-0807/cp.ckpt
Epoch 4/60
Epoch 00004: saving model to /content/drive/My Drive/RUAK/output/training_checkpoints/13062020-0807/cp.ckpt
Epoch 5/60
Epoch 00005: saving model to /content/drive/My Drive/RUAK/output/training_checkpoints/13062020-0807/cp.ckpt
Epoch 6/60
Epoch 00006: saving model to /content/drive/My Drive/RUAK/output/training_checkpoints/13062020-0807/cp.ckpt
Epoch 7/60
Epoch 00007: saving model to /content/drive/My Drive/RUAK/output/training_checkpoints/13062020-0807/cp.ckpt
Epoch 8/60
Epoch 00008: saving model to /content/drive/My Drive/RUAK/output/training_checkpoints/13062020-0807/cp.ckpt
Epoch 9/60
Epoch 00009: saving model to /content

Save the model

In [0]:
model.save('/content/drive/My Drive/RUAK/output/models/phil_model.h5') 

# Loading

### Load model

In [0]:
model = tf.keras.models.load_model('/content/drive/My Drive/RUAK/output/models/phil_model.h5')

### Load stored weights

In [0]:
checkpoint_path = f"/content/drive/My Drive/RUAK/training_checkpoints/{ID}/cp.ckpt"
checkpoint_dir = os.path.dirname(checkpoint_path)
latest = tf.train.latest_checkpoint(checkpoint_dir)
model.load_weights(latest)

# Evaluate

In [0]:
test_loss, test_acc = model.evaluate(test_data)

print('Test Loss: {}'.format(test_loss))
print('Test Accuracy: {}'.format(test_acc))

Get index of author 

In [0]:
sample_sentence_text = "This is a test."

In [0]:
import operator

def pad_to_size(vec, size):
  zeros = [0] * (size - len(vec))
  vec.extend(zeros)
  return vec

def sample_predict(sample_text, pad):
  encoded_sample_text = encoder.encode(sample_text)

  if pad:
    encoded_sample_text = pad_to_size(encoded_sample_text, BATCH_SIZE)

  encoded_sample_text = tf.cast(encoded_sample_text, tf.float32)
  predictions = model.predict(tf.expand_dims(encoded_sample_text, 0))

  return (predictions)

predictions_padding = sample_predict(sample_sentence_text, pad=True)
predictions = sample_predict(sample_sentence_text, pad=False)

print('With padding:')
print(predictions_padding)
print('\n')
print('Without padding:')
print(predictions)

# TensorBoard

In [0]:
%tensorboard --logdir logs

In [0]:
!rm -rf ./logs/