In [309]:
!pip install tensorflow_datasets
!pip install -q -U keras-tuner
!pip install -q pyyaml h5py



In [310]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [335]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Dense, Embedding, LSTM, Bidirectional, Dropout, BatchNormalization 
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
import tensorflow_datasets as tfds
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.callbacks import TensorBoard
import urllib, IPython, os, datetime, re, nltk, tensorboard, operator
import kerastuner as kt
import tensorflow.keras.activations as activations
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow.keras.losses as losses
import tensorflow.keras.optimizers as optimizers
from gensim.models import Word2Vec
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [312]:
session_id = datetime.datetime.now().strftime("%d/%m/%Y-%H:%M")

Add the file names which should be processed

In [368]:
file_names = [
    'pg_kant.txt', 
    'pg_nietzsch.txt', 
    'pg_platon.txt', 
    'pg_rousseau.txt']

# Preparation / normalization

### Loading files with Keras

In [314]:
prefix = 'file://'
processed_path = '/content/drive/My Drive/RUAK/input/processed/'
url = urllib.parse.quote(processed_path)

for file_name in file_names:
  text_dir = tf.keras.utils.get_file(file_name, origin=prefix+url+file_name)

parent_dir = os.path.dirname(text_dir)

# Dataset managment

Create datasets - a seperate one for each text. Adjust the language for the `nltk.sent_tokenizer` if needed.

In [369]:
nltk.download('punkt')

def labeler(ex, index):
  return ex, tf.cast(index, tf.int64)

sentences = []
labels = []
df = pd.DataFrame()

for index, file_name in enumerate(file_names):
  path = os.path.join(parent_dir, file_name)


  with open(path, 'rb') as file: 
    text = str(file.read())
    nltk_sentences = nltk.sent_tokenize(text, language='german')

    for sentence in nltk_sentences:
      if ' ' in sentence == False:
        continue
      if len(sentence) <=20:
        continue
      sentence = str(sentence).replace("b'", "")
      sentences.append(sentence)
      labels.append(index)

    print(f"Created dataset for {file_name} with index: {index}.")

print(f'{len(sentences)} found.')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Created dataset for pg_kant.txt with index: 0.
Created dataset for pg_nietzsch.txt with index: 1.
Created dataset for pg_platon.txt with index: 2.
Created dataset for pg_rousseau.txt with index: 3.
87710 found.


Define some values

In [370]:
buffer_size = 87710
batch_size = 40

Combine the labeled datasets into a single dataset

# Tokenization

In [371]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences)
print(f'{len(sentences)} sentences from {len(file_names)} authors.')
print(f'{len(tokenizer.word_counts)} unique vocabularies.')

87710 sentences from 4 authors.
79401 unique vocabularies.


# Encoding

In [None]:
encoded_sentences = tokenizer.texts_to_sequences(sentences)
padded_sentences = pad_sequences(encoded_sentences, padding='post')
print(sentences[7])
print(np.array(padded_sentences[7]))
print(tokenizer.sequences_to_texts([padded_sentences[7]]))

# Splitting

Create train and test data for the fitting proccess.

In [373]:
X_train, X_valid, y_train, y_valid = train_test_split(padded_sentences, np.array(labels), test_size=0.1)
print(X_train.shape)
print(y_train.shape)
print(X_valid.shape)
print(y_valid.shape)

(78939, 430)
(78939,)
(8771, 430)
(8771,)


In [390]:
print(X_train[1])
print(y_train[1])
print(X_valid[1])
print(y_valid[1])

[   37  2513   233    10   188    35    73  2622 33491     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0   

# Hyperparameter tuning

### setup the test model

Load the Word2Vec model (700 dims, 100 epochs, window 7) for providing the weights for the embedding layer.

In [346]:
model_path = os.path.abspath("/content/drive/My Drive/RUAK/output/embedding/w2v/") # This path may need to be changed.

def get_embedding_matrix(model_name):
    model = Word2Vec.load(f'{model_path}/{model_name}')
    embedding_matrix = np.zeros((len(model.wv.vocab), model.vector_size))
    
    for i in range(len(model.wv.vocab)):
        embedding_vector = model.wv[model.wv.index2word[i]]
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    print(f"Embedding_matrix shape: {embedding_matrix.shape}")
    return embedding_matrix

embedding_matrix = get_embedding_matrix('full_700_iter100_win7_8.model')

Embedding_matrix shape: (132919, 700)


Hyperparameter Tuning

In [393]:
def hypermodel(hp):

  hp_embedding_trainable = hp.Choice('embedding_trainable', [True, False])
  hp_dense_units = hp.Int('dense_units', 64, 512, step=64)
  hp_lstm_units = hp.Int('lstm_units', 256, 512, step=128)
  hp_dropout = hp.Choice('dropout', [0.0, 0.1, 0.25])
  hp_learning_rate = hp.Choice('learning_rate', [0.01, 0.001, 0.0001])

  model = tf.keras.Sequential()
                                    
  model.add(Embedding(len(embedding_matrix),
                    output_dim=700,
                    weights=[embedding_matrix], 
                    trainable=hp_embedding_trainable,
                    mask_zero=True))
  
  model.add(Bidirectional(LSTM(hp_lstm_units, return_sequences=True)))
  model.add(Dropout(hp_dropout))
  model.add(Bidirectional(LSTM(hp_lstm_units, return_sequences=True)))
  model.add(Dropout(hp_dropout))
  model.add(Bidirectional(LSTM(hp_lstm_units)))
  model.add(Dropout(hp_dropout))
  model.add(Dense(hp_dense_units, activation='relu'))
  model.add(Dense(hp_dense_units, activation='relu'))
  model.add(Dense(len(file_names), activation='softmax'))

  model.compile(optimizer=optimizers.Adam(learning_rate=hp_learning_rate),
              loss=losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])
  return model

### Run the tuner

The result are the optimal hyperparameters: `best_hps`.

In [None]:
class ClearTrainingOutput(tf.keras.callbacks.Callback):
  def on_train_end(*args, **kwargs):
    IPython.display.clear_output(wait=True)

tuner = kt.Hyperband(hypermodel,
                     objective='val_accuracy', 
                     executions_per_trial=1,
                     factor=3,
                     max_epochs=1,
                     hyperband_iterations=1,
                     directory='/content/drive/My Drive/RUAK/output/hp_tuning', # This path may need to be changed.
                     project_name='RUAK',
                     overwrite=True)

tuner.search(X_train, y_train, 
             epochs=1,
             validation_data = (X_valid, y_valid),
             callbacks = [ClearTrainingOutput(), EarlyStopping('val_accuracy', patience=1)],
             verbose=2)

best_hps = tuner.get_best_hyperparameters(1)[0]
best_models = tuner.get_best_models(num_models=3)

print(f"""
Optimal values:
- embedding is trainable {best_hps.get('embedding_trainabble')}
- number of units for dense layers {best_hps.get('dense_units')}
- number of units for lstm layers {best_hps.get('lstm_units')}
- learning rate {best_hps.get('learning_rate')}
- dropout rate {best_hps.get('dropout')}
""")

tuner.results_summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 700)         93043300  
_________________________________________________________________
bidirectional (Bidirectional (None, None, 512)         1959936   
_________________________________________________________________
dropout (Dropout)            (None, None, 512)         0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, None, 512)         1574912   
_________________________________________________________________
dropout_1 (Dropout)          (None, None, 512)         0         
_________________________________________________________________
bidirectional_2 (Bidirection (None, 512)               1574912   
_________________________________________________________________
dropout_2 (Dropout)          (None, 512)               0

# TensorBoard preparations

In [None]:
%load_ext tensorboard
log_dir = os.path.join("logs", datetime.datetime.now().strftime("%d/%m/%Y - %H:%M"))
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

# Model preparation

Select the model provided by the tuning process.

In [None]:
for model in best_models:
  model.summary()

chosen_model = best_models[0] 

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 700)         93043300  
_________________________________________________________________
bidirectional (Bidirectional (None, None, 768)         3333120   
_________________________________________________________________
dropout (Dropout)            (None, None, 768)         0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, None, 768)         3542016   
_________________________________________________________________
dropout_1 (Dropout)          (None, None, 768)         0         
_________________________________________________________________
bidirectional_2 (Bidirection (None, 768)               3542016   
_________________________________________________________________
dropout_2 (Dropout)          (None, 768)               0

### Prepare callbacks

In [None]:
checkpoint_path = f"/content/drive/My Drive/RUAK/output/training_checkpoints/{session_id}/cp.ckpt" # This path may need to be changed.
checkpoint_dir = os.path.dirname(checkpoint_path)

model_path = f"/content/drive/My Drive/RUAK/output/models/model-{session_id}.h5" # This path may need to be changed.

cp_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_path,
                                                 save_weights_only=True,
                                                 verbose=1)

es_callback = EarlyStopping('val_accuracy', patience=4)

# Model training

In [None]:
 history = chosen_model.fit(train_data, epochs=1, batch_size=batch_size, validation_data=vaild_data, callbacks=[cp_callback, es_callback])

Epoch 00001: saving model to /content/drive/My Drive/RUAK/output/training_checkpoints/12/11/2020-07:54/cp.ckpt


<tensorflow.python.keras.callbacks.History at 0x7f67a57946a0>

Save the model

In [None]:
model.save('/content/drive/My Drive/RUAK/output/models/phil_model.h5') # This path may need to be changed.

# Loading

### Load model

In [None]:
chosen_model = tf.keras.models.load_model('/content/drive/My Drive/RUAK/output/models/phil_model.h5') # This path may need to be changed.

### Load stored weights

In [None]:
checkpoint_path = f"/content/drive/My Drive/RUAK/training_checkpoints/{ID}/cp.ckpt" # This path may need to be changed.
checkpoint_dir = os.path.dirname(checkpoint_path)
latest = tf.train.latest_checkpoint(checkpoint_dir)
chosen_model.load_weights(latest)

# Evaluate

In [None]:
test_loss, test_acc = chosen_model.evaluate(valid_data)

print('Test Loss: {}'.format(test_loss))
print('Test Accuracy: {}'.format(test_acc))

Test Loss: 1.0064799785614014
Test Accuracy: 0.7349218726158142


Get index of author 

In [None]:
sample_sentence_text = "Die Vernunft ist staerker als der Wille."

In [None]:
def pad_to_size(vec, size):
  zeros = [0] * (size - len(vec))
  vec.extend(zeros)
  return vec

def sample_predict(sample_text, pad):
  encoded_sample_text = encoder.encode(sample_text)
  if pad:
    encoded_sample_text = pad_to_size(encoded_sample_text, batch_size)
  encoded_sample_text = tf.cast(encoded_sample_text, tf.float32)
  predictions = model.predict(tf.expand_dims(encoded_sample_text, 0))
  return (predictions)

predictions = sample_predict(sample_sentence_text, pad=True)

print(predictions)

[[9.8962909e-01 1.0056574e-02 1.0725544e-04 2.0699060e-04]]


# TensorBoard analysis

In [None]:
%tensorboard --logdir logs

In [None]:
!rm -rf ./logs/