<a href="https://colab.research.google.com/github/domschl/tensor-poet/blob/master/tensor_poet.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Tensor-Poet

Please review [ml-indie-tools](https://github.com/domschl/ml-indie-tools), a collection of machine learning tools that provides support for more environment-independent code. It will access your Google Drive when used with Google Colab.

In [None]:
!pip install -U ml-indie-tools

In [None]:
import logging
import os
import sys
import copy
import json
import time
import datetime
import random

import numpy as np
import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras import layers, regularizers

import tensorflow_datasets as tfds

In [None]:
from ml_indie_tools.env_tools import MLEnv
from ml_indie_tools.Gutenberg_Dataset import Gutenberg_Dataset
from ml_indie_tools.Text_Dataset import Text_Dataset

from ml_indie_tools.keras_custom_layers import MultiHeadSelfAttention, PositionalEncoding

## Preliminary

A tensorflow deep LSTM model for text generation

This code can use a CPU, GPU, or TPU when running on Google Colab.

Select the corresponding runtime (menu: **`Runtime / Change runtime type`**)

## 0. Environment

In [None]:
# Create the ml-indie-tools environment helper for TensorFlow, asking for the
# 'fastest' accelerator, and print what it detected.
# NOTE(review): `old_disable_eager=True` presumably switches TF to non-eager
# (graph) mode -- confirm against the MLEnv documentation.
ml_env = MLEnv(platform='tf', accelerator='fastest', old_disable_eager=True)  # TODO: move to tf.function()
ml_env.describe()

In [None]:
# `use_eager` is switched off only when a TPU was detected.
use_eager = ml_env.is_tpu is not True
if not use_eager:
    # Keep a handle on the distribution strategy supplied by the environment.
    tpu_strategy = ml_env.tpu_strategy
    tpu_is_init = True

In [None]:
project_name = 'women_writers'
# To explore self-attention, also check out the newer https://github.com/domschl/transformer-poet
use_selfattention = False
# The model name selects the architecture-specific storage location below.
model_name = 'mhsa_v1_tf' if use_selfattention is True else 'lstm_v1_tf'

# NOTICE: This will request access to Google Drive, if running on Google Colab.
# Google Drive is used to store snapshots and training data.
# See project ml-indie-tools: https://github.com/domschl/ml-indie-tools
(root_path, project_path, model_path, data_path, log_path) = ml_env.init_paths(
    project_name=project_name,
    model_name=model_name,
)

##  1. Text library

The `Text_Dataset` and `Gutenberg_Dataset` classes are libraries for training, 
encoding, batch generation, and formatted source display. They read some 
books from Project Gutenberg and support the creation of training batches. 
The output functions support highlighting, which makes it possible to compare 
generated texts with the actual sources and helps to identify identical 
(memorized) parts.

In [None]:
# Theme switch for the HTML text comparison: different sources are identified by
# background colors, and those colors depend on the theme type.
use_dark_mode=True # Set to False for a white background.

In [None]:
# Show INFO-level log messages in the notebook output.
logging.basicConfig(level=logging.INFO)
# The Gutenberg helper is given a cache directory below the project's data path.
cache_dir = os.path.join(data_path, 'gutenberg_cache')
gd = Gutenberg_Dataset(root_url='https://www.gutenberg.org/dirs/', cache_dir=cache_dir)

In [None]:
# sample searches
search_spec= {"author": ["brontë","Jane Austen", "Virginia Woolf"], "language": ["english"]}

book_list=gd.search(search_spec)
book_cnt = len(book_list)
print(f"{book_cnt} matching books found with search {search_spec}.")
if book_cnt<40:
    # Note: please verify that book_cnt is 'reasonable'. If you plan to use a large number of texts, 
    # consider [mirroring Gutenberg](https://github.com/domschl/ml-indie-tools#working-with-a-local-mirror-of-project-gutenberg)
    book_list = gd.insert_book_texts(book_list, download_count_limit=book_cnt)  
else:
    logging.error("Please verify your book_list, a large number of books is scheduled for download. ABORTED.")

In [None]:
# Print an indexed overview; the index is what the selection cell refers to.
for i, book in enumerate(book_list):
    print(f"{i}: {book['title']} - {book['author']}, {book['ebook_id']}")

In [None]:
# Indices into `book_list` (as printed by the listing cell) of the books to train on.
select = (17, 12, 21)  # 11,20,21)

# The indices are hard-coded against a live search result. If the search returns
# fewer books than expected, out-of-range indices would be dropped silently and
# training would run on a different (possibly empty) corpus -- so say it out loud.
missing = [i for i in select if not 0 <= i < len(book_list)]
if missing:
    logging.warning(
        f"Selected book indices {missing} do not exist in book_list "
        f"({len(book_list)} entries); they are ignored."
    )

# Books are kept in `book_list` order, not in the order given in `select`.
sub_book_list = [book for i, book in enumerate(book_list) if i in select]

print("Using books:")
for i, book in enumerate(sub_book_list):
    print(f"{i+1}: {book['title']} - {book['author']}")

td = Text_Dataset(sub_book_list)
td.init_tokenizer(tokenizer='char')

In [None]:
# Number of characters per sample; also used as the model's input length.
SEQUENCE_LEN = 80

# Configure item access (`td[i]`, `len(td)`) for sample type 'chargen_single_encoded'
# with samples of SEQUENCE_LEN and a content stepping of 1.
td.init_getitem(sample_type='chargen_single_encoded', sample_length=SEQUENCE_LEN, content_stepping=1)
num_records = len(td)

print(f"{num_records} records")

def get_sample_batch(td, batch_size, length, random_index=True):
    """Draw `batch_size` samples from `td` and split each into input and target.

    For every sample, the last element becomes the target and is replaced in the
    input by the index of the '␚' marker glyph.

    Args:
        td: dataset supporting `len(td)`, `td[i]`, `td.c2i` and
            `td.getitem_content_stepping`.
        batch_size: number of samples to draw.
        length: sample length; only used to shape the result for an empty batch
            (the actual sample length is determined by `td`).
        random_index: if True, sample positions are drawn uniformly at random from
            `td`; otherwise sample `i` is taken at `i * td.getitem_content_stepping`.

    Returns:
        Tuple `(X, y)`: `X` is a float32 array of shape (batch_size, sample_len),
        `y` an int32 array of shape (batch_size, 1). The arrays are always 2-D,
        including for `batch_size == 1`.
    """
    if batch_size <= 0:
        return (np.empty((0, length), dtype=np.float32),
                np.empty((0, 1), dtype=np.int32))

    end_marker = td.c2i['␚']  # use 'SUB'-stitut glyph to mark last char of input
    record_count = len(td)
    inputs = []
    targets = []
    for i in range(batch_size):
        if random_index is True:
            ind = random.randint(0, record_count - 1)
        else:
            ind = i * td.getitem_content_stepping
        # Work on a copy: the dataset may hand out a reference to its own storage.
        sample = list(td[ind])
        targets.append([sample[-1]])
        sample[-1] = end_marker
        inputs.append(sample)
    # Convert once at the end instead of growing the arrays row by row.
    return np.array(inputs, dtype=np.float32), np.array(targets, dtype=np.int32)

def get_random_onehot_sample_batch(td, batch_size, length):
    """Draw a random batch and return it with one-hot encoded inputs.

    Args:
        td: text dataset; `len(td.i2c)` is used as the one-hot depth.
        batch_size: number of samples in the batch.
        length: sample length, forwarded to `get_sample_batch`.

    Returns:
        Tuple `(xoh, yk)`: the one-hot encoded inputs and the targets as a
        backend constant.
    """
    # The random sampler of this notebook is `get_sample_batch(..., random_index=True)`;
    # the previously referenced `get_random_sample_batch` does not exist.
    X, y = get_sample_batch(td, batch_size, length, random_index=True)
    # `get_sample_batch` returns float32 indices, but one_hot needs integer indices.
    xoh = tf.keras.backend.one_hot(X.astype(np.int32), len(td.i2c))
    yk = tf.keras.backend.constant(y)
    return xoh, yk

In [None]:
# Preview a few random samples: decoded input and the expected next character.
# Uses SEQUENCE_LEN rather than a second hard-coded copy of the sample length.
test_x, test_y = get_sample_batch(td, 5, SEQUENCE_LEN, random_index=True)
for i, (sample_x, sample_y) in enumerate(zip(test_x, test_y)):
    print(f"[{i}]: X=>{td.decode(sample_x)}<, y=>{td.decode(sample_y)}<")

## 2. Use tf.data for texts

In [None]:
# Hyper-parameters for the selected architecture. Both variants share the keys
# 'sequence_len', 'sample_every_n_epochs', 'use_attention', 'batch_size',
# 'vocab_size', 'embedding_dim', 'learning_rate' and 'clipvalue'.
if use_selfattention is False:
    params = { # LSTM
        'sequence_len': SEQUENCE_LEN,
        'sample_every_n_epochs': 5,  # the generator callback samples text every n-th epoch
        'use_attention': False,

        'lstm_layers': 3,
        'lstm_units': 256,

        'batch_size': 256,
        'vocab_size': len(td.i2c),
        'embedding_dim': len(td.i2c),  # inputs are one-hot encoded, so this equals the vocabulary size
        'learning_rate': 0.0002,
        'clipvalue': None,  # None: the optimizer is created without gradient clipping
    }
else:
    params = { # Multi-head self-attention
        'sequence_len': SEQUENCE_LEN,
        'sample_every_n_epochs': 5,
        'use_attention': True,

        'mhsa_layers': 4,
        'heads': 2,
        'units': 256, # len(td.i2c),
        'norm': 'softmax',
        'mh_normalize': True,
        'final_relu': False,
        'l2_regularizer': 1e-9,
        'sa_l2_regularizer': 1e-10,
        'sa_dropout': 0.8,       # no dropout: 0.0

        'batch_size': 128,
        'vocab_size': len(td.i2c),
        'embedding_dim': len(td.i2c),
        'learning_rate': 0.0005,
        'clipvalue': None,
    }

# TPU-specific override: only the LSTM variant gets a different learning rate.
if ml_env.is_tpu:
    if use_selfattention is False:
        params['learning_rate'] = 0.001  # LSTMs on current TPUv2 are *very* fragile.
    # params['clipvalue'] = 1.0


In [None]:
# Number of full batches that fit into the dataset (integer division drops the remainder).
num_batches = num_records // params['batch_size']
print(f"num_batches = {num_batches}")

In [None]:
def make_tf_dataset(num, random_index=False):
    """Materialize `num` sample batches and wrap them in a `tf.data.Dataset`.

    Each batch is produced by `get_sample_batch` using the batch size and
    sequence length from the global `params`. The shapes of the first batch
    are printed once.

    Args:
        num: number of batches to generate.
        random_index: forwarded to `get_sample_batch`.

    Returns:
        A dataset built with `from_tensor_slices` over the stacked batches, i.e.
        every dataset element is one complete `(x, y)` batch.
    """
    batches_x, batches_y = [], []
    for batch_no in range(num):
        batch_x, batch_y = get_sample_batch(
            td, params['batch_size'], params['sequence_len'], random_index=random_index
        )
        if batch_no == 0:
            print(f"[{num} x]: {batch_x.shape} -> {batch_y.shape}")
        batches_x.append(batch_x)
        batches_y.append(batch_y)
    stacked = (np.array(batches_x), np.array(batches_y))
    return tf.data.Dataset.from_tensor_slices(stacked)

In [None]:
# Upper bound for the number of batches that are materialized in memory.
MAX_NUM_BATCHES = 8000

restricted_batches = min(num_batches, MAX_NUM_BATCHES)
textlib_dataset = make_tf_dataset(restricted_batches, random_index=True)

In [None]:
shuffle_buffer = 10000
dataset = textlib_dataset.shuffle(shuffle_buffer)
if ml_env.is_tpu is True:
    # Otherwise TPU may run dry
    dataset = dataset.repeat()
dataset.take(1)

In [None]:
# Ten batches for validation. NOTE(review): they are drawn with random_index=True
# from the same `td` as the training batches, so they are not a held-out split.
validation_dataset = make_tf_dataset(10, random_index=True)

In [None]:
def model_lstm(inputs, params):
    """Build the stacked-LSTM graph on top of `inputs` and return its logits.

    The integer-valued inputs are one-hot encoded to `params['vocab_size']`
    classes, passed through `params['lstm_layers']` LSTM layers of
    `params['lstm_units']` units each (only the last one collapses the sequence),
    flattened, and projected to `params['vocab_size']` outputs without activation.

    Args:
        inputs: Keras input tensor holding token indices.
        params: hyper-parameter dict (see the parameter cell).

    Returns:
        The output tensor of the final dense layer (logits).
    """
    layer_count = params['lstm_layers']
    lstm_stack = []
    for depth in range(layer_count):
        lstm_kwargs = {'return_sequences': depth != layer_count - 1}
        if depth == 0:
            lstm_kwargs['batch_input_shape'] = [
                params['batch_size'], params['sequence_len'], params['embedding_dim']
            ]
        lstm_stack.append(layers.LSTM(params['lstm_units'], **lstm_kwargs))
    to_logits = layers.Dense(params['vocab_size'], activation=None)   # softmax in loss!
    flatten = layers.Flatten()

    hidden = tf.one_hot(tf.cast(inputs, dtype=tf.int32), params['vocab_size'], axis=-1)
    for lstm in lstm_stack:
        hidden = lstm(hidden)
    return to_logits(flatten(hidden))

In [None]:
def model_mhsa(inputs, params):
    """Build the multi-head self-attention graph on top of `inputs` and return its logits.

    The integer-valued inputs are one-hot encoded, a positional encoding is
    applied, then `params['mhsa_layers']` self-attention layers, optional
    dropout, a flatten and a dense projection to `params['vocab_size']` outputs.

    Args:
        inputs: Keras input tensor holding token indices.
        params: hyper-parameter dict (see the parameter cell).

    Returns:
        The output tensor of the final dense layer (logits).
    """
    # No softmax on the output layer: it would prevent the temperature adjustment
    # during generation, and the loss is configured with `from_logits` accordingly.
    to_logits = layers.Dense(
        params['vocab_size'],
        kernel_regularizer=regularizers.l2(params['l2_regularizer']),
    )
    flatten = layers.Flatten()
    dropout = layers.Dropout(params['sa_dropout'])
    positional = PositionalEncoding(amplitude=0.3)
    attention_stack = [
        MultiHeadSelfAttention(
            params['heads'],
            units=params['units'],
            norm=params['norm'],
            mh_normalize=params['mh_normalize'],
        )
        for _ in range(params['mhsa_layers'])
    ]

    hidden = tf.one_hot(tf.cast(inputs, dtype=tf.int32), params['vocab_size'], axis=-1)
    hidden = positional(hidden)
    for attention in attention_stack:
        hidden = attention(hidden)
    if params['sa_dropout'] > 0.0:
        hidden = dropout(hidden)
    return to_logits(flatten(hidden))

In [None]:
def generate(text, model, gen_len=64, temperature=0.9, has_softmax=False, silent=False):
    """Generate `gen_len` characters that continue `text`, sampling from `model`.

    The prompt is cut or left-padded with spaces so that the model input always
    consists of `params['sequence_len'] - 1` characters followed by the '␚'
    end marker, which is the layout produced by `get_sample_batch`. After each
    sampled character the window slides by one.

    Args:
        text: prompt. It may be passed raw or already terminated by '␚'; a
            trailing marker is removed so that the input never carries two markers.
        model: Keras model whose `predict` returns one row of scores per input.
        gen_len: number of characters to generate.
        temperature: divisor applied to the logits before the softmax; must be
            positive. Ignored if `has_softmax` is True.
        has_softmax: True if the model already outputs probabilities.
        silent: if False, the prompt is echoed and the generated text is shown
            via `td.source_highlight`.

    Returns:
        Tuple `(pred, gen_text)`: the probability distribution used for the last
        sampled character as an array of shape (1, vocab) (None if `gen_len` is 0),
        and the generated text, with line breaks inserted for display.

    Raises:
        ValueError: if `has_softmax` is False and `temperature` is not positive.
    """
    if has_softmax is False and not temperature > 0:
        raise ValueError(f"temperature must be positive, got {temperature}")

    end_mark = '␚'
    seq_len = params['sequence_len']

    # Accept prompts with or without the end marker.
    prompt = text[:-1] if text.endswith(end_mark) else text
    if silent is False:
        print(f"[{prompt}]", end='')

    # Right-align the prompt in a window of seq_len characters, then drop the
    # first character to make room for the marker.
    tex = prompt[-seq_len:].rjust(seq_len)[1:] + end_mark

    pred = None
    gen_text = ""
    lf = 0  # characters emitted since the last line break
    for _ in range(gen_len):
        encoded = np.array([td.encode(tex)])
        scores = model.predict(encoded, batch_size=1, verbose=0)
        # float64 keeps the distribution within numpy's sum-to-one tolerance.
        probs = np.asarray(scores[0], dtype=np.float64)
        if has_softmax is False:
            # Temperature-scaled, numerically stable softmax in numpy; this avoids
            # building a Keras layer per character and works in eager and graph mode.
            probs = probs / temperature
            probs = np.exp(probs - np.max(probs))
        probs = probs / np.sum(probs)
        pred = probs[np.newaxis, :]

        ci = np.random.choice(len(probs), p=probs)
        c = td.i2c[ci]
        gen_text += c
        # Soft-wrap for display: break at a space after 80 characters, or hard
        # after 120 characters. The break is not fed back into the model.
        if c == '\n':
            lf = 0
        else:
            lf += 1
            if (lf > 80 and c == ' ') or lf > 120:
                lf = 0
                gen_text += '\n'
        # Slide the window: insert the new character before the end marker.
        tex = (tex[:-1] + c + end_mark)[-seq_len:]
    if silent is False:
        td.source_highlight(gen_text, min_quote_size=8)
    return pred, gen_text

In [None]:
# TPU-only workaround: ask TF (v1 compatibility API) to output all intermediate tensors.
if ml_env.is_tpu:
    # Otherwise it explodes:
    tf.compat.v1.experimental.output_all_intermediates(True)

In [None]:
# Training model: a sequence of token indices in, next-character logits out.
inputs = keras.Input(shape=(params['sequence_len'],))
if params['use_attention'] is not True:
    outputs = model_lstm(inputs, params)
    model = keras.Model(inputs=inputs, outputs=outputs, name="lstm_v1_tf")
else:
    outputs = model_mhsa(inputs, params)
    model = keras.Model(inputs=inputs, outputs=outputs, name="mhsa_v1_tf")

In [None]:
# The models output raw logits, hence from_logits=True.
kscc = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

def loss(labels, logits):
    """Return the sparse categorical cross-entropy of `labels` against `logits`."""
    return kscc(labels, logits)

# def scalar_loss(labels, logits):
#     vl = loss(labels ,logits)
#     l = tf.reduce_mean(vl, axis=-1)
#     return l

In [None]:
# Adam's second positional parameter is `beta_1`, so the clip value must be
# passed by keyword -- otherwise it silently replaces the momentum coefficient.
if params['clipvalue'] is not None:
    opti = tf.keras.optimizers.Adam(learning_rate=params['learning_rate'], clipvalue=params['clipvalue'])
else:
    opti = tf.keras.optimizers.Adam(learning_rate=params['learning_rate'])

# On TPU the model is compiled without metrics.
if ml_env.is_tpu is True:
    model.compile(optimizer=opti, loss=loss, metrics=[])
else:
    model.compile(optimizer=opti, loss=loss, metrics=['accuracy'])

In [None]:
# Print the layer overview of the training model.
model.summary()

### Loss function, optimizer, tensorboard output

In [None]:
class GeneratorCallback(keras.callbacks.Callback):
    """Keras callback that prints generated sample text during training.

    Every `params['sample_every_n_epochs']`-th epoch (starting with epoch 0) a
    random sample is taken from `td` as the prompt and `generate` is run for
    each configured temperature. On TPU a single temperature with a shorter
    output is used and generation is placed on the CPU device.
    """

    def on_epoch_end(self, epoch, logs=None):
        if epoch % params['sample_every_n_epochs'] != 0:
            return
        idx = random.randint(0, len(td) - 1)
        text = td.decode(td[idx])
        print()
        on_tpu = ml_env.is_tpu is True
        if on_tpu:
            temp_list, gen_len = [0.7], 64
        else:
            temp_list, gen_len = [0.5, 0.7, 0.9], 192
        for temp in temp_list:
            print(f"---------------- T={temp} ---------------")
            if on_tpu:
                with tf.device('/cpu:0'):
                    generate(text, model=model, gen_len=gen_len, temperature=temp)
            else:
                generate(text, model=model, gen_len=gen_len, temperature=temp)
        print("--------------------------------------")

generator_callback = GeneratorCallback()

In [None]:
# Directory where the checkpoints will be saved
# Directory where the checkpoints will be saved
checkpoint_dir = os.path.join(model_path, 'training_checkpoints')
# Name of the checkpoint files ({epoch} is filled in by Keras)
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

# Checkpoints contain the weights only, not the full model.
checkpoint_callback=tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)

# Each run logs into its own time-stamped sub-directory of `log_path`.
logdir = os.path.join(log_path, datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
# Scalars are written once per epoch; histograms and profiling are switched off.
tensorboard_callback = tf.keras.callbacks.TensorBoard(logdir, update_freq='epoch', histogram_freq=0, profile_batch=0) # update_freq='epoch', 

In [None]:
%tensorboard --logdir logs

## The actual training

In [None]:
# Number of epochs passed to model.fit().
EPOCHS=1000

In [None]:
# Display the number of batches in the training dataset.
restricted_batches

In [None]:
training_callbacks = [checkpoint_callback, tensorboard_callback, generator_callback]
if ml_env.is_tpu is True:
    # The TPU dataset repeats endlessly, so the epoch length has to be given explicitly.
    steps_per_epoch = max(1, restricted_batches // params['batch_size'])
    history = model.fit(
        dataset,
        epochs=EPOCHS,
        steps_per_epoch=steps_per_epoch,
        callbacks=training_callbacks,
    )
else:
    history = model.fit(
        dataset,
        validation_data=validation_dataset,
        epochs=EPOCHS,
        callbacks=training_callbacks,
    )


## Generate text

In [None]:
# Same hyper-parameters as for training, but with a batch size of one for generation.
model_params_gen = {**params, 'batch_size': 1}
model_params_gen

In [None]:
# Rebuild the network graph with the generation parameters.
inputs_gen = keras.Input(shape=(SEQUENCE_LEN,))
build_graph = model_mhsa if params['use_attention'] is True else model_lstm
outputs_gen = build_graph(inputs_gen, model_params_gen)
print(f"{inputs_gen.shape} -> {outputs_gen.shape}")

In [None]:
# Wrap the generation graph in a model named after the architecture.
gen_model_name = "mhsa_v1_tf_gen" if params['use_attention'] is True else "lstm_v1_tf_gen"
model_gen = keras.Model(inputs=inputs_gen, outputs=outputs_gen, name=gen_model_name)

In [None]:
# Load the most recent training checkpoint into the generation model.
checkpoint_dir = os.path.join(model_path, 'training_checkpoints')
last_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
print(f"Last checkpoint: {last_checkpoint}")
# latest_checkpoint() returns None when the directory holds no checkpoint;
# fail with a clear message instead of an obscure error inside load_weights().
if last_checkpoint is None:
    raise FileNotFoundError(f"No checkpoint found in {checkpoint_dir}. Train the model first.")
model_gen.load_weights(last_checkpoint)

In [None]:
# Build call with a batch size of 1 and an unspecified second dimension.
model_gen.build(tf.TensorShape([1, None]))

In [None]:
# Print the layer overview of the generation model.
model_gen.summary()

In [None]:
text = "Why, my dear, you must know, Mrs. Long says that Netherfield is taken by a young man of large fortune from the north of England; that he came down on Monday in a chaise and four to see the place, and was so much delighted with it that he agreed with Mr. Morris immediately; that he is to take possession before Michaelmas, and some of his servants are to be in the house by the end of next week."
# Cut the prompt to the model's sequence length; a shorter prompt is left-padded with spaces.
text = text[:params['sequence_len']].rjust(params['sequence_len'])

In [None]:
# generate() appends the '␚' end marker to the prompt itself. Appending it here
# as well would feed the model a doubled marker and drop the prompt's last
# character, so the prompt is passed unchanged.
mytext = text
_, gen_text = generate(mytext, model=model_gen, silent=True)
print(f"[{mytext}]",end="")
print(gen_text)

In [None]:
# Display the raw generated string (line breaks shown as escapes).
gen_text

## 6. A dialog with the trained model

In [None]:
# Do a dialog with the recursive neural net trained above:
# def genDialogAnswer(prompt, g_state=None, endPrompt='.', maxEndPrompts=2,
# maxAnswerSize=512, temperature=1.0):


def doDialog():
    """Run an interactive prompt/answer loop against `model_gen`.

    Each line read from standard input is used as the prompt for `generate`
    (128 characters, current temperature); the answer is shown with
    `td.source_highlight`. Commands:

    * ``bye`` ends the dialog (as does end-of-input).
    * ``reset`` is acknowledged; every answer is generated from the current
      prompt only, so there is no accumulated context to clear.
    * ``temperature=<float>`` changes the sampling temperature; values outside
      the open interval (0.05, 1.4) and unparsable values are rejected with a
      message instead of aborting the dialog.

    Returns:
        None
    """
    # 0.1 (frozen character) - 1.3 (creative/chaotic character)
    temperature = 0.6
    temperature_cmd = "temperature="

    print("Please enter some dialog.")
    print("The net will answer according to your input.")
    print("'bye' for end,")
    print("'reset' to reset the conversation context,")
    print("'temperature=<float>' [0.1(frozen)-1.0(creative)]")
    print("    to change character of the dialog.")
    print("    Current temperature={}.".format(temperature))
    print()

    while True:
        print("> ", end="")
        try:
            prompt = input()
        except EOFError:
            # No more input available: end the dialog like 'bye'.
            print("Good bye!")
            break
        if prompt == 'bye':
            print("Good bye!")
            break
        if prompt == 'reset':
            print("(conversation context marked for reset)")
            continue
        if prompt.startswith(temperature_cmd):
            try:
                t = float(prompt[len(temperature_cmd):])
            except ValueError:
                # Not a number at all: handled like an out-of-range value below.
                t = None
            if t is not None and 0.05 < t < 1.4:
                temperature = t
                print("(generator temperature now {})".format(t))
                print()
                continue
            print("Invalid temperature-value ignored! [0.1-1.0]")
            continue
        _, answer = generate(prompt, model=model_gen, temperature=temperature, gen_len=128, silent=True)
        td.source_highlight(answer, min_quote_size=13)
    return

In [None]:
# Talk to the net!
# Start the interactive dialog; it reads from standard input until 'bye' is entered.
doDialog()

## References:
* <https://colab.research.google.com/github/tensorflow/docs/blob/master/site/en/r2/tutorials/text/text_generation.ipynb>
* <https://colab.research.google.com/github/tensorflow/tpu/blob/master/tools/colab/shakespeare_with_tpu_and_keras.ipynb>