In [1]:
import numpy as np
import os
import random
import tensorflow as tf
from IPython.core.display import display, HTML

In [2]:
# TextLibrary class: text library for training, encoding, batch generation,
# and formatted source display
class TextLibrary:
    """Character-level text corpus assembled from a list of text files.

    The contents of all readable files are concatenated into ``self.data``.
    Every distinct character gets an integer id (``self.c2i`` maps character
    to id, ``self.i2c`` maps id to character).  The class hands out
    sequential or random training samples and can highlight, in IPython,
    which parts of a text are verbatim quotes from the library files.

    Attributes:
        filenames: the list of paths passed to the constructor.
        data: concatenation of the contents of all files that could be read.
        files: one dict per readable file with keys ``"name"`` (base name
            without extension), ``"data"`` (file content) and ``"index"``
            (1-based running number, used as reference number and colour key).
        c2i / i2c: character <-> integer id lookup tables.
        ptr: read position in ``data`` used by the sequential getters.
    """

    def __init__(self, filenames):
        """Read all files and build the character vocabulary.

        Files that cannot be opened or read (``OSError``) are reported on
        stdout and skipped; they get no entry in ``self.files``.
        """
        self.filenames = filenames
        self.files = []
        chunks = []
        index = 1
        for filename in filenames:
            try:
                # 'with' closes the handle even if read() fails.
                with open(filename) as f:
                    dat = f.read()
            except OSError as e:
                print("  ERROR: Cannot read: ", filename, e)
                continue
            chunks.append(dat)
            self.files.append({
                "name": os.path.splitext(os.path.basename(filename))[0],
                "data": dat,
                "index": index,
            })
            index += 1
        self.data = ''.join(chunks)
        # Sort the character set: plain set iteration order of strings varies
        # between interpreter runs, which would give every run different ids.
        cs = sorted(set(self.data))
        self.c2i = {c: i for i, c in enumerate(cs)}
        self.i2c = {i: c for i, c in enumerate(cs)}
        self.ptr = 0

    def printColoredIPython(self, textlist, pre='', post=''):
        """Render a list of ``(text, index)`` pairs as HTML in the notebook.

        Pairs with index 0 are emitted unchanged.  Any other index wraps the
        text in a coloured ``<span>`` followed by a superscript ``[index]``.
        ``pre`` and ``post`` are raw HTML placed around the whole output.
        """
        bgcolors = ['#d4e6f1', '#d8daef', '#ebdef0', '#eadbd8', '#e2d7d5', '#edebd0',
                    '#ecf3cf', '#d4efdf', '#d0ece7', '#d6eaf8', '#d4e6f1', '#d6dbdf',
                    '#f6ddcc', '#fae5d3', '#fdebd0', '#e5e8e8', '#eaeded', '#A9CCE3']
        out = ''
        for txt, ind in textlist:
            if ind == 0:
                out += txt
            else:
                # Cycle through the whole palette (the modulus used to be a
                # hard-coded 16, which never reached the last two colours).
                out += "<span style=\"background-color:" + bgcolors[ind % len(bgcolors)] + ";\">" + txt + \
                       "</span>" + "<sup>[" + str(ind) + "]</sup>"
        display(HTML(pre + out + post))

    def sourceHighlight(self, txt, minQuoteSize=10):
        """Display ``txt`` with verbatim quotes from the library highlighted.

        The text is scanned left to right.  At each position the longest
        prefix of the remaining text that occurs in any library file is
        looked up; if it is at least ``minQuoteSize`` characters long it is
        marked with that file's index, otherwise one character is passed
        through unmarked.  After the text, a right-aligned "Sources:" line
        lists the names of all files that were quoted.
        """
        tx = txt
        out = []
        qts = []  # indices of files already listed as a source
        txsrc = [("Sources: ", 0)]
        noquote = ''
        while len(tx) > 0:  # search all library files for quote 'txt'
            mxQ = 0
            mxI = 0
            mxN = ''
            found = False
            for f in self.files:  # find longest quote in all texts
                p = minQuoteSize
                if p <= len(tx) and tx[:p] in f["data"]:
                    p = minQuoteSize + 1
                    while p <= len(tx) and tx[:p] in f["data"]:
                        p += 1
                    # p is now one past the longest matching prefix length
                    if p - 1 > mxQ:
                        mxQ = p - 1
                        mxI = f["index"]
                        mxN = f["name"]
                        found = True
            if found:  # save longest quote for colorizing
                if len(noquote) > 0:
                    out.append((noquote, 0))
                    noquote = ''
                out.append((tx[:mxQ], mxI))
                tx = tx[mxQ:]
                if mxI not in qts:  # create a new reference, if first occurrence
                    if qts:  # separator between source names
                        txsrc.append((", ", 0))
                    qts.append(mxI)
                    txsrc.append((mxN, mxI))
            else:
                noquote += tx[0]
                tx = tx[1:]
        if len(noquote) > 0:
            out.append((noquote, 0))
        self.printColoredIPython(out)
        if len(qts) > 0:  # print references, if there is at least one source
            self.printColoredIPython(txsrc, pre="<small><p style=\"text-align:right;\">",
                                     post="</p></small>")

    def getSlice(self, length):
        """Return the next ``length`` characters and a reset flag.

        The read pointer wraps to 0 when the slice would reach the end of
        the data.  The flag is True whenever the slice starts at position 0.
        """
        if self.ptr + length >= len(self.data):
            self.ptr = 0
        rst = self.ptr == 0
        sl = self.data[self.ptr:self.ptr + length]
        self.ptr += length
        return sl, rst

    def decode(self, ar):
        """Turn a sequence of character ids back into a string."""
        return ''.join([self.i2c[ic] for ic in ar])

    def getRandomSlice(self, length):
        """Return ``length`` consecutive characters from a random position.

        Raises:
            ValueError: if ``length`` exceeds the size of the library data.
        """
        if length > len(self.data):
            raise ValueError(
                "requested slice of {} characters, but the library only holds {}".format(
                    length, len(self.data)))
        # '+ 1' makes the last possible start position selectable and allows
        # a slice that is exactly as long as the data.
        p = random.randrange(0, len(self.data) - length + 1)
        return self.data[p:p + length]

    def getSliceArray(self, length):
        """Return the next sequential slice as a numpy array of characters."""
        return np.array(list(self.getSlice(length)[0]))

    def getSample(self, length):
        """Return ``(X, y, rst)`` for the next sequential slice.

        ``X`` holds the ids of ``length`` characters, ``y`` the ids of the
        same window shifted by one character, ``rst`` the flag of getSlice.
        """
        s, rst = self.getSlice(length + 1)
        X = [self.c2i[c] for c in s[:-1]]
        y = [self.c2i[c] for c in s[1:]]
        return (X, y, rst)

    def getRandomSample(self, length):
        """Return ``(X, y)`` like getSample, taken from a random position."""
        s = self.getRandomSlice(length + 1)
        X = [self.c2i[c] for c in s[:-1]]
        y = [self.c2i[c] for c in s[1:]]
        return (X, y)

    def getSampleBatch(self, batch_size, length):
        """Return ``batch_size`` sequential samples as ``(Xs, ys, rst)``.

        ``rst`` is True if any sample of the batch started at position 0,
        so a wrap-around in the middle of a batch is not lost.  An empty
        batch yields ``([], [], False)``.
        """
        smpX = []
        smpy = []
        rst = False
        for _ in range(batch_size):
            Xi, yi, r = self.getSample(length)
            smpX.append(Xi)
            smpy.append(yi)
            rst = rst or r
        return smpX, smpy, rst

    def getRandomSampleBatch(self, batch_size, length):
        """Return ``batch_size`` random samples as ``(Xs, ys)``."""
        smpX = []
        smpy = []
        for _ in range(batch_size):
            Xi, yi = self.getRandomSample(length)
            smpX.append(Xi)
            smpy.append(yi)
        return smpX, smpy

In [3]:
# The tensorflow model for text generation
class TensorPoetModel:
    """Stacked-LSTM next-character model built with the TensorFlow 1.x graph API.

    Constructing an instance resets the default graph and builds the complete
    training and sampling graph in it.

    Args:
        params: dict with the keys
            "vocab_size"    - number of distinct characters (one-hot depth),
            "neurons"       - number of units of every LSTM layer,
            "layers"        - number of stacked LSTM layers,
            "learning_rate" - learning rate passed to the Adam optimizer,
            "steps"         - number of characters per input sequence.

    Graph handles exposed as attributes:
        X, y: int32 placeholders of shape [None, steps] for input ids and
            target ids.
        batch_size: int32 placeholder used to size the zero state.
        temperature: float32 placeholder dividing the logits for sampling.
        init_state_0: zero state of the stacked cell.
        init_state, final_state: both refer to the state returned by
            dynamic_rnn (see the note in __init__).
        output_softmax_temp: softmax over temperature-scaled logits.
        cross_entropy, accuracy, prediction: training metrics / argmax ids.
        training_op: Adam minimisation step on cross_entropy.
        summary_merged: merged Tensorboard summaries.
        init: global variables initializer.
    """

    def __init__(self, params):
        self.vocab_size = params["vocab_size"]
        self.neurons = params["neurons"]
        self.layers = params["layers"]
        self.learning_rate = params["learning_rate"]
        self.steps = params["steps"]

        tf.reset_default_graph()

        # Training:
        self.X = tf.placeholder(tf.int32, shape=[None, self.steps])
        self.y = tf.placeholder(tf.int32, shape=[None, self.steps])

        onehot_X = tf.one_hot(self.X, self.vocab_size)
        onehot_y = tf.one_hot(self.y, self.vocab_size)

        # One cell object per layer.  Reusing a single cell instance for all
        # layers ('[cell] * layers') is rejected by TensorFlow 1.1 and makes
        # later releases try to reuse the same weights in every layer.
        cells = [tf.contrib.rnn.BasicLSTMCell(self.neurons) for _ in range(self.layers)]
        stacked_cell = tf.contrib.rnn.MultiRNNCell(cells)

        self.batch_size = tf.placeholder(tf.int32)
        self.init_state_0 = stacked_cell.zero_state(self.batch_size, tf.float32)

        with tf.variable_scope('rnn'):
            rnn_outputs, states = tf.nn.dynamic_rnn(stacked_cell, onehot_X,
                                                    initial_state=self.init_state_0,
                                                    dtype=tf.float32)

        # NOTE(review): init_state has always ended up bound to the state
        # *returned* by dynamic_rnn, i.e. it is the same object as
        # final_state; the RNN itself always starts from init_state_0.
        # Feeding init_state therefore overrides the fetched final state
        # rather than the initial state.  Kept unchanged so existing callers
        # behave as before - confirm whether carrying state was intended.
        self.init_state = states
        self.final_state = states

        # Flatten [batch, steps, neurons] to [batch*steps, neurons] for the
        # shared output projection.
        stacked_rnn_outputs = tf.reshape(rnn_outputs, [-1, self.neurons])

        softmax_w = tf.get_variable("softmax_w", [self.neurons, self.vocab_size],
                                    initializer=tf.random_normal_initializer(), dtype=tf.float32)
        softmax_b = tf.get_variable("softmax_b", [self.vocab_size], dtype=tf.float32)

        logits_raw = tf.matmul(stacked_rnn_outputs, softmax_w) + softmax_b
        logits = tf.reshape(logits_raw, [-1, self.steps, self.vocab_size])

        output_softmax = tf.nn.softmax(logits)

        # Sampling distribution: logits divided by the temperature.
        self.temperature = tf.placeholder(tf.float32)
        self.output_softmax_temp = tf.nn.softmax(tf.div(logits, self.temperature))

        softmax_entropy = tf.nn.softmax_cross_entropy_with_logits(labels=onehot_y, logits=logits)

        self.cross_entropy = tf.reduce_mean(softmax_entropy)
        optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate)

        # Gradient clipping is deliberately not applied (original author's
        # note: not necessary, even for really deep networks).
        self.training_op = optimizer.minimize(self.cross_entropy)

        self.prediction = tf.cast(tf.argmax(output_softmax, -1), tf.int32)
        correct_prediction = tf.equal(self.y, self.prediction)
        self.accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
        error = 1.0 - self.accuracy

        # Tensorboard
        tf.summary.scalar("cross-entropy", self.cross_entropy)
        tf.summary.scalar("error", error)
        self.summary_merged = tf.summary.merge_all()

        # Init
        self.init = tf.global_variables_initializer()


In [None]:
# Build the corpus. List several files to train concurrently on multiple
# sources; every readable file also becomes a reference for source highlighting.
textlib = TextLibrary(filenames=[
    # 'data/tiny-shakespeare.txt',
    'data/emma-jane-austen.txt',
    'data/voyage-out-virginia-woolf.txt',
    'data/pride-prejudice-jane-austen.txt',
    'data/wuthering-heights-emily-bronte.txt',
])

# Model hyper-parameters.
params = {}
params["vocab_size"] = len(textlib.i2c)  # one id per distinct corpus character
params["neurons"] = 256                  # units per LSTM layer
params["layers"] = 4                     # number of stacked LSTM layers
params["learning_rate"] = 1.e-3          # Adam learning rate
params["steps"] = 96                     # characters per training sequence

model = TensorPoetModel(params)

In [None]:
# Training:
max_iter = 1000000
batch_size = 64
generated_text_size = 500
# Iterations per epoch: corpus characters divided by characters per batch.
epl = len(textlib.data) / (batch_size * model.steps)

# Tensorboard log directory; the hint is printed on every run, not only when
# the directory has just been created.
tflogdir = os.path.realpath('tensorlog')
os.makedirs(tflogdir, exist_ok=True)
print("Tensorboard: 'tensorboard --logdir {}'".format(tflogdir))

with tf.Session() as sess:
    model.init.run()

    # Passing the graph to the constructor already records it in the log.
    train_writer = tf.summary.FileWriter(tflogdir, sess.graph)
    try:
        for iteration in range(max_iter):
            # Train with batches from the text library.
            # model.init_state is not fed: it is the same tensor tuple as
            # model.final_state, so feeding it never changed the state the
            # RNN starts from (always the zero state) and only cost an extra
            # session call per iteration.
            X_batch, y_batch = textlib.getRandomSampleBatch(batch_size, model.steps)
            sess.run(model.training_op,
                     feed_dict={model.X: X_batch, model.y: y_batch,
                                model.batch_size: batch_size})

            # Output training statistics every 100 iterations:
            if iteration % 100 == 0:
                ce, accuracy, prediction, summary = sess.run([model.cross_entropy,
                                                              model.accuracy, model.prediction,
                                                              model.summary_merged],
                                                             feed_dict={model.X: X_batch, model.y: y_batch,
                                                                        model.batch_size: batch_size})
                train_writer.add_summary(summary, iteration)
                ep = iteration / epl
                print("Epoch: {0:.2f}, iter: {1:d}, cross-entropy: {2:.3f}, accuracy: {3:.5f}".format(ep, iteration, ce, accuracy))
                # Show target and prediction of the first sequence of the batch.
                for ind in range(1):
                    ys = textlib.decode(y_batch[ind]).replace('\n', ' | ')
                    yps = textlib.decode(prediction[ind]).replace('\n', ' | ')
                    print("   y:", ys)
                    print("  yp:", yps)

            # Generate sample texts for different temperature every 500 iterations:
            if (iteration + 1) % 500 == 0:
                for t in range(2, 11, 2):
                    temp = float(t) / 10.0
                    # Sliding window of the last model.steps characters,
                    # seeded with blanks; the newest character is appended
                    # and the oldest dropped after every prediction.
                    xs = ' ' * model.steps
                    xso = ''
                    for i in range(generated_text_size):
                        X_new = [[textlib.c2i[sj] for sj in xs]]
                        y_pred = sess.run(model.output_softmax_temp,
                                          feed_dict={model.X: X_new,
                                                     model.batch_size: 1,
                                                     model.temperature: temp})
                        # Distribution for the character after the window.
                        # Re-normalise in float64: float32 softmax output
                        # can miss np.random.choice's sum-to-one tolerance.
                        probs = y_pred[0, -1].ravel().astype(np.float64)
                        probs /= probs.sum()
                        ind = np.random.choice(model.vocab_size, p=probs)
                        nc = textlib.i2c[ind]
                        xso += nc
                        xs = xs[1:] + nc

                    print("----------------- temperature =", temp, "----------------------")
                    textlib.sourceHighlight(xso, 10)
                print("---------------------------------------")
    finally:
        # Flush and release the event file even if training is interrupted.
        train_writer.close()

Tensorboard: 'tensorboard --logdir /home/dsc/git/AI/tensor-poet/tensorlog'
Epoch: 0.00, iter: 0, cross-entropy: 4.494, accuracy: 0.16618
   y: one connected with him. | Your alliance will be a disgrace; your name will never even be mentioned
  yp: w                                                                                               
Epoch: 0.21, iter: 100, cross-entropy: 3.159, accuracy: 0.16243
   y: altogether, and | the rest of the journey was made almost in darkness, the mountain being | a great 
  yp:                                                                                                 
Epoch: 0.41, iter: 200, cross-entropy: 3.026, accuracy: 0.19613
   y: --such a rest to the eyes--and the bazaars are so glad of things." | Her voice dropped into the sm
  yp:        eeeeee eeeeee ee         ee ee        e eeeeee   eeeee          aae    ee  ee ee       ee
Epoch: 0.62, iter: 300, cross-entropy: 2.738, accuracy: 0.24349
   y: though | there were enough of those wea

----------------- temperature = 0.4 ----------------------


----------------- temperature = 0.6 ----------------------


----------------- temperature = 0.8 ----------------------


----------------- temperature = 1.0 ----------------------


---------------------------------------
Epoch: 1.03, iter: 500, cross-entropy: 2.140, accuracy: 0.39469
   y: nd you _are_ Heathcliff! | But altered!  Nay, there's no comprehending it.  Have you been for a | so
  yp:    tou hont  taarheeene   | et tnl r    Iuon  ahe e s sewtaueeestdd ng tn  IIe e tou hean tor tnpo
Epoch: 1.23, iter: 600, cross-entropy: 1.966, accuracy: 0.43783
   y:  | voice, but with much sorrowful despondency.  'I shall not stay.  I am | neither come to wrangle n
  yp:  teunh  aut thsh tesh to eete r teaeerd d e  SH  woedl not ttee   ' wm notnher tone to baetdey t
Epoch: 1.44, iter: 700, cross-entropy: 1.757, accuracy: 0.48779
   y: re was not a day without its | engagement. Mrs. Bennet had so carefully provided for the entertain
  yp:    aas aot tnsey thth ut an eandetenent  Ir.. Wennet wed ah hone el y arepesed tor the sncrr enn
Epoch: 1.64, iter: 800, cross-entropy: 1.716, accuracy: 0.49935
   y: interesting as | the discussion of his concerns; and every report, t

----------------- temperature = 0.4 ----------------------


----------------- temperature = 0.6 ----------------------


----------------- temperature = 0.8 ----------------------


----------------- temperature = 1.0 ----------------------


---------------------------------------
Epoch: 2.06, iter: 1000, cross-entropy: 1.589, accuracy: 0.53548
   y: , not after his father. John, the second, | is named after his father. Some people are surprized, 
  yp:   aot tlter tes cocher   ohn  ahe waault  an aote  tnter tes cocher   h e taople tne thcerized  
Epoch: 2.26, iter: 1100, cross-entropy: 1.491, accuracy: 0.55599
   y: . Thornbury exclaimed. She told them that for some | days Hughling Elliot had been ill, and the on
  yp:   Theueoury oxpeaimed   he whod hhe |  woet sor tome | tiy  aarhbyng slieot,wav been snl  and she st
Epoch: 2.47, iter: 1200, cross-entropy: 1.402, accuracy: 0.56836
   y: inting to another of the miniatures, | "is my master--and very like him. It was drawn at the same 
  yp: nnd ng to h yther tf the sosdnteres  aIt nu danter -and tery site tim     was aeewi tn the seme 
Epoch: 2.67, iter: 1300, cross-entropy: 1.368, accuracy: 0.58252
   y: --He gave his consent with very | little persuasion.” |  | “Ah!” t

----------------- temperature = 0.4 ----------------------


----------------- temperature = 0.6 ----------------------
