In [1]:
try:
    import keras
except:
    !pip install keras

Using TensorFlow backend.


In [2]:
from pathlib import Path

import tensorflow as tf
# Create an explicit TensorFlow session and register it with the Keras backend.
# The same session object is passed to Generator in the test section below.
# NOTE(review): tf.Session looks like the TensorFlow 1.x API — confirm the
# pinned TensorFlow version before upgrading.
tf_session = tf.Session()
from keras import backend as K
K.set_session(tf_session)

from keras.callbacks import ModelCheckpoint,  CSVLogger
from keras.layers import Add, Dense, Input, LSTM
from keras.models import Model
from keras.preprocessing.text import Tokenizer
from keras.utils import np_utils

import numpy as np
import pandas as pd
from sklearn.externals import joblib

# Local library with model definitions for training and generating
from models import Generator, create_training_model

# Load Input

In [3]:
# Settings

# Fraction (0-1) of the samples to use for training; lower it if you are
# running out of memory. It is passed to DataFrame.sample(frac=...).
sample_size = .05

# The latent dimension of the LSTM
latent_dim = 4

# Input locations, relative to the notebook's working directory
root_path = Path('../../..')
input_path = root_path / 'input'
poem_path = input_path / 'poems'
haiku_path = poem_path / 'haikus.csv'

# Directory that receives the metadata pickle, checkpoints and training log.
# exist_ok=True keeps this cell re-runnable: a plain mkdir() raises
# FileExistsError as soon as the directory exists from a previous run.
output_dir = Path('all_data_test')
output_dir.mkdir(parents=True, exist_ok=True)

In [4]:
# Load the haiku corpus and keep a random subset of its rows
# (sample_size is the fraction of rows kept).
# The fixed random_state makes the subset reproducible, so the derived
# tokenizer, tensors and training runs are comparable between executions.
df = pd.read_csv(str(haiku_path))
df = df.sample(frac=sample_size, random_state=42)
df

Unnamed: 0,0,1,2,source,0_syllables,1_syllables,2_syllables
28137,"He damned the sun, and","he damned the stars, And he blasted",the winds in the sky.,gutenberg,5,8,5
101751,Bitches be out here,shameless Got me wondering,where their parents are,twaiku,5,7,5
56112,That Fat Bastard wine,is really not very nice,Sour blackberries,twaiku,5,7,45
97737,I'd swallow the moon,and the stars just to follow,the beat of your heart,twaiku,5,7,5
12730,in the market-place of bruges stands the belfr...,thrice consumed and thrice rebuilded still it ...,town,img2poems,1314,15,1
96802,It's important to,realize that some things are,out of your control,twaiku,5,7,5
126889,The less people know,about me and my moves the,better my life is,twaiku,5,7,5
717,Gandhi's birthday,constantly relighting candles,during the peace March,tempslibres,4,8,5
137631,ashton irwin the,love you have for the fans is,so pure i love you,twaiku,5,7,5
78872,isn't it crazy,that we now live in a world,where come on exists,twaiku,5,7,5


# Format Input for Training

In [5]:
# Duplicate haikus whose syllable counts are ambiguous.
# A count written as several comma-separated values means multiple
# pronunciations are acceptable; such a row is repeated once per value so
# that every row ends up with a single count per line.

for i in range(3):
    target = '%s_syllables' % i
    untouched = ['%s_syllables' % j for j in range(3) if j != i]

    # One entry per alternative count, still keyed by the original row index
    alternatives = (
        df[target]
        .str.split(',', expand=True)
        .stack(-1)
        .reset_index(level=1, drop=True)
        .rename(target)
    )

    # Joining on the index repeats a row for each of its alternatives;
    # drop_duplicates removes the identical rows the join can produce.
    df = df[['0', '1', '2'] + untouched].join(alternatives).drop_duplicates()

df

Unnamed: 0,0,1,2,0_syllables,1_syllables,2_syllables
8,misty summer rain,calling pheasant,in Zen temple,5,4,4
16,nobody here,a table in the mountain,speckled with petals,4,7,5
30,smells of spring,adrift in the morning air,bubbles under ice,3,7,5
30,smells of spring,adrift in the morning air,bubbles under ice,4,7,5
31,Spring morning,your hand on my breast,a bird,3,5,2
31,Spring morning,your hand on my breast,a bird,4,5,2
42,folding chair,the newborn colt tries,to stand,3,5,2
58,husband away,washing his socks,by hand,4,4,2
91,morning glories,find their way to the sun,on barbed wire,4,6,3
91,morning glories,find their way to the sun,on barbed wire,4,6,4


In [6]:
# Drop samples with a line longer than the cutoff, where the cutoff is the
# largest of the three per-line 99th-percentile character lengths.

line_lengths = [df[str(i)].str.len() for i in range(3)]
max_line_length = int(max(lengths.quantile(.99) for lengths in line_lengths))

first_len, second_len, third_len = line_lengths
short_enough = (
    (first_len <= max_line_length)
    & (second_len <= max_line_length)
    & (third_len <= max_line_length)
)

# .copy() so the column assignments in later cells act on an independent frame
df = df[short_enough].copy()
df

Unnamed: 0,0,1,2,0_syllables,1_syllables,2_syllables
8,misty summer rain,calling pheasant,in Zen temple,5,4,4
16,nobody here,a table in the mountain,speckled with petals,4,7,5
30,smells of spring,adrift in the morning air,bubbles under ice,3,7,5
30,smells of spring,adrift in the morning air,bubbles under ice,4,7,5
31,Spring morning,your hand on my breast,a bird,3,5,2
31,Spring morning,your hand on my breast,a bird,4,5,2
42,folding chair,the newborn colt tries,to stand,3,5,2
58,husband away,washing his socks,by hand,4,4,2
91,morning glories,find their way to the sun,on barbed wire,4,6,3
91,morning glories,find their way to the sun,on barbed wire,4,6,4


In [7]:
# Build the padded input/output strings for each of the three lines.
# Every string is right-padded with newlines to max_line_length + 2 characters.
padded_length = max_line_length + 2

for i in range(3):
    line = df[str(i)]

    # Input: the line with its first character duplicated in front
    df['%s_in' % i] = (line.str[0] + line).str.pad(padded_length, 'right', '\n')

    if i < 2:
        # First or second line: append a newline and the first character of
        # the next line, so the next RNN has a better chance of getting its
        # first character right.
        target = line + '\n' + df[str(i + 1)].str[0]
    else:
        # Last line: nothing follows it
        target = line
    df['%s_out' % i] = target.str.pad(padded_length, 'right', '\n')

# From here on max_line_length refers to the padded length
max_line_length = padded_length

df

Unnamed: 0,0,1,2,0_syllables,1_syllables,2_syllables,0_in,0_out,1_in,1_out,2_in,2_out
8,misty summer rain,calling pheasant,in Zen temple,5,4,4,mmisty summer rain\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,misty summer rain\nc\n\n\n\n\n\n\n\n\n\n\n\n\n...,ccalling pheasant\n\n\n\n\n\n\n\n\n\n\n\n\n\n\...,calling pheasant\ni\n\n\n\n\n\n\n\n\n\n\n\n\n\...,iin Zen temple\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,in Zen temple\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\...
16,nobody here,a table in the mountain,speckled with petals,4,7,5,nnobody here\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,nobody here\na\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,aa table in the mountain\n\n\n\n\n\n\n\n\n\n\n...,a table in the mountain\ns\n\n\n\n\n\n\n\n\n\n...,sspeckled with petals\n\n\n\n\n\n\n\n\n\n\n\n\...,speckled with petals\n\n\n\n\n\n\n\n\n\n\n\n\n...
30,smells of spring,adrift in the morning air,bubbles under ice,3,7,5,ssmells of spring\n\n\n\n\n\n\n\n\n\n\n\n\n\n\...,smells of spring\na\n\n\n\n\n\n\n\n\n\n\n\n\n\...,aadrift in the morning air\n\n\n\n\n\n\n\n\n\n...,adrift in the morning air\nb\n\n\n\n\n\n\n\n\n...,bbubbles under ice\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,bubbles under ice\n\n\n\n\n\n\n\n\n\n\n\n\n\n\...
30,smells of spring,adrift in the morning air,bubbles under ice,4,7,5,ssmells of spring\n\n\n\n\n\n\n\n\n\n\n\n\n\n\...,smells of spring\na\n\n\n\n\n\n\n\n\n\n\n\n\n\...,aadrift in the morning air\n\n\n\n\n\n\n\n\n\n...,adrift in the morning air\nb\n\n\n\n\n\n\n\n\n...,bbubbles under ice\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,bubbles under ice\n\n\n\n\n\n\n\n\n\n\n\n\n\n\...
31,Spring morning,your hand on my breast,a bird,3,5,2,SSpring morning\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\...,Spring morning\ny\n\n\n\n\n\n\n\n\n\n\n\n\n\n\...,yyour hand on my breast\n\n\n\n\n\n\n\n\n\n\n\...,your hand on my breast\na\n\n\n\n\n\n\n\n\n\n\...,aa bird\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\...,a bird\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...
31,Spring morning,your hand on my breast,a bird,4,5,2,SSpring morning\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\...,Spring morning\ny\n\n\n\n\n\n\n\n\n\n\n\n\n\n\...,yyour hand on my breast\n\n\n\n\n\n\n\n\n\n\n\...,your hand on my breast\na\n\n\n\n\n\n\n\n\n\n\...,aa bird\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\...,a bird\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...
42,folding chair,the newborn colt tries,to stand,3,5,2,ffolding chair\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,folding chair\nt\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,tthe newborn colt tries\n\n\n\n\n\n\n\n\n\n\n\...,the newborn colt tries\nt\n\n\n\n\n\n\n\n\n\n\...,tto stand\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\...,to stand\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...
58,husband away,washing his socks,by hand,4,4,2,hhusband away\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\...,husband away\nw\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\...,wwashing his socks\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,washing his socks\nb\n\n\n\n\n\n\n\n\n\n\n\n\n...,bby hand\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,by hand\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\...
91,morning glories,find their way to the sun,on barbed wire,4,6,3,mmorning glories\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,morning glories\nf\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,ffind their way to the sun\n\n\n\n\n\n\n\n\n\n...,find their way to the sun\no\n\n\n\n\n\n\n\n\n...,oon barbed wire\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\...,on barbed wire\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...
91,morning glories,find their way to the sun,on barbed wire,4,6,4,mmorning glories\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,morning glories\nf\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,ffind their way to the sun\n\n\n\n\n\n\n\n\n\n...,find their way to the sun\no\n\n\n\n\n\n\n\n\n...,oon barbed wire\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\...,on barbed wire\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...


In [8]:
inputs = df[['0_in', '1_in', '2_in']].values
outputs = df[['0_out', '1_out', '2_out']].values

# Character-level tokenizer fitted on every padded input line
tokenizer = Tokenizer(filters='', char_level=True)
tokenizer.fit_on_texts(inputs.flatten())
n_tokens = len(tokenizer.word_counts) + 1


def _one_hot_lines(texts):
    """One-hot encode each column of `texts` (one column per haiku line).

    The result is indexed first by line number, so `result[i]` holds the
    encoded sequences of line `i`.
    """
    sequences = [
        tokenizer.texts_to_sequences(texts[:, column])
        for column in range(3)
    ]
    return np_utils.to_categorical(sequences, num_classes=n_tokens)


# X is the input for each line in sequences of one-hot-encoded values
X = _one_hot_lines(inputs)

# Y is the output for each line in sequences of one-hot-encoded values
Y = _one_hot_lines(outputs)

# X_syllables is the count of syllables for each line
X_syllables = df[['0_syllables', '1_syllables', '2_syllables']].values

In [9]:
# Save the values needed alongside the model weights: latent size, vocabulary
# size, padded line length and the fitted tokenizer.
metadata = [latent_dim, n_tokens, max_line_length, tokenizer]
joblib.dump(metadata, str(output_dir / 'metadata.pkl'))

['all_data_test/metadata.pkl']

# Training Model

In [None]:
# NOTE: this rebinds `lines`, `inputs` and `outputs` to the objects returned
# by create_training_model, replacing the values earlier cells stored there.
training_model, lstm, lines, inputs, outputs = create_training_model(latent_dim, n_tokens)

# Save a checkpoint whenever the validation loss improves; the file name
# records the latent size, the epoch and the validation loss.
filepath = str(output_dir / ("%s-{epoch:02d}-{val_loss:.2f}.hdf5" % latent_dim))
checkpoint = ModelCheckpoint(
    filepath,
    monitor='val_loss',
    verbose=1,
    save_best_only=True,
    mode='min',
)

# Append the per-epoch metrics to a CSV file in the output directory
csv_logger = CSVLogger(
    str(output_dir / 'training_log.csv'),
    append=True,
    separator=',',
)

callbacks_list = [checkpoint, csv_logger]

# The model takes, for each line in turn, the one-hot characters followed by
# that line's syllable count; the targets are the three encoded output lines.
fit_inputs = [
    array
    for k in range(3)
    for array in (X[k], X_syllables[:, k])
]
fit_targets = [Y[k] for k in range(3)]

training_model.fit(
    fit_inputs,
    fit_targets,
    batch_size=64,
    epochs=1000,
    validation_split=.1,
    callbacks=callbacks_list,
)

Train on 7627 samples, validate on 848 samples
Epoch 1/1000

Epoch 00001: val_loss improved from inf to 11.04745, saving model to all_data_test/4-01-11.05.hdf5


  '. They will not be included '
  '. They will not be included '
  '. They will not be included '


Epoch 2/1000

Epoch 00002: val_loss improved from 11.04745 to 8.76530, saving model to all_data_test/4-02-8.77.hdf5
Epoch 3/1000

Epoch 00003: val_loss improved from 8.76530 to 6.65627, saving model to all_data_test/4-03-6.66.hdf5
Epoch 4/1000

Epoch 00004: val_loss improved from 6.65627 to 5.24409, saving model to all_data_test/4-04-5.24.hdf5
Epoch 5/1000

Epoch 00005: val_loss improved from 5.24409 to 4.45031, saving model to all_data_test/4-05-4.45.hdf5
Epoch 6/1000

Epoch 00006: val_loss improved from 4.45031 to 4.02260, saving model to all_data_test/4-06-4.02.hdf5
Epoch 7/1000

Epoch 00007: val_loss improved from 4.02260 to 3.76061, saving model to all_data_test/4-07-3.76.hdf5
Epoch 8/1000

Epoch 00008: val_loss improved from 3.76061 to 3.60654, saving model to all_data_test/4-08-3.61.hdf5
Epoch 9/1000

Epoch 00009: val_loss improved from 3.60654 to 3.51804, saving model to all_data_test/4-09-3.52.hdf5
Epoch 10/1000

Epoch 00010: val_loss improved from 3.51804 to 3.46414, saving m

# Test Model

In [10]:
# Build the haiku generator from the trained LSTM and line layers.
# The last argument is the padded line length. It must be `max_line_length`,
# the name every other cell uses and the value saved to metadata.pkl;
# `max_line_len` is never defined in this notebook and raises NameError on a
# fresh kernel.
generator = Generator(lstm, lines, tf_session, tokenizer, n_tokens, max_line_length)

In [14]:
# Generate one haiku with the trained model. In the recorded run below the
# call returned a list of three strings, one per line.
generator.generate_haiku()

ho the the the thes the
he the the the the the the the the the the
the the the the the the the the the


['ho the the the thes the',
 'he the the the the the the the the the the',
 'the the the the the the the the the']