In [1]:
try:
    import keras
except:
    !pip install keras

Using TensorFlow backend.


In [2]:
from pathlib import Path

import tensorflow as tf
# Create a single TensorFlow session up front and register it with the Keras
# backend. The same tf_session object is later passed to Generator.
tf_session = tf.Session()
from keras import backend as K
K.set_session(tf_session)

from keras.callbacks import ModelCheckpoint,  CSVLogger
from keras.layers import Add, Dense, Input, LSTM
from keras.models import Model
from keras.preprocessing.text import Tokenizer
from keras.utils import np_utils

import numpy as np
import pandas as pd
from sklearn.externals import joblib

# Local library with model definitions for training and generating
from models import Generator, create_training_model

# Load Input

In [16]:
# Settings

# Fraction (0-1) of the rows to keep for training; lower it if you're
# running out of memory. Passed to DataFrame.sample(frac=...).
sample_size = .25

# The latent dimension of the LSTM
latent_dim = 32

# Number of epochs to train for
epochs = 5

# Input locations, relative to the directory the notebook runs from
root_path = Path('../../..')
input_path = root_path / 'input'
poem_path = input_path / 'poems'
haiku_path = poem_path / 'haikus.csv'

# Run name; every artifact of this run is written under output_<name>
name = 'all_data_test_2'
output_dir = Path('output_%s' % name)
# exist_ok so that re-running the notebook (Restart & Run All) does not fail
# with FileExistsError once the directory has been created.
output_dir.mkdir(exist_ok=True)

In [17]:
# Read the haiku CSV and keep only a random `sample_size` fraction of its rows
df = pd.read_csv(str(haiku_path)).sample(frac=sample_size)
df

Unnamed: 0,0,1,2,source,0_syllables,1_syllables,2_syllables
41704,Guest room is almost,ready Hoping plans don't fall,through for this weekend,twaiku,5,7,5
4332,"strong wind, little snow --",scraps and notes I've neglected,to throw away,tempslibres,5,7,4
59796,This umpire is,the one who has been getting,decisions wrong Fuck,twaiku,5,7,5
23869,sunrise:,among the silent earth movers,a fawn,sballas,2,8,2
94648,Shower is bomb and,i smell like eucalyptus,it's so relaxing,twaiku,5,7,5
115077,Arsenal holding,another top team L in,their own living room,twaiku,5,7,5
103546,When you are saying,the greeks demoted him could,you specify that,twaiku,5,7,5
27710,"Dead thoughts revive, and he that heeds Shall ...","as by a spirit led, A song among the golden","reeds: ""The gods are vanished but not dead!",gutenberg,10,13,9
122291,filming my first time,ever going black friday,shopping kinda scared,twaiku,5,7,5
25072,towpath -,dissolved fog,the blue-gray heron,haikuzao,2,3,4


# Format Input for Training

In [18]:
# Some syllable counts hold several comma-separated values because more than
# one pronunciation is acceptable. Expand those rows so that every
# alternative count gets its own copy of the haiku.

text_columns = ['0', '1', '2']

for i in range(3):
    target = '%s_syllables' % i
    other_counts = ['%s_syllables' % j for j in range(3) if j != i]

    # Split the counts of line i on commas and stack them into one value per
    # row, keeping the original row label so the result can be joined back.
    alternatives = df[target].str.split(',', expand=True).stack(-1)
    alternatives = alternatives.reset_index(level=1, drop=True).rename(target)

    # Joining on the row label repeats the haiku once per alternative count;
    # drop_duplicates removes the repeats created when labels are already
    # duplicated from an earlier pass.
    df = df[text_columns + other_counts].join(alternatives).drop_duplicates()

df

Unnamed: 0,0,1,2,0_syllables,1_syllables,2_syllables
2,spring moonset --,a rice ball for,breakfast,3,4,2
2,spring moonset --,a rice ball for,breakfast,4,4,2
4,cinco de mayo,horses roll,in the shallows,5,3,4
5,quitting time,the smell of rain,in the lobby,3,4,4
8,misty summer rain,calling pheasant,in Zen temple,5,4,4
9,day is done,poppies amidst,the dying grass,3,4,4
15,polished oak --,the freesia's shadow ends,in coffee foam,3,7,4
18,vanishing difference . . .,gliding geese settle onto,their reflections,5,7,4
18,vanishing difference . . .,gliding geese settle onto,their reflections,6,7,4
21,moonlessness--,so many ways,I want to touch you,3,4,5


In [19]:
# Drop samples with any line longer than the 99th percentile of line length
# (the largest of the three per-line 99th percentiles, truncated to an int)

line_lengths = [df[str(i)].str.len() for i in range(3)]
max_line_length = int(max(lengths.quantile(.99) for lengths in line_lengths))

# A haiku is kept only when all three of its lines fit
fits = line_lengths[0] <= max_line_length
for lengths in line_lengths[1:]:
    fits = fits & (lengths <= max_line_length)

df = df[fits].copy()
df

Unnamed: 0,0,1,2,0_syllables,1_syllables,2_syllables
2,spring moonset --,a rice ball for,breakfast,3,4,2
2,spring moonset --,a rice ball for,breakfast,4,4,2
4,cinco de mayo,horses roll,in the shallows,5,3,4
5,quitting time,the smell of rain,in the lobby,3,4,4
8,misty summer rain,calling pheasant,in Zen temple,5,4,4
9,day is done,poppies amidst,the dying grass,3,4,4
15,polished oak --,the freesia's shadow ends,in coffee foam,3,7,4
18,vanishing difference . . .,gliding geese settle onto,their reflections,5,7,4
18,vanishing difference . . .,gliding geese settle onto,their reflections,6,7,4
21,moonlessness--,so many ways,I want to touch you,3,4,5


In [20]:
# Build the per-line model input/target strings, right-padded with '\n'
# to a fixed width of max_line_length + 2.
padded_length = max_line_length + 2

for i in range(3):
    line = df[str(i)]

    # Input: the line prefixed with a copy of its own first character, so the
    # input is the target delayed by one step and the first character fills
    # the otherwise empty first slot.
    df['%s_in' % i] = (line.str[0] + line).str.pad(padded_length, 'right', '\n')

    if i == 2:
        # Last line: the target is just the line itself
        target = line
    else:
        # First or second line: append a line break and the first character of
        # the next line. This helps with training so that the next RNN has a
        # better chance of getting the first character right.
        target = line + '\n' + df[str(i+1)].str[0]
    df['%s_out' % i] = target.str.pad(padded_length, 'right', '\n')

# From here on max_line_length means the padded width.
# NOTE: re-running this cell on its own adds another 2 each time.
max_line_length = padded_length

df

Unnamed: 0,0,1,2,0_syllables,1_syllables,2_syllables,0_in,0_out,1_in,1_out,2_in,2_out
2,spring moonset --,a rice ball for,breakfast,3,4,2,sspring moonset --\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,spring moonset --\na\n\n\n\n\n\n\n\n\n\n\n\n\n...,aa rice ball for\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,a rice ball for\nb\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,bbreakfast\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,breakfast\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\...
2,spring moonset --,a rice ball for,breakfast,4,4,2,sspring moonset --\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,spring moonset --\na\n\n\n\n\n\n\n\n\n\n\n\n\n...,aa rice ball for\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,a rice ball for\nb\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,bbreakfast\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,breakfast\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\...
4,cinco de mayo,horses roll,in the shallows,5,3,4,ccinco de mayo\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,cinco de mayo\nh\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,hhorses roll\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,horses roll\ni\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,iin the shallows\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,in the shallows\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\...
5,quitting time,the smell of rain,in the lobby,3,4,4,qquitting time\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,quitting time\nt\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,tthe smell of rain\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,the smell of rain\ni\n\n\n\n\n\n\n\n\n\n\n\n\n...,iin the lobby\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\...,in the lobby\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...
8,misty summer rain,calling pheasant,in Zen temple,5,4,4,mmisty summer rain\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,misty summer rain\nc\n\n\n\n\n\n\n\n\n\n\n\n\n...,ccalling pheasant\n\n\n\n\n\n\n\n\n\n\n\n\n\n\...,calling pheasant\ni\n\n\n\n\n\n\n\n\n\n\n\n\n\...,iin Zen temple\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,in Zen temple\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\...
9,day is done,poppies amidst,the dying grass,3,4,4,dday is done\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,day is done\np\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,ppoppies amidst\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\...,poppies amidst\nt\n\n\n\n\n\n\n\n\n\n\n\n\n\n\...,tthe dying grass\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,the dying grass\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\...
15,polished oak --,the freesia's shadow ends,in coffee foam,3,7,4,ppolished oak --\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,polished oak --\nt\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,tthe freesia's shadow ends\n\n\n\n\n\n\n\n\n\n...,the freesia's shadow ends\ni\n\n\n\n\n\n\n\n\n...,iin coffee foam\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\...,in coffee foam\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...
18,vanishing difference . . .,gliding geese settle onto,their reflections,5,7,4,vvanishing difference . . .\n\n\n\n\n\n\n\n\n\...,vanishing difference . . .\n \n\n\n\n\n\n\n\n\...,gliding geese settle onto\n\n\n\n\n\n\n\n\n...,gliding geese settle onto\n \n\n\n\n\n\n\n\n...,their reflections\n\n\n\n\n\n\n\n\n\n\n\n...,their reflections\n\n\n\n\n\n\n\n\n\n\n\n\...
18,vanishing difference . . .,gliding geese settle onto,their reflections,6,7,4,vvanishing difference . . .\n\n\n\n\n\n\n\n\n\...,vanishing difference . . .\n \n\n\n\n\n\n\n\n\...,gliding geese settle onto\n\n\n\n\n\n\n\n\n...,gliding geese settle onto\n \n\n\n\n\n\n\n\n...,their reflections\n\n\n\n\n\n\n\n\n\n\n\n...,their reflections\n\n\n\n\n\n\n\n\n\n\n\n\...
21,moonlessness--,so many ways,I want to touch you,3,4,5,mmoonlessness--\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\...,moonlessness--\ns\n\n\n\n\n\n\n\n\n\n\n\n\n\n\...,sso many ways\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\...,so many ways\nI\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\...,II want to touch you\n\n\n\n\n\n\n\n\n\n\n\n\n...,I want to touch you\n\n\n\n\n\n\n\n\n\n\n\n\n\...


In [21]:
# Padded input / target strings, one column per haiku line
inputs = df[['0_in', '1_in', '2_in']].values
outputs = df[['0_out', '1_out', '2_out']].values

# Character-level tokenizer with no filtering, fitted on the input strings
tokenizer = Tokenizer(filters='', char_level=True)
tokenizer.fit_on_texts(inputs.flatten())
# One more than the number of distinct characters seen
# (presumably because Keras token indices start at 1 - confirm)
n_tokens = len(tokenizer.word_counts) + 1


def one_hot_by_line(texts):
    """One-hot encode each of the three columns of `texts`.

    Every column is converted to integer sequences with the fitted tokenizer
    and the three results are one-hot encoded together, so the returned array
    is indexed by line number first.
    """
    sequences = [tokenizer.texts_to_sequences(texts[:, i]) for i in range(3)]
    return np_utils.to_categorical(sequences, num_classes=n_tokens)


# X is the input for each line in sequences of one-hot-encoded values
X = one_hot_by_line(inputs)

# Y is the output for each line in sequences of one-hot-encoded values
Y = one_hot_by_line(outputs)

# X_syllables is the count of syllables for each line
# NOTE(review): these values come out of a str.split earlier in the notebook,
# so they may still be strings rather than ints - confirm the model accepts that.
X_syllables = df[['0_syllables', '1_syllables', '2_syllables']].values

In [None]:
# Persist what is needed to rebuild the model and decode its output later
metadata = [latent_dim, n_tokens, max_line_length, tokenizer]
joblib.dump(metadata, str(output_dir / 'metadata.pkl'))

['output_all_data_test_2/metadata.pkl']

# Training Model

In [None]:
# create_training_model comes from the local `models` module.
# Note that it rebinds `lines`, `inputs` and `outputs` used earlier in the notebook.
training_model, lstm, lines, inputs, outputs = create_training_model(latent_dim, n_tokens)

# Checkpoints are named <latent_dim>-<epoch>-<val_loss>.hdf5; with
# save_best_only a file is written only when val_loss improves.
checkpoint_name = "%s-{epoch:02d}-{val_loss:.2f}.hdf5" % latent_dim
filepath = str(output_dir / checkpoint_name)
checkpoint = ModelCheckpoint(
    filepath,
    monitor='val_loss',
    verbose=1,
    save_best_only=True,
    mode='min',
)

# Per-epoch metrics are appended to a CSV log in the run's output directory
csv_logger = CSVLogger(
    str(output_dir / 'training_log.csv'),
    append=True,
    separator=',',
)

callbacks_list = [checkpoint, csv_logger]

# The model takes, for each of the three lines in turn, the one-hot
# characters followed by that line's syllable count.
model_inputs = []
for i in range(3):
    model_inputs.extend([X[i], X_syllables[:, i]])
model_targets = [Y[i] for i in range(3)]

training_model.fit(
    model_inputs,
    model_targets,
    batch_size=64,
    epochs=epochs,
    validation_split=.1,
    callbacks=callbacks_list,
)

Train on 38056 samples, validate on 4229 samples
Epoch 1/5

# Test Model

In [11]:
# Generator is imported from the local `models` module. It receives the
# `lstm` and `lines` objects returned by create_training_model, the shared
# TensorFlow session, the fitted tokenizer, the token count and the padded
# line length.
# NOTE(review): this cell's execution count (11) is lower than that of the
# data-preparation cells (16-21), so it was last run against an earlier kernel
# state. Restart the kernel and run all cells in order before trusting it.
generator = Generator(lstm, lines, tf_session, tokenizer, n_tokens, max_line_length)

In [14]:
# NOTE(review): the saved output of this cell is
# "TypeError: int() argument must be a string, a bytes-like object or a
# number, not 'NoneType'". The traceback is not included, so the cause cannot
# be determined from this notebook alone - investigate before relying on it.
generator.generate_haiku()




TypeError: int() argument must be a string, a bytes-like object or a number, not 'NoneType'