In [1]:
# imports
import numpy as np
from pickle import load
# TF Keras
from tensorflow.keras.models import load_model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding,LSTM,Dense,GRU
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Random
import random
from random import randint
# Word stuff
import spacy
from spacy.matcher import Matcher
import syllapy

In [2]:
# Load the large English spaCy pipeline; its vocabulary backs the matchers
# and its tagger provides the part-of-speech tags the patterns rely on.
SPACY_MODEL_NAME = 'en_core_web_lg'
nlp = spacy.load(SPACY_MODEL_NAME)

In [3]:
# Restore the fitted tokenizer and the text sequences from disk.
# NOTE(review): unpickling can execute arbitrary code -- only load pickle
# files that come from a trusted source.
def _read_pickle(path):
    """Return the object stored in the pickle file at `path`."""
    with open(path, 'rb') as handle:
        return load(handle)


tokenizer = _read_pickle('hp_GRU_E125_200seq_tokenizer.pickle')
txt_seqs = _read_pickle('hp_GRU_E125_200seq_txt_seqs.pickle')

In [4]:
# Encode every text sequence as integer token ids ...
encoded_rows = tokenizer.texts_to_sequences(txt_seqs)
# ... and stack the rows into a single numpy array.
seqs = np.array(encoded_rows)
# The sequence length is the row width with the last column dropped
# (presumably the last token is the prediction target -- confirm against training).
model_inputs = seqs[:, :-1]
seq_len = model_inputs.shape[1]
# Vocabulary size = number of distinct words the tokenizer has counted.
vocab_size = len(tokenizer.word_counts)

In [5]:
def create_model(voc_size,s_len):
    """Build and compile the next-word prediction network.

    The network is an embedding layer, two stacked GRU layers, a ReLU dense
    layer and a softmax output layer with `voc_size` units.

    Args:
        voc_size: input dimension of the embedding and width of the softmax output.
        s_len: length of the input sequences fed to the embedding.

    Returns:
        The compiled `Sequential` model. Its summary is printed as a side effect.
    """
    network = Sequential()
    layer_stack = (
        Embedding(voc_size, 64, input_length=s_len),
        # the first GRU hands its full output sequence to the second GRU
        GRU(128, return_sequences=True),
        GRU(128),
        Dense(128, activation='relu'),
        Dense(voc_size, activation='softmax'),
    )
    for layer in layer_stack:
        network.add(layer)

    network.compile(
        loss='categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    network.summary()
    return network

In [6]:
# Rebuild the architecture, then restore the trained weights from disk.
# The model gets one more slot than the tokenizer's word count
# (presumably for the reserved padding index 0 -- confirm).
WEIGHTS_PATH = 'hp_GRU_E125_200seq.h5'
model = create_model(voc_size=vocab_size + 1, s_len=seq_len)
model.load_weights(WEIGHTS_PATH)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 200, 64)           771008    
                                                                 
 gru (GRU)                   (None, 200, 128)          74496     
                                                                 
 gru_1 (GRU)                 (None, 128)               99072     
                                                                 
 dense (Dense)               (None, 128)               16512     
                                                                 
 dense_1 (Dense)             (None, 12047)             1554063   
                                                                 
Total params: 2,515,151
Trainable params: 2,515,151
Non-trainable params: 0
_________________________________________________________________


In [7]:
def generate_bunch(model,tokenizer,seq_len,text_sequences,num_words=300):
    """Generate a block of words from which haiku phrases are later extracted.

    A randomly chosen entry of `text_sequences` is joined into a seed text.
    The model then greedily predicts one word at a time; every predicted word
    is appended to the running text so that it feeds the next prediction.

    Args:
        model: object whose `predict(batch, verbose=0)` returns, per input row,
            one score per vocabulary index.
        tokenizer: fitted tokenizer providing `texts_to_sequences` and `index_word`.
        seq_len: number of token ids the model expects as input.
        text_sequences: non-empty sequence of token lists to draw the seed from.
        num_words: how many words to generate.

    Returns:
        A list with the `num_words` generated words (the seed is not included).

    Raises:
        IndexError: if `text_sequences` is empty.
    """
    # random.choice never indexes past the end; the previous
    # random.randint(0, len(...)) included the upper bound and could raise IndexError.
    input_text = ' '.join(random.choice(text_sequences))
    output_text = []
    # let's make 'num_words' many words
    for _ in range(num_words):
        # sequence our text so we can shove it into our model
        encoded_text = tokenizer.texts_to_sequences([input_text])[0]
        # pad the sequence (if needed) to match the sequence length
        pad_encoded = pad_sequences([encoded_text],maxlen=seq_len,truncating='pre')
        # scores over the vocabulary for the next word
        probs = model.predict(pad_encoded,verbose=0)[0]
        # index 0 is the padding slot and has no entry in index_word, so pick
        # the most likely index among the real words only
        pred_word_ind = int(np.argmax(probs[1:])) + 1
        # get the actual word from the index from the tokenizer
        pred_word = tokenizer.index_word[pred_word_ind]
        # add the new word to the seed text
        input_text += ' ' + pred_word
        # add the new word to the output text
        output_text.append(pred_word)
    return output_text

In [8]:
# Five independent spaCy matchers, all sharing the pipeline's vocabulary;
# they are used to search for patterns in haiku_words.
m1, m2, m3, m4, m5 = (Matcher(nlp.vocab) for _ in range(5))

In [9]:
# Every pattern is a spaCy token pattern built from part-of-speech constraints.
# The builders below return fresh dicts so no token spec is shared between patterns.

def _pos_in(*tags):
    """Token spec: the part-of-speech tag must be one of `tags`."""
    return {'POS': {'IN': list(tags)}}


def _plain_word():
    """Token spec: any ASCII token that is neither punctuation nor whitespace."""
    return {'IS_ASCII': True, 'IS_PUNCT': False, 'IS_SPACE': False}


def _phrase_opener():
    """First token of a phrase: noun, adposition, adjective or adverb."""
    return _pos_in('NOUN', 'ADP', 'ADJ', 'ADV')


def _phrase_closer():
    """Last token of a three- to five-word phrase: noun, verb, adjective or adverb."""
    return _pos_in('NOUN', 'VERB', 'ADJ', 'ADV')


# single word
pattern = [_phrase_opener()]
m1.add('OneWords', [pattern])

# two words: an opener followed by a noun or a verb
pattern = [_phrase_opener(), _pos_in('NOUN', 'VERB')]
m2.add('TwoWords', [pattern])

# three to five words: opener, 1-3 unconstrained words, closer
for _matcher, _label, _fillers in ((m3, "ThreeWords", 1),
                                   (m4, "FourWords", 2),
                                   (m5, "FiveWords", 3)):
    pattern = [_phrase_opener(),
               *(_plain_word() for _ in range(_fillers)),
               _phrase_closer()]
    _matcher.add(_label, [pattern])

In [10]:
# Phrase pools keyed by syllable count: syl5 collects 5-syllable phrases,
# syl7 collects 7-syllable phrases. They start out empty.
syl5, syl7 = [], []

In [11]:
def find_syllables(document,syllable_list_1,syllable_list_2,
                  matcher1,matcher2,matcher3,matcher4,matcher5):
    """Collect the 5- and 7-syllable phrases that the matchers find in `document`.

    Every matcher is run on `document`. For each match, the syllables of the
    tokens in the matched span are counted with `syllapy` and summed. Spans
    with 5 syllables are appended to `syllable_list_1`, spans with 7 syllables
    to `syllable_list_2`; a phrase already present in its list is skipped.

    Args:
        document: the parsed document to search; sliced as `document[start:end]`.
        syllable_list_1: list that receives 5-syllable phrases (mutated in place).
        syllable_list_2: list that receives 7-syllable phrases (mutated in place).
        matcher1..matcher5: callables returning `(match_id, start, end)` tuples
            when called with `document`.

    Returns:
        None. The two lists are updated in place.
    """
    # let the matcher objects search for patterns
    matches = [
        match
        for matcher in (matcher1, matcher2, matcher3, matcher4, matcher5)
        for match in matcher(document)
    ]
    for _match_id, start, end in matches:
        # slice the document that was passed in -- the previous code read a
        # global `doc`, which only worked when the caller happened to use that name
        span = document[start:end]
        # count the syllables of the whole span
        syl_count = sum(syllapy.count(token.text) for token in span)

        if syl_count == 5:
            target = syllable_list_1
        elif syl_count == 7:
            target = syllable_list_2
        else:
            continue
        # keep each phrase only once
        if span.text not in target:
            target.append(span.text)

In [12]:
# Grow the syl5 / syl7 phrase pools from num_batches independently
# generated blocks of text.
num_batches = 15
progress_template = "done with batch {} out of {}"
for i in range(num_batches):
    # generate a word library for the haikus and flatten it into one string
    haiku_words = ' '.join(generate_bunch(model, tokenizer, seq_len, txt_seqs))

    # let spaCy parse the generated text
    doc = nlp(haiku_words)

    # harvest the 5- and 7-syllable phrases into syl5 and syl7
    find_syllables(doc, syl5, syl7, m1, m2, m3, m4, m5)

    print(progress_template.format(i + 1, num_batches))

done with batch 1 out of 15
done with batch 2 out of 15
done with batch 3 out of 15
done with batch 4 out of 15
done with batch 5 out of 15
done with batch 6 out of 15
done with batch 7 out of 15
done with batch 8 out of 15
done with batch 9 out of 15
done with batch 10 out of 15
done with batch 11 out of 15
done with batch 12 out of 15
done with batch 13 out of 15
done with batch 14 out of 15
done with batch 15 out of 15


In [13]:
# Show both phrase pools, separated by blank lines.
for chunk in (syl5, '\n', syl7):
    print(chunk)

['undeniably known', 'ahead and again', 'of that catacombs', 'catacombs and ward', 'yet had run away', 'in the same colour', 'sight of the river', 'of the river put', 'down hart the maniac', 'inside nor it took', 'rains of which his face', 'belfry reflecting', 'in the horrible', 'places he bowed', 'left of what shadows', 'of what shadows switch', 'abnormality', 'glass overlooking', 'especially stone', 'building harboured', 'rapidly coloured', 'beyond the river', 'atop the hollow', 'bearded and finny', 'finny gnorri build', 'workroom their churches', 'churches was surely', 'surely to engulf', 'stock of medium', 'to the waterfront', 'to an outsider', 'former ones crossed', 'thing is certainly', 'text the shuttered', 'amidst my yeares', 'chips darkness became', 'town ancient pursuers', 'design and rapidly', 'city not very', 'curious or deep', 'above an effort', 'effort i feared', 'ice surmounting more', 'on the opal throne', 'hollow cliffs of glass', 'of what was never', 'tale ever not fr

In [14]:
# Report how many phrases each pool holds (5-syllable pool first).
for pool in (syl5, syl7):
    print(len(pool))

361
267


In [15]:
# Assemble 50 haikus -- a 5-, a 7- and another 5-syllable phrase, each drawn
# at random from the pools -- and save them separated by a blank line.
with open("Lovecrafts_haikus_vol_1.txt",'w') as haiku_out:
    for _ in range(50):
        opening_line = random.choice(syl5)
        middle_line = random.choice(syl7)
        closing_line = random.choice(syl5)
        stanza = "{}\n{}\n{}".format(opening_line, middle_line, closing_line)
        haiku_out.write(stanza + '\n\n')

In [16]:
# Read the saved haikus back from disk and display them.
with open("Lovecrafts_haikus_vol_1.txt",'r') as haiku_file:
    saved_haikus = haiku_file.read()
print(saved_haikus)

few too most the one
knowledge of the sinister
way a few too most

above an effort
several bulging rambles
waters and swam sight

of the manuxet
recovering and the boughs
in the river bank

showman was there so
beyond the high republic
of imagine once

to his head curwen
almost all sorts of colour
half part that i seen

uncle ships sometimes
upper eaten the slanting
documents proved

awful purgation
to arkham john ward supposed
terrible and old

clumsily edges
in those terrible fashion
life and my uncle

into the other
curious situation
thorough well gaunt find

leagues of twilight
of salem village seemed first
scared because i reached

in a terrible
scraps to any disposal
panel in the hall

better do no use
cyclopean and dimensions
unable to land

man or any so
ulthar beyond the river
surely to engulf

different and sought
ruins of the manuxet says
of the manuxet

stroke of a ancient
sight to the bygone ashes
skai that a new king

in the incident
crown and orchards he began
of charles