In [None]:
from bs4 import BeautifulSoup
import requests

In [None]:
# Download the Issa haiku search page; the HTML is parsed in a later cell.
url = "http://haikuguy.com/issa/search.php"
# Without a timeout, requests can wait forever on an unresponsive server.
r = requests.get(url, timeout=60)
# Fail loudly on 4xx/5xx so an error page is never parsed as haiku data.
r.raise_for_status()
r.status_code

200

In [None]:
from bs4 import BeautifulSoup

# Parse the downloaded HTML with Python's built-in "html.parser" backend.
soup = BeautifulSoup(markup=r.text, features='html.parser')

In [None]:
# Collect every <p> element whose CSS class is "english".
english = soup.find_all("p", class_="english")

In [None]:
# Flatten each paragraph onto one line: newlines become spaces and the
# surrounding whitespace is trimmed.
english_haikus = [
    paragraph.text.replace("\n", " ").strip()
    for paragraph in english
]

In [None]:
# Number of haiku strings extracted from the "english" paragraphs.
len(english_haikus)

10760

In [None]:
import random
import string

# All ASCII punctuation except the apostrophe and the hyphen, which are
# kept because they occur inside words.
punct = "".join(ch for ch in string.punctuation if ch not in "'-")

# A single translation table deletes every character of `punct` in one pass.
_delete_punct = str.maketrans("", "", punct)
haikus_no_punct = [haiku.translate(_delete_punct) for haiku in english_haikus]

In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import models, layers, optimizers, losses, activations
from keras.utils import to_categorical
import keras
import numpy as np

In [None]:
# Word-level tokenizer created with all Keras defaults.
# NOTE(review): the default `filters` argument contains "-", so hyphens that the
# punctuation-stripping cell deliberately keeps are presumably still treated as
# word separators here -- confirm whether `filters` should be overridden.
tokenizer = tf.keras.preprocessing.text.Tokenizer()

In [None]:
# Build the tokenizer's vocabulary from the punctuation-stripped haikus.
tokenizer.fit_on_texts(haikus_no_punct)

In [None]:
# Convert each haiku to a sequence of integer word ids, then pad the sequences
# (pad_sequences defaults) so every row has the same length.
tokens = pad_sequences(tokenizer.texts_to_sequences(haikus_no_punct))

In [None]:
# Sanity check on the padded id matrix: (number of haikus, padded length).
tokens.shape # verify that n_rows matches the number of haikus

(10760, 15)

In [None]:
# Number of distinct ids the model must handle: one per known word plus one
# extra slot for id 0 (the value the generation loop treats as "stop";
# presumably also the padding value -- confirm against pad_sequences defaults).
vocab_size = len(tokenizer.index_word) + 1

In [None]:
# Next-word setup: the input drops the last token and the target drops the
# first, so position t of X is paired with position t + 1 of the same haiku.
X, y = tokens[:, :-1], tokens[:, 1:]
print(X.shape, y.shape)

(10760, 14) (10760, 14)


In [None]:
# Augment the training data with "fakish" haikus: copy randomly chosen rows of
# X, overwrite up to 3 of their real tokens with random vocabulary ids, and keep
# the untouched targets from y.
#
# * Positions before the first non-zero token are never replaced, so padding
#   zeros stay intact.
# * NOTE(review): the original intent also mentions never leaving a word
#   "stranded" (without an adjacent word); nothing below enforces that.
# * Indices are drawn with replacement, so fewer than `n_replace` distinct
#   positions may end up being changed.
# * Re-running this cell appends another batch of fakes to X / y; re-run the
#   cell that derives X and y from `tokens` first to start clean.

n_fakes = 10000
# Fixed seed so that Restart & Run All reproduces the same corrupted rows.
rng = np.random.default_rng(42)

# Only rows with at least one real token can be corrupted. Sampling from those
# avoids the IndexError an all-zero row would raise on `nonzero()[0][0]`
# (previously hidden by a bare `except` that continued with a stale start index).
candidate_rows = np.flatnonzero((X != 0).any(axis=1))
random_samples_ixs = rng.choice(candidate_rows, size=n_fakes)

# Fancy indexing returns copies, so editing X_fake leaves X itself untouched.
X_fake, y_fake = X[random_samples_ixs], y[random_samples_ixs]
for row in X_fake:
    r_nz_start = row.nonzero()[0][0]  # index of the first real token in the row
    n_replace = rng.integers(low=1, high=4)  # 1, 2 or 3
    ix_to_replace = rng.integers(low=r_nz_start, high=row.shape[-1], size=n_replace)
    # `high` is exclusive: use vocab_size so the last word id (vocab_size - 1)
    # can be drawn too; id 0 is excluded by low=1.
    replace_with = rng.integers(low=1, high=vocab_size, size=n_replace)
    row[ix_to_replace] = replace_with  # `row` is a view, so this edits X_fake in place

X = np.vstack([X, X_fake])
y = np.vstack([y, y_fake])

# Shuffle real and fake rows together. Keras' `validation_split` takes the LAST
# fraction of the arrays as given, so leaving the fakes stacked at the end would
# make the validation set consist of corrupted rows only.
shuffle_ixs = rng.permutation(X.shape[0])
X, y = X[shuffle_ixs], y[shuffle_ixs]

In [None]:
# Sanity check: both arrays must have the same shape after the fakes were appended.
X.shape, y.shape

((20760, 14), (20760, 14))

In [None]:
# One-hot encode every target token over the vocabulary, as required by the
# 'categorical_crossentropy' loss the model is compiled with.
# NOTE(review): the result is a dense (rows, timesteps, vocab_size) array, which
# can take several GB of RAM at this vocabulary size; integer targets with a
# sparse categorical loss would avoid materialising it (needs a matching change
# in the compile and fit cells).
y_categorical = to_categorical(y, num_classes=vocab_size)
print(y_categorical.shape)

(20760, 14, 7027)


In [None]:
# Model hyperparameters.
embedding_dimension = 64  # length of each word vector produced by the Embedding layer
M = 64  # units in each LSTM layer and in the intermediate Dense layer

In [None]:
# Next-word language model: token ids -> embeddings -> two stacked LSTMs ->
# per-timestep dense head -> softmax over the vocabulary.
# NOTE(review): the Embedding layer is created without mask_zero, so padding
# positions are presumably fed to the LSTMs and scored like real tokens -- confirm
# that this is intended.
i = layers.Input(shape=(X.shape[-1],))
x = layers.Embedding(vocab_size, embedding_dimension)(i)

# return_sequences=True keeps one output vector per timestep, which is what the
# per-position targets require.
x = layers.LSTM(M, return_sequences=True)(x)
x = layers.LSTM(M, return_sequences=True)(x)
x = layers.Dense(M)(x)
# NOTE(review): newer Keras releases rename `alpha` to `negative_slope` -- check
# against the installed version before upgrading.
x = layers.LeakyReLU(alpha=0.2)(x)
x = layers.Dropout(0.3)(x)
x = layers.BatchNormalization()(x)
# One probability distribution over the vocabulary for every timestep.
o = layers.Dense(vocab_size, activation='softmax')(x)
model = models.Model(i, o)

# categorical_crossentropy expects one-hot targets (see y_categorical).
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [None]:
# Print the layer-by-layer architecture with output shapes and parameter counts.
model.summary()

Model: "model_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_4 (InputLayer)         [(None, 14)]              0         
_________________________________________________________________
embedding_3 (Embedding)      (None, 14, 64)            449728    
_________________________________________________________________
lstm_6 (LSTM)                (None, 14, 64)            33024     
_________________________________________________________________
lstm_7 (LSTM)                (None, 14, 64)            33024     
_________________________________________________________________
dense_8 (Dense)              (None, 14, 64)            4160      
_________________________________________________________________
leaky_re_lu_3 (LeakyReLU)    (None, 14, 64)            0         
_________________________________________________________________
dropout_5 (Dropout)          (None, 14, 64)            0   

A plain LSTM encoder-decoder model does not work at all.

Possible step: edit model and data to predict sequence

Using the sequence from position 0 to -1 as input and the sequence from position 1 to the end as output seems to work better; the validation loss goes down.

Trying a bidirectional LSTM for better results.

Also need to increase dimensions (currently using 32 for the embedding and 32 for the LSTM).

Seems to focus on "cuckoo" and "sings" a lot; the previous version may have been better.

Tried the approach of adding randomness to the haikus: took 10k haikus randomly sampled from the training dataset, then selected 1-3 random words in each and replaced them with other random words sampled from the entire vocabulary, while keeping the targets the same as in the untouched version of the haiku. The idea is that haikus may have a specific starting pattern, so anything that does not fit it will lead to word repetition. This approach seems to have helped the model learn better and seems to have addressed the repetition issue. Will try deploying this as a start.

In [None]:
# Train for 10 epochs in mini-batches of 30 samples.
# validation_split=0.2 holds out the LAST 20% of the rows of X / y_categorical
# as given (Keras selects the validation rows before any shuffling), so the
# order of the rows decides which samples are used for validation.
model.fit(X, y_categorical, validation_split=0.2, epochs=10, batch_size=30)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7fd002dbb0b8>

In [None]:
import plotly.express as px
import plotly.graph_objects as go

# Per-epoch losses recorded by the most recent model.fit call.
loss = model.history.history['loss']
val_loss = model.history.history['val_loss']
# 1-based epoch numbers so the axis matches the "Epoch k/N" lines Keras prints.
x = list(range(1, len(loss) + 1))

fig = go.Figure()
fig.add_trace(go.Scatter(x=x, y=loss, mode="lines+markers", name="Train Loss"))
fig.add_trace(go.Scatter(x=x, y=val_loss, mode="lines+markers", name="Validation Loss"))
# Title and axis labels let the figure stand alone when the notebook is skimmed.
fig.update_layout(
    title="Training vs. validation loss",
    xaxis_title="Epoch",
    yaxis_title="Categorical cross-entropy loss",
)
fig.show()

In [None]:
# Show the tokenizer's out-of-vocabulary token setting.
tokenizer.oov_token

In [None]:
def _predict_next_word(text, length):
    """Return the model's most likely next word for `text`.

    The text is tokenized and padded/truncated to `length` ids, the model is run
    once, and the arg-max of the last timestep is looked up in the tokenizer's
    vocabulary. Returns None when the predicted id is 0, which has no word.
    """
    padded_ids = pad_sequences(tokenizer.texts_to_sequences([text]), maxlen=length)
    last_step_scores = model.predict(padded_ids)[0][-1]
    best_id = last_step_scores.argmax()
    if best_id == 0:
        return None
    return tokenizer.index_word[best_id]


test_input = "The wild goose"
max_length = 14

# Greedy decoding: append the single most likely word until the text has
# `max_length` words or the model predicts id 0.
while len(test_input.split()) < max_length:
    next_predicted_word = _predict_next_word(test_input, max_length)
    if next_predicted_word is None:
        break
    test_input += f" {next_predicted_word}"

In [None]:
# Display the seed text extended with the model's predicted words.
test_input

'The wild goose joyful blooming buddha is bloomed the same little field is a'

In [None]:
import datetime
from datetime import datetime as dt
import os

# One output folder per calendar day, e.g. "11-April-2021".
datekey = dt.now().strftime("%d-%B-%Y")
dir_path = f"../../saved_models/{datekey}"
# exist_ok=True replaces the check-then-create pair, which could race with
# another process creating the same folder.
os.makedirs(dir_path, exist_ok=True)

# Both artefacts live in the dated folder; derive them from dir_path so the
# location is spelled out only once.
weights_path = f"{dir_path}/weights.h5"
saved_model_path = f"{dir_path}/saved_model.pb"
model.save_weights(weights_path)
# Despite the ".pb" suffix, the log output of this call shows assets being
# written under ".../saved_model.pb/assets", i.e. this path becomes a directory.
model.save(saved_model_path)

INFO:tensorflow:Assets written to: ../../saved_models/11-April-2021/saved_model.pb/assets
INFO:tensorflow:Assets written to: ../../saved_models/11-April-2021/saved_model.pb/assets


<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=2b46da0f-9118-4fef-a62c-5402bbd6e1e0' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>