# DS 4400 Final Project : Haiku Generator

#### Ben Tunney, Glen Damian Lim

#### Datasets : https://www.kaggle.com/datasets/hjhalani30/haiku-dataset (English haikus)

#### NLP models: N-gram Language Model, Recurrent Neural Network

###### INSTRUCTIONS: To run this notebook, run the cells in order from the top cell to the bottom cell. Running the main() function for the dashboard prints a link to your localhost, where the dashboard runs locally. All of the required libraries and the external ".py" file for our n-gram model are listed/imported in this notebook. Please contact one of us if you run into any problems, and thank you for a great semester!

In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import syllables
from collections import Counter

# NLP libraries
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
# nltk.download('punkt')
# nltk.download('cmudict')
# nltk.download('wordnet')
# nltk.download('omw-1.4')
# nltk.download('stopwords')

# Neural Networks libraries
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.sequence import pad_sequences

from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.layers import Dense, LSTM, Dropout, Bidirectional
from keras.models import Sequential

# Separate file provided for the ngram model, make sure to put in the same directory
import ngram_model as ngm

Using TensorFlow backend.


### Getting data and text pre-processing 

In [2]:
# Read file and get data
def get_haiku_data(fname):
    """Load haikus from a CSV file and split each one into whitespace tokens.

    Parameters:
      fname (str): path to a CSV file with the columns '0', '1' and '2',
        one haiku line per column

    Returns:
      list: one list of word strings per CSV row
    """
    frame = pd.read_csv(fname)
    first, second, third = frame['0'], frame['1'], frame['2']
    # Join the three lines of every haiku into one space-separated string
    joined = first + ' ' + second + ' ' + third + ' '
    return list(map(str.split, map(str, joined)))

# lemmatizer: single WordNetLemmatizer instance used by process_tokens below
lm = WordNetLemmatizer()

# Checks if given word contains a digit or any non-alphanumeric character
def contains_special(word):
    """Return True when any character of `word` is numeric or not alphanumeric.

    An empty string contains no such character, so it yields False.
    """
    return any(ch.isnumeric() or not ch.isalnum() for ch in word)

# process tokens
def process_tokens(toks):
    """Lower-case and lemmatize tokens, dropping any token with special characters.

    A token is dropped when contains_special() reports a digit or a
    non-alphanumeric character anywhere in it (punctuation, contractions, ...).

    Parameters:
      toks (list): token strings

    Returns:
      list: the lemmatized, lower-cased tokens that passed the filter
    """
    cleaned = []
    for token in toks:
        if contains_special(token):
            continue
        cleaned.append(lm.lemmatize(token.lower()))
    return cleaned

# Read and pre-process our data
def read_haikus(data, ngram):
    """Tokenize, clean and pad every haiku.

    Parameters:
      data (list): list of haikus, each a list of word strings
      ngram (int): n-gram order; each kept haiku gets ngram-1 '<h>' tokens
        in front and ngram-1 '</h>' tokens at the end

    Returns:
      list: list of padded token lists; haikus with no tokens left after
        cleaning, or with 17 or more tokens, are skipped
    """
    pad = ngram - 1
    result = []
    for words in data:
        tokens = process_tokens(nltk.word_tokenize(' '.join(words)))
        if not 0 < len(tokens) < 17:
            continue
        result.append(['<h>'] * pad + tokens + ['</h>'] * pad)
    return result

In [3]:
# Load the raw haikus (one list of words per CSV row); the file is read from the working directory
data = get_haiku_data('all_haiku.csv')
# Get haikus data with trigram (each haiku is padded with two '<h>' and two '</h>' tokens)
haikus = read_haikus(data, 3)

### Training word embeddings using Word2Vec

In [4]:
from gensim.models import Word2Vec

# Dimensionality of every word vector produced by Word2Vec
embedding_size = 200

def train_embeddings(data):
    """Train skip-gram Word2Vec embeddings on the given tokenized haikus.

    Parameters:
      data (list): list of list of tokens, one inner list per haiku

    Returns:
      Word2Vec: model trained on `data` with `embedding_size`-dimensional
        vectors, a context window of 5 and no minimum-count cut-off
    """
    # Train on the argument, not on whatever the global `haikus` currently holds
    return Word2Vec(sentences=data, vector_size=embedding_size, window=5, min_count=1,
                    sg=1)
    

# Train the Word2Vec model from Gensim. 
word2vec_model = train_embeddings(haikus)
# Number of distinct tokens the model holds vectors for
vocab_size = len(word2vec_model.wv.index_to_key)
print('Vocab size {}'.format(vocab_size))

Vocab size 32328


# N-gram Language Model

In [5]:
# Find haikus that are similar
def find_similar_haikus(haikus, inputs, embeddings):
    """Find haikus that contain a word similar to any of the given inputs.

    For every input word the 5 nearest neighbours in the embedding space are
    collected; a haiku is kept when it contains at least one of those
    neighbours. The input words themselves are not part of the match set.

    Parameters:
      haikus (list): list of list of processed haikus tokens
      inputs (list): list of words to find neighbours for
      embeddings (Word2Vec): trained word embeddings

    Returns:
      list: the matching haikus, each joined into one space-separated string

    Raises:
      KeyError: propagated from `most_similar` when an input word is not in
        the embedding vocabulary (gensim behaviour)
    """
    similar_words = set()
    for word in inputs:
        # Find top 5 similar words to current word
        similar_words.update(w for w, _ in embeddings.wv.most_similar(word, topn=5))
    # A set makes the per-haiku overlap check a single pass over the haiku
    return [" ".join(haiku) for haiku in haikus if not similar_words.isdisjoint(haiku)]

In [6]:
# Restrict the training corpus to haikus containing a neighbour of 'basketball'
similar_haikus = find_similar_haikus(haikus, ['basketball'], word2vec_model)

# Define new N-gram Language Model object
# (first argument 3 = trigram; the line markers match the padding added by read_haikus)
ngram_lm = ngm.LanguageModel(3, True, line_begin="<" + "h" + ">", line_end="</" + "h" + ">")
# Training the model with haikus similar to inputs
ngram_lm.train(similar_haikus)

# Generate 5 haikus with the query input 'basketball'
for haiku in ngram_lm.generate_haiku(5):
    # each haiku is iterated line by line
    for line in haiku:
        print(line)
    print('\n')

 now we where we
 your story of the playoff this
 playoff looking like a


 am i the only
 and people wonder why i
 out football season


 football is on and
 been telling it not fun to
 gutted to hear this


 targeting is the
 how do the hockey night on
 getting ready for


 not even close
 football practice the coach
 i ca fucking stand




# Recurrent Neural Networks (LSTMs)

In [7]:
def read_embeddings(model, tokenizer):
    '''Build word -> vector and index -> vector lookups from trained embeddings.
    Parameters:
        model: trained Word2Vec model (its `wv` provides the vocabulary and vectors)
        tokenizer: fitted keras Tokenizer whose `word_index` maps word -> integer index
    Returns: (word_to_embedding, index_to_embedding) dictionaries
    Every vocabulary word must be present in `tokenizer.word_index`,
    otherwise the lookup raises KeyError.
    '''
    vectors = model.wv
    index_of = tokenizer.word_index

    word_to_embedding = {word: vectors[word] for word in vectors.index_to_key}
    index_to_embedding = {index_of[word]: vector
                          for word, vector in word_to_embedding.items()}
    return word_to_embedding, index_to_embedding

def padded_data(encoded, seq_length):
    '''Turn encoded haikus into (prefix, next word) training pairs.
    Parameters:
        encoded: list of haikus, each a list of integer word indexes
        seq_length: length of the longest encoded haiku
    Returns: (X, y) where X holds the prefixes pre-padded by pad_sequences
        to seq_length - 1 and y is the list of target word indexes
    '''
    # NOTE(review): the cut points stop at len(row) - 2, so the last token of a
    # haiku is never used as a target -- confirm this is intended.
    cut_points = [(row, cut) for row in encoded for cut in range(1, len(row) - 1)]
    prefixes = [row[:cut] for row, cut in cut_points]
    targets = [row[cut] for row, cut in cut_points]
    return pad_sequences(prefixes, maxlen=seq_length - 1), targets

def data_generator(X: list, y: list, num_sequences_per_batch: int, vocab_size: int, index_to_embedding: dict):
    '''
    Endless batch generator for model.fit.
    https://wiki.python.org/moin/Generators
    https://realpython.com/introduction-to-python-generators/

    Parameters:
        X: sequence of already-embedded input sequences
        y: sequence of integer word indexes, aligned with X
        num_sequences_per_batch: number of instances per yielded batch
        vocab_size: width of the one-hot label vectors
        index_to_embedding: unused; kept so existing callers keep working

    Yields (inputs, outputs) numpy arrays: a slice of X and the matching
    labels as int32 one-hot rows (same encoding as to_categorical).

    Every full batch is yielded in order. When fewer than
    num_sequences_per_batch instances remain, the remainder is dropped and
    the generator starts again from the beginning. Yields nothing when X
    is empty.
    '''
    total = len(X)
    if total == 0:
        return

    start = 0
    while True:
        end = start + num_sequences_per_batch
        # Not enough data left for a full batch: wrap around to the start
        if end > total:
            start, end = 0, num_sequences_per_batch

        inputs = np.array(X[start:end])
        # One-hot encode the whole batch at once
        labels = np.asarray(y[start:end], dtype='int64')
        outputs = np.zeros((len(labels), vocab_size), dtype='int32')
        outputs[np.arange(len(labels)), labels] = 1
        yield inputs, outputs
        start = end


In [8]:
tokenizer = Tokenizer()
# Get haikus data with unigram (ngram=1 adds no '<h>' / '</h>' padding tokens)
# NOTE: this rebinds the global `haikus` used by the cells above
haikus = read_haikus(data, 1)
# Train word embeddings with data vocabulary
word2vec_model = train_embeddings(haikus)
vocab_size = len(word2vec_model.wv.index_to_key)
tokenizer.fit_on_texts(haikus)

# Embeddings
word_to_embedding, index_to_embedding = read_embeddings(word2vec_model, tokenizer)
# Embedding for zero index (index 0 is what pad_sequences fills in): all zeros
index_to_embedding[0] = np.zeros((embedding_size,))
word_to_embedding[''] = np.zeros((embedding_size,))
# Recount so the size includes the extra '' entry added above
vocab_size = len(word_to_embedding.keys())

# Encode words into index
encoded = tokenizer.texts_to_sequences(haikus)
# Longest encoded haiku; inputs are padded to seq_length - 1
seq_length = max([len(sequence) for sequence in encoded])
# Performs pre-padding on training data with a sliding window approach
X_encoded, y = padded_data(encoded, seq_length)

# Convert X into 3D (num_instances, sequence length, embedding_size)
X = np.zeros((len(X_encoded), seq_length - 1, embedding_size))
for i in range(X_encoded.shape[0]):
    for j in range(X_encoded.shape[1]):
        # replace each word index by its embedding vector
        word = X_encoded[i,j]
        X[i, j, :] = index_to_embedding[word]

In [9]:
# Start training the model

# hyperparameters
num_epochs = 5
num_sequences_per_batch = 256
# number of generator batches that make up one epoch
steps_per_epoch = len(X)//num_sequences_per_batch

# Data generator
train_generator = data_generator(X, y, num_sequences_per_batch, vocab_size, index_to_embedding)

# Model architecture
model = Sequential()
# LSTM input layer
model.add(LSTM(128, input_shape=(seq_length - 1, embedding_size),return_sequences=True))
# Dropout layer to prevent overfitting
model.add(Dropout(0.2))
# Bidirectional LSTM layer for extra context
model.add(Bidirectional(LSTM(128)))
model.add(Dropout(0.2))
# Output layer: one softmax score per vocabulary index
model.add(Dense(vocab_size, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

model.fit(x= train_generator,
          steps_per_epoch=steps_per_epoch,
          epochs=num_epochs, verbose = 1)

print(model.output)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Tensor("dense_1/Softmax:0", shape=(None, 32327), dtype=float32)


In [41]:
def generate_line(model: Sequential, 
                 tokenizer: Tokenizer, 
                 seed: list, 
                 syllable_limit: int):
    ''' Generate a line from the model based on the given syllable limit
    Parameters:
        model: your neural network
        tokenizer: the keras preprocessing tokenizer
        seed: list whose first element is the word the line starts with
        syllable_limit: number of syllables the finished line should have
    Returns: string sentence

    Words are sampled from the model's predicted distribution. Whenever a
    sampled word would overshoot the limit, the line is restarted from the
    seed word. If the seed word alone already reaches or exceeds the limit,
    it is returned unchanged (the limit cannot be met exactly in that case).
    '''
    query = seed[0]
    seed_syllables = syllables.estimate(query)
    sentence = [query]
    count_syllables = seed_syllables
    
    # `<` rather than `!=`: a seed longer than the limit must not loop forever
    while count_syllables < syllable_limit:
        # encode the tokens generated so far
        sequence = tokenizer.texts_to_sequences([sentence])[0]
        # pre-padding our tokens
        sequence = np.array(pad_sequences([sequence], maxlen = seq_length-1, padding='pre'))
        # Convert into 3D in order to feed to NN for prediction
        embeddings = np.zeros((sequence.shape[0], sequence.shape[1], embedding_size))
        for i, row in enumerate(sequence):
            for j, word in enumerate(row):
                embeddings[i, j, :] = index_to_embedding[word]
    
        # get probability distribution (float64 copy so renormalising is exact enough)
        probs = np.array(model.predict(embeddings)[0], dtype='float64')
        # index 0 is the padding slot and has no entry in tokenizer.index_word
        probs[0] = 0.0
        # normalize probabilities and get index
        random_choice = np.random.choice(len(probs), p = probs / np.sum(probs))
        next_word = tokenizer.index_word[random_choice]
        # Count new syllables
        new_count = syllables.estimate(next_word) + count_syllables
        
        # if next word is not haiku begin or end token and under syllable limit
        if next_word not in ['<h>','</h>'] and (new_count <= syllable_limit):
            sentence.append(next_word)
            count_syllables = new_count
        else:
            # restart until we find matching syllable
            sentence = [query]
            count_syllables = seed_syllables
    return ' '.join(sentence)

In [42]:
def generate_haiku(n, query):
    """Generates n haikus from the trained neural language model
    Parameters:
      n (int): the number of haikus to generate
      query (str): the word the first line of every haiku starts with

    Returns:
      list: a list of haikus, each a list of three line strings
    """
    haikus = []
    for _ in range(n):
        # Generate each line of haikus with 5-7-5 syllable limit pattern;
        # each following line is seeded with the last word of the previous one
        first = generate_line(model, tokenizer, [query], 5)
        second = generate_line(model, tokenizer, [first.split()[-1]], 7)
        third = generate_line(model, tokenizer, [second.split()[-1]], 5)
        haikus.append([first, second, third])
    return haikus

In [43]:
# Generate and print 5 haikus that all start with the word 'wind'
for haiku in generate_haiku(5, 'wind'):
    for line in haiku:
        print(line)
    # blank lines between haikus
    print('\n')

wind might be making
making someone big health
health is the best part


wind have all bitter
bitter horse moon group of
of course that be


wind by the first of
of course maybe now i
i want to get so


wind work no bear at
at least why i want a card
card if your skill is


wind birthday story
story like it so hard to
to stay at checking




# Plotly Dashboard

In [14]:
# libraries for ai image
import json
import requests
from PIL import Image
import io
import re

# libraries for ner
import spacy
from spacy import displacy

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
stop_words = set(stopwords.words('english'))

In [15]:
API_TOKEN = ""  # token in case you want to use private API
# HTTP headers sent with every image request; the Authorization line is
# disabled, so requests go out unauthenticated.
# NOTE(review): the two X-* headers presumably ask the hosted inference API to
# wait for the model to load and to skip cached results -- confirm against the API docs.
headers = {
    # "Authorization": f"Bearer {API_TOKEN}",
    "X-Wait-For-Model": "true",
    "X-Use-Cache": "false"
}

# Endpoint of the text-to-image model used for the haiku illustrations
API_URL = "https://api-inference.huggingface.co/models/runwayml/stable-diffusion-v1-5"
# spaCy pipeline used by get_image to extract named entities
NER = spacy.load("en_core_web_sm")

def query(payload):
    """POST `payload` as JSON to the image API and return the resulting image.

    Parameters:
      payload (dict): request body, e.g. {"inputs": "<prompt>"}

    Returns:
      PIL.Image.Image: the image decoded from the response body

    Raises:
      requests.HTTPError: when the API answers with a 4xx/5xx status
    """
    data = json.dumps(payload)
    response = requests.request("POST", API_URL, headers=headers, data=data)
    # Fail with the HTTP error instead of handing an error body to Image.open
    response.raise_for_status()
    return Image.open(io.BytesIO(response.content))


def slugify(text):
    """Strip punctuation from `text` and replace whitespace runs with '-'.

    Word characters (letters, digits, underscore) are kept as they are.
    """
    # drop everything that is neither a word character nor whitespace
    without_punctuation = re.compile(r"[^\w\s]").sub("", text)
    # collapse each run of whitespace into a single dash
    return re.compile(r"\s+").sub("-", without_punctuation)

In [16]:
def get_image(prompt, mdl):
    """Generate an image for a haiku and save it as image_<mdl>.png.

    The image prompt is the named entities found in `prompt`; when there are
    none, the haiku with its stop words removed is used instead.

    Parameters:
      prompt (str): the haiku text
      mdl (str): label inserted into the output file name
    """
    # https://www.analyticsvidhya.com/blog/2021/06/nlp-application-named-entity-recognition-ner-in-python-with-spacy/
    doc = NER(prompt)

    # every entity text followed by a single space
    entity_text = "".join(entity.text + " " for entity in doc.ents)

    if entity_text == "":
        # no entities: fall back to the stopword-removed haiku
        kept_words = [w for w in prompt.split(" ") if w not in stop_words]
        prompt = " ".join(kept_words)
    else:
        prompt = entity_text

    # save img
    query({"inputs": prompt}).save(f"image_{mdl}.png")

In [18]:
def return_haikus(word, model):
    """Generate one haiku about `word` with the requested model.

    Parameters:
      word (str): topic word the haiku is generated for
      model (str): 'ngram' for the n-gram language model, 'rnn' for the LSTM

    Returns:
      list: a one-element list holding the generated haiku

    Raises:
      ValueError: if `model` is neither 'ngram' nor 'rnn'
    """
    if model == 'ngram':
        similar_haikus = find_similar_haikus(haikus, [word], word2vec_model)

        # Define new N-gram Language Model object
        ngram_lm = ngm.LanguageModel(2, True, line_begin="<" + "h" + ">", line_end="</" + "h" + ">")

        # Training the model with haikus similar to inputs
        ngram_lm.train(similar_haikus)

        return ngram_lm.generate_haiku(1)
    if model == "rnn":
        return generate_haiku(1, word)
    # Previously fell through and failed with an UnboundLocalError on `haiku`
    raise ValueError(f"Unknown model {model!r}; expected 'ngram' or 'rnn'")

In [19]:
def _png_data_uri(path):
    """Return the PNG file at `path` as a base64 data URI usable as an <img> src."""
    # function-scope import keeps this helper self-contained
    import base64

    with open(path, 'rb') as png_file:
        encoded = base64.b64encode(png_file.read()).decode('ascii')
    return 'data:image/png;base64,{}'.format(encoded)


def main():
    """Build the haiku dashboard and start the Dash server.

    The page has a text input and a search button. Clicking the button
    generates one haiku with the n-gram model and one with the LSTM for the
    entered word, requests an illustration for each, and shows both.

    NOTE(review): JupyterDash, dbc, dcc, html, Input and Output are not
    imported in the cells visible here -- presumably a cell that is not
    shown imports them; confirm before running.
    """
    # stylesheet with the .dbc class
    dbc_css = "https://cdn.jsdelivr.net/gh/AnnMarieW/dash-bootstrap-templates/dbc.min.css"
    app = JupyterDash(__name__, external_stylesheets=[dbc.themes.JOURNAL, dbc.themes.BOOTSTRAP, dbc_css])

    channel_input = dcc.Input(
        id="input-value",
        type="text",
        value="",
        size="lg",
        style={"font-size": "1.6rem", "margin-top": ".5px"},
        className="mb-3"
    )
    button = dbc.Button(
        id="search-button",
        children="Search",
        n_clicks=0,
        size="lg",
        style={"font-size": "1.2rem", "margin-left": "12px", "margin-top": "-8px"},
        color="primary",
        className="me-1",
    )

    header = html.H1("NN and N-Gram Haiku Generator",
                     style={"margin-top": "50px"})

    caption = html.H6("Generate Haikus by Topic",
                      style={"margin-top": "10px"})

    ngram = html.H6("N-Gram Generated Haiku",
                    style={"margin-top": "10px", "font-weight": "bold"})

    lstm = html.H6("LSTM Generated Haiku",
                   style={"margin-top": "10px", "font-weight": "bold"})

    # Placeholders for the three lines of the n-gram haiku
    five1 = html.H6(id="firstline",
                    children="11111111",
                    style={"margin-top": "10px"})

    seven1 = html.H6(id="secondline",
                     children="2222222222",
                     style={"margin-top": "10px"})

    five2 = html.H6(id="thirdline",
                    children="33333333333",
                    style={"margin-top": "10px"})

    # Placeholders for the three lines of the LSTM haiku
    five12 = html.H6(id="firstline2",
                     children="11111111",
                     style={"margin-top": "10px"})

    seven12 = html.H6(id="secondline2",
                      children="2222222222",
                      style={"margin-top": "10px"})

    five22 = html.H6(id="thirdline2",
                     children="33333333333",
                     style={"margin-top": "10px"})

    img1 = html.Img(id="image1", src="")
    img2 = html.Img(id="image2", src="")

    collapse1 = html.Div(
        [
            dbc.Collapse(
                dbc.Card(dbc.CardBody([ngram, html.Div(className='gap'), five1, html.Div(className='gap'), seven1,
                                       html.Div(className='gap'), five2, html.Div(className='gap'), img1])),
                id="collapse1",
                is_open=True,
            ),
        ]
    )
    collapse2 = html.Div(
        [
            dbc.Collapse(
                dbc.Card(dbc.CardBody([lstm, html.Div(className='gap'), five12, html.Div(className='gap'), seven12,
                                       html.Div(className='gap'), five22, html.Div(className='gap'), img2])),
                id="collapse2",
                is_open=True,
            ),
        ]
    )

    app.layout = dbc.Container(
        [

        # top line of Dash
        dbc.Row([
            dbc.Col(
                [header, caption, channel_input, button, collapse1, collapse2],

                lg=6
            )
        ],
            justify="center",
            style=dict(textAlign="center"),
            className="d-flex justify-content-center",
        ),],
        className="p-4",
        fluid=True)

    @app.callback(
        Output("search-button", "style"),
        Input("input-value", "value"),
    )
    def change_button_color(channel_input):
        # red while there is text to search for, gray when the box is empty
        if channel_input != "":
            return {"font-size": "1.2rem", "margin-left": "12px", "margin-top": "-8px", "background-color": "red"}
        else:
            return {"font-size": "1.2rem", "margin-left": "12px", "margin-top": "-8px", 'background-color': 'gray'}

    @app.callback(
        Output("search-button", "n_clicks"),
        Output('firstline', 'children'),
        Output('secondline', 'children'),
        Output('thirdline', 'children'),
        Output('image1', 'src'),
        Output('firstline2', 'children'),
        Output('secondline2', 'children'),
        Output('thirdline2', 'children'),
        Output('image2', 'src'),
        Input("search-button", "n_clicks"),
        Input("input-value", "value"),
    )
    def init_countdown_store(n_clicks, search_results):
        # Defaults shown until a search has been run
        lines = ["", "", ""]
        imgsrc_ngram = ""
        lines2 = ["", "", ""]
        imgsrc_lstm = ""

        # Only generate on a button click with a non-empty query; the click
        # counter is reset to 0 below, so typing alone does not trigger a run
        if n_clicks > 0 and search_results:
            # Ngram model result
            lines = return_haikus(search_results, "ngram")[0]
            # join with spaces so words of adjacent lines do not run together
            get_image(" ".join(lines[:3]), "ngram")
            imgsrc_ngram = _png_data_uri("image_ngram.png")

            # RNN model result: prompt and image come from the LSTM haiku
            lines2 = return_haikus(search_results, "rnn")[0]
            get_image(" ".join(lines2[:3]), "lstm")
            imgsrc_lstm = _png_data_uri("image_lstm.png")
        return 0, lines[0], lines[1], lines[2], imgsrc_ngram, lines2[0], lines2[1], lines2[2], imgsrc_lstm

    app.run_server(debug=True)

In [None]:
# run server: start the dashboard when this cell/script is executed directly
if __name__ == "__main__":
    main()