# DS 4400 Final Project : Haiku Generator

#### Ben Tunney, Glen Damian Lim

#### Datasets : https://www.kaggle.com/datasets/hjhalani30/haiku-dataset (English haikus)

#### Word Embeddings: GloVe from https://nlp.stanford.edu/projects/glove/ (choose Wikipedia 2014 + Gigaword 5)

#### NLP models: N-gram Language Model, Recurrent Neural Network, Transformers

In [3]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

# NLP libraries
import nltk
from nltk.corpus import stopwords, cmudict
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer

# Neural Networks libraries
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.sequence import pad_sequences

from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.layers import Dense, LSTM, Embedding, Dropout
from keras.models import Sequential

# Outside Files
# import ngram_model as ngm

### Getting data and text pre-processing 

In [4]:
# Read file and get data
def get_haiku_data(fname):
    df = pd.read_csv(fname)
    sentences = df['0'] + ' ' + df['1'] + ' ' + df['2'] + ' ' 
    data = [str(sentence).split() for sentence in sentences]
    return data

# lemmatizer
lm = WordNetLemmatizer()

def contains_special(word):
    for char in word:
        if char.isnumeric() or (not char.isalnum()):
            return True
    return False

# process tokens
def process_tokens(toks):
    toks = [lm.lemmatize(word.lower()) for word in toks 
          # make sure no strings that contain only numeric characters 
          if not contains_special(word)]
    return toks

def read_haikus(data, ngram):
    result = []
    for sentences in data:
        toks = nltk.word_tokenize(' '.join([word for word in sentences]))
        processed = process_tokens(toks)
        if len(processed) != 0 and len(processed) < 17:
            processed = ['<h>'] * (ngram-1) + processed + ['</h>'] * (ngram-1)
            result.append(processed)
    return result

# create an instance of the CMUDict
syllable = cmudict.dict()
def estimate_syllables(word):
    try:
        count = [len(list(y for y in x if y[-1].isdigit())) for x in syllable[word.lower()]]
        return count
    except KeyError:
        return 100
    
!unzip /usr/share/nltk_data/corpora/wordnet.zip -d /usr/share/nltk_data/corpora/

Archive:  /usr/share/nltk_data/corpora/wordnet.zip
   creating: /usr/share/nltk_data/corpora/wordnet/
  inflating: /usr/share/nltk_data/corpora/wordnet/lexnames  
  inflating: /usr/share/nltk_data/corpora/wordnet/data.verb  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.adv  
  inflating: /usr/share/nltk_data/corpora/wordnet/adv.exc  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.verb  
  inflating: /usr/share/nltk_data/corpora/wordnet/cntlist.rev  
  inflating: /usr/share/nltk_data/corpora/wordnet/data.adj  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.adj  
  inflating: /usr/share/nltk_data/corpora/wordnet/LICENSE  
  inflating: /usr/share/nltk_data/corpora/wordnet/citation.bib  
  inflating: /usr/share/nltk_data/corpora/wordnet/noun.exc  
  inflating: /usr/share/nltk_data/corpora/wordnet/verb.exc  
  inflating: /usr/share/nltk_data/corpora/wordnet/README  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.sense  
  inflating: /usr/share/nltk_data

In [5]:
data = get_haiku_data('/kaggle/input/haiku-dataset/all_haiku.csv')
# Get haikus data with trigram
# haikus = read_haikus(data, 3)

### Training word embeddings using Word2Vec

In [8]:
from gensim.models import Word2Vec

embedding_size = 200

def train_embeddings(data):
    return Word2Vec(sentences=haikus, vector_size=embedding_size, window=5, min_count=1, 
                 sg=1)
    

# # Train the Word2Vec model from Gensim. 
# word2vec_model = train_embeddings(haikus)
# vocab_size = len(word2vec_model.wv.index_to_key)
# print('Vocab size {}'.format(vocab_size))

# N-gram Language Model

In [5]:
# Find haikus that are similar
def find_similar_haikus(haikus, inputs, embeddings):
    """Find haikus that contain words from the given inputs
    Parameters:
      haikus (list): list of list of processed haikus tokens
      inputs (list): list of words to match
      embeddings (Word2Vec): trained word embeddings

    Returns:
      list: list of list of processed haikus tokens that contain words from the given inputs
    """
    similar_words = []
    for word in inputs:
        # Find top 5 similar words to current word
        find_similar = [similar_words.append(w) for w,s in embeddings.wv.most_similar(word, topn=5)]
    training_haikus = []
    for haiku in haikus:
        if any(word in haiku for word in similar_words):
            training_haikus.append(haiku)
    return [" ".join(haiku) for haiku in training_haikus]

In [6]:
similar_haikus = find_similar_haikus(haikus, ['basketball'], word2vec_model)

# Define new N-gram Language Model object
ngram_lm = ngm.LanguageModel(3, True, line_begin="<" + "h" + ">", line_end="</" + "h" + ">")
# Training the model with haikus similar to inputs
ngram_lm.train(similar_haikus)

for haiku in ngram_lm.generate_haiku(5):
    for line in haiku:
        print(line)
    print('\n')

NameError: name 'ngm' is not defined

# Recurrent Neural Networks (LSTMs)

In [6]:
def read_embeddings(model, tokenizer):
    '''Loads and parses embeddings trained in earlier.
    Parameters and return values are up to you.
    '''
    vocab = list(model.wv.index_to_key)
    word_to_index = tokenizer.word_index

    word_to_embedding = {}
    index_to_embedding = {}

    for word in vocab:
        embedding = model.wv[word]
        word_to_embedding[word] = embedding
        index_to_embedding[word_to_index[word]] = embedding
    return word_to_embedding, index_to_embedding

# Produced pre-padded data for LSTM network
def padded_data(encoded, seq_length):
    X = []
    y = []
    for row in encoded:
        for i in range(1, len(row) - 1):
            X.append(row[:i])
            y.append(row[i])
    X = pad_sequences(X, maxlen = seq_length - 1)
    return X, y

def data_generator(X: list, y: list, num_sequences_per_batch: int, vocab_size: int, index_to_embedding: dict) -> (list,list):
    '''
    Returns data generator to be used by feed_forward
    https://wiki.python.org/moin/Generators
    https://realpython.com/introduction-to-python-generators/
    
    Yields batches of embeddings and labels to go with them.
    Use one hot vectors to encode the labels 
    (see the to_categorical function)

    '''
    # inputs
    i = 0
    while i < len(X):
        end_index = i + num_sequences_per_batch
        # if we ran out of data
        if end_index >= len(X) - 1:
            i = 0
            end_index = i + num_sequences_per_batch
        
        inputs = [val for val in X[i:end_index]]
        # outputs into one hot encoding
        outputs = [to_categorical(val, vocab_size, dtype = 'int32') for val in y[i:end_index]]
        yield np.array(inputs), np.array(outputs)
        i += num_sequences_per_batch

In [None]:
import math
tokenizer = Tokenizer()
haikus = read_haikus(data, 1)
# Using 50% of training data due to limited RAM
haikus = haikus[:math.floor(len(haikus) * 0.50)]
word2vec_model = train_embeddings(haikus)
vocab_size = len(word2vec_model.wv.index_to_key)
tokenizer.fit_on_texts(haikus)

# Embeddings
word_to_embedding, index_to_embedding = read_embeddings(word2vec_model, tokenizer)
# Embedding for zero index
index_to_embedding[0] = np.zeros((embedding_size,))
word_to_embedding[''] = np.zeros((embedding_size,))
vocab_size = len(word_to_embedding.keys())

# Encode words into index
encoded = tokenizer.texts_to_sequences(haikus)
seq_length = 10
# Padded data along with sliding window
X_encoded, y = padded_data(encoded, seq_length)

# Convert X into 3D (num_instances, sequence length, embedding_size)
X = np.zeros((len(X_encoded), seq_length - 1, embedding_size))
for i in range(X_encoded.shape[0]):
    for j in range(X_encoded.shape[1]):
        word = X_encoded[i,j]
        X[i, j, :] = index_to_embedding[word]

In [None]:
# Start training the model

# hyperparameters
num_epochs = 10
num_sequences_per_batch = 128
steps_per_epoch = len(encoded)//num_sequences_per_batch

# Data generator
train_generator = data_generator(X,y, num_sequences_per_batch, vocab_size, index_to_embedding)

model = Sequential()
# LSTM layer
model.add(LSTM(512, input_shape=(seq_length - 1, embedding_size),return_sequences=True))
# Dropout layer to prevent overfitting
model.add(Dropout(0.2))
model.add(LSTM(256, input_shape=(seq_length - 1, embedding_size),return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(256, input_shape=(seq_length - 1, embedding_size),return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(128, input_shape=(seq_length - 1, embedding_size),return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(32))
model.add(Dropout(0.2))
model.add(Dense(vocab_size, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()
model.fit(x= train_generator,
          steps_per_epoch=steps_per_epoch,
          epochs=num_epochs, verbose = 1)

print(model.output)

In [None]:
# generate a sequence from the model
def generate_seq(model: Sequential, 
                 tokenizer: Tokenizer, 
                 seed: list, 
                 syllable_limit: int):
    '''
    Parameters:
        model: your neural network
        tokenizer: the keras preprocessing tokenizer
        seed: [w1, w2, w(n-1)]
        n_words: generate a sentence of length n syllable
    Returns: string sentence
    '''
    sentence = seed
    i = 0
    count_syllables = 0
    while count_syllables != syllable_limit:
        # n-1 tokens in sentence
        curr_tokens = sentence
        # encode our tokens
        sequence = tokenizer.texts_to_sequences([curr_tokens])[0]
        # pre-padding our tokens
        sequence = np.array(pad_sequences([sequence], maxlen = seq_length-1, padding='pre'))
        # Convert into 3D
        embeddings = np.zeros((sequence.shape[0], sequence.shape[1], embedding_size))
        for i in range(sequence.shape[0]):
            for j in range(sequence.shape[1]):
                word = sequence[i,j]
                embeddings[i, j, :] = index_to_embedding[word]
        # get probability distribution
        probs = model.predict(embeddings)[0][2:]
        # normalize probabilities and get index
        random_choice = np.random.choice(len(probs),p = probs / np.sum(probs))
        if random_choice != 0:
            next_word = tokenizer.index_word[random_choice + 3]
            # Count new syllables
            new_count = syllables.estimate(next_word) + count_syllables
            if next_word not in ['<h>','</h>'] and (new_count <= syllable_limit):
                sentence.append(next_word)
                count_syllables = new_count
        else:
            sentence = seed
            count_syllables = 0
    return sentence

In [12]:
seed = # USER QUERY HERE as a list of one string (EX: ['wind'])
def generate_haiku(seed):
    """Generates n haikus from a trained language model
    Parameters:
      n (int): the number of haikus to generate

    Returns:
      list: a list containing strings, one per generated sentence
    """
    haiku = []
    line_1 = generate_seq(model, tokenizer, seed, 5).split(self.line_begin)[-1]
    line_2 = generate_seq(model, tokenizer, line_1[-1], 5).split(self.line_begin)[-1]
    line_3 = generate_seq(model, tokenizer, line_2[-1], 5).split(self.line_begin)[-1]
    haiku.append(line_1)
    haiku.append(line_2)
    haiku.append(line_3)
    haikus.append(haiku)
    return haiku

216


NameError: name 'syllables' is not defined

In [None]:
for haiku in generate_haiku(5):
    for line in haiku:
        print(line)
    print('\n')