# DS 4400 Final Project : Haiku Generator

#### Ben Tunney, Glen Damian Lim

#### Datasets : https://www.kaggle.com/datasets/hjhalani30/haiku-dataset (English haikus)

#### Word Embeddings: GloVe from https://nlp.stanford.edu/projects/glove/ (choose Wikipedia 2014 + Gigaword 5)

#### NLP models: N-gram Language Model, Recurrent Neural Network, Transformers

In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

# NLP libraries
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
# nltk.download('punkt')
# nltk.download('wordnet')
# nltk.download('omw-1.4')
# nltk.download('stopwords')

# Neural Networks libraries
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Outside Files
import ngram_model as ngm

2023-04-17 23:14:06.588359: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2, in other operations, rebuild TensorFlow with the appropriate compiler flags.


### Getting data and text pre-processing 

In [2]:
# def train_embeddings(fname):
#     # Pre-trained GloVe word embeddings
#     embeddings_dict = {}
#     with open(fname, 'r') as f:
#         for line in f:
#             values = line.split()
#             word = values[0]
#             vector = np.asarray(values[1:], "float32")
#             embeddings_dict[word] = vector
#     return embeddings_dict

# Read file and get data
def get_haiku_data(fname):
    """Load haikus from a CSV file and split each one into whitespace tokens.

    The three lines of every haiku are read from the columns named '0', '1'
    and '2', joined with single spaces and split on whitespace.

    Parameters:
      fname (str): path to the haiku CSV file

    Returns:
      list: one list of raw (unprocessed) word strings per CSV row
    """
    df = pd.read_csv(fname)
    # A missing line is read as NaN; concatenating NaN with the other lines
    # turns the whole haiku into NaN, which str() renders as the bogus token
    # 'nan'. Treat a missing line as empty so the remaining lines are kept.
    lines = df[['0', '1', '2']].fillna('').astype(str)
    sentences = lines['0'] + ' ' + lines['1'] + ' ' + lines['2']
    return [sentence.split() for sentence in sentences]

# Shared WordNet lemmatizer instance; process_tokens uses it to lemmatize each kept token
lm = WordNetLemmatizer()

def contains_special(word):
    """Tell whether a word holds any numeric or non-alphanumeric character.

    Parameters:
      word (str): the token to inspect

    Returns:
      bool: True if at least one character is numeric or is not alphanumeric
      (punctuation, whitespace, symbols); False otherwise, including for ''
    """
    return any(ch.isnumeric() or not ch.isalnum() for ch in word)

# process tokens
def process_tokens(toks):
    """Lowercase and lemmatize tokens, dropping those flagged by contains_special.

    Parameters:
      toks (list): list of token strings

    Returns:
      list: lemmatized, lowercased tokens in their original order
    """
    kept = []
    for tok in toks:
        # skip any token holding a numeric or non-alphanumeric character
        # (not just tokens that are entirely numeric)
        if contains_special(tok):
            continue
        kept.append(lm.lemmatize(tok.lower()))
    return kept

def read_haikus(data, ngram):
    """Tokenize, clean and pad every haiku for an n-gram model.

    Parameters:
      data (list): list of list of raw words, one inner list per haiku
      ngram (int): n-gram order; each haiku gets ngram-1 '<h>' markers in
        front and ngram-1 '</h>' markers at the end

    Returns:
      list: list of list of processed tokens; haikus left with no tokens
      after processing are omitted
    """
    pad = ngram - 1
    padded_haikus = []
    for words in data:
        tokens = process_tokens(nltk.word_tokenize(' '.join(words)))
        # nothing survived the cleaning step, so there is nothing to pad
        if not tokens:
            continue
        padded_haikus.append(['<h>'] * pad + tokens + ['</h>'] * pad)
    return padded_haikus

In [3]:
# Load the raw haikus: one list of whitespace-split words per CSV row
data = get_haiku_data('data/haiku/all_haiku.csv')
# Get haikus data with trigram: tokens are cleaned and each haiku is padded
# with ngram-1 = 2 '<h>' / '</h>' markers on either side
haikus = read_haikus(data, 3)
# embeddings = train_embeddings('glove.6B/glove.6B.100d.txt')

### Training word embeddings using Word2Vec

In [4]:
from gensim.models import Word2Vec

# Length of each word vector; passed to Word2Vec as vector_size
embedding_size = 200

def train_embeddings(data):
    """Train Word2Vec embeddings on tokenized sentences.

    Parameters:
      data (list): list of list of tokens, one inner list per sentence

    Returns:
      Word2Vec: model trained on `data` with vectors of length `embedding_size`
    """
    # Train on the argument rather than the global `haikus`, so the function
    # uses whatever corpus the caller passes in.
    return Word2Vec(sentences=data, vector_size=embedding_size, window=5,
                    min_count=1, sg=1)
    

# Train the Word2Vec model from Gensim on the processed haikus.
word2vec_model = train_embeddings(haikus)
# Vocabulary size = number of entries in the model's index_to_key list
vocab_size = len(word2vec_model.wv.index_to_key)
print('Vocab size {}'.format(vocab_size))

Vocab size 41376


# N-gram Language Model

In [5]:
# Find haikus that are similar
def find_similar_haikus(haikus, inputs, embeddings, topn=5):
    """Find haikus that contain a word similar to one of the given inputs.

    For every input word the `topn` most similar words are looked up in the
    embeddings; a haiku is selected when it contains at least one of them.
    The input words themselves are not added to the set of words to match.

    Parameters:
      haikus (list): list of list of processed haikus tokens
      inputs (list): list of words whose neighbours should be matched
      embeddings (Word2Vec): trained word embeddings
      topn (int): number of similar words to look up per input word

    Returns:
      list: the selected haikus in their original order, each joined into a
      single space-separated string

    NOTE(review): an input word missing from the embedding vocabulary is
    presumably rejected by most_similar (KeyError) - not handled here.
    """
    # A set gives constant-time membership tests and removes duplicates when
    # several inputs share a neighbour.
    similar_words = set()
    for word in inputs:
        similar_words.update(
            neighbour
            for neighbour, _score in embeddings.wv.most_similar(word, topn=topn)
        )
    return [
        " ".join(haiku)
        for haiku in haikus
        if not similar_words.isdisjoint(haiku)
    ]

In [6]:
# Select the haikus containing one of the words most similar to 'basketball'
similar_haikus = find_similar_haikus(haikus, ['basketball'], word2vec_model)

# Define new N-gram Language Model object
# The begin/end markers evaluate to '<h>' and '</h>', the padding tokens added by read_haikus
# NOTE(review): the arguments 2 and True are presumably the n-gram order and a
# smoothing flag - confirm against ngram_model.LanguageModel
ngram_lm = ngm.LanguageModel(2, True, line_begin="<" + "h" + ">", line_end="</" + "h" + ">")
# Training the model with haikus similar to inputs
ngram_lm.train(similar_haikus)

# Print each generated haiku line by line, with blank lines between haikus
for haiku in ngram_lm.generate_haiku(5):
    for line in haiku:
        print(line)
    print('\n')

 he wa fun to a
 the hockey tonight for you
 no good luck at a


 alright with these
 baker mayfield is like it
 i watching that the


 hockey game tonight
 breaking a hockey team in
 football late and im


 i need someone
 what they get the packer wo
 how i wa going so


 yes sunday filled
 im taking the panther best
 it friday night they




# Recurrent Neural Networks (LSTMs)

In [7]:
def read_embeddings(model, tokenizer):
    '''Build embedding lookup tables from a trained Word2Vec model.

    Parameters:
      model (Word2Vec): trained model; its vocabulary is read from
        model.wv.index_to_key and its vectors from model.wv[word]
      tokenizer (Tokenizer): fitted tokenizer whose word_index maps each
        word to its integer index

    Returns:
      tuple: (word_to_embedding, index_to_embedding) - two dicts holding the
      same vectors, keyed by word and by the tokenizer index of that word

    A vocabulary word that is absent from tokenizer.word_index raises KeyError.
    '''
    vectors = model.wv
    indices = tokenizer.word_index

    word_to_embedding = {token: vectors[token] for token in vectors.index_to_key}
    index_to_embedding = {
        indices[token]: vector for token, vector in word_to_embedding.items()
    }
    return word_to_embedding, index_to_embedding

def data_generator(X: list, y: list, num_sequences_per_batch: int, vocab_size: int, index_to_embedding: dict):
    '''
    Endlessly yield (inputs, labels) training batches, e.g. for model.fit.
    https://wiki.python.org/moin/Generators
    https://realpython.com/introduction-to-python-generators/

    Parameters:
      X: training sequences, one row per instance. Rows of token indices (2D)
         are converted to embeddings through index_to_embedding; input that
         already holds embedding vectors (3D) is passed through unchanged.
      y: one integer label (token index) per row of X
      num_sequences_per_batch: number of rows per yielded batch
      vocab_size: length of each one-hot label vector
      index_to_embedding: maps a token index to its embedding vector

    Yields:
      tuple: (inputs, labels) numpy arrays. inputs is float32 with shape
      (batch, sequence length, embedding size); labels is an int32 one-hot
      matrix with shape (batch, vocab_size).

    Raises:
      ValueError: when the first batch is requested and
        num_sequences_per_batch is smaller than 1

    The generator never ends for non-empty X: once fewer rows than a full
    batch remain it restarts from the first row, so a trailing partial batch
    is skipped. Empty X yields nothing.
    '''
    if num_sequences_per_batch < 1:
        raise ValueError('num_sequences_per_batch must be at least 1')
    num_instances = len(X)
    if num_instances == 0:
        return

    start = 0
    while True:
        # Wrap only when a full batch no longer fits; a batch that ends
        # exactly on the last row is still yielded.
        if start + num_sequences_per_batch > num_instances:
            start = 0
        stop = start + num_sequences_per_batch

        batch = np.asarray(X[start:stop])
        if batch.ndim == 2:
            # rows of token indices -> rows of embedding vectors
            inputs = np.array(
                [[index_to_embedding[int(index)] for index in sequence]
                 for sequence in batch],
                dtype='float32')
        else:
            inputs = batch.astype('float32')

        # One-hot encode the labels with numpy (no Keras helper required)
        labels = np.asarray(y[start:stop], dtype=int)
        outputs = np.zeros((len(labels), vocab_size), dtype='int32')
        outputs[np.arange(len(labels)), labels] = 1

        yield inputs, outputs
        start = stop


In [8]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer

# Fit the tokenizer on the processed haikus to build its word -> index mapping
tokenizer = Tokenizer()
tokenizer.fit_on_texts(haikus)
# Encode words into index
encoded = tokenizer.texts_to_sequences(haikus)

# Making sure we have same sequence length for LSTM:
# every sequence is padded at its end ('post') up to the longest haiku's length
max_seq_length = max([len(sequence) for sequence in encoded])
encoded = pad_sequences(encoded, maxlen=max_seq_length, padding='post')

In [9]:
# Embeddings: lookup tables from word and from tokenizer index to Word2Vec vector
word_to_embedding, index_to_embedding = read_embeddings(word2vec_model, tokenizer)
# Embedding for zero index (the value pad_sequences fills short sequences with)
index_to_embedding[0] = np.zeros((embedding_size,))
# Tokenizer indices start at 1 and index 0 is the padding value, so one-hot
# labels and the softmax layer need one more class than there are words.
vocab_size = len(word_to_embedding) + 1

In [None]:
# Model inputs: every position of each padded sequence except the last one
X = encoded[:, :-1]


# Convert X into 3D (num_instances, sequence length, embedding_size)

# X = np.zeros((X_encoded.shape[0], X_encoded.shape[1], embedding_size))
# for i in range(X_encoded.shape[0]):
#     for j in range(X_encoded.shape[1]):
#         word = X_encoded[i,j]
#         X[i, j, :] = index_to_embedding[word]

# Targets: the index stored at the final position of each padded sequence.
# NOTE(review): sequences are post-padded, so for every haiku shorter than the
# longest one this value is the padding index 0 - confirm this is the intended label.
y = [val[-1] for val in encoded]

In [None]:
# Start training the model

# hyperparameters
# NOTE: hidden_units is not used by the model defined below
hidden_units = 100
num_epochs = 5
num_sequences_per_batch = 128
# Number of whole batches in one pass over the data
steps_per_epoch = len(X)//num_sequences_per_batch

# Data generator
train_generator = data_generator(X, y, num_sequences_per_batch, vocab_size, index_to_embedding)

# Sequential, LSTM and Dense are not imported as bare names, so they are
# reached through the `keras` and `layers` modules imported at the top.
model = keras.Sequential()
# y holds a single label per sequence, so the LSTM returns only its final
# output (no return_sequences): the network output is then (batch, vocab_size),
# the same shape as the one-hot labels.
model.add(layers.LSTM(512, input_shape=(max_seq_length - 1, embedding_size)))
# Fully connected layer
model.add(layers.Dense(64, activation='relu'))
# One probability per class
model.add(layers.Dense(vocab_size, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

print(model.output)
model.fit(x = train_generator,
          steps_per_epoch=steps_per_epoch,
          epochs=num_epochs)

print(model.output)