# DS 4400 Final Project : Haiku Generator

#### Ben Tunney, Glen Damian Lim

#### Dataset: https://www.kaggle.com/datasets/hjhalani30/haiku-dataset (English haikus)

#### Word Embeddings: GloVe from https://nlp.stanford.edu/projects/glove/ (use the Wikipedia 2014 + Gigaword 5 vectors)

#### NLP models: N-gram Language Model, Recurrent Neural Network, Transformers

In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

# NLP libraries
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
# nltk.download('punkt')
# nltk.download('wordnet')
# nltk.download('omw-1.4')
# nltk.download('stopwords')

# Neural Networks libraries
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Outside Files
import ngram_model as ngm

2023-04-17 20:27:44.587560: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2, in other operations, rebuild TensorFlow with the appropriate compiler flags.


### Getting data and text pre-processing 

In [2]:
# def train_embeddings(fname):
#     # Pre-trained GloVe word embeddings
#     embeddings_dict = {}
#     with open(fname, 'r') as f:
#         for line in f:
#             values = line.split()
#             word = values[0]
#             vector = np.asarray(values[1:], "float32")
#             embeddings_dict[word] = vector
#     return embeddings_dict

# Read file and get data
def get_haiku_data(fname):
    """Load haikus from a CSV file and split each one into whitespace tokens.

    The CSV must have columns named '0', '1' and '2'; the three columns of a
    row are joined with spaces and treated as one haiku.

    Parameters:
      fname (str): path to the CSV file

    Returns:
      list: one list of whitespace-separated tokens per CSV row
    """
    df = pd.read_csv(fname)
    # A missing cell is read as NaN, and str + NaN is NaN, which would turn the
    # whole haiku into the single token 'nan'. Treat missing lines as empty
    # text instead, and cast to str so numeric-looking columns still join.
    lines = df[['0', '1', '2']].fillna('').astype(str)
    sentences = lines['0'] + ' ' + lines['1'] + ' ' + lines['2']
    return [sentence.split() for sentence in sentences]

# Module-level WordNet lemmatizer, created once and reused by process_tokens
lm = WordNetLemmatizer()

def contains_special(word):
    """Report whether a word holds any numeric or non-alphanumeric character.

    Parameters:
      word (str): the token to inspect

    Returns:
      bool: True if at least one character is numeric or is not alphanumeric,
      False otherwise (including for the empty string)
    """
    return any(ch.isnumeric() or not ch.isalnum() for ch in word)

# process tokens
def process_tokens(toks):
    """Filter, lower-case and lemmatize a list of tokens.

    Parameters:
      toks (list): list of string tokens

    Returns:
      list: lemmatized lower-case tokens, in the original order, with every
      token that contains a numeric or non-alphanumeric character removed
    """
    cleaned = []
    for token in toks:
        # Drop the whole token if any character is a digit or punctuation
        if contains_special(token):
            continue
        cleaned.append(lm.lemmatize(token.lower()))
    return cleaned

def read_haikus(data, ngram):
    """Tokenize, clean and pad every haiku for n-gram modelling.

    Parameters:
      data (list): list of list of words, one inner list per haiku
      ngram (int): n-gram order; each haiku gets ngram-1 '<h>' tokens in front
        and ngram-1 '</h>' tokens at the end

    Returns:
      list: list of padded token lists; haikus that have no tokens left after
      processing are left out
    """
    pad_width = ngram - 1
    padded_haikus = []
    for words in data:
        tokens = process_tokens(nltk.word_tokenize(' '.join(words)))
        if not tokens:
            # Nothing survived the cleaning step, so there is no haiku to keep
            continue
        padded_haikus.append(['<h>'] * pad_width + tokens + ['</h>'] * pad_width)
    return padded_haikus

In [3]:
# Location of the haiku CSV, relative to the notebook's working directory
HAIKU_CSV = 'data/haiku/all_haiku.csv'

# Raw haikus as lists of whitespace-separated words
data = get_haiku_data(HAIKU_CSV)
# Cleaned haikus, padded with begin/end markers for a trigram model
haikus = read_haikus(data, ngram=3)
# Disabled: loading pre-trained GloVe vectors instead of training Word2Vec
# embeddings = train_embeddings('glove.6B/glove.6B.100d.txt')

### Training word embeddings using Word2Vec

In [4]:
from gensim.models import Word2Vec

def train_embeddings(data):
    """Train skip-gram Word2Vec embeddings on tokenized haikus.

    Parameters:
      data (list): list of list of tokens, one inner list per haiku

    Returns:
      Word2Vec: model trained on `data` with 200-dimensional vectors, a
      context window of 5 and no minimum word frequency (min_count=1)
    """
    # Train on the argument rather than the module-level `haikus`, so the
    # function honours whatever corpus the caller passes in.
    return Word2Vec(sentences=data, vector_size=200, window=5, min_count=1,
                    sg=1)
    

# Fit the gensim Word2Vec model on the padded haikus and report how many
# distinct tokens it learned a vector for.
word2vec_model = train_embeddings(haikus)
vocab_size = len(word2vec_model.wv.index_to_key)
print(f'Vocab size {vocab_size}')

Vocab size 41376


# N-gram Language Model

In [5]:
# Find haikus that are similar
def find_similar_haikus(haikus, inputs, embeddings, topn=5):
    """Find haikus that contain a word similar to any of the given inputs.

    For every input word, its `topn` most similar words are looked up in the
    embeddings. A haiku is selected when it contains at least one of those
    similar words. The input words themselves are not added to the match set.
    Input words that are missing from the embedding vocabulary are skipped.

    Parameters:
      haikus (list): list of list of processed haikus tokens
      inputs (list): list of words to match
      embeddings (Word2Vec): trained word embeddings
      topn (int): number of similar words to collect per input word

    Returns:
      list: the selected haikus, in their original order, each joined into a
      single space-separated string
    """
    similar_words = set()
    for word in inputs:
        try:
            neighbours = embeddings.wv.most_similar(word, topn=topn)
        except KeyError:
            # The word is not in the embedding vocabulary: nothing to expand
            continue
        similar_words.update(neighbour for neighbour, _ in neighbours)
    # Set lookup replaces the repeated list scans of the haiku tokens
    return [" ".join(haiku) for haiku in haikus
            if not similar_words.isdisjoint(haiku)]

In [12]:
# Select the training haikus for the topic word 'basketball' using the
# Word2Vec neighbours of that word
similar_haikus = find_similar_haikus(haikus, ['basketball'], word2vec_model)

# Define new N-gram Language Model object, using the same '<h>' / '</h>'
# markers that read_haikus pads the haikus with.
# NOTE(review): the positional arguments (2, True) are presumably the n-gram
# order and a smoothing flag -- confirm against ngram_model.LanguageModel.
# The haikus above were padded for a trigram (ngram=3), not a bigram.
ngram_lm = ngm.LanguageModel(2, True, line_begin="<" + "h" + ">", line_end="</" + "h" + ">")
# Training the model with haikus similar to inputs
ngram_lm.train(similar_haikus)

# Print every generated haiku line by line, then a blank separator.
# NOTE(review): the argument 5 is presumably the number of haikus to generate
# -- confirm against ngram_model.LanguageModel.generate_haiku.
for haiku in ngram_lm.generate_haiku(5):
    for line in haiku:
        print(line)
    print('\n')

 great woman hockey
 it still getting to be hard
 i a touchdown then


 brown getting nba now
 football is my child football
 meaningless hockey


 i honestly just
 join u work and tell me wan
 the pat steelers next


 ariza the most
 hockey in football is in
 one more fun day


 i could do want now
 in a good thing in celtic
 football celtic in


