# Part One: Find words in haiku corpus missing from cmudict & build exceptions dict.

In [None]:
## First, write your own haiku as a string (a triple-quoted string ending in a newline works well) and assign it to `poem`; a later cell appends it to the haiku training text file:
poem = 

In [None]:
# Colab-only step: import the notebook's file helper and call its upload() function.
# NOTE(review): presumably this is where train.txt is uploaded for the cells below — confirm.
# The returned value is stored in `uploaded` but is not read again in this notebook section.
from google.colab import files
uploaded = files.upload()

In [None]:
# Append the new haiku to the training corpus on lines of its own.
# The corpus is later tokenized with str.split(), so surplus newlines are
# harmless, whereas a missing newline would fuse the poem's first or last word
# with the neighbouring word already in (or later added to) the file.
with open('train.txt', 'a') as f:
    f.write('\n' + poem.strip('\n') + '\n')

In [None]:
import sys 
import nltk
from string import punctuation
import pprint
import json

In [None]:
# Setup step: the cells that follow read this corpus through nltk.corpus.cmudict.
nltk.download('cmudict') # this downloads the Carnegie Mellon University Pronouncing Dictionary from nltk datastores

In [None]:
from nltk.corpus import cmudict # this imports the cmudict to your python environment

In [None]:
# Recast the cmudict corpus reader as a plain python dictionary (rather than
# an nltk corpus object). The reader is imported here under a private alias so
# that this cell can be re-run safely: rebinding the name `cmudict` from the
# corpus object to the dictionary used to make a second run fail, because a
# dict has no .dict() method.
from nltk.corpus import cmudict as _cmudict_corpus
cmudict = _cmudict_corpus.dict()

In [None]:
# The Carnegie Mellon University Pronouncing Dictionary now operates like a python dictionary.
# Search for words by dictionary key and discuss the results with your group.
# The lookup below prints whatever value is stored under the key; a word that is
# not a key raises KeyError.

print(cmudict['queen'])

In [None]:
# Vaughan describes how this pronouncing dictionary operates: it lists phonemes, not syllables,
# but every vowel phoneme ends in a numeral that marks its lexical stress
# (0 = no stress, 1 = primary stress, 2 = secondary stress). Counting the phonemes that
# end in a digit therefore gives a way to extrapolate syllable counts from these dictionary values.

print(cmudict['quintuple'])

In [None]:
# With your groups, run each cell and discuss what the following functions do. 

In [None]:
# here is your main function - it calls other functions that you will define in the cells to follow.
def main():
    """Report corpus words unknown to cmudict and optionally record their syllables.

    Loads the unique tokens of 'train.txt', prints which of them are missing
    from cmudict, then asks whether to build the exceptions dictionary by
    hand. Answering 'n' (any case) exits via sys.exit(); any other answer
    runs the interactive builder and saves its result.
    """
    corpus_words = load_haiku('train.txt')
    unknown_words = cmudict_missing(corpus_words)
    answer = input("\nManually build an exceptions dictionary (y/n)? \n")
    # Guard clause: bail out early when the user declines.
    if answer.lower() == 'n':
        sys.exit()
    save_exceptions(make_exceptions_dict(unknown_words))

In [None]:
def load_haiku(filename):
    """Read the haiku training corpus and return its unique tokens as a set.

    Hyphens are turned into spaces before the text is split on whitespace,
    so a hyphenated compound contributes each of its parts separately.
    """
    with open(filename) as corpus_file:
        text = corpus_file.read()
    tokens = text.replace('-', ' ').split()
    return set(tokens)

In [None]:
def cmudict_missing(word_set):
    """Find and return words in word set missing from cmudict.

    Each token is lower-cased, stripped of leading/trailing ASCII punctuation
    and of a trailing possessive ('s or ’s) before it is looked up in the
    module-level ``cmudict`` dictionary. A short report (the missing words,
    both counts and the membership percentage) is printed along the way.

    Args:
        word_set: collection of raw tokens from the haiku corpus.

    Returns:
        Set of normalized words that are not keys of ``cmudict``.
    """
    exceptions = set()  # a set filters out duplicates created by normalization
    for word in word_set:
        word = word.lower().strip(punctuation)
        if word.endswith(("'s", "’s")):
            word = word[:-2]
        if word not in cmudict:
            exceptions.add(word)
    print("\nexceptions:")
    # the asterisk (*) "unpacks" the set, passing each word as its own argument
    print(*exceptions, sep='\n')
    print(f"\nNumber of unique words in haiku corpus = {len(word_set)}")
    print(f"Number of words in corpus not in cmudict = {(len(exceptions))}.")
    if word_set:
        membership = (1 - (len(exceptions) / len(word_set))) * 100
    else:
        # An empty corpus has nothing to measure; avoid dividing by zero.
        membership = 0.0
    # the :.1f format spec renders the float with 1 decimal place
    print(f"cmudict membership = {membership:.1f}%")
    return exceptions

In [None]:
def _ask_syllable_count(word, allow_quit=False):
    """Prompt until the user enters a whole number of syllables for *word*.

    Returns the count as an int, or None when *allow_quit* is true and the
    user typed 'q'.
    """
    while True:
        answer = input(f"Enter number syllables in {word}: ").strip()
        if allow_quit and answer.lower() == 'q':
            return None
        # isdecimal() only accepts characters that int() can parse, unlike
        # isdigit(), which also accepts e.g. superscript digits.
        if answer.isdecimal():
            return int(answer)
        print("                   Not a valid answer!")


def make_exceptions_dict(exceptions_set):
    """Return dictionary of words and syllable counts from set of words.

    The user is asked for the syllable count of every word in
    *exceptions_set*; typing 'q' stops the questions early and keeps the
    answers given so far. A menu then allows adding/changing or removing
    entries before the dictionary is returned.

    Args:
        exceptions_set: iterable of words that need a manual syllable count.

    Returns:
        Dict mapping each word to its syllable count (int).
    """
    missing_words = {}
    print("Input # syllables in word. Mistakes can be corrected at end. type 'q' to quit.\n")
    for word in exceptions_set:
        num_sylls = _ask_syllable_count(word, allow_quit=True)
        if num_sylls is None:
            # 'q' was entered: skip the remaining words, go on to the review menu.
            break
        missing_words[word] = num_sylls
    print()
    pprint.pprint(missing_words)

    print("\nMake Changes to Dictionary Before Saving?")
    print("""
    0 - Exit & Save
    1 - Add a Word or Change a Syllable Count 
    2 - Remove a Word
    """)

    while True:
        choice = input("\nEnter choice: ")
        if choice == '0':
            break
        elif choice == '1':
            word = input("\nWord to add or change: ")
            # Validated prompt: a typo re-asks instead of raising ValueError.
            missing_words[word] = _ask_syllable_count(word)
        elif choice == '2':
            word = input("\nEnter word to delete: ")
            # .pop() removes the entry and returns its value; the second
            # argument (None) is returned instead when the word is absent,
            # so deleting an unknown word is not an error.
            missing_words.pop(word, None)

    print("\nNew words or syllable changes:")
    pprint.pprint(missing_words)

    return missing_words

In [None]:
def save_exceptions(missing_words):
    """Save exceptions dictionary as json file.

    Args:
        missing_words: dict of word -> syllable count to serialize.

    The data is written to 'missing_words.json' in the current working
    directory, replacing any existing file of that name.
    """
    # Serialize first so a non-serializable value cannot truncate an existing file.
    json_string = json.dumps(missing_words)
    # The context manager closes the file even if the write fails.
    with open('missing_words.json', 'w') as f:
        f.write(json_string)
    print("\nFile saved as missing_words.json")

In [None]:
# Now call the main function and discuss what happens. What are you creating?

if __name__ == '__main__':
    main()

In [None]:
# Counting syllables in poems
# The code in this section references both the pronouncing dictionary cmudict and the missing_words file you created above.
# With your group, run these cells and discuss what they do.

In [None]:
# Import and review your dictionary of missing words.
# json.load rebuilds the dictionary stored in missing_words.json and binds it to the
# module-level name `missing_words`, which count_syllables() reads.

with open('missing_words.json', 'r') as missing_words_file:
    missing_words = json.load(missing_words_file)
print(missing_words)

In [None]:
def count_syllables(words):
    """Use corpora to count syllables in English word or phrase.

    The phrase is lower-cased, hyphens are treated as spaces, and every
    token is stripped of surrounding ASCII punctuation and of a trailing
    possessive ('s or ’s). A token found in the module-level
    ``missing_words`` dict uses the count stored there; otherwise the first
    pronunciation listed in ``cmudict`` is used, counting the phonemes that
    end in a stress digit (one per vowel sound). Tokens that are empty after
    stripping (pure punctuation) contribute no syllables.

    Args:
        words: a word or whitespace-separated phrase.

    Returns:
        Total number of syllables as an int.

    Raises:
        KeyError: if a token is in neither ``missing_words`` nor ``cmudict``.
    """
    num_sylls = 0
    for word in words.replace('-', ' ').lower().split():
        word = word.strip(punctuation)
        if word.endswith(("'s", "’s")):
            word = word[:-2]
        if word in missing_words:
            num_sylls += missing_words[word]
        elif word:
            # cmudict[word] lists pronunciations; [0] is the first one, itself
            # a list of phoneme strings such as 'IY1'.
            num_sylls += sum(1 for phoneme in cmudict[word][0]
                             if phoneme[-1].isdigit())
    return num_sylls

In [None]:
def syllable_counter():
    """Interactively report syllable counts until an empty line is entered.

    Each round prints a header, reads a word or phrase, and prints its
    syllable count; a phrase containing an unknown word prints a
    "not found" message instead.
    """
    while True:
        print("Syllable Counter")
        phrase = input("Enter word or phrase else press Enter to Exit: ")
        if not phrase:
            print('Exit')
            return
        try:
            total = count_syllables(phrase)
        except KeyError:
            print("Word not found.  Try again.\n")
        else:
            print(f"number of syllables in {phrase} is: {total}")
            print()

In [None]:
# The following code confirms that your functions have all worked as expected

In [None]:
# Collect the unique lower-cased corpus tokens (hyphens split compounds apart).
with open('train.txt') as in_file:
    words = set(in_file.read().lower().replace('-', ' ').split())

# Every token that count_syllables() cannot resolve ends up in `missing`.
missing = []
for word in words:
    try:
        num_syllables = count_syllables(word)
    except KeyError:
        missing.append(word)
    # else:
    #     print(word, num_syllables, end='\n') # uncomment to see word counts

print("Missing words:", missing)

# Part Two: generate new haiku using Markov Chain Analysis

In [None]:
"""Produce new haiku from training corpus of existing haiku."""

import random
import logging
from collections import defaultdict

In [None]:
def load_training_file(file):
    """Read the whole training file and hand its text back as one string."""
    with open(file) as handle:
        contents = handle.read()
    return contents

In [None]:
def prep_training(raw_haiku):
    """Flatten the raw corpus text into a list of whitespace-separated words.

    Newlines are replaced by spaces first, then the text is split on
    whitespace, so line breaks and runs of spaces never produce empty items.
    """
    single_line = raw_haiku.replace('\n', ' ')
    return single_line.split()

In [None]:
# In the following section, read along with Impractical Python Chapter 9 and discuss each cell with your group.

In [None]:
def map_word_to_word(corpus):
    """Load list & use dictionary to map word to word that follows.

    Args:
        corpus: list of corpus words in their original order.

    Returns:
        defaultdict(list) mapping each word to the list of words that
        directly follow it in the corpus (repeats kept, so frequent
        followers appear more often). The final word gets no entry of its
        own unless it also occurs earlier.
    """
    dict1_to_1 = defaultdict(list)
    # Pair every word with its successor; zip stops before the last word.
    for word, suffix in zip(corpus, corpus[1:]):
        dict1_to_1[word].append(suffix)
    # Use .get() for the debug peek: indexing a defaultdict with a missing
    # key would insert an empty 'sake' entry into the returned map.
    logging.debug("map_word_to_word results for \"sake\" = %s\n",
                  dict1_to_1.get('sake', []))
    return dict1_to_1

In [None]:
def map_2_words_to_word(corpus):
    """Load list & use dictionary to map word-pair to trailing word.

    Args:
        corpus: list of corpus words in their original order.

    Returns:
        defaultdict(list) whose keys are two consecutive words joined by a
        single space and whose values list every word that follows that
        pair in the corpus (repeats kept).
    """
    dict2_to_1 = defaultdict(list)
    # Slide a three-word window over the corpus: (first, second) -> third.
    for first, second, suffix in zip(corpus, corpus[1:], corpus[2:]):
        dict2_to_1[first + ' ' + second].append(suffix)
    # Use .get() for the debug peek: indexing a defaultdict with a missing
    # key would insert an empty 'sake jug' entry into the returned map.
    logging.debug("map_2_words_to_word results for \"sake jug\" = %s\n",
                  dict2_to_1.get('sake jug', []))
    return dict2_to_1

In [None]:
def random_word(corpus):
    """Return random word and syllable count from training corpus.

    Words of more than four syllables are rejected and another word is
    drawn, so the result always leaves room for at least one more syllable
    in a five-syllable line.

    Args:
        corpus: non-empty list of corpus words.

    Returns:
        Tuple (word, num_syls) with num_syls <= 4.

    Note:
        Keeps drawing until a suitable word turns up, so the corpus must
        contain at least one word of four syllables or fewer.
    """
    # A loop replaces the former recursive retry, whose result was never
    # returned: callers received None and failed when unpacking the tuple.
    while True:
        word = random.choice(corpus)
        num_syls = count_syllables(word)
        if num_syls <= 4:
            logging.debug("random word & syllables = %s %s\n", word, num_syls)
            return (word, num_syls)

In [None]:
def word_after_single(prefix, suffix_map_1, current_syls, target_syls):
    """Return all acceptable words in a corpus that follow a single word.

    Args:
        prefix: the word whose followers are wanted.
        suffix_map_1: mapping of a word to the list of words that follow it.
        current_syls: syllables already used in the line being built.
        target_syls: syllable total the line may not exceed.

    Returns:
        List of followers of *prefix* (repeats kept) that fit within the
        remaining syllables; empty when *prefix* has no entry in the map.
    """
    suffixes = suffix_map_1.get(prefix)
    if suffixes is not None:
        accepted_words = [
            candidate for candidate in suffixes
            if current_syls + count_syllables(candidate) <= target_syls
        ]
    else:
        accepted_words = []
    logging.debug("accepted words after \"%s\" = %s\n",
                  prefix, set(accepted_words))
    return accepted_words

In [None]:
def word_after_double(prefix, suffix_map_2, current_syls, target_syls):
    """Return all acceptable words in a corpus that follow a word pair.

    Args:
        prefix: two words joined by a single space.
        suffix_map_2: mapping of a word pair to the list of words that
            follow it.
        current_syls: syllables already used in the line being built.
        target_syls: syllable total the line may not exceed.

    Returns:
        List of followers of *prefix* (repeats kept) that fit within the
        remaining syllables; empty when *prefix* has no entry in the map.
    """
    suffixes = suffix_map_2.get(prefix)
    if suffixes is not None:
        accepted_words = [
            candidate for candidate in suffixes
            if current_syls + count_syllables(candidate) <= target_syls
        ]
    else:
        accepted_words = []
    logging.debug("accepted words after \"%s\" = %s\n",
                  prefix, set(accepted_words))
    return accepted_words

In [None]:
def haiku_line(suffix_map_1, suffix_map_2, corpus, end_prev_line, target_syls):
    """Build a haiku line from a training corpus and return it.

    Args:
        suffix_map_1: word -> followers mapping, handed to word_after_single.
        suffix_map_2: "word word" pair -> followers mapping, handed to
            word_after_double.
        corpus: sequence of corpus words used for the random draws.
        end_prev_line: the trailing words of the previous line (used as the
            seed pair), or an empty list to build the first line. When the
            first line is completed by its second word, this list is
            extended in place before being returned.
        target_syls: syllable total the finished line must reach.

    Returns:
        Tuple (line_words, end_prev_line): the words of this line, and the
        last two words accumulated, to be passed in when building the next
        line.
    """
    # `line` is only a label: it is logged and later decides whether the seed
    # words are sliced off the result.
    line = '2/3'
    line_syls = 0
    current_line = []

    if len(end_prev_line) == 0:  # build first line
        line = '1'
        # First word: random corpus word together with its syllable count.
        word, num_syls = random_word(corpus)
        current_line.append(word)
        line_syls += num_syls
        # Second word: a follower of the first word that still fits.
        word_choices = word_after_single(word, suffix_map_1,
                                         line_syls, target_syls)
        # No fitting follower: keep trying random prefixes until one yields
        # candidates (the prefix itself is not added to the line).
        while len(word_choices) == 0:
            prefix = random.choice(corpus)
            logging.debug("new random prefix = %s", prefix)
            word_choices = word_after_single(prefix, suffix_map_1,
                                             line_syls, target_syls)
        word = random.choice(word_choices)
        num_syls = count_syllables(word)
        logging.debug("word & syllables = %s %s", word, num_syls)
        line_syls += num_syls
        current_line.append(word)
        # Two words already hit the target: return early, handing back the
        # caller's list extended with those words.
        if line_syls == target_syls:
            end_prev_line.extend(current_line[-2:])
            return current_line, end_prev_line

    else:  # build lines 2 & 3
        # Seed with the previous line's ending; line_syls stays 0, so the
        # seed words do not count towards this line's syllables.
        current_line.extend(end_prev_line)

    # Extend the line one word at a time, using the last two words as prefix.
    while True:
        logging.debug("line = %s\n", line)
        prefix = current_line[-2] + ' ' + current_line[-1]
        word_choices = word_after_double(prefix, suffix_map_2,
                                         line_syls, target_syls)
        # No fitting follower: try random adjacent corpus pairs as the prefix
        # until one yields candidates.
        while len(word_choices) == 0:
            index = random.randint(0, len(corpus) - 2)
            prefix = corpus[index] + ' ' + corpus[index + 1]
            logging.debug("new random prefix = %s", prefix)
            word_choices = word_after_double(prefix, suffix_map_2,
                                             line_syls, target_syls)
        word = random.choice(word_choices)
        num_syls = count_syllables(word)
        logging.debug("word & syllables = %s %s", word, num_syls)
        
        # word_after_double only returns candidates that fit, so the first
        # branch is a safety net rather than an expected path.
        if line_syls + num_syls > target_syls:
            continue
        elif line_syls + num_syls < target_syls:
            current_line.append(word)
            line_syls += num_syls
        elif line_syls + num_syls == target_syls:
            current_line.append(word)
            break

    # A fresh list is built here, so the caller's list is left untouched on
    # this path.
    end_prev_line = []
    end_prev_line.extend(current_line[-2:])

    if line == '1':
        final_line = current_line[:]
    else:
        # Drop the two seed words borrowed from the previous line.
        final_line = current_line[2:]

    return final_line, end_prev_line

In [None]:
# Here you define and run the main function for this section, which enables you to
# generate new haiku modeled on the existing corpus.
# Copy any poems you particularly like and write them to our google doc.


def main():
    """Run the interactive haiku generator menu.

    The training corpus and both suffix maps are built once; the menu then
    lets the user quit (0), generate a complete 5-7-5 haiku (1), or
    regenerate only the second (2) or third (3) line of the current haiku.
    Line text is printed to stderr, labels to stdout.
    """
    banner = """\n
    A thousand monkeys at a thousand typewriters...
    or one computer...can sometimes produce a haiku.\n"""
    print(banner)

    corpus = prep_training(load_training_file("train.txt"))
    suffix_map_1 = map_word_to_word(corpus)
    suffix_map_2 = map_2_words_to_word(corpus)
    final = []

    choice = None
    while choice != "0":

        print(
            """
            Japanese Haiku Generator

            0 - Quit
            1 - Generate a Haiku poem
            2 - Regenerate Line 2
            3 - Regenerate Line 3
            """
        )

        choice = input("Choice: ")
        print()

        # exit
        if choice == "0":
            print("Sayonara.")
            sys.exit()

        # anything that is not a menu entry
        if choice not in ("1", "2", "3"):
            print("\nSorry, but that isn't a valid choice.", file=sys.stderr)
            continue

        if choice == "1":
            # Full haiku: every line is seeded with the ending of the one before.
            opening, seed_for_2 = haiku_line(suffix_map_1, suffix_map_2,
                                             corpus, [], 5)
            middle, seed_for_3 = haiku_line(suffix_map_1, suffix_map_2,
                                            corpus, seed_for_2, 7)
            closing, _ = haiku_line(suffix_map_1, suffix_map_2,
                                    corpus, seed_for_3, 5)
            final = [opening, middle, closing]
        elif not final:
            # Options 2 and 3 need an existing haiku to modify.
            print("Please generate a full haiku first (Option 1).")
            continue
        elif choice == "2":
            final[1], seed_for_3 = haiku_line(suffix_map_1, suffix_map_2,
                                              corpus, seed_for_2, 7)
        else:
            final[2], _ = haiku_line(suffix_map_1, suffix_map_2,
                                     corpus, seed_for_3, 5)

        # display results
        print()
        labels = ("First line = ", "Second line = ", "Third line = ")
        for label, line_words in zip(labels, final):
            print(label, end="")
            print(" ".join(line_words), file=sys.stderr)
        print()

    input("\n\nPress the Enter key to exit.")

# Start the haiku generator menu when this code runs as the main program.
if __name__ == '__main__':
    main()
