In [1]:
import calendar
import gzip
import itertools
import json
import math
from pathlib import Path
import re
import string

import pandas as pd

In [2]:
# Load the dictionary of words/syllables

# Standard Dict
WORDS = {}
dictionary_path = Path('dictionaries')
with (dictionary_path / 'cmudict.dict.txt').open('r') as f:
    for line in f.readlines():
        word, phonemes = line.strip().split(' ', 1)
        word = re.match(r'([^\(\)]*)(\(\d\))*', word).groups()[0]
        phonemes = phonemes.split(' ')
        syllables = sum([re.match(r'.*\d', p) is not None for p in phonemes])
        if word not in WORDS:
            WORDS[word] = []
        WORDS[word].append({
            'phonemes': phonemes,
            'syllables': syllables
        })
        
# Load custom phonemes
CUSTOM_WORDS = {}
vowels = ['AA', 'AE', 'AH', 'AO', 'AW', 'AX', 'AXR', 'AY', 'EH', 'ER', 'EY', 'IH', 'IX', 'IY', 'OW', 'OY', 'UH', 'UW', 'UX']
with (dictionary_path / 'custom.dict.txt').open('r') as f:
    for line in f.readlines():
        try:
            word, phonemes = line.strip().split('\t', 1)
        except:
            continue
        word = re.match(r'([^\(\)]*)(\(\d\))*', word).groups()[0].lower()
        phonemes = phonemes.split(' ')
        syllables = sum([(p in vowels) for p in phonemes])
        
        if word not in CUSTOM_WORDS:
            CUSTOM_WORDS[word] = []
        CUSTOM_WORDS[word].append({
            'phonemes': phonemes,
            'syllables': syllables
        })
        
WORDS.update(CUSTOM_WORDS)

# Simplify for this project
WORDS = {w: WORDS[w][0]['syllables'] for w in WORDS}
WORDS["\n"] = 0

# Corrections for captialization
CORRECTIONS = {
    "i": "I",
    "i'm": "I'm",
    "i've": "I've",
    "i'll": "I'll",
    "i'd": "I'd",
    "jesus": "Jesus",
    "trump": "Trump",
    "twitter": "Twitter",
    "facebook": "Facebook",
    "snapchat": "Snapchat",
    "john": "John",
    "america": "America",
    "valentine's": "Valentine's",
    "halloween": "Halloween",
    "etc": "etc.",
    "god": "God",
    "god's": "God's",
    "youtube": "YouTube"
}

for name in calendar.month_name:
    CORRECTIONS[name.lower()] = name

for name in calendar.day_name:
    CORRECTIONS[name.lower()] = name

del CORRECTIONS[""]

for k, v in CORRECTIONS.items():
    WORDS[v] = WORDS[k]
    del WORDS[k]

# Remove some words from the dictionary
for word in [
    "ur",
    "tho",
    "ion",
    "ill", # ambiguous usage
    "cant",
    "mr",
    "mrs",
    "st",
    "ii",
    
    # Avoid _potentially_ offensive haikus. These words aren't necessarily offensive, but the corpus has a lot of
    # offensive uses of them, so best to avoid it entirely.
    "gay", 
]:
    del WORDS[word]
    
for letter in "bcdefghjklmnpqrstvwxyz":
    del WORDS[letter]

In [3]:
# The result is a dictionary mapping words to the number of syllables they have
print(f'"syllables" has {WORDS["syllables"]} syllables')

"syllables" has 3 syllables


In [4]:
# Load the corpii of corpora of training data and transform them into a list of transitions from words and word pairs
# into the next word. These corpora aren't specifically haikus, just text; this is because we don't have enough
# haikus to work with.

corpus = []
regex = re.compile(r"[^.a-z\s'-]")

# Text files from Gutenberg
CORPUS_DIR = Path('corpus')
for path in CORPUS_DIR.glob('*.txt'):
    with open(str(path)) as f:
        corpus = corpus + [
            i.split() for i in re.split(
                r'[\.!\?;]',
                regex.sub('',
                    f.read().lower().replace("--", " ")#.replace(",", "").replace("--", " ").replace("(", "").replace(")", "")
                )
            )
        ]
        
# Brown Corpus
from nltk.corpus import brown
corpus += [re.split(r'[\.!\?;\s]+', regex.sub('', ' '.join(s).lower().replace("''", ""))) for s in brown.sents()]

# Gutenberg Poetry Corpus
all_lines = []
for line in gzip.open("corpus/gutenberg-poetry-v001.ndjson.gz"):
    all_lines.append(json.loads(line.strip()))
    
big_poem = "\n".join([line['s'] for line in all_lines])
corpus += [
    i.split() for i in re.split(
        r'[\.!\?;]',
        regex.sub('',
            big_poem.lower().replace("--", " ")#.replace(",", "").replace("--", " ").replace("(", "").replace(")", "")
        )
    )
]

# Functions for processing the corpora into transitions 
def correct_sentence(sentence):
    new_words = []
    
    for word in sentence:
        if word == "":
            continue
        if word in CORRECTIONS:
            word = CORRECTIONS[word]
            
        if word not in WORDS:
            raise KeyError
            
        new_words.append(word)
        
    return new_words

def haikuify(sentence):
    try:
        sentence = correct_sentence(sentence)
    except KeyError:
        return []
    
    total_syllables = 0
    for word in sentence:
        total_syllables += WORDS[word]
        
    if total_syllables < 10 or total_syllables > 21:
        return []
    
    targets = [total_syllables * 5/17, total_syllables * 12/17, total_syllables]
    target = total_syllables * 5/17
    new_sentence = []
    syllables = 0
    while syllables < target:
        word = sentence.pop(0)
        new_sentence.append(word)
        syllables += WORDS[word]
    new_sentence.append("\n")
    
    target = total_syllables * 12/17
    while syllables < target:
        word = sentence.pop(0)
        new_sentence.append(word)
        syllables += WORDS[word]
        
    return new_sentence + ["\n"] + sentence + ["END"]

def process_sentence(sentence):
    if not sentence:
        return []
    syllable_count = 0
    line_num = 0
    new_rows = [
        (0, 0, "START", sentence[0], False)
    ]
    for i in range(len(sentence) - 1):
        word = sentence[i]
        next_word = sentence[i + 1]
        next_next_word = sentence[i + 2] if (i + 2) < len(sentence) else None
        
        
        if word in CORRECTIONS:
            word = CORRECTIONS[word]
            
        if next_word in CORRECTIONS:
            next_word = CORRECTIONS[next_word]
            
        if next_next_word in CORRECTIONS:
            next_next_word = CORRECTIONS[next_next_word]     
        
        if word not in WORDS:
            return []
        
        syllable_count += WORDS[word]
        
        new_rows.append((line_num, syllable_count, word, next_word, i == len(sentence) - 3))
        if next_next_word:
            new_rows.append((line_num, syllable_count, word + " " + next_word, next_next_word, i == len(sentence) - 4))
            
        if word == "\n":
            line_num += 1
    
    if (syllable_count < 10) or (syllable_count > 23):
        return []
    
    return new_rows

transitions = []
for sentence in corpus:
    sentence = haikuify(sentence)
    if sentence:
        transitions += process_sentence(sentence)
        
print(len(transitions))

3012916


In [5]:
# Also load a CSV of preprocessed haikus
haiku_df = pd.read_csv('corpus/haikus.csv')
haiku_df = haiku_df.drop_duplicates(subset=["0", "1", "2"])

corpus = []
regex = re.compile(r"[^.a-z\s'-]")

for i, row in haiku_df.iterrows():
    try:
        if (row['0_syllables'] + row['1_syllables'] + row['2_syllables']) <= 17:
            sentence = row["0"] + " \n " + row["1"] + " \n " + row["2"]
            sentence = regex.sub('', sentence.lower().replace("--", " ").replace(" - ", " "))
            sentence = sentence + " END"
            corpus.append([w for w in sentence.split(" ") if w])
    except:
        pass
    
for sentence in corpus:
    transitions += process_sentence(sentence)
    
len(transitions)

6455634

In [6]:
# Create a pandas dataframe from the transitions list
transitions_df = pd.DataFrame(transitions, columns=["line_num", "syllable", "word", "next_word", "end"])
transitions_df[0] = 1

# Get rid of pairs that only occur once
pair_counts = (transitions_df.groupby(["word", "next_word"]).count()[0] > 1).reset_index()
pair_counts = pair_counts[pair_counts[0]][["word", "next_word"]]
transitions_df = transitions_df.merge(pair_counts, on=["word", "next_word"])

print(len(transitions_df))

transitions_df.head()

4490398


Unnamed: 0,line_num,syllable,word,next_word,end,0
0,0,0,START,I,False,1
1,0,0,START,I,False,1
2,0,0,START,I,False,1
3,0,0,START,I,False,1
4,0,0,START,I,False,1


In [7]:
# Create a dictionary of the 1000 most common words
dict_size = 1000

dictionary = ["END", "START", "\n"] + transitions_df[
    ~transitions_df["word"].str.contains(" ")
].groupby("word").count().sort_values("next_word", ascending=False).index[:dict_size].tolist()

# Dedupe in case
dictionary = list(dict.fromkeys(dictionary).keys())

transitions_df_subset = transitions_df[
    transitions_df["word"].str.contains(r'(^| )(' + '|'.join(dictionary) + r')($| )') & 
    transitions_df["next_word"].isin(dictionary)
]

print(len(transitions_df))

# Compress the words and create mappings to and from the compressed form
def compressed_word():
    n = 1
    while True:
        yield from (''.join(group) for group in itertools.product(string.ascii_letters + string.digits, repeat=n))
        n += 1
        
itr = compressed_word()

compression_dictionary = {w: next(itr) for w in dictionary}
uncompression_dictionary = {v:k for k,v in compression_dictionary.items()}

  if sys.path[0] == '':


4490398


In [8]:
# Create the model - the model is a dictionary mapping words or word pairs to the following words by line number, 
# ordered by frequency of the following word in the corpus

GROUPBYCOL = "line_num"
model = {}

for i, row in transitions_df_subset.groupby([
    "word", GROUPBYCOL, "next_word"
]).count().reset_index().sort_values(0, ascending=False).iterrows():
    word = row["word"]
    next_word = row["next_word"]
    
    # Handle pairs of words
    if " " in word:
        word1, word2 = word.split(" ")
        
        # Make sure that both words are in the dictionary
        if (word1 not in dictionary) or (word2 not in dictionary):
            continue
            
        word = compression_dictionary[word1] + " " + compression_dictionary[word2]
    else:
        word = compression_dictionary[word]
        
    next_word = compression_dictionary[next_word]
        
    if word not in model:
        model[word] = {}
    if row[GROUPBYCOL] not in model[word]:
        model[word][row[GROUPBYCOL]] = []
        
    model[word][row[GROUPBYCOL]].append((next_word, row[0]))
    

# Prune the model to save on space
for word in model:
    for syllable in model[word]:
        total_sum = sum([c for n,c in model[word][syllable]])
        cumsum = 0
        new_words = []
        for next_word, count in model[word][syllable]:
            # Exclude options that occur less than .5%
            if count < total_sum/250:
                continue
            
            # Exclude infrequent pairs from the space model
            if " " in word and count < 3:
                continue
            
            if total_sum > 1000:
                cumsum += math.ceil(count/1000)
            else:
                cumsum += count
                
            new_words.append((next_word, cumsum))
            
        if new_words:
            model[word][syllable] = new_words
    
    
# Get the rough model size; should be ~ 5MB or less
import json
print("Model Size:", len(json.dumps(model)))

Model Size: 3957157


In [9]:
# Model the ends of haikus in a similar fashion.
# There is a separate model for this since we want the haiku to end with a complete thought as best as possible

end_model = {}

for i, row in transitions_df_subset[
    transitions_df_subset["end"] == True
].groupby(["word", "next_word"]).count().reset_index().sort_values(0, ascending=False).iterrows():
    word = row["word"]
    next_word = row["next_word"]
    
    # Handle pairs of words
    if " " in word:
        word1, word2 = word.split(" ")
        
        # Make sure that both words are in the dictionary
        if (word1 not in dictionary) or (word2 not in dictionary):
            continue
            
        word = compression_dictionary[word1] + " " + compression_dictionary[word2]
    else:
        word = compression_dictionary[word]
        
    next_word = compression_dictionary[next_word]
        
    if word not in end_model:
        end_model[word] = []
        
    end_model[word].append((next_word, row[0]))
    

for word in end_model:
    total_sum = sum([c for n,c in end_model[word]])
    cumsum = 0
    new_words = []
    for next_word, count in end_model[word]:
        # Exclude options that occur less than .1%
        if count < total_sum/1000:
            continue

        cumsum += count

        new_words.append((next_word, cumsum))

    if new_words:
        end_model[word] = new_words
    
    
# Get the rough model size; should be ~ 2.5MB or less
import json
print("End Model Size:", len(json.dumps(end_model)))

End Model Size: 617691


In [10]:
# Templates for writing the Cadence contracts that contain the different parts of the model

MODEL_TEMPLATE = """pub contract Model {
  access(account) let model: {String: {Int: {String: Int}}}

  init() {
    self.model = {MODEL}
  }
}"""

SPACE_MODEL_TEMPLATE = """pub contract SpaceModel {
  access(account) let model: {String: {Int: {String: Int}}}

  init() {
    self.model = {MODEL}
  }
}"""

END_MODEL_TEMPLATE = """pub contract EndModel {
  access(account) let model: {String: {String: Int}}

  init() {
    self.model = {MODEL}
  }
}"""

WORDS_TEMPLATE = """pub contract Words {
  access(account) let syllables: {String: Int}
  access(account) let uncompress: {String: String}

  init() {
    self.syllables = {SYLLABLES}
    self.uncompress = {UNCOMPRESS}
  }
}"""

In [11]:
# Create contracts by reformatting the models into Cadence

model_reformatted = {}
space_model_reformatted = {}

output_dir = Path('contracts')
output_dir.mkdir(exist_ok=True)

for word, m in model.items():
    if " " in word:
        space_model_reformatted[word.replace("'", "QUOTE").replace(" ", "SPACE")] = {}

        for syllable, options in m.items():
            # Exclude options where there is no option, to increase diversity
            if len(options) < 3:
                continue
            space_model_reformatted[word.replace("'", "QUOTE").replace(" ", "SPACE")][syllable] = {
                w.replace("'", "QUOTE").replace(" ", "SPACE"): c for w, c in options
            }
            
        # If the model for this word turned out empty, delete it
        if len(space_model_reformatted[word.replace("'", "QUOTE").replace(" ", "SPACE")]) == 0:
            del space_model_reformatted[word.replace("'", "QUOTE").replace(" ", "SPACE")]
    else:
        model_reformatted[word.replace("'", "QUOTE")] = {}
        for syllable, options in m.items():
            model_reformatted[word.replace("'", "QUOTE")][syllable] = {w.replace("'", "QUOTE"): c for w, c in options}


with (output_dir / 'Model.cdc').open('w') as f:
    f.write(MODEL_TEMPLATE.replace(
        '{MODEL}',
        str(model_reformatted).replace("'", '"').replace("QUOTE", "'").replace(" ", "")
    ))

with (output_dir / 'SpaceModel.cdc').open('w') as f:
    f.write(SPACE_MODEL_TEMPLATE.replace(
        '{MODEL}',
        str(space_model_reformatted).replace("'", '"').replace("QUOTE", "'").replace(" ", "").replace("SPACE", " ")
    ))


end_model_reformatted = {}
for word, m in end_model.items():
    end_model_reformatted[word.replace("'", "QUOTE").replace(" ", "SPACE")] = {w.replace("'", "QUOTE"): c for w, c in m}
    

with (output_dir / 'EndModel.cdc').open('w') as f:
    f.write(END_MODEL_TEMPLATE.replace(
        '{MODEL}',
        str(end_model_reformatted).replace("'", '"').replace("QUOTE", "'").replace(" ", "").replace("SPACE", " ")
    ))
    
    
# Write dictionary contract
WORDS["END"] = 0
WORDS["START"] = 0
words_subset = {w.replace("'", "QUOTE"): WORDS[w] for w in dictionary}

uncompression_formatted = {k: v.replace("'", "QUOTE") for k,v in uncompression_dictionary.items()}

with (output_dir / 'Words.cdc').open('w') as f:
    f.write(WORDS_TEMPLATE.replace(
        '{SYLLABLES}',
        str(words_subset).replace("'", '"').replace("QUOTE", "'").replace(" ", "")
    ).replace(
        '{UNCOMPRESS}',
        str(uncompression_formatted).replace("'", '"').replace("QUOTE", "'").replace(" ", "")
    ))