# Gutenberg

Download data from gutenberg. Download non-haiku poems and then format them into haikus.

In [1]:
import json
from pathlib import Path
import random
import re

from gutenberg.acquire import load_etext
from gutenberg.cleanup import strip_headers
import inflect
from langdetect import detect, detect_langs
import numpy as np
import pandas as pd

In [2]:
root_path = Path('../..')
input_path = root_path / 'input' 
dictionary_path = input_path / 'dictionaries'
poem_path = input_path / 'poems'
raw_path = poem_path / 'raw'

In [3]:
# Functions to clean gutenberg.org text and split them into sentences

def split_sentences(st):
    sentences = re.split(r'([.?!])\s*', st)
    sentences = [s+p for s,p in zip(sentences[::2], sentences[1::2])]
    if sentences[-1]:
        return sentences
    else:
        return sentences[:-1]
    
def extract_sentences(text):
    return split_sentences(
        ' '.join([
            re.sub(
                r'\[\d+\]', '', re.sub(r'^\s+', '', l)
            ) for l in text.split('\n') 
            if (l.startswith('  ') and ('*' not in l))
        ])
    )

sentences = []

In [4]:
# Japanese Prints  By John Gould Fletcher
text = strip_headers(load_etext(27199)).strip()
sentences += extract_sentences(text[text.index('_Part I_'):])

In [5]:
# TS Eliot
text = strip_headers(load_etext(1567)).strip()
sentences += extract_sentences(text[899:])

In [6]:
# A Child's Garden of Verses, by Robert Louis Stevenson
text = strip_headers(load_etext(19722)).strip()
sentences += extract_sentences(text[text.index('    In winter I get up at night'):])
#print(text)

In [7]:
# THE COLLECTED POEMS OF RUPERT BROOKE
text = strip_headers(load_etext(262)).strip()
sentences += extract_sentences(text[text.index('   Here in the dark, O heart;'):])
#print(text)

In [8]:
# FREEDOM, TRUTH AND BEAUTY
text = strip_headers(load_etext(20174)).strip()
sentences += extract_sentences(text[text.index('  What lineage so noble as from Sires,'):])
#print(text)

In [9]:
# LYRICS OF EARTH
text = strip_headers(load_etext(12664)).strip()
sentences += extract_sentences(text[text.index('    Mother, to whose valiant will,'):])
#print(text)

In [10]:
# Book of Sword Blades and Poppy Seed,
text = strip_headers(load_etext(1020)).strip()
sentences += extract_sentences(text[text.index('      A drifting, April, twilight sky,'):])
#print(text)

In [11]:
# Book of A Dome of Many-Coloured Glass
text = strip_headers(load_etext(261)).strip()
sentences += extract_sentences(text[text.index('          Before the Altar, bowed, he stands'):])
#print(text)

In [12]:
# A HUNDRED AND SEVENTY CHINESE POEMS
text = strip_headers(load_etext(42290)).strip()
sentences += extract_sentences(text[text.index('    “We grasp our battle-spears: we don our breast-plates of hide.'):])
#print(text)

In [13]:
# A HUNDRED AND SEVENTY CHINESE POEMS
text = strip_headers(load_etext(7889)).strip()
sentences += extract_sentences(text[text.index('  Here\'s where'):])
#print(text)

In [14]:
len(sentences)

7031

In [15]:
# Load Phonemes

# Standard Dict
WORDS = {}
with (dictionary_path / 'cmudict.dict.txt').open('r') as f:
    for line in f.readlines():
        word, phonemes = line.strip().split(' ', 1)
        word = re.match(r'([^\(\)]*)(\(\d\))*', word).groups()[0]
        phonemes = phonemes.split(' ')
        syllables = sum([re.match(r'.*\d', p) is not None for p in phonemes])
        #print(word, phonemes, syllables)
        if word not in WORDS:
            WORDS[word] = []
        WORDS[word].append({
            'phonemes': phonemes,
            'syllables': syllables
        })
        
# Load custom phonemes
CUSTOM_WORDS = {}
vowels = ['AA', 'AE', 'AH', 'AO', 'AW', 'AX', 'AXR', 'AY', 'EH', 'ER', 'EY', 'IH', 'IX', 'IY', 'OW', 'OY', 'UH', 'UW', 'UX']
with (dictionary_path / 'custom.dict.txt').open('r') as f:
    for line in f.readlines():
        try:
            word, phonemes = line.strip().split('\t', 1)
        except:
            continue
        word = re.match(r'([^\(\)]*)(\(\d\))*', word).groups()[0].lower()
        phonemes = phonemes.split(' ')
        syllables = sum([(p in vowels) for p in phonemes])
        
        if word not in CUSTOM_WORDS:
            CUSTOM_WORDS[word] = []
        CUSTOM_WORDS[word].append({
            'phonemes': phonemes,
            'syllables': syllables
        })

In [16]:
inflect_engine = inflect.engine()

# Dictionary of words not found, must go get the phonemes
# http://www.speech.cs.cmu.edu/tools/lextool.html
NOT_FOUND = set()

def get_words(line):
    """
    Get a list of the words in a line
    """
    line = line.lower()
    # Replace numeric words with the words written out
    ws = []
    for word in line.split(' '):
        if re.search(r'\d', word):
            x = inflect_engine.number_to_words(word).replace('-', ' ')
            ws = ws + x.split(' ')
        else:
            ws.append(word)

    line = ' '.join(ws)

    words = []
    for word in line.split(' '):
        word = re.match(r'[\'"]*([\w\']*)[\'"]*(.*)', word).groups()[0]
        word = word.replace('_', '')
        words.append(word)
        
    return words

def count_non_standard_words(line):
    """
    Count the number of words on the line that don't appear in the default CMU Dictionary.
    """
    count = 0
    for word in get_words(line):
        if word and (word not in WORDS):
            count += 1
    return count

def get_word_syllables(word):
    if (word not in WORDS) and (word not in CUSTOM_WORDS):
        word = word.strip('\'')

    if word in WORDS:
        syllables = set(p['syllables'] for p in WORDS[word])
    else:
        syllables = set(p['syllables'] for p in CUSTOM_WORDS[word])
        
    return syllables

def get_syllable_count(line):
    """
    Get the possible syllable counts for the line
    """
    counts = [0]
    return_none = False
    for word in get_words(line):
        try:
            if word:
                syllables = get_word_syllables(word)
                new_counts = []
                for c in counts:
                    for s in syllables:
                        new_counts.append(c+s)

                counts = new_counts
        except:
            NOT_FOUND.add(word)
            return_none = True

    if return_none:
        return None
    
    return ','.join([str(i) for i in set(counts)])

In [17]:
# Clean up the sentences and remove junk.

df = pd.DataFrame({'sentence': sentences})
df['unknown_word_count'] = df['sentence'].apply(count_non_standard_words)
df['cleaned'] = df['sentence'].str.replace(r'[^a-zA-Z ]', '').str.strip()
df['len_ratio'] = df['cleaned'].str.len() / df['sentence'].str.len()
df['num_words'] = df['sentence'].str.replace(r'\s+', ' ').str.split(' ').apply(lambda s: len(s))

df = df[
    (df['len_ratio'] >= .8) &
    (df['unknown_word_count'] < 4) &
    df['num_words'].between(3, 34)
].copy()
    
df['lang'] = df['cleaned'].apply(lambda s: detect(s) if s else None)
df = df[df['lang'] != 'fr']

df['syllables'] = df['sentence'].apply(get_syllable_count)
df = df.dropna()

df

Unnamed: 0,sentence,unknown_word_count,cleaned,len_ratio,num_words,lang,syllables
0,Force and yielding meet together: An attack is...,0,Force and yielding meet together An attack is ...,0.967213,10,en,15
1,Shafts of broken sunlight dissolving Convoluti...,1,Shafts of broken sunlight dissolving Convoluti...,0.984848,9,en,17
2,The boat drifts to rest Under the outward spra...,0,The boat drifts to rest Under the outward spra...,0.983333,10,en,14
4,"She is an iris, Dark purple, pale rose, Under ...",0,She is an iris Dark purple pale rose Under the...,0.959596,18,en,22
5,She waves delicately With the movement of the ...,0,She waves delicately With the movement of the ...,0.980392,9,en,13
6,Of what is she dreaming?,0,Of what is she dreaming,0.958333,5,en,6
7,"Of long nights lit with orange lanterns, Of wi...",0,Of long nights lit with orange lanterns Of win...,0.970588,18,en,23
8,And of dawn when weary sleepers Lie outstretch...,0,And of dawn when weary sleepers Lie outstretch...,0.984496,25,en,32
9,Autumn winds roll through the dry leaves On he...,0,Autumn winds roll through the dry leaves On he...,0.970588,16,en,21
10,"Under the blossoming plum-tree, She expresses ...",0,Under the blossoming plumtree She expresses th...,0.959459,22,en,34


In [18]:
print("Unknown Words: ", len(NOT_FOUND))

with open('unrecognized_words.txt', 'w') as f:
    for w in NOT_FOUND:
        f.write(w)
        f.write('\n')

Unknown Words:  27


In [19]:
# A function to split the sentences into three lines

def split(row):
    syllables = [int(i) for i in row['syllables'].split(',')]
    if 17 in syllables:
        syllables = 17
    elif max(syllables) < 17:
        syllables = max(syllables)
    else:
        syllables = min(syllables)
        
    line_lengths = [int(syllables*(5/17)), int(syllables*(7/17)), int(syllables*(5/17))]
    
    remainder = syllables - sum(line_lengths)
    
    while remainder:
        line_lengths[random.randint(0,2)] += 1
        remainder -= 1
        
    line_lengths = [[l] for l in line_lengths]
        
    lines = [[], [], []]
    current = 0
    words = row['sentence'].split(' ')
    i = 0
    while i < len(words):
        word = words[i]
        
        if current == 2:
            lines[current].append(word)
            i += 1
            continue
        
        clean = get_words(word)
        syllables = [int(i) for i in get_syllable_count(clean[0]).split(',')]
        
        new_lengths = []
        for x in line_lengths[current]:
            for y in syllables:
                new_lengths.append(x - y)
        
        if (0 in new_lengths) or ((np.abs(new_lengths).min() <= min(line_lengths[current])) and i < (len(words) - 1)):
            lines[current].append(word)
            line_lengths[current] = new_lengths
        else:
            current += 1
            continue
        
        if (0 in line_lengths[current]) and (current < 2):
            current += 1
            
        i += 1
        
    return '\n'.join([' '.join(l) for l in lines])

df['haiku'] = df.apply(split, axis=1)
df

Unnamed: 0,sentence,unknown_word_count,cleaned,len_ratio,num_words,lang,syllables,haiku
0,Force and yielding meet together: An attack is...,0,Force and yielding meet together An attack is ...,0.967213,10,en,15,Force and yielding meet\ntogether: An attack\n...
1,Shafts of broken sunlight dissolving Convoluti...,1,Shafts of broken sunlight dissolving Convoluti...,0.984848,9,en,17,Shafts of broken sunlight\ndissolving Convolut...
2,The boat drifts to rest Under the outward spra...,0,The boat drifts to rest Under the outward spra...,0.983333,10,en,14,The boat drifts to\nrest Under the outward\nsp...
4,"She is an iris, Dark purple, pale rose, Under ...",0,She is an iris Dark purple pale rose Under the...,0.959596,18,en,22,"She is an iris, Dark\npurple, pale rose, Under..."
5,She waves delicately With the movement of the ...,0,She waves delicately With the movement of the ...,0.980392,9,en,13,She waves\ndelicately With the movement\nof th...
6,Of what is she dreaming?,0,Of what is she dreaming,0.958333,5,en,6,Of what\nis she\ndreaming?
7,"Of long nights lit with orange lanterns, Of wi...",0,Of long nights lit with orange lanterns Of win...,0.970588,18,en,23,"Of long nights lit with orange\nlanterns, Of w..."
8,And of dawn when weary sleepers Lie outstretch...,0,And of dawn when weary sleepers Lie outstretch...,0.984496,25,en,32,And of dawn when weary sleepers Lie\noutstretc...
9,Autumn winds roll through the dry leaves On he...,0,Autumn winds roll through the dry leaves On he...,0.970588,16,en,21,Autumn winds roll through the\ndry leaves On h...
10,"Under the blossoming plum-tree, She expresses ...",0,Under the blossoming plumtree She expresses th...,0.959459,22,en,34,"Under the blossoming plum-tree, She expresses\..."


In [20]:
df['source'] = 'gutenberg'

In [21]:
df[['haiku', 'source']].to_csv(str(raw_path / 'gutenberg.csv'), index=False)