In [1]:
import json
import random
import re
import pathlib
import pandas as pd

In [42]:
dictionary = {}
with open('phoneme-groups-with-syllables.json') as f:
    for l in f:
        j = json.loads(l)
        if len(j['word']) > 1 or j['word'] in ['A', 'I', 'O']:
            dictionary[j['word']] = len(j['syllables'])
        
dict_df = pd.DataFrame.from_records([(k, dictionary[k]) for k in dictionary])
dict_df = dict_df[dict_df[1] <= 7]
        
corpus = []
regex = re.compile("[^A-Z\s'-\.]")
for path in pathlib.Path('corpus').glob('*'):
    print(str(path))
    with open(str(path)) as f:
        corpus = corpus + [i.split() for i in re.split(r'[\.!\?;]',regex.sub('', f.read().upper().replace(",", "").replace("--", " ")))]

model2 = {}
for c in corpus:
    for i in range(len(c) - 1):
        w1 = c[i]
        w2 = c[i+1]
        
        if w1 not in model2:
            model2[w1] = {}
            
        if w2 not in model2[w1]:
            model2[w1][w2] = {'count': 1, 'end': 0, 'start': 0}
        else:
            model2[w1][w2]['count'] += 1
            
        if i == (len(c) - 2):
            model2[w1][w2]['end'] += 1
        if i == 0:
            model2[w1][w2]['start'] += 1
            
records = [(w1, w2, model2[w1][w2]['count'], model2[w1][w2]['end'], model2[w1][w2]['start']) for w1 in model2 for w2 in model2[w1]]
model2_df = pd.DataFrame.from_records(records).rename(columns={0: 'word1', 1:'word2', 2:'count', 3: 'end', 4: 'start'})
model2_df = model2_df.merge(dict_df.rename(columns={0: 'word2', 1: 'syllables'}), on='word2')
model2_df = model2_df.merge(dict_df.rename(columns={0: 'word1', 1: 'syllables_word1'}), on='word1')

g = model2_df.groupby('word2')
m = g.sum().reset_index()[['word2', 'end']].merge(g.sum().reset_index()[['word2', 'count']], on='word2')
m['end_percent'] = m['end']/m['count']

model2_df = model2_df.merge(m[['word2', 'end_percent']], on='word2')


g = model2_df.groupby('word1')
m = g.sum().reset_index()[['word1', 'start']].merge(g.sum().reset_index()[['word1', 'count']], on='word1')
m['start_percent'] = m['start']/m['count']

model2_df = model2_df.merge(m[['word1', 'start_percent']], on='word1')

#model2_df.head()

model3 = {}
for c in corpus:
    for i in range(len(c) - 2):
        w1 = c[i]
        w2 = c[i+1]
        w3 = c[i+2]
        
        if w1 not in model3:
            model3[w1] = {}
            
        if w2 not in model3[w1]:
            model3[w1][w2] = {}
            
        if w3 not in model3[w1][w2]:
            model3[w1][w2][w3] = {'count': 1, 'end': 0}
        else:
            model3[w1][w2][w3]['count'] += 1
            
        if i == (len(c) - 3):
            model3[w1][w2][w3]['end'] += 1
            
records = []
for w1 in model3:
    for w2 in model3[w1]:
        for w3 in model3[w1][w2]:
            records.append((w1, w2, w3, model3[w1][w2][w3]['count'], model3[w1][w2][w3]['end']))

model3_df = pd.DataFrame.from_records(records).rename(columns={0: 'word1', 1:'word2', 2: 'word3', 3:'count', 4: 'end'})
model3_df = model3_df.merge(dict_df.rename(columns={0: 'word3', 1: 'syllables'}), on='word3')

g = model3_df.groupby('word3')
m = g.sum().reset_index()[['word3', 'end']].merge(g.sum().reset_index()[['word3', 'count']], on='word3')
m['end_percent'] = m['end']/m['count']

model3_df = model3_df.merge(m[['word3', 'end_percent']], on='word3')

model3_df.head()


corpus\1321-0.txt
corpus\24280-0.txt
corpus\pg1020.txt
corpus\pg12389.txt
corpus\pg1322.txt
corpus\pg1365.txt
corpus\pg1567.txt
corpus\pg16452.txt
corpus\pg18524.txt
corpus\pg18726.txt
corpus\pg19221.txt
corpus\pg20.txt
corpus\pg2039.txt
corpus\pg228.txt
corpus\pg23684.txt
corpus\pg2670.txt
corpus\pg27199.txt
corpus\pg4800.txt
corpus\pg8820.txt
corpus\pg9622.txt


Unnamed: 0,word1,word2,word3,count,end,syllables,end_percent
0,MULTIFORM,PUZZLING,EVOLUTIONARY,1,0,6,0.0
1,THEE),MORE,EVOLUTIONARY,1,0,6,0.0
2,MULTIFORM,AND,MIX,1,0,1,0.0
3,OF,GREECE,MIX,1,0,1,0.0
4,OF,HEAVEN,MIX,2,0,1,0.0


In [57]:
model3_df.sort_values('count')

Unnamed: 0,word1,word2,word3,count,end,syllables,end_percent
0,DAY-BOOK,OPEN,TILL,1,0,1,0.000000
672003,LING'RING,LIFE,SINCE,1,0,1,0.019370
672004,PURPOSING,THAT,SINCE,1,0,1,0.019370
672006,NOT,FIND,SINCE,1,0,1,0.019370
672007,NOT,CEASED,SINCE,1,0,1,0.019370
672008,NOT,INDEED,SINCE,1,0,1,0.019370
672009,NOT,KNOWN,SINCE,1,0,1,0.019370
672010,INFECTS,THE,SKIES,1,1,1,0.208609
672011,FAIR,AURORAL,SKIES,1,0,1,0.208609
672015,IN,WINTER,SKIES,1,0,1,0.208609


In [227]:
model_df.sort_values('start_percent')

Unnamed: 0,word1,word2,count,end,start,syllables,end_percent,start_percent
0,SETTLED,IN,1,0,0,1,0.002424,0.0
271187,REAL,WEALTH,1,0,0,1,0.016529,0.0
271188,REAL,IDENTITIES,1,0,0,4,0.000000,0.0
271189,REAL,REAL,1,0,0,1,0.011364,0.0
271190,REAL,UNDERFOOT,1,0,0,3,0.000000,0.0
271191,REAL,EMPLOYMENTS,1,0,0,3,0.125000,0.0
271192,REAL,INCIDENTS,1,0,0,3,0.250000,0.0
271193,REAL,LANDLORD,1,0,0,2,0.000000,0.0
271194,REAL,RECKONING,1,0,0,3,0.000000,0.0
271195,REAL,CUSTODIANS,1,0,0,4,0.000000,0.0


In [198]:
g = model_df.groupby('word2')
m = g.sum().reset_index()[['word2', 'end']].merge(g.sum().reset_index()[['word2', 'count']], on='word2')
m['end_percent'] = m['end']/m['count']
m.sort_values('end_percent').head(100)
#g.count().reset_index()[['word2', 'count']]

Unnamed: 0,word2,end,count,end_percent
0,'BOUT,0,1,0.0
12958,OPERATES,0,1,0.0
12959,OPERATING,0,2,0.0
12961,OPERATIONS,0,5,0.0
12962,OPIATE,0,9,0.0
12964,OPINIONS,0,29,0.0
12965,OPIUM,0,1,0.0
12966,OPOSSUM,0,1,0.0
12968,OPPONENTS,0,2,0.0
12969,OPPORTUNE,0,3,0.0


In [229]:
m[m['word1'] == 'A']

Unnamed: 0,word1,start,count,start_percent
1162,A,561,13887,0.040397


In [49]:
def uppercase(matchobj):
    return matchobj.group(0).upper()

def capitalize(s):
    return re.sub('^([a-z])|[\.|\?|\!]\s*([a-z])|\s+([a-z])(?=\.)', uppercase, s)

In [72]:
#haiku = [[]]
#counts = [5, 7, 5]

def get_first_word():
    subset = model2_df[(model2_df['syllables_word1'] <= 5) & (model2_df['start_percent'] > .1)]
    w = subset.sample(n=1).iloc[0]
    return {'word': w['word1'], 'syllables': w['syllables_word1']}
    #return w#['word2'], w['syllables']

def get_word(previous_words, remaining, line):
    if len(previous_words) >= 2:
        subset = model3_df[
            (model3_df['word1'] == previous_words[-2]['word']) &
            (model3_df['word2'] == previous_words[-1]['word']) & 
            (model3_df['syllables'] <= remaining)
        ]
        
        if line == 2:
            subset = subset[(subset['syllables'] < remaining) | (subset['end_percent'] > .1)]
            
        if len(subset) == 0:
            return get_word([previous_words[-1]], remaining, line)
        
        w = subset.sample(n=1, weights='count').iloc[0]
        
        return {'word': w['word3'], 'syllables': w['syllables']}
    else:
        subset = model2_df[
            (model2_df['word1'] == previous_words[-1]['word']) &
            (model2_df['syllables'] <= remaining)
        ]
        
        if line == 2:
            subset = subset[(subset['syllables'] < remaining) | (subset['end_percent'] > .1)]

        w = subset.sample(n=1, weights='count').iloc[0]

        return {'word': w['word2'], 'syllables': w['syllables']}

def generate_haiku():
    #w = model_df[model_df['syllables']<=5].sample(n=1).iloc[0]
    w = get_first_word()
    #previous_word = w#['word2']
    previous_words = [w]
    haiku = [[w], [], []]
    counts = [5 - w['syllables'], 7, 5]

    for i,l in enumerate(counts):
        remaining = l
        while remaining > 0:
            try:
                w = get_word(previous_words, remaining, i)
                previous_words.append(w)
                haiku[i].append(w)
                remaining -= w['syllables']
            except Exception as e:
                raise e
                previous = haiku[i].pop()
                previous_words.pop()
                remaining += previous['syllables']


    print(capitalize("\n".join([" ".join([w['word'] for w in l]) for l in haiku]).lower()))
    
#generate_haiku()
    
generated = False
while not generated:
    try:
        generate_haiku()
        generated = True
    except:
        pass
#generate_haiku()

Veiling her deep heart
of the fatal issue of
the forces on mast


In [258]:
model_df[(model_df['word1'] == 'AJAX')]

Unnamed: 0,word1,word2,count,end,start,syllables,syllables_word1,end_percent,start_percent
304243,AJAX,IN,7,0,0,1,2,0.002937,0.095541
304244,AJAX,AND,11,0,1,1,2,0.000324,0.095541
304245,AJAX,THE,8,0,3,1,2,0.000327,0.095541
304246,AJAX,YOUR,1,0,0,1,2,0.000000,0.095541
304247,AJAX,WHICH,1,0,0,1,2,0.000410,0.095541
304248,AJAX,FROM,2,0,1,1,2,0.002742,0.095541
304249,AJAX,BUT,3,0,0,1,2,0.000360,0.095541
304250,AJAX,TO,4,0,0,1,2,0.000471,0.095541
304251,AJAX,I,1,0,0,1,2,0.010609,0.095541
304252,AJAX,WHO,2,0,0,1,2,0.000882,0.095541


In [159]:
corpus = []
regex = re.compile("[^A-Z\s'-\.]")
for path in pathlib.Path('corpus').glob('*'):
    print(str(path))
    with open(str(path)) as f:
        corpus = corpus + re.split(r'\.\s+', regex.sub('', f.read().upper()))

corpus\1321-0.txt
corpus\pg1322.txt
corpus\pg14568.txt
corpus\pg1567.txt
corpus\pg16452.txt
corpus\pg19221.txt
corpus\pg20.txt
corpus\pg2039.txt
corpus\pg228.txt
corpus\pg23684.txt
corpus\pg4800.txt
corpus\pg9622.txt


In [160]:
corpus[0:10]

['\nTHE PROJECT GUTENBERG EBOOK OF THE WASTE LAND, BY T',
 'S',
 'ELIOT\n\nTHIS EBOOK IS FOR THE USE OF ANYONE ANYWHERE AT NO COST AND WITH\nALMOST NO RESTRICTIONS WHATSOEVER',
 'YOU MAY COPY IT, GIVE IT AWAY OR\nRE-USE IT UNDER THE TERMS OF THE PROJECT GUTENBERG LICENSE INCLUDED\nWITH THIS EBOOK OR ONLINE AT WWW.GUTENBERG.ORG\n\n\nTITLE THE WASTE LAND\n\nAUTHOR T',
 'S',
 'ELIOT\n\n\nMAY,   ETEXT \nLAST UPDATED JULY , \n\nLANGUAGE ENGLISH\n\nCHARACTER SET ENCODING UTF-\n\n*** START OF THIS PROJECT GUTENBERG EBOOK THE WASTE LAND ***\n\n\n\nTEXT FILE PRODUCED BY AN ANONYMOUS PROJECT GUTENBERG VOLUNTEER\n\nHTML FILE PRODUCED BY DAVID WIDGER\n\n\n\n\nTHE WASTE LAND\nBY T',
 'S',
 'ELIOT\n\n\n\nCONTENTS\n\nI',
 'THE BURIAL OF THE DEAD\n\nII',
 'A GAME OF CHESS\n\nIII']

In [8]:
dictionary = {}
with open('phoneme-groups-with-syllables.json') as f:
    for l in f:
        j = json.loads(l)
        dictionary[j['word']] = len(j['syllables'])

In [9]:
len(dictionary)

133334

In [44]:
dictionary['CAN\'T']

1

In [21]:
syllabels = [[] for i in range(20)]
for k in dictionary:
    syllabels[dictionary[k]].append(k)

In [24]:
syllabels[7]

['CONSPIRATORIALLY',
 'SUPERMINICOMPUTERS',
 'NONDISCRIMINATORY',
 'ENVIRONMENTALISM(1)',
 'DENATIONALIZATIONS',
 'ELECTROMAGNETISM',
 'UNCONSTITUTIONALLY',
 'LARYNGOSCOPICALY(1)',
 'SEPTUAGENARIAN',
 'INTERDISCIPLINARY',
 "INDIANAPOLIS'S",
 'MULTILATERALISM',
 'RECAPITALIZATIONS',
 'ANESTHESIOLOGY',
 'INTERNATIONALISM',
 'DEINSTITUTIONALIZE',
 'UNCEREMONIOUSLY',
 'CONFIDENTIALITY(1)',
 'N92762',
 'MINISUPERCOMPUTERS',
 'ELECTROCARDIOGRAMS',
 'SUPERCONDUCTIVITY',
 'AUTOBIOGRAPHICAL',
 'HOMOSEXUALITY',
 'OVERSIMPLIFICATION',
 'EPIDEMIOLOGIST',
 'UNRELIABILITY',
 'INTELLECTUALISM(1)',
 'CYTOMEGALOVIRUS',
 'PARTECIPAZIONI',
 'CONTEMPORANEOUSLY',
 'UNPROFITABILITY',
 'INTERGENERATIONAL',
 'EXTRATERRITORIAL',
 'LARYNGOSCOPICALY',
 'INTELLECTUALISM',
 'MINIATURIZATION',
 'IRRESPONSIBILITY',
 'ANGLO-CATHOLICISM',
 'UNDIFFERENTIATED',
 'OVERSENSITIVITY',
 'EPIDEMIOLOGY',
 'EDITORIALIZING',
 'INDIVIDUALISTIC',
 'ELECTROBIOLOGY',
 'INTERNATIONALISM(1)',
 'OVERWHELMABILITY',
 'BIOTECHNOLOGICAL',


In [29]:
haiku = []
for l in [5, 7, 5]:
    remaining = l
    haiku.append([])
    while remaining > 0:
        s = random.randint(1, remaining)
        haiku[-1].append(random.choice(syllabels[s]))
        remaining -= s
        
haiku

[['JOHANNESSEN', 'SPURRED'],
 ['SUPERCONDUCTIVITY'],
 ['DRACHMAS', 'OBESE', 'SHOOED']]

In [126]:
corpus = []
regex = re.compile("[^A-Z\s'-]")
for path in pathlib.Path('corpus').glob('*'):
    print(str(path))
    with open(str(path)) as f:
        corpus.append(regex.sub('', f.read().upper()).split())

corpus\1321-0.txt
corpus\pg1322.txt
corpus\pg14568.txt
corpus\pg1567.txt
corpus\pg16452.txt
corpus\pg19221.txt
corpus\pg20.txt
corpus\pg2039.txt
corpus\pg228.txt
corpus\pg23684.txt
corpus\pg4800.txt
corpus\pg9622.txt


In [103]:
len(corpus[0])

124386

In [48]:
corpus[0]

['GERONTION',
 'THOU',
 'HAST',
 'NOR',
 'YOUTH',
 'NOR',
 'AGE',
 'BUT',
 'AS',
 'IT',
 'WERE',
 'AN',
 'AFTER',
 'DINNER',
 'SLEEP',
 'DREAMING',
 'OF',
 'BOTH',
 'HERE',
 'I',
 'AM',
 'AN',
 'OLD',
 'MAN',
 'IN',
 'A',
 'DRY',
 'MONTH',
 'BEING',
 'READ',
 'TO',
 'BY',
 'A',
 'BOY',
 'WAITING',
 'FOR',
 'RAIN',
 'I',
 'WAS',
 'NEITHER',
 'AT',
 'THE',
 'HOT',
 'GATES',
 'NOR',
 'FOUGHT',
 'IN',
 'THE',
 'WARM',
 'RAIN',
 'NOR',
 'KNEE',
 'DEEP',
 'IN',
 'THE',
 'SALT',
 'MARSH',
 'HEAVING',
 'A',
 'CUTLASS',
 'BITTEN',
 'BY',
 'FLIES',
 'FOUGHT',
 'MY',
 'HOUSE',
 'IS',
 'A',
 'DECAYED',
 'HOUSE',
 'AND',
 'THE',
 'JEW',
 'SQUATS',
 'ON',
 'THE',
 'WINDOW',
 'SILL',
 'THE',
 'OWNER',
 'SPAWNED',
 'IN',
 'SOME',
 'ESTAMINET',
 'OF',
 'ANTWERP',
 'BLISTERED',
 'IN',
 'BRUSSELS',
 'PATCHED',
 'AND',
 'PEELED',
 'IN',
 'LONDON',
 'THE',
 'GOAT',
 'COUGHS',
 'AT',
 'NIGHT',
 'IN',
 'THE',
 'FIELD',
 'OVERHEAD',
 'ROCKS',
 'MOSS',
 'STONECROP',
 'IRON',
 'MERDS',
 'THE',
 'WOMAN',
 'KEEPS

In [55]:
corpus[0].index("WRATH-BEARING")

361

In [127]:
model = {}
for c in corpus:
    for i in range(len(c) - 1):
        w1 = c[i]
        w2 = c[i+1]
        if w1 not in model:
            model[w1] = {}
        if w2 not in model[w1]:
            model[w1][w2] = 1
        else:
            model[w1][w2] += 1

In [105]:
model['HERE']

{'A': 4,
 "ABSORB'D": 1,
 'ADAM': 1,
 'AGAIN': 1,
 'ALL': 3,
 'AMERICA': 1,
 "AMERICA'S": 1,
 'AND': 26,
 'ANYHOW': 1,
 'APENECK': 1,
 'ARE': 4,
 'ART': 1,
 'AS': 7,
 'AT': 2,
 'BARD': 1,
 'BE': 1,
 'BEAST': 1,
 'BEGINNING': 1,
 'BEHOLD': 1,
 'BELOW': 1,
 'BESIDE': 1,
 'BREATHE': 1,
 'BUILD': 1,
 'BUT': 1,
 'BY': 5,
 'CAPTAIN': 1,
 'CHAINS': 1,
 'CHANTING': 1,
 'CITY': 1,
 'CLIMB': 1,
 'COFFIN': 1,
 'COMES': 2,
 "CONDEMN'D": 1,
 "CONFIN'D": 1,
 'CONFRONT': 1,
 "DANC'D": 1,
 'DEAR': 1,
 'DO': 2,
 'DOES': 1,
 "DRIV'N": 1,
 'EACH': 2,
 'ENDED': 1,
 'FARR': 1,
 'FILL': 1,
 'FIND': 1,
 "FINISH'D": 1,
 'FIRST': 1,
 'FOR': 7,
 'FROM': 3,
 'FULL': 1,
 'GAPE': 1,
 'GOD': 1,
 'GROWS': 1,
 'HAD': 1,
 'HAPPIE': 1,
 'HATCHING': 1,
 'HE': 1,
 "HEAV'NS": 1,
 'HEED': 1,
 'HERE': 3,
 'HOW': 1,
 'HOWEVER': 4,
 'I': 10,
 'IF': 1,
 'IN': 17,
 "INSTALL'D": 1,
 'IS': 18,
 'IT': 2,
 'JOY': 1,
 'KEEP': 1,
 'LANDS': 1,
 'LAUGH': 1,
 'LET': 2,
 'LIGHTS': 1,
 'LIKE': 1,
 'LILAC': 1,
 'LONG': 1,
 'LOVE': 1,
 'MAR

In [128]:
records = [(w1, w2, model[w1][w2]) for w1 in model for w2 in model[w1]]

In [129]:
model_df = pd.DataFrame.from_records(records).rename(columns={0: 'word1', 1:'word2', 2:'count'})
model_df.head()

Unnamed: 0,word1,word2,count
0,SETTLED,IN,1
1,SETTLED,AND,1
2,SETTLED,ON,3
3,SETTLED,SKY,1
4,SETTLED,SWELLFOOT,1


In [108]:
model_df.sort_values('count', ascending=False).head()

Unnamed: 0,word1,word2,count
97661,OF,THE,1432
25068,IN,THE,800
46695,AND,THE,673
104684,TO,THE,502
60655,ON,THE,405


In [130]:
dict_df = pd.DataFrame.from_records([(k, dictionary[k]) for k in dictionary])
dict_df.head()

Unnamed: 0,0,1
0,STEFANICH,3
1,PATAGONIAN,5
2,HAMMAR,2
3,DILLENBURG,3
4,ODDBALL,2


In [131]:
model_df = model_df.merge(dict_df.rename(columns={0: 'word2', 1: 'syllables'}), on='word2')

In [132]:
model_df.sort_values('syllables', ascending=False).head()

Unnamed: 0,word1,word2,count,syllables
369033,THE,INDIVIDUALITY,1,7
382373,OF,UNIMAGINABLE,2,6
405393,EMPLOYEE,IDENTIFICATION,1,6
402157,SPEECH,CONCILIATORY,2,6
409263,OPERATION,UNNECESSARILY,1,6


In [144]:
#haiku = [[]]
#counts = [5, 7, 5]

def get_word(previous_word, remaining):
    subset = model_df[(model_df['word1'] == previous_word) & (model_df['syllables'] <= remaining)]
    w = subset.sample(n=1, weights='syllables').iloc[0]
    return w['word2'], w['syllables']

w = model_df[model_df['syllables']<=5].sample(n=1).iloc[0]
previous_word = w['word2']
haiku = [[previous_word], [], []]
counts = [5 - w['syllables'], 7, 5]

for i,l in enumerate(counts):
    remaining = l
    while remaining > 0:
        w, s = get_word(previous_word, remaining)
        previous_word = w
        haiku[i].append(previous_word)
        remaining -= s
        
print("\n".join([" ".join(w) for w in haiku]).lower())

close embraces tied
unto all started from vague
it indeed do there


In [18]:
re.split(r'[\.!\?;]', "Hi! Hi. Hi? Hi; Hi")

['Hi', ' Hi', ' Hi', ' Hi', ' Hi']

In [19]:
"T  T".split()

['T', 'T']