In [1]:
import re
from time import process_time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
import os
for dirname, _, filenames in os.walk('./'):
    if dirname[:6]!='./.git':
        for filename in filenames:
            print(os.path.join(dirname, filename))

./.gitattributes
./notebook.ipynb
./data\shakespeare.txt
./data\spenser.txt
./data\Syllable_dictionary.txt
./data\syllable_dict_explanation.docx


In [3]:
shakes = open('./data/shakespeare.txt').read()
print(shakes[:100])

1
From fairest creatures we desire increase,
That thereby beauty's rose might nev


There are a few exceptions: Sonnets 99, 126, and 145. Number 99 has fifteen lines. Number 126 consists of six couplets, and two blank lines marked with italic brackets; 145 is in iambic tetrameters, not pentameters. In one other variation on the standard structure, found for example in sonnet 29, the rhyme scheme is changed by repeating the second (B) rhyme of quatrain one as the second (F) rhyme of quatrain three.

Naive parsing from the homework helper ftn

In [4]:
def parse_observations(text):
    # Convert text to dataset.
    lines = [line.split() for line in text.split('\n') if line.split()]

    obs_counter = 0
    obs = []
    obs_map = {}

    for line in lines:
        obs_elem = []
        
        for word in line:
            word = re.sub(r'[^\w]', '', word).lower()
            if word not in obs_map:
                # Add unique words to the observations map.
                obs_map[word] = obs_counter
                obs_counter += 1
            
            # Add the encoded word.
            obs_elem.append(obs_map[word])
        
        # Add the encoded sequence.
        obs.append(obs_elem)

    return obs, obs_map

In [5]:
obs, obs_map = parse_observations(shakes)
print(list(obs_map.keys())[:100])

['1', 'from', 'fairest', 'creatures', 'we', 'desire', 'increase', 'that', 'thereby', 'beautys', 'rose', 'might', 'never', 'die', 'but', 'as', 'the', 'riper', 'should', 'by', 'time', 'decease', 'his', 'tender', 'heir', 'bear', 'memory', 'thou', 'contracted', 'to', 'thine', 'own', 'bright', 'eyes', 'feedst', 'thy', 'lights', 'flame', 'with', 'selfsubstantial', 'fuel', 'making', 'a', 'famine', 'where', 'abundance', 'lies', 'self', 'foe', 'sweet', 'too', 'cruel', 'art', 'now', 'worlds', 'fresh', 'ornament', 'and', 'only', 'herald', 'gaudy', 'spring', 'within', 'bud', 'buriest', 'content', 'churl', 'makst', 'waste', 'in', 'niggarding', 'pity', 'world', 'or', 'else', 'this', 'glutton', 'be', 'eat', 'due', 'grave', 'thee', '2', 'when', 'forty', 'winters', 'shall', 'besiege', 'brow', 'dig', 'deep', 'trenches', 'field', 'youths', 'proud', 'livery', 'so', 'gazed', 'on', 'will']


Reading the given dictionary

In [3]:
syl_dict = pd.read_csv('./data/Syllable_dictionary.txt', sep=' ', names=["word", "length1", "length2"])
syl_dict.fillna(0, inplace=True)
# Syllable length of weak ending is negated, NaNs are replaced with 0
for i in syl_dict.index:
    if syl_dict["length2"][i]==0:                             ### Caveat: There are words with 0 syllables, but they all consists of fixed syllable length.
        syl_dict["length1"][i] = int(syl_dict["length1"][i])
    else:
        if syl_dict["length1"][i][0]=='E':
            syl_dict["length1"][i] = -int(syl_dict["length1"][i][1:])
            syl_dict["length2"][i] = int(syl_dict["length2"][i])
        elif syl_dict["length2"][i][0]=='E':
            syl_dict["length1"][i] = int(syl_dict["length1"][i])
            syl_dict["length2"][i] = -int(syl_dict["length2"][i][1:])
        else:
            syl_dict["length1"][i] = int(syl_dict["length1"][i])
            syl_dict["length2"][i] = int(syl_dict["length2"][i])
syl_dict.set_index("word", inplace=True)
"""
length2==0 iff fixed syllable length
|length1|<|length2| if variable syllable length
"""
syl_dict.head()

Unnamed: 0_level_0,length1,length2
word,Unnamed: 1_level_1,Unnamed: 2_level_1
'gainst,1,0
'greeing,-1,2
'scaped,1,0
'tis,1,0
'twixt,1,0


In [150]:
l = []
for i in range(3205):
    if syl_dict.iloc[i][1]!=0:
        l.append(abs(syl_dict.iloc[i][1])-abs(syl_dict.iloc[i][0]))
print(len(l), max(l), min(l))
"""
Syllable length can vary at most 1
"""

154 1 1


## Master class 'sonnet'

In [35]:
def SonnetLoader(path):
    """
    Load sonnets from txt, return a list consisting of one sonnet per element.
    Each sonnet consists of a list of lines in the sonnet, which is again a list of words in the line.
    TODO: There are apostrophes for possessive form of nouns or quotation marks, which must be distinguished from apostrophe for omission.
    Probably the easiest cure is deleting them manually before loading the sonnets?
    I have checked that sonnet 2, 8 and 145 have a quotation and 14 has a possesive noun (not that no more though).
    Why this a problem? Run "IsRegular" function on the second sonnet and you will see.
    """
    sonnets = []
    if path[0]!='.':
        path = './data/' + path + '.txt'
    with open(path) as f:
        txt = f.read()
        lines = txt.split('\n')
        for i in range(len(lines)):
            lines[i] = re.sub("^\s+", '', lines[i]).lower()
        beginning = 0
        sonnet_is_read = False
        i = 0
        while i<len(lines):
            if lines[i].isdigit():
                beginning = i
                sonnet_is_read = True
            elif len(lines[i])==0:
                if sonnet_is_read:
                    sonnets.append(lines[beginning+1:i])
                    sonnet_is_read = False
            i+=1
        if sonnet_is_read:
            sonnets.append(lines[beginning+1:])
    f.close()
    for sonnet in sonnets:
        for i in range(len(sonnet)):
            sonnet[i] = re.sub(r"[^-'\w\s]", '', sonnet[i]).split()

    return [Sonnet(sonnet) for sonnet in sonnets]

class Sonnet:
    def __init__(self, sonnet):
        self.stringform = sonnet        ### sonnet as a list of words itself
        is_ending = [[False for _ in range(len(line))] for line in sonnet]
        for line in is_ending:
            line[-1] = True
        self.is_ending = is_ending      ### Encoding the location of the end of each lines (having the same shape as stringform)

    def __repr__(self):
        s = ''
        for line in self.stringform:
            for word in line:
                s += word+' '
            s += '\n'
        return s
    
    def SetDict(self, df):              ### Set the syllable dictionary.
        self.dict = df                  ### Temporary format: rows indexed by the words, with two columns of possible syllables
        idxmap = {}
        for i, s in enumerate(self.dict.index.to_numpy()):
            idxmap[s] = i
        self.index_map = idxmap         ### {key:value}={word:idx}
        word_to_idx = []
        for line in self.stringform:
            word_to_idx_line = []
            for word in line:
                word_to_idx_line.append(self.index_map[word])
            word_to_idx.append(word_to_idx_line)
        self.word_to_index = word_to_idx    ### sonnet with words replaced with the corresponding idx

    def IsRegular(self):
        """
        Check if the given sonnet is in regular (pentameter) form.
        Must set the syllable dictionary beforehand.
        With a little modification, can assign a valid syllable length for the words.
        """
        try:
            df = self.dict
            isregular = False
            regularity = 0
            if len(self.stringform)==14:
                for line in self.stringform:
                    syllable_counter_min = 0
                    syllable_counter_max = 0
                    for i in range(len(line)):
                        if i<len(line)-1:
                            if df.loc[line[i]][1]==0:
                                syllable_counter_max += df.loc[line[i]][0]
                                syllable_counter_min += df.loc[line[i]][0]
                            else:
                                if df.loc[line[i]][0]<0:
                                    syllable_counter_max += df.loc[line[i]][1]
                                    syllable_counter_min += df.loc[line[i]][1]
                                elif df.loc[line[i]][1]<0:
                                    syllable_counter_max += df.loc[line[i]][0]
                                    syllable_counter_min += df.loc[line[i]][0]
                                else:
                                    syllable_counter_max += df.loc[line[i]][1]
                                    syllable_counter_min += df.loc[line[i]][0]
                        else:
                            if df.loc[line[i]][1]==0:
                                syllable_counter_max += df.loc[line[i]][0]
                                syllable_counter_min += df.loc[line[i]][0]
                            else:
                                syllable_counter_max += abs(df.loc[line[i]][1])
                                syllable_counter_min += abs(df.loc[line[i]][0])
                    if syllable_counter_min <= 10 <= syllable_counter_max:
                        regularity += 1
            if regularity==14:
                isregular = True
            return isregular

        except AttributeError:
            print("Set the syllable dictionary to use.")

    def WordList(self):
        s = set()
        for line in self.stringform:
            s |= set(line)
        return s
    
    def RhymePair(self):
        pair = []
        paring = [[0,2],[1,3],[4,6],[5,7],[8,10],[9,11],[12,13]]
        for couple in paring:
            i, j = couple
            pair.append({self.stringform[i][-1], self.stringform[j][-1]})
        return pair


In [36]:
a = SonnetLoader('shakespeare')

In [12]:
print(a[1].RhymePair())

[{'now', 'brow'}, {'held', 'field'}, {'eyes', 'lies'}, {'days', 'praise'}, {'use', "excuse'"}, {'thine', 'mine'}, {'old', 'cold'}]


In [37]:
a[0].SetDict(syl_dict)
a[0].IsRegular()

True

In [15]:
a[98].SetDict(syl_dict)
print(a[98].IsRegular())
print(len(a[98].stringform))            ### Sonnet 99 has 15 lines

False
15


In [16]:
a[125].SetDict(syl_dict)
print(a[125].IsRegular())
print(len(a[125].stringform))           ### Sonnet 125 has 12 lines

False
12


In [38]:
a[0].word_to_index

[[1109, 936, 574, 3025, 692, 1403],
 [2719, 2733, 222, 2262, 1696, 1787, 716],
 [391, 151, 2721, 2247, 2405, 393, 2784, 640],
 [1323, 2707, 1294, 1696, 210, 1323, 1682],
 [391, 2753, 535, 2793, 2741, 1895, 364, 926],
 [983, 2775, 1559, 1024, 3116, 2347, 1116],
 [1642, 5, 948, 3057, 14, 1554],
 [2775, 2339, 2775, 1041, 2793, 2775, 2661, 2339, 2807, 588],
 [2753, 2719, 148, 1816, 2721, 3150, 1104, 1871],
 [113, 1861, 1302, 2793, 2721, 1133, 2533],
 [3119, 2741, 1895, 376, 384, 2775, 531],
 [113, 2707, 460, 1637, 3014, 1398, 1798],
 [1974, 2721, 3149, 1868, 839, 2749, 1168, 208],
 [2793, 827, 2721, 3150, 799, 393, 2721, 1192, 113, 2722]]

In [46]:
from HMM import unsupervised_HMM

safelist = list(range(2,20))
safelist.remove(7)
safelist.remove(13)

a[0].SetDict(syl_dict)
a_all = a[0].word_to_index
for i in safelist:
    a[i].SetDict(syl_dict)
    a_all += a[i].word_to_index

n_states = 4
N_iters = 10

HMM = unsupervised_HMM(a_all, n_states, N_iters)

# Print the transition matrix.
print("Transition Matrix:")
print('#' * 70)
for i in range(len(HMM.A)):
    print(''.join("{:<12.3e}".format(HMM.A[i][j]) for j in range(len(HMM.A[i]))))
print('')
print('')

# Print the observation matrix. 
print("Observation Matrix:  ")
print('#' * 70)
for i in range(len(HMM.O)):
    print(''.join("{:<12.3e}".format(HMM.O[i][j]) for j in range(len(HMM.O[i]))))
print('')
print('')

IndexError: list index out of range