In [None]:
%%capture
%load_ext autoreload
%autoreload 2

import os

# Work from the parent directory so that the relative `src` imports and the
# 'data' / 'plots/...' paths used further down resolve.
# Guarded so that re-running this cell does not climb one directory higher
# each time: once a `src` directory is visible we are already in place.
if not os.path.isdir('src'):
    os.chdir('..')

# NLTK setup: fetch the 'stopwords' and 'punkt' resources.
# The download messages are silenced by the %%capture magic at the top of this cell.
import nltk
from nltk.tokenize import RegexpTokenizer
nltk.download('stopwords')
nltk.download('punkt')

In [None]:
import string
import datetime as dt
import pandas as pd
import numpy as np
import time
import re
import matplotlib.pyplot as plt 
from string import punctuation
from plotly.offline import download_plotlyjs, init_notebook_mode
import plotly.graph_objs as go
init_notebook_mode(connected=False)

from src.utils import print2_list, print2
from src.plotly import plot_histogram, plot_timeline, plot_horizontal_bar, plot_heatmap, \
plot_grouped_scatter, plot_multiple_timelines, plot_meter, plot_grouped_boxplot, plot_overlayed_histogram
from src.meter import get_word_scansion, get_line_scansion, get_syllables_per_line_combined, \
combine_line_scansions, merge_lines, get_known_meter
from src.reddit_user_comment_reader import RedditUserCommentReader
from src.data_frame_parser import DataFrameParser

# Relax the pandas display limits so that wide frames and long poem texts
# are shown without truncation.
_display_options = {
    'display.max_rows': 500,
    'display.max_columns': 500,
    'display.width': 1000,
    'display.max_colwidth': None,   # None = never truncate cell contents
}
for _option_name, _option_value in _display_options.items():
    pd.set_option(_option_name, _option_value)

In [None]:
# Read the stored comments of u/poem_for_your_sprog from the 'data' directory
# and run them through the project's DataFrameParser.
comment_reader = RedditUserCommentReader('poem_for_your_sprog','data')
df = comment_reader.read()
df = DataFrameParser().parse(df)

# Keep only the rows typed as 'poem'.
# .copy() makes `df` an independent frame: later cells add columns to it, which
# would otherwise trigger SettingWithCopyWarning on a filtered slice.
df = df[df['type'] == 'poem'].copy()

In [None]:
# Two colours for the meter plots; also reused by the later plot_meter cells.
colorscale=['#d2d2d2','#336699']

# Syllables of a two-line example; the shorter second line is padded with ''
# so both rows have the same width.
z_text =[['i', 'should','have','hur','ried','youth','in','truth'],
    ['and','moved','more','quick','ly','on','','']]
# Stress pattern per syllable (0 / 1), np.nan for the padding cells.
z =[[0,1,0,1,0,1,0,1],
    [0,1,0,1,0,1,np.nan,np.nan]]
# Rows are passed in reversed order
# (presumably the plot draws the first row at the bottom - TODO confirm).
fig = plot_meter(  
    text = z_text[::-1],
    meter = z[::-1],
    title = '',
    colorscale = colorscale
)
fig.show(config={'displayModeBar': False})

In [None]:
# One single-word example per metrical foot: the word split into syllables
# ('z_text') and the matching 0/1 stress pattern ('z').
example_feet = {
    'iambic foot':    {'z_text': [['ex','ist']],          'z': [[0,1]]},
    'trochaic foot':  {'z_text': [['ti','ger']],          'z': [[1,0]]},
    'anapestic foot': {'z_text': [['un','der','stand']],  'z': [[0,0,1]]},
}

# Draw one small meter plot per foot, titled with the foot's name.
for key, foot in example_feet.items():
    fig = plot_meter(
        text=foot['z_text'][::-1],
        meter=foot['z'][::-1],
        title=key,
        colorscale=colorscale,
    )
    fig.show(config={'displayModeBar': False})

In [None]:
# Two-line example: '>' separates the lines, spaces separate the syllables.
first_line, second_line = 'i al ways hoped i\'d have some more>an oth er year or two'.split('>')
# The second line is two syllables shorter, so pad it with empty cells.
z_text = [first_line.split(' '), second_line.split(' ') + ['', '']]
# Matching stress pattern; np.nan marks the padding cells.
z = [
    [0, 1, 0, 1, 0, 1, 0, 1],
    [0, 1, 0, 1, 0, 1, np.nan, np.nan],
]
fig = plot_meter(
    text=z_text[::-1],
    meter=z[::-1],
    title='',
    colorscale=colorscale,
)
fig.show(config={'displayModeBar': False})

In [None]:
# Three-line example, written without padding first.
syllable_rows = [
    ['so','throw','off','the','chains','of','op','pres','sion','said','he'],
    ['be','fair','ly','un','fet','terred'],
    ['and','free','to','be','free'],
]
stress_rows = [
    [0,1,0,0,1,0,0,1,0,0,1],
    [0,1,0,0,1,0],
    [0,1,0,0,1],
]

# Pad every row to the width of the longest line: '' for the text,
# np.nan for the stress values.
row_width = max(len(row) for row in syllable_rows)
z_text = [row + [''] * (row_width - len(row)) for row in syllable_rows]
z = [row + [np.nan] * (row_width - len(row)) for row in stress_rows]

fig = plot_meter(
    text=z_text[::-1],
    meter=z[::-1],
    title='',
    colorscale=colorscale,
)
fig.show(config={'displayModeBar': False})

In [None]:
# The same poem with its second and third line merged into one,
# so both rows now have eleven syllables and the same stress pattern.
z_text = [
    ['so','throw','off','the','chains','of','op','pres','sion','said','he'],
    ['be','fair','ly','un','fet','terred','and','free','to','be','free'],
]
shared_pattern = [0,1,0,0,1,0,0,1,0,0,1]
z = [list(shared_pattern), list(shared_pattern)]

fig = plot_meter(
    text=z_text[::-1],
    meter=z[::-1],
    title='',
    colorscale=colorscale,
)
fig.show(config={'displayModeBar': False})

In [None]:
# Stress patterns of the meters we can recognise: 0 = unstressed, 1 = stressed,
# one character per syllable.
# *  = catalectic, i.e. the last syllable of the line is omitted
# ** = iambic substitution, i.e. the first (unstressed) syllable is omitted
#      from the first anapestic foot

known_meters = {
    'iambic hexameter'       : '010101010101',
    'iambic hexameter*'      : '01010101010',
    'iambic pentameter'      : '0101010101',
    'iambic pentameter*'     : '010101010',
    'iambic tetrameter'      : '01010101',
    'iambic tetrameter*'     : '0101010',
    'iambic trimeter'        : '010101',
    'iambic trimeter*'       : '01010',
    'iambic dimeter'         : '0101',
    'iambic dimeter*'        : '010',
    'iambic monometer'       : '01',
    
    'anapestic tetrameter'   : '001001001001',
    'anapestic tetrameter**' : '01001001001',
    'anapestic trimeter'     : '001001001',
    'anapestic trimeter**'   : '01001001',
    'anapestic dimeter'      : '001001',
    'anapestic dimeter**'    : '01001',
    'anapestic monometer'    : '001',

    'trochaic hexameter'     : '101010101010',
    'trochaic hexameter*'    : '10101010101',
    'trochaic pentameter'    : '1010101010',
    'trochaic pentameter*'   : '101010101',
    'trochaic tetrameter'    : '10101010',
    'trochaic tetrameter*'   : '1010101',
    'trochaic trimeter'      : '101010',
    'trochaic trimeter*'     : '10101',
    # A two-foot line is a "dimeter"; this was previously spelled "bimeter",
    # inconsistent with the iambic and anapestic entries above.
    'trochaic dimeter'       : '1010',
    'trochaic dimeter*'      : '101',
    'trochaic monometer'     : '10',
    
    'amphibrachic dimeter'   : '010010'
}

# The lookup below maps pattern -> name, so every pattern must be unique;
# a duplicate would silently drop one of the meter names.
assert len(set(known_meters.values())) == len(known_meters), 'duplicate stress pattern in known_meters'

# `inv_map` is kept as an alias of the same dict for backward compatibility.
known_meters_inv = inv_map = {v: k for k, v in known_meters.items()}


# DETERMINE SCANSION ----------------------------------------------------------

# Split every poem on '>' into its lines, scan each line, and let
# combine_line_scansions decide from those scansions which lines to combine.
df['poem_as_list'] = [text.split('>') for text in df['poem']]
df['scansion'] = [
    [get_line_scansion(verse) for verse in verses]
    for verses in df['poem_as_list']
]
df['lines_to_combine'] = [combine_line_scansions(scansions) for scansions in df['scansion']]

# Apply the suggested merges to both the scansions and the poem lines
# (merged poem lines are joined with a space), then rebuild the '>'-joined text.
df['scansion_modified'] = [
    merge_lines(scansions, to_combine)
    for scansions, to_combine in zip(df['scansion'], df['lines_to_combine'])
]
df['poem_modified_as_list'] = [
    merge_lines(verses, to_combine, sep = ' ')
    for verses, to_combine in zip(df['poem_as_list'], df['lines_to_combine'])
]
df['poem_modified'] = ['>'.join(verses) for verses in df['poem_modified_as_list']]

# Look up the merged scansions among the known meters; 'meter' is the
# comma-separated label built from the resulting list.
df['meter_list'] = [get_known_meter(scansions, known_meters_inv) for scansions in df['scansion_modified']]
df['meter'] = [', '.join(names) for names in df['meter_list']]


# FIND THE MOST COMMON METERS ----------------------------------------------------------
# Per meter: number of poems (n) and mean upvotes (avg_ups), most frequent first,
# with 'meter' turned back into a regular column.
df_most_common_meters = (
    df
    .groupby('meter')
    .agg(n=('ups', len), avg_ups=('ups', 'mean'))
    .sort_values('n', ascending=False)
    .reset_index()
)
# The ten most frequent meters drive most of the plots below.
df_most_common_meters_10 = df_most_common_meters.head(10)

In [None]:
# Report how much of the corpus the ten most common meters cover.
total_poems = len(df)
top10_poems = df_most_common_meters_10['n'].sum()
summary = 'Total number of poems: {0}\nTotal number of poems in top 10 meters: {1} ({2:.1f}% of total)'.format(
    total_poems,
    top10_poems,
    top10_poems/total_poems*100
)
print(summary)

In [None]:
# Bar chart of the ten most common meters. The rows are fed in reversed order
# (presumably so the most common meter ends up at the top - TODO confirm).
top10_reversed = df_most_common_meters_10.iloc[::-1]
fig = plot_horizontal_bar(
    labels=top10_reversed['meter'],
    values=top10_reversed['n'],
    title='The 10 most common meters in poems by u/poem_for_your_sprog',
    xaxis_title='Number of poems',
    yaxis_title='',
    figsize=(700,400),
)
fig.show()
# Persist the figure as JSON next to the other plots of this notebook.
fig.write_json('plots/2/plot_1_most_common_meters.json')

In [None]:
# Hand-picked two-line example per meter label, keyed by the same label that ends
# up in df['meter']. 'z_text' holds the syllables of each line, 'z' the matching
# stress values (0 / 1); shorter lines are padded with '' and np.nan respectively.
# (The 'anapestic tetrameter' entry used to be listed twice with identical
# content; the duplicate key has been removed.)
meter_examples = {
    'iambic tetrameter, iambic trimeter': {
        'z_text': [
            ['i', 'should','have','hur','ried','youth','in','truth'],
            ['and','moved','more','quick','ly','on','',''],
        ],
        'z': [
            [0,1,0,1,0,1,0,1],
            [0,1,0,1,0,1,np.nan,np.nan],
        ],
    },
    'anapestic tetrameter**': {
        'z_text': [
            ['so', 'throw','off','the','chains','of','op','pres','sion','said','he'],
            ['be','fair','ly','un','fet','terred','and','free','to','be','free'],
        ],
        'z': [
            [0,1,0,0,1,0,0,1,0,0,1],
            [0,1,0,0,1,0,0,1,0,0,1],
        ],
    },
    'iambic tetrameter': {
        'z_text': [
            ['from','time','to','time','i','think','of','then'],
            ['i','turn','my','gaze','be','fore','a','gain'],
        ],
        'z': [
            [0,1,0,1,0,1,0,1],
            [0,1,0,1,0,1,0,1],
        ],
    },
    'trochaic tetrameter, trochaic tetrameter*': {
        'z_text': [
            ['would', 'you','suck','er','punch','a','mon','key?'],
            ['would','you','up','per','cut','a','bear?',''],
        ],
        'z': [
            [1,0,1,0,1,0,1,0],
            [1,0,1,0,1,0,1,np.nan],
        ],
    },
    'trochaic tetrameter': {
        'z_text': [
            ['wave','good','bye','your','e','go','bro','ther'],
            ['kiss','your','kids','and','call','your', 'mo','ther'],
        ],
        'z': [
            [1,0,1,0,1,0,1,0],
            [1,0,1,0,1,0,1,0],
        ],
    },
    'trochaic tetrameter, trochaic trimeter*': {
        'z_text': [
            ['no','one\'s','quite','as','strong','as','stan','ley'],
            ['stan','ley\'s','been','to','war','','',''],
        ],
        'z': [
            [1,0,1,0,1,0,1,0],
            [1,0,1,0,1,np.nan,np.nan,np.nan],
        ],
    },
    'trochaic tetrameter*': {
        'z_text': [
            ['when', 'you\'re','full','of','doubt','and','fear'],
            ['i\'ll','be','with','you','wait','ing','here'],
        ],
        'z': [
            [1,0,1,0,1,0,1],
            [1,0,1,0,1,0,1],
        ],
    },
    'iambic dimeter': {
        'z_text': [
            ['my','name','is','dog'],
            ['and','e','ven', 'though'],
        ],
        'z': [
            [0,1,0,1],
            [0,1,0,1],
        ],
    },
    'amphibrachic dimeter, anapestic dimeter**': {
        'z_text': [
            ['you\'re','sea','soned','in','sad','ness'],
            ['you\'re','prac','ticed','in','doubt',''],
        ],
        'z': [
            [0,1,0,0,1,0],
            [0,1,0,0,1,np.nan],
        ],
    },
    'anapestic tetrameter': {
        'z_text': [
            ['there\'s','a','laugh','on','her','lips','and','a','light','in','her','eye'],
            ['and','a','warmth','in','her','voice','and','a','smile','in','her','sigh'],
        ],
        'z': [
            [0,0,1,0,0,1,0,0,1,0,0,1],
            [0,0,1,0,0,1,0,0,1,0,0,1],
        ],
    },
    'anapestic dimeter': {
        'z_text': [
            ['i','re','mem','ber','the','way'],
            ['that','i','thought','a','bout','love'],
        ],
        'z': [
            [0,0,1,0,0,1],
            [0,0,1,0,0,1],
        ],
    },
}

# Plot the hand-picked example of each of the ten most common meters, numbered by rank.
# Iterating the column directly replaces Series.iteritems(), which was removed in
# pandas 2.0; enumerate() supplies the rank without relying on the index labels.
for rank, meter_name in enumerate(df_most_common_meters_10['meter'], start=1):
    example = meter_examples.get(meter_name)
    if example is None:
        # The top 10 depends on the data, the examples are hand-written:
        # report a missing example instead of aborting the loop with a KeyError.
        print('No example available for meter: {}'.format(meter_name))
        continue
    fig = plot_meter(
        text = example['z_text'][::-1],
        meter = example['z'][::-1],
        title = '{}. {}'.format(rank, meter_name),
        colorscale = colorscale
    )
    fig.show(config={'displayModeBar': False})

In [None]:
# Create a DataFrame with the top 10 poems based on upvotes for each of the 10 most commonly used meters.
# Sorting by meter and then by descending upvotes, followed by groupby().head(10),
# gives the same selection as the former groupby().apply(sort_values) without
# applying a function to the grouping column (deprecated in recent pandas versions,
# where the 'meter' column is no longer passed to the applied function).
in_top10_meter = df['meter'].isin(df_most_common_meters_10['meter'])
df_10_poems_top_meters = (df
                          [in_top10_meter]
                          .sort_values(['meter', 'ups'], ascending=[True, False])
                          .reset_index(drop=True)
                          .groupby('meter')
                          .head(10))

# Hover text: the poem itself, with the '>' line separators turned into HTML line breaks.
top_poems_hover = np.array([
    re.sub('>','<br>',poem_text) for poem_text in df_10_poems_top_meters['poem']
])

# Upvotes against average line length, one group per top-10 meter.
fig = plot_grouped_scatter(
    x=df_10_poems_top_meters['average_line_length'],
    y=df_10_poems_top_meters['ups'],
    groups=df_10_poems_top_meters['meter'],
    unique_groups=df_most_common_meters_10['meter'],
    text=top_poems_hover,
    title='Top rated poems in the 10 most common meters by u/poem_for_your_sprog',
    xaxis_title='Average line length',
    yaxis_title='Upvotes',
)
fig.show()
fig.write_json('plots/2/plot_2_top_rated_in_10_most_common_meters.json')

In [None]:
# Hover text per meter: its name, the average upvotes rounded to one decimal, and the poem count.
avg_ups_hover = np.array([
    '{}<br>Average upvotes: {}<br>n: {}'.format(meter_name, round(mean_ups, 1), poem_count)
    for meter_name, mean_ups, poem_count in zip(
        df_most_common_meters_10['meter'],
        df_most_common_meters_10['avg_ups'],
        df_most_common_meters_10['n'],
    )
])

# One point per top-10 meter, showing its average number of upvotes.
fig = plot_grouped_scatter(
    x=df_most_common_meters_10['meter'],
    y=df_most_common_meters_10['avg_ups'],
    groups=df_most_common_meters_10['meter'],
    unique_groups=df_most_common_meters_10['meter'],
    text=avg_ups_hover,
    title='Average number of upvotes of poems in the 10 most common meters <br>by u/poem_for_your_sprog',
    xaxis_title='',
    yaxis_title='Average upvotes',
)
fig.show()
fig.write_json('plots/2/plot_3_avg_upvotes_per_meter.json')

In [None]:
# Box plot of the upvotes per poem, one box per top-10 meter.
boxplot_settings = dict(
    df=df,
    obs_col='ups',
    group_col='meter',
    unique_groups=df_most_common_meters_10['meter'],
    title="Box Plot of the number of upvotes per poem",
)
fig = plot_grouped_boxplot(**boxplot_settings)
fig.show()
fig.write_json('plots/2/plot_4_box_plot_upvotes_per_meter.json')

In [None]:
# Overlaid histograms of the average line length, one per top-10 meter.
histogram_settings = dict(
    df=df,
    obs_col='average_line_length',
    group_col='meter',
    unique_groups=df_most_common_meters_10['meter'],
    title='Histogram of the average number of characters per line <br>for the ten most commonly used meters',
    xaxis_title='characters',
    yaxis_title='percentage',
)
fig = plot_overlayed_histogram(**histogram_settings)
fig.show()
fig.write_json('plots/2/plot_5_hist_line_length_per_meter.json')

In [None]:
# Create a DataFrame with the total number of poems per meter per month, and the fraction of total
# .copy() gives an independent frame: the assignments below would otherwise write
# to a slice of `df` and trigger SettingWithCopyWarning.
df_meters_per_month = df[['date','meter']].copy()
df_meters_per_month['date'] = pd.to_datetime(df_meters_per_month['date'])
df_meters_per_month['month'] = df_meters_per_month['date'].dt.to_period('M')

# Every meter outside the top 10 is lumped together as 'other'.
outside_top10 = ~df_meters_per_month['meter'].isin(df_most_common_meters_10['meter'])
df_meters_per_month.loc[outside_top10, 'meter'] = 'other'

# Count poems per (month, meter). The unstack / fillna(0) / stack round trip adds an
# explicit zero row for every month-meter combination that has no poems.
df_meters_per_month = (df_meters_per_month
                        .groupby(['month','meter'])['month']
                        .agg(n='count')
                        .unstack('meter')
                        .fillna(0)
                        .stack('meter')
                        .reset_index(inplace=False)
)
# Periods -> timestamps for plotting, then each meter's share of its month's poems.
df_meters_per_month['month'] = [x.to_timestamp() for x in df_meters_per_month['month']]
df_meters_per_month['month_total'] = df_meters_per_month['n'].groupby(df_meters_per_month['month']).transform('sum')
df_meters_per_month['frac']=df_meters_per_month['n']/df_meters_per_month['month_total']

# Hover text per (month, meter) row: meter name, month, poem count and share of that month.
timeline_hover = np.array([
    "{}<br>{}: {:.0f} ({:.2%})".format(meter_name, month_start.strftime("%b %Y"), poem_count, share)
    for meter_name, month_start, poem_count, share in zip(
        df_meters_per_month['meter'],
        df_meters_per_month['month'],
        df_meters_per_month['n'],
        df_meters_per_month['frac'],
    )
])

# One timeline per top-10 meter plus the 'other' bucket.
timeline_groups = list(df_most_common_meters_10['meter']) + ['other']
fig = plot_multiple_timelines(
    x=df_meters_per_month['month'],
    y=df_meters_per_month['frac'],
    groups=df_meters_per_month['meter'],
    unique_groups=timeline_groups,
    text=timeline_hover,
    title='Fraction of poems per meter per month',
    xaxis_title='month',
    yaxis_title='fraction of poems',
    figsize=(1000,1200),
)

fig.show()
fig.write_json('plots/2/plot_6_meter_timelines.json')

In [None]:
# Meter labels outside the top 10 that get a closer look below.
meter_not_found = ['']   # empty label: no meter was assigned
nice_meters = [
    'anapestic dimeter, anapestic trimeter',
    'anapestic tetrameter',
    'anapestic dimeter, anapestic dimeter**',
]
wrong_meter = ['anapestic trimeter**, iambic trimeter']
other_nice_meters = meter_not_found + nice_meters + wrong_meter

# Frequency rows of just those meters (order of the frequency table is kept).
is_selected_meter = df_most_common_meters['meter'].isin(other_nice_meters)
df_other_meters = df_most_common_meters[is_selected_meter]

To round it off, let's look at a selection of meters that did not make the top 10 but ranked somewhere between 11th and 20th:

In [None]:
# Bar chart of the selected other meters, fed in reversed row order
# like the top-10 bar chart.
other_meters_reversed = df_other_meters.iloc[::-1]
fig = plot_horizontal_bar(
    labels=other_meters_reversed['meter'],
    values=other_meters_reversed['n'],
    title='A selection of meters #11 to #20 in poems by u/poem_for_your_sprog',
    xaxis_title='Number of poems',
    yaxis_title='',
)
fig.show()
fig.write_json('plots/2/plot_7_other_meters.json')

I quite like these three:

- anapestic dimeter, anapestic trimeter
- anapestic tetrameter
- anapestic dimeter, anapestic dimeter\*\*

so I decided to plot some poems in a graph so you can check them out yourself. Especially the *anapestic dimeter, anapestic trimeter* has a nice flow to it:

In [None]:
# Three-line example of two anapestic dimeter lines followed by a trimeter line,
# written without padding first.
syllable_rows = [
    ['i','want','na','tu','ral','hair'],
    ['and','a','bos','so','my','pair'],
    ['and','a','de','li','cate','e','le','gant','face'],
]
stress_rows = [
    [0,0,1,0,0,1],
    [0,0,1,0,0,1],
    [0,0,1,0,0,1,0,0,1],
]

# Pad the shorter rows to the longest line: '' for text, np.nan for stresses.
row_width = max(len(row) for row in syllable_rows)
z_text = [row + [''] * (row_width - len(row)) for row in syllable_rows]
z = [row + [np.nan] * (row_width - len(row)) for row in stress_rows]

fig = plot_meter(
    text=z_text[::-1],
    meter=z[::-1],
    title='anapestic dimeter, anapestic trimeter',
    colorscale=colorscale,
)
fig.show(config={'displayModeBar': False})

Then there's the *anapestic trimeter\*\*, iambic trimeter*. These seem to be misclassified poems; they are actually *iambic tetrameter, iambic trimeter*. At the time of writing, however, there are 29 poems classified as the former, while there are 750 classified as the latter. So in general, this does not seem to be a major problem, and the simple classification method seems to be quite accurate.

The last meter that I included is the empty meter. These are poems to which my classification method failed to assign any meter at all. The reason for this is that the library I used is based on the [CMU Pronouncing Dictionary](http://www.speech.cs.cmu.edu/cgi-bin/cmudict), and many of the words in these poems do not occur in that dictionary.

Here's the plot I promised earlier so you can see for yourself:

In [None]:
# Create a DataFrame with the top 10 poems based on upvotes for each of the selected other meters.
# Sorting by meter and then by descending upvotes, followed by groupby().head(10),
# gives the same selection as the former groupby().apply(sort_values) without
# applying a function to the grouping column (deprecated in recent pandas versions,
# where the 'meter' column is no longer passed to the applied function).
df_poems_other_meters = df[df['meter'].isin(df_other_meters['meter'])]
df_10_poems_other_meters = (df_poems_other_meters
                 .sort_values(['meter', 'ups'], ascending=[True, False])
                 .reset_index(drop=True)
                 .groupby('meter')
                 .head(10))

# Hover text: the poem itself, with the '>' line separators turned into HTML line breaks.
other_poems_hover = np.array([
    re.sub('>','<br>',poem_text) for poem_text in df_10_poems_other_meters['poem']
])

# Upvotes against average line length, one group per selected meter.
fig = plot_grouped_scatter(
    x=df_10_poems_other_meters['average_line_length'],
    y=df_10_poems_other_meters['ups'],
    groups=df_10_poems_other_meters['meter'],
    unique_groups=df_other_meters['meter'],
    text=other_poems_hover,
    title='Top rated poems in a selection of meters #11 to #20 by u/poem_for_your_sprog',
    xaxis_title='Average line length',
    yaxis_title='Upvotes',
)
fig.show()
fig.write_json('plots/2/plot_8_scatter_other_meters.json')

That was all for now! I learned a lot about meter in poetry by creating this notebook, and I hope I was able to share some of that with you. But most of all I hope you enjoyed reading this!

In [None]:
# Front matter (YAML between '---' markers) handed to the export helper below.
# NOTE(review): the `layout` key appears twice (post and html_post); which value
# takes effect depends on the consumer's YAML parser - confirm the intended one.
front_matter_str = """---
layout: post
title: "Poetry & Data II: Meter"
subtitle: Analyzing the meter in the poetry of /u/poem_for_your_sprog on Reddit
tags: [python, poetry, poem_for_your_sprog, reddit]
layout: html_post
---"""

# Ask the notebook front end to save a checkpoint; this relies on the
# `IPython.notebook` JavaScript object being available in the browser.
from IPython.display import display, Javascript
display(Javascript('IPython.notebook.save_checkpoint();'))
import time
# Fixed 10 second pause, presumably to let the save finish before the export
# reads the .ipynb file - TODO confirm.
time.sleep(10)
# NOTE(review): neither `export_ipynb_for_github_pages` nor `run_date` is imported
# or defined in the cells visible here; on a fresh kernel this call raises
# NameError unless both are provided elsewhere - verify.
export_ipynb_for_github_pages(filename="2.0-meter.ipynb",
                              front_matter_str=front_matter_str,
                              prefix = run_date.strftime('%Y-%m-%d') +'-')