In [None]:
%%capture
%load_ext autoreload
%autoreload 2

import nltk
from nltk.tokenize import RegexpTokenizer
# Fetch the NLTK 'stopwords' and 'punkt' resources up front.
# NOTE(review): nothing in this notebook uses them directly; presumably the
# src.* helpers imported below rely on them - confirm before removing.
nltk.download('stopwords')
nltk.download('punkt')

In [None]:
import datetime as dt
import re
import string
import time
from functools import reduce
from string import punctuation

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.graph_objs as go
import pronouncing
from plotly.offline import download_plotlyjs, init_notebook_mode
# Switch plotly to offline notebook rendering.
# NOTE(review): connected=False should make plotly embed plotly.js in the
# notebook instead of loading it from a CDN - confirm against the plotly docs.
init_notebook_mode(connected=False)

from src.utils import print2_list, print2, export_ipynb_for_github_pages
from src.plotly import plot_histogram, plot_timeline, plot_horizontal_bar, plot_heatmap, \
plot_grouped_scatter, plot_multiple_timelines, plot_meter, plot_grouped_boxplot, plot_overlayed_histogram
from src.meter import get_word_scansion, get_line_scansion, get_syllables_per_line_combined, \
combine_line_scansions, merge_lines, get_known_meter
from src.rhyme import get_last_words_list, get_rhyme_scheme
from src.data import load_data


# Widen pandas' display limits so long poems and wide frames are shown in full.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
# None means "never truncate cell contents". The former -1 sentinel was
# deprecated in pandas 1.0 and is rejected by current releases.
pd.set_option('display.max_colwidth', None)

# Load the comments and keep only the rows flagged as poems. The explicit
# .copy() detaches the filtered frame from the loaded one, so the columns
# added to `df` in later cells do not trigger SettingWithCopyWarning.
df = load_data('data/comments.txt', False)
df = df[df['type'] == 'poem'].copy()

# Metre

In [None]:
# Each pattern has one character per syllable. Going by the iambic ('01') and
# trochaic ('10') entries, '0' is an unstressed and '1' a stressed syllable.
#
# * = catalectic, i.e. the final syllable of the line is omitted
#     (unstressed for trochaic meters, stressed for iambic ones)
# ** = iambic substitution, i.e. the first (unstressed) syllable is omitted from an anapestic foot

known_meters = {
    'iambic hexameter'       : '010101010101',
    'iambic hexameter*'      : '01010101010',
    'iambic pentameter'      : '0101010101',
    'iambic pentameter*'     : '010101010',
    'iambic tetrameter'      : '01010101',
    'iambic tetrameter*'     : '0101010',
    'iambic trimeter'        : '010101',
    'iambic trimeter*'       : '01010',
    'iambic dimeter'         : '0101',
    'iambic dimeter*'        : '010',
    'iambic monometer'       : '01',

    'anapestic tetrameter'   : '001001001001',
    'anapestic tetrameter**' : '01001001001',
    'anapestic trimeter'     : '001001001',
    'anapestic trimeter**'   : '01001001',
    'anapestic dimeter'      : '001001',
    'anapestic dimeter**'    : '01001',
    'anapestic monometer'    : '001',

    'trochaic hexameter'     : '101010101010',
    'trochaic hexameter*'    : '10101010101',
    'trochaic pentameter'    : '1010101010',
    'trochaic pentameter*'   : '101010101',
    'trochaic tetrameter'    : '10101010',
    'trochaic tetrameter*'   : '1010101',
    'trochaic trimeter'      : '101010',
    'trochaic trimeter*'     : '10101',
    # Two-foot lines are called "dimeter" for every other foot type in this
    # table; the trochaic entries used to be labelled "bimeter".
    'trochaic dimeter'       : '1010',
    'trochaic dimeter*'      : '101',
    'trochaic monometer'     : '10',

    'amphibrachic dimeter'   : '010010'
}

# Reverse lookup: stress pattern -> meter name. `inv_map` is kept as an alias
# of the same dict in case other cells still refer to it.
known_meters_inv = inv_map = {v: k for k, v in known_meters.items()}

# Two meters sharing one pattern would silently overwrite each other in the
# reverse lookup, so fail loudly if that ever happens.
assert len(known_meters_inv) == len(known_meters), 'duplicate stress pattern in known_meters'


# DETERMINE SCANSION ----------------------------------------------------------

# Split each poem into its lines ('>' is the line separator), scan every line,
# and work out which lines should be combined based on that scansion.
df['poem_as_list'] = [poem.split('>') for poem in df['poem']]
df['scansion'] = [[get_line_scansion(line) for line in poem] for poem in df['poem_as_list']]
df['lines_to_combine'] = [combine_line_scansions(x) for x in df['scansion']]

# Apply the suggested merges to both the scansion and the poem text.
# zip() walks just the two columns each step needs; iterrows() built a full
# row Series per poem only to read two fields from it.
df['scansion_modified'] = [
    merge_lines(scansion, lines_to_combine)
    for scansion, lines_to_combine in zip(df['scansion'], df['lines_to_combine'])
]
df['poem_modified_as_list'] = [
    merge_lines(poem_lines, lines_to_combine, sep=' ')
    for poem_lines, lines_to_combine in zip(df['poem_as_list'], df['lines_to_combine'])
]
df['poem_modified'] = ['>'.join(x) for x in df['poem_modified_as_list']]

# Look up which of the known meters the (merged) scansion corresponds to, and
# keep both the list and a comma-separated label.
df['meter_list'] = [get_known_meter(x, known_meters_inv) for x in df['scansion_modified']]
df['meter'] = [', '.join(x) for x in df['meter_list']]


# FIND THE MOST COMMON METERS ----------------------------------------------------------
# Per meter label: number of poems (n) and mean upvotes (avg_ups), ordered
# from the most to the least frequent meter, with 'meter' as a regular column.
df_most_common_meters = (
    df
    .groupby('meter')
    .agg(
        n=('ups', len),
        avg_ups=('ups', 'mean'),
    )
    .sort_values('n', ascending=False)
    .reset_index()
)

# The ten most frequent meters.
df_most_common_meters_10 = df_most_common_meters.iloc[:10]

# Rhyming

In [None]:
# Run get_last_words_list over every merged poem and store the result per poem.
df['last_words_list'] = df['poem_modified'].map(get_last_words_list)

In [None]:
# Derive one rhyme scheme per poem from its line-ending words, keeping the
# plain list around as well as the DataFrame column.
rhyme_schemes = list(map(get_rhyme_scheme, df['last_words_list']))
df['rhyme_scheme'] = rhyme_schemes

In [None]:
# Frequency of each rhyme scheme, limited to the 40 most common ones.
most_common_rhyme_schemes = df['rhyme_scheme'].value_counts().head(40)

# The title is built from the number of schemes actually plotted; it used to
# say "15" while 40 bars were drawn. Labels and values are both reversed so
# they stay aligned (presumably so the most common scheme ends up on top).
fig = plot_horizontal_bar(
    labels = most_common_rhyme_schemes.index[::-1],
    values = most_common_rhyme_schemes[::-1],
    title = (f'The {len(most_common_rhyme_schemes)} most common rhyming schemes '
             'in poems by /u/poem_for_your_sprog'),
    xaxis_title = 'Number of poems',
    yaxis_title='',
    figsize=(800,1400)
)
fig.show()

In [None]:
# Keep only the poems whose rhyme scheme is one of the ten most common.
df_top_rhymes = df.loc[
    df['rhyme_scheme'].isin(most_common_rhyme_schemes.index[:10])
]

In [None]:
# Keep the 10 most upvoted poems of every rhyme scheme.
# A two-key sort (scheme ascending, upvotes descending) yields the same
# ordering as the former groupby().apply(sort_values) without depending on
# apply() passing the grouping column through - behaviour that pandas 2.2
# deprecates, and without which the second groupby cannot find 'rhyme_scheme'.
df_top_rhymes = (df_top_rhymes
                 .sort_values(['rhyme_scheme', 'ups'], ascending=[True, False])
                 .reset_index(drop=True)
                 .groupby('rhyme_scheme')
                 .head(10))

In [None]:
fig = go.Figure()

# One marker trace per rhyme scheme (ten most common, most frequent first):
# average line length against upvotes, with the poem text shown on hover.
for rhyme_scheme in most_common_rhyme_schemes.index[:10]:
    df_subset = df_top_rhymes.loc[df_top_rhymes['rhyme_scheme'] == rhyme_scheme]
    # '>' separates the lines of a poem; turn it into an HTML line break.
    hover_text = [comment.replace('>', '<br>') for comment in df_subset['poem']]
    scatter = go.Scatter(
        x=df_subset['average_line_length'],
        y=df_subset['ups'],
        mode='markers',
        name=rhyme_scheme,
        marker={'size': 8, 'line_width': 1, 'opacity': 0.7},
        hoverinfo='text',
        text=hover_text,
    )
    fig.add_trace(scatter)

fig.update_layout(
    template='simple_white',
    title='Top rated poems in the 10 most common rhyming schemes by u/poem_for_your_sprog',
    title_x=0.5,
    xaxis_title='Average line length',
    yaxis_title='Upvotes',
)
fig.show()

# Rhyme sets

In [None]:
def get_rhyme_tuples(last_words_per_line, max_distance=3):
    """Pair each line-ending word with the nearest later line-ending word it rhymes with.

    For every position except the last, the following ``max_distance`` entries
    are checked in order, and the first one contained in
    ``pronouncing.rhymes(word)`` is paired with the current word.

    Parameters
    ----------
    last_words_per_line : sequence of (str or None)
        Last word of each line, in line order. A ``None`` entry is skipped as
        the first word of a pair and never matches as the second.
    max_distance : int, default 3
        How many following lines are searched for a rhyme.

    Returns
    -------
    list of tuple
        ``(word, rhyming_word)`` pairs in line order, at most one per line.

    Raises
    ------
    ValueError
        If ``max_distance`` is negative.
    """
    if max_distance < 0:
        raise ValueError('max_distance must be non-negative')

    all_rhymes = []
    for i in range(len(last_words_per_line) - 1):
        word = last_words_per_line[i]
        if word is None:
            continue
        # A set makes the repeated membership checks below cheap.
        rhymes_with = set(pronouncing.rhymes(word))
        # Slicing stops at the end of the sequence on its own, so no explicit
        # bound against len() is needed.
        for candidate in last_words_per_line[i + 1:i + 1 + max_distance]:
            if candidate in rhymes_with:
                all_rhymes.append((word, candidate))
                break  # only the nearest rhyme counts
    return all_rhymes

In [None]:
# Collect the rhyme pairs of every poem into one flat list.
# The per-poem word lists live in df['last_words_list']; a bare
# `last_words_list` name is not defined anywhere on a fresh kernel.
# A single comprehension also copes with an empty frame and avoids the
# repeated list copying of reduce(lambda x, y: x + y, ...).
all_rhyme_tuples = [
    rhyme_pair
    for last_words in df['last_words_list']
    for rhyme_pair in get_rhyme_tuples(last_words)
]

In [None]:
# Preview only the first pairs; echoing the whole list floods the output.
all_rhyme_tuples[:20]

In [None]:
# Flatten the (word, rhyming_word) pairs into a single list of words.
all_rhyme_words = []
for rhyme_pair in all_rhyme_tuples:
    all_rhyme_words.extend(rhyme_pair)

In [None]:
# One row per rhyming word, in a single 'word' column.
df_rhyme_words = pd.DataFrame(all_rhyme_words, columns=['word'])

In [None]:
# The 100 words that take part in rhymes most often, with their counts.
(
    df_rhyme_words
    .groupby('word')['word']
    .count()
    .sort_values(ascending=False)
    .head(100)
)