In [None]:
%%capture
%load_ext autoreload
%autoreload 2

In [None]:
import string
import datetime as dt
import pandas as pd
import numpy as np
import time
import re
import matplotlib.pyplot as plt 
from plotly.offline import download_plotlyjs, init_notebook_mode
import plotly.graph_objs as go
from plotly.subplots import make_subplots
init_notebook_mode(connected=False)

from src.utils import print2_list, print2, export_ipynb_for_github_pages
from src.plotly import plot_histogram, plot_timeline, plot_horizontal_bar, plot_heatmap, plot_grouped_scatter, plot_multiple_timelines
from src.data import load_data

# Widen pandas' display limits so long poems and wide frames are printed in full.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
# None disables column-width truncation. The former -1 sentinel was deprecated in
# pandas 1.0 and is rejected (ValueError) by pandas 2.x.
pd.set_option('display.max_colwidth', None)

import nltk
from nltk.tokenize import RegexpTokenizer
# Fetch the NLTK 'stopwords' and 'punkt' resources; nltk.download is invoked every time this cell runs.
nltk.download('stopwords')
nltk.download('punkt')

# Load the comments file and keep only the rows whose 'type' is 'poem'.
# .copy() turns the filtered selection into an independent frame: later cells add
# columns to `df`, which would otherwise trigger pandas' SettingWithCopyWarning.
df = load_data('data/comments.txt', False)
df = df[df['type'] == 'poem'].copy()

# Metre

In [None]:
from string import punctuation

In [None]:
# * = catalectic, i.e. the last (unstressed) syllable is omitted
# ** = iambic substitution, i.e. the first (unstressed) syllable is omitted from an anapestic foot

# (metre name, scansion pattern) pairs, one pattern character per syllable.
# Judging by the foot names, '0' is an unstressed and '1' a stressed syllable.
# The order of the pairs is the insertion order of `known_metres`.
_METRE_PATTERNS = [
    # iambic: 01
    ('iambic hexameter', '010101010101'),
    ('iambic pentameter', '0101010101'),
    ('iambic tetrameter', '01010101'),
    ('iambic trimeter', '010101'),
    ('iambic dimeter', '0101'),
    ('iambic meter', '01'),

    # anapestic: 001 (the ** variants start with a shortened first foot)
    ('anapestic tetrameter', '001001001001'),
    ('anapestic tetrameter**', '01001001001'),
    ('anapestic trimeter', '001001001'),
    ('anapestic trimeter**', '01001001'),
    ('anapestic dimeter', '001001'),
    ('anapestic dimeter**', '01001'),
    ('anapestic meter', '001'),

    # trochaic: 10 (the * variants drop the final syllable)
    ('trochaic hexameter', '101010101010'),
    ('trochaic hexameter*', '10101010101'),
    ('trochaic pentameter', '1010101010'),
    ('trochaic pentameter*', '101010101'),
    ('trochaic tetrameter', '10101010'),
    ('trochaic tetrameter*', '1010101'),
    ('trochaic trimeter', '101010'),
    ('trochaic trimeter*', '10101'),
    ('trochaic bimeter', '1010'),
    ('trochaic bimeter*', '101'),
    ('trochaic meter', '10'),

    # amphibrachic: 010
    ('amphibrachic dimeter', '010010'),
]

# name -> pattern
known_metres = dict(_METRE_PATTERNS)

# pattern -> name (every pattern occurs exactly once, so nothing is lost by inverting).
# `inv_map` is kept as a second name for the same dict.
known_metres_inv = inv_map = {pattern: name for name, pattern in known_metres.items()}

In [None]:
# Render the metre-name -> scansion-pattern lookup table defined in the previous cell.
display(known_metres)

In [None]:
from src.metre import get_word_scansion, get_line_scansion, get_syllables_per_line_combined, \
combine_line_scansions, merge_lines, get_known_metre

# Step 1 - scan: split every poem into its lines ('>' separates lines), scan each line,
# and let combine_line_scansions work out which lines should be combined.
df['poem_as_list'] = [text.split('>') for text in df['poem']]
df['scansion'] = [
    [get_line_scansion(verse) for verse in verses]
    for verses in df['poem_as_list']
]
df['lines_to_combine'] = [combine_line_scansions(scans) for scans in df['scansion']]

# Step 2 - merge: apply the suggested combinations to both the scansion and the poem text.
df['scansion_modified'] = [
    merge_lines(scans, merges)
    for scans, merges in zip(df['scansion'], df['lines_to_combine'])
]
df['poem_modified_as_list'] = [
    merge_lines(verses, merges, sep=' ')
    for verses, merges in zip(df['poem_as_list'], df['lines_to_combine'])
]
df['poem_modified'] = ['>'.join(verses) for verses in df['poem_modified_as_list']]

# Step 3 - classify: look the merged scansion up in the known metres and
# join the resulting names into one comma-separated label per poem.
df['metre_list'] = [get_known_metre(scans, known_metres_inv) for scans in df['scansion_modified']]
df['metre'] = [', '.join(names) for names in df['metre_list']]

In [None]:
# One row per metre label: number of poems (n) and mean upvotes (avg_ups),
# ordered from the most to the least used metre, with 'metre' as a regular column.
df_most_common_metres = (
    df.groupby('metre')
      .agg(n=('ups', len), avg_ups=('ups', 'mean'))
      .sort_values('n', ascending=False)
      .reset_index()
)

# The ten most frequently used metres.
df_most_common_metres_10 = df_most_common_metres.iloc[:10]

In [None]:
# Report how much of the corpus is covered by the ten most common metres.
n_poems_total = len(df)
n_poems_top10 = df_most_common_metres_10['n'].sum()
print('Total number of poems: {0}\nTotal number of poems in top 10 metres: {1} ({2:.1f}% of total)'.format(
    n_poems_total,
    n_poems_top10,
    n_poems_top10/n_poems_total*100
))

In [None]:
# Bar chart of the ten most common metres.
# The rows are passed in reverse order - presumably so the most common metre is drawn at the top.
top10_reversed = df_most_common_metres_10.iloc[::-1]
fig = plot_horizontal_bar(
    labels=top10_reversed['metre'],
    values=top10_reversed['n'],
    title='The 10 most common metres in poems by /u/poem_for_your_sprog',
    xaxis_title='Number of poems',
    yaxis_title='',
    figsize=(800, 600),
)
fig.show()

In [None]:
# Hover label per metre: its name, its average upvotes (1 decimal) and its number of poems.
top10_hover_text = np.array([
    '{}<br>Average upvotes: {}<br>n: {}'.format(metre_name, round(mean_ups, 1), poem_count)
    for metre_name, mean_ups, poem_count in zip(
        df_most_common_metres_10['metre'],
        df_most_common_metres_10['avg_ups'],
        df_most_common_metres_10['n'],
    )
])

# Number of poems against average upvotes, one point (and one group) per metre.
fig = plot_grouped_scatter(
    x=df_most_common_metres_10['n'],
    y=df_most_common_metres_10['avg_ups'],
    groups=df_most_common_metres_10['metre'],
    unique_groups=df_most_common_metres_10['metre'],
    text=top10_hover_text,
    title='Number of poems vs average number of upvotes of the 10 most common metres<br> by u/poem_for_your_sprog',
    xaxis_title='n',
    yaxis_title='Average upvotes',
)
fig.show()

In [None]:
# Create a DataFrame with the top 10 poems based on upvotes for each of the 10 most commonly used metres.
# .copy() makes the selection an independent frame, so columns can be added to it later
# without pandas' SettingWithCopyWarning.
df_poems_top_metres = df[df['metre'].isin(df_most_common_metres_10['metre'])].copy()

# Order by metre, then by upvotes (highest first), and keep the first 10 rows of each metre.
# A plain sort replaces groupby(...).apply(sort_values): apply() operating on the grouping
# column is deprecated in pandas 2.2, and without that column the second groupby('metre')
# would no longer find its key.
df_10_poems_top_metres = (
    df_poems_top_metres
    .sort_values(['metre', 'ups'], ascending=[True, False])
    .reset_index(drop=True)
    .groupby('metre')
    .head(10)
)

In [None]:
# Hover text: the poem itself, with the '>' line separators turned into HTML line breaks.
top_poems_hover_text = np.array([
    poem_text.replace('>', '<br>') for poem_text in df_10_poems_top_metres['poem']
])

# Average line length against upvotes for the top-rated poems, grouped by metre.
fig = plot_grouped_scatter(
    x=df_10_poems_top_metres['average_line_length'],
    y=df_10_poems_top_metres['ups'],
    groups=df_10_poems_top_metres['metre'],
    unique_groups=df_most_common_metres_10['metre'],
    text=top_poems_hover_text,
    title='Top rated poems in the 10 most common metres by u/poem_for_your_sprog',
    xaxis_title='Average line length',
    yaxis_title='Upvotes',
)
fig.show()

In [None]:
# Create a DataFrame with the total number of poems per metre per month, and the fraction of total.

# Parse the dates and derive a monthly period. .assign() returns a new frame, so the
# filtered selection of `df` is never written to in place (no SettingWithCopyWarning)
# and the cell can be re-run safely.
poem_dates = pd.to_datetime(df_poems_top_metres['date'])
df_poems_top_metres = df_poems_top_metres.assign(
    date=poem_dates,
    month=poem_dates.dt.to_period('M'),
)

# Count poems per (month, metre). The unstack -> fillna(0) -> stack round trip adds an
# explicit 0 row for every metre that was not used in a given month.
df_metres_per_month = (
    df_poems_top_metres
    .groupby(['month', 'metre'])['month']
    .agg(n='count')
    .unstack('metre')
    .fillna(0)
    .stack('metre')
    .reset_index()
)

# Periods -> timestamps (start of the month).
df_metres_per_month['month'] = df_metres_per_month['month'].dt.to_timestamp()

# Share of each metre within its month.
df_metres_per_month['month_total'] = df_metres_per_month.groupby('month')['n'].transform('sum')
df_metres_per_month['frac'] = df_metres_per_month['n'] / df_metres_per_month['month_total']

In [None]:
# Hover label per (month, metre) row, e.g. "Jan 2015: 3 (25.00%)".
timeline_hover_text = np.array([
    "{}: {:.0f} ({:.2%})".format(month_start.strftime("%b %Y"), poem_count, share)
    for month_start, poem_count, share in zip(
        df_metres_per_month['month'],
        df_metres_per_month['n'],
        df_metres_per_month['frac'],
    )
])

# One timeline per metre showing its monthly share of the poems.
fig = plot_multiple_timelines(
    x=df_metres_per_month['month'],
    y=df_metres_per_month['frac'],
    groups=df_metres_per_month['metre'],
    unique_groups=df_most_common_metres_10['metre'],
    text=timeline_hover_text,
    title='Fraction of poems per metre per month',
    xaxis_title='month',
    yaxis_title='fraction of poems',
    figsize=(1000, 1200),
)

fig.show()

In [None]:
def _metre_example(*lines):
    """Build the heatmap grids for one metre from (syllables, stresses) pairs.

    Each pair is a space-separated string of syllables plus a string of '0'/'1'
    marks, one mark per syllable. Lines shorter than the longest one are
    right-padded with '' (text) and NaN (value) so every grid row has the same width.

    Returns a dict with 'z_text' (syllable grid) and 'z' (0/1/NaN grid).
    Raises ValueError when a line's syllables and marks differ in number.
    """
    width = max(len(stresses) for _, stresses in lines)
    z_text, z = [], []
    for syllables, stresses in lines:
        parts = syllables.split()
        if len(parts) != len(stresses):
            raise ValueError('syllables and stress marks differ in length: {!r}'.format(syllables))
        padding = width - len(parts)
        z_text.append(parts + [''] * padding)
        z.append([int(mark) for mark in stresses] + [np.nan] * padding)
    return {'z_text': z_text, 'z': z}


# Two example lines, split into syllables, for each of the ten metre labels shown below.
metre_examples = {
    'iambic tetrameter, iambic trimeter': _metre_example(
        ('i should have hur ried youth in truth', '01010101'),
        ('and moved more quick ly on', '010101'),
    ),
    'anapestic tetrameter**': _metre_example(
        ('so throw off the chains of op pres sion said he', '01001001001'),
        # 'tered', not 'terred': the syllables must spell "unfettered"
        ('be fair ly un fet tered and free to be free', '01001001001'),
    ),
    'iambic tetrameter': _metre_example(
        ('from time to time i think of then', '01010101'),
        ('i turn my gaze be fore a gain', '01010101'),
    ),
    'trochaic tetrameter, trochaic tetrameter*': _metre_example(
        ('would you suck er punch a mon key?', '10101010'),
        ('would you up per cut a bear?', '1010101'),
    ),
    'trochaic tetrameter': _metre_example(
        ('wave good bye your e go bro ther', '10101010'),
        ('kiss your kids and call your mo ther', '10101010'),
    ),
    'amphibrachic dimeter, anapestic dimeter**': _metre_example(
        ("you're sea soned in sad ness", '010010'),
        ("you're prac ticed in doubt", '01001'),
    ),
    'trochaic tetrameter*': _metre_example(
        ("when you're full of doubt and fear", '1010101'),
        ("i'll be with you wait ing here", '1010101'),
    ),
    'trochaic tetrameter, trochaic trimeter*': _metre_example(
        ("no one's quite as strong as stan ley", '10101010'),
        ("stan ley's been to war", '10101'),
    ),
    'iambic dimeter': _metre_example(
        ('my name is dog', '0101'),
        ('and e ven though', '0101'),
    ),
    'anapestic dimeter': _metre_example(
        ('i re mem ber the way', '001001'),
        ('that i thought a bout love', '001001'),
    ),
}
          
import plotly.figure_factory as ff

# Two-colour scale for the heatmap values: grey at the low end, blue at the high end.
colorscale = ['#d2d2d2', '#1f77b4']

# One small annotated heatmap per metre: the cell values come from 'z', the cell labels from 'z_text'.
for metre_name, example in metre_examples.items():
    # Both grids are reversed row-wise - presumably so the first example line is drawn on top.
    stress_rows = example['z'][::-1]
    syllable_rows = example['z_text'][::-1]

    fig = ff.create_annotated_heatmap(
        stress_rows,
        annotation_text=syllable_rows,
        colorscale=colorscale,
    )
    fig.update_layout(
        title=metre_name,
        height=150,
        # 50 px per syllable column plus 200 px, matching the right-hand margin
        width=50 * len(example['z'][0]) + 200,
        margin={'l': 20, 'r': 200, 't': 25, 'b': 10},
        scene={'aspectmode': 'data'},
        paper_bgcolor='rgba(0,0,0,0)',
        plot_bgcolor='rgba(0,0,0,0)',
        xaxis={'showgrid': False},
        yaxis={'showgrid': False},
    )
    fig.show()

In [None]:
# less popular metres

In [None]:
# Rows 10-19 (0-based) of the frequency-sorted metre table, i.e. the 11th to 20th most common metres.
df_most_common_metres_10to20 = df_most_common_metres.iloc[10:20]

In [None]:
# Bar chart of the 11th to 20th most common metres.
# The title used to be a copy of the top-10 chart's title and mislabelled this figure.
# The rows are passed in reverse order - presumably so the most common metre is drawn at the top.
fig = plot_horizontal_bar(
    labels = df_most_common_metres_10to20['metre'][::-1],
    values = df_most_common_metres_10to20['n'][::-1],
    title = 'The 11th to 20th most common metres in poems by /u/poem_for_your_sprog',
    xaxis_title = 'Number of poems',
    yaxis_title='',
    figsize=(800,600)
)
fig.show()

In [None]:
# Create a DataFrame with the top 10 poems based on upvotes for each of the 11th to 20th most commonly used metres.
# .copy() makes the selection an independent frame rather than a slice of `df`.
df_poems_top_metres = df[df['metre'].isin(df_most_common_metres_10to20['metre'])].copy()

# Order by metre, then by upvotes (highest first), and keep the first 10 rows of each metre.
# A plain sort replaces groupby(...).apply(sort_values): apply() operating on the grouping
# column is deprecated in pandas 2.2, and without that column the second groupby('metre')
# would no longer find its key.
df_10_poems_top_metres = (
    df_poems_top_metres
    .sort_values(['metre', 'ups'], ascending=[True, False])
    .reset_index(drop=True)
    .groupby('metre')
    .head(10)
)

# Average line length against upvotes; the hover text is the poem with '>' shown as a line break.
# The title now names the 11th-20th metres (it used to repeat the top-10 title).
fig = plot_grouped_scatter(
    x = df_10_poems_top_metres['average_line_length'],
    y = df_10_poems_top_metres['ups'],
    groups = df_10_poems_top_metres['metre'],
    unique_groups = df_most_common_metres_10to20['metre'],
    text = np.array([re.sub('>','<br>',comment) for comment in df_10_poems_top_metres['poem']]),
    title = 'Top rated poems in the 11th to 20th most common metres by u/poem_for_your_sprog',
    xaxis_title = 'Average line length',
    yaxis_title = 'Upvotes'
)
fig.show()

In [None]:
# shortcomings
# - Most poems containing the word 'every' are misclassified

In [None]:
# Positions (0-based, usable with .iloc) of the poems whose text contains this phrase.
np.where(['i never chose to run the race' in x for x in df['poem']])

In [None]:
# Position (for .iloc) of the poem inspected in the following cells.
# NOTE(review): hard-coded; presumably copied from the phrase search above - confirm it still matches when the data changes.
i = 1109

In [None]:
# Per-line scansion of the inspected poem, before any lines are merged.
df['scansion'].iloc[i]

In [None]:
# Scansion of the same poem after merge_lines has combined the suggested lines.
df['scansion_modified'].iloc[i]

In [None]:
# Metre label assigned to the poem under inspection.
# Uses `i` instead of a second hard-coded position (552), so this cell looks at the
# same row as the two scansion cells before it.
df['metre'].iloc[i]

In [None]:

# NOTE(review): this cell repeats the scansion/metre pipeline that already ran earlier in
# the notebook - presumably to recompute the columns after editing src.metre. Consider
# moving the steps into one function instead of keeping two copies.

# Scan every line of every poem ('>' separates lines) and find the lines to combine.
df['poem_as_list'] = [text.split('>') for text in df['poem']]
df['scansion'] = [
    [get_line_scansion(verse) for verse in verses]
    for verses in df['poem_as_list']
]
df['lines_to_combine'] = [combine_line_scansions(scans) for scans in df['scansion']]

# Apply the suggested combinations to the scansion and to the poem text.
df['scansion_modified'] = [
    merge_lines(scans, merges)
    for scans, merges in zip(df['scansion'], df['lines_to_combine'])
]
df['poem_modified_as_list'] = [
    merge_lines(verses, merges, sep=' ')
    for verses, merges in zip(df['poem_as_list'], df['lines_to_combine'])
]
df['poem_modified'] = ['>'.join(verses) for verses in df['poem_modified_as_list']]

# Match the merged scansion against the known metres; join the names into one label.
df['metre_list'] = [get_known_metre(scans, known_metres_inv) for scans in df['scansion_modified']]
df['metre'] = [', '.join(names) for names in df['metre_list']]