In [None]:
%%capture
%load_ext autoreload
%autoreload 2

In [None]:
import string
import datetime as dt
import pandas as pd
import numpy as np
import time
import re
import matplotlib.pyplot as plt 
from plotly.offline import download_plotlyjs, init_notebook_mode
import plotly.graph_objs as go
init_notebook_mode(connected=False)

from src.utils import print2_list, print2, export_ipynb_for_github_pages
from src.plotly import plot_histogram, plot_timeline, plot_horizontal_bar, plot_heatmap
from src.data import load_data

# Widen pandas' display limits so long poems and wide frames are not cut off.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
# None means "never truncate cell contents". The legacy -1 sentinel was
# deprecated in pandas 1.0 and is rejected by newer releases.
pd.set_option('display.max_colwidth', None)

import nltk
from nltk.tokenize import RegexpTokenizer
nltk.download('stopwords')
nltk.download('punkt')

# Load the comments and keep only the rows whose 'type' is 'poem'.
# .copy() detaches the filtered frame from the loaded one: later cells add
# columns to df, and writing to a slice triggers SettingWithCopyWarning.
df = load_data('data/comments.txt', False)
df = df[df['type']=='poem'].copy()

# Word analysis

In [None]:
# Tokens are runs of word characters, so punctuation never becomes a token.
tokenizer = RegexpTokenizer(r'\w+')
stop_words = nltk.corpus.stopwords.words('english')
# Set lookups are O(1); testing every token against the list was a linear scan.
stop_word_set = set(stop_words)
comments = df['poem'].str.cat(sep=' ')
tokens = [token for token in tokenizer.tokenize(comments) if token not in stop_word_set]
frequency_dist = nltk.FreqDist(tokens)
most_common = frequency_dist.most_common(80)

In [None]:
# most_common is ordered from most to least frequent; the chart receives it
# reversed (presumably so the most frequent word is drawn at the top - depends
# on plot_horizontal_bar).
words_ascending = most_common[::-1]
fig = plot_horizontal_bar(
    labels = [word for word, _count in words_ascending],
    values = [count for _word, count in words_ascending],
    # Spelling fixed in the user-facing labels ("occuring" / "Occurence").
    title = 'Most occurring words in comments by u/poem_for_your_sprog',
    xaxis_title = 'Occurrence',
    yaxis_title='')
fig.show()

# What about Timmy?

In [None]:
# Boolean masks over df's rows: does the poem mention Timmy / Timmy dying?
# dtype=bool is explicit because np.array([]) defaults to float64 when there
# are no poems, and the later `~mask` is not defined for float arrays.
comments_about_timmy = np.array(['timmy' in comment for comment in df['poem']], dtype=bool)
comments_about_timmy_fucking_dying = np.array(
    ['timmy fucking died' in comment for comment in df['poem']], dtype=bool)

In [None]:
# Count each mask once, then report the totals and their difference.
n_timmy = comments_about_timmy.sum()
n_timmy_dying = comments_about_timmy_fucking_dying.sum()
print('Comments about Timmy: {}'.format(n_timmy))
print('Comments about Timmy fucking dying: {}'.format(n_timmy_dying))
print('Comments about Timmy that do not end with Timmy fucking dying: {}'
      .format(n_timmy - n_timmy_dying))

In [None]:
# Donut chart: Timmy poems containing 'timmy fucking died' versus the rest.
n_dying = comments_about_timmy_fucking_dying.sum()
n_not_dying = comments_about_timmy.sum() - n_dying
timmy_pie = go.Pie(
    labels=['Timmy fucking dying', 'Timmy not fucking dying'],
    values=[n_dying, n_not_dying],
    hole=.3,
)
fig = go.Figure(data=[timmy_pie])
fig.update_layout(template='simple_white')
fig.show()

So... what happens to Timmy if he doesn't fucking die?

In [None]:
# Poems that mention Timmy but not 'timmy fucking died', sorted by upvotes.
# 'ending' is the text after the last '>' separator in the poem. It is added
# with assign() so the column lands on a new frame rather than on a slice of
# df (which raised SettingWithCopyWarning).
df_timmy_not_dying = (
    df[comments_about_timmy & ~comments_about_timmy_fucking_dying]
    .assign(ending=lambda frame: [poem.split('>')[-1] for poem in frame['poem']])
    .sort_values('ups')
)

In [None]:
# Bar chart of the alternative endings, one bar per poem, sized by upvotes.
timmy_endings_chart = dict(
    labels=df_timmy_not_dying['ending'],
    values=df_timmy_not_dying['ups'],
    title='Best scoring alternative endings to poems about Timmy',
    xaxis_title='Upvotes',
    yaxis_title='',
)
fig = plot_horizontal_bar(**timmy_endings_chart)
fig.show()

# Rhyming

In [None]:
import pronouncing

In [None]:
# The last word of a line: a run of characters other than . ? ! , and
# whitespace, preceded by the start of the line or by whitespace, and followed
# only by trailing punctuation/whitespace. Compiled once instead of being
# evaluated twice per line.
_LAST_WORD_PATTERN = re.compile(r"(?:^|\s)([^\.?!,\s]+)[\.?!,\s']*$")


def get_last_word_per_line(poem):
    """
    Return the last word of every line of a poem.

    Lines are separated by '>'. The result has one entry per line: the last
    word with trailing punctuation removed, or None when the line contains no
    word (e.g. an empty line).

    A line consisting of a single word now yields that word; previously the
    pattern required whitespace before the word, so such lines gave None.
    """
    last_words = []
    for line in poem.split('>'):
        match = _LAST_WORD_PATTERN.search(line)
        last_words.append(match.group(1) if match else None)
    return last_words

In [None]:
# One list of line-ending words per poem, in df's row order.
last_words_list = list(map(get_last_word_per_line, df['poem']))

In [None]:
import string

def get_rhyme_scheme(last_words_per_line):
    """
    Label every line of a poem with a rhyme-scheme letter.

    Parameters
    ----------
    last_words_per_line : list
        The last word of each line, or None where no word was found.

    Returns
    -------
    str
        One character per line. Lines whose last words rhyme (according to
        pronouncing.rhymes) or are identical share a letter; letters are handed
        out in order of first appearance and wrap around after 'z'. Lines
        without a last word are marked '?'.
    """
    alphabet = string.ascii_lowercase
    # np.zeros guarantees every slot starts as ''; np.empty leaves the buffer
    # uninitialised, so the "still unlabelled" test below was not reliable.
    rhyme_scheme = np.zeros(len(last_words_per_line), dtype='<U1')
    k = 0
    for i, word in enumerate(last_words_per_line):
        if rhyme_scheme[i] != '':
            continue  # already labelled by an earlier rhyming line
        if word is None:
            rhyme_scheme[i] = '?'
            continue
        letter = alphabet[k % 26]
        rhyme_set = set(pronouncing.rhymes(word))
        # A word is not listed among its own rhymes, so identical end words
        # are matched explicitly; this also labels line i itself.
        rhymes_with_word = np.array(
            [other is not None and (other == word or other in rhyme_set)
             for other in last_words_per_line],
            dtype=bool)
        rhyme_scheme[rhymes_with_word & (rhyme_scheme == '')] = letter
        k += 1
    return ''.join(rhyme_scheme)

In [None]:
# Derive a rhyme-scheme string per poem and store it alongside the poem.
rhyme_schemes = list(map(get_rhyme_scheme, last_words_list))
df['rhyme_scheme'] = rhyme_schemes

In [None]:
# value_counts() sorts by frequency (descending); keep the 15 most frequent.
rhyme_scheme_counts = df['rhyme_scheme'].value_counts()
most_common_rhyme_schemes = rhyme_scheme_counts.iloc[:15]

In [None]:
# Bar chart of the 15 most common rhyme schemes, passed in reversed
# (least-to-most frequent) order.
rhyme_schemes_ascending = most_common_rhyme_schemes[::-1]
fig = plot_horizontal_bar(
    labels=most_common_rhyme_schemes.index[::-1],
    values=rhyme_schemes_ascending,
    title='The 15 most common rhyming schemes in poems by /u/poem_for_your_sprog',
    xaxis_title='Number of poems',
    yaxis_title='',
    figsize=(800,600),
)
fig.show()

In [None]:
# Keep only the poems written in one of the 10 most common rhyme schemes.
in_top_10_scheme = df['rhyme_scheme'].isin(most_common_rhyme_schemes.index[:10])
df_top_rhymes = df[in_top_10_scheme]

In [None]:
# Ten most upvoted poems per rhyme scheme, ordered by scheme and then by
# upvotes (descending). Sorting first and taking groupby().head() replaces the
# groupby().apply(sort_values) round trip, which only worked while apply()
# handed the grouping column back to the result.
df_top_rhymes = (df_top_rhymes
                 .sort_values(['rhyme_scheme', 'ups'], ascending=[True, False])
                 .reset_index(drop=True)
                 .groupby('rhyme_scheme')
                 .head(10))

In [None]:
# One scatter trace per rhyme scheme: average line length against upvotes for
# that scheme's top poems. Hovering shows the poem, with '>' line separators
# turned into HTML line breaks.
fig = go.Figure()

for rhyme_scheme in most_common_rhyme_schemes.index[:10]:
    has_scheme = df_top_rhymes['rhyme_scheme'] == rhyme_scheme
    df_subset = df_top_rhymes[has_scheme]
    hover_text = [re.sub('>','<br>',comment) for comment in df_subset['poem']]
    trace = go.Scatter(
        x=df_subset['average_line_length'],
        y=df_subset['ups'],
        mode='markers',
        name=rhyme_scheme,
        marker=dict(size=8, line_width=1, opacity=0.7),
        hoverinfo='text',
        text=hover_text,
    )
    fig.add_trace(trace)

fig.update_layout(
    title='Top rated poems in the 10 most common rhyming schemes by u/poem_for_your_sprog',
    title_x=0.5,
    template='simple_white',
    xaxis_title='Average line length',
    yaxis_title='Upvotes',
)
fig.show()

# Rhyme sets

In [None]:
def get_rhyme_tuples(last_words_per_line):
    """
    Collect (word, rhyming word) pairs from the line endings of one poem.

    For every line except the last, the last words of up to the next three
    lines are checked against pronouncing.rhymes(word); the first one that
    rhymes is paired with the word. Lines whose last word is None are skipped.
    Returns a list of 2-tuples, possibly empty.
    """
    rhyme_pairs = []
    n_lines = len(last_words_per_line)
    for position in range(n_lines - 1):
        word = last_words_per_line[position]
        if word is None:
            continue
        rhyming_words = pronouncing.rhymes(word)
        lookahead = last_words_per_line[position + 1:min(n_lines, position + 4)]
        # First following line ending that rhymes, if any.
        partner = next((candidate for candidate in lookahead if candidate in rhyming_words), None)
        if partner is not None:
            rhyme_pairs.append((word, partner))
    return rhyme_pairs

In [None]:
from functools import reduce
from itertools import chain

# Flatten the per-poem pair lists into one list. chain.from_iterable is linear
# and yields [] when there are no poems; reduce(lambda x,y: x+y, ...) copied the
# accumulated list on every step and raised TypeError on an empty sequence.
all_rhyme_tuples = list(chain.from_iterable(get_rhyme_tuples(x) for x in last_words_list))

In [None]:
# Preview the first pairs only; echoing the full list floods the notebook output.
all_rhyme_tuples[:20]

In [None]:
# Flatten the pairs into a single list of words (both members of every pair).
all_rhyme_words = [word for rhyme_pair in all_rhyme_tuples for word in rhyme_pair]

In [None]:
# Single-column frame with one row per rhyming word occurrence.
rhyme_word_columns = {'word': all_rhyme_words}
df_rhyme_words = pd.DataFrame(rhyme_word_columns)

In [None]:
# The 100 words that occur most often in a rhyme pair.
(df_rhyme_words
 .groupby('word')['word']
 .count()
 .sort_values(ascending=False)
 .head(100))

# Metre

In [None]:
from string import punctuation

In [None]:
def get_word_scansion(word):
    """
    Return the stress pattern of a single word as a string of 0's and 1's.

    Surrounding punctuation is stripped first; a word that is empty afterwards
    gives ''. The pattern comes from the first pronunciation that `pronouncing`
    knows for the word. If none is known, the lookup is retried with everything
    from the first apostrophe onwards removed; if that fails too, the result is
    '?'. Stress marker 2 is rewritten to 1.
    """
    bare_word = word.strip(punctuation)
    if not bare_word:
        return ''

    phones = pronouncing.phones_for_word(bare_word)
    if not phones:
        # Retry without the apostrophe suffix.
        phones = pronouncing.phones_for_word(re.sub("'.+", '', bare_word))

    stress_pattern = pronouncing.stresses(phones[0]) if phones else '?'
    return re.sub('2', '1', stress_pattern)

def get_line_scansion(line):
    """
    Return the stress pattern of a whole line as a string of 0's and 1's.

    The line is split on single spaces and the per-word patterns from
    get_word_scansion are concatenated in order.
    """
    word_scansions = map(get_word_scansion, line.split(' '))
    return ''.join(word_scansions)

In [None]:
# Known metres, keyed by name; each value is the expected stress pattern of one
# line (0 = unstressed syllable, 1 = stressed syllable).
# * = catalectic, i.e. the last (unstressed) syllable is omitted
# ** = iambic substitution, i.e. the first (unstressed) syllable is omitted from an anapestic foot

known_metres = {
    'iambic hexameter'       : '010101010101',
    'iambic pentameter'      : '0101010101',
    'iambic tetrameter'      : '01010101',
    'iambic trimeter'        : '010101',
    'iambic dimeter'         : '0101',
    'iambic meter'           : '01',
    
    'anapestic tetrameter'   : '001001001001',
    'anapestic tetrameter**' : '01001001001',
    'anapestic trimeter'     : '001001001',
    'anapestic trimeter**'   : '01001001',
    'anapestic dimeter'      : '001001',
    'anapestic dimeter**'    : '01001',
    'anapestic meter'        : '001',

    'trochaic hexameter'     : '101010101010',
    'trochaic hexameter*'    : '10101010101',
    'trochaic pentameter'    : '1010101010',
    'trochaic pentameter*'   : '101010101',
    'trochaic tetrameter'    : '10101010',
    'trochaic tetrameter*'   : '1010101',
    'trochaic trimeter'      : '101010',
    'trochaic trimeter*'     : '10101',
    # Two-foot lines are named 'dimeter' for the iambic and anapestic metres;
    # the trochaic entries used 'bimeter' and now follow the same naming.
    'trochaic dimeter'       : '1010',
    'trochaic dimeter*'      : '101',
    'trochaic meter'         : '10',
}
# Reverse lookup: stress pattern -> metre name (every pattern above is unique).
# 'kown_metres_inv' (sic) and 'inv_map' are the names existing code refers to;
# 'known_metres_inv' is the correctly spelled alias for new code.
known_metres_inv = kown_metres_inv = inv_map = {v: k for k, v in known_metres.items()}

In [None]:
# Scan every line of every poem, then drop the lines whose scansion is empty.
scansion = [list(map(get_line_scansion, poem.split('>'))) for poem in df['poem']]
df['scansion'] = [[line_scansion for line_scansion in poem_scansion if line_scansion]
                  for poem_scansion in scansion]

In [None]:
def _find_lines_to_merge(syllable_counts, target_lengths, window_sizes=(2, 3, 4)):
    """
    Find the first run of consecutive lines whose syllable counts add up to one
    of target_lengths.

    Targets are tried in the given order and, per target, windows of 2, 3 and
    then 4 lines. A window must be strictly shorter than the whole poem.
    Returns (start_index, number_of_lines), or None when nothing qualifies.
    """
    for target_length in target_lengths:
        for n_lines in window_sizes:
            if n_lines >= len(syllable_counts):
                continue
            # Sliding sum of n_lines consecutive syllable counts.
            window_totals = np.convolve(syllable_counts, np.ones(n_lines, dtype=int), 'valid')
            starts = np.where(window_totals == target_length)[0]
            if len(starts) > 0:
                return int(starts[0]), n_lines
    return None


def combine_line_scansions(scansion_list):
    """ 
    Combines multiple shorter lines into a single line, if the number of syllables is equal.
    This turns for example the following list of scansions per line;
    
    ['11101001011',
     '11101011001',
     '111011',
     '11011']
     
     into
     
     ['11101001011',
      '11101011001',
      '11101111011']

    Only the two largest distinct line lengths are used as targets. Merging is
    repeated until no further run of lines adds up to a target length. The
    input list is not modified; an empty input gives an empty list (it used to
    raise, because the search result was never initialised).
    """
    combined = list(scansion_list)

    while True:
        syllable_counts = [len(line) for line in combined]
        target_lengths = sorted(set(syllable_counts), reverse=True)[:2]
        merge = _find_lines_to_merge(syllable_counts, target_lengths)
        if merge is None:
            return combined
        start, n_lines = merge
        # Replace the run of short lines by their concatenation.
        combined[start:start + n_lines] = [''.join(combined[start:start + n_lines])]
    
            

In [None]:
# Per poem: merge short consecutive lines that together match a longer line length.
df['scansion_altered'] = list(map(combine_line_scansions, df['scansion']))

In [None]:
def same_non_stressed(a,b):
    """
    Count the positions at which both a and b hold '0'.

    Every index of a is inspected, so b must be at least as long as a.
    """
    return sum(1 for position in range(len(a)) if a[position] == '0' and b[position] == '0')

In [None]:
def get_known_metre(scansion_list):
    """
    Estimate the metre(s) of a poem from the scansion of its lines.

    The poem is assumed to use at most two known metres. Because the scansion
    overestimates stressed syllables, candidate metres are ranked by how many
    unstressed positions they share with the line (same_non_stressed).

    Returns the string 'unknown' when no line has the length of a known metre.
    Otherwise returns a list with one metre name per retained line length: only
    lengths occurring on more than one line are retained, and of those at most
    the two largest. The list can therefore be empty.
    """
    # Per line with at least one same-length known metre:
    # (number of syllables, names of the best matching metres).
    candidates_per_line = []
    for line_scansion in scansion_list:
        n_syllables = len(line_scansion)
        scored = [(same_non_stressed(line_scansion, pattern), name)
                  for pattern, name in kown_metres_inv.items()
                  if len(pattern) == n_syllables]
        if not scored:
            continue
        best_score = max(score for score, _name in scored)
        best_names = [name for score, name in scored if score == best_score]
        candidates_per_line.append((n_syllables, best_names))

    if not candidates_per_line:
        return 'unknown'

    # Keep the line lengths seen more than once, largest first, two at most.
    # Shorter outliers are typically lines that could not be combined.
    lengths, length_counts = np.unique([n for n, _names in candidates_per_line],
                                       return_counts=True)
    retained_lengths = sorted(lengths[length_counts > 1].tolist(), reverse=True)[:2]

    result = []
    for length in retained_lengths:
        names = [name
                 for n, best_names in candidates_per_line if n == length
                 for name in best_names]
        unique_names, name_counts = np.unique(names, return_counts=True)
        most_frequent = unique_names[name_counts == name_counts.max()]
        if len(most_frequent) > 1:
            # Tie: pick one of the equally frequent metres at random.
            result.append(np.random.choice(most_frequent))
        else:
            result.append(most_frequent[0])
    return result

In [None]:
# get_known_metre breaks ties with np.random.choice; seed the generator so a
# re-run assigns the same metres.
np.random.seed(42)
df['metre_list'] = [get_known_metre(x) for x in df['scansion_altered']]
# get_known_metre returns the plain string 'unknown' when nothing matches;
# ', '.join() on a string would interleave its characters ("u, n, k, ..."),
# so strings are passed through. An empty list is labelled 'unknown' as well
# instead of becoming ''.
df['metre'] = [x if isinstance(x, str) else (', '.join(x) or 'unknown')
               for x in df['metre_list']]

In [None]:
# Manual inspection of the first poems with their scansion and estimated metre.
# Capped at len(df) so fewer than 100 poems no longer raises IndexError.
for i in range(min(100, len(df))):
    print('\n\n--------')
    print(i)
    print2(df['poem'].iloc[i])
    print(df['scansion_altered'].iloc[i])
    print(df['metre'].iloc[i])

In [None]:
# Per metre: number of poems (n) and mean upvotes, most common metre first.
metre_stats = df.groupby('metre').agg(n=('ups', len), avg_ups=('ups', 'mean'))
df_most_common_metres = metre_stats.sort_values('n', ascending=False).reset_index()
df_most_common_metres_10 = df_most_common_metres.head(10)

In [None]:
# Bar chart of the 10 most common metres, passed in reversed
# (least-to-most common) order.
metres_ascending = df_most_common_metres_10[::-1]
fig = plot_horizontal_bar(
    labels=metres_ascending['metre'],
    values=metres_ascending['n'],
    title='The 10 most common metres in poems by /u/poem_for_your_sprog',
    xaxis_title='Number of poems',
    yaxis_title='',
    figsize=(800,500),
)
fig.show()

In [None]:
# One trace (a single marker) per metre: number of poems against average
# upvotes, with both values repeated in the hover text.
fig = go.Figure()

for metre in df_most_common_metres_10['metre']:
    is_metre = df_most_common_metres_10['metre'] == metre
    df_subset = df_most_common_metres_10[is_metre]
    hover_text = [
        '{}<br>Average upvotes: {}<br>n: {}'.format(row['metre'],round(row['avg_ups'],1),row['n'])
        for index, row in df_subset.iterrows()
    ]
    trace = go.Scatter(
        x=df_subset['n'],
        y=df_subset['avg_ups'],
        mode='markers',
        name=metre,
        marker=dict(size=8, line_width=1, opacity=0.7),
        hoverinfo='text',
        text=hover_text,
    )
    fig.add_trace(trace)

fig.update_layout(
    title='Number of poems vs average number of upvotes of the 10 most common metres<br> by u/poem_for_your_sprog',
    title_x=0.5,
    template='simple_white',
    xaxis_title='n',
    yaxis_title='Average upvotes',
)
fig.show()

In [None]:
# Ten most upvoted poems for each of the 10 most common metres, ordered by
# metre and then by upvotes (descending). Sorting first and taking
# groupby().head() replaces the groupby().apply(sort_values) round trip, which
# only worked while apply() handed the grouping column back to the result.
df_top_metres = df[df['metre'].isin(df_most_common_metres_10['metre'])]
df_top_metres = (df_top_metres
                 .sort_values(['metre', 'ups'], ascending=[True, False])
                 .reset_index(drop=True)
                 .groupby('metre')
                 .head(10))

In [None]:
# One scatter trace per metre: average line length against upvotes for that
# metre's top poems. Hovering shows the poem, with '>' line separators turned
# into HTML line breaks.
fig = go.Figure()

for metre in df_most_common_metres_10['metre']:
    has_metre = df_top_metres['metre'] == metre
    df_subset = df_top_metres[has_metre]
    hover_text = [re.sub('>','<br>',comment) for comment in df_subset['poem']]
    trace = go.Scatter(
        x=df_subset['average_line_length'],
        y=df_subset['ups'],
        mode='markers',
        name=metre,
        marker=dict(size=8, line_width=1, opacity=0.7),
        hoverinfo='text',
        text=hover_text,
    )
    fig.add_trace(trace)

fig.update_layout(
    title='Top rated poems in the 10 most common metres by u/poem_for_your_sprog',
    title_x=0.5,
    template='simple_white',
    xaxis_title='Average line length',
    yaxis_title='Upvotes',
)
fig.show()