In [None]:
%%capture
# The %%capture cell magic above silences this cell's output (e.g. the NLTK download logs).
# autoreload mode 2 picks up edits to imported modules without restarting the kernel.
%load_ext autoreload
%autoreload 2

import nltk
from nltk.tokenize import RegexpTokenizer
# 'stopwords' backs nltk.corpus.stopwords used in the word analysis below.
# NOTE(review): 'punkt' is not visibly used by the RegexpTokenizer in this notebook — confirm it is still needed.
nltk.download('stopwords')
nltk.download('punkt')

In [None]:
import datetime as dt
import os
import pickle
import re
import string
import textwrap
import time
from string import punctuation

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.graph_objs as go
import requests
from plotly.offline import download_plotlyjs, init_notebook_mode

from src.string import clean_comment
init_notebook_mode(connected=False)

from src.utils import print2_list, print2, export_ipynb_for_github_pages
from src.plotly import plot_scatter, plot_histogram, plot_timeline, plot_horizontal_bar, plot_heatmap, \
plot_grouped_scatter, plot_multiple_timelines, plot_meter, plot_grouped_boxplot, plot_overlayed_histogram
from src.meter import get_word_scansion, get_line_scansion, get_syllables_per_line_combined, \
combine_line_scansions, merge_lines, get_known_meter
from src.data import load_data

# Widen pandas' display limits so wide frames and long text cells are shown in full.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
# None means "do not truncate cell contents". The former sentinel -1 has been
# deprecated since pandas 1.0 and is no longer accepted by pandas 2.x.
pd.set_option('display.max_colwidth', None)

# Load the comments, then keep only rows dated up to and including today
# whose type is 'poem'.
# NOTE(review): the meaning of load_data's second positional flag lives in src.data — confirm.
df = load_data('data/comments.txt', False)
run_date = dt.datetime.today().date()
df = df[df['date'] <= run_date]
df = df[df['type'] == 'poem']

# Word analysis

In [None]:
# Concatenate all poems into one string, split it into \w+ tokens, drop English
# stop words and keep the 80 most frequent remaining tokens.
tokenizer = RegexpTokenizer(r'\w+')
stop_words = nltk.corpus.stopwords.words('english')
comments = df['poem'].str.cat(sep=' ')
tokens = tokenizer.tokenize(comments)
# Test membership against a set (O(1) per token) instead of scanning the
# stop-word list for every token; `stop_words` itself is left untouched.
stop_word_set = set(stop_words)
tokens = [t for t in tokens if t not in stop_word_set]
frequency_dist = nltk.FreqDist(tokens)
most_common = frequency_dist.most_common(80)

In [None]:
# Feed the (word, count) pairs to the chart in ascending order of frequency.
# NOTE(review): presumably reversed so the most frequent word is drawn at the top — confirm in src.plotly.
words_ascending = most_common[::-1]
fig = plot_horizontal_bar(
    labels=[word for word, _ in words_ascending],
    values=[count for _, count in words_ascending],
    # Spelling of the user-facing labels fixed ("occuring" / "Occurence").
    title='Most occurring words in comments by u/poem_for_your_sprog',
    xaxis_title='Occurrence',
    yaxis_title='')
fig.show()

# What about Timmy?

In [None]:
def _poems_containing(phrase):
    """Return an array with one entry per poem in df['poem']: True when `phrase` occurs in it."""
    return np.array([phrase in poem for poem in df['poem']])


# Masks over df's rows: poems mentioning Timmy, and the subset containing the
# "timmy fucking died" phrase.
comments_about_timmy = _poems_containing('timmy')
comments_about_timmy_fucking_dying = _poems_containing('timmy fucking died')

In [None]:
# Tally each mask once, then report both counts and their difference.
timmy_total = comments_about_timmy.sum()
timmy_dying_total = comments_about_timmy_fucking_dying.sum()
for template, count in (
    ('Comments about Timmy: {}', timmy_total),
    ('Comments about Timmy fucking dying: {}', timmy_dying_total),
    ('Comments about Timmy that do not end with Timmy fucking dying: {}',
     timmy_total - timmy_dying_total),
):
    print(template.format(count))

In [None]:
# Donut chart splitting the Timmy poems into those containing the
# "timmy fucking died" phrase and the remainder.
timmy_dying_count = comments_about_timmy_fucking_dying.sum()
timmy_other_count = comments_about_timmy.sum() - timmy_dying_count
timmy_pie = go.Pie(
    labels=['Timmy fucking dying', 'Timmy not fucking dying'],
    values=[timmy_dying_count, timmy_other_count],
    hole=.3,
)
fig = go.Figure(data=[timmy_pie])
fig.update_layout(template='simple_white')
fig.show()

So... what happens to Timmy if he doesn't fucking die?

In [None]:
# Poems that mention Timmy but do not contain the "timmy fucking died" phrase.
# .copy() gives an independent frame, so adding the 'ending' column below does not
# write to a slice of `df` (which triggers pandas' SettingWithCopyWarning).
df_timmy_not_dying = df[comments_about_timmy & ~comments_about_timmy_fucking_dying].copy()
# The ending is whatever follows the last '>' in the poem.
# NOTE(review): '>' appears to be the line separator used in the 'poem' column — confirm in src.data.
df_timmy_not_dying['ending'] = [poem.split('>')[-1] for poem in df_timmy_not_dying['poem']]
df_timmy_not_dying = df_timmy_not_dying.sort_values('ups')

In [None]:
# One bar per alternative ending, sized by the poem's upvotes.
ending_chart_options = dict(
    labels=df_timmy_not_dying['ending'],
    values=df_timmy_not_dying['ups'],
    title='Best scoring alternative endings to poems about Timmy',
    xaxis_title='Upvotes',
    yaxis_title='',
)
fig = plot_horizontal_bar(**ending_chart_options)
fig.show()

In [None]:
df_timmy = df[comments_about_timmy]

# Text attached to each point: upvotes, the awards summary, then the poem with
# every '>' turned into an HTML line break.
point_texts = [
    'ups: {}<br>{}<br><br>'.format(poem_row['ups'], poem_row['awards_simple'])
    + re.sub('>', '<br>', poem_row['poem'])
    for _, poem_row in df_timmy.iterrows()
]

fig = plot_scatter(
    x=df_timmy['total_awards_received'],
    y=df_timmy['ups'],
    text=point_texts,
    title='Upvotes versus number of awards of the top poems by u/poem_for_your_sprog',
    xaxis_title='Number of awards received',
    yaxis_title='Upvotes',
)
fig.show()

In [None]:
# Take an independent copy: the following cells add 'parents' and 'title' columns,
# and assigning columns on a boolean-mask slice of `df` triggers pandas'
# SettingWithCopyWarning.
df_timmy = df[comments_about_timmy].copy()

In [None]:
def get_parents(parent_id, parents):
    """Collect the bodies of a comment's ancestors from the Pushshift comment API.

    Starting at `parent_id`, each comment is looked up, its 'body' is appended to
    `parents`, and the walk continues with that comment's own 'parent_id'. The walk
    stops at the first lookup whose response cannot be decoded or does not contain
    a comment record.

    Args:
        parent_id: Id sent as the `ids` query parameter of the first lookup.
        parents: List to extend; it is mutated in place.

    Returns:
        The same `parents` list, with bodies appended nearest ancestor first.

    Raises:
        Network-level errors raised by `requests.get` (including timeouts) propagate.
    """
    # Iterative walk: no recursion limit to hit on very deep threads.
    while True:
        # A timeout keeps a stalled connection from hanging the notebook forever.
        response = requests.get(
            "https://api.pushshift.io/reddit/comment/search/",
            params={'ids': parent_id},
            timeout=30,
        )
        try:
            record = response.json()['data'][0]
            parents.append(record['body'])
            parent_id = record['parent_id']
        except (ValueError, LookupError, TypeError):
            # Undecodable JSON, empty 'data', or a missing key ends the chain.
            # Unlike a bare `except:`, KeyboardInterrupt and genuine programming
            # errors are no longer swallowed.
            break
    return parents

In [None]:
# COMMENT PARENTS -----------------------------------------
# Parent chains are cached in a pickle file holding a dict keyed by comment id, so
# only comments missing from the cache are fetched.
# NOTE(security): pickle.load can execute arbitrary code; only load a cache file
# that this notebook wrote itself.
if os.path.isfile("data/timmy_comment_parents.p"):
    # `with` closes the file handle, which the bare open() call never did.
    with open("data/timmy_comment_parents.p", "rb") as cache_file:
        timmy_comment_parents = pickle.load(cache_file)
    df_timmy['parents'] = df_timmy['id'].map(timmy_comment_parents)
else:
    timmy_comment_parents = {}
    df_timmy['parents'] = None

# If there are any new comments about Timmy, get their parents and update the pickle file.
if df_timmy['parents'].isnull().any():
    # Get the parents of the new comments
    df_timmy_new = df_timmy[df_timmy['parents'].isnull()]
    timmy_comment_parents_new = {}
    for ix, row in df_timmy_new.iterrows():
        timmy_comment_parents_new[row['id']] = get_parents(row['parent_id'], [])

    # Update the dict, the pickle file, and the DataFrame.
    timmy_comment_parents.update(timmy_comment_parents_new)
    df_timmy['parents'] = df_timmy['id'].map(timmy_comment_parents)
    # Closing the handle guarantees the cache is flushed to disk.
    with open("data/timmy_comment_parents.p", "wb") as cache_file:
        pickle.dump(timmy_comment_parents, cache_file)

In [None]:
# POST TITLES -----------------------------------------
# Post titles are cached in a pickle file holding a dict keyed by comment id, so
# only comments missing from the cache are fetched.
# NOTE(security): pickle.load can execute arbitrary code; only load a cache file
# that this notebook wrote itself.
if os.path.isfile("data/timmy_titles.p"):
    # `with` closes the file handle, which the bare open() call never did.
    with open("data/timmy_titles.p", "rb") as cache_file:
        timmy_titles = pickle.load(cache_file)
    df_timmy['title'] = df_timmy['id'].map(timmy_titles)
else:
    timmy_titles = {}
    df_timmy['title'] = None

# If there are any new comments about Timmy, get the titles of their posts and update the pickle file.
if df_timmy['title'].isnull().any():
    # Get the post titles for the new comments
    df_timmy_new = df_timmy[df_timmy['title'].isnull()]
    timmy_titles_new = {}
    for ix, row in df_timmy_new.iterrows():
        # A timeout keeps a stalled connection from hanging the notebook forever.
        r = requests.get(
            "https://api.pushshift.io/reddit/submission/search/",
            params={'ids': row['link_id']},
            timeout=30,
        )
        timmy_titles_new[row['id']] = r.json()['data'][0]['title']
        # Pause between requests to the API.
        time.sleep(1)

    # Update the dict, the pickle file, and the DataFrame.
    timmy_titles.update(timmy_titles_new)
    df_timmy['title'] = df_timmy['id'].map(timmy_titles)
    # Closing the handle guarantees the cache is flushed to disk.
    with open("data/timmy_titles.p", "wb") as cache_file:
        pickle.dump(timmy_titles, cache_file)

In [None]:
def _wrap_text(text):
    """Clean `text`, wrap it at 100 characters and join the wrapped lines with '>'."""
    return '>'.join(textwrap.wrap(clean_comment(text), width=100))


# Titles: one wrapped string per row.
df_timmy['title'] = [_wrap_text(title) for title in df_timmy['title']]
# Parents: wrap every entry, reverse the chain's order, then glue it together with '>--->'.
df_timmy['parents'] = [
    '>--->'.join([_wrap_text(parent) for parent in chain][::-1])
    for chain in df_timmy['parents']
]

In [None]:
# Scatter of upvotes against posting date for the poems about Timmy. Each point's
# text is the bold post title, the parent comments, a separator and the poem,
# with every '>' turned into an HTML line break.
fig = plot_scatter(
    x=df_timmy['date'],
    y=df_timmy['ups'],
    text=['<b>' + re.sub('>', '<br>', row['title']) + '</b>'
          + '<br><br>'
          + re.sub('>', '<br>', row['parents'])
          + '<br>---<br>'
          + re.sub('>', '<br>', row['poem']) for index, row in df_timmy.iterrows()],
    # The title and x-axis label were copied from the awards chart although the
    # x values are dates; they now describe what is actually plotted.
    title='Upvotes over time of the poems about Timmy by u/poem_for_your_sprog',
    xaxis_title='Date',
    yaxis_title='Upvotes'
    )
fig.show()

In [None]:
# Display the Timmy frame, including the 'parents' and 'title' columns added above;
# how much of it is shown is governed by the pandas display options.
df_timmy