In [None]:
%%capture
# The %%capture cell magic above has to remain the first line of the cell; it
# captures this cell's output so anything printed by the downloads below is hidden.
# Reload imported modules automatically before code is executed, so edits to
# local modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

import nltk
from nltk.tokenize import RegexpTokenizer
# Fetch the NLTK stop-word corpus read by nltk.corpus.stopwords in the word analysis.
nltk.download('stopwords')
# NOTE(review): 'punkt' is not referenced by any cell visible here (the analysis
# uses RegexpTokenizer) — presumably needed elsewhere; confirm before removing.
nltk.download('punkt')

In [None]:
import string
import datetime as dt
import pandas as pd
import numpy as np
import time
import re
import matplotlib.pyplot as plt 
from plotly.offline import download_plotlyjs, init_notebook_mode
import plotly.graph_objs as go
init_notebook_mode(connected=False)

from src.utils import print2_list, print2, export_ipynb_for_github_pages
from src.plotly import plot_histogram, plot_timeline, plot_horizontal_bar, plot_heatmap
from src.data import load_data

# Widen pandas' display limits so long poems and wide frames are shown in full.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
# None means "do not truncate cell contents". The former value -1 has been
# deprecated since pandas 1.0 and is rejected by newer releases.
pd.set_option('display.max_colwidth', None)

# Load the comments and keep only the rows flagged as poems.
# NOTE(review): the meaning of the second positional argument (False) is defined
# by src.data.load_data and is not visible here — confirm.
df = load_data('data/comments.txt', False)
df = df[df['type'] == 'poem']

# Word analysis

In [None]:
# Count the most frequent non-stop-words across all poems.
tokenizer = RegexpTokenizer(r'\w+')  # tokens are runs of word characters; punctuation is dropped
stop_words = nltk.corpus.stopwords.words('english')
# Membership is tested once per token, so use a set instead of scanning the list.
stop_word_set = frozenset(stop_words)

# Join every poem into one string (missing values are skipped by str.cat) and tokenise it.
comments = df['poem'].str.cat(sep=' ')
tokens = tokenizer.tokenize(comments)
# NLTK's English stop words are lowercase, so compare case-insensitively;
# otherwise capitalised stop words such as "The" or "I" survive the filter.
# The token itself keeps its original casing for the frequency count.
tokens = [t for t in tokens if t.lower() not in stop_word_set]
frequency_dist = nltk.FreqDist(tokens)
most_common = frequency_dist.most_common(80)

In [None]:
# Horizontal bar chart of the most frequent words.
# most_common is ordered from most to least frequent; reverse it once so the
# most frequent word is passed last to the plotting helper.
words_ascending = most_common[::-1]
fig = plot_horizontal_bar(
    labels=[word for word, count in words_ascending],
    values=[count for word, count in words_ascending],
    # Spelling of the displayed labels corrected ("occurring", "Occurrence").
    title='Most occurring words in comments by u/poem_for_your_sprog',
    xaxis_title='Occurrence',
    yaxis_title='')
fig.show()

# What about Timmy?

In [None]:
# Boolean masks (numpy arrays aligned with df's rows) marking the poems that
# mention Timmy and the poems that contain the "timmy fucking died" line.
# The vectorised substring search treats missing poems as "no match" (na=False)
# instead of raising TypeError, and case=False also matches "Timmy" if the
# text has not been lowercased upstream.
comments_about_timmy = (
    df['poem'].str.contains('timmy', case=False, regex=False, na=False).to_numpy()
)
comments_about_timmy_fucking_dying = (
    df['poem'].str.contains('timmy fucking died', case=False, regex=False, na=False).to_numpy()
)

In [None]:
# Report the three counts, one per line: all Timmy poems, those containing the
# "died" line, and the difference between the two.
print(
    'Comments about Timmy: {}'.format(comments_about_timmy.sum()),
    'Comments about Timmy fucking dying: {}'.format(comments_about_timmy_fucking_dying.sum()),
    'Comments about Timmy that do not end with Timmy fucking dying: {}'.format(
        comments_about_timmy.sum() - comments_about_timmy_fucking_dying.sum()
    ),
    sep='\n',
)

In [None]:
# Donut chart splitting the Timmy poems into those that contain the "died" line
# and those that do not.
fig = go.Figure(
    data=[
        go.Pie(
            labels=['Timmy fucking dying', 'Timmy not fucking dying'],
            values=[
                comments_about_timmy_fucking_dying.sum(),
                comments_about_timmy.sum() - comments_about_timmy_fucking_dying.sum(),
            ],
            hole=.3,
        )
    ]
).update_layout(template='simple_white')
fig.show()

So... what happens to Timmy if he doesn't fucking die?

In [None]:
# Poems that mention Timmy but do not contain the "died" line, with an extra
# 'ending' column and sorted by upvotes in ascending order.
# Building the frame in one chain via .assign yields an independent DataFrame,
# so no column is assigned onto a filtered slice of df (which triggers
# SettingWithCopyWarning).
df_timmy_not_dying = (
    df[comments_about_timmy & ~comments_about_timmy_fucking_dying]
    # 'ending' is whatever follows the last '>' in the poem text
    # (presumably the final quoted line — verify against the data format).
    .assign(ending=lambda frame: frame['poem'].str.split('>').str[-1])
    .sort_values('ups')
)

In [None]:
# Bar chart of upvotes for each Timmy poem without the usual ending, labelled
# by the poem's 'ending' text.
fig = plot_horizontal_bar(
    title='Best scoring alternative endings to poems about Timmy',
    labels=df_timmy_not_dying['ending'],
    values=df_timmy_not_dying['ups'],
    xaxis_title='Upvotes',
    yaxis_title='',
)
fig.show()

In [None]:
# Show the last three rows of the frame.
df_timmy_not_dying.iloc[-3:]