In [None]:
%%capture
%load_ext autoreload
%autoreload 2

import os
os.chdir('..')

In [None]:
# NLTK plus its regular-expression based tokenizer class.
import nltk
from nltk.tokenize import RegexpTokenizer
# Request the 'stopwords' and 'punkt' NLTK resources; both calls run every time this cell is executed.
nltk.download('stopwords')
nltk.download('punkt')

In [None]:
import string
import datetime as dt
import pandas as pd
import numpy as np
import time
import re
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode
init_notebook_mode(connected=True)

from src.utils import print2_list, print2
from src.plotly import plot_histogram, plot_timeline, plot_horizontal_bar, plot_heatmap, plot_scatter, plot_events_timeline
from src.reddit_user_comment_reader import RedditUserCommentReader
from src.data_frame_parser import DataFrameParser

# Raise pandas' display limits (rows, columns, total width) and disable the
# truncation of long cell contents.
display_options = {
    'display.max_rows': 500,
    'display.max_columns': 500,
    'display.width': 1000,
    'display.max_colwidth': None,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

## Read data

In [None]:
# Read the comments of 'poem_for_your_sprog' (second argument is presumably the
# data directory -- verify against RedditUserCommentReader) and pass the result
# through the project's DataFrameParser.
comment_reader = RedditUserCommentReader('poem_for_your_sprog', 'data')
comments_raw = comment_reader.read()
df = DataFrameParser().parse(comments_raw)

# Comments per day

In [None]:
# Number of comments per calendar day, stored in a single column `n`.
df_comments_per_day = df.groupby('date')['date'].agg(n='count')

# Convert the index to a DatetimeIndex explicitly, so the reindex below aligns
# on timestamps whether `date` holds datetime.date objects or timestamps,
# instead of relying on pandas coercing the values implicitly.
df_comments_per_day.index = pd.to_datetime(df_comments_per_day.index)

# Continuous daily range from the first comment up to today (midnight), so days
# without any comment show up with a count of 0.
idx = pd.date_range(df_comments_per_day.index.min(), pd.Timestamp.today().normalize())
df_comments_per_day = df_comments_per_day.reindex(idx, fill_value=0)

In [None]:
# Arrow annotation labelled "AMA" placed at 2015-06-23 on the daily timeline.
ama_annotation = go.layout.Annotation(
    text="AMA",
    x='2015-6-23',
    y=10,
    xref="x",
    yref="y",
    showarrow=True,
    arrowhead=2,
    ax=-50,
    ay=0,
)

# Daily comment counts as a timeline, shown inline and exported as JSON.
fig = plot_timeline(
    x=df_comments_per_day.index,
    y=df_comments_per_day['n'],
    title=None,
    xaxis_title='Day',
    yaxis_title='Number of comments',
    annotations=[ama_annotation],
)
fig.show()
fig.write_json('plots/1/plot_1_comment_count_per_day.json')

In [None]:
# Drop every comment dated 2015-06-23 (the AMA day) and renumber the rows;
# the previous index is kept as a regular column.
ama_day = dt.date(2015, 6, 23)
df = df[df['date'] != ama_day].reset_index()

In [None]:
# Sum the daily counts per calendar month. A MonthEnd offset object is passed
# instead of the 'M' string alias, which newer pandas releases deprecate in
# favour of 'ME'; the offset object works in both old and new versions.
comments_per_month = (
    df_comments_per_day
    .groupby(pd.Grouper(freq=pd.offsets.MonthEnd()))['n']
    .sum()
)

In [None]:
# Monthly comment totals as a timeline, shown inline and exported as JSON.
monthly_plot_title = 'Number of comments on Reddit per month by u/poem_for_your_sprog'
fig = plot_timeline(
    x=comments_per_month.index,
    y=comments_per_month,
    title=monthly_plot_title,
    xaxis_title='Month',
    yaxis_title='Number of comments',
)
fig.show()
fig.write_json('plots/1/plot_2_comment_count_per_month.json')

In [None]:
# Report the total number of comments and the percentage posted in r/AskReddit.
n_comments = len(df)
n_askreddit = len(df[df['subreddit_name_prefixed'] == 'r/AskReddit'])
summary_template = (
    'In total, there are {} comments by /u/poem_for_your_sprog on Reddit. \n'
    'Out of those, {:.1f}% were comments within /r/AskReddit.'
)
print(summary_template.format(n_comments, n_askreddit / n_comments * 100))

In [None]:
# Poems posted outside r/AskReddit. Selecting rows and columns in one `.loc`
# call and taking a copy yields an independent frame, so adding a column below
# does not write into a slice of `df` (which raises SettingWithCopyWarning).
outside_askreddit = df['subreddit_name_prefixed'] != 'r/AskReddit'
df_poem_per_sub = df.loc[
    outside_askreddit,
    ['subreddit_name_prefixed', 'date', 'poem', 'ups'],
].copy()

# Label per poem: the upvote count followed by the poem text, separated by '>'.
df_poem_per_sub['poem_ups'] = [
    'Upvotes: {}>'.format(ups) + poem
    for ups, poem in zip(df_poem_per_sub['ups'], df_poem_per_sub['poem'])
]

# One row per (subreddit, day): the labels of that day joined by '>---->' and
# the number of poems in the group.
df_poem_per_sub = (
    df_poem_per_sub
    .groupby(['subreddit_name_prefixed', 'date'])['poem_ups']
    .agg(poem_ups=lambda labels: '>---->'.join(list(labels)), count=len)
    .reset_index()
)

In [None]:
# Total number of poems per subreddit, repeated on every row of that subreddit;
# the rows are then sorted by this total.
df_poem_per_sub['subreddit_total'] = (
    df_poem_per_sub
    .groupby('subreddit_name_prefixed')['count']
    .transform('sum')
)
df_poem_per_sub = df_poem_per_sub.sort_values('subreddit_total')

# Hover text per row: the number of poems, a divider, then the joined poem
# labels with every '>' turned into an HTML line break.
hover_text = [
    "Total poems: {}<br>----<br>".format(n_poems) + poems.replace('>', '<br>')
    for n_poems, poems in zip(df_poem_per_sub['count'], df_poem_per_sub['poem_ups'])
]

fig = plot_events_timeline(
    x=df_poem_per_sub['date'],
    y=df_poem_per_sub['subreddit_name_prefixed'],
    text=hover_text,
    # User name corrected: the title previously read '/u/poem_/for_your_sprog'.
    title='Timeline of poems outside of /r/AskReddit by /u/poem_for_your_sprog',
    xaxis_title='Date',
    yaxis_title='',
)
fig.show()
fig.write_json('plots/1/plot_3_comments_outside_of_askreddit.json')

# Average line length

In [None]:
# Histogram of the average line length with explicit bins: start 0, end 200, size 1.
line_length_bins = {'start': 0, 'end': 200, 'size': 1}
fig = plot_histogram(
    x=df['average_line_length'],
    params={'xbins': line_length_bins},
    title='Histogram of the average characters per line by u/poem_for_your_sprog',
    xaxis_title='Number of characters',
    yaxis_title='Number of poems',
)
fig.show()
fig.write_json('plots/1/plot_4_characters_per_line.json')

In [None]:
# Remove outliers. The filters run one after the other, each on the result of
# the previous one:
#   1. keep poems with an average line length below 65,
#   2. keep poems whose text is not empty,
#   3. keep poems with more than one line.
df = (
    df
    .loc[lambda frame: frame['average_line_length'] < 65]
    .loc[lambda frame: frame['poem'].apply(len) > 0]
    .loc[lambda frame: frame['number_of_lines'] > 1]
)

In [None]:
# First 100 rows of the ascending and of the descending ordering by average
# line length, stacked into one frame (short poems first).
n_extremes = 100
df_short = df.sort_values('average_line_length', ascending=True).iloc[:n_extremes]
df_long = df.sort_values('average_line_length', ascending=False).iloc[:n_extremes]
df_short_long = pd.concat([df_short, df_long])

In [None]:
# Hover text: the poem itself, with every '>' replaced by an HTML line break.
poem_hover_text = [re.sub('>', '<br>', poem) for poem in df_short_long['poem']]

# Upvotes against average line length for the selected short and long poems.
fig = plot_scatter(
    x=df_short_long['average_line_length'],
    y=df_short_long['ups'],
    text=poem_hover_text,
    title='100 poems with the shortest and 100 poems with the <br>longest line length by u/poem_for_your_sprog',
    xaxis_title='Average line length',
    yaxis_title='Upvotes',
)
fig.show()
fig.write_json('plots/1/plot_5_short_and_long_poems.json')

If you're done reading, let's move on to another histogram: the number of lines per poem.

In [None]:
# Histogram of the number of lines per poem, with a bin size of 1.
lines_histogram_args = {
    'x': df['number_of_lines'],
    'params': {'xbins': {'size': 1}},
    'title': 'Histogram of the number of lines per poem by u/poem_for_your_sprog',
    'xaxis_title': 'Number of lines',
    'yaxis_title': 'Number of comments',
}
fig = plot_histogram(**lines_histogram_args)
fig.show()
fig.write_json('plots/1/plot_6_hist_number_of_lines.json')

# Upvotes & Awards

In [None]:
# Sum of the `ups` column, printed with thousands separators.
total_upvotes = df['ups'].sum()
print(f'Total number of upvotes on poems by /u/poem_for_your_sprog: {total_upvotes:,}')

In [None]:
from collections import Counter

# Accumulate the awards over all comments. Every `awards_dict` entry is fed to
# Counter.update (presumably a mapping of award name -> count; verify against
# the parser).
award_totals = Counter()
for awards in df['awards_dict']:
    award_totals.update(awards)

# One row per award: name in column `index`, total in `count`, most frequent first.
df_awards = (
    pd.DataFrame.from_dict(dict(award_totals), orient='index', columns=['count'])
    .reset_index()
    .sort_values('count', ascending=False)
)

# Keep the names of the first ten rows and relabel everything else as 'other'.
# A single .loc assignment is used instead of the chained
# `df_awards['index'][mask] = ...`, which raises SettingWithCopyWarning and
# leaves the frame unchanged when pandas runs in copy-on-write mode.
top_award_names = df_awards['index'].head(10)
df_awards.loc[~df_awards['index'].isin(top_award_names), 'index'] = 'other'

# Merge the 'other' rows and order ascending by count.
df_awards = (
    df_awards
    .groupby('index')['count']
    .sum()
    .reset_index()
    .sort_values('count', ascending=True)
)

In [None]:
# Horizontal bar chart of the award totals. The label and value lists are read
# straight from the named columns; the previous `x[0]` / `x[1]` lookups indexed
# a label-indexed row Series by position, which pandas has deprecated, and
# iterated over the frame twice.
fig = plot_horizontal_bar(
    labels=df_awards['index'].tolist(),
    values=df_awards['count'].tolist(),
    title='Awards received on poems by /u/poem_for_your_sprog.',
    xaxis_title='count',
    yaxis_title="",
)

fig.show()
fig.write_json('plots/1/plot_7_awards.json')

In [None]:
# Histogram of the upvotes per poem, with a bin size of 250.
upvote_bins = {'size': 250}
fig = plot_histogram(
    x=df['ups'],
    params={'xbins': upvote_bins},
    title='Histogram of the upvotes on poems by u/poem_for_your_sprog',
    xaxis_title='Upvotes',
    yaxis_title='Number of comments',
)
fig.show()
fig.write_json('plots/1/plot_8_hist_upvotes.json')

In [None]:
# Histogram of the number of awards per comment, with a bin size of 1.
awards_histogram_args = {
    'x': df['total_awards_received'],
    'params': {'xbins': {'size': 1}},
    'title': 'Histogram of the number of awards per comment by u/poem_for_your_sprog',
    'xaxis_title': 'Number of awards',
    'yaxis_title': 'Number of comments',
}
fig = plot_histogram(**awards_histogram_args)
fig.show()
fig.write_json('plots/1/plot_9_hist_awards.json')

In [None]:
# Index labels of the first 100 rows when ordering by awards and by upvotes
# (both descending); the union of both label sets selects the rows of df_top.
n_top = 100
top_awards = df['total_awards_received'].sort_values(ascending=False).head(n_top).index
top_upvotes = df['ups'].sort_values(ascending=False).head(n_top).index
top_poems = top_awards.union(top_upvotes)
df_top = df.loc[top_poems]

In [None]:
# Hover text: upvote and award counts, a blank line, then the poem with every
# '>' replaced by an HTML line break.
top_hover_text = [
    'upvotes: {}<br>awards: {}<br><br>'.format(ups, n_awards) + re.sub('>', '<br>', poem)
    for ups, n_awards, poem in zip(
        df_top['ups'], df_top['total_awards_received'], df_top['poem']
    )
]

# Upvotes against number of awards for the selected top poems.
fig = plot_scatter(
    x=df_top['total_awards_received'],
    y=df_top['ups'],
    text=top_hover_text,
    title='Upvotes versus number of awards of the top poems by u/poem_for_your_sprog',
    xaxis_title='Number of awards received',
    yaxis_title='Upvotes',
)
fig.show()
fig.write_json('plots/1/plot_10_most_upvotes_and_awards.json')

In [None]:
# Pairwise correlations between score, awards and the two poem-shape measures.
corr_columns = ['score', 'total_awards_received', 'average_line_length', 'number_of_lines']
df_corr = df[corr_columns].corr()

# Blank the diagonal (every variable against itself) by position. Matching on
# `== 1` could miss diagonal entries that differ from 1.0 by rounding and would
# also blank a genuine perfect correlation between two different variables.
df_corr = df_corr.mask(np.eye(len(df_corr), dtype=bool))

fig = plot_heatmap(
    z=df_corr.values,
    x=df_corr.columns,
    y=df_corr.columns,
    title='Correlation plot',
    figsize=(600, 500),
)
fig.show()
fig.write_json('plots/1/plot_11_correlation.json')