In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import json
import string
import datetime as dt
import pandas as pd
import os
import time
import re
import matplotlib.pyplot as plt 
import plotly.graph_objects as go
import numpy as np

from src.reddit_user_comment_reader import RedditUserCommentReader
from src.utils import print2_list, print2
from src.string import clean_comment

# Widen pandas' display limits so wide frames and long comment bodies are shown in full.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
# None means "never truncate cell contents". The former sentinel -1 was deprecated
# in pandas 1.0 and is rejected with a ValueError by current pandas releases.
pd.set_option('display.max_colwidth', None)

In [None]:
import nltk
from nltk.tokenize import RegexpTokenizer
# Fetch the NLTK resources by name: the English stop-word list is read in the
# word-analysis cell; 'punkt' is downloaded as well (no visible cell uses it directly).
nltk.download('stopwords')
nltk.download('punkt')

# Load the data

In [None]:
# Reuse the cached download when it is recent enough; otherwise fetch the comment
# history again and refresh the cache file.
download_comments = True
file_name = 'data/comments.txt'
max_cache_age = dt.timedelta(hours=24)
if os.path.isfile(file_name):
    mtime = os.path.getmtime(file_name)
    last_modified = dt.datetime.fromtimestamp(mtime)
    print("last modified: %s" % last_modified)
    # Compare the whole timedelta. `.seconds` is only the sub-day remainder
    # (0..86399), so the previous `.seconds < 24*60*60` test was always true and
    # a stale cache was never refreshed.
    if dt.datetime.now() - last_modified < max_cache_age:
        download_comments = False

if download_comments:
    print('Downloading comments...')
    reddit_user_comment_reader = RedditUserCommentReader('poem_for_your_sprog', verbose = True)
    all_comments = reddit_user_comment_reader.get_comments()
    print('Saving to file...')
    # Create the cache directory on a first run so the write below cannot fail on it.
    cache_dir = os.path.dirname(file_name)
    if cache_dir:
        os.makedirs(cache_dir, exist_ok=True)
    with open(file_name, 'w') as outfile:
        json.dump(all_comments, outfile)
else:
    print('Loading comments from file...')
    with open(file_name, 'r') as infile:
        all_comments = json.load(infile)
print('Done.')

In [None]:
# Tabulate the loaded comments (presumably one dict per comment, giving one row
# each — confirm against RedditUserCommentReader.get_comments).
df = pd.DataFrame(all_comments)

In [None]:
# Show how many comments carry each author value; later cells filter on
# 'Poem_for_your_sprog' and '[deleted]'.
df['author'].value_counts()

In [None]:
# Derive a timestamp and a calendar date from the `created_utc` timestamps.
# NOTE(review): dt.datetime.fromtimestamp returns naive *local* time, so the derived
# dates depend on the timezone of the machine that runs the notebook.
df['datetime'] = df['created_utc'].apply(dt.datetime.fromtimestamp)
df['date'] = df['datetime'].dt.date

In [None]:
def format_awards(x):
    """Summarise a comment's awards as a single ``'name: count, ...'`` string.

    Parameters
    ----------
    x : iterable of dict, None or NaN
        Award records, each providing a ``'name'`` and a ``'count'`` entry.
        A missing value (``None`` or a float NaN, as pandas uses for absent
        fields) is treated as "no awards".

    Returns
    -------
    str
        The awards joined with ``', '`` in input order; ``''`` when there are none.
    """
    # Rows without award data would otherwise fail when iterated (NaN != NaN).
    if x is None or (isinstance(x, float) and x != x):
        return ''
    # format() also tolerates non-string names, unlike `name + ': '`.
    return ', '.join('{}: {}'.format(award['name'], award['count']) for award in x)
# Human-readable award summary per comment (shown in the scatter plot's hover text).
df['awards_simple'] = df['all_awardings'].apply(format_awards)

# Comments per day

In [None]:
# Number of comments for every (date, author) pair, stored in column `n`.
df_comments_per_day = (
    df.groupby(['date', 'author'])
      .size()
      .reset_index(name='n')
)

In [None]:
# Daily comment counts for the poet's own account, with the AMA day called out.
sprog_daily = df_comments_per_day[df_comments_per_day['author'] == 'Poem_for_your_sprog']

ama_annotation = go.layout.Annotation(
    x='2015-6-23',
    y=12,
    xref="x",
    yref="y",
    text="AMA",
    showarrow=True,
    arrowhead=2,
    ax=-50,
    ay=0,
)

fig = go.Figure(
    data=[
        go.Scatter(
            x=sprog_daily['date'],
            y=sprog_daily['n'],
            mode='lines',
            name='lines',
        )
    ]
)
fig.update_layout(
    title='Number of comments on Reddit per day by u/poem_for_your_sprog',
    title_x=0.5,
    template='simple_white',
    xaxis_title='Day',
    yaxis_title='Number of comments',
    annotations=[ama_annotation],
)
fig.show()

In [None]:
# Daily counts of comments whose author shows up as '[deleted]', drawn in red.
deleted_daily = df_comments_per_day[df_comments_per_day['author'] == '[deleted]']

fig = go.Figure(
    data=[
        go.Scatter(
            x=deleted_daily['date'],
            y=deleted_daily['n'],
            mode='lines',
            name='lines',
            line={'color': 'red'},
        )
    ]
)
fig.update_layout(
    title='Number of deleted comments on Reddit per day by u/poem_for_your_sprog',
    title_x=0.5,
    template='simple_white',
    xaxis_title='Day',
    yaxis_title='Number of comments',
)
fig.show()

# Remove the AMA-day and deleted comments, then clean the remaining comments

In [None]:
# Drop everything posted on the AMA day and every '[deleted]' comment, then
# renumber the rows; reset_index() keeps the previous row labels as a column.
ama_day = dt.date(2015, 6, 23)
keep_row = (df['date'] != ama_day) & (df['author'] != '[deleted]')
df = df[keep_row].reset_index()
df['comment_cleaned'] = df['body'].apply(clean_comment)

# Average line length

In [None]:
# Every '>' in the cleaned text is counted as one line break (apparently the
# separator clean_comment leaves between lines — confirm in src.string).
cleaned_text = df['comment_cleaned']
df['linebreaks'] = cleaned_text.str.count('>')
df['comment_length'] = cleaned_text.str.len()
# n line breaks delimit n + 1 lines.
df['average_line_length'] = df['comment_length'].div(df['linebreaks'] + 1)

In [None]:
# Distribution of the average number of characters per line, in 1-character
# bins covering 0..200.
fig = go.Figure()
fig.add_trace(
    go.Histogram(
        x=df['average_line_length'],
        xbins=dict(
            start=0,
            end=200,
            size=1
        )
    )
)
fig.update_layout(
    title='Histogram of the average characters per line by u/poem_for_your_sprog',
    title_x=0.5,
    template='simple_white',
    # The x-axis carries the line-length bins; the previous label 'Day' was a
    # leftover from the per-day plots.
    xaxis_title='Average characters per line',
    yaxis_title='Number of comments'
)
fig.show()

# Score & Awards

In [None]:
# Distribution of comment scores in bins of 250 points.
fig = go.Figure()
fig.add_trace(
    go.Histogram(
        x=df['score'],
        xbins=dict(
            size=250
        )
    )
)
fig.update_layout(
    # Title previously read "... of poems by by u/..." (duplicated word).
    title='Histogram of the scores of poems by u/poem_for_your_sprog',
    title_x=0.5,
    template='simple_white',
    xaxis_title='Score',
    yaxis_title='Number of comments'
)
fig.show()

In [None]:
# Distribution of the number of awards per comment, one bin per award count.
awards_histogram = go.Histogram(
    x=df['total_awards_received'],
    xbins={'size': 1},
)
fig = go.Figure(data=[awards_histogram])
fig.update_layout(
    title='Histogram of the number of awards per comment by u/poem_for_your_sprog',
    title_x=0.5,
    template='simple_white',
    xaxis_title='Number of awards',
    yaxis_title='Number of comments',
)
fig.show()

In [None]:
# Score against number of awards, one marker per comment. Hovering a marker shows
# its score, its award summary and the poem itself, with '>' turned into HTML
# line breaks.
# Built from the columns directly: iterrows() materialises a Series per row and
# is needlessly slow for a plain string join.
hover_text = [
    'score: {}<br>{}<br><br>'.format(score, awards) + comment.replace('>', '<br>')
    for score, awards, comment in zip(df['score'], df['awards_simple'], df['comment_cleaned'])
]

fig = go.Figure(
        data=go.Scatter(
            x=df['total_awards_received'],
            y=df['score'],
            mode='markers',
            marker=dict(
                size = 8,
                line_width=1,
                opacity=0.7
            ),
            hoverinfo = 'text',
            text=hover_text
        )
)

fig.update_layout(
    title='Score versus number of awards of the poems by u/poem_for_your_sprog',
    title_x=0.5,
    template='simple_white',
    xaxis_title='Number of awards received',
    yaxis_title='Score'
)
# Render explicitly like every other plot cell instead of relying on the notebook
# echoing update_layout()'s return value.
fig.show()

# Word analysis

In [None]:
# Tokenise all cleaned comments on runs of word characters, drop English stop
# words, and keep the 80 most frequent remaining tokens as (token, count) pairs.
tokenizer = RegexpTokenizer(r'\w+')
stop_words = nltk.corpus.stopwords.words('english')
# Set membership is O(1); testing every token against the list was a linear
# scan per token.
stop_word_set = set(stop_words)
comments = df['comment_cleaned'].str.cat(sep=' ')
tokens = tokenizer.tokenize(comments)
tokens = [token for token in tokens if token not in stop_word_set]
frequency_dist = nltk.FreqDist(tokens)
most_common = frequency_dist.most_common(80)

In [None]:
# Horizontal bar chart of the most frequent words. The list is reversed so the
# most frequent word ends up at the top of the y-axis.
words_ascending = [word for word, _ in most_common[::-1]]
counts_ascending = [count for _, count in most_common[::-1]]

fig = go.Figure()
fig.add_trace(
    go.Bar(
        y=words_ascending,
        x=counts_ascending,
        # Was 'SF Zoo', a leftover from the plotly bar-chart example.
        name='Occurrences',
        orientation='h'
    )
)
fig.update_layout(
        width=800,
        height=900,
        # Spelling fixed: "occuring" -> "occurring", "Occurence" -> "Occurrence".
        title='Most occurring words in comments by u/poem_for_your_sprog',
        title_x=0.5,
        template='simple_white',
        xaxis_title='Occurrence',
        yaxis_title='',
        yaxis=dict(
            tickfont=dict(size=10),
            # Force a tick label for every word instead of plotly's automatic thinning.
            tickvals=words_ascending
    )
)
fig.show()

# What about Timmy?

In [None]:
# Boolean masks over the cleaned comments: which mention 'timmy' at all, and
# which contain the phrase 'timmy fucking died'.
cleaned_comments = df['comment_cleaned']
comments_about_timmy = np.array(['timmy' in text for text in cleaned_comments])
comments_about_timmy_fucking_dying = np.array(
    ['timmy fucking died' in text for text in cleaned_comments]
)

n_timmy = comments_about_timmy.sum()
n_timmy_died = comments_about_timmy_fucking_dying.sum()

print('Comments about Timmy: {}'.format(n_timmy))
print('Comments about Timmy fucking dying: {}'.format(n_timmy_died))

# Every 'timmy fucking died' comment also contains 'timmy', so the difference
# is the number of Timmy comments without that phrase.
print('Comments about Timmy that do not end with Timmy fucking dying: {}'
      .format(n_timmy - n_timmy_died))

In [None]:
# Donut chart: share of Timmy comments with versus without the 'timmy fucking died' phrase.
died_count = comments_about_timmy_fucking_dying.sum()
survived_count = comments_about_timmy.sum() - died_count
timmy_pie = go.Pie(
    labels=['Timmy fucking dying', 'Timmy not fucking dying'],
    values=[died_count, survived_count],
    hole=.3,
)
fig = go.Figure(data=[timmy_pie])
fig.update_layout(template='simple_white')
fig.show()

So... what happens to Timmy if he doesn't fucking die?

In [None]:
# Poems that mention Timmy but lack the 'timmy fucking died' phrase; show only
# the text after the last '>' of each one, i.e. its closing line.
timmy_survives = comments_about_timmy & ~comments_about_timmy_fucking_dying
poems_about_timmy_not_fucking_dying = df.loc[timmy_survives, 'comment_cleaned']
[poem.rsplit('>', 1)[-1] for poem in poems_about_timmy_not_fucking_dying]

In [None]:
# Preview the first five cleaned comments.
df['comment_cleaned'].head(5)