In [None]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules (e.g. the local `src` package used below) are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import json
import datetime as dt
import pandas as pd
import os
import time
import re
import matplotlib.pyplot as plt 
import plotly.graph_objects as go

from src.reddit_comment_reader import RedditCommentReader
from src.utils import print2_list
from src.string import clean_comment

# Widen pandas' display limits so long comment bodies can be inspected inline.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
# `None` means "never truncate cell contents". The former sentinel `-1` was
# deprecated in pandas 1.0 and is rejected by newer pandas releases.
pd.set_option('display.max_colwidth', None)

In [None]:
# Load the comments from the on-disk cache when it is less than 24 hours old;
# otherwise download them again and rewrite the cache.
download_comments = True
file_name = 'data/comments.txt'
if os.path.isfile(file_name):
    mtime = os.path.getmtime(file_name)
    print("last modified: %s" % dt.datetime.fromtimestamp(mtime))
    # Use total_seconds(): `timedelta.seconds` is only the sub-day component
    # (0..86399), so comparing it against 24h is always true and the cache
    # would never be refreshed.
    cache_age = dt.datetime.now() - dt.datetime.fromtimestamp(mtime)
    if cache_age.total_seconds() < 24*60*60: # if modified in last 24 hours
        download_comments = False

if download_comments:
    print('Downloading comments...')
    reddit_comment_reader = RedditCommentReader(user='poem_for_your_sprog')
    all_comments = reddit_comment_reader.get_comments()
    print('Saving to file...')
    # Make sure the target directory exists so a finished download is not
    # lost to a FileNotFoundError when saving.
    cache_dir = os.path.dirname(file_name)
    if cache_dir:
        os.makedirs(cache_dir, exist_ok=True)
    with open(file_name, 'w') as outfile:
        json.dump(all_comments, outfile)
else:
    print('Loading comments from file...')
    with open(file_name, 'r') as infile:
        all_comments = json.load(infile)
print('Done.')

In [None]:
# Put the loaded comments into a DataFrame for the analysis below.
# NOTE(review): presumably one row per comment -- confirm against the shape
# returned by RedditCommentReader.get_comments().
df = pd.DataFrame(data=all_comments)

In [None]:
# Convert the epoch timestamps into datetimes and derive the calendar date.
# NOTE: `fromtimestamp` without a tz argument returns naive *local* time, so
# the `date` bucket depends on the timezone of the machine running this.
df['datetime'] = df['created_utc'].map(dt.datetime.fromtimestamp)
df['date'] = df['datetime'].dt.date

In [None]:
# Aggregate: one row per calendar date with the comment count in column `n`.
df_comments_per_day = (
    df.groupby('date')['date']
      .agg(n='count')
      .reset_index()
)

# Line chart of the daily comment counts.
daily_trace = go.Scatter(
    x=df_comments_per_day['date'],
    y=df_comments_per_day['n'],
    mode='lines',
    name='lines',
)
fig = go.Figure(data=[daily_trace])
fig.update_layout(
    title='Number of comments on Reddit per day by u/poem_for_your_sprog',
    title_x=0.5,
    xaxis_title='Day',
    yaxis_title='Number of comments',
)
fig.show()

In [None]:
# Attach the per-day comment count `n` to every comment row.
# Drop any `n` left over from a previous run of this cell first; otherwise
# re-running would merge a second copy and pandas would emit `n_x` / `n_y`.
df = df.drop(columns=['n'], errors='ignore').merge(df_comments_per_day, on='date', how='left')

In [None]:
# Run every raw comment body through the project's `clean_comment` helper.
df['comment_cleaned'] = df['body'].map(clean_comment)

In [None]:
# Per-comment line metrics.
# `linebreaks` counts the '>' characters in the cleaned text
# (presumably the line marker left by clean_comment -- TODO confirm).
df['linebreaks'] = df['comment_cleaned'].apply(lambda text: len(re.findall(r'>', text)))
df['comment_length'] = df['comment_cleaned'].str.len()
# Rows without any '>' divide by zero and end up as inf/NaN instead of raising.
df['average_line_length'] = df['comment_length'].div(df['linebreaks'])

In [None]:
# Distribution of the average number of characters per line, using
# one-character-wide bins over the range 0-200.
fig = go.Figure()
fig.add_trace(go.Histogram(
    x=df['average_line_length'],
    xbins=dict(  # bins used for histogram
        start=0,
        end=200,
        size=1,
    ),
))
fig.update_layout(
    title='Histogram of the average characters per line by u/poem_for_your_sprog',
    title_x=0.5,
    # The x-axis holds characters per line; the previous label 'Day' was a
    # leftover from the per-day chart.
    xaxis_title='Average characters per line',
    yaxis_title='Number of comments',
)
fig.show()

In [None]:
# Inspect the outliers: comments averaging more than 75 characters per line.
print2_list(df.loc[df['average_line_length'] > 75, 'comment_cleaned'])

In [None]:
# Keep only comments averaging at most 75 characters per line
# (rows whose average is NaN or inf are dropped as well).
df = df.loc[df['average_line_length'].le(75)]

In [None]:
# Show the first 100 cleaned comments that remain.
print2_list(df['comment_cleaned'].head(100))