In [55]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [56]:
import json
import string
import datetime as dt
import pandas as pd
import os
import time
import re
import matplotlib.pyplot as plt 
import plotly.graph_objects as go
import numpy as np

from src.reddit_user_comment_reader import RedditUserCommentReader
from src.utils import print2_list, print2
from src.string import clean_comment

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.set_option('display.max_colwidth', -1)

In [57]:
import nltk
from nltk.tokenize import RegexpTokenizer
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /home/ubuntu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/ubuntu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# Load the data

In [58]:
download_comments = True
file_name = 'data/comments.txt'
if os.path.isfile(file_name):
    mtime = os.path.getmtime(file_name)
    print("last modified: %s" % dt.datetime.fromtimestamp(mtime))
    if (dt.datetime.now() - dt.datetime.fromtimestamp(mtime)).seconds<24*60*60: # if modified in last 24 hours
        download_comments = False

if download_comments:
    print('Downloading comments...')
    reddit_user_comment_reader = RedditUserCommentReader('poem_for_your_sprog', verbose = True)
    all_comments = reddit_user_comment_reader.get_comments()
    print('Saving to file...')
    with open(file_name, 'w') as outfile:
        json.dump(all_comments, outfile)
else:   
    print('Loading comments from file...')
    with open(file_name, 'r') as infile:
        all_comments = json.load(infile)
print('Done.')

last modified: 2019-11-25 13:36:09.460411
Loading comments from file...
Done.


In [59]:
df = pd.DataFrame(all_comments)

In [60]:
df['author'].value_counts()

Poem_for_your_sprog    2763
[deleted]              1159
Name: author, dtype: int64

In [61]:
df['datetime'] = df['created_utc'].apply(dt.datetime.fromtimestamp)
df['date'] = df['datetime'].dt.date

In [62]:
def format_awards(x):
    return ', '.join([y['name'] + ': ' + str(y['count']) for y in x])
df['awards_simple'] = df['all_awardings'].apply(lambda x: format_awards(x)) 

# Comments per day

In [63]:
df_comments_per_day= df.groupby(['date','author'])['date'].agg(n='count').reset_index()

In [64]:
fig = go.Figure()
fig.add_trace(go.Scatter(x=df_comments_per_day['date'][df_comments_per_day['author']=='Poem_for_your_sprog'],
                         y=df_comments_per_day['n'][df_comments_per_day['author']=='Poem_for_your_sprog'],
                    mode='lines',
                    name='lines'))
fig.update_layout(title='Number of comments on Reddit per day by u/poem_for_your_sprog',
                  title_x=0.5,
                   xaxis_title='Day',
                   yaxis_title='Number of comments',
                   annotations=[
        go.layout.Annotation(
            x='2015-6-23',
            y=12,
            xref="x",
            yref="y",
            text="AMA",
            showarrow=True,
            arrowhead=2,
            ax=-50,
            ay=0
        )
    ]
                 )
fig.show()

In [65]:
fig = go.Figure()
fig.add_trace(go.Scatter(x=df_comments_per_day['date'][df_comments_per_day['author']=='[deleted]'],
                         y=df_comments_per_day['n'][df_comments_per_day['author']=='[deleted]'],
                    mode='lines',
                    name='lines',
                    line = {'color':'red'}))
fig.update_layout(title='Number of deleted comments on Reddit per day by u/poem_for_your_sprog',
                  title_x=0.5,
                   xaxis_title='Day',
                   yaxis_title='Number of comments')
fig.show()

# Remove AMA, deleted and clean the comments

In [66]:
df = df[df['date']!=dt.date(2015,6,23)]
df = df[df['author']!='[deleted]']
df.reset_index(inplace=True)
df['comment_cleaned'] = df['body'].apply(clean_comment)

# Average line length

In [67]:
df['linebreaks'] = df['comment_cleaned'].apply(lambda x: sum(1 for _ in re.finditer(r'>', x)))
df['comment_length']= df['comment_cleaned'].str.len()
df['average_line_length'] = df['comment_length']/df['linebreaks']

In [68]:
fig = go.Figure()
fig.add_trace(go.Histogram(x=df['average_line_length'],    
       xbins=dict( # bins used for histogram
        start=0,
        end=200,
        size=1,
    )))
fig.update_layout(title='Histogram of the average characters per line by u/poem_for_your_sprog',
                  title_x=0.5,
                   xaxis_title='Day',
                   yaxis_title='Number of comments')
fig.show()

# Score & Awards

In [69]:
fig = go.Figure()
fig.add_trace(go.Histogram(x=df['score'],    
       xbins=dict( # bins used for histogram
        size=250,
    )))
fig.update_layout(title='Histogram of the scores of poems by by u/poem_for_your_sprog',
                  title_x=0.5,
                   xaxis_title='Score',
                   yaxis_title='Number of comments')
fig.show()

In [70]:
fig = go.Figure()
fig.add_trace(go.Histogram(x=df['total_awards_received'],    
       xbins=dict( # bins used for histogram
        size=1,
    )))
fig.update_layout(title='Histogram of the number of awards per comment by u/poem_for_your_sprog',
                  title_x=0.5,
                   xaxis_title='Number of awards',
                   yaxis_title='Number of comments')
fig.show()

In [78]:
fig = go.Figure(data=go.Scatter(x=df['total_awards_received'],
                                y=df['score'],
                                mode='markers',
                                hoverinfo = 'text',
                                marker_color=df['total_awards_received'],
                                                                text=['score: {}<br>{}<br><br>'
                                      .format(row['score'],row['awards_simple']) +
                                    re.sub('>','<br>',row['comment_cleaned']) for index, row in df.iterrows()]))

fig.update_layout(title='Score versus number of awards of the poems by u/poem_for_your_sprog',
                  title_x=0.5,
                   xaxis_title='Number of awards received',
                   yaxis_title='Score')

# Word analysis

In [72]:
tokenizer = RegexpTokenizer(r'\w+')
stop_words = nltk.corpus.stopwords.words('english')
comments = df['comment_cleaned'].str.cat(sep=' ')
tokens = tokenizer.tokenize(comments)
tokens = [t for t in tokens if not t in stop_words]
frequency_dist = nltk.FreqDist(tokens)
most_common = frequency_dist.most_common(80)

In [73]:
fig = go.Figure()
fig.add_trace(go.Bar(
    y=[x[0] for x in most_common[::-1]],
    x=[x[1] for x in most_common[::-1]],
    name='SF Zoo',
    orientation='h'
    )
)
fig.update_layout(
        width=800, 
        height=900,
        title='Most occuring words in comments by u/poem_for_your_sprog',
        title_x=0.5,
        xaxis_title='Occurence',
        yaxis_title='',
        yaxis=dict(
            tickfont=dict( size=10),
            tickvals=[x[0] for x in most_common[::-1]]
    )
)
fig.show()

# What about Timmy?

In [74]:
comments_about_timmy = np.array(['timmy' in comment for comment in df['comment_cleaned']])
print('Comments about Timmy: {}'.format(comments_about_timmy.sum()))
comments_about_timmy_fucking_dying = np.array(['timmy fucking died' in comment for comment in df['comment_cleaned']])
print('Comments about Timmy fucking dying: {}'.format(comments_about_timmy_fucking_dying.sum()))

print('Comments about Timmy that do not end with Timmy fucking dying: {}'
      .format(comments_about_timmy.sum()-comments_about_timmy_fucking_dying.sum()))

Comments about Timmy: 178
Comments about Timmy fucking dying: 155
Comments about Timmy that do not end with Timmy fucking dying: 23


In [75]:
fig = go.Figure(data=[go.Pie(labels=['Timmy fucking dying','Timmy not fucking dying'], 
                             values=[comments_about_timmy_fucking_dying.sum(),
                                     comments_about_timmy.sum()-comments_about_timmy_fucking_dying.sum()], hole=.3)])
fig.show()

So.. What happens to Timmy if he doesn't fucking die?

In [76]:
poems_about_timmy_not_fucking_dying = df['comment_cleaned'][(comments_about_timmy) & (~comments_about_timmy_fucking_dying)]
[x.split('>')[-1] for x in poems_about_timmy_not_fucking_dying]

['and everybody died.',
 'and timmys fucking died.',
 'the next day the photos were posted again.',
 "... is timmy's a cunt .'",
 'and tammy fucking died.',
 "and timmy didn't die.",
 "... but this i can't believe.",
 'sprog will be.',
 "timmy's dad's a tool.",
 "'... this won't end well for me.'",
 "he didn't fucking die.",
 'but lose the will to',
 'and grandpappy timmy, poor grandpa, was doomed.',
 'and timmy fucking failed.',
 "one for dad, and one for mother!'",
 'mostly aimless.',
 "said little timmy's dad.",
 'and kept him in the sink.',
 "'for fuck's sake, timmy no. '",
 "and timmy's fucking fine.",
 'he broke a rule or two.',
 "he'd climb out of her... front.'",
 'so he tore his face right off.']

In [77]:
# from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(max_features = 100, stop_words = stop_words)
X = vectorizer.fit_transform(df['comment_cleaned'])

count_vect_df = pd.DataFrame(X.todense(), columns=vectorizer.get_feature_names())
df_terms = pd.concat([df[['score','total_awards_received']], count_vect_df], axis=1)

import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
X = df_terms.drop(['score','total_awards_received'], axis = 1, inplace = False)
y = df_terms['total_awards_received']

#Backward Elimination
cols = list(X.columns)
pmax = 1
prev_aic = np.inf
while (len(cols)>0):
    p = []
    X_1 = X[cols]
    X_1 = sm.add_constant(X_1)
    model = sm.OLS(y,X_1).fit_regularized(refit=True)
    new_aic = model.aic
    print(model.aic)
    p = pd.Series(model.pvalues[1:],index = cols)      
    pmax = max(p)
    feature_with_p_max = p.idxmax()
    if(pmax>0.05 and new_aic < prev_aic):
        cols.remove(feature_with_p_max)
        prev_aic = new_aic
    else:
        break
selected_features_BE = cols
print(selected_features_BE)

X = X[selected_features_BE]
X = sm.add_constant(X)

model = sm.OLS(y,X_1).fit_regularized(refit=True)
model.summary()


Method .ptp is deprecated and will be removed in a future version. Use numpy.ptp instead.



12283.433806340308
12281.433883745332
12279.434439942997
12277.435678093585
12275.438489218512
12273.445008492918
12271.453841441202


KeyboardInterrupt: 

In [None]:
df_terms.groupby('timmy')['score'].agg([np.mean, len])

In [None]:
df_terms.groupby('timmy')['total_awards_received'].agg([np.mean, len])

In [None]:
df['total_awards_received'].summarize()