In [11]:
import pandas as pd
import numpy as np
import re
import statsmodels as sm
import plotly.plotly as py
import plotly.graph_objs as go
import plotly.figure_factory as ff
from plotly.offline import iplot, init_notebook_mode
init_notebook_mode(connected=True)

In [4]:
lobsters = pd.read_csv('lobsteRs.csv')
lobsters.head(10)

Unnamed: 0,age,num_comments,op_upvotes,postID,post_title,poster,reflink,tags
0,2019-06-01 20:31:15 -0500,0,14,1,Implementation matters: PS2 weirdness and Path...,calvin,govanify.com,"graphics,hardware"
1,2019-06-01 09:38:24 -0500,5,19,2,How (and why) to build a programming language,adamgordonbell,corecursive.com,"audio,education,plt"
2,2019-06-01 11:30:12 -0500,32,13,3,"The dangerous folly of ""Software as a Service""",Hail_Spacecake,esr.ibiblio.org,practices
3,2019-06-02 09:55:30 -0500,0,2,4,Tensorflow Object Detection for Real World Pro...,haxorjim,christopherstoll.org,ai
4,2019-06-01 16:12:47 -0500,11,9,5,Lack of leadership in open source results in s...,federico3,techcrunch.com,"law,programming"
5,2019-05-31 12:22:26 -0500,26,78,6,At least one Vim trick you might not know,hwayne,hillelwayne.com,vim
6,2019-06-01 09:31:03 -0500,3,8,7,Why quality is important,eduardsi,sizovs.net,programming
7,2019-06-02 00:29:52 -0500,0,3,8,Spresense dev board,fhk,developer.sony.com,hardware
8,2019-06-02 00:18:33 -0500,3,1,9,An Overview of Python’s Datatable package,stephane_rolland,towardsdatascience.com,python
9,2019-05-31 19:54:30 -0500,12,26,10,WebAssembly on the Server: How System Calls Work,cadey,christine.website,wasm


In [5]:
comments = pd.read_csv('comments.csv')
comments.head(10)


Columns (1) have mixed types. Specify dtype option on import or set low_memory=False.



Unnamed: 0,com_age,com_upvotes,comment,commentID,commenter,postID
0,2019-05-31 14:25:34 -0500,6,This is a very thin wrapper around the Lobste....,1,gkbrk,25
1,2019-06-01 09:43:37 -0500,3,You could also use JSON schema instead of a Py...,1,hblanks,24
2,2019-06-01 18:21:56 -0500,1,"Good point for JSON-schema (,). Yeah, not easy...",2,enz,24
3,2019-06-03 08:21:52 -0500,1,Are there any code samples for the project we ...,1,zaphar,100
4,2019-06-05 01:38:33 -0500,2,"Uh, no. We’re not currently interested in deve...",2,gallabytes,100
5,2019-06-05 08:09:31 -0500,1,"Fair enough ,thanks for sharing anyway.😄",3,zaphar,100
6,2019-06-05 08:59:38 -0500,5,"The talk is a bit old, and given I was involve...",1,FRIGN,175
7,2019-06-05 12:19:11 -0500,1,Thank you for both links! I have Gustafson’s b...,2,minimax,175
8,2019-06-06 00:08:25 -0500,1,I’m not a heavy user of floating point arithme...,3,Screwtape,175
9,2019-06-06 01:26:57 -0500,3,The IEEE 754 floating-point numbers have a fix...,4,FRIGN,175


In [8]:
def parseDate(series):
    temp = series.apply(lambda x: toDateString(x))
    dates = {date:toDate(date) for date in temp.unique()}
    return temp.map(dates)

def toDate(x):
    try:
        return pd.to_datetime(x, format = "%Y-%m-%d")
    except:
        return np.nan

def toDateString(x):
    try:
        return re.match("^[-0-9]+",x).group(0)
    except:
        print(x)
        return np.nan
    
lobsters["timestamp"] = parseDate(lobsters.age)

In [9]:
weekly_posts = lobsters.resample('W-Mon', on='timestamp').agg('count')
weekly_posts = weekly_posts.loc[:, ['postID']][:-1].reset_index(level='timestamp')
weekly = lobsters.resample('W-Mon', on='timestamp').sum().reset_index().sort_values(by='timestamp')
weekly = weekly[:-1]

In [10]:
trace_posts = go.Scatter(
    x = weekly['timestamp'],
    y = weekly_posts['postID'],
    name = 'Weekly Posts',
    line = dict(
            color = ('rgb(60, 179, 113)'),
            width = 2,
            dash = 'line'))

trace_comments = go.Scatter(
    x = weekly['timestamp'],
    y = weekly['num_comments'],
    name = 'Weekly Comments',
    line = dict(
            color = ('rgb(22, 96, 167)'),
            width = 2,
            dash = 'line'))

trace_upvotes = go.Scatter(
    x = weekly['timestamp'],
    y = weekly['op_upvotes'],
    name = 'Weekly Upvotes',
    line = dict(
            color = ('rgb(205, 12, 24)'),
            width = 2,
            dash = 'line'))

data = [trace_posts, trace_comments, trace_upvotes]

layout = dict(title = 'Weekly Capital Generated',
              xaxis = dict(title = 'Week'),
              yaxis = dict(title = 'Count')
             )

weeklies = dict(data = data, layout = layout)
iplot(weeklies)

In [27]:
lobsters[lobsters['num_comments'] == 0].describe()['op_upvotes']
lobsters[lobsters['num_comments'] != 0].describe()['op_upvotes']

x1 = lobsters[lobsters['num_comments'] == 0]['op_upvotes']
x2 = lobsters[lobsters['num_comments'] != 0]['op_upvotes']

group_labels = ['Upvotes on Commented Posts', 'Upvotes on Uncommented Posts']

data = [x1, x2]

commented = ff.create_distplot(data, group_labels)
commented['layout'].update(title='Do Uncommented Posts Matter?')

iplot(commented)

In [18]:
print(len(lobsters['poster'].unique().tolist()))
post_freq = lobsters.groupby('poster').count().groupby('age').count()['post_title']
post_freq = post_freq.reset_index(level='age')
post_freq.columns = ['num_posts', 'freq']

post_counts = go.Box(
    x = post_freq.num_posts,
    name = 'Posts',
    jitter = 0.3)

print(len(comments['commenter'].unique().tolist()))
com_freq = comments.groupby('commenter').count().groupby('com_age').count()['commentID']
com_freq = com_freq.reset_index(level='com_age')
com_freq.columns = ['num_comments', 'freq']

com_counts = go.Box(
    x = com_freq.num_comments,
    name = 'Comments',
    jitter = 0.3)

data = [post_counts, com_counts]

layout = dict(title = 'Lifetime Poster and Commenter Counts',
             showlegend = False)

fig = dict(data=data, layout=layout)
iplot(fig)

3475
4491


In [19]:
comments.loc[comments['com_age'].str.startswith('G'), 'com_age'] = comments['com_age'].str.replace('^[A-z]+\s[a-z]*\s', '')
comments["timestamp"] = parseDate(comments.com_age)

top_low = comments.groupby('postID').nth((0,-1))
top_low.head()

Unnamed: 0_level_0,com_age,com_upvotes,comment,commentID,commenter,timestamp
postID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2,2019-06-20 04:31:11 -0500,1,Considering you actually spoke in that podcast...,5,Loup-Vaillant,2019-06-20
2,2019-06-02 07:55:22 -0500,2,Just finished listening - very enjoyable. Woul...,1,tigerfinch,2019-06-02
3,2019-06-02 20:26:53 -0500,1,So manipulating the technical arguments for a ...,32,xcombelle,2019-06-02
3,2019-06-01 15:41:00 -0500,14,I like how the bottom of the post has a link t...,1,stevelord,2019-06-01
5,2019-06-02 02:20:49 -0500,2,"\n,\n,I don’t know that I have a good idea of ...",11,srbaker,2019-06-02


In [23]:
topical_latency = pd.DataFrame(abs(top_low.timestamp.diff())[1::2])
topical_latency.columns = ['latency']
discuss_freq = pd.DataFrame(topical_latency.latency.value_counts()).reset_index()
discuss_freq.columns = ['categories', 'freq']

data_x = np.append(discuss_freq.freq[0:30].values, (discuss_freq.freq[31:].sum()))
data_x = pd.DataFrame(data_x)
data_x = data_x.reset_index()
data_x.columns = ['Days', 'Freq']

data = [go.Bar(
            x=data_x.Days,
            y=data_x.Freq)]

layout = go.Layout(
    title = "# Days Between Start and End of Discussion, Past 30+"
)

fig = go.Figure(data=data, layout=layout)
iplot(fig)

In [None]:
train_x = weekly_posts[:int(len(x)*.8)]
test_x =weekly_posts[int(len(x)*.8):]
    
model =  pm.auto_arima(train_x.postID, start_p=1, start_q=1,
                      test='adf',       # use adftest to find optimal 'd'
                      max_p=3, max_q=3, # maximum p and q
                      m=1,              # frequency of series
                      d=None,           # let model determine 'd'
                      seasonal=False,   # No Seasonality
                      start_P=0, 
                      D=0, 
                      trace=True,
                      error_action='ignore',  
                      suppress_warnings=True, 
                      stepwise=True)

print(model.summary())
    
fitted = model.fit(train_x.postID, train_x.timestamp, disp=-1)  
print(fitted.summary())

# Forecast
fc, se, conf = fitted.forecast(15, alpha=0.05)  # 95% conf

# Make as pandas series
fc_series = pd.Series(fc, index=test.index)
lower_series = pd.Series(conf[:, 0], index=test.index)
upper_series = pd.Series(conf[:, 1], index=test.index)

# Plot
plt.figure(figsize=(12,5), dpi=100)
plt.plot(train_x, label='training')
plt.plot(test_x, label='actual')
plt.plot(fc_series, label='forecast')
plt.fill_between(lower_series.index, lower_series, upper_series, 
                 color='k', alpha=.15)
plt.title('Forecast vs Actuals')
plt.legend(loc='upper left', fontsize=8)
plt.show()



# [arima_predict(x, weekly.timestamp) for x in [weekly_posts.postID, weekly.num_comments, weekly.op_upvotes]]