In [1]:
import numpy as np
from tqdm import tqdm_notebook as tqdm
import pandas as pd
import math
import matplotlib.pyplot as plt
from scipy.optimize import curve_fit
import pymc3 as pm
import os
import matplotlib.cm as cm
from random import random
%matplotlib inline

  from ._conv import register_converters as _register_converters


# Utilities for loading data

In [2]:
def isflt(s):
    """Return True when ``s`` is an unsigned decimal literal such as ``'12.5'``.

    The string must split on ``'.'`` into exactly two pieces and both pieces
    must consist solely of digits. Signs, exponents, and a missing integer or
    fractional part (``'.5'``, ``'1.'``) are all rejected.
    """
    pieces = s.split('.')
    return len(pieces) == 2 and all(piece.isdigit() for piece in pieces)

In [3]:
def add_header(df, header):
    """Rename the columns of ``df`` to ``header`` and recover the old labels as a row.

    Intended for frames whose current column labels are really a data record
    (e.g. a header-less CSV read with the default ``header=0``). The old labels
    are converted back to integers where they look numeric and appended as the
    LAST row of the result.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column labels hold a data record. It is not modified.
    header : list
        New column names; must have one entry per column of ``df``.

    Returns
    -------
    pandas.DataFrame
        A new frame with ``header`` as columns, the rows of ``df`` followed by
        the recovered record, and a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If ``len(header)`` does not match the number of columns.
    """
    def _restore(label):
        # Non-string labels (e.g. integer column positions) have nothing to parse.
        if not isinstance(label, str):
            return label
        if label.isdigit():
            return int(label)
        # Labels such as '1.1' keep only the integer part -- presumably this
        # undoes the '.N' suffix read_csv adds to duplicated column labels.
        # NOTE(review): a genuine float value would be truncated here as well.
        if isflt(label):
            return int(label.split('.')[0])
        # startswith() instead of label[0] so an empty label cannot raise.
        if label.startswith('-') and label[1:].isdigit():
            return -int(label[1:])
        return label

    # Build the record from a fresh list so the caller's column Index is never
    # written to (the previous version edited df.columns.values in place).
    first_record = [_restore(label) for label in df.columns]

    # A shallow copy gets its own axes, so renaming does not touch the caller's frame.
    renamed = df.copy(deep=False)
    renamed.columns = header

    # dtype=object keeps the recovered values exactly as converted above.
    recovered = pd.DataFrame([first_record], columns=header, dtype=object)

    # DataFrame.append was removed in pandas 2.0; concat is the supported equivalent.
    return pd.concat([renamed, recovered], ignore_index=True)

In [4]:
def date_parser(date):
    """Turn a ``YYYY-MM-DD[T...]`` string into a rough day number.

    Everything from the first ``'T'`` onwards is ignored. The result is
    ``(year - 2000) * 365 + month * 30 + day``, i.e. an approximation that
    treats every year as 365 days and every month as 30 days.
    """
    day_part, _, _ = date.partition('T')
    year, mon, day = (int(field) for field in day_part.split('-'))
    return (year - 2000) * 365 + mon * 30 + day

# Load data

In [5]:
# Column names applied to votes.csv; add_header() also moves the record that
# read_csv consumed as a header line back into the data.
votes_header = [
    'community',
    'Id',
    'PostId',
    'VoteTypeId',
    'CreationDate',
]

votes = add_header(pd.read_csv('votes.csv'), votes_header)
votes = votes.rename(columns={'Id': 'VoteId'})
votes.head(5)

Unnamed: 0,community,VoteId,PostId,VoteTypeId,CreationDate
0,3dprinting,2,2,2,2016-01-12T00:00:00.000
1,3dprinting,3,3,2,2016-01-12T00:00:00.000
2,3dprinting,4,1,2,2016-01-12T00:00:00.000
3,3dprinting,5,2,16,2016-01-12T00:00:00.000
4,3dprinting,6,4,2,2016-01-12T00:00:00.000


In [6]:
# Column names applied to users.csv; 'Id' is the per-community user id and is
# renamed to 'UserId' once the frame is loaded.
users_header = [
    'community', 'Id', 'Reputation', 'CreationDate', 'LastAccessDate',
    'Age', 'Views', 'UpVotes', 'DownVotes', 'AccountId',
]

users = add_header(pd.read_csv('users.csv'), users_header)
users = users.rename(columns={'Id': 'UserId'})
users.head(3)

Unnamed: 0,community,UserId,Reputation,CreationDate,LastAccessDate,Age,Views,UpVotes,DownVotes,AccountId
0,3dprinting,1,101,2016-01-12T18:02:28.700,2017-05-12T19:12:03.053,,221,0,14,34933
1,3dprinting,2,101,2016-01-12T18:04:23.367,2017-08-16T19:45:31.167,,4,0,0,102159
2,3dprinting,3,101,2016-01-12T18:04:39.963,2017-07-04T15:04:50.553,26.0,3,0,0,89201


In [7]:
# Peek at the first user record as a raw array to check the column positions
# used when building the id lookup.
users.iloc[0].values

array(['3dprinting', 1, 101, '2016-01-12T18:02:28.700',
       '2017-05-12T19:12:03.053', nan, 221, 0, 14, 34933.0], dtype=object)

In [8]:
# Build link[community][local user id] -> global account id
link = {}
# Users without an AccountId cannot be linked, so drop them first
users = users[users['AccountId'].notnull()]

for record in users.values:
    # Positions follow users_header: community first, UserId second, AccountId last
    site, site_user_id, account_id = record[0], record[1], record[-1]
    # AccountId may arrive as a float or a numeric string, hence int(float(...))
    link.setdefault(site, {})[site_user_id] = int(float(account_id))

In [9]:
# Column names applied to posts.csv. Later cells address these columns by
# position (0 = community, 2 = PostTypeId, 5 = CreationDate, 6 = Score,
# 8 = OwnerUserId), so the order here matters.
posts_header = [
    'community', 'Id', 'PostTypeId', 'ParentId', 'AcceptedAnswerId',
    'CreationDate', 'Score', 'ViewCount', 'OwnerUserId',
    'LastEditorUserId', 'LastEditDate', 'LastActivityDate',
    'AnswerCount', 'CommentCount', 'FavoriteCount',
]

posts = add_header(pd.read_csv('posts.csv'), posts_header)
posts = posts.rename(columns={'Id': 'PostId'})
posts.head(3)

Unnamed: 0,community,PostId,PostTypeId,ParentId,AcceptedAnswerId,CreationDate,Score,ViewCount,OwnerUserId,LastEditorUserId,LastEditDate,LastActivityDate,AnswerCount,CommentCount,FavoriteCount
0,3dprinting,2,1,,12,2016-01-12T18:45:51.287,22,1153,20,334,2016-11-15T16:16:11.163,2016-11-15T16:16:11.163,4,1,1
1,3dprinting,3,1,,152,2016-01-12T18:46:22.083,16,738,11,11,2016-01-12T22:00:36.347,2016-09-19T15:41:06.537,3,5,1
2,3dprinting,4,1,,1289,2016-01-12T18:50:55.973,15,155,16,98,2016-06-09T02:10:35.890,2016-06-10T13:32:20.493,4,0,2


In [11]:
# Number of distinct communities present in the posts table
posts.community.nunique()

93

# Final touches on posts: `posts_np` is a NumPy array sorted by global user id

In [13]:
# Parser for time, output is days after 2008
# Not accurate but close enough
def parse_time(time_str):
    """Convert a ``YYYY-MM-DD[Thh:mm:ss]`` timestamp to approximate days since 2008-01-01.

    Every year counts as 365 days and every month as 30 days, so the result is
    only a coarse approximation of elapsed days.

    Parameters
    ----------
    time_str : str or int
        Timestamp string. The time-of-day part (after ``'T'`` or a space) is
        optional and ignored. Integers, including NumPy integers, are taken to
        be already converted and are returned unchanged.

    Returns
    -------
    int
        ``(year - 2008) * 365 + (month - 1) * 30 + (day - 1)``.

    Raises
    ------
    ValueError
        If the date part is not three ``'-'``-separated integers.
    """
    # Already-converted values pass through, so applying this twice is harmless.
    if isinstance(time_str, (int, np.integer)):
        return time_str

    # Split off the time of day. The previous slice up to find('T') dropped the
    # last character whenever no 'T' was present (find() returns -1), so a
    # plain '2016-01-12' was parsed as day 1.
    date_part = time_str.replace('T', ' ').split()[0]
    year, month, day = (int(field) for field in date_part.split('-'))

    return (year - 2008) * 365 + (month - 1) * 30 + day - 1

In [14]:
# Convert PostType Id to verbal description
def parse_postid(postid):
    """Map a numeric post type id to a label.

    ``1`` becomes ``"question"``, ``2`` becomes ``"answer"`` and any other
    ``int`` becomes ``"others"``. Values whose type is not exactly ``int``
    (strings, floats, NaN, ...) are returned untouched.
    """
    if type(postid) is not int:
        return postid

    labels = {1: "question", 2: "answer"}
    return labels.get(postid, "others")

In [15]:
# Convert local account id to global account id
def convert_id(local_id, community, link):
    """Look up the global account id for a community-local user id.

    ``link`` is a two-level mapping ``link[community][local_id]``. When either
    the community or the local id is missing, ``-1`` is returned instead.
    """
    known = community in link and local_id in link[community]
    return link[community][local_id] if known else -1

In [16]:
# Rewrite two columns of posts:
#   CreationDate -> relative time after 2008-01-01 in days (parse_time)
#   PostTypeId   -> digit replaced by a descriptive string (parse_postid)
for column, converter in (('CreationDate', parse_time), ('PostTypeId', parse_postid)):
    posts[column] = posts[column].apply(converter)

In [17]:
# Take posts as a NumPy array and replace column 8 (OwnerUserId, a per-community
# id) with the global account id; convert_id gives -1 when no link is known.
posts_np = posts.values
for idx in range(len(posts_np)):
    posts_np[idx, 8] = convert_id(posts_np[idx, 8], posts_np[idx, 0], link)

In [18]:
# Order the rows by global account id (column 8) ...
by_owner = posts_np[:, 8].argsort()
posts_np = posts_np[by_owner]
# ... then drop posts whose owner could not be linked (marked with -1)
has_owner = posts_np[:, 8] != -1
posts_np = posts_np[has_owner]

# Let's roll: write one CSV per active user

In [20]:
# Write one CSV per user with more than 20 posts into the stat428/ directory.
os.makedirs("stat428", exist_ok=True)

# posts_np is sorted by global account id (column 8), so each user's posts form
# one contiguous run of rows. last_idx is the first row of the current run.
last_idx = 0
for idx, row in enumerate(posts_np):
    # A run ends on the last row overall or when the next row has a different owner.
    if idx == len(posts_np) - 1 or row[8] != posts_np[idx+1][8]:
        # Keep community, PostTypeId, Score and CreationDate (columns 0, 2, 6, 5).
        user_posts = posts_np[last_idx:idx+1][:, [0, 2, 6, 5]]
        if len(user_posts) > 20:
            pd.DataFrame(user_posts).to_csv(
                os.path.join("stat428", str(row[8]) + ".csv"),
                header=['community', 'action', 'score', 'date'],
                index=False,
            )

        # The next run starts on the row AFTER this one. Using `idx` here made
        # every later user's slice begin with the previous user's final post,
        # which polluted the file and inflated the count tested above.
        last_idx = idx + 1