In [25]:
%matplotlib inline
import inspect, os, sys, copy, pytz, re, glob, math
import simplejson as json
import pandas as pd
from dateutil import parser
import datetime
import matplotlib.pyplot as plt   # Matplotlib for plotting
import matplotlib.dates as md
import numpy as np
import seaborn as sns
import csv
import statsmodels.formula.api as smf  # for doing statistical regression
import statsmodels.api as sm      # access to the wider statsmodels library, including R datasets
from collections import Counter, defaultdict
utc=pytz.UTC

ENV = "production"
BASE_DIR = "/home/nathan/reddit_archive/"
sys.path.append(BASE_DIR)

### FILTER OUT DEPRECATION WARNINGS ASSOCIATED WITH DECORATORS
# https://github.com/ipython/ipython/issues/9242
import warnings
warnings.filterwarnings('ignore', category=DeprecationWarning, message='.*use @default decorator instead.*')

# Load Comments and Posts from r/feminism from 2017
Posts and Comments have been extracted by select_subreddit_posts.py and select_subreddit_comments.py and placed in ~/reddit_archives/feminism_posts_2017.json

Source: Felipe Hoffa's Google BigQuery dataset "fh-bigquery" from reddit, prepared by Jason Baumgartner 

In [26]:
posts_filename = "Feminism_posts_2017.json"
comments_filename = "feminism_comments_2017.json"

In [27]:
all_posts = []
post_ids = set()
post_count = 0 
with open(os.path.join(BASE_DIR, "selected_output", posts_filename), "r") as f:
    for line in f:
        item = json.loads(line)
        if(item['id'] not in post_ids):
            item['created'] = datetime.datetime.utcfromtimestamp(float(item['created_utc']))
            all_posts.append(item)
            post_ids.add(item['id'])
        post_count += 1

        
all_posts = sorted(all_posts, key = lambda x: x['created'])        

all_comments = []
comment_ids = set()
comment_count = 0
with open(os.path.join(BASE_DIR, "selected_output", comments_filename), "r") as f:
    for line in f:
        item = json.loads(line)
        if(item['id'] not in comment_ids):
            item['created'] = datetime.datetime.utcfromtimestamp(float(item['created_utc']))
            item['body.charlength'] = len(item['body'])
            #item['body'] = None
            all_comments.append(item)
            comment_ids.add(item['id'])
        comment_count += 1
all_comments = sorted(all_comments, key = lambda x: x['created'])        

print("Loaded {0} Posts and {1} Comments".format(len(all_posts), len(all_comments)))
print("Loaded {0} Post lines and {1} Comment lines".format(post_count, comment_count))
#print("Posts have a mean of {0} comments".format(np.mean([len(x) for x in all_comments.values()])))

Loaded 11205 Posts and 41960 Comments
Loaded 11323 Post lines and 47103 Comment lines


### Create a Dataset of Previous Posts and Previous Comments by an Account

In [28]:
def previous_actions():
    return {"comments":[], "posts":[]}

author_records = defaultdict(previous_actions)

for item in all_comments:
    author_id = item['author']
    author_records[author_id]['comments'].append(item)

for item in all_posts:
    author_id = item['author']
    author_records[author_id]['posts'].append(item)

### Count Previous Posts and Comments in the Past 180 Days by Account

In [29]:
#one_eighty_days in seconds

def count_if_eligible(current, comparator):
    one_eighty_days = 60*60*24*180
    if(current['created'] > comparator['created'] and 
       (current['created'] - comparator['created']).total_seconds()<one_eighty_days):
        return 1
    return 0

earliest_date = parser.parse("Jan 1, 2017 00:00:00")  + datetime.timedelta(days=180)

sys.stdout.write("\ncomments")
sys.stdout.flush()
items_processed = 0
for item in all_comments:
    previous_comments = 0
    previous_posts = 0
    for comment in author_records[item['author']]['comments']:
        if(item['created'] > comment['created']):
            previous_comments += count_if_eligible(item, comment) 
    for post in author_records[item['author']]['posts']:
        if(item['created'] > post['created']):
            previous_posts += count_if_eligible(item, post)    
    items_processed += 1
    item['previous.comments'] = previous_comments
    item['previous.posts'] = previous_posts
    item['eligible'] = item['created'] > earliest_date
    
    if(items_processed % 1000 == 0):
        sys.stdout.write(".")
        sys.stdout.flush()
      
sys.stdout.write("\nposts")
sys.stdout.flush()
items_processed = 0
for item in all_posts:
    previous_comments = 0
    previous_posts = 0
    for comment in author_records[item['author']]['comments']:
        if(item['created'] > comment['created']):
            previous_comments += count_if_eligible(item, comment) 
    for post in author_records[item['author']]['posts']:
        if(item['created'] > post['created']):
            previous_posts += count_if_eligible(item, post)    
    items_processed += 1
    item['previous.comments'] = previous_comments
    item['previous.posts'] = previous_posts
    item['eligible'] = item['created'] > earliest_date

    if(items_processed % 1000 == 0):
        sys.stdout.write(".")
        sys.stdout.flush()


comments.........................................
posts...........

### Summarize Newcomer Comments and Posts

In [30]:
eligible_posts = [x for x in all_posts if x['eligible']]
post_ids = set([x['id'] for x in eligible_posts])
eligible_comments = [x for x in all_comments if x['link_id'].replace("t3_", "") in post_ids]

#eligible_comments = [x for x in all_comments if x['eligible']]
days_in_dataset = (all_posts[-1]['created'] - eligible_posts[0]['created']).total_seconds() / 60. / 60. / 24.

In [31]:
len([x for x in eligible_posts if x['previous.posts']==0]) / len(eligible_posts)

0.5246371920350995

In [32]:
print("{0} eligible first-time posts per day in r/Feminism".format(len([x for x in eligible_posts if x['previous.posts']>0]) / days_in_dataset))
print(eligible_posts[0]['created'])

15.228853460465698 eligible first-time posts per day in r/Feminism
2017-06-30 00:09:20


## Write to File

In [33]:
pd.DataFrame(eligible_posts).to_csv("feminism/feminism_posts_06.30.2017-12.31.2107.csv")
pd.DataFrame(eligible_comments).to_csv("feminism/feminism_comments_on_posts_with_body_06.30.2017-12.31.2107.csv")

In [34]:
#eligible_comments[0]