# Reddit Data Collection

The goal is to collect posts from Reddit that mirror our Twitter data collection using the [PushShift API](https://github.com/pushshift/api). To that end we can create a function that will walk through search results for posts that match a given query defined by the kwargs that are passed in, and a start/end time range.

In [26]:
import json
import time
import pandas
import requests

from datetime import datetime

def search_pushshift(start, end, max_errors=30, sleep=1, **kwargs):
    """Yield PushShift submissions matching a query within a time range.

    The range is walked from ``end`` back towards ``start`` in windows
    expressed as "hours ago" relative to the current time. Windows start
    one hour wide and double in width whenever a window comes back empty.

    Args:
        start: naive ``datetime`` marking the oldest submission wanted.
        end: naive ``datetime`` marking the newest submission wanted.
        max_errors: number of consecutive failed requests (non-200 status
            or raised exception) tolerated before giving up.
        sleep: seconds to pause after every request.
        **kwargs: extra query-string parameters sent to the search
            endpoint unchanged (e.g. ``q=...`` or ``title=...``).

    Yields:
        Submission dicts as returned by the API, each with an added
        ``created`` key holding ``created_utc`` converted with
        ``datetime.fromtimestamp`` (i.e. local time).
    """
    url = "https://api.pushshift.io/reddit/search/submission"

    params = kwargs
    # NOTE(review): the PushShift docs appear to name the page-size
    # parameter `size`; confirm that `results` is honoured by the API.
    params['results'] = 500

    now = datetime.now()
    hour = int((now - end).total_seconds() / (60 * 60))
    num_hours = int((now - start).total_seconds() / (60 * 60))
    step = 1
    errs = 0

    while hour < num_hours:
        params['before'] = f'{hour}h'
        params['after'] = f'{hour + step}h'
        try:
            # a timeout keeps a stalled connection from hanging the crawl;
            # a timed-out request is counted as an error and retried
            resp = requests.get(url, params=params, timeout=60)
            if resp.status_code != 200:
                errs += 1
            else:
                errs = 0
                results = resp.json()['data']
                if len(results) > 0:
                    for result in results:
                        result['created'] = datetime.fromtimestamp(result['created_utc'])
                        if result['created'] < start:
                            break
                        yield result
                    hour += step
                    step = 1
                elif hour + step > num_hours:
                    # the empty window already reaches past `start`, so
                    # nothing is left to find; widening it further would
                    # loop forever
                    break
                else:
                    step = 2 * step

        except Exception as e:
            print(f'got exception: {e}')
            errs += 1

        if errs > max_errors:
            print(f'bailing after {max_errors} consecutive errors')
            # actually stop: previously the message was printed but the
            # loop carried on retrying indefinitely
            return

        time.sleep(sleep)

So, for example, you can search for "police violence" posts from 2020-01-01 to 2020-01-04:

In [3]:
# Demo: search with the `q` parameter over a three-day window and print
# the id, creation time and title of each submission found.
for result in search_pushshift(q='"police violence"', start=datetime(2020, 1, 1), end=datetime(2020, 1, 4)):
    print(result['id'], result['created'], result['title'])

ejmjlz 2020-01-03 22:01:59 Imran Khan Tweets Indian Police Pogrom On Muslims
eji17g 2020-01-03 16:49:16 [IN] - Videos of police violence surface in Bihar | The Hindu
ejhke1 2020-01-03 16:16:32 Pakistan's Prime Minister Imran Khan Tweets Fake Video From Bangladesh, Tries To Pass It Off As Police Violence In India.
eib2oo 2020-01-01 00:05:44 35 [M4F] Seattle/Online - Nerdy socialist viking seeks SPICY chats and fun times
eid6ya 2020-01-01 03:14:21 r/selfawarewolves advocating for anti-police violence "cops should be subject to abuse everywhere they go. They’ve earned it" (100+ upvotes)


Now we can try the same but searching only the title of the submission:

In [4]:
# Demo: same window as before, but passing the phrase as the `title`
# parameter instead of `q`.
for result in search_pushshift(title='"police violence"', start=datetime(2020, 1, 1), end=datetime(2020, 1, 4)):
    print(result['id'], result['created'], result['title'])


eji17g 2020-01-03 16:49:16 [IN] - Videos of police violence surface in Bihar | The Hindu
ejhke1 2020-01-03 16:16:32 Pakistan's Prime Minister Imran Khan Tweets Fake Video From Bangladesh, Tries To Pass It Off As Police Violence In India.
eid6ya 2020-01-01 03:14:21 r/selfawarewolves advocating for anti-police violence "cops should be subject to abuse everywhere they go. They’ve earned it" (100+ upvotes)


## Data Collection

Experimentation has shown that the following properties come back from the PushShift API. We can use them to create a CSV where each row is a submission and each column is a property of that submission.

In [2]:
# Submission properties used as CSV columns. Every name appears exactly once:
# csv.DictWriter emits one column per fieldname, so repeated names would
# produce duplicate columns in the output.
cols = [
    'pinned', 'secure_media_embed', 'post_hint', 'parent_whitelist_status',
    'author_premium', 'pwls', 'all_awardings', 'author_flair_richtext',
    'whitelist_status', 'gildings', 'media_metadata', 'score',
    'author_cakeday', 'thumbnail_height', 'treatment_tags', 'created',
    'media_embed', 'is_crosspostable', 'upvote_ratio', 'is_gallery',
    'author_flair_template_id', 'spoiler', 'secure_media',
    'crosspost_parent_list', 'url_overridden_by_dest', 'allow_live_comments',
    'subreddit_id', 'link_flair_css_class', 'url', 'can_mod_post',
    'author_flair_text', 'media', 'domain', 'preview', 'wls',
    'author_flair_type', 'is_original_content', 'locked',
    'removed_by_category', 'thumbnail_width', 'permalink', 'awarders',
    'suggested_sort', 'link_flair_background_color', 'content_categories',
    'link_flair_text_color', 'selftext', 'thumbnail', 'link_flair_type',
    'author_flair_background_color', 'is_robot_indexable', 'media_only',
    'crosspost_parent', 'over_18', 'author_patreon_flair',
    'total_awards_received', 'author_flair_text_color', 'author_fullname',
    'contest_mode', 'gallery_data', 'id', 'num_comments', 'retrieved_on',
    'is_video', 'send_replies', 'is_self', 'author', 'subreddit', 'stickied',
    'full_link', 'no_follow', 'poll_data', 'author_flair_css_class',
    'subreddit_type', 'is_reddit_media_domain', 'num_crossposts',
    'link_flair_template_id', 'is_meta', 'subreddit_subscribers',
    'created_utc', 'link_flair_text', 'link_flair_richtext', 'title',
    'edited', 'banned_by', 'rpan_video', 'event_start',
    # NOTE(review): was 'event_enshowsd', which looks like a garbled
    # 'event_end' (it sits between event_start and event_is_live) — confirm.
    'event_end',
    'collections', 'steward_reports', 'discussion_type', 'gilded',
    'event_is_live',
]

Now we are ready to do the data collection for the following datasets:

In [4]:
# Search phrases to collect; each one is written to its own CSV file.
# Phrases wrapped in double quotes keep those quotes in the query string.
queries = [
    '"black people"',
    '"racial wealth gap"',
    'racism'
]

In [None]:
import re
import csv 
import sys
import pathlib

# Directory (relative to the notebook's working directory) where the
# collected CSV files are written.
data_dir = pathlib.Path('../data/reddit.pull')

def collect(query):
    """Collect submissions whose title matches ``query`` into a CSV file.

    The file is written to ``data_dir`` and named after the query with
    spaces replaced by underscores and double quotes removed. One row is
    written per submission, restricted to the columns in ``cols``; a dot
    is printed for every row as a progress indicator.

    Args:
        query: search phrase passed to ``search_pushshift`` as ``title``.
    """
    norm_q = query.replace(' ', '_').replace('"', '')
    # NOTE(review): later cells glob for 'reddit_*.csv', but this name has
    # no 'reddit_' prefix — confirm whether files are renamed by hand.
    csv_file = data_dir / f'{norm_q}.csv'
    print()
    print(csv_file)

    # make sure the output directory exists before opening the file
    data_dir.mkdir(parents=True, exist_ok=True)

    # newline='' is required by the csv module so that embedded newlines
    # (e.g. in selftext) and line endings are written correctly
    with csv_file.open('w', newline='') as fh:
        out = csv.DictWriter(fh, fieldnames=cols, extrasaction='ignore')
        out.writeheader()
        for result in search_pushshift(title=query, start=datetime(2020, 5, 25), end=datetime(2021, 5, 25), sleep=1):
            out.writerow(result)
            sys.stdout.write('.')
            sys.stdout.flush()
        
# Run the collection once for every configured query.
for query in queries:
    collect(query)
    

## Comments

Since we are analyzing conversations it's important to get the comments for these submissions as well. PushShift has an API to get the comment IDs for a post ID, e.g.

https://api.pushshift.io/reddit/submission/comment_ids/pe65v4

And then they have an API to get the comments themselves using the discovered comment ids:

https://api.pushshift.io/reddit/comment/search?ids=hav4td5,havo62q

The function `get_comments()` will take a post id and yield the comment objects for that post. Since there are lots of comments for some posts we need to be careful to only ask for 50 comment ids at a time.

In [63]:
import itertools
        
def get_comments(post_id, sleep=.5):
    """Yield the comment objects PushShift holds for a submission.

    The comment ids are looked up first, then the comments themselves are
    fetched 50 ids per request. If any request returns a non-200 status
    the error is printed and the generator stops.

    Args:
        post_id: id of the submission.
        sleep: seconds to wait before the first request is made.

    Yields:
        Comment dicts as returned by the comment search endpoint.
    """
    time.sleep(sleep)

    # timeouts keep a stalled connection from hanging the caller forever
    ids_resp = requests.get(
        f'https://api.pushshift.io/reddit/submission/comment_ids/{post_id}',
        timeout=60,
    )
    # check the status before parsing: an error response may not contain
    # JSON or a 'data' key, which previously surfaced as an unrelated
    # exception
    if ids_resp.status_code != 200:
        print(f'error: {ids_resp}')
        return
    comment_ids = ids_resp.json()['data']

    for ids in grouper(50, comment_ids):
        resp = requests.get(
            f'https://api.pushshift.io/reddit/comment/search?ids={",".join(ids)}',
            timeout=60,
        )
        if resp.status_code == 200:
            yield from resp.json()['data']
        else:
            print(f'error: {resp}')
            return
        
# split an iterable into consecutive tuples holding at most n items each
def grouper(n, iterable):
    """Yield successive tuples of up to ``n`` items taken from ``iterable``.

    The final tuple may be shorter than ``n``; nothing is yielded once the
    input is exhausted.
    """
    source = iter(iterable)

    def take():
        return tuple(itertools.islice(source, n))

    # iter(callable, sentinel) keeps calling take() until it returns ()
    yield from iter(take, ())

Let's try to get the first comment for post id [pe65v4](https://www.reddit.com/r/pushshift/comments/pe65v4/update_on_reddit_monthly_comment_dumps/).

In [64]:
# Take only the first comment the generator yields and pretty-print it.
# `comment` is kept as a global: a later cell derives the CSV columns from it.
comment = next(get_comments('pe65v4'))
print(json.dumps(comment, indent=2))

{
  "all_awardings": [],
  "approved_at_utc": null,
  "associated_award": null,
  "author": "Watchful1",
  "author_flair_background_color": null,
  "author_flair_css_class": null,
  "author_flair_richtext": [],
  "author_flair_template_id": null,
  "author_flair_text": null,
  "author_flair_text_color": null,
  "author_flair_type": "text",
  "author_fullname": "t2_d0z23",
  "author_is_blocked": false,
  "author_patreon_flair": false,
  "author_premium": true,
  "awarders": [],
  "banned_at_utc": null,
  "body": "Thanks! I've been looking forward to them.\n\nWill this include the recompressed older files?",
  "can_mod_post": false,
  "collapsed": false,
  "collapsed_because_crowd_control": null,
  "collapsed_reason": null,
  "collapsed_reason_code": null,
  "comment_type": null,
  "created_utc": 1630279831,
  "distinguished": null,
  "edited": false,
  "gildings": {},
  "id": "hav4td5",
  "is_submitter": false,
  "link_id": "t3_pe65v4",
  "locked": false,
  "no_follow": true,
  "parent_

### CSV

For analysis it's easiest if the conversations are converted to CSV and stored alongside the posts that they are from. Unlike the Twitter conversations we aren't constrained by an API quota, so we can attempt to get all the conversation threads.

The properties of the comment JSON object we just retrieved can be used as the columns of our CSV dataset.

In [56]:
# Column names for the comment CSVs: the keys of the sample comment, in order.
comment_cols = [*comment]

The function `save_comments()` will read a file of search results and retrieve comments for the posts that have comments. Experimentation has shown that a large value in one of the rows requires csv to be instructed to load larger cells.

In [65]:
# Raise the csv parser's maximum field size so very large cells can be read.
# field_size_limit is a function and must be *called*; assigning to it
# replaced the function with an int and left the real limit unchanged.
csv.field_size_limit(1131072)

def save_comments(posts_file):
    """Download the comments for every post listed in ``posts_file``.

    For each row whose ``num_comments`` is greater than zero, a CSV named
    ``<post id>.csv`` is written into ``<data_dir>/<posts_file stem>_convs``
    containing that post's comments, restricted to ``comment_cols``.
    Progress is printed as the post id, its comment count, and one dot per
    comment written.

    Args:
        posts_file: ``pathlib.Path`` of a CSV of submissions that has at
            least ``id`` and ``num_comments`` columns.
    """
    print(f"\nprocessing {posts_file}")
    convs_dir = data_dir / (posts_file.stem + "_convs")
    convs_dir.mkdir(exist_ok=True)

    # Open through a context manager so the handle is closed when done,
    # and with newline='' as the csv module requires so that fields
    # containing newlines are parsed correctly.
    with posts_file.open(newline='') as posts_fh:
        for post in csv.DictReader(posts_fh):

            if int(post['num_comments']) > 0:

                sys.stdout.write(f'\n{post["id"]}[{post["num_comments"]}]')
                csv_file = convs_dir / f"{post['id']}.csv"

                with csv_file.open('w', newline='') as fh:
                    out = csv.DictWriter(fh, fieldnames=comment_cols, extrasaction='ignore')
                    out.writeheader()
                    for comment in get_comments(post['id']):
                        out.writerow(comment)
                        sys.stdout.write('.')
                        sys.stdout.flush()

In [None]:
# Fetch comments for every posts file in data_dir whose name starts with
# 'reddit_'.
for post_file in data_dir.glob('reddit_*.csv'):
    save_comments(post_file)

## Sampling

Since the Twitter conversations were sampled we can randomly sample the Reddit ones too. Since we have `num_comments` we can use it to sample conversations that have at least 5 comments. We oversample since some threads no longer contain any comments (due to deletions or other discrepancies in the PushShift API).

In [116]:
import os
import sh
import pandas

def sample(posts_file, n=30):
    """Build a zip of up to ``n`` randomly sampled conversations.

    Posts with at least 5 recorded comments are sampled at random, their
    comments are downloaded into ``<data_dir>/<posts_file stem>_<n>/``, and
    conversations for which fewer than 5 comments could be retrieved are
    discarded. The directory is then zipped next to it and removed.
    Nothing is done if the zip file already exists.

    Args:
        posts_file: ``pathlib.Path`` of a CSV of submissions with ``id``
            and ``num_comments`` columns.
        n: number of conversations wanted in the sample.
    """
    sample_dir = data_dir / f'{posts_file.stem}_{n}'
    zip_file = data_dir / f'{posts_file.stem}_{n}.zip'

    if zip_file.is_file():
        print(f'already sampled {posts_file}')
        return

    # exist_ok tolerates a directory left behind by an interrupted run
    sample_dir.mkdir(exist_ok=True)

    df = pandas.read_csv(posts_file)
    df = df[df['num_comments'] >= 5]

    # we sample 5 extra posts in case we hit some posts that lack comments
    # this shouldn't happen because we've filtered on num_comments, but it does (deletes)
    # the size is capped at the rows available: DataFrame.sample raises
    # when asked for more rows than exist (without replacement)
    s = df.sample(min(n + 5, len(df)))

    convs_found = 0
    for post_id in s['id']:
        comments_found = 0
        comments_file = sample_dir / f'{post_id}.csv'

        # newline='' is required by the csv module for correct line endings
        with comments_file.open('w', newline='') as fh:
            out = csv.DictWriter(fh, fieldnames=comment_cols, extrasaction='ignore')
            out.writeheader()
            for comment in get_comments(post_id):
                comments_found += 1
                out.writerow(comment)

        if comments_found >= 5:
            print(posts_file, post_id, comments_found)
            convs_found += 1
        else:
            sh.rm(comments_file)

        if convs_found >= n:
            break

    # zip from inside data_dir so the archive holds relative paths; always
    # restore the working directory, even if zip fails
    pwd = os.getcwd()
    os.chdir(data_dir)
    try:
        sh.zip('-r', zip_file.name, sample_dir.name)
    finally:
        os.chdir(pwd)

    sh.rm('-rf', sample_dir)
    print(f'created sample {zip_file}')

In [117]:
# Sample conversations for every CSV directly inside data_dir; files that
# already have a matching zip are reported and skipped by sample().
for posts_csv in data_dir.glob('*.csv'):
    sample(posts_csv)  

already sampled ../data/reddit.pull/reddit_black_people.csv
already sampled ../data/reddit.pull/reddit_racial_wealth_gap.csv
already sampled ../data/reddit.pull/reddit_racism.csv
