In [1]:
import sys
# Record the interpreter version this notebook was executed with.
print(sys.version)

3.7.3 (default, Mar 27 2019, 09:23:15) 
[Clang 10.0.1 (clang-1001.0.46.3)]


In [91]:
import time
import numpy as np
import pandas as pd
from pandas.io.json import json_normalize
import itertools
import matplotlib.pyplot as plt
from elasticsearch import Elasticsearch
from elasticsearch.helpers import scan
# Show up to 100 characters per cell so titles and permalinks stay readable.
pd.set_option('display.max_colwidth', 100)

# Connection settings for the Pushshift Elasticsearch endpoint.
HOST, PORT = "elastic.pushshift.io", 80
TIMEOUT = 1000  # handed to the client as its `timeout` argument

es = Elasticsearch(
    hosts=[dict(host=HOST, port=PORT)],
    timeout=TIMEOUT,
)
# Last expression of the cell: displays the cluster metadata as a connectivity check.
es.info()

{'name': 'node-2',
 'cluster_name': 'pushshift',
 'cluster_uuid': '2O6kg4ywRIGfUnIVFA078A',
 'version': {'number': '5.6.3',
  'build_hash': '1a2f265',
  'build_date': '2017-10-06T20:33:39.012Z',
  'build_snapshot': False,
  'lucene_version': '6.6.1'},
 'tagline': 'You Know, for Search'}

In [70]:
# Value passed as `index=` to every submission search in this notebook.
SUBMISSIONS_INDEX = "rs/submissions"
# BATCH_SIZE = 10000 # set to maximum number of entries that can be retrieved in a single call
# Page size handed to `scan` by get_all_results_helper.
# NOTE(review): this is larger than the 10000 the line above calls the per-call
# maximum — confirm the server really honours it.
BATCH_SIZE = 25000

def get_all_results_helper(es, index, query_body, total_size=-1):
    """Stream the hits matching ``query_body`` from ``index`` via ``scan``.

    Args:
        es: client object handed to ``elasticsearch.helpers.scan``.
        index: index to search.
        query_body: search body, passed to ``scan`` as ``query``.
        total_size: maximum number of hits to yield; ``-1`` means no cap.

    Returns:
        The iterable returned by ``scan``, wrapped in ``itertools.islice``
        when ``total_size`` is not ``-1``.
    """
    # raise_on_error=False keeps the scroll going when only some shards answer.
    # The Pushshift shards hold duplicate entries, so callers are expected to
    # de-duplicate the hits themselves.
    hits = scan(
        es,
        index=index,
        query=query_body,
        size=BATCH_SIZE,
        scroll='1m',
        clear_scroll=False,
        raise_on_error=False,
    )
    if total_size == -1:
        return hits
    return itertools.islice(hits, total_size)

#     def extract_hits(batch):
#         return batch["hits"]["hits"]
#     results = []
#     batch = es.search(index=SUBMISSIONS_INDEX, body=query_body, size=BATCH_SIZE, scroll="1m", preference="_local")
#     scroll_id = batch["_scroll_id"]
#     results.extend(extract_hits(batch))
#     while extract_hits(batch) and (total_size == -1 or len(results) < total_size):
#         batch = es.scroll(scroll_id, scroll="1m")
#         results.extend(extract_hits(batch))
#     return results if total_size == -1 else results[:total_size]

def _search_title_phrase(phrase, total_size=-1):
    """Stream submissions whose title matches the quoted ``phrase``.

    Builds the ``query_string`` query ``(title:"<phrase>")`` sorted by ``_doc``
    and delegates to ``get_all_results_helper`` on ``SUBMISSIONS_INDEX``.

    Args:
        phrase: text placed verbatim between the double quotes of the query.
        total_size: maximum number of hits to yield; ``-1`` means no cap.

    Returns:
        Whatever ``get_all_results_helper`` returns (an iterable of raw hits).
    """
    query_body = {
        "query": {"query_string": {"query": '(title:"{}")'.format(phrase)}},
        "sort": ["_doc"],
    }
    return get_all_results_helper(es, SUBMISSIONS_INDEX, query_body, total_size)


def get_fun_fact_in_title(total_size=-1):
    """Stream submissions matching the title phrase ``"fun fact: "``."""
    return _search_title_phrase("fun fact: ", total_size)


def get_til_in_title(total_size=-1):
    """Stream submissions matching the title phrase ``"til: "``."""
    return _search_title_phrase("til: ", total_size)


def get_ysk_in_title(total_size=-1):
    """Stream submissions matching the title phrase ``"ysk: "``."""
    return _search_title_phrase("ysk: ", total_size)

In [115]:
# Peek at a single hit of the "til: " title query to inspect the document layout.
es.search(
    index=SUBMISSIONS_INDEX,
    body={"query": {"query_string": {"query": "(title:\"til: \")"}}},
    size=1,
)

{'took': 720,
 'timed_out': False,
 '_shards': {'total': 40, 'successful': 29, 'skipped': 0, 'failed': 0},
 'hits': {'total': 1716151,
  'max_score': 12.554386,
  'hits': [{'_index': 'rs_deltab',
    '_type': 'submissions',
    '_id': '429256991',
    '_score': 12.554386,
    '_source': {'pinned': False,
     'over_18': False,
     'title': 'TIL TIL TIL TIL TIL TIL TIL TIL TIL TIL TIL TIL TIL TIL TIL TIL',
     'subreddit': 'circlejerk',
     'subreddit_id': 4605573,
     'num_comments': 0,
     'score': 1,
     'suggested_sort': 'confidence',
     'whitelist_status': 'all_ads',
     'num_crossposts': 0,
     'can_mod_post': False,
     'spoiler': False,
     'id': 429256991,
     'created_utc': 1506844608,
     'locked': False,
     'is_self': True,
     'selftext': '[deleted]',
     'thumbnail': 'default',
     'author': '[deleted]',
     'is_crosspostable': False,
     'brand_safe': True,
     'url': 'https://www.reddit.com/r/circlejerk/comments/73kgtb/til_til_til_til_til_til_til_ti

In [71]:
def ingest_into_pandas_and_normalize(results):
    """Load raw search hits into pandas and flatten their ``_source`` documents.

    Progress (hit counts before/after de-duplication and the elapsed time) is
    printed to stdout.

    Args:
        results: iterable of hit dicts, each carrying an ``_id`` and a
            ``_source`` mapping. May be a generator; it is consumed once.

    Returns:
        DataFrame with one row per unique ``_id`` (first occurrence kept) and
        a fresh ``0..n-1`` index; columns come from the flattened ``_source``
        documents. An empty DataFrame is returned when there are no hits.
    """
    start_time = time.time()
    df = pd.DataFrame(results)
    print("ingested {} results".format(len(df.index)))
    if df.empty:
        # A frame built from zero hits has no "_id" column, so
        # drop_duplicates("_id") would raise KeyError.
        sources = []
    else:
        # Hand json_normalize a plain list: the de-duplicated Series has gaps
        # in its index, and whether those labels leak into the result depends
        # on the pandas version.
        sources = df.drop_duplicates("_id")["_source"].tolist()
    print("retained {} results after dropping duplicates".format(len(sources)))
    dfn = json_normalize(sources) if sources else pd.DataFrame()
    end_time = time.time()
    print("took {:.4f}s".format(end_time - start_time))
    return dfn

In [80]:
# Pull up to 1,500,000 "fun fact: " title hits, flatten them and persist them as CSV.
results = get_fun_fact_in_title(1500000)
fun_fact_dfn = ingest_into_pandas_and_normalize(results)
# header=True writes the column names as the first row of the file.
fun_fact_dfn.to_csv('fun_fact_title.csv', header=True, index=False)

Scroll request has only succeeded on 25 shards out of 40.


ingested 96658 results
retained 32962 results after dropping duplicates
took 31.3243s


In [27]:
# Show the row labelled 0, hiding the fields that are missing for it.
fun_fact_dfn.loc[0].dropna()

author                                                     [deleted]
created_utc                                               1358750166
domain                                                     imgur.com
id                                                          72184347
is_self                                                        False
mod_reports                                                       []
num_comments                                                       0
over_18                                                        False
permalink       /r/funny/comments/16z5sr/fun_fact_about_the_vatican/
retrieved_on                                             1.41318e+09
score                                                              0
stickied                                                       False
subreddit                                                      funny
subreddit_id                                             4.59443e+06
thumbnail                         

In [54]:
# Sanity check: `count` equal to `unique` means no submission id appears twice.
fun_fact_dfn["id"].astype('str').describe()

count         39156
unique        39156
top       253040073
freq              1
Name: id, dtype: object

In [83]:
# Pull up to 2,000,000 "til: " title hits, flatten them and persist them as CSV.
results = get_til_in_title(2000000)
til_dfn = ingest_into_pandas_and_normalize(results)
# header=True writes the column names as the first row of the file.
til_dfn.to_csv('til_title.csv', header=True, index=False)

Scroll request has only succeeded on 25 shards out of 40.
Scroll request has only succeeded on 13 shards out of 25.
Scroll request has only succeeded on 13 shards out of 25.
Scroll request has only succeeded on 13 shards out of 25.
Scroll request has only succeeded on 13 shards out of 25.
Scroll request has only succeeded on 13 shards out of 25.
Scroll request has only succeeded on 13 shards out of 25.
Scroll request has only succeeded on 13 shards out of 25.
Scroll request has only succeeded on 13 shards out of 25.
Scroll request has only succeeded on 13 shards out of 25.
Scroll request has only succeeded on 13 shards out of 25.
Scroll request has only succeeded on 13 shards out of 25.
Scroll request has only succeeded on 13 shards out of 25.
Scroll request has only succeeded on 13 shards out of 25.
Scroll request has only succeeded on 13 shards out of 25.
Scroll request has only succeeded on 13 shards out of 25.
Scroll request has only succeeded on 13 shards out of 25.
Scroll request

ingested 3000000 results
retained 550000 results after dropping duplicates
took 765.5250s


In [84]:
# Show the row labelled 0, hiding the fields that are missing for it.
til_dfn.loc[0].dropna()

author                                                                                        BannedINDC
created_utc                                                                                   1304372001
domain                                                                                        avclub.com
id                                                                                              28674356
is_self                                                                                            False
num_comments                                                                                           0
over_18                                                                                            False
permalink                  /r/todayilearned/comments/h2l9w/til_corey_feldman_already_made_a_movie_about/
score                                                                                                  1
subreddit                                              

In [85]:
# Sanity check: `count` equal to `unique` means no submission id appears twice.
til_dfn["id"].astype('str').describe()

count       550000
unique      550000
top       34118270
freq             1
Name: id, dtype: object

In [86]:
# Pull up to 1,500,000 "ysk: " title hits, flatten them and persist them as CSV.
results = get_ysk_in_title(1500000)
ysk_dfn = ingest_into_pandas_and_normalize(results)
# header=True writes the column names as the first row of the file.
ysk_dfn.to_csv('ysk_title.csv', header=True, index=False)

Scroll request has only succeeded on 25 shards out of 40.


ingested 100019 results
retained 35717 results after dropping duplicates
took 32.1365s


In [92]:
# Show the row labelled 0, hiding the fields that are missing for it.
ysk_dfn.loc[0].dropna()

author                                                                                                 ruddelsticks
created_utc                                                                                              1340224891
domain                                                                                                   google.com
id                                                                                                         52651175
is_self                                                                                                       False
num_comments                                                                                                      1
over_18                                                                                                       False
permalink                                  /r/YouShouldKnow/comments/vchwn/ysk_that_you_can_use_google_mapmaker_to/
score                                                                   

In [93]:
# Sanity check: `count` equal to `unique` means no submission id appears twice.
ysk_dfn["id"].astype('str').describe()

count         35717
unique        35717
top       263810047
freq              1
Name: id, dtype: object

# Retrieving comments under AskReddit question posts with "fact" in the title

In [202]:
import requests
import concurrent.futures
from typing import List

# Worker threads per download, i.e. the number of HTTP requests in flight at once.
CONNECTIONS = 5
# Character budget for the comma-separated id list of one request
# (presumably a 2048-character URL limit less 100 characters of headroom).
MAX_QUERY_STRING_SIZE = 2048 - 100
# Base36 comment ids such as "czqq12m" are 7 characters long, and every id
# also costs the "," that joins it to the next one.
COMMENT_ID_CHARS = 7
COMMENTS_PER_REQUEST = MAX_QUERY_STRING_SIZE // (COMMENT_ID_CHARS + 1)

def get_comment_ids(submission_ids):
    """Fetch the comment ids of each submission from the Pushshift API.

    One request per submission is issued, with up to ``CONNECTIONS`` requests
    running concurrently. This is best-effort: a submission whose request
    fails is reported on stdout and left out of the result.

    Args:
        submission_ids: iterable of numeric submission ids (anything ``int()``
            accepts).

    Returns:
        dict mapping each submission id (as ``int``) to the ``data`` entry of
        its JSON response.
    """
    def job(submission_id: int):
        base_36_encoded = np.base_repr(submission_id, 36).lower()
        resp = requests.get('https://api.pushshift.io/reddit/submission/comment_ids/{}'.format(base_36_encoded))
        # Report HTTP failures (rate limiting, server errors) as such rather
        # than as an opaque JSON decoding error on the error page body.
        resp.raise_for_status()
        return resp.json()['data']

    comment_ids = {}
    start_time = time.time()
    with concurrent.futures.ThreadPoolExecutor(max_workers=CONNECTIONS) as executor:
        # Remember which submission each future belongs to, so that a failure
        # can be attributed to it.
        future_to_submission = {}
        for raw_id in submission_ids:
            submission_id = int(raw_id)
            future_to_submission[executor.submit(job, submission_id)] = submission_id
        for future in concurrent.futures.as_completed(future_to_submission):
            submission_id = future_to_submission[future]
            try:
                comment_ids[submission_id] = future.result()
            except Exception as e:
                print("submission {}: {}".format(submission_id, e))

    end_time = time.time()
    print("Retrieved {} results in {:.4f}s".format(len(comment_ids), end_time - start_time))
    return comment_ids

def get_comments_by_ids(comment_ids):
    """Download the full comment records for ``comment_ids`` from Pushshift.

    The ids are split into batches of ``COMMENTS_PER_REQUEST``; each batch is
    one request, with up to ``CONNECTIONS`` requests running concurrently.
    This is best-effort: a failed batch is reported on stdout and skipped.

    Args:
        comment_ids: sequence of comment id strings (must support ``len`` and
            slicing).

    Returns:
        list with the ``data`` entries of all successful responses, in the
        order the requests completed.
    """
    def job(comment_ids_batch: List[str]):
        query_param = ",".join(comment_ids_batch)
        resp = requests.get('https://api.pushshift.io/reddit/comment/search?ids={}'.format(query_param))
        # Report HTTP failures (rate limiting, server errors) as such rather
        # than as an opaque JSON decoding error on the error page body.
        resp.raise_for_status()
        return resp.json()['data']

    comments = []
    start_time = time.time()
    with concurrent.futures.ThreadPoolExecutor(max_workers=CONNECTIONS) as executor:
        # Remember the offset of each batch so that a failure can name the ids
        # that were lost.
        future_to_offset = {
            executor.submit(job, comment_ids[offset:offset + COMMENTS_PER_REQUEST]): offset
            for offset in range(0, len(comment_ids), COMMENTS_PER_REQUEST)
        }
        for future in concurrent.futures.as_completed(future_to_offset):
            try:
                comments.extend(future.result())
            except Exception as e:
                offset = future_to_offset[future]
                lost = len(comment_ids[offset:offset + COMMENTS_PER_REQUEST])
                print("batch of {} comment ids at offset {}: {}".format(lost, offset, e))
    end_time = time.time()
    print("Retrieved {} results in {:.4f}s".format(len(comments), end_time - start_time))
    return comments

In [153]:
def get_fun_fact_question_posts(total_size=-1):
    """Stream AskReddit submissions with "fact" in the title and >1000 comments.

    Args:
        total_size: maximum number of hits to yield; ``-1`` means no cap.

    Returns:
        Whatever ``get_all_results_helper`` returns (an iterable of raw hits).
    """
    query_string = "(subreddit:askreddit) AND (title:fact) AND num_comments:>1000"
    search_body = {"query": {"query_string": {"query": query_string}}}
    return get_all_results_helper(es, SUBMISSIONS_INDEX, search_body, total_size)

In [154]:
# Fetch every matching question post (no cap) and flatten the hits into a DataFrame.
results = get_fun_fact_question_posts()
fun_fact_questions = ingest_into_pandas_and_normalize(results)

Scroll request has only succeeded on 25 shards out of 40.


ingested 463 results
retained 463 results after dropping duplicates
took 0.6401s


In [197]:
# Look up the comment ids of every question post (one HTTP request per post);
# posts whose request fails are missing from the resulting dict.
fun_fact_comment_ids = get_comment_ids(fun_fact_questions['id'])

Expecting value: line 1 column 1 (char 0)
Expecting value: line 1 column 1 (char 0)
Expecting value: line 1 column 1 (char 0)
Expecting value: line 1 column 1 (char 0)
Expecting value: line 1 column 1 (char 0)
Expecting value: line 1 column 1 (char 0)
Expecting value: line 1 column 1 (char 0)
Expecting value: line 1 column 1 (char 0)
Expecting value: line 1 column 1 (char 0)
Expecting value: line 1 column 1 (char 0)
Expecting value: line 1 column 1 (char 0)
Expecting value: line 1 column 1 (char 0)
Expecting value: line 1 column 1 (char 0)
Expecting value: line 1 column 1 (char 0)
Expecting value: line 1 column 1 (char 0)
Expecting value: line 1 column 1 (char 0)
Expecting value: line 1 column 1 (char 0)
Expecting value: line 1 column 1 (char 0)
Retrieved 445 results in 712.3955s


In [206]:
# Download the full comment records, keeping them grouped by submission id.
fun_fact_comments = {k: get_comments_by_ids(v) for k, v in fun_fact_comment_ids.items()}

Retrieved 1735 results in 3.9778s
Retrieved 1358 results in 3.5970s
Retrieved 6355 results in 11.0501s
Retrieved 1533 results in 4.1159s
Retrieved 12908 results in 21.0720s
Retrieved 8226 results in 16.2062s
Retrieved 22099 results in 40.2565s
Retrieved 18244 results in 29.6475s
Retrieved 1452 results in 3.4914s
Retrieved 2399 results in 5.0097s
Retrieved 6854 results in 12.6083s
Retrieved 2356 results in 5.8037s
Retrieved 17955 results in 40.6719s
Retrieved 9583 results in 16.0760s
Retrieved 1117 results in 3.0744s
Retrieved 2032 results in 4.8073s
Retrieved 9377 results in 15.7844s
Retrieved 6913 results in 12.8892s
Retrieved 1378 results in 3.7013s
Retrieved 2785 results in 7.1647s
Retrieved 1977 results in 4.2950s
Retrieved 10691 results in 18.0326s
Retrieved 2849 results in 6.3339s
Retrieved 2280 results in 4.7807s
Retrieved 2405 results in 5.1497s
Retrieved 20224 results in 33.4736s
Retrieved 1659 results in 3.7801s
Retrieved 1265 results in 3.3140s
Retrieved 8082 results in 13.7

Retrieved 1054 results in 2.9647s
Retrieved 3027 results in 5.8890s
Retrieved 9889 results in 16.5849s
Retrieved 2342 results in 4.8723s
Retrieved 5423 results in 10.9987s
Retrieved 1665 results in 3.7845s
Retrieved 2600 results in 5.2428s
Retrieved 2184 results in 4.5215s
Retrieved 3987 results in 7.4288s
Retrieved 16610 results in 26.8579s
Retrieved 1694 results in 3.9817s
Retrieved 24594 results in 39.1354s
Retrieved 1689 results in 3.9759s
Retrieved 12394 results in 20.3188s
Retrieved 1560 results in 3.4390s
Retrieved 1329 results in 3.4717s
Retrieved 27077 results in 43.2222s
Retrieved 4642 results in 8.4976s
Retrieved 2049 results in 4.4042s
Retrieved 2451 results in 5.0162s
Retrieved 2267 results in 4.6338s
Retrieved 1724 results in 3.8644s
Retrieved 3223 results in 6.1579s
Retrieved 2157 results in 4.5219s
Retrieved 1061 results in 3.1670s
Retrieved 5802 results in 10.0706s
Retrieved 1764 results in 4.0287s
Retrieved 1381 results in 3.4860s
Retrieved 2885 results in 5.6443s
Ret

In [244]:
# Keep only root comments, i.e. those whose parent is the submission itself.
# The dict is keyed by the numeric submission id, while `parent_id` carries
# the base36 id behind a type prefix (e.g. "t3_44jwbu"), so the key has to be
# base36-encoded before the two can be compared.
fun_fact_root_comments = {}
for submission_id, submission_comments in fun_fact_comments.items():
    base36_id = np.base_repr(submission_id, 36).lower()
    fun_fact_root_comments[submission_id] = [
        c for c in submission_comments
        if c['parent_id'].split('_')[-1] == base36_id
    ]

print("{} fun facts at the root comments".format(sum(len(v) for v in fun_fact_root_comments.values())))

390889 fun facts at the root comments


In [245]:
# One frame per submission. Submissions without root comments are skipped:
# their frame would have no columns, and join='inner' keeps only the columns
# common to all frames, so a single empty one would discard every column.
root_comment_frames = [pd.DataFrame(c) for c in fun_fact_root_comments.values() if c]
fun_fact_comments_dfn = pd.concat(root_comment_frames, join='inner')
# Drop comments whose text is no longer available.
fun_fact_comments_dfn = fun_fact_comments_dfn[~fun_fact_comments_dfn["body"].isin(["[removed]", "[deleted]"])]

In [267]:
# Persist the root comments; header=True writes the column names as the first row.
fun_fact_comments_dfn.to_csv('fun_fact_comments.csv', header=True, index=False)

In [268]:
# Display the filtered root comments for a visual spot check.
fun_fact_comments_dfn

Unnamed: 0,author,author_flair_css_class,author_flair_text,body,created_utc,id,link_id,parent_id,subreddit,subreddit_id
0,Dursie,,,We are animals too,1454817825,czqq12m,t3_44jwbu,t3_44jwbu,AskReddit,t5_2qh1i
2,ProximaCentauri3,,,That girls do not pee out of their vagina. Sometimes it's not even common knowledge among girls!,1454817891,czqq279,t3_44jwbu,t3_44jwbu,AskReddit,t5_2qh1i
3,panzerkampfwagen,,,Just because you want something to be true doesn't mean it is. There is no such thing as person...,1454817905,czqq2gj,t3_44jwbu,t3_44jwbu,AskReddit,t5_2qh1i
5,iamonlyoneman,,,"There is a God, and he has some expectations you're *badly* failing to meet\n\n...but believing ...",1454817962,czqq3ha,t3_44jwbu,t3_44jwbu,AskReddit,t5_2qh1i
6,LemmonSmith,,,"If you put the lid down in a porta-potty, the smell will vent out that tube in the top and not i...",1454817971,czqq3n6,t3_44jwbu,t3_44jwbu,AskReddit,t5_2qh1i
7,Dismalhead,,,Fox News isn't a legitimate news organization.,1454818000,czqq45l,t3_44jwbu,t3_44jwbu,AskReddit,t5_2qh1i
8,PM_ME_UR_DIRTSTAR,,,Tracy Chapman is a woman,1454818073,czqq5hz,t3_44jwbu,t3_44jwbu,AskReddit,t5_2qh1i
9,ROPISUS,,,God doesn't exist\n,1454818124,czqq6eb,t3_44jwbu,t3_44jwbu,AskReddit,t5_2qh1i
10,RaatKaRaja,,,The sites you see when you google for something are different from what I see if I google for th...,1454818140,czqq6or,t3_44jwbu,t3_44jwbu,AskReddit,t5_2qh1i
11,raiden_the_conquerer,,,"Kangaroos can't hop backwards. Next time you're in an altercation with one, get behind it.",1454818149,czqq6uf,t3_44jwbu,t3_44jwbu,AskReddit,t5_2qh1i
