In [1]:
import sys

# Record which interpreter produced the outputs saved in this notebook.
interpreter_version = sys.version
print(interpreter_version)

3.7.3 (default, Mar 27 2019, 09:23:15) 
[Clang 10.0.1 (clang-1001.0.46.3)]


In [2]:
import time
import pandas as pd
from pandas.io.json import json_normalize
import itertools
import matplotlib.pyplot as plt
from elasticsearch import Elasticsearch
from elasticsearch.helpers import scan

# Connection settings for the Pushshift Elasticsearch endpoint.
HOST = "elastic.pushshift.io"
PORT = 80
# Forwarded to the client as `timeout` (presumably seconds -- confirm against
# the elasticsearch-py documentation).
TIMEOUT = 1000

ES_NODE = {'host': HOST, 'port': PORT}
es = Elasticsearch(hosts=[ES_NODE], timeout=TIMEOUT)

# Connectivity check: displays the cluster name and version information.
es.info()

{'name': 'node-2',
 'cluster_name': 'pushshift',
 'cluster_uuid': '2O6kg4ywRIGfUnIVFA078A',
 'version': {'number': '5.6.3',
  'build_hash': '1a2f265',
  'build_date': '2017-10-06T20:33:39.012Z',
  'build_snapshot': False,
  'lucene_version': '6.6.1'},
 'tagline': 'You Know, for Search'}

In [18]:
# Index path queried for Reddit submissions.
SUBMISSIONS_INDEX = "rs/submissions"

# Number of hits requested per scroll batch.
# Earlier setting: BATCH_SIZE = 10000 (noted as the maximum number of entries
# that can be retrieved in a single call).
BATCH_SIZE = 25_000

def get_all_results_helper(es, index, query_body, total_size=-1):
    """Scroll through the hits matching ``query_body`` and return them as a list.

    Args:
        es: Elasticsearch client handed to ``elasticsearch.helpers.scan``.
        index: Index (or ``index/type`` path) to search.
        query_body: Request body forwarded to ``scan`` as ``query``.
        total_size: Maximum number of hits to return. ``-1`` (the default)
            or ``None`` means "no cap".

    Returns:
        list: The hits in the order the scroll yields them.

    Note:
        The PushShift shards appear to hold duplicate entries, so the
        returned hits may repeat an ``_id``; callers should de-duplicate.
        A manual search+scroll alternative is kept (commented out) below
        this function.
    """
    # Each scroll context is kept for one minute and is not cleared explicitly;
    # scroll errors are not raised (raise_on_error=False).
    hits = scan(
        es,
        index=index,  # honour the caller's index instead of the module global
        query=query_body,
        size=BATCH_SIZE,
        scroll='1m',
        clear_scroll=False,
        raise_on_error=False,
    )
    if total_size is not None and total_size != -1:
        # islice stops pulling batches once enough hits were collected.
        hits = itertools.islice(hits, total_size)
    return list(hits)

#     def extract_hits(batch):
#         return batch["hits"]["hits"]
#     results = []
#     batch = es.search(index=SUBMISSIONS_INDEX, body=query_body, size=BATCH_SIZE, scroll="1m", preference="_local")
#     scroll_id = batch["_scroll_id"]
#     results.extend(extract_hits(batch))
#     while extract_hits(batch) and (total_size == -1 or len(results) < total_size):
#         batch = es.scroll(scroll_id, scroll="1m")
#         results.extend(extract_hits(batch))
#     return results if total_size == -1 else results[:total_size]

def _title_phrase_query(phrase):
    """Build the search body for a quoted-phrase ``query_string`` on ``title``."""
    return {
        "query": {"query_string": {"query": '(title:"{}")'.format(phrase)}},
        # Sorted by "_doc". A score-descending sort
        # ({"score": {"order": "desc"}}) was tried for the TIL query and
        # left disabled.
        "sort": ["_doc"],
    }


def _get_phrase_in_title(phrase, total_size):
    """Fetch submissions matching ``phrase`` in the title from SUBMISSIONS_INDEX."""
    return get_all_results_helper(
        es, SUBMISSIONS_INDEX, _title_phrase_query(phrase), total_size
    )


def get_fun_fact_in_title(total_size=-1):
    """Return submissions whose title matches the phrase "fun fact: ".

    Args:
        total_size: Maximum number of hits to return; -1 means no cap.
    """
    return _get_phrase_in_title("fun fact: ", total_size)


def get_til_in_title(total_size=-1):
    """Return submissions whose title matches the phrase "til: ".

    Args:
        total_size: Maximum number of hits to return; -1 means no cap.
    """
    return _get_phrase_in_title("til: ", total_size)


def get_ysk_in_title(total_size=-1):
    """Return submissions whose title matches the phrase "ysk: ".

    Args:
        total_size: Maximum number of hits to return; -1 means no cap.
    """
    return _get_phrase_in_title("ysk: ", total_size)

In [4]:
# Fetch up to 1,500,000 "fun fact" submissions and report the elapsed time.
# perf_counter() is a monotonic clock, so the measurement is not skewed by
# wall-clock adjustments during the download.
start = time.perf_counter()
results = get_fun_fact_in_title(1500000)
end = time.perf_counter()
print("Retrieved {} results in {:.4f}s".format(len(results), end - start))

Scroll request has only succeeded on 36 shards out of 40.


Retrieved 81624 results in 24.4393s


In [5]:
# One row per hit; keep the first row for each Elasticsearch `_id`, since the
# scroll can return the same document more than once.
df = pd.DataFrame(results).drop_duplicates("_id")
# Flatten each hit's `_source` dict into columns.
dfn = json_normalize(df["_source"])
# `header` expects a bool (or a list of column aliases); True writes the
# column names as the first row.
dfn.to_csv('fun_fact_title.csv', header=True)

In [6]:
# Peek at the `_source` payload of the row labelled 0.
df.loc[0, "_source"]

{'thumbnail': 'default',
 'mod_reports': [],
 'author': '[deleted]',
 'over_18': False,
 'user_reports': [],
 'title': 'Fun fact about the Vatican',
 'report_reasons': None,
 'subreddit': 'funny',
 'subreddit_id': 4594431,
 'url': 'http://imgur.com/tJJEovL',
 'num_comments': 0,
 'score': 0,
 'stickied': False,
 'domain': 'imgur.com',
 'retrieved_on': 1413178593,
 'id': 72184347,
 'created_utc': 1358750166,
 'permalink': '/r/funny/comments/16z5sr/fun_fact_about_the_vatican/',
 'is_self': False}

In [7]:
# Sanity check: after de-duplication `count` and `unique` should agree.
id_summary = df["_id"].describe()
id_summary

count         39156
unique        39156
top       245120152
freq              1
Name: _id, dtype: object

In [8]:
# Fetch up to 1,500,000 "til" submissions and report the elapsed time.
# perf_counter() is a monotonic clock, so the measurement is not skewed by
# wall-clock adjustments during the download.
start = time.perf_counter()
results = get_til_in_title(1500000)
end = time.perf_counter()
print("Retrieved {} results in {:.4f}s".format(len(results), end - start))

Scroll request has only succeeded on 36 shards out of 40.


Retrieved 1500000 results in 231.0791s


In [9]:
# One row per hit; keep the first row for each Elasticsearch `_id`, since the
# scroll can return the same document more than once.
df = pd.DataFrame(results).drop_duplicates("_id")
# Flatten each hit's `_source` dict into columns.
dfn = json_normalize(df["_source"])
# `header` expects a bool (or a list of column aliases); True writes the
# column names as the first row.
dfn.to_csv('til_title.csv', header=True)

In [10]:
# Peek at the `_source` payload of the row labelled 0.
df.loc[0, "_source"]

{'thumbnail': 'default',
 'author': 'BannedINDC',
 'over_18': False,
 'title': 'TIL Corey Feldman already made a movie about killing Bin Laden',
 'subreddit': 'todayilearned',
 'subreddit_id': 4606680,
 'url': 'http://www.avclub.com/articles/never-mind-corey-feldman-already-made-a-movie-abou,55383/',
 'num_comments': 0,
 'score': 1,
 'domain': 'avclub.com',
 'id': 28674356,
 'created_utc': 1304372001,
 'permalink': '/r/todayilearned/comments/h2l9w/til_corey_feldman_already_made_a_movie_about/',
 'is_self': False}

In [11]:
# Sanity check: after de-duplication `count` and `unique` should agree.
id_summary = df["_id"].describe()
id_summary

count       325000
unique      325000
top       32646334
freq             1
Name: _id, dtype: object

In [19]:
# Fetch up to 1,500,000 "ysk" submissions and report the elapsed time.
# perf_counter() is a monotonic clock, so the measurement is not skewed by
# wall-clock adjustments during the download.
start = time.perf_counter()
results = get_ysk_in_title(1500000)
end = time.perf_counter()
print("Retrieved {} results in {:.4f}s".format(len(results), end - start))

Scroll request has only succeeded on 36 shards out of 40.


Retrieved 119148 results in 40.9507s


In [20]:
# One row per hit; keep the first row for each Elasticsearch `_id`, since the
# scroll can return the same document more than once.
df = pd.DataFrame(results).drop_duplicates("_id")
# Flatten each hit's `_source` dict into columns.
dfn = json_normalize(df["_source"])
# `header` expects a bool (or a list of column aliases); True writes the
# column names as the first row.
dfn.to_csv('ysk_title.csv', header=True)

In [21]:
# Peek at the `_source` payload of the row labelled 0.
df.loc[0, "_source"]

{'thumbnail': 'default',
 'mod_reports': [],
 'author': '[deleted]',
 'over_18': False,
 'user_reports': [],
 'title': 'YSK Useful Tips For A Rainy Day',
 'report_reasons': None,
 'subreddit': 'YouShouldKnow',
 'subreddit_id': 4630776,
 'url': 'http://molempire.com/2011/05/18/useful-tips-for-a-rainy-day/',
 'num_comments': 2,
 'score': 0,
 'stickied': False,
 'domain': 'molempire.com',
 'retrieved_on': 1413178213,
 'id': 72196576,
 'created_utc': 1358766233,
 'permalink': '/r/YouShouldKnow/comments/16zf8g/ysk_useful_tips_for_a_rainy_day/',
 'is_self': False}

In [22]:
# Sanity check: after de-duplication `count` and `unique` should agree.
id_summary = df["_id"].describe()
id_summary

count         48537
unique        48537
top       472591046
freq              1
Name: _id, dtype: object