In [1]:
import sys

# Log the interpreter build so the saved outputs can be tied to a Python version.
print(sys.version, file=sys.stdout)

3.7.3 (default, Mar 27 2019, 09:23:15) 
[Clang 10.0.1 (clang-1001.0.46.3)]


In [2]:
import time
import pandas as pd
from pandas.io.json import json_normalize
import itertools
import matplotlib.pyplot as plt
from elasticsearch import Elasticsearch
from elasticsearch.helpers import scan

# Connection settings for the Pushshift Elasticsearch endpoint.
HOST, PORT = "elastic.pushshift.io", 80
TIMEOUT = 1000  # handed to the client as its `timeout` setting

# Single client instance shared by every query in this notebook.
es = Elasticsearch(hosts=[dict(host=HOST, port=PORT)], timeout=TIMEOUT)

# Last expression of the cell: shows the cluster metadata reported by the server.
es.info()

{'name': 'node-2',
 'cluster_name': 'pushshift',
 'cluster_uuid': '2O6kg4ywRIGfUnIVFA078A',
 'version': {'number': '5.6.3',
  'build_hash': '1a2f265',
  'build_date': '2017-10-06T20:33:39.012Z',
  'build_snapshot': False,
  'lucene_version': '6.6.1'},
 'tagline': 'You Know, for Search'}

In [3]:
# Index path used for every search issued by this notebook.
SUBMISSIONS_INDEX = "rs/submissions"
# Number of hits requested per scroll batch (passed as `size` to `scan`).
# Earlier setting, described as the maximum number of entries that can be
# retrieved in a single call:
# BATCH_SIZE = 10000
# NOTE(review): 25000 exceeds that stated maximum -- confirm the server honours it.
BATCH_SIZE = 25000

def get_all_results_helper(es, index, query_body, total_size=-1):
    """Scroll through the hits matching ``query_body`` and return them as a list.

    Args:
        es: Elasticsearch client handed to ``scan``.
        index: Index to search. (Previously this argument was ignored and the
            module-level ``SUBMISSIONS_INDEX`` was always used.)
        query_body: Search body passed to ``scan`` as ``query``.
        total_size: Maximum number of hits to return. ``-1`` (the default),
            ``None`` or any other negative value means "no limit".

    Returns:
        A list of the raw hits yielded by ``scan``, in the order they were
        yielded. The cap is applied to the raw stream, i.e. before any
        de-duplication done by the caller.
    """
    # The PushShift Elasticsearch backend has duplicate entries across shards,
    # so the list returned here can contain the same `_id` more than once;
    # callers are expected to de-duplicate. A manual search+scroll pinned to a
    # single shard is the alternative approach.
    hits = scan(
        es,
        index=index,
        query=query_body,
        size=BATCH_SIZE,
        scroll='1m',
        clear_scroll=False,
        # Partial shard failures are tolerated instead of aborting the scroll.
        raise_on_error=False,
    )
    if total_size is not None and total_size >= 0:
        hits = itertools.islice(hits, total_size)
    return list(hits)

#     def extract_hits(batch):
#         return batch["hits"]["hits"]
#     results = []
#     batch = es.search(index=SUBMISSIONS_INDEX, body=query_body, size=BATCH_SIZE, scroll="1m", preference="_local")
#     scroll_id = batch["_scroll_id"]
#     results.extend(extract_hits(batch))
#     while extract_hits(batch) and (total_size == -1 or len(results) < total_size):
#         batch = es.scroll(scroll_id, scroll="1m")
#         results.extend(extract_hits(batch))
#     return results if total_size == -1 else results[:total_size]

def get_fun_fact_in_title(total_size=-1):
    """Run the ``(title:"fun fact: ")`` query_string search on the submissions index.

    ``total_size`` is forwarded unchanged to ``get_all_results_helper``
    (default ``-1``); the helper's result is returned as is.
    """
    title_clause = "(title:\"fun fact: \")"
    search_body = {}
    search_body["query"] = {"query_string": {"query": title_clause}}
    # Hits are requested in `_doc` order.
    search_body["sort"] = ["_doc"]
    return get_all_results_helper(es, SUBMISSIONS_INDEX, search_body, total_size)

def get_til_in_title(total_size=-1):
    """Run the ``(title:"til: ")`` query_string search on the submissions index.

    ``total_size`` is forwarded unchanged to ``get_all_results_helper``
    (default ``-1``); the helper's result is returned as is.
    """
    title_clause = "(title:\"til: \")"
    search_body = {}
    search_body["query"] = {"query_string": {"query": title_clause}}
    # Hits are requested in `_doc` order; an alternative ordering by
    # descending score was tried and is left disabled:
    #     "sort": {"score": {"order": "desc"}}
    search_body["sort"] = ["_doc"]
    return get_all_results_helper(es, SUBMISSIONS_INDEX, search_body, total_size)

In [4]:
# Wall-clock timing of the uncapped "fun fact" retrieval.
start = time.time()
results = get_fun_fact_in_title(total_size=-1)  # -1 is the default: no cap
end = time.time()
print(
    "Retrieved {} results in {:.4f}s".format(
        len(results),
        end - start,
    )
)

Scroll request has only succeeded on 36 shards out of 40.


Retrieved 81624 results in 24.4393s


In [5]:
# One row per raw hit, keeping only the first occurrence of each `_id`
# (the scroll returns the same document more than once).
# Chained instead of `inplace=True` so the cell can be re-run safely.
df = pd.DataFrame(results).drop_duplicates(subset="_id")
# Flatten the `_source` dicts into one column per field.
dfn = json_normalize(df["_source"])
# `header` takes a bool or a list of column aliases; the previous value
# 'column_names' was only honoured because a non-empty string is truthy.
dfn.to_csv('fun_fact_title.csv', header=True)

In [6]:
# Peek at the raw `_source` payload of the row labelled 0.
df.loc[0, "_source"]

{'thumbnail': 'default',
 'mod_reports': [],
 'author': '[deleted]',
 'over_18': False,
 'user_reports': [],
 'title': 'Fun fact about the Vatican',
 'report_reasons': None,
 'subreddit': 'funny',
 'subreddit_id': 4594431,
 'url': 'http://imgur.com/tJJEovL',
 'num_comments': 0,
 'score': 0,
 'stickied': False,
 'domain': 'imgur.com',
 'retrieved_on': 1413178593,
 'id': 72184347,
 'created_utc': 1358750166,
 'permalink': '/r/funny/comments/16z5sr/fun_fact_about_the_vatican/',
 'is_self': False}

In [7]:
# Sanity check after de-duplication: `count` and `unique` should match.
df.loc[:, "_id"].describe()

count         39156
unique        39156
top       245120152
freq              1
Name: _id, dtype: object

In [8]:
# Wall-clock timing of the TIL retrieval, capped at 1.5 million raw hits.
start = time.time()
results = get_til_in_title(total_size=1_500_000)
end = time.time()
print(
    "Retrieved {} results in {:.4f}s".format(
        len(results),
        end - start,
    )
)

Scroll request has only succeeded on 36 shards out of 40.


Retrieved 1500000 results in 231.0791s


In [9]:
# One row per raw hit, keeping only the first occurrence of each `_id`
# (the scroll returns the same document more than once).
# Chained instead of `inplace=True` so the cell can be re-run safely.
df = pd.DataFrame(results).drop_duplicates(subset="_id")
# Flatten the `_source` dicts into one column per field.
dfn = json_normalize(df["_source"])
# `header` takes a bool or a list of column aliases; the previous value
# 'column_names' was only honoured because a non-empty string is truthy.
dfn.to_csv('til_title.csv', header=True)

In [10]:
# Peek at the raw `_source` payload of the row labelled 0.
df.loc[0, "_source"]

{'thumbnail': 'default',
 'author': 'BannedINDC',
 'over_18': False,
 'title': 'TIL Corey Feldman already made a movie about killing Bin Laden',
 'subreddit': 'todayilearned',
 'subreddit_id': 4606680,
 'url': 'http://www.avclub.com/articles/never-mind-corey-feldman-already-made-a-movie-abou,55383/',
 'num_comments': 0,
 'score': 1,
 'domain': 'avclub.com',
 'id': 28674356,
 'created_utc': 1304372001,
 'permalink': '/r/todayilearned/comments/h2l9w/til_corey_feldman_already_made_a_movie_about/',
 'is_self': False}

In [11]:
# Sanity check after de-duplication: `count` and `unique` should match.
df.loc[:, "_id"].describe()

count       325000
unique      325000
top       32646334
freq             1
Name: _id, dtype: object

In [17]:
# Show only the columns that have no missing values.
# NOTE(review): the execution counter jumps from 11 to 17 here, so `dfn` may
# have been rebuilt by cells that are no longer in the notebook -- confirm
# this cell still works after Restart & Run All.
dfn.dropna(axis=1)

Unnamed: 0,created_utc,domain,id,is_self,num_comments,over_18,permalink,score,thumbnail,title,url
0,1304372001,avclub.com,28674356,False,0,False,/r/todayilearned/comments/h2l9w/til_corey_feld...,1,default,TIL Corey Feldman already made a movie about k...,http://www.avclub.com/articles/never-mind-core...
1,1286852572,self.til,23048532,True,1,False,/r/til/comments/dq0d0/til_the_new_oreo_fudge_c...,2,self,TIL The new Oreo Fudge Creme (Mint) are just G...,http://www.reddit.com/r/til/comments/dq0d0/til...
2,1431128415,reddit.com,190376712,False,0,False,/r/unremovable/comments/35cfm0/til_in_2008_whe...,1,http://b.thumbs.redditmedia.com/SEbywoyHs0nRfH...,"TIL in 2008, when South Korea's Health Ministr...",http://www.reddit.com/r/todayilearned/comments...
3,1358766137,troll-pictures.com,72196524,False,5,False,/r/todayilearned/comments/16zf70/til_that_tom_...,0,default,TIL that Tom Cruise has a tooth in the middle ...,http://troll-pictures.com/you-will-never-unsee...
4,1297665165,self.todayilearned,26172889,True,1,False,/r/todayilearned/comments/fkz4p/til_why_i_woul...,0,default,TIL why I wouldn't want to be shot by a gun......,http://www.reddit.com/r/todayilearned/comments...
5,1420555154,reddit.com,167150347,False,0,False,/r/RisingThreads/comments/2rim17/rtodayilearne...,4,http://b.thumbs.redditmedia.com/4_kVUn7EPe_PX8...,r/todayilearned: TIL only one white man has ev...,http://www.reddit.com/r/todayilearned/comments...
6,1358766001,fosna-folket.no,72196446,False,4,False,/r/norge/comments/16zf4u/mann_fikk_svart_skjer...,4,http://b.thumbs.redditmedia.com/Szoa9D7Y_euppC...,Mann fikk svart skjerm etter at en ulovlig por...,http://www.fosna-folket.no/nyheter/article6940...
7,1358004499,lafenty.hubpages.com,71281587,False,0,False,/r/todayilearned/comments/16ft83/til_santa_has...,1,default,TIL Santa has an evil brother who takes presen...,http://lafenty.hubpages.com/hub/KrampusandKnec...
8,1286851810,sports.yahoo.com,23048300,False,0,False,/r/todayilearned/comments/dq06k/til_that_some_...,3,http://thumbs.reddit.com/t3_dq06k.png,TIL that some Tampa Bay Rays players spend hun...,http://sports.yahoo.com/mlb/news?slug=jp-raysw...
9,1338511781,armorgames.com,51079820,False,0,False,/r/todayilearned/comments/uetfw/til_platform_g...,0,default,TIL platform games can be pretty cool.,http://armorgames.com/play/13357/sequester


Unnamed: 0,adserver_click_url,adserver_imp_pixel,approved_at_utc,author,author_cakeday,author_flair_css_class,author_flair_text,author_id,banned_at_utc,brand_safe,...,third_party_tracking,third_party_tracking_2,thumbnail,thumbnail_height,thumbnail_width,title,url,user_reports,view_count,whitelist_status
0,,,,BannedINDC,,,,,,,...,,,default,,,TIL Corey Feldman already made a movie about k...,http://www.avclub.com/articles/never-mind-core...,,,
1,,,,illus1on,,,,,,,...,,,self,,,TIL The new Oreo Fudge Creme (Mint) are just G...,http://www.reddit.com/r/til/comments/dq0d0/til...,,,
2,,,,unremovable,,,,,,,...,,,http://b.thumbs.redditmedia.com/SEbywoyHs0nRfH...,,,"TIL in 2008, when South Korea's Health Ministr...",http://www.reddit.com/r/todayilearned/comments...,,,
3,,,,XilentAssassin,,,,,,,...,,,default,,,TIL that Tom Cruise has a tooth in the middle ...,http://troll-pictures.com/you-will-never-unsee...,[],,
4,,,,[deleted],,,,,,,...,,,default,,,TIL why I wouldn't want to be shot by a gun......,http://www.reddit.com/r/todayilearned/comments...,,,
5,,,,rising_threads_bot,,,,,,,...,,,http://b.thumbs.redditmedia.com/4_kVUn7EPe_PX8...,,,r/todayilearned: TIL only one white man has ev...,http://www.reddit.com/r/todayilearned/comments...,,,
6,,,,warrantyvoid,,,,,,,...,,,http://b.thumbs.redditmedia.com/Szoa9D7Y_euppC...,,,Mann fikk svart skjerm etter at en ulovlig por...,http://www.fosna-folket.no/nyheter/article6940...,[],,
7,,,,TrueWarrior,,,,,,,...,,,default,,,TIL Santa has an evil brother who takes presen...,http://lafenty.hubpages.com/hub/KrampusandKnec...,[],,
8,,,,noviestar,,,,,,,...,,,http://thumbs.reddit.com/t3_dq06k.png,,,TIL that some Tampa Bay Rays players spend hun...,http://sports.yahoo.com/mlb/news?slug=jp-raysw...,,,
9,,,,double-mobius,,,,,,,...,,,default,,,TIL platform games can be pretty cool.,http://armorgames.com/play/13357/sequester,,,
