# Download Data from Reddit

#### Load Libraries

In [1]:
import pandas as pd
import datetime as dt
import time
import requests


pd.set_option('display.max_columns', None)

In [44]:
def query_pushshift(subreddit, kind = 'submission', day_window = 7, n = 520):
    SUBFIELDS = ['title', 'selftext', 'subreddit', 'created_utc', 'author', 'num_comments', 'score', 'is_self']
    
    # establish base url and stem
    BASE_URL = f"https://api.pushshift.io/reddit/search/{kind}" # also known as the "API endpoint" 
    stem = f"{BASE_URL}?subreddit={subreddit}&size=500" # always pulling max of 500
    
    # instantiate empty list for temp storage
    posts = []
    
    # implement for loop with `time.sleep(2)`
    for i in range(1, n + 1):
        URL = "{}&after={}d".format(stem, day_window * i)
        print("Querying from: " + URL)
        response = requests.get(URL)
        assert response.status_code == 200
        mine = response.json()['data']
        df = pd.DataFrame.from_dict(mine)
        posts.append(df)
        time.sleep(2)
    
    # pd.concat storage list
    full = pd.concat(posts, sort=False)
    
    # if submission
    if kind == "submission":
        # select desired columns
        full = full[SUBFIELDS]
        # drop duplicates
        full.drop_duplicates(inplace = True)
        # select `is_self` == True
        full = full.loc[full['is_self'] == True]

    # create `timestamp` column
    full['timestamp'] = full["created_utc"].map(dt.date.fromtimestamp)
    
    print("Query Complete!")    
    return full 

In [45]:
movies = query_pushshift("movies")

Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=movies&size=500&after=7d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=movies&size=500&after=14d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=movies&size=500&after=21d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=movies&size=500&after=28d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=movies&size=500&after=35d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=movies&size=500&after=42d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=movies&size=500&after=49d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=movies&size=500&after=56d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=movies&size=500&after=63d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=movies&size=500&a

Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=movies&size=500&after=574d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=movies&size=500&after=581d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=movies&size=500&after=588d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=movies&size=500&after=595d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=movies&size=500&after=602d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=movies&size=500&after=609d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=movies&size=500&after=616d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=movies&size=500&after=623d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=movies&size=500&after=630d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=movies&

Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=movies&size=500&after=1141d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=movies&size=500&after=1148d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=movies&size=500&after=1155d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=movies&size=500&after=1162d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=movies&size=500&after=1169d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=movies&size=500&after=1176d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=movies&size=500&after=1183d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=movies&size=500&after=1190d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=movies&size=500&after=1197d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddi

Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=movies&size=500&after=1701d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=movies&size=500&after=1708d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=movies&size=500&after=1715d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=movies&size=500&after=1722d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=movies&size=500&after=1729d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=movies&size=500&after=1736d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=movies&size=500&after=1743d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=movies&size=500&after=1750d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=movies&size=500&after=1757d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddi

Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=movies&size=500&after=2261d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=movies&size=500&after=2268d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=movies&size=500&after=2275d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=movies&size=500&after=2282d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=movies&size=500&after=2289d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=movies&size=500&after=2296d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=movies&size=500&after=2303d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=movies&size=500&after=2310d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=movies&size=500&after=2317d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddi

Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=movies&size=500&after=2821d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=movies&size=500&after=2828d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=movies&size=500&after=2835d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=movies&size=500&after=2842d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=movies&size=500&after=2849d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=movies&size=500&after=2856d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=movies&size=500&after=2863d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=movies&size=500&after=2870d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=movies&size=500&after=2877d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddi

Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=movies&size=500&after=3381d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=movies&size=500&after=3388d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=movies&size=500&after=3395d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=movies&size=500&after=3402d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=movies&size=500&after=3409d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=movies&size=500&after=3416d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=movies&size=500&after=3423d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=movies&size=500&after=3430d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=movies&size=500&after=3437d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddi

In [46]:
movies.shape

(16688, 9)

In [47]:
movies.head()

Unnamed: 0,title,selftext,subreddit,created_utc,author,num_comments,score,is_self,timestamp
0,"Recount, stars Kevin Spacey as Ron Klain, inco...",[deleted],movies,1611094020,[deleted],3,1,True,2021-01-19
1,Feel bad for the ladies who watched Oldboy (20...,[removed],movies,1611094078,Minstrel-of-Shadow,0,1,True,2021-01-19
2,"Recount, stars Kevin Spacey as Ron Klain, inco...",,movies,1611094192,[deleted],4,0,True,2021-01-19
3,Looking for any movie recommendations,[removed],movies,1611094237,sourdough_star,0,1,True,2021-01-19
4,"The Good, the Bad, and the Ugly?",[removed],movies,1611094449,MatthewGialdini,0,1,True,2021-01-19


In [67]:
movies.to_csv('../data/movies_raw.csv')

In [63]:
books = query_pushshift("books",day_window = 7, n = 520 )

Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=books&size=500&after=7d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=books&size=500&after=14d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=books&size=500&after=21d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=books&size=500&after=28d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=books&size=500&after=35d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=books&size=500&after=42d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=books&size=500&after=49d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=books&size=500&after=56d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=books&size=500&after=63d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=books&size=500&after=70d
Q

Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=books&size=500&after=581d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=books&size=500&after=588d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=books&size=500&after=595d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=books&size=500&after=602d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=books&size=500&after=609d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=books&size=500&after=616d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=books&size=500&after=623d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=books&size=500&after=630d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=books&size=500&after=637d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=books&size=500&a

Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=books&size=500&after=1148d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=books&size=500&after=1155d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=books&size=500&after=1162d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=books&size=500&after=1169d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=books&size=500&after=1176d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=books&size=500&after=1183d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=books&size=500&after=1190d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=books&size=500&after=1197d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=books&size=500&after=1204d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=books&s

Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=books&size=500&after=1715d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=books&size=500&after=1722d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=books&size=500&after=1729d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=books&size=500&after=1736d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=books&size=500&after=1743d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=books&size=500&after=1750d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=books&size=500&after=1757d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=books&size=500&after=1764d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=books&size=500&after=1771d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=books&s

Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=books&size=500&after=2282d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=books&size=500&after=2289d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=books&size=500&after=2296d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=books&size=500&after=2303d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=books&size=500&after=2310d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=books&size=500&after=2317d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=books&size=500&after=2324d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=books&size=500&after=2331d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=books&size=500&after=2338d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=books&s

Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=books&size=500&after=2849d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=books&size=500&after=2856d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=books&size=500&after=2863d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=books&size=500&after=2870d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=books&size=500&after=2877d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=books&size=500&after=2884d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=books&size=500&after=2891d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=books&size=500&after=2898d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=books&size=500&after=2905d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=books&s

Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=books&size=500&after=3416d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=books&size=500&after=3423d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=books&size=500&after=3430d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=books&size=500&after=3437d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=books&size=500&after=3444d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=books&size=500&after=3451d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=books&size=500&after=3458d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=books&size=500&after=3465d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=books&size=500&after=3472d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=books&s

In [64]:
books.shape

(26019, 9)

In [65]:
books.head()

Unnamed: 0,title,selftext,subreddit,created_utc,author,num_comments,score,is_self,timestamp
0,Is it bad that I've only read like 20% of the ...,I recently heard of the Japanese term Tsundoku...,books,1611101866,redditaccount444444,174,7,True,2021-01-19
1,amazon kindle book,[removed],books,1611102573,kimberly0099,0,1,True,2021-01-19
2,What books are worse than the film adaptation?,[removed],books,1611102737,Abranurni,10,2,True,2021-01-19
4,Need help finding the right literature,[removed],books,1611104309,Ozay0900,6,0,True,2021-01-19
5,Anyone have delivery issues with a thriftbooks...,[removed],books,1611104463,RadiantInspector2459,2,1,True,2021-01-19


Writing it into a CSV to read from other notebooks

In [68]:
books.to_csv('../data/books_raw.csv')