# Scrape all the data from Reddit

## All the functions to do so:

In [12]:
import numpy as np
import requests
import json
import csv
import time
import datetime
import os


def get_pushshift_data(query, after, before, sub):
    """Fetch one page of submissions from the Pushshift search endpoint.

    Args:
        query: Text sent as the ``title`` search parameter.
        after: Lower time bound, sent as the ``after`` parameter.
        before: Upper time bound, sent as the ``before`` parameter.
        sub: Subreddit name, sent as the ``subreddit`` parameter.

    Returns:
        The value stored under the ``'data'`` key of the JSON response.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not answer within the timeout.
        KeyError: If the JSON response has no ``'data'`` key.
    """
    # Local import so this block does not depend on a file-level import.
    from urllib.parse import urlencode

    # urlencode percent-escapes the values, so queries containing spaces,
    # '&' or '#' no longer corrupt the URL. For plain values the resulting
    # URL is identical to the previous hand-concatenated one.
    params = {
        'title': str(query),
        'size': 1000,
        'after': str(after),
        'before': str(before),
        'subreddit': str(sub),
    }
    url = 'https://api.pushshift.io/reddit/search/submission/?' + urlencode(params)
    print(url)

    # Without a timeout, requests waits indefinitely on a stalled connection.
    r = requests.get(url, timeout=30)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page as JSON.
    r.raise_for_status()
    data = json.loads(r.text)
    return data['data']


def collect_sub_data(subm, stats=None):
    """Extract the fields of one submission and store them keyed by post id.

    Args:
        subm: Mapping describing a single submission. The keys ``title``,
            ``url``, ``author``, ``id``, ``score``, ``created_utc``,
            ``num_comments`` and ``permalink`` are required;
            ``link_flair_text`` and ``selftext`` are optional.
        stats: Optional dict that receives the result. When omitted, the
            module-level ``sub_stats`` dict is used.

    Raises:
        KeyError: If one of the required keys is missing from ``subm``.
    """
    # Resolve the destination lazily so the global is only needed when the
    # caller did not supply its own dict.
    target = sub_stats if stats is None else stats

    # Flair is optional; a missing key is recorded as the string 'NaN'.
    flair = subm.get('link_flair_text', 'NaN')

    # Self text is optional; removed or deleted posts are stored as empty text.
    selftext = subm.get('selftext', '')
    if selftext in ('[removed]', '[deleted]'):
        selftext = ''

    sub_id = subm['id']
    # NOTE: fromtimestamp() without a tz argument yields a naive datetime in
    # the local timezone of the machine running the scraper.
    created = datetime.datetime.fromtimestamp(subm['created_utc'])  # 1520561700.0

    row = (
        sub_id,
        subm['title'],
        selftext,
        subm['url'],
        subm['author'],
        subm['score'],
        created,
        subm['num_comments'],
        subm['permalink'],
        flair,
    )
    # Stored as a one-element list; write_subs_to_file reads element 0.
    target[sub_id] = [row]


def write_subs_to_file(filename, stats=None):
    """Append the collected submissions to a CSV file.

    A header row is written first when the file does not exist yet or
    exists but is empty.

    Args:
        filename: Path of the CSV file to append to.
        stats: Optional dict mapping post id to a one-element list holding
            the row tuple. When omitted, the module-level ``sub_stats``
            dict is used.

    Returns:
        The number of submission rows written (header excluded).
    """
    rows_by_id = sub_stats if stats is None else stats

    headers = ['post_id', 'title', 'selftext', 'url', 'author', 'score', 'publish_date', 'num_of_comments',
               'permalink', 'flair']

    # An existing zero-byte file still needs a header, so check the size as
    # well as existence.
    needs_header = not os.path.exists(filename) or os.path.getsize(filename) == 0

    upload_count = 0
    # Explicit UTF-8 keeps non-ASCII titles writable regardless of the
    # platform's default encoding.
    with open(filename, 'a', newline='', encoding='utf-8') as file:
        writer = csv.writer(file, delimiter=',')
        if needs_header:
            writer.writerow(headers)
        for rows in rows_by_id.values():
            writer.writerow(rows[0])
            upload_count += 1
    return upload_count


def scrapper(start_date, end_date, filename, max_count):
    """Scrape r/bitcoin submissions day by day and append them to a CSV file.

    The interval from ``start_date`` up to (but not including) ``end_date``
    is walked in 24-hour windows. For each window, pages are requested via
    ``get_pushshift_data`` until a page comes back empty or ``max_count``
    pages have been fetched. The collected rows are then appended to
    ``<filename>.csv``.

    Args:
        start_date: ``datetime.datetime`` at which the first window starts.
        end_date: ``datetime.datetime``; windows are processed while their
            start is strictly before this value.
        filename: Output file name without the ``.csv`` extension.
        max_count: Maximum number of pages requested per 24-hour window.
    """
    # collect_sub_data() and write_subs_to_file() use the module-level
    # ``sub_stats`` dict. Without this declaration the per-day reset below
    # would only create a local variable, so rows from earlier days would be
    # written to the file again on every iteration.
    global sub_stats

    sub_reddit = 'bitcoin'
    key_word = 'bitcoin'

    output_filename = f'{filename}.csv'

    # In each iteration get Reddit posts for one day, to avoid getting
    # blocked by the server.
    one_day = datetime.timedelta(hours=24)
    after_date = start_date
    before_date = start_date + one_day

    while after_date < end_date:
        after = str(int(after_date.timestamp()))
        before = str(int(before_date.timestamp()))

        print('-' * 80)
        print(after_date, ' -> ', before_date)
        print('-' * 80)

        # Start every day with an empty buffer so each row is written once.
        sub_stats = {}

        data = get_pushshift_data(key_word, after, before, sub_reddit)

        count = 0
        while len(data) > 0 and count < max_count:
            print('count ', count)
            for submission in data:
                collect_sub_data(submission)

            print(len(data))
            print(str(datetime.datetime.fromtimestamp(data[-1]['created_utc'])))
            # Continue paging from the timestamp of the last submission seen.
            after = data[-1]['created_utc']
            data = get_pushshift_data(key_word, after, before, sub_reddit)
            count = count + 1

        # Keep saving data collected in each iteration.
        write_subs_to_file(output_filename)

        # Move to the next day.
        after_date += one_day
        before_date += one_day

        # Randomly sleep before starting the next iteration.
        time.sleep(np.random.randint(1, 3))



## Obtaining the data:

In [18]:
# Calendar months to scrape. range() excludes its stop value, so the stop
# must be 13 for December to be included.
months = list(range(1, 13))
# Years to scrape, processed in the order listed.
years = [2020, 2019]
# Passed to scrapper() as max_count: the maximum number of result pages
# requested per one-day window.
max_per_day = 50

In [15]:
# Scrape every configured month into its own reddit_data_<month>_<year>.csv.
for year in years:
    for month in months:
        filename = f'reddit_data_{month}_{year}'
        # Search all the posts from start_date (inclusive) to end_date
        # (exclusive).
        start_date = datetime.datetime(year, month, 1, 0)
        # Use the first day of the following month as the exclusive bound.
        # A fixed day 30 raises ValueError for February and drops the last
        # day(s) of every other month.
        if month == 12:
            end_date = datetime.datetime(year + 1, 1, 1, 0)
        else:
            end_date = datetime.datetime(year, month + 1, 1, 0)
        scrapper(start_date, end_date, filename, max_per_day)

--------------------------------------------------------------------------------
2021-01-01 00:00:00  ->  2021-01-02 00:00:00
--------------------------------------------------------------------------------
https://api.pushshift.io/reddit/search/submission/?title=bitcoin&size=1000&after=1609459200&before=1609545600&subreddit=bitcoin
count  0
100
2021-01-01 15:51:08
https://api.pushshift.io/reddit/search/submission/?title=bitcoin&size=1000&after=1609516268&before=1609545600&subreddit=bitcoin
count  1
85
2021-01-01 23:54:54
https://api.pushshift.io/reddit/search/submission/?title=bitcoin&size=1000&after=1609545294&before=1609545600&subreddit=bitcoin
--------------------------------------------------------------------------------
2021-01-02 00:00:00  ->  2021-01-03 00:00:00
--------------------------------------------------------------------------------
https://api.pushshift.io/reddit/search/submission/?title=bitcoin&size=1000&after=1609545600&before=1609632000&subreddit=bitcoin
count  0


count  0
100
2021-01-11 10:10:07
https://api.pushshift.io/reddit/search/submission/?title=bitcoin&size=1000&after=1610359807&before=1610409600&subreddit=bitcoin
count  1
100
2021-01-11 16:54:52
https://api.pushshift.io/reddit/search/submission/?title=bitcoin&size=1000&after=1610384092&before=1610409600&subreddit=bitcoin
count  2
100
2021-01-11 23:20:26
https://api.pushshift.io/reddit/search/submission/?title=bitcoin&size=1000&after=1610407226&before=1610409600&subreddit=bitcoin
--------------------------------------------------------------------------------
2021-01-12 00:00:00  ->  2021-01-13 00:00:00
--------------------------------------------------------------------------------
https://api.pushshift.io/reddit/search/submission/?title=bitcoin&size=1000&after=1610409600&before=1610496000&subreddit=bitcoin
count  0
100
2021-01-12 11:39:21
https://api.pushshift.io/reddit/search/submission/?title=bitcoin&size=1000&after=1610451561&before=1610496000&subreddit=bitcoin
count  1
100
2021-01-

count  1
100
2021-01-22 22:30:32
https://api.pushshift.io/reddit/search/submission/?title=bitcoin&size=1000&after=1611354632&before=1611360000&subreddit=bitcoin
count  2
12
2021-01-22 23:59:38
https://api.pushshift.io/reddit/search/submission/?title=bitcoin&size=1000&after=1611359978&before=1611360000&subreddit=bitcoin
--------------------------------------------------------------------------------
2021-01-23 00:00:00  ->  2021-01-24 00:00:00
--------------------------------------------------------------------------------
https://api.pushshift.io/reddit/search/submission/?title=bitcoin&size=1000&after=1611360000&before=1611446400&subreddit=bitcoin
count  0
100
2021-01-23 17:15:38
https://api.pushshift.io/reddit/search/submission/?title=bitcoin&size=1000&after=1611422138&before=1611446400&subreddit=bitcoin
count  1
31
2021-01-23 23:24:23
https://api.pushshift.io/reddit/search/submission/?title=bitcoin&size=1000&after=1611444263&before=1611446400&subreddit=bitcoin
-----------------------

## Reading the data:

In [16]:
import pandas as pd

# `month` and `year` are whatever values the scraping loop left behind, so
# this loads only the CSV of the last month/year combination processed.
last_csv_path = f'reddit_data_{month}_{year}.csv'
pd.read_csv(last_csv_path)

Unnamed: 0,post_id,title,selftext,url,author,score,publish_date,num_of_comments,permalink,flair
0,pdjv8l,Bitcoin generated $2 billion in profits for Mi...,,https://digesttime.com/2021/08/28/bitcoin-gene...,thefoodboylover,1,2021-08-29 00:01:06,23,/r/Bitcoin/comments/pdjv8l/bitcoin_generated_2...,
1,pdkb1m,Bitcoin will shoot.,Bitcoin will def shoot rapidly and would be mo...,https://www.reddit.com/r/Bitcoin/comments/pdkb...,tammy_lee112,1,2021-08-29 00:28:51,0,/r/Bitcoin/comments/pdkb1m/bitcoin_will_shoot/,
2,pdkmwp,I have impossible idea. It is the incredible s...,We understand an important issue: the technolo...,https://www.reddit.com/r/Bitcoin/comments/pdkm...,cryptosyor,1,2021-08-29 00:49:57,39,/r/Bitcoin/comments/pdkmwp/i_have_impossible_i...,
3,pdkul3,Has anyone brainstormed about how Bitcoin can ...,Is it possible? It must be. I don't know enoug...,https://www.reddit.com/r/Bitcoin/comments/pdku...,dikgumdur,1,2021-08-29 01:03:33,26,/r/Bitcoin/comments/pdkul3/has_anyone_brainsto...,
4,pdl3ym,Gold mining is the largest source of CHILD LAB...,I was reading some articles today and went dow...,https://www.reddit.com/r/Bitcoin/comments/pdl3...,saccred,1,2021-08-29 01:19:52,147,/r/Bitcoin/comments/pdl3ym/gold_mining_is_the_...,
...,...,...,...,...,...,...,...,...,...,...
142354,l7xc5j,"Can't post about crypto on WSB, so I made a co...",Several comments on this well written post [ht...,https://www.reddit.com/r/Bitcoin/comments/l7xc...,WormLivesMatter,1,2021-01-29 16:46:50,3,/r/Bitcoin/comments/l7xc5j/cant_post_about_cry...,
142355,l7xcnj,WSB surpasses bitcoin on coinmarketcap!,Did coinmarketcap get hacked?,https://www.reddit.com/r/Bitcoin/comments/l7xc...,MisterKinister,1,2021-01-29 16:47:23,7,/r/Bitcoin/comments/l7xcnj/wsb_surpasses_bitco...,
142356,l7xeps,What happens when people/companies buy large a...,"Hi guys,\n\nThis is my first post so forgive m...",https://www.reddit.com/r/Bitcoin/comments/l7xe...,brianReddits,1,2021-01-29 16:49:37,3,/r/Bitcoin/comments/l7xeps/what_happens_when_p...,
142357,l7xfet,Buying bitcoin on revolut,I was just wondering if it's a bad place to bu...,https://www.reddit.com/r/Bitcoin/comments/l7xf...,Mouradb123,1,2021-01-29 16:50:24,5,/r/Bitcoin/comments/l7xfet/buying_bitcoin_on_r...,
