In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!pip install praw
!pip install ffn
!pip install --upgrade pandas-datareader

In [None]:
# libraries
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 300)
import praw #reddit data api
from praw.models import MoreComments # module to get replies to comments
import ffn #for loading financial data
import matplotlib as mpl
%matplotlib inline
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sn
import requests
import json
import csv
import time
import datetime
import warnings 
warnings.filterwarnings("ignore")
from tqdm import tqdm
import pickle

In [None]:
# Base path prefix for the files written by the storage cell.
# It is concatenated directly with 'data/<name>.pickle' (no separator is
# inserted), so a non-empty value must end with a path separator.
path = ""

In [None]:
sub='wallstreetbets' # subreddit to scrape
# start and end date of the scraping window
# (sent to Pushshift as the `after` / `before` query parameters)
before = "2021-12-31"
after = "2021-01-01" # NOTE(review): an earlier comment said scraping starts in August 2018, when the thread first appeared; that does not match this value - confirm the intended start date
# title search term that identifies the thread we want to scrape every day
query = "Daily Discussion Thread"
subStats = [] # one [id, title, url, date, flair] list per collected post
subCount = 0 # running count of collected posts


# function to get reddit post titles and urls with timestamps from the Pushshift API
def getPushshiftData(query, after, before, sub):
    """Run one Pushshift submission search and return the matching posts.

    Parameters
    ----------
    query : value sent as the ``title`` search term.
    after : lower bound of the search window (sent as ``after``).
    before : upper bound of the search window (sent as ``before``).
    sub : subreddit name (sent as ``subreddit``).

    All four values are converted with ``str()`` before being sent.

    Returns
    -------
    The value stored under the ``'data'`` key of the JSON response.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status code.
    requests.Timeout
        If the server does not answer within the timeout.
    KeyError
        If the response JSON has no ``'data'`` key.
    """
    # Let requests build the query string so every value is URL-encoded
    # (the search term contains spaces and could contain '&' or '#').
    params = {
        'title': str(query),
        'size': 10000,
        'after': str(after),
        'before': str(before),
        'subreddit': str(sub),
    }
    r = requests.get(
        'https://api.pushshift.io/reddit/search/submission/',
        params=params,
        timeout=30,  # fail instead of hanging the notebook on a stalled connection
    )
    # Surface HTTP errors (rate limiting, outages) explicitly instead of
    # failing later while trying to parse an error page as JSON.
    r.raise_for_status()
    return r.json()['data']  # only the list of posts is needed


# get needed data from data scraped above
def collectPosts(post):
    """Extract the fields of interest from one submission and record them.

    Appends ``[id, title, url, date, flair]`` to the module-level
    ``subStats`` list; nothing is returned.

    Parameters
    ----------
    post : dict
        Must provide the keys ``'id'``, ``'title'``, ``'url'`` and
        ``'created_utc'`` (a Unix timestamp). ``'link_flair_text'`` is
        optional; the string ``"NaN"`` is stored when it is absent.

    Raises
    ------
    KeyError
        If one of the required keys is missing.
    """
    # created_utc is a UTC timestamp, so convert it in UTC. A bare
    # fromtimestamp() uses the machine's local time zone and can shift the
    # resulting date by one day on machines that are not set to UTC.
    created = datetime.datetime.fromtimestamp(
        post['created_utc'], tz=datetime.timezone.utc
    ).date()
    flair = post.get('link_flair_text', "NaN")  # "NaN" marks a post without flair
    subStats.append([post['id'], post['title'], post['url'], created, flair])

    
# first request: everything between the configured 'after' and 'before' dates
data = getPushshiftData(query, after, before, sub)


# keep requesting until Pushshift returns an empty result, i.e. until every
# post between the 'after' date and the 'before' date has been gathered
while data:
    for submission in data:
        collectPosts(submission)
    subCount += len(data)
    # continue from the creation time of the last submission received
    after = data[-1]['created_utc']
    data = getPushshiftData(query, after, before, sub)
    

# organize the collected posts into a dataframe

# split the per-post records into one list per column
ids = [record[0] for record in subStats]
titles = [record[1] for record in subStats]
urls = [record[2] for record in subStats]
dates = [record[3] for record in subStats]
flairs = [record[4] for record in subStats]

# column name -> column values; insertion order fixes the column order
data = {
    'id': ids,
    'title': titles,
    'url': urls,
    'date': dates,
    'flair': flairs,
}

# build the dataframe and keep only the rows flaired 'Daily Discussion'
posts = pd.DataFrame(data).loc[lambda frame: frame['flair'] == 'Daily Discussion']
posts

In [None]:
# setting up the reddit client used to scrape the comments of the posts with PRAW
# client_id and client_secret are left blank here and have to be filled in
# with the credentials of a Reddit API application before this cell is run.
reddit = praw.Reddit(
  client_id = "",
  client_secret = "",
  # NOTE(review): this is a browser-style user agent string - confirm it is acceptable for the Reddit API
  user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36"
)

In [None]:
# long run time take note

# daily_comments gets exactly one entry per url in posts['url'], in the same
# order, so entry i always belongs to post i - even when scraping a thread fails.
daily_comments = []  # list that will hold the lists of daily comments
failed_urls = []     # (url, error) pairs for threads that could not be scraped
for url in tqdm(posts['url'].tolist()):  # iterate over urls
    comments = []
    try:
        submission = reddit.submission(url=url)  # look up the thread
        # limit=0 drops the "load more comments" placeholders without fetching them
        submission.comments.replace_more(limit=0)
        # iterating the comment forest yields the top-level comments only;
        # submission.comments.list() would be needed to include replies as well
        for comment in submission.comments:
            comments.append(comment.body)
    except Exception as exc:
        # Catch Exception rather than using a bare except so that interrupting
        # this long-running cell (KeyboardInterrupt) still works.
        comments.append(None)  # None marks a thread whose scrape failed
        failed_urls.append((url, repr(exc)))
    daily_comments.append(comments)

if failed_urls:
    print(f"{len(failed_urls)} thread(s) could not be scraped completely; see failed_urls")
        

In [None]:
# check
# `comments` is the list built for the last url handled by the scraping loop.
# Only the final expression of a cell is displayed, so comments[0] is
# evaluated but not shown; daily_comments[1] is the value that gets rendered.
comments[0]
daily_comments[1]

In [None]:
# store
# the posts table is written with pandas' own pickle helper
posts.to_pickle(path + 'data/posts.pickle')

# dump the two comment containers, each into its own pickle file
for target, payload in (
    ('data/comments.pickle', comments),
    ('data/daily_comments.pickle', daily_comments),
):
    with open(path + target, 'wb') as fp:
        pickle.dump(payload, fp)

# read both files back as a round-trip check
with open(path + 'data/comments.pickle', 'rb') as fp:
    abc = pickle.load(fp)

with open(path + 'data/daily_comments.pickle', 'rb') as fp:
    cde = pickle.load(fp)