# Data Scraping
This notebook gathers information about Ethereum's Merge from Reddit, Twitter, etc.

In [1]:
import datetime as dt
import os

import pandas as pd
import praw
from psaw import PushshiftAPI
from tqdm.notebook import tqdm

#### Get data from Reddit

In [2]:
# Credentials are read from environment variables so that secrets are never
# stored in the saved notebook. A variable that is not set falls back to ""
# (the value that was previously hard-coded here).
reddit = praw.Reddit(
    client_id=os.environ.get("REDDIT_CLIENT_ID", ""),
    client_secret=os.environ.get("REDDIT_CLIENT_SECRET", ""),
    password=os.environ.get("REDDIT_PASSWORD", ""),
    user_agent="ICASentimentAnalysis",
    username=os.environ.get("REDDIT_USERNAME", ""),
    redirect_uri=os.environ.get("REDDIT_REDIRECT_URI", ""),
)
# Pushshift search client built on top of the PRAW instance above.
push_api = PushshiftAPI(reddit)

In [3]:
# Accumulator for every scraped submission; the fetch cells below add rows to it.
raw_data = pd.DataFrame()


def date4push(year):
    """Return the Unix timestamp of 1 January of ``year`` at 00:00 UTC.

    The datetime is made timezone-aware (UTC) before conversion. A naive
    datetime would be interpreted in the local timezone of the machine running
    the notebook, shifting every year boundary by the local UTC offset and
    making the scrape depend on where it is executed.

    Parameters
    ----------
    year : int
        Calendar year whose start is wanted.

    Returns
    -------
    int
        Whole seconds since the Unix epoch.
    """
    return int(dt.datetime(year, 1, 1, tzinfo=dt.timezone.utc).timestamp())

In [4]:
# Search term shared by every query and output file name in this notebook.
search_query = "ethereum merge"

# All matches from the start of 2022 onwards (no upper bound), materialised
# into a list so they can be counted and iterated.
submissions = list(
    push_api.search_submissions(
        q=search_query,
        after=date4push(2022),
        limit=99999,
    )
)
print(f"{len(submissions)} submissions found in 2022")



12775 submissions found in 2022


In [5]:
# Collect one plain dict per submission and build the frame once at the end.
# DataFrame.append was removed in pandas 2.0, and growing a frame row by row
# copies the whole frame on every iteration.
records = []
for submission_id in tqdm(submissions):
    # Look the submission up again through PRAW by its id.
    submission = reddit.submission(id=submission_id.id)
    records.append({
        "title": submission.title,
        "text": submission.selftext,
        "score": submission.score,
        "time": pd.to_datetime(submission.created, unit="s"),
        "upvote_ratio": submission.upvote_ratio,
    })

batch = pd.DataFrame(records)
# Skip the concat while the accumulator is still empty so the column dtypes
# come straight from the fetched records.
raw_data = batch if raw_data.empty else pd.concat([raw_data, batch], ignore_index=True)

# Make sure the output directory exists so the scrape is not lost when writing.
os.makedirs("data", exist_ok=True)
raw_data.to_csv("data/q_" + search_query + "_2022_now.csv", index=False)

  0%|          | 0/12775 [00:00<?, ?it/s]

In [6]:
# Make sure the output directory exists before the per-year checkpoints are written.
os.makedirs("data", exist_ok=True)

# Walk backwards through the calendar years 2021 .. 2014.
for year in tqdm(range(2021, 2013, -1)):
    # Restrict the search to [1 Jan year, 1 Jan year+1).
    submissions = list(
        push_api.search_submissions(
            limit=99999,
            after=date4push(year),
            before=date4push(year + 1),
            q=search_query,
        )
    )
    print(str(len(submissions)) + " submissions found in " + str(year))

    # Gather this year's rows in a list and concatenate once. DataFrame.append
    # was removed in pandas 2.0 and copied the whole frame per row.
    records = []
    for submission_id in tqdm(submissions):
        submission = reddit.submission(id=submission_id.id)
        records.append({
            "title": submission.title,
            "text": submission.selftext,
            "score": submission.score,
            "time": pd.to_datetime(submission.created, unit="s"),
            "upvote_ratio": submission.upvote_ratio,
        })
    if records:
        batch = pd.DataFrame(records)
        raw_data = batch if raw_data.empty else pd.concat([raw_data, batch], ignore_index=True)

    # Checkpoint: each file holds everything accumulated so far, not just this year.
    raw_data.to_csv("data/q_" + search_query + "_reddit_" + str(year) + "_now.csv", index=False)

  0%|          | 0/8 [00:00<?, ?it/s]

1363 submissions found in 2021


  0%|          | 0/1363 [00:00<?, ?it/s]



64 submissions found in 2020


  0%|          | 0/64 [00:00<?, ?it/s]

50 submissions found in 2019


  0%|          | 0/50 [00:00<?, ?it/s]

112 submissions found in 2018


  0%|          | 0/112 [00:00<?, ?it/s]

54 submissions found in 2017


  0%|          | 0/54 [00:00<?, ?it/s]

33 submissions found in 2016


  0%|          | 0/33 [00:00<?, ?it/s]

14 submissions found in 2015


  0%|          | 0/14 [00:00<?, ?it/s]

4 submissions found in 2014


  0%|          | 0/4 [00:00<?, ?it/s]

In [7]:
# Report how many rows were collected across all years.
print(f"Found {len(raw_data)} posts in total")

Found 14469 posts in total


In [9]:

# Bodies that carry no usable text: removal/deletion placeholders (English and
# German variants) and the empty string. "[deleted]" was previously missing even
# though its German counterpart "[gelöscht]" was filtered.
PLACEHOLDER_TEXTS = ["[removed]", "[deleted]", "[entfernt]", "[gelöscht]", ""]
# Minimum score (inclusive) a post needs to be kept.
MIN_SCORE = 10

# Drop posts without text or deleted
data = raw_data.loc[~raw_data.text.isin(PLACEHOLDER_TEXTS)]
print(str(data.shape[0]) + " remaining posts in total")
# Filter high scoring posts
data = data.loc[data.score >= MIN_SCORE]
# The filter is inclusive (>=), so the message says "at least" rather than "over".
print(str(data.shape[0]) + " remaining posts with score of at least " + str(MIN_SCORE) + " in total")

3777 remaining posts in total
1042 remaining posts with score over 10 in total


In [10]:
# Write the filtered posts to a CSV in the data/ directory (row index omitted).
output_path = f"data/q_{search_query}_reddit_full10.csv"
data.to_csv(output_path, index=False)

NameError: name 'data' is not defined