In [1]:
import csv
import datetime
import json
import os
import pprint
import time
import urllib.parse

import numpy as np
import pandas as pd
import requests

In [2]:
def getApiData(query, after, before, sub):
    """Fetch one page of submissions from the Pushshift search endpoint.

    :param query: text to match against submission titles
    :param after: lower time bound, sent as the ``after`` query parameter
    :param before: upper time bound, sent as the ``before`` query parameter
    :param sub: subreddit name, sent as the ``subreddit`` query parameter
    :return: the value stored under the ``data`` key of the JSON response
    :raises requests.HTTPError: if the server answers with an error status
    :raises KeyError: if the JSON response has no ``data`` key
    """
    # urlencode escapes spaces, '&', '#', ... so a multi-word query cannot
    # corrupt the query string; plain values such as 'bitcoin' are unchanged
    params = urllib.parse.urlencode({
        'title': query,
        'size': 1000,
        'after': after,
        'before': before,
        'subreddit': sub,
    })
    url = 'https://api.pushshift.io/reddit/search/submission/?' + params
    print(url)
    # a timeout keeps one stalled connection from hanging the whole download loop
    r = requests.get(url, timeout=60)
    # fail loudly on an error status instead of trying to parse an error page as JSON
    r.raise_for_status()
    return r.json()['data']

In [3]:
def collectData(subm):
    """Extract the fields of interest from one submission and store them in ``sub_stats``.

    The row is stored as a one-element list, keyed by the submission id, in the
    module-level ``sub_stats`` dict, which must exist before this is called.

    :param subm: dict describing one submission
    :return: None
    :raises KeyError: if a mandatory field (title, url, author, id, score,
        created_utc, num_comments, permalink) is missing
    """
    # optional fields: a missing flair becomes the string 'NaN', a missing selftext ''
    flair = subm.get('link_flair_text', 'NaN')
    selftext = subm.get('selftext', '')
    # removed or deleted posts carry a marker instead of text; treat them as empty
    if selftext in ('[removed]', '[deleted]'):
        selftext = ''

    # fromtimestamp() without a tz argument yields a naive datetime in local time
    created = datetime.datetime.fromtimestamp(subm['created_utc'])  # 1520561700.0

    sub_id = subm['id']
    row = (sub_id, subm['title'], selftext, subm['url'], subm['author'], subm['score'],
           created, subm['num_comments'], subm['permalink'], flair)
    sub_stats[sub_id] = [row]


In [4]:
def write_subs_to_file(filename):
    """Append every row held in the module-level ``sub_stats`` dict to a CSV file.

    The header row is written only when ``filename`` does not exist yet.

    :param filename: path of the CSV file to append to
    :return: None
    """
    column_names = ['post_id', 'title', 'selftext', 'url', 'author', 'score', 'publish_date', 'num_of_comments',
                    'permalink', 'flair']
    # decide before open(): mode 'a' creates the file if it is missing
    needs_header = not os.path.exists(filename)

    with open(filename, 'a', newline='', encoding='utf-8') as handle:
        writer = csv.writer(handle, delimiter=',')
        if needs_header:
            writer.writerow(column_names)
        # each dict value is a list whose first item is the row tuple
        writer.writerows(rows[0] for rows in sub_stats.values())
        


In [8]:

# Download reddit posts from the subreddit `sub_reddit` whose title matches `key_word`,
# one 24-hour window at a time, appending each window's rows to `output_filename`.

sub_reddit = 'bitcoin'
key_word = 'bitcoin'

output_filename = 'reddit_data.csv'
# overall search range: windows are fetched while their start is before end_date
start_date = datetime.datetime(2020, 5, 12, 0)
end_date = datetime.datetime(2020, 7, 31, 0)

# in each iteration get reddit posts for one day, to avoid getting blocked by the server
# NOTE: these are naive datetimes, so .timestamp() interprets them in the machine's local time
one_day = datetime.timedelta(hours=24)
after_date = start_date
after = str(int(after_date.timestamp()))
before_date = start_date + one_day
before = str(int(before_date.timestamp()))

while after_date < end_date:
    print('-' * 80)
    print(after_date, ' -> ', before_date)
    print('-' * 80)

    # sub_count is incremented below but not read anywhere in this cell
    sub_count = 0
    # fresh global dict per window: collectData() fills it, write_subs_to_file() drains it
    sub_stats = {}

    data = getApiData(key_word, after, before, sub_reddit)

    # safety cap on the number of pages requested for a single window
    max_count = 100
    count = 0
    while len(data) > 0 and count < max_count:
        print('count ', count)
        for submission in data:
            collectData(submission)
            sub_count += 1

        print(len(data))
        print(str(datetime.datetime.fromtimestamp(data[-1]['created_utc'])))
        # page forward: the last item's timestamp becomes the next lower bound
        # (assumes the API returns items in ascending created_utc order - TODO confirm)
        after = data[-1]['created_utc']
        data = getApiData(key_word, after, before, sub_reddit)
        count = count + 1

    # keep saving data collected in each iteration
    write_subs_to_file(output_filename)

    # move to next day
    after_date += one_day
    after = str(int(after_date.timestamp()))
    before_date += one_day
    before = str(int(before_date.timestamp()))

    # sleep a random whole number of seconds before the next window
    # (randint's upper bound is exclusive, so this is 1 or 2)
    time.sleep(np.random.randint(1, 3))


--------------------------------------------------------------------------------
2020-05-12 00:00:00  ->  2020-05-13 00:00:00
--------------------------------------------------------------------------------
https://api.pushshift.io/reddit/search/submission/?title=bitcoin&size=1000&after=1589221800&before=1589308200&subreddit=bitcoin
count  0
100
2020-05-12 06:56:54
https://api.pushshift.io/reddit/search/submission/?title=bitcoin&size=1000&after=1589246814&before=1589308200&subreddit=bitcoin
count  1
100
2020-05-12 23:20:36
https://api.pushshift.io/reddit/search/submission/?title=bitcoin&size=1000&after=1589305836&before=1589308200&subreddit=bitcoin
count  2
1
2020-05-12 23:54:42
https://api.pushshift.io/reddit/search/submission/?title=bitcoin&size=1000&after=1589307882&before=1589308200&subreddit=bitcoin
--------------------------------------------------------------------------------
2020-05-13 00:00:00  ->  2020-05-14 00:00:00
----------------------------------------------------------

count  0
96
2020-05-25 23:48:54
https://api.pushshift.io/reddit/search/submission/?title=bitcoin&size=1000&after=1590430734&before=1590431400&subreddit=bitcoin
--------------------------------------------------------------------------------
2020-05-26 00:00:00  ->  2020-05-27 00:00:00
--------------------------------------------------------------------------------
https://api.pushshift.io/reddit/search/submission/?title=bitcoin&size=1000&after=1590431400&before=1590517800&subreddit=bitcoin
count  0
100
2020-05-26 22:45:48
https://api.pushshift.io/reddit/search/submission/?title=bitcoin&size=1000&after=1590513348&before=1590517800&subreddit=bitcoin
count  1
8
2020-05-26 23:26:43
https://api.pushshift.io/reddit/search/submission/?title=bitcoin&size=1000&after=1590515803&before=1590517800&subreddit=bitcoin
--------------------------------------------------------------------------------
2020-05-27 00:00:00  ->  2020-05-28 00:00:00
-----------------------------------------------------------

count  0
97
2020-06-09 23:46:13
https://api.pushshift.io/reddit/search/submission/?title=bitcoin&size=1000&after=1591726573&before=1591727400&subreddit=bitcoin
--------------------------------------------------------------------------------
2020-06-10 00:00:00  ->  2020-06-11 00:00:00
--------------------------------------------------------------------------------
https://api.pushshift.io/reddit/search/submission/?title=bitcoin&size=1000&after=1591727400&before=1591813800&subreddit=bitcoin
count  0
92
2020-06-10 23:50:57
https://api.pushshift.io/reddit/search/submission/?title=bitcoin&size=1000&after=1591813257&before=1591813800&subreddit=bitcoin
--------------------------------------------------------------------------------
2020-06-11 00:00:00  ->  2020-06-12 00:00:00
--------------------------------------------------------------------------------
https://api.pushshift.io/reddit/search/submission/?title=bitcoin&size=1000&after=1591813800&before=1591900200&subreddit=bitcoin
count  0
1

count  0
90
2020-06-25 23:10:37
https://api.pushshift.io/reddit/search/submission/?title=bitcoin&size=1000&after=1593106837&before=1593109800&subreddit=bitcoin
--------------------------------------------------------------------------------
2020-06-26 00:00:00  ->  2020-06-27 00:00:00
--------------------------------------------------------------------------------
https://api.pushshift.io/reddit/search/submission/?title=bitcoin&size=1000&after=1593109800&before=1593196200&subreddit=bitcoin
count  0
73
2020-06-26 23:44:36
https://api.pushshift.io/reddit/search/submission/?title=bitcoin&size=1000&after=1593195276&before=1593196200&subreddit=bitcoin
--------------------------------------------------------------------------------
2020-06-27 00:00:00  ->  2020-06-28 00:00:00
--------------------------------------------------------------------------------
https://api.pushshift.io/reddit/search/submission/?title=bitcoin&size=1000&after=1593196200&before=1593282600&subreddit=bitcoin
count  0
5

count  0
46
2020-07-12 23:08:52
https://api.pushshift.io/reddit/search/submission/?title=bitcoin&size=1000&after=1594575532&before=1594578600&subreddit=bitcoin
--------------------------------------------------------------------------------
2020-07-13 00:00:00  ->  2020-07-14 00:00:00
--------------------------------------------------------------------------------
https://api.pushshift.io/reddit/search/submission/?title=bitcoin&size=1000&after=1594578600&before=1594665000&subreddit=bitcoin
count  0
67
2020-07-13 23:37:26
https://api.pushshift.io/reddit/search/submission/?title=bitcoin&size=1000&after=1594663646&before=1594665000&subreddit=bitcoin
--------------------------------------------------------------------------------
2020-07-14 00:00:00  ->  2020-07-15 00:00:00
--------------------------------------------------------------------------------
https://api.pushshift.io/reddit/search/submission/?title=bitcoin&size=1000&after=1594665000&before=1594751400&subreddit=bitcoin
count  0
9

count  1
41
2020-07-28 23:48:39
https://api.pushshift.io/reddit/search/submission/?title=bitcoin&size=1000&after=1595960319&before=1595961000&subreddit=bitcoin
--------------------------------------------------------------------------------
2020-07-29 00:00:00  ->  2020-07-30 00:00:00
--------------------------------------------------------------------------------
https://api.pushshift.io/reddit/search/submission/?title=bitcoin&size=1000&after=1595961000&before=1596047400&subreddit=bitcoin
count  0
86
2020-07-29 23:48:50
https://api.pushshift.io/reddit/search/submission/?title=bitcoin&size=1000&after=1596046730&before=1596047400&subreddit=bitcoin
--------------------------------------------------------------------------------
2020-07-30 00:00:00  ->  2020-07-31 00:00:00
--------------------------------------------------------------------------------
https://api.pushshift.io/reddit/search/submission/?title=bitcoin&size=1000&after=1596047400&before=1596133800&subreddit=bitcoin
count  0
8

## Sentiment Analysis

In [9]:
import pandas as pd
import flair
from textblob import TextBlob
import os
import datetime
import numpy as np
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk

In [10]:
# The VADER lexicon must be on disk before SentimentIntensityAnalyzer() is
# constructed, otherwise a fresh environment fails with LookupError; download it
# first (the call is a no-op when the package is already up to date).
nltk.download('vader_lexicon')

# pre-trained flair sentiment classifier
flair_sentiment = flair.models.TextClassifier.load('en-sentiment')
# timestamp format that zeroes minutes and seconds, i.e. an hourly bucket key
fmt = '%Y-%m-%d %H:00:00'
sid = SentimentIntensityAnalyzer()

2020-12-03 18:55:08,287 loading file C:\Users\dheer\.flair\models\sentiment-en-mix-distillbert_3.1.pt


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\dheer\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [11]:
def get_sentiment_val_for_flair(sentiments):
    """
    Turn the string form of a flair label list into a signed score.

    The input is converted with str() and is expected to look like
    [NEGATIVE (0.9284018874168396)] or [POSITIVE (0.75)].
    :param sentiments: object whose str() has the format shown above
    :return: the score as a float, negated when the text contains NEGATIVE
    """
    text = str(sentiments)
    is_negative = 'NEGATIVE' in text

    # drop the label word, then the brackets and parentheses, leaving only the number
    text = text.replace('NEGATIVE' if is_negative else 'POSITIVE', '')
    for bracket in ('(', '[', ')', ']'):
        text = text.replace(bracket, '')

    score = float(text)
    return -score if is_negative else score

In [12]:
def get_sentiment_report(input_filename, output_filename):
    """Score every post in ``input_filename`` and append the scores to ``output_filename``.

    Each post's text is its title and selftext joined by a space. For every post
    one CSV row is appended, indexed by the post's ``publish_date`` (index column
    named ``timestamp``), holding the flair score, the TextBlob polarity and
    subjectivity, and the four VADER scores. Rows are written one at a time so an
    interrupted run keeps what was already scored; the header is written only
    when the output file does not exist yet.

    Relies on the module-level ``flair_sentiment`` classifier, the ``sid`` VADER
    analyzer and ``get_sentiment_val_for_flair``.

    :param input_filename: CSV with at least title, selftext and publish_date columns
    :param output_filename: CSV file to append the sentiment rows to
    :return: None
    """
    posts = pd.read_csv(input_filename)[['title', 'selftext', 'publish_date']].fillna('')
    posts = posts.assign(text=posts['title'] + ' ' + posts['selftext']).set_index('publish_date')

    for timestamp, text in posts['text'].items():
        # progress trace: the post's timestamp and the start of its text
        print(timestamp)
        print(text[0:15])

        sentence = flair.data.Sentence(text)
        flair_sentiment.predict(sentence)

        # evaluate TextBlob once and reuse both components
        blob_sentiment = TextBlob(text).sentiment
        vader_scores = sid.polarity_scores(text)

        scores = {
            'reddit_flair': get_sentiment_val_for_flair(sentence.labels),
            'reddit_tb_polarity': blob_sentiment[0],
            'reddit_tb_subjectivity': blob_sentiment[1],
            'reddit_sid_pos': vader_scores['pos'],
            'reddit_sid_neg': vader_scores['neg'],
            'reddit_sid_neu': vader_scores['neu'],
            'reddit_sid_com': vader_scores['compound'],
        }
        row_df = pd.DataFrame([scores], columns=list(scores),
                              index=pd.Index([str(timestamp)], name='timestamp'))

        # re-checked for every row: the first append creates the file
        row_df.to_csv(output_filename, mode='a', header=not os.path.exists(output_filename))

    return

In [13]:
def clean_sentiment_report(input_filename, output_filename):
    """Write a copy of a sentiment report with duplicate timestamps removed, sorted by time.

    The first CSV column is used as the index and parsed as datetimes. For each
    distinct timestamp only its first row is kept.

    :param input_filename: CSV report to read
    :param output_filename: path of the cleaned CSV to write
    :return: None
    """
    report = pd.read_csv(input_filename, index_col=0)
    report.index = pd.to_datetime(report.index)
    # np.unique returns the sorted distinct values plus the position of each one's first occurrence
    _, first_positions = np.unique(report.index, return_index=True)
    report.iloc[first_positions].to_csv(output_filename)

In [None]:
def bucketize_sentiment_report(input_filename, output_filename,
                               bucket_start=datetime.datetime(2019, 11, 21, 0),
                               bucket_end=datetime.datetime(2020, 7, 30, 0)):
    """Average the sentiment scores of a report into hourly buckets and save them as CSV.

    Every input row is assigned to the bucket of the *next* hour: one hour is
    added to its timestamp and minutes/seconds are zeroed, so 06:10:00 and
    06:50:00 both land in the 07:00:00 bucket. The output has one row for every
    hour from ``bucket_start`` to ``bucket_end`` (inclusive); each score column
    holds the mean of the rows in that bucket, or 0 when the bucket is empty.
    Input rows whose bucket falls outside that range are ignored.

    The previous implementation never created the ``reddit_sid_pos`` column, so
    the resulting KeyError (hidden by a bare ``except``) aborted every row before
    the VADER columns were accumulated and they were always written as 0. All
    seven score columns are now averaged, and the output columns follow the
    order of ``score_columns`` below.

    :param input_filename: CSV with a ``timestamp`` column formatted as
        ``%Y-%m-%d %H:%M:%S`` plus the seven ``reddit_*`` score columns
    :param output_filename: path of the bucketized CSV to write (overwritten)
    :param bucket_start: first hourly bucket (inclusive)
    :param bucket_end: last hourly bucket (inclusive)
    :return: None
    :raises KeyError: if an expected column is missing from the input
    :raises ValueError: if a timestamp does not match the expected format
    """
    score_columns = ['reddit_flair', 'reddit_tb_polarity', 'reddit_tb_subjectivity',
                     'reddit_sid_pos', 'reddit_sid_neg', 'reddit_sid_neu', 'reddit_sid_com']
    bucket_format = '%Y-%m-%d %H:00:00'
    one_hour = datetime.timedelta(hours=1)

    in_df = pd.read_csv(input_filename)

    # timestamp is the bucket start plus a few minutes or seconds, so shift by one
    # hour and truncate to collect each row in the bucket of the next hour
    stamps = pd.to_datetime(in_df['timestamp'], format='%Y-%m-%d %H:%M:%S')
    bucket_keys = (stamps + one_hour).dt.strftime(bucket_format)

    # mean per bucket == (sum of values) / (number of rows) of the old two-pass loop
    hourly_means = in_df[score_columns].astype(float).groupby(bucket_keys).mean()

    # one row per hour in the requested range; empty buckets become 0,
    # buckets outside the range are dropped by the reindex
    all_buckets = pd.date_range(bucket_start, bucket_end, freq=one_hour).strftime(bucket_format)
    out_df = hourly_means.reindex(all_buckets).fillna(0.0)
    out_df.index.name = 'timestamp'
    out_df.to_csv(output_filename)


In [None]:
input_filename = 'reddit_data.csv'
# derive the output names from the input name: <stem>_sentiment.csv and <stem>_sentiment_bucketized.csv
output_sentiment_filename = os.path.splitext(input_filename)[0] + '_sentiment.csv'
output_sentiment_bucketize_filename = os.path.splitext(output_sentiment_filename)[0] + '_bucketized.csv'

# Read input_filename and perform sentiment analysis of its text data.
# Scoring is slow, so it is skipped when the sentiment report already exists;
# delete that file to force a re-run. Without this step a fresh run would fail
# below because the report file is missing.
if not os.path.exists(output_sentiment_filename):
    get_sentiment_report(input_filename, output_sentiment_filename)

# reddit posts can land anytime. Collect all the posts (and their sentiment scores) that landed
# within a given hour (minutes 0 to 59) and bucketize them into a single hourly row
bucketize_sentiment_report(output_sentiment_filename, output_sentiment_bucketize_filename)