# Reddit API data collection

## Reddit forums: r/artificial, r/machinelearning, r/bigscience
Use the Reddit API to collect posts mentioning GPT-3, GPT-4, ChatGPT and other AI technologies in two windows: November 1, 2022 to January 31, 2023, and February 15 to May 15, 2023.

In [151]:
# https://medium.com/bitgrit-data-science-publication/sentiment-analysis-on-reddit-tech-news-with-python-cbaddb8e9bb6

import pandas as pd
import numpy as np

# misc
import datetime as dt
from pprint import pprint
from itertools import chain
import json

# reddit crawler
import praw

# converting created dates from reddit API into human readable format
from datetime import datetime, timedelta

# make directories for data collection
import os

# copy data structure
import copy

# regular expression search PRAW results
import re

# wait time for api limits and api retry
import time
import asyncio

In [152]:
# Read the stored API credentials from the local JSON file
with open('reddit.json') as credentials_file:
    reddit_users = json.load(credentials_file)

# Select which stored entry to authenticate with
credentials = reddit_users['Zealousideal-Land259']
#credentials = reddit_users['reddit_user4']

# Build the PRAW client from the credential fields of the selected entry
reddit = praw.Reddit(**{
    field: credentials[field]
    for field in ('client_id', 'client_secret', 'user_agent',
                  'redirect_uri', 'refresh_token')
})

# Resolve the authenticated account name; printing it doubles as a connection test
reddit_user = str(reddit.user.me())
print(f"Reddit user: {reddit_user}")

Reddit user: Zealousideal-Land259


In [153]:
# Persistent run counter: every execution of this cell starts a new numbered
# run, and `run_name` is used by later cells as the output sub-directory.
config_file = 'config.json'

# Load the stored configuration; start from run 0 when no file exists yet
if os.path.exists(config_file):
    with open(config_file) as f:
        config = json.load(f)
else:
    config = {"run": 0}

# Advance the counter (a config file without a "run" key counts as run 0)
config['run'] = config.get('run', 0) + 1

# Write the whole config back so that keys other than "run" are preserved
with open(config_file, 'w') as f:
    json.dump(config, f)

# Zero-padded run identifier, e.g. run 5 -> '005'
run_name = '{:03d}'.format(config['run'])
print(f"Run name: {run_name}")

Run name: 005


## Scrape top posts from reddit

In [154]:
# Retry helper for API calls.
# With the defaults a failing call is retried every 10 seconds, 12 times,
# for a total of about 2 minutes.
def retry_function(func, *args, max_attempts=12, delay=10, **kwargs):
    """Call ``func(*args, **kwargs)``, retrying on failure and pacing on success.

    Args:
        func: Callable to invoke.
        *args: Positional arguments forwarded to ``func``.
        max_attempts: Maximum number of times ``func`` is called.
        delay: Seconds to wait between a failed attempt and the next one.
            May be an int or a float.
        **kwargs: Keyword arguments forwarded to ``func``.

    Returns:
        The value returned by the first successful call, or ``None`` when
        every attempt raised an ``Exception``.

    After a successful call the helper sleeps for whatever remains of the
    minimum per-call interval, so consecutive calls start at least that far
    apart. Errors are printed, not re-raised.
    """
    # Reddit API restricts to 100 queries per minute, i.e. at most one call
    # every 60/100 = 0.6 seconds.
    # NOTE(review): if `func` issues several requests per call the effective
    # request rate is higher than this pacing suggests - confirm it is acceptable.
    reddit_api_restriction = 60 / 100

    attempts = 0
    while attempts < max_attempts:
        # monotonic clock: elapsed time is unaffected by system clock changes
        start_time = time.monotonic()
        try:
            result = func(*args, **kwargs)  # Call the function
        except Exception as e:
            print(f"An error occurred: {e}")
            attempts += 1
            if attempts < max_attempts:
                print(f"Retrying attempt #{attempts} in {delay} seconds...")
                # a single sleep also works for fractional delays
                if delay > 0:
                    time.sleep(delay)
            continue

        # wait for the difference between the api restriction and the total api call time
        api_wait_time = reddit_api_restriction - (time.monotonic() - start_time)
        if api_wait_time > 0:
            time.sleep(api_wait_time)
        return result  # Return the result if successful

    print("Max attempts reached. Continuing loop.")
    return None  # Or you can raise an exception here if needed


In [155]:
# Column layout for scraped submissions: one empty list per field.
# The key order is kept exactly as listed here.
topics_dict_template = {
    column: []
    for column in (
        "id", "subreddit", "query", "sort", "date", "title", "author",
        "stickied", "upvote_ratio", "score", "url", "num_comments",
        "created", "body",
    )
}

def scrape_submission(topics_dict, submission, other):
    """Append one submission to ``topics_dict`` if it falls in a collection window.

    Args:
        topics_dict: Dict of lists (one list per column) that is extended in
            place.
        submission: Object exposing ``created``, ``title``, ``author``,
            ``stickied``, ``upvote_ratio``, ``score``, ``id``, ``url``,
            ``num_comments`` and ``selftext``.
        other: Mapping of extra column name -> value appended alongside the
            submission fields (the callers pass subreddit, query and sort).

    Returns:
        The same ``topics_dict`` object, unchanged when the submission lies
        outside both collection windows.
    """
    # (first day, last day) of each collection window; both days are inclusive.
    hype_windows = (
        # GPT-3 hype analysis: November 1 2022 to January 31 2023
        (datetime(2022, 11, 1), datetime(2023, 1, 31)),
        # GPT-4 hype analysis: February 15 to May 15 2023,
        # which includes the launch of GPT-4 on March 14, 2023
        (datetime(2023, 2, 15), datetime(2023, 5, 15)),
    )
    one_day = timedelta(days=1)

    # Naive UTC datetime of the post, built from the epoch instead of the
    # deprecated datetime.utcfromtimestamp().
    # NOTE(review): this reads `submission.created`; confirm it is a UTC epoch
    # value (PRAW also exposes `created_utc`).
    date = datetime(1970, 1, 1) + timedelta(seconds=submission.created)

    # Scrape only dates within the timeframes (end day included up to midnight)
    if not any(start <= date < end + one_day for start, end in hype_windows):
        return topics_dict

    # build the dictionary
    topics_dict["date"].append(date)
    topics_dict["title"].append(submission.title)
    topics_dict["author"].append(submission.author)
    topics_dict["stickied"].append(submission.stickied)
    topics_dict["upvote_ratio"].append(submission.upvote_ratio)
    topics_dict["score"].append(submission.score)
    topics_dict["id"].append(submission.id)
    topics_dict["url"].append(submission.url)
    topics_dict["num_comments"].append(submission.num_comments)
    topics_dict["created"].append(submission.created)
    topics_dict["body"].append(submission.selftext)
    # add subreddit, query, sort
    for entry, value in other.items():
        topics_dict[entry].append(value)
    return topics_dict

In [160]:
# Subreddits to search, sorted in place (case-sensitive ordering)
subreddits = [
    'artificial', 'datascience', 'datasets', 'deeplearning',
    'LanguageTechnology', 'MachineLearning', 'learnmachinelearning',
    'chatgpt', 'ChatGPTPromptGenius', 'ChatGPTCoding', 'GPT3', 'OpenAI',
]
subreddits.sort()

# Search terms, sorted the same way
queries = ['ChatGPT', 'GPT-4', 'GPT-3', 'GPT', 'OpenAI', 'Open-AI', 'LLM']
queries.sort()

In [170]:
# Record the display name and subscriber count of every subreddit as
# metadata for this run.

# Output directory of the current run. It does not depend on the subreddit,
# so it is created once up front; this also guarantees it exists when
# `subreddits` is empty.
sub_dir = '/'.join(['data', reddit_user, run_name])
os.makedirs(sub_dir, exist_ok=True)

# dataframe structure
subreddit_dict = {"name": [],
                  "subscribers": []}

for subreddit in subreddits:
    sub = reddit.subreddit(subreddit)
    subreddit_dict["name"].append(sub.display_name)
    subreddit_dict["subscribers"].append(sub.subscribers)

subreddit_data = pd.DataFrame(subreddit_dict)
subreddit_data.to_csv('/'.join([sub_dir, 'subreddits' + '.meta']))
subreddit_data

Unnamed: 0,name,subscribers
0,ChatGPTCoding,104140
1,ChatGPTPromptGenius,192899
2,GPT3,719011
3,LanguageTechnology,46115
4,OpenAI,1116358
5,artificial,717231
6,chatgpt,4480729
7,datascience,1360115
8,datasets,187602
9,deeplearning,148827


In [158]:
# total data retrieved
running_count_data = 0
# number of queries (counts retry_function calls, one per saved CSV attempt)
running_count_queries = 0

# Sort orders requested for every keyword search
sort_options = ['top', 'comments', 'relevance']

# Whole-word, case-insensitive pattern for each query, compiled once up front
# instead of once per (submission, query) pair.
query_patterns = {
    query: re.compile(r'\b' + re.escape(query) + r'\b', re.IGNORECASE)
    for query in queries
}

# Query for a subreddit by name
for subreddit in subreddits:
    sub = reddit.subreddit(subreddit)
    sub_dir = '/'.join(['data', reddit_user, run_name, subreddit])
    os.makedirs(sub_dir, exist_ok=True)

    # The two helpers below close over `sub` / `subreddit` of the current
    # iteration; they are only invoked inside this same iteration.
    def sub_search(query):
        """Search the current subreddit for `query` under every sort order.

        Returns a DataFrame with one row per (submission, sort) hit that
        scrape_submission accepted.
        """
        topics_dict = copy.deepcopy(topics_dict_template)
        # search by keyword
        for sort in sort_options:
            search_results = sub.search(query=query, sort=sort, syntax='plain', time_filter='all')
            for submission in search_results:
                topics_dict = scrape_submission(
                    topics_dict, submission,
                    {'subreddit': subreddit, 'query': query, 'sort': sort})
        return pd.DataFrame(topics_dict)

    # PRAW search function
    for query in queries:
        # Call the retry_function with your function
        topics_data = retry_function(sub_search, query)
        running_count_queries += 1
        # None means every retry attempt failed; nothing is written then
        if topics_data is not None:
            topics_data.to_csv('/'.join([sub_dir, query + '.csv']))
            running_count_data += len(topics_data)

    def sub_controversial():
        """Scan the current subreddit's controversial listing for the queries.

        A submission is recorded once for every query whose pattern matches
        its title or body, so one post can produce several rows.
        """
        topics_dict = copy.deepcopy(topics_dict_template)
        search_results = sub.controversial(time_filter='all')
        for submission in search_results:
            # search for query text in controversial posts
            for query, pattern in query_patterns.items():
                if pattern.search(submission.title) or pattern.search(submission.selftext):
                    topics_dict = scrape_submission(
                        topics_dict, submission,
                        {'subreddit': subreddit, 'query': query, 'sort': 'controversial'})
        return pd.DataFrame(topics_dict)

    # PRAW controversial function
    topics_data = retry_function(sub_controversial)
    running_count_queries += 1
    if topics_data is not None:
        topics_data.to_csv('/'.join([sub_dir, 'controversial.csv']))
        running_count_data += len(topics_data)

print(f"Total queries sent: {running_count_queries}")
print(f"Total data retrieved: {running_count_data}")


Total queries sent: 88
Total data retrieved: 6966


In [159]:
# Summary statistics of the numeric columns of `topics_data`. After the scrape
# loop this name holds only the last frame assigned there (the final
# subreddit's controversial result), not the whole collection.
# The former bare `topics_data` line was dropped: only a cell's last
# expression is displayed, so it had no effect.
topics_data.describe()

Unnamed: 0,upvote_ratio,score,num_comments,created
count,4.0,4.0,4.0,4.0
mean,0.485,0.0,11.0,1675045000.0
std,0.017321,0.0,9.237604,3325830.0
min,0.47,0.0,3.0,1672164000.0
25%,0.47,0.0,3.0,1672164000.0
50%,0.485,0.0,11.0,1675045000.0
75%,0.5,0.0,19.0,1677925000.0
max,0.5,0.0,19.0,1677925000.0
