# Reddit API data collection

In [361]:
# https://medium.com/bitgrit-data-science-publication/sentiment-analysis-on-reddit-tech-news-with-python-cbaddb8e9bb6

import pandas as pd
import numpy as np

# misc
import datetime as dt
from pprint import pprint
from itertools import chain
import json

# reddit crawler
import praw

# converting created dates from reddit API into human readable format
from datetime import datetime, timedelta

# make directories for data collection
import os

# copy data structure
import copy

# regular expression search PRAW results
import re

# wait time for api limits and api retry
import time
#import asyncio # Not implemented

# debugging tools
import traceback
import logging
# Configure the root logger at INFO (basicConfig is a no-op if it already has handlers)
logging.basicConfig(level=logging.INFO)
# Timestamped record layout used for the log file
formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
# File handler that writes records to debug.log
# NOTE(review): the root logger level is not lowered to DEBUG, so DEBUG records are
# filtered out by the logger before they reach this handler, even though the handler
# itself accepts DEBUG. Lower the root level if DEBUG output in the file is wanted.
debug_handler = logging.FileHandler('debug.log')
debug_handler.setFormatter(formatter)
debug_handler.setLevel(logging.DEBUG)
# Attach the file handler to the root logger
logging.getLogger('').addHandler(debug_handler)

# Load config with run iteration and Reddit user logins

In [362]:
def get_config(users):
    """Load the run configuration, rotate the Reddit user and bump the run counter.

    The state lives in ``config.json`` in the working directory. When the file
    does not exist yet, the run counter starts at 0 with the first user.

    Args:
        users: Non-empty sequence of Reddit user names to rotate through.

    Returns:
        dict: The updated configuration. ``'reddit_user'`` is the user selected
        by the run counter *before* it was incremented; ``'run'`` is the
        incremented counter. The same content is written back to the file.

    Raises:
        IndexError / ZeroDivisionError: If ``users`` is empty (no file is
        created or modified in that case).
    """
    config_file = 'config.json'

    if os.path.exists(config_file):
        with open(config_file) as f:
            config = json.load(f)
    else:
        # Start from the defaults in memory; the file is only written once the
        # new state has been computed, so a failure here cannot leave a
        # truncated, unparseable config.json behind.
        config = {"run": 0, 'reddit_user': users[0]}

    # Pick the user for this run from the counter, then advance the counter
    config['reddit_user'] = users[config['run'] % len(users)]
    config['run'] = config['run'] + 1

    # Persist the new state for the next run
    with open(config_file, 'w') as f:
        json.dump(config, f)

    return config

# Load Reddit user login

In [363]:
# Read every stored Reddit login from the JSON credentials file
with open('reddit.json') as credentials_file:
    reddit_users = json.load(credentials_file)

# Select this run's user (rotating through the stored logins) and advance the run counter
config = get_config(list(reddit_users))

# Uncomment one of these lines to pin a specific user instead of the rotated one
#config['reddit_user'] = 'reddit_user4'
#config['reddit_user'] = 'Zealousideal-Land259'

# Credentials of the selected user
credentials = reddit_users[config['reddit_user']]

# Build the PRAW client from the selected user's OAuth settings
oauth_fields = ('client_id', 'client_secret', 'user_agent', 'redirect_uri', 'refresh_token')
reddit = praw.Reddit(**{field: credentials[field] for field in oauth_fields})

# Run name is the zero-padded run counter; it is used in the output paths
run_name = f"{config['run']:03d}"
print(f"Run name: {run_name}")
# Asking the API who we are doubles as a connection test
reddit_user = str(reddit.user.me())
print(f"Reddit user: {reddit_user}")

Run name: 157


Reddit user: Zealousideal-Land259


# API scraper to pause between API calls

In [364]:
def retry_function(func, *args, max_attempts=12, delay=10, **kwargs):
    """Call ``func(*args, **kwargs)``, retrying on failure and pacing successful calls.

    With the defaults a failing call is retried every 10 seconds, 12 attempts in
    total (about 2 minutes). After a successful call the function sleeps for
    whatever is left of the minimum interval between API calls before returning.

    Args:
        func: Callable to invoke.
        *args: Positional arguments forwarded to ``func``.
        max_attempts: Maximum number of times ``func`` is called.
        delay: Seconds to wait between a failed attempt and the next one
            (integers and floats are both accepted).
        **kwargs: Keyword arguments forwarded to ``func``.

    Returns:
        The value returned by ``func``, or ``None`` when every attempt raised.
    """
    # Reddit API restricts to 100 queries per minute, i.e. one call every
    # 60/100 seconds (the interval is seconds per call, not calls per second).
    min_call_interval = 60 / 100
    attempts = 0
    while attempts < max_attempts:
        try:
            # monotonic clock: elapsed time is immune to wall-clock adjustments
            start_time = time.monotonic()
            result = func(*args, **kwargs)  # Call the function
            elapsed = time.monotonic() - start_time
            # wait for the difference between the api restriction and the total api call time
            api_wait_time = min_call_interval - elapsed
            if api_wait_time > 0:
                time.sleep(api_wait_time)
            return result  # Return the result if successful
        except Exception as e:
            print(f"An error occurred: {e}")
            print(f"Function: {func}")
            print(f"Args: {args}")
            traceback.print_exc()
            attempts += 1
            if attempts < max_attempts:
                print(f"Retrying attempt #{attempts} in {delay} seconds...")
                # a single sleep also works for fractional delays
                time.sleep(delay)
    print("Max attempts reached. Continuing loop.")
    return None  # Callers must handle None (all attempts failed)


# Scrape posts from Reddit with target date range
Filter the API results mentioning GPT-3 and GPT-4 to two date ranges: November 1, 2022 to January 31, 2023, and February 15, 2023 to May 15, 2023.

In [365]:
# GPT-3 hype analysis window: November 1 2022 to January 31 2023
gpt3_start, gpt3_end = datetime(2022, 11, 1), datetime(2023, 1, 31)

# GPT-4 hype analysis window: February 15 to May 15 2023,
# which includes the launch of GPT-4 on March 14, 2023
gpt4_start, gpt4_end = datetime(2023, 2, 15), datetime(2023, 5, 15)

# Column layout for scraped submissions: one empty list per column.
# Deep-copy this template before filling it so the template itself stays empty.
posts_dict_template = {
    column: []
    for column in (
        "id",
        "subreddit",
        "query",
        "sort",
        "date",
        "title",
        "author",
        "stickied",
        "upvote_ratio",
        "score",
        "url",
        "num_comments",
        "created",
        "body",
    )
}

def scrape_submission(posts_dict, submission, other):
    """Append one submission to ``posts_dict`` if it was posted inside a study window.

    A submission is kept when its UTC creation time falls in either the GPT-3
    window (``gpt3_start``..``gpt3_end``) or the GPT-4 window
    (``gpt4_start``..``gpt4_end``); the end dates are inclusive for the whole day.

    Args:
        posts_dict: Dict of column name -> list, mutated in place.
        submission: Submission-like object exposing ``created_utc``, ``created``,
            ``title``, ``author``, ``stickied``, ``upvote_ratio``, ``score``,
            ``id``, ``url``, ``num_comments`` and ``selftext``.
        other: Dict of extra column values to append alongside the submission
            (the callers pass ``subreddit``, ``query`` and ``sort``).

    Returns:
        The same ``posts_dict`` object that was passed in.
    """
    # Filter on created_utc, the same field scrape_comment uses, so both
    # filters are evaluated against UTC timestamps.
    date = datetime.utcfromtimestamp(submission.created_utc)
    one_day = timedelta(days=1)
    in_gpt3_window = gpt3_start <= date < gpt3_end + one_day
    in_gpt4_window = gpt4_start <= date < gpt4_end + one_day
    if in_gpt3_window or in_gpt4_window:
        # build the dictionary
        posts_dict["date"].append(date)
        posts_dict["title"].append(submission.title)
        # author is an object (or None); store its string form
        posts_dict["author"].append(str(submission.author))
        posts_dict["stickied"].append(submission.stickied)
        posts_dict["upvote_ratio"].append(submission.upvote_ratio)
        posts_dict["score"].append(submission.score)
        posts_dict["id"].append(submission.id)
        posts_dict["url"].append(submission.url)
        posts_dict["num_comments"].append(submission.num_comments)
        posts_dict["created"].append(submission.created)
        posts_dict["body"].append(submission.selftext)
        # add subreddit, query, sort
        for column, value in other.items():
            posts_dict[column].append(value)
    return posts_dict

# Column layout for scraped comments: one empty list per column.
# "id" holds the id of the submission the comment belongs to.
comments_dict_template = {
    column: []
    for column in (
        "id",
        "comment_id",
        "comment_date",
        "comment_score",
        "comment_num_replies",
        "comment_body",
    )
}

def scrape_comment(comments_dict, submission_dict, comment):
    """Append one comment to ``comments_dict`` if it was written inside a study window.

    The comment's ``created_utc`` timestamp must fall in the GPT-3 or the GPT-4
    date range (end dates inclusive for the whole day); otherwise the dict is
    returned untouched.

    Args:
        comments_dict: Dict of column name -> list, mutated in place.
        submission_dict: Mapping for the parent submission; only ``"id"`` is read.
        comment: Comment-like object exposing ``created_utc``, ``id``, ``score``,
            ``replies`` and ``body``.

    Returns:
        The same ``comments_dict`` object that was passed in.
    """
    posted_at = datetime.utcfromtimestamp(comment.created_utc)
    day = timedelta(days=1)
    study_windows = ((gpt3_start, gpt3_end), (gpt4_start, gpt4_end))
    if not any(first <= posted_at < last + day for first, last in study_windows):
        # outside both windows: nothing to record
        return comments_dict

    # link the comment back to its submission
    comments_dict["id"].append(submission_dict["id"])
    comments_dict["comment_id"].append(comment.id)
    comments_dict["comment_date"].append(posted_at)
    comments_dict["comment_score"].append(comment.score)
    # number of direct replies attached to this comment
    comments_dict["comment_num_replies"].append(len(comment.replies))
    comments_dict["comment_body"].append(comment.body)
    return comments_dict

# Reddit forum targeting
Target forums such as r/artificial, r/MachineLearning, and r/bigscience, searching them with queries such as 'ChatGPT' and 'OpenAI'.

In [366]:
# Subreddits to scrape, ordered alphabetically without regard to case
subreddits = sorted(
    [
        'artificial', 'datascience', 'datasets', 'deeplearning',
        'LanguageTechnology', 'MachineLearning', 'learnmachinelearning',
        'chatgpt', 'ChatGPTPromptGenius', 'ChatGPTCoding', 'GPT3', 'OpenAI',
    ],
    key=str.lower,
)

# Search terms to run against each subreddit, in the same case-insensitive order
queries = sorted(
    ['ChatGPT', 'GPT-4', 'GPT-3', 'GPT', 'OpenAI', 'Open-AI', 'LLM'],
    key=str.lower,
)

# Subreddit metadata for number of subscribers

In [367]:
# Output directory for this run; created once up front (it does not depend on the
# subreddit) so the metadata file can be written even if no subreddit is listed
sub_dir = '/'.join(['data', reddit_user, run_name])
os.makedirs(sub_dir, exist_ok=True)

# dataframe structure
subreddit_dict = {  "name":[],
                    "subscribers":[] }

# Record the display name and subscriber count of every targeted subreddit
for subreddit in subreddits:
    sub = reddit.subreddit(subreddit)
    subreddit_dict["name"].append(sub.display_name)
    subreddit_dict["subscribers"].append(sub.subscribers)

# Save the metadata next to the scraped data and display it
subreddit_data = pd.DataFrame(subreddit_dict)
subreddit_data.to_csv('/'.join([sub_dir, 'subreddits' + '.meta']))
subreddit_data

Unnamed: 0,name,subscribers
0,artificial,720349
1,chatgpt,4507776
2,ChatGPTCoding,104377
3,ChatGPTPromptGenius,193302
4,datascience,1366943
5,datasets,187715
6,deeplearning,149118
7,GPT3,722368
8,LanguageTechnology,46152
9,learnmachinelearning,384569


# Iterate through Subreddit API calls
For each subreddit, run every keyword query, scan the controversial listing for posts that match a query, and then collect the comments of each matched submission.

In [None]:
# Running totals, reported by the summary cell that follows
# rows kept from all searches and controversial scans (duplicates included)
running_count_searches = 0
# number of retry-wrapped listing calls (one per query plus one controversial scan per subreddit)
running_count_queries = 0
# number of unique submissions whose comments were requested
running_count_submissions = 0
# number of comments
running_count_comments = 0

# Whole-word, case-insensitive matcher for every query, compiled once
# instead of once per submission and query
query_patterns = [(query, re.compile(r'\b' + re.escape(query) + r'\b', re.IGNORECASE))
                  for query in queries]


def sub_search(query):
    """Search the current subreddit for ``query`` under each sort order.

    Reads the loop variables ``sub`` and ``subreddit`` set by the subreddit loop
    below. Returns a DataFrame with the submissions accepted by scrape_submission.
    """
    posts_dict = copy.deepcopy(posts_dict_template)
    sort_options = ['top','comments','relevance']
    for sort in sort_options:
        # search posts by keyword
        search_results = sub.search(query=query, sort=sort, syntax='plain', time_filter='all')
        for submission in search_results:
            # Append the post list after checking the dates
            posts_dict = scrape_submission(posts_dict, submission, {'subreddit': subreddit, 'query': query, 'sort': sort})
    return pd.DataFrame(posts_dict)


def sub_controversial():
    """Scan the current subreddit's controversial listing for posts matching any query.

    Reads the loop variables ``sub`` and ``subreddit`` set by the subreddit loop
    below. Each post is recorded at most once, under the first query that matches
    its title or body.
    """
    posts_dict = copy.deepcopy(posts_dict_template)
    # search by all controversial posts within the subreddit
    search_results = sub.controversial(time_filter='all')
    for submission in search_results:
        # search for query text in controversial posts
        for query, pattern in query_patterns:
            if pattern.search(submission.title) or pattern.search(submission.selftext):
                # Append the post list after checking the dates
                posts_dict = scrape_submission(posts_dict, submission, {'subreddit': subreddit, 'query': query, 'sort': 'controversial'})
                break
    return pd.DataFrame(posts_dict)


def comment_search(submission_dict):
    """Collect the comments listed directly under one submission.

    ``submission_dict`` is a row of the posts table; its ``'id'`` is used to look
    the submission up. Returns a DataFrame with the comments accepted by
    scrape_comment.
    """
    comments_dict = copy.deepcopy(comments_dict_template)
    # search each submission for comments
    submission = reddit.submission(id=submission_dict['id'])
    for comment in submission.comments:
        # More comments object shows up at the end of long lists of comments
        if type(comment).__name__ == 'Comment':
            # Append the comment list after checking the dates
            comments_dict = scrape_comment(comments_dict, submission_dict, comment)
    return pd.DataFrame(comments_dict)


# Query for a subreddit by name
for subreddit in subreddits:
    sub = reddit.subreddit(subreddit)
    sub_dir = '/'.join(['data', reddit_user, run_name, subreddit])
    os.makedirs(sub_dir, exist_ok=True)

    # Result frames are collected in a list and concatenated once per subreddit
    post_frames = []

    # PRAW search function
    for query in queries:
        # Call the retry_function with your function
        posts_data = retry_function(sub_search, query)
        running_count_queries+=1
        # `and` short-circuits, so a None result (all retries failed) is skipped safely
        if posts_data is not None and not posts_data.empty:
            print(posts_data)
            post_frames.append(posts_data)
            running_count_searches += len(posts_data)

    # PRAW controversial function
    posts_data = retry_function(sub_controversial)
    running_count_queries+=1
    if posts_data is not None and not posts_data.empty:
        print(posts_data)
        post_frames.append(posts_data)
        running_count_searches += len(posts_data)

    # Write CSV with unique subreddit posts
    if post_frames:
        # Drop duplicates based on the 'id' column (first occurrence wins)
        subreddit_posts = pd.concat(post_frames, ignore_index=True).drop_duplicates(subset='id')
    else:
        # Nothing found: keep the column layout so the CSV still has a header
        subreddit_posts = pd.DataFrame(posts_dict_template)
    subreddit_posts.to_csv('/'.join([sub_dir, 'posts.csv']))

    # Scrape comments in each subreddit post
    comment_frames = []
    for _, row in subreddit_posts.iterrows():
        submission_dict = row.to_dict()
        comments_data = retry_function(comment_search, submission_dict)
        running_count_submissions+=1
        if comments_data is not None and not comments_data.empty:
            comment_frames.append(comments_data)
            running_count_comments += len(comments_data)

    # Write CSV with all subreddit comments, unique to each post
    if comment_frames:
        subreddit_comments = pd.concat(comment_frames, ignore_index=True)
    else:
        subreddit_comments = pd.DataFrame(comments_dict_template)
    subreddit_comments.to_csv('/'.join([sub_dir, 'comments.csv']))


In [369]:
# Summarise the scrape: one line per running counter
print("\n".join([
    f"Total queries sent: {running_count_queries}",
    f"Total submissions from searches: {running_count_searches}",
    f"Total unique submissions: {running_count_submissions}",
    f"Total submission comments: {running_count_comments}",
]))

Total queries sent: 96
Total submissions from searches: 7459
Total unique submissions: 2604
Total submission comments: 53396
