# Reddit API data collection

## Reddit forums: r/artificial, r/machinelearning, r/bigscience
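Use the Reddit API to collect posts mentioning GPT-3 and other AI technologies from November 1, 2022 to January 31, 2023. Note that the search cell below queries with `time_filter='all'`, so the collected posts must still be filtered to this date window afterwards.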

In [16]:
# https://medium.com/bitgrit-data-science-publication/sentiment-analysis-on-reddit-tech-news-with-python-cbaddb8e9bb6

import pandas as pd
import numpy as np

# misc
import datetime as dt
from pprint import pprint
from itertools import chain

# reddit crawler
import praw

# converting created dates from reddit API into human readable format
from datetime import datetime

# make directories for data collection
import os

In [2]:
import praw
import json

# Read the OAuth credentials from reddit.json in the working directory
with open('reddit.json') as f:
    credentials = json.load(f)

# Build the PRAW client by unpacking the five credential fields as keyword arguments;
# a missing field raises KeyError before any client is created.
reddit = praw.Reddit(**{
    field: credentials[field]
    for field in ('client_id', 'client_secret', 'user_agent', 'redirect_uri', 'refresh_token')
})

# Smoke test: print whatever reddit.user.me() returns for these credentials
print(reddit.user.me())

Zealousideal-Land259


## Scrape posts from Reddit (top-ranked search results)

In [11]:
import time

# Reddit API restricts to 100 queries per minute
# Retry every 10 seconds 12 times for a total of 2 minutes
def retry_function(func, *args, max_attempts=12, delay=10, **kwargs):
    """Call ``func(*args, **kwargs)``, retrying on any ``Exception``.

    Parameters
    ----------
    func : callable
        The function to invoke.
    *args, **kwargs
        Forwarded unchanged to ``func`` on every attempt.
    max_attempts : int, keyword-only
        Maximum number of calls before giving up.
    delay : int or float, keyword-only
        Seconds to sleep between a failed attempt and the next one.

    Returns
    -------
    The value returned by the first successful call, or ``None`` once
    ``max_attempts`` calls have all raised.
    """
    failures = 0
    while failures < max_attempts:
        try:
            outcome = func(*args, **kwargs)
        except Exception as err:
            print(f"An error occurred: {err}")
            failures += 1
            if failures >= max_attempts:
                # Out of attempts: skip the sleep and fall through to the give-up path.
                break
            print(f"Retrying attempt #{failures} in {delay} seconds...")
            time.sleep(delay)
        else:
            # First call that does not raise wins.
            return outcome
    print("Max attempts reached. Continuing loop.")
    return None


In [17]:
# dataframe structure: column order shared by every per-query CSV and by the
# cumulative store below
TOPIC_COLUMNS = ["date", "author", "title", "ups", "downs", "score",
                 "id", "url", "comms_num", "created", "body"]

# Cumulative store: rows from every successful search in this cell, in collection order.
# The same submission can appear more than once if it matches several queries.
topics_dict = {column: [] for column in TOPIC_COLUMNS}

# Define the subreddits to search
subreddits = ['artificial', 'machinelearning', 'learnmachinelearning', 'deeplearning', 'chatgpt', 'generativeAI']
queries = ['gpt-4.0', 'gpt-3.0', 'gpt', 'openai', 'open-ai', 'llm']


def sub_search(query, target=None):
    """Search one subreddit for ``query`` and return the hits as a DataFrame.

    Rows are collected in a local list, so a call that fails part-way leaves no
    partial rows behind and a retry cannot duplicate them.

    Parameters
    ----------
    query : str
        Keyword passed to the subreddit search (plain syntax, sorted by top,
        all time).
    target : optional
        Subreddit handle to search, as returned by ``reddit.subreddit(name)``.
        Defaults to the module-level ``sub`` set by the collection loop.

    Returns
    -------
    pandas.DataFrame
        One row per submission with the columns in ``TOPIC_COLUMNS``; an empty
        frame with those columns when the search yields nothing.
    """
    if target is None:
        target = sub  # fall back to the handle most recently set by the loop below

    records = []
    # search by keyword
    for submission in target.search(query=query, sort='top', syntax='plain', time_filter='all'):
        records.append({
            # NOTE(review): PRAW documents `created_utc` as the UTC creation time;
            # `created` is kept here unchanged. Confirm it is also a UTC timestamp.
            "date": datetime.utcfromtimestamp(submission.created),
            "author": submission.author,
            "title": submission.title,
            "ups": submission.ups,
            "downs": submission.downs,
            "score": submission.score,
            "id": submission.id,
            "url": submission.url,
            "comms_num": submission.num_comments,
            "created": submission.created,
            "body": submission.selftext,
        })
    # Build the frame once, after the loop, instead of once per submission.
    return pd.DataFrame(records, columns=TOPIC_COLUMNS)


# Query each subreddit by name and write one CSV per (subreddit, query) pair
for subreddit in subreddits:
    sub = reddit.subreddit(subreddit)
    out_dir = os.path.join('data', subreddit)
    # exist_ok so that re-running the cell does not fail on existing directories
    os.makedirs(out_dir, exist_ok=True)

    for query in queries:
        # Call the retry_function with the search; None means every attempt raised
        query_data = retry_function(sub_search, query, sub)
        if query_data is None:
            continue
        if query_data.empty:
            print(f"No results for '{query}' in r/{subreddit}; no CSV written.")
            continue

        # Each CSV holds only the rows returned for this subreddit/query pair.
        query_data.to_csv(os.path.join(out_dir, query + '.csv'))

        # Keep the running total available to later cells via topics_dict.
        for column in TOPIC_COLUMNS:
            topics_dict[column].extend(query_data[column].tolist())

# Cumulative frame of everything collected above, for inspection in later cells
topics_data = pd.DataFrame(topics_dict)



In [13]:
# Summary statistics for the numeric columns of the collected posts.
# A cell displays only its last expression, so this single line is all that is needed.
topics_data.describe()

Unnamed: 0,ups,downs,score,comms_num,created
count,2791.0,2791.0,2791.0,2791.0,2791.0
mean,1033.709065,0.0,1033.709065,123.653887,1670104000.0
std,3540.141712,0.0,3540.141712,358.171136,42755760.0
min,0.0,0.0,0.0,0.0,1449873000.0
25%,13.0,0.0,13.0,3.0,1672180000.0
50%,80.0,0.0,80.0,16.0,1683495000.0
75%,352.0,0.0,352.0,87.0,1694871000.0
max,65401.0,0.0,65401.0,9029.0,1708266000.0
