# Reddit API data collection

## Reddit forums: r/artificial, r/machinelearning, r/bigscience
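Use the Reddit API (via PRAW) to collect posts mentioning GPT-3 and other AI technologies from November 1, 2022 to January 31, 2023. Note: the search cells below request `time_filter='all'`, so this date window is not enforced during collection and has to be applied when the collected data are filtered afterwards.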

In [39]:
# https://medium.com/bitgrit-data-science-publication/sentiment-analysis-on-reddit-tech-news-with-python-cbaddb8e9bb6

import pandas as pd
import numpy as np

# misc
import datetime as dt
from pprint import pprint
from itertools import chain

# reddit crawler
import praw

# converting created dates from reddit API into human readable format
from datetime import datetime

# make directories for data collection
import os

# copy data structure
import copy

# regular expression search PRAW results
import re

In [74]:
import praw
import json

# Read every stored Reddit account from the local credentials file
with open('reddit.json') as credentials_file:
    reddit_users = json.load(credentials_file)

# Pick the account used for this crawl (another stored key: 'reddit_user4')
credentials = reddit_users['Zealousideal-Land259']

# Build the PRAW client from the selected account's settings
reddit = praw.Reddit(**{
    setting: credentials[setting]
    for setting in ('client_id', 'client_secret', 'user_agent',
                    'redirect_uri', 'refresh_token')
})

# Connection check: ask the API which user we are and show the name
reddit_user = str(reddit.user.me())
print(reddit_user)

Zealousideal-Land259


## Scrape top posts from Reddit

In [41]:
import time

# Reddit API restricts to 100 queries per minute.
# With the defaults a failing call is attempted 12 times, 10 seconds apart
# (about 2 minutes in total).
def retry_function(func, *args, max_attempts=12, delay=10, **kwargs):
    """Call ``func(*args, **kwargs)``, retrying when it raises.

    Args:
        func: Callable to invoke.
        *args: Positional arguments forwarded to ``func``.
        max_attempts: Maximum number of times ``func`` is called.
        delay: Seconds to sleep between two consecutive attempts.
        **kwargs: Keyword arguments forwarded to ``func``.

    Returns:
        Whatever ``func`` returns on its first successful call, or ``None``
        when every attempt raised (or when ``max_attempts`` is not positive).
        A ``None`` result is therefore ambiguous if ``func`` itself can
        return ``None``.
    """
    for attempt in range(1, max_attempts + 1):
        try:
            return func(*args, **kwargs)
        except Exception as e:
            # Deliberately broad: any failure of the wrapped call is reported
            # and retried rather than aborting the surrounding loop.
            print(f"An error occurred: {e}")
            # No point sleeping after the final attempt has failed.
            if attempt < max_attempts:
                print(f"Retrying attempt #{attempt} in {delay} seconds...")
                time.sleep(delay)
    print("Max attempts reached. Continuing loop.")
    return None


In [42]:
# Communities to crawl, stored in alphabetical order
subreddits = sorted([
    'artificial',
    'chatgpt',
    'deeplearning',
    'machinelearning',
    'learnmachinelearning',
])

# Search terms to look for, stored in alphabetical order
queries = sorted([
    'chatgpt',
    'gpt-4',
    'gpt-3',
    'gpt',
    'openai',
    'open-ai',
    'llm',
])

In [43]:
# Column layout shared by every scraped table: each key is a column name and
# each value is that column's (initially empty) list of entries.
topics_dict_template = {
    column: []
    for column in (
        "subreddit", "query", "sort", "date", "title", "author", "stickied",
        "upvote_ratio", "score", "id", "url", "num_comments", "created", "body",
    )
}

# One case-insensitive, whole-word pattern per query, compiled once so the
# controversial listing does not recompile them for every submission.
query_patterns = [
    (query, re.compile(r'\b' + re.escape(query) + r'\b', re.IGNORECASE))
    for query in queries
]


def _append_submission(topics_dict, subreddit, query, sort, submission):
    """Append one submission as a new row of ``topics_dict``.

    Args:
        topics_dict: Dict of column name -> list, laid out like
            ``topics_dict_template``; it is modified in place.
        subreddit: Name of the subreddit the submission was found in.
        query: Search term that matched the submission.
        sort: Label of the listing/sort order the submission came from.
        submission: Submission object yielded by the PRAW listing.
    """
    row = {
        "subreddit": subreddit,
        "query": query,
        "sort": sort,
        "date": datetime.utcfromtimestamp(submission.created),
        "title": submission.title,
        "author": submission.author,
        "stickied": submission.stickied,
        "upvote_ratio": submission.upvote_ratio,
        "score": submission.score,
        "id": submission.id,
        "url": submission.url,
        "num_comments": submission.num_comments,
        "created": submission.created,
        "body": submission.selftext,
    }
    for column, value in row.items():
        topics_dict[column].append(value)


# Query for a subreddit by name
for subreddit in subreddits:
    sub = reddit.subreddit(subreddit)
    sub_dir = '/'.join(['data', reddit_user, subreddit])
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs
    os.makedirs(sub_dir, exist_ok=True)

    def sub_search(query):
        """Search the current subreddit for ``query`` under each sort order.

        Returns:
            A DataFrame with the template's columns and one row per result
            and sort order. The frame is empty (but still has its columns)
            when the search yields nothing, instead of raising.
        """
        topics_dict = copy.deepcopy(topics_dict_template)
        # search by keyword
        for sort in ['top', 'comments', 'relevance']:
            search_results = sub.search(query=query, sort=sort, syntax='plain', time_filter='all')
            for submission in search_results:
                _append_submission(topics_dict, subreddit, query, sort, submission)
        # Build the frame once, after all rows are collected.
        return pd.DataFrame(topics_dict)

    # PRAW search function
    for query in queries:
        # Retry on API errors; None means every attempt failed.
        topics_data = retry_function(sub_search, query)
        # As before, no file is written when there is nothing to save.
        if topics_data is not None and not topics_data.empty:
            topics_data.to_csv('/'.join([sub_dir, query + '.csv']))

    def sub_controversial():
        """Collect controversial posts of the current subreddit that mention a query.

        A submission is recorded once per query whose whole-word,
        case-insensitive pattern matches its title or body.

        Returns:
            A DataFrame with the template's columns; empty when nothing matched.
        """
        topics_dict = copy.deepcopy(topics_dict_template)
        for submission in sub.controversial(time_filter='all'):
            # search for query text in controversial posts
            for query, pattern in query_patterns:
                if pattern.search(submission.title) or pattern.search(submission.selftext):
                    _append_submission(topics_dict, subreddit, query, 'controversial', submission)
        return pd.DataFrame(topics_dict)

    # PRAW controversial function
    topics_data = retry_function(sub_controversial)
    if topics_data is not None and not topics_data.empty:
        topics_data.to_csv('/'.join([sub_dir, 'controversial.csv']))



In [44]:
# Output directory for the authenticated account. It does not depend on the
# subreddit, so it is created once, before the loop.
sub_dir = '/'.join(['data', reddit_user])
os.makedirs(sub_dir, exist_ok=True)

# dataframe structure: one row per subreddit
subreddit_dict = {"name": [], "subscribers": []}

for subreddit in subreddits:
    sub = reddit.subreddit(subreddit)
    subreddit_dict["name"].append(subreddit)
    subreddit_dict["subscribers"].append(sub.subscribers)

# Save the subscriber counts next to the per-subreddit folders, then display them
subreddit_data = pd.DataFrame(subreddit_dict)
subreddit_data.to_csv('/'.join([sub_dir, 'subreddits.meta']))
subreddit_data

Unnamed: 0,name,subscribers
0,artificial,713596
1,chatgpt,4446512
2,deeplearning,148505
3,learnmachinelearning,383386
4,machinelearning,2868752


In [45]:
# `topics_data` is whatever the scrape cell assigned last, i.e. the result of
# the controversial-posts pass for the final subreddit in the loop.
# Summarise its numeric columns (a bare `topics_data` line before this one
# would display nothing, since only a cell's last expression is rendered).
topics_data.describe()

Unnamed: 0,upvote_ratio,score,num_comments,created
count,32.0,32.0,32.0,32.0
mean,0.508125,6.09375,37.53125,1675756000.0
std,0.048423,15.715945,23.504268,41175600.0
min,0.42,0.0,0.0,1551651000.0
25%,0.4775,0.0,15.5,1678618000.0
50%,0.5,0.0,37.0,1686365000.0
75%,0.5275,2.75,56.0,1698330000.0
max,0.63,83.0,77.0,1707539000.0
