# Reddit API data collection

## Reddit forums: r/artificial, r/machinelearning, r/bigscience
Use APIs to collect data mentioning GPT-3 and other AI technologies from November 1 2022 to January 31 2023.

In [None]:
# https://medium.com/bitgrit-data-science-publication/sentiment-analysis-on-reddit-tech-news-with-python-cbaddb8e9bb6

import pandas as pd
import numpy as np

# misc
import datetime as dt
from pprint import pprint
from itertools import chain

# reddit crawler
import praw

# sentiment analysis
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.tokenize import word_tokenize, RegexpTokenizer # tokenize words
from nltk.corpus import stopwords

# visualization
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams["figure.figsize"] = (10, 8) # default plot size
import seaborn as sns
sns.set(style='whitegrid', palette='Dark2')
from wordcloud import WordCloud

In [1]:
import praw
import json

# Load the JSON file
with open('reddit.json') as f:
    credentials = json.load(f)

# Instantiate praw.Reddit object
reddit = praw.Reddit(
    client_id=credentials['client_id'],
    client_secret=credentials['client_secret'],
    user_agent=credentials['user_agent'],
    redirect_uri=credentials['redirect_uri'],
    refresh_token=credentials['refresh_token']
)

# test connection
print(reddit.user.me())

Zealousideal-Land259


## Scrape controversial posts from reddit
GET [/r/subreddit]/sortreadrss support
→ [/r/subreddit]/top
→ [/r/subreddit]/controversial
This endpoint is a listing.

t	
one of (hour, day, week, month, year, all)

after	
fullname of a thing

before	
fullname of a thing

count	
a positive integer (default: 0)

limit	
the maximum number of items desired (default: 25, maximum: 100)

show	
(optional) the string all

sr_detail	
(optional) expand subreddits

In [15]:
from datetime import datetime

# Define the subreddits to search
subreddits = ['artificial', 'machinelearning', 'bigscience']

# Define the date range
start_date = datetime(2022, 11, 1)
end_date = datetime(2023, 1, 31)

# Query for a subreddit by name
sub = reddit.subreddit('artificial')

# Can query for top posts for a time period, the top 20 posts, or the
# 10 most controversial posts of the past month
top_posts_of_the_day = sub.top('day')
hot_posts = sub.hot(limit=20)
controversial_posts = sub.controversial(time_filter='month', limit=10)

# Can also search for use of a keyword
gpt_4 = sub.search('gpt-4.0')

# Sample of some of the more interesting data about a 
# submission that could make for interesting analysis
for submission in controversial_posts:
    print("TITLE: {}".format(submission.title))
    print("AUTHOR: {}".format(submission.author))
    parsed_date = datetime.utcfromtimestamp(submission.created)
    year = parsed_date.year
    month = parsed_date.month
    day = parsed_date.day

    print("CREATED: {}".format(parsed_date))
    print("COMMENTS: {}".format(submission.num_comments))
    print("UPS: {}".format(submission.ups))
    print("DOWNS: {}".format(submission.downs))
    print("URL: {}".format(submission.url))

Call this function with 'time_filter' as a keyword argument.
  top_posts_of_the_day = sub.top('day')


TITLE: I found out my company implemented an AI program that would “save the company money” in December
AUTHOR: Hey_you_-_-
CREATED: 2024-01-31 13:54:31
COMMENTS: 113
UPS: 19
DOWNS: 0
URL: https://www.reddit.com/r/artificial/comments/1afid9k/i_found_out_my_company_implemented_an_ai_program/
TITLE: What is appealing about AI-created music?
AUTHOR: Complex_Valuable_833
CREATED: 2024-01-28 19:11:55
COMMENTS: 71
UPS: 0
DOWNS: 0
URL: https://www.reddit.com/r/artificial/comments/1adad7s/what_is_appealing_about_aicreated_music/
TITLE: Poisoned AI went rogue during training and couldn't be taught to behave again in 'legitimately scary' study
AUTHOR: Thekingofchrome
CREATED: 2024-01-31 20:39:23
COMMENTS: 18
UPS: 11
DOWNS: 0
URL: https://www.livescience.com/technology/artificial-intelligence/legitimately-scary-anthropic-ai-poisoned-rogue-evil-couldnt-be-taught-how-to-behave-again
TITLE: Is the reason AI is bad at drawing hands because there have been so many people on the Internet who said it is

In [9]:
reddit.subreddit("test")

Subreddit(display_name='test')