# Extracting Reddit Information Script

### The code in this section extracts post and comment information from the relevant state subreddits
**NOTE: It is not possible to extract the necessary information for 10 states due to the limits of Reddit's API. The relevant posts from the requested 3-week time periods before and after the election cannot be retrieved because of the number of posts on these specific state subreddits. The states in question are: {'northcarolina', 'minnesota', 'newjersey', 'ohio', 'massachusetts', 'pennsylvania', 'wisconsin', 'texas', 'florida', 'connecticut'}**

In [25]:
# Necessary libraries
import json
import os
import re
import time
from datetime import datetime, UTC
from zoneinfo import ZoneInfo

import praw
import pytz

In [26]:
# EXTRACT SUBREDDIT NAMES FROM THE TEXT FILE
# Only whitespace-separated tokens of the form "r/<name>" are kept; any other
# text in the file is ignored.
with open('state_reddits.txt', 'r') as file:
    state_subreddit_text = file.read()

# str.split() with no arguments splits on any run of whitespace (spaces,
# newlines, tabs) and never yields empty tokens, so no length check is needed.
text_list = state_subreddit_text.split()
state_subreddits = [sub.removeprefix("r/") for sub in text_list if sub.startswith("r/")]

# Delete invalid state subreddits (the ones that cannot be extracted, see the note at the top)
invalid_states = {'northcarolina', 'minnesota', 'newjersey', 'ohio', 'massachusetts', 'pennsylvania', 'wisconsin', 'texas', 'florida', 'connecticut'}
valid_state_subreddits = [state for state in state_subreddits if state not in invalid_states]
print(valid_state_subreddits)

['california', 'michigan', 'colorado', 'oregon', 'hawaii', 'oklahoma', 'maryland', 'arizona', 'virginia', 'maine', 'indiana', 'iowa', 'washington', 'newhampshire', 'alaska', 'louisiana', 'vermont', 'newyork', 'arkansas', 'alabama', 'kentucky', 'southcarolina', 'georgia', 'montana', 'delaware', 'utah', 'rhodeisland', 'missouri', 'tennessee', 'nebraska', 'illinois', 'westvirginia', 'newmexico', 'mississippi', 'kansas', 'northdakota', 'idaho', 'southdakota', 'wyoming', 'nevada']


In [27]:
# Create Reddit Client using credentials supplied through environment variables.
# Secrets must not live in the notebook: export REDDIT_CLIENT_ID,
# REDDIT_CLIENT_SECRET and REDDIT_REFRESH_TOKEN before running this cell.
# A missing variable raises KeyError naming the variable that is not set.
# NOTE(review): the credentials that used to be hard-coded here have been
# exposed in source and should be revoked and regenerated.
reddit = praw.Reddit(
    client_id=os.environ["REDDIT_CLIENT_ID"],
    client_secret=os.environ["REDDIT_CLIENT_SECRET"],
    refresh_token=os.environ["REDDIT_REFRESH_TOKEN"],
    user_agent="python:US_Election:v1.0 (by u/Watermelon_boiii)",
)

In [28]:
# Define helper variables and functions to extract post data

# Define time cutoffs as UTC epoch seconds, directly comparable to a post's
# "created_utc" value.
# Each cutoff is 8pm US Eastern wall-clock time. ZoneInfo resolves the correct
# UTC offset for each date (daylight time on Oct 15th, standard time on the
# November dates). Passing a pytz timezone straight to datetime(tzinfo=...)
# instead applies New York's historical local-mean-time offset (about -4:56),
# which silently shifts every cutoff.
eastern_tz = ZoneInfo("America/New_York")

# Oct 15th 8pm Eastern
before_elec_time = datetime(2024, 10, 15, 20, 0, 0, tzinfo=eastern_tz).timestamp()
# Nov 5th 8pm Eastern
election_time = datetime(2024, 11, 5, 20, 0, 0, tzinfo=eastern_tz).timestamp()
# Nov 26th 8pm Eastern
after_elec_time = datetime(2024, 11, 26, 20, 0, 0, tzinfo=eastern_tz).timestamp()


def get_all_state_posts(reddit, states, date_cutoff):
    """Collect posts for every subreddit in `states`.

    Returns a dict mapping each subreddit name to whatever
    get_subreddit_posts returns for it with the same client and cutoff.
    """
    return {
        state: get_subreddit_posts(reddit, state, date_cutoff)
        for state in states
    }

def get_subreddit_posts(reddit, state, date_cutoff, limit=1000):
    """Return the newest posts of one subreddit that are newer than a cutoff.

    Args:
        reddit: Reddit client; must provide `subreddit(name).new(limit=...)`.
        state: Name of the subreddit to read.
        date_cutoff: UTC epoch seconds. Iteration stops at the first post
            whose "created_utc" is at or before this value.
        limit: Maximum number of posts requested from the "new" listing.
            Defaults to 1000, the value this script has always used.

    Returns:
        A list of dicts, one per post, in the order the listing yields them.
        Each dict is a shallow copy of the post object's attributes.
    """
    posts = []
    for raw_post in reddit.subreddit(state).new(limit=limit):
        attributes = vars(raw_post)
        # Stop at the first post that is too old; everything after it in the
        # listing is skipped as well.
        if attributes["created_utc"] <= date_cutoff:
            break
        # Copy so that later in-place cleaning of the returned dicts does not
        # overwrite attributes on the live post object itself.
        posts.append(dict(attributes))
    return posts

def split_posts_by_date(posts, election_time):
    """Split each state's post list at the first post older than election_time.

    For every state, the leading run of posts whose "created_utc" is at or
    after `election_time` goes to the "after" mapping; the remainder of the
    list (from the first older post onward) goes to the "before" mapping.

    Returns:
        A `(before_posts, after_posts)` tuple of dicts keyed by state.
    """
    before_posts, after_posts = {}, {}
    for state, state_posts in posts.items():
        # Advance past the leading posts that are at or after the election time.
        split_at = 0
        while (
            split_at < len(state_posts)
            and state_posts[split_at]["created_utc"] >= election_time
        ):
            split_at += 1
        after_posts[state] = state_posts[:split_at]
        before_posts[state] = state_posts[split_at:]
    return before_posts, after_posts

def write_posts_to_json(file_name, data):
    """Clean every post in `data` in place, then write `data` to `file_name` as JSON.

    `data` maps each state to a list of post dicts; each post is passed
    through clean_post before the whole mapping is dumped with 4-space indent.
    """
    for state_posts in data.values():
        for post in state_posts:
            clean_post(post)

    with open(file_name, 'w') as out_file:
        json.dump(data, out_file, indent=4)

def check_for_objects(value):
    """Return True when `value` is an instance of one of the listed PRAW classes.

    These are the object types the cleaning helpers replace with None before
    the data is written out as JSON.
    """
    reddit_objects = (
        praw.models.Subreddit,
        praw.reddit.Reddit,
        praw.reddit.Redditor,
        praw.models.comment_forest.CommentForest,
        praw.models.Submission,
        praw.models.reddit.poll.PollData,
    )
    return isinstance(value, reddit_objects)

def clean_post(input_dict):
    """Replace, in place, every value flagged by check_for_objects with None."""
    # Snapshot the keys first; values are overwritten but no key is added or removed.
    for field in list(input_dict):
        if check_for_objects(input_dict[field]):
            input_dict[field] = None

In [29]:
# Execute above functions, creating two json files, one for posts from before the election and one for posts after the election
# 1. Fetch posts newer than before_elec_time for every valid state subreddit.
all_posts = get_all_state_posts(reddit, valid_state_subreddits, before_elec_time)
# 2. Split each state's posts around election_time.
before_elec_posts, after_elec_posts = split_posts_by_date(all_posts, election_time)
# 3. Write each half to its own file. Writing also cleans the post dicts in place.
write_posts_to_json("before_election_posts_data.json", before_elec_posts)
write_posts_to_json("after_election_posts_data.json", after_elec_posts)

In [42]:
# Define helper variables and functions to extract comment data
# WARNING: THIS IS A LOT OF DATA

def get_comments_from_post(reddit, post, replace_more_limit=0):
    """Return the comments of one post as a list of attribute dicts.

    Args:
        reddit: Reddit client; must provide `submission(url=...)`.
        post: Post dict containing a "permalink" entry (path relative to
            https://www.reddit.com).
        replace_more_limit: Passed to `comments.replace_more(limit=...)`.
            The default of 0 keeps the previous behaviour: per the PRAW
            documentation, unresolved "more comments" placeholders are
            removed without being fetched, so only the comments already
            loaded with the submission are returned. Pass None to have
            every placeholder resolved (many more API requests).

    Returns:
        A list with one dict per comment in the flattened comment forest.
        Each dict is a shallow copy of the comment object's attributes.
    """
    url = "https://www.reddit.com" + post["permalink"]
    submission = reddit.submission(url=url)
    submission.comments.replace_more(limit=replace_more_limit)
    # Copy each attribute dict so that later in-place cleaning of the returned
    # dicts does not overwrite attributes on the live comment objects.
    return [dict(vars(raw_comment)) for raw_comment in submission.comments.list()]

def get_comments_from_subreddit(reddit, state_posts):
    """Map each post's "name" to the comments fetched for that post."""
    return {
        post["name"]: get_comments_from_post(reddit, post)
        for post in state_posts
    }

def get_all_comments(reddit, posts_dict):
    """Fetch comments for every state in `posts_dict`.

    Returns a dict keyed by state whose values are the per-post comment
    mappings produced by get_comments_from_subreddit.
    """
    return {
        state: get_comments_from_subreddit(reddit, state_posts)
        for state, state_posts in posts_dict.items()
    }

def write_comments_to_json(file_name, data):
    """Clean every comment in `data` in place, then write `data` to `file_name` as JSON.

    `data` is nested as state -> post id -> list of comment dicts; each
    comment is passed through clean_comment before the dump (4-space indent).
    """
    for comments_by_post in data.values():
        for post_comments in comments_by_post.values():
            for comment in post_comments:
                clean_comment(comment)

    with open(file_name, 'w') as out_file:
        json.dump(data, out_file, indent=4)

def check_for_objects(value):
    """Return True when `value` is an instance of any of the listed PRAW classes."""
    reddit_objects = {praw.models.Subreddit, praw.reddit.Reddit, praw.reddit.Redditor, praw.models.comment_forest.CommentForest, praw.models.Submission, praw.models.reddit.poll.PollData}
    return any(isinstance(value, reddit_obj) for reddit_obj in reddit_objects)

def clean_comment(input_dict):
    """Replace, in place, every value flagged by check_for_objects with None."""
    # Collect the affected keys first, then blank them out; keys are only
    # overwritten, never added or removed.
    flagged = [key for key, value in input_dict.items() if check_for_objects(value)]
    for key in flagged:
        input_dict[key] = None



In [44]:
# Find and save comments into a JSON file
# NOTE: only run locally with states nevada and wyoming due to size of data. Use normal variables, not sample variables in actual run
# The two "sample" dicts below restrict the run to the nevada and wyoming posts.
sample_before_elec_posts = {"nevada": before_elec_posts["nevada"], "wyoming": before_elec_posts["wyoming"]}
sample_after_elec_posts = {"nevada": after_elec_posts["nevada"], "wyoming": after_elec_posts["wyoming"]}

# Comments on posts from before the election (sample states only).
before_elec_comments = get_all_comments(reddit, sample_before_elec_posts)
# Full run over every state: swap in the line below.
# before_elec_comments = get_all_comments(reddit, before_elec_posts)
write_comments_to_json("before_election_comments_data.json", before_elec_comments)
# Comments on posts from after the election (sample states only).
after_elec_comments = get_all_comments(reddit, sample_after_elec_posts)
# Full run over every state: swap in the line below.
# after_elec_comments = get_all_comments(reddit, after_elec_posts)
write_comments_to_json("after_election_comments_data.json", after_elec_comments)

# End of script

The rest of the notebook is exploratory code used to help write this script. It is left commented out for now in case it is necessary in the future.

In [45]:
# num_posts = 1000
# curr_cuttoffs, posts = find_posts_in_timeframe(reddit, list(states_to_find), num_posts)

# for state, value in curr_cuttoffs.items():
#     if value <= 900:
#         continue
#     else:
#         del posts[state]
#         states_to_find.remove(state)
#         state_cutoffs[state] = value

# invalid_states = states_to_find
# print(state_cutoffs)
# print(invalid_states)

In [None]:
# import praw
# import time
# from datetime import datetime, timedelta

# import logging

# logging.basicConfig(level=logging.DEBUG)

# # Initialize reddit client
# reddit = praw.Reddit(
#     client_id="P1pvruNSVTvfHuAPk9jIPw",
#     client_secret="6x-bxO9xIUyyAv640iJpqkZxoSpkKA",
#     redirect_uri="http://localhost:8080",
#     user_agent="python:US_Election:v1.0 (by u/Watermelon_boiii)",
# )

# # You need to obtain an authorization URL
# auth_url = reddit.auth.url(scopes=['identity, read'], state="random", duration='permanent')

# # This URL will guide you to Reddit's OAuth page to authorize access
# print("Visit this URL to authorize:", auth_url)

In [None]:
# code = "HAWkqxUwIPFqk2hggjiUwLAvXbQVoA"
# print(reddit.auth.authorize(code))
# print(reddit.user.me())


In [None]:
# # Try using refresh token for reddit client
# reddit = praw.Reddit(
#     client_id="P1pvruNSVTvfHuAPk9jIPw",
#     client_secret="6x-bxO9xIUyyAv640iJpqkZxoSpkKA",
#     refresh_token="2074775942901-SVxkdV43tApwU_FMAN4piRs-jlzXIQ",
#     user_agent="python:US_Election:v1.0 (by u/Watermelon_boiii)",
# )
# print(reddit.auth.scopes())

# posts = reddit.subreddit("NewYork").new(limit=5)

In [None]:
# from datetime import datetime, timedelta

# def get_posts(reddit, state):
#     posts = []
#     data = reddit.subreddit(state).new(limit=50)
#     for raw_post in data:
#         post = vars(raw_post)
#         if post_too_old(post):
#             break
#         url = "https://www.reddit.com" + post["permalink"]
#         # post["comments"] = get_comments(reddit, url)
#         posts.append(post)
#     return posts

# def post_too_old(post):
#     # UTC Unix timestamp
#     timestamp = post["created_utc"]
    
#     # # Convert Unix timestamp to datetime
#     # dt = datetime.utcfromtimestamp(timestamp)
#     # print("UTC datetime:", dt)

#     # Get the current UTC time
#     current_time = datetime.utcnow()
    
#     # Calculate the time 4 weeks ago
#     four_weeks_ago = current_time - timedelta(weeks=4)
    
#     # Convert the post's timestamp to a datetime object
#     post_time = datetime.utcfromtimestamp(timestamp)
#     return post_time <= four_weeks_ago
    

# def get_comments(reddit, url): 
#     url = "https://www.reddit.com" + post["permalink"]
#     submission = reddit.submission(url=url)
#     submission.comments.replace_more(limit=0)  # Replace "more comments" to fetch all comments
    
#     comments = []
#     for raw_comment in submission.comments.list():
#         comment = vars(raw_comment)
#         comments.append(comment)

#     return comments

# # posts = get_posts(reddit, "wyoming")
# # print(posts)
        

In [None]:
# # iterate over all state subreddits
# state_posts = {}
# for state in state_subreddits:
#     posts = get_posts(reddit, state)
#     state_posts[state] = posts


In [None]:
# import json
# # create json 
# filename = 'state_reddit_data.json'

# def clean_post(input_dict):
#     cleaned_dict = {}
#     for key, value in input_dict.items():
#         # Check if the value is an instance of praw.reddit.Reddit or any other non-serializable object
#         if check_for_objects(value) or key == "comments":
#             cleaned_dict[key] = None  # Replace Reddit object with None (or you could choose to remove it entirely)
#         else:
#             # Keep serializable values
#             cleaned_dict[key] = value

#     comments = []
#     for comment in input_dict["comments"]:
#         comments.append(clean_comment(comment))
#     cleaned_dict["comments"] = comments
        
    
#     return cleaned_dict

# def clean_comment(comment):
#     new_comment = {}
#     for key, value in comment.items():
#         if check_for_objects(value):
#             # Skip adding this key-value pair if it's a Subreddit object
#             continue
#         else:
#             # Keep all other values
#             new_comment[key] = value
#     return new_comment


# def check_for_objects(value):
#     reddit_objects = {praw.models.Subreddit, praw.reddit.Reddit, praw.reddit.Redditor, praw.models.comment_forest.CommentForest, praw.models.Submission, praw.models.reddit.poll.PollData}    
#     for reddit_obj in reddit_objects:
#         if isinstance(value, reddit_obj):
#             return True
#     return False


# # for state in state_posts:
# #     for post in state:
# #         clean_dict(post)

# clean_state_posts = {}
# for state in state_posts:
#     post_list = []
#     for post in state_posts[state]:
#         new_dict = clean_post(post)
#         post_list.append(new_dict)
#     clean_state_posts[state] = post_list
    
# # Open the file in write mode and save the dictionary as JSON
# with open(filename, 'w') as file:
#     json.dump(clean_state_posts, file, indent=4)

In [None]:
# # Get comments off a post
# post_url = " https://www.reddit.com/r/newyork/comments/1gvveui/weirdly_specific_regional_request_trying_to_find/"

# submission = reddit.submission(url=post_url)

# # Fetch the comments
# submission.comments.replace_more(limit=0)  # Replace "more comments" to fetch all comments

# # Print the comments
# for comment in submission.comments.list():
#     print(f"Comment by {comment.author}: {comment.body}\n")

In [None]:
# # Define the subreddit and the time frame (last two weeks)
# subreddit = reddit.subreddit("AITA")

# # Get the current time and the time from two weeks ago
# now = time.time()  # current time in seconds since epoch
# two_weeks_ago = now - (2 * 7 * 24 * 60 * 60)  # 2 weeks in seconds

# # Fetch new posts
# posts = subreddit.new(limit=100)  # Fetch 100 most recent posts (you can adjust this)
# filtered_posts = []

# for post in posts:
#     if post.created_utc >= two_weeks_ago:
#         filtered_posts.append(post)

# # Display the filtered posts
# for post in filtered_posts:
#     post_time = datetime.utcfromtimestamp(post.created_utc)
#     print(f"Title: {post.title}")
#     print(f"Score: {post.score}")
#     print(f"URL: {post.url}")
#     print(f"Author: {post.author}")
#     print(f"Created: {post_time}")
#     print(f"Comments: {post.num_comments}")
#     print("-" * 80)

In [None]:
# import requests
# import base64

# # Define Reddit OAuth2 endpoint and your credentials
# client_id="P1pvruNSVTvfHuAPk9jIPw"
# client_secret="6x-bxO9xIUyyAv640iJpqkZxoSpkKA"
# user_agent = "python:US_Election:v1.0 (by u/Watermelon_boiii)"

# # The token URL for Reddit
# token_url = "https://www.reddit.com/api/v1/access_token"

# # Prepare the authentication headers and payload
# auth = base64.b64encode(f"{client_id}:{client_secret}".encode("utf-8")).decode("utf-8")
# headers = {
#     "User-Agent": user_agent,
#     "Authorization": f"Basic {auth}"
# }

# # Define the payload to request the Bearer token
# payload = {
#     "grant_type": "client_credentials",
#     "scope": "read"
# }

# # Make the request to get the access token
# response = requests.post(token_url, headers=headers, data=payload)

# if response.status_code == 200:
#     # Extract the token from the response
#     token_data = response.json()
#     bearer_token = token_data["access_token"]
#     print("Bearer Token:", bearer_token)
# else:
#     print(f"Failed to get access token: {response.status_code}")

In [None]:
# import requests
# from datetime import datetime, timedelta
# import time

# # Define Reddit API endpoint for new posts
# url = "https://api.reddit.com/r/newyork/new"

# # Define the time frame for the last two weeks
# now = time.time()  # current time in seconds since epoch
# two_weeks_ago = now - (2 * 7 * 24 * 60 * 60)  # 2 weeks in seconds

# # Set parameters (you can adjust 'limit' as needed)
# params = {
#     "limit": 100,  # Number of posts to retrieve
#     "t": "all"     # Retrieve posts from any time frame
# }

# # Add the OAuth2 access token to the headers
# headers = {
#     "Authorization": f"Bearer {bearer_token}",
#     "User-Agent": "python:US_Election:v1.0 (by u/Watermelon_boiii)"
# }

# # Make the GET request
# response = requests.get(url, headers=headers, params=params)

# # Check if the request was successful
# if response.status_code == 200:
#     data = response.json()

#     # Filter posts to get only those from the last two weeks
#     filtered_posts = [
#         post['data'] for post in data['data']['children']
#         if post['data']['created_utc'] >= two_weeks_ago
#     ]
    
#     # Display the filtered posts
#     for post_data in filtered_posts:
#         title = post_data['title']
#         score = post_data['score']
#         post_url = post_data['url']
#         author = post_data['author']
#         created_time = datetime.utcfromtimestamp(post_data['created_utc'])
#         num_comments = post_data['num_comments']

#         print(f"Title: {title}")
#         print(f"Score: {score}")
#         print(f"URL: {post_url}")
#         print(f"Author: {author}")
#         print(f"Created: {created_time}")
#         print(f"Comments: {num_comments}")
#         print("-" * 80)
# else:
#     print(f"Failed to fetch posts: {response.status_code}")