# Collecting Reddit Data with Reddit API


In [2]:
!pip install python-dotenv
!pip install praw

Collecting praw
  Downloading praw-7.8.1-py3-none-any.whl (189 kB)
Collecting prawcore<3,>=2.4
  Downloading prawcore-2.4.0-py3-none-any.whl (17 kB)
Collecting update_checker>=0.18
  Downloading update_checker-0.18.0-py3-none-any.whl (7.0 kB)
Installing collected packages: update-checker, prawcore, praw
Successfully installed praw-7.8.1 prawcore-2.4.0 update-checker-0.18.0


In [3]:
# Import libraries
import praw
import datetime
import pandas as pd
import os
from dotenv import load_dotenv
import warnings
warnings.filterwarnings('ignore')

In [4]:
# Load environment variables
load_dotenv('x.env')

# Set up credentials from env file
reddit = praw.Reddit(
    client_id=os.getenv("CLIENT_ID"),
    client_secret=os.getenv("CLIENT_SECRET"),
    user_agent=os.getenv("USER_AGENT")
)

MissingRequiredAttributeException: Required configuration setting 'client_id' missing. 
This setting can be provided in a praw.ini file, as a keyword argument to the Reddit class constructor, or as an environment variable.

### Get Training Data

In [None]:
# Get training data range
start_date = datetime.datetime(2024, 11, 1)  # November 1, 2024
end_date = datetime.datetime(2025, 1, 31)  # January 31, 2025

# Convert datetime dates to Unix timestamps
start_timestamp = int(start_date.timestamp())
end_timestamp = int(end_date.timestamp())

# Query subreddit posts containing 'NVIDIA' within the date range in various subreddits
subreddits = ['stocks', 'investing', 'money', 'DayTrading', 'wallstreetbets']
# Create filtered posts list
train_filtered_posts = []
# Loop through subreddits
for subreddit in subreddits:
  curr_subreddit = reddit.subreddit(subreddit)

  # Search query with a time filter (limit to posts from Nov-Jan)
  posts = curr_subreddit.search("NVIDIA",
                            sort='new',
                            time_filter='all',
                            limit=None)
  # Loop through search results, if they are within time period, add to the list
  for post in posts:
      post_date = datetime.datetime.utcfromtimestamp(post.created_utc)
      if start_timestamp <= post.created_utc <= end_timestamp:
          train_filtered_posts.append({
              'Post_Title': post.title,
              'Post_URL': post.url,
              'Post_Text': post.selftext,
              'Date_Posted': post_date,
              'Upvotes': post.score,
              'Comments': post.num_comments,
              'Subreddit': post.subreddit.display_name,
          })

# Convert the list of dictionaries to pandas df
train_reddit_df = pd.DataFrame(train_filtered_posts)

print(len(train_reddit_df))
train_reddit_df.head()


In [None]:
train_reddit_df.to_csv('data/train_reddit_df_w_text.csv', index=False)

### Get Test Data

In [None]:
# Get test data range
start_date = datetime.datetime(2025, 2, 1)  # February, 1, 2025
end_date = datetime.datetime(2025, 2, 7)  # February, 7, 2025

start_timestamp = int(start_date.timestamp())
end_timestamp = int(end_date.timestamp())

# Query subreddit posts containing 'NVIDIA' within the date range in various subreddits
subreddits = ['stocks', 'investing', 'money', 'DayTrading', 'wallstreetbets']
test_filtered_posts = []
for subreddit in subreddits:
  curr_subreddit = reddit.subreddit(subreddit)

  # Search query with a time filter (limit to posts from Feb1-Feb7)
  posts = curr_subreddit.search("NVIDIA",
                            sort='new',
                            time_filter='all',
                            limit=None)
  # Loop through search results, append if within desired time range
  for post in posts:
      post_date = datetime.datetime.utcfromtimestamp(post.created_utc)
      if start_timestamp <= post.created_utc <= end_timestamp:
          test_filtered_posts.append({
              'Post_Title': post.title,
              'Post_URL': post.url,
              'Post_Text': post.selftext,
              'Date_Posted': post_date,
              'Upvotes': post.score,
              'Comments': post.num_comments,
              'Subreddit': post.subreddit.display_name,
          })

# Convert the list of dictionaries to a DataFrame
test_reddit_df = pd.DataFrame(test_filtered_posts)

print(len(test_reddit_df))
test_reddit_df.head()

In [None]:
test_reddit_df.to_csv('data/test_reddit_df_w_text.csv', index=False)