In [2]:
!pip install praw pandas numpy

Collecting praw
  Downloading praw-7.8.1-py3-none-any.whl.metadata (9.4 kB)
Collecting prawcore<3,>=2.4 (from praw)
  Downloading prawcore-2.4.0-py3-none-any.whl.metadata (5.0 kB)
Collecting update_checker>=0.18 (from praw)
  Downloading update_checker-0.18.0-py3-none-any.whl.metadata (2.3 kB)
Collecting websocket-client>=0.54.0 (from praw)
  Downloading websocket_client-1.8.0-py3-none-any.whl.metadata (8.0 kB)
Downloading praw-7.8.1-py3-none-any.whl (189 kB)
Downloading prawcore-2.4.0-py3-none-any.whl (17 kB)
Downloading update_checker-0.18.0-py3-none-any.whl (7.0 kB)
Downloading websocket_client-1.8.0-py3-none-any.whl (58 kB)
Installing collected packages: websocket-client, update_checker, prawcore, praw
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4/4[0m [praw]
[1A[2KSuccessfully installed praw-7.8.1 prawcore-2.4.0 update_checker-0.18.0 websocket-client-1.8.0


In [None]:
import os

import numpy as np
import pandas as pd
import praw

In [20]:
# --- CONFIG ---
# Reddit API credentials are read from the environment so that no secret is
# stored in the notebook. Export REDDIT_CLIENT_ID and REDDIT_CLIENT_SECRET
# before starting the kernel.
# SECURITY: the client id/secret that used to be hardcoded in this cell were
# exposed in plain text; revoke them and generate a new pair.
CLIENT_ID = os.environ.get('REDDIT_CLIENT_ID')
CLIENT_SECRET = os.environ.get('REDDIT_CLIENT_SECRET')
USER_AGENT = 'reddit-popularity-predictor'

if not (CLIENT_ID and CLIENT_SECRET):
    # Warn instead of raising so the rest of the configuration stays usable
    # (e.g. when only working with an already saved dataset).
    print(
        "WARNING: REDDIT_CLIENT_ID and/or REDDIT_CLIENT_SECRET are not set; "
        "set them before creating the Reddit client."
    )

# Subreddits to crawl and the listing size requested from each of them.
SUBREDDITS = ['technology', 'sports', 'funny', 'science', 'politics', 'gaming', 'movies']
POSTS_PER_SUBREDDIT = 750
# How many posts per popularity bucket to keep. In the visible notebook this is
# only referenced by the commented-out balancing step.
SAMPLE_PER_BUCKET = 300

In [None]:
# Initialize Reddit API
# Create the PRAW client from the values defined in the CONFIG cell.
# Only the app credentials and the user agent are supplied.
reddit_settings = {
    'client_id': CLIENT_ID,
    'client_secret': CLIENT_SECRET,
    'user_agent': USER_AGENT,
}
reddit = praw.Reddit(**reddit_settings)

In [None]:

def fetch_posts(subreddit, sort, limit):
    """Fetch submissions from one subreddit listing as a list of dicts.

    Uses the module-level ``reddit`` client.

    Args:
        subreddit: Subreddit name without the ``r/`` prefix.
        sort: Name of the listing method to call on the subreddit object
            (this notebook uses ``'new'`` and ``'top'``).
        limit: Value passed to the listing as its ``limit`` argument.

    Returns:
        One dict per submission with the keys ``subreddit``, ``id``,
        ``title``, ``selftext``, ``score``, ``num_comments``,
        ``created_utc``, ``flair``, ``upvote_ratio``, ``is_self``, ``nsfw``,
        ``author``, ``url`` and ``sort_type``. ``author`` is the author's
        string form, or ``None`` when the submission has no author.
    """
    listing = getattr(reddit.subreddit(subreddit), sort)
    posts = []
    for submission in listing(limit=limit):
        author = submission.author
        posts.append({
            'subreddit': subreddit,
            'id': submission.id,
            'title': submission.title,
            'selftext': submission.selftext,
            'score': submission.score,
            'num_comments': submission.num_comments,
            'created_utc': submission.created_utc,
            'flair': submission.link_flair_text,
            'upvote_ratio': submission.upvote_ratio,
            'is_self': submission.is_self,
            'nsfw': submission.over_18,
            # str(None) would store the literal text 'None', which is
            # indistinguishable from a real username in the dataset; keep a
            # missing author (presumably a deleted account - confirm against
            # the PRAW docs) as a genuine missing value instead.
            'author': str(author) if author is not None else None,
            'url': submission.url,
            'sort_type': sort
        })
    return posts


In [21]:
# Collect raw submission records: every subreddit's "new" listing first
# (raw, less biased), then every subreddit's "top" listing (popular).
all_posts = []
for sort_type in ('new', 'top'):
    for sub in SUBREDDITS:
        print(f"Fetching {sort_type} posts from r/{sub}...")
        all_posts += fetch_posts(sub, sort_type, POSTS_PER_SUBREDDIT)

# Build the DataFrame and drop repeated ids: a post may be returned by both
# the "new" and the "top" listing.
df = pd.DataFrame(all_posts).drop_duplicates(subset='id')

print(f"Total posts before bucketing: {len(df)}")

Fetching new posts from r/technology...
Fetching new posts from r/sports...
Fetching new posts from r/funny...
Fetching new posts from r/science...
Fetching new posts from r/politics...
Fetching new posts from r/gaming...
Fetching new posts from r/movies...
Fetching top posts from r/technology...
Fetching top posts from r/sports...
Fetching top posts from r/funny...
Fetching top posts from r/science...
Fetching top posts from r/politics...
Fetching top posts from r/gaming...
Fetching top posts from r/movies...
Total posts before bucketing: 10047


In [22]:
# Preview the first five rows of the collected posts.
df.head(n=5)

Unnamed: 0,subreddit,id,title,selftext,score,num_comments,created_utc,flair,upvote_ratio,is_self,nsfw,author,url,sort_type
0,technology,1lvds7w,Students can’t use AI to cheat on standardized...,,2,2,1752051000.0,Artificial Intelligence,1.0,False,False,ubcstaffer123,https://www.fraserinstitute.org/commentary/stu...,new
1,technology,1lvdi5e,Instagram wrongly accuses some users of breach...,,9,2,1752050000.0,Social Media,1.0,False,False,zsreport,https://www.bbc.com/news/articles/cy8kjdz9nr3o,new
2,technology,1lvcxoa,Turkey blocks X's Grok chatbot for alleged ins...,,18,3,1752047000.0,Social Media,0.91,False,False,BreakfastTop6899,https://www.reuters.com/business/media-telecom...,new
3,technology,1lvai0d,GlobalFoundries to make RISC-V CPUs — fab acqu...,,18,1,1752038000.0,Hardware,0.83,False,False,jhansonxi,https://www.tomshardware.com/pc-components/cpu...,new
4,technology,1lv9syt,Rubio impersonation campaign underscores broad...,,25,3,1752036000.0,Artificial Intelligence,0.82,False,False,BreakfastTop6899,https://www.axios.com/2025/07/08/rubio-ai-impe...,new


In [23]:
# --- Bucket scores into low/medium/high popularity ---

# Thresholds are the 0.33 and 0.66 quantiles of the score column, which
# splits the posts into three roughly equal groups.
quantiles = df['score'].quantile([0.33, 0.66]).values
low_threshold, high_threshold = quantiles


def bucket_score(score):
    """Map a post score to its popularity bucket.

    Returns 'low' for scores up to and including ``low_threshold``, 'medium'
    for scores above that up to and including ``high_threshold``, and 'high'
    otherwise. Both thresholds are read from module-level variables.
    """
    if score <= low_threshold:
        return 'low'
    if score <= high_threshold:
        return 'medium'
    return 'high'


df['popularity_bucket'] = df['score'].map(bucket_score)

# Show how many posts landed in each bucket.
bucket_counts = df['popularity_bucket'].value_counts()
print(bucket_counts)

popularity_bucket
high      3415
low       3316
medium    3316
Name: count, dtype: int64


In [24]:
# Report the number of rows in the dataset.
print(f"Length of the dataset: {len(df)}")

Length of the dataset: 10047


In [None]:
# Skipped: the quantile-based buckets above are already roughly balanced, so no per-bucket down-sampling is applied.
# --- Balance dataset by sampling equal posts per bucket ---

# balanced_dfs = []

# for bucket in ['low', 'medium', 'high']:
    # bucket_df = df[df['popularity_bucket'] == bucket]
    # sampled_df = bucket_df.sample(n=min(SAMPLE_PER_BUCKET, len(bucket_df)), random_state=42)
    # balanced_dfs.append(sampled_df)

# balanced_df = pd.concat(balanced_dfs).reset_index(drop=True)

# print(f"Balanced dataset size: {len(balanced_df)}")
# print(balanced_df['popularity_bucket'].value_counts())

In [25]:
# --- Save dataset ---
df.to_csv('reddit_dataset.csv', index=False)
print("Saved dataset to reddit_dataset.csv")

Saved dataset to reddit_dataset.csv
