In [60]:
import os
import csv
import time
import praw
import shutil
import random
import requests
import kagglehub
import numpy as np
import pandas as pd
from datetime import datetime

In [57]:
# Reddit API client used by every cell that talks to Reddit.
# Credentials are read from environment variables instead of being hardcoded, so
# they never end up in the saved notebook or in version control:
#   REDDIT_CLIENT_ID      (required)
#   REDDIT_CLIENT_SECRET  (required)
#   REDDIT_USER_AGENT     (optional, falls back to the project's default string)
# A missing required variable raises KeyError immediately rather than failing
# later on the first API call.
# NOTE(review): any client id/secret that was previously committed in this
# notebook should be treated as leaked and regenerated.
reddit = praw.Reddit(
    client_id=os.environ["REDDIT_CLIENT_ID"],
    client_secret=os.environ["REDDIT_CLIENT_SECRET"],
    user_agent=os.environ.get(
        "REDDIT_USER_AGENT", "BotDetectorScript by u/Feeling_Sandwich3620"
    ),
)

In [19]:
# All project data lives in a "data" folder under the current working directory;
# create that folder on first run.
proj_dir = os.getcwd()
data_dir = os.path.join(proj_dir, "data")
os.makedirs(data_dir, exist_ok=True)

# Fetch the Reddit usernames dataset through kagglehub, then copy its users.csv
# from the location kagglehub reports into the project's data folder.
dataset = kagglehub.dataset_download("colinmorris/reddit-usernames")
source_path, dest_path = (
    os.path.join(folder, "users.csv") for folder in (dataset, data_dir)
)
shutil.copy(source_path, dest_path)

print("Path to dataset:", dest_path)

Path to dataset: /Users/bryan/Documents/College/Computer Science/CMSC 475 - Neural Networks/Project/Code/reddit-bot-project/data/users.csv


In [21]:
# Read the raw username dataset. The raw frame is kept under its own name so
# this cell can be re-run without filtering an already-filtered frame.
users_raw = pd.read_csv(dest_path)

# Filter possible non-human entries (accounts with too little activity) and keep
# one row per author.
# NOTE(review): 'n' is presumably the per-user activity count in the dataset - confirm.
users_df = users_raw[users_raw['n'] >= 3].drop_duplicates(subset=["author"])

# Randomly sample up to n usernames. The seed is fixed so the sample is
# reproducible; the size is capped so a small dataset does not raise ValueError.
n = 1000
sampled_users = users_df.sample(n=min(n, len(users_df)), random_state=1234)

# Save to a text file, one username per line. The same path variable feeds the
# write and the message so the two cannot drift apart.
human_accounts_path = "data/human_accounts.txt"
sampled_users["author"].to_csv(human_accounts_path, index=False, header=False)
print(f"Saved {len(sampled_users)} sampled usernames to {human_accounts_path}")

Saved 1000 sampled usernames to data/normal_accounts.txt


In [None]:
# Clean up bot usernames in place: keep only the first tab-separated field of
# each line, drop the "u/" marker, and trim surrounding whitespace.
bot_accounts_path = "data/bot_accounts.txt"

bot_usernames = []
with open(bot_accounts_path, "r") as infile:
    for line in infile:
        username = line.split('\t')[0].replace("u/", "").strip()
        # skip blank lines so no empty usernames are written back
        if username:
            bot_usernames.append(username)

# Overwrite the same file with one cleaned username per line.
with open(bot_accounts_path, "w") as outfile:
    for username in bot_usernames:
        outfile.write(f"{username}\n")

# Report the path that was actually written.
print(f"Cleaned bot usernames and saved to {bot_accounts_path}")

Cleaned bot usernames and saved to data/bot_usernames.txt


In [37]:
# Input/output locations, relative to the notebook's working directory.
HUMAN_FILE = "data/human_accounts.txt"          # one human username per line
BOT_FILE = "data/bot_accounts.txt"              # one bot username per line
COMBINED_FILE = "data/combined_accounts.csv"    # per-account feature rows (output)

def load_usernames(path, label):
    """Read usernames from a text file and pair each one with ``label``.

    Args:
        path: Text file containing one username per line.
        label: Value attached to every username read from the file.

    Returns:
        A list of ``(username, label)`` tuples in file order. Lines that are
        empty after stripping whitespace are skipped.
    """
    labeled = []
    with open(path, "r") as file:
        for raw_line in file:
            name = raw_line.strip()
            if name:
                labeled.append((name, label))
    return labeled

# Label human and bot usernames and combine them into one dataset.
# Fail with a regular exception when an input file is missing: calling exit()
# inside a notebook stops the kernel instead of reporting the problem.
missing_files = [path for path in (HUMAN_FILE, BOT_FILE) if not os.path.exists(path)]
if missing_files:
    raise FileNotFoundError(
        f"Error: {HUMAN_FILE} or {BOT_FILE} does not exist. Missing: {', '.join(missing_files)}"
    )

human_usernames = load_usernames(HUMAN_FILE, 0)     # 0 for human
bot_usernames = load_usernames(BOT_FILE, 1)         # 1 for bot
all_usernames = human_usernames + bot_usernames

# Shuffle the combined list with a fixed seed so the processing order is the
# same on every run (1234 matches the seed used when sampling usernames).
random.Random(1234).shuffle(all_usernames)

In [64]:
def _mean_or_zero(values):
    """Return the arithmetic mean of ``values`` as a float, or 0 for an empty list."""
    return float(np.mean(values)) if values else 0


def extract_account_features(username, label):
    """Build one feature row for a Reddit account from its recent activity.

    Fetches up to 100 of the account's newest submissions and up to 100 of its
    newest comments through the module-level ``reddit`` client and summarises
    them.

    Args:
        username: Reddit username to look up.
        label: Class label copied unchanged into the returned row.

    Returns:
        A dict with the keys ``username``, ``account_age``, ``post_count``,
        ``comment_count``, ``avg_post_score``, ``avg_comment_score``,
        ``total_post_score``, ``total_comment_score``,
        ``post_to_comment_ratio``, ``avg_post_length``,
        ``avg_comment_length`` and ``label``; or ``None`` when the account has
        no fetched submissions or comments, or when any exception is raised
        while fetching or processing (the error is printed, not re-raised).
    """
    print(f"Extracting features for {username}...")

    try:
        redditor = reddit.redditor(username)

        # fetch submissions and comments
        posts = list(redditor.submissions.new(limit=100))
        comments = list(redditor.comments.new(limit=100))

        if not posts and not comments:
            print(f"No data found for {username}")
            return None

        # time.time() is a true epoch timestamp. datetime.utcnow().timestamp()
        # would interpret the naive UTC value as local time and skew the age by
        # the machine's UTC offset.
        now = time.time()
        timestamps = [getattr(item, "created_utc", now) for item in posts + comments]
        # Days since the oldest fetched item.
        # NOTE(review): this is a proxy based on at most 100 posts/comments, not
        # the registration date; PRAW also exposes Redditor.created_utc - confirm
        # which one the model should use before changing the feature.
        account_age = (now - min(timestamps)) / (60 * 60 * 24) if timestamps else 0

        post_count = len(posts)
        # a missing score counts as 0, same as for comments below
        post_scores = [p.score or 0 for p in posts]
        # length in whitespace-separated words of title + body text
        post_lengths = [len(((p.title or "") + " " + (p.selftext or "")).split()) for p in posts]

        comment_count = len(comments)
        comment_scores = [c.score or 0 for c in comments]
        comment_lengths = [len((c.body or "").split()) for c in comments]

        return {
            "username": username,
            "account_age": account_age,
            "post_count": post_count,
            "comment_count": comment_count,
            "avg_post_score": _mean_or_zero(post_scores),
            "avg_comment_score": _mean_or_zero(comment_scores),
            "total_post_score": sum(post_scores),
            "total_comment_score": sum(comment_scores),
            # defined as 0 when the account has no comments (avoids division by zero)
            "post_to_comment_ratio": post_count / comment_count if comment_count > 0 else 0,
            "avg_post_length": _mean_or_zero(post_lengths),
            "avg_comment_length": _mean_or_zero(comment_lengths),
            "label": label,
        }

    except Exception as e:
        # Best effort: deleted/suspended accounts and API errors are logged and
        # the account is skipped by returning None.
        print(f"Error processing {username}: {e}")
        return None

In [71]:
# CSV feature columns, in the order they are written to the output file
fieldnames = [
    "username", "account_age", "post_count", "comment_count",
    "avg_post_score", "avg_comment_score", "total_post_score",
    "total_comment_score", "post_to_comment_ratio",
    "avg_post_length", "avg_comment_length", "label",
]

# Resume support: collect the usernames already present in the output file so
# they are not fetched again, and decide whether a header row must be written.
# The header is written only when the file is missing or empty.
write_header = True
processed = set()
if os.path.exists(COMBINED_FILE) and os.path.getsize(COMBINED_FILE) > 0:
    # The file already has content, so a header must never be appended after it
    # (that would insert a header line in the middle of the data).
    write_header = False
    # newline="" is what the csv module expects for files it parses
    with open(COMBINED_FILE, "r", newline="") as f:
        reader = csv.reader(f)
        header = next(reader, [])
        # Require an exact "username" column rather than a substring match on
        # the first line, so a data row that merely contains the text
        # "username" is not mistaken for a header.
        if "username" in header:
            username_col = header.index("username")
            processed = {row[username_col] for row in reader if len(row) > username_col}
        else:
            print("Warning: Output file exists but is missing header. Ignoring file for resume logic.")

# extract features and append to CSV file
with open(COMBINED_FILE, "a", newline="") as f:
    writer = csv.DictWriter(f, fieldnames=fieldnames)

    if write_header:
        writer.writeheader()

    total = len(all_usernames)

    # go through all usernames
    for idx, (username, label) in enumerate(all_usernames):
        if username in processed:
            print(f"Skipping already processed user: {username}")
            continue

        # save features to combined CSV file (remember: 1 for bot, 0 for human)
        row = extract_account_features(username, label)
        if row:
            writer.writerow(row)
            # flush after every row so an interrupted run loses no finished work
            f.flush()
            # remember the name so a duplicate later in the list is not fetched again
            processed.add(username)
            print(f"[{idx + 1}/{total}] Processed: {username} - {label}")
        else:
            print(f"[{idx + 1}/{total}] No data found: {username}")

        time.sleep(1) # to avoid hitting API rate limits

Skipping already processed user: wultura
Skipping already processed user: AllanMcPherson2
Skipping already processed user: FelhazarVim
Skipping already processed user: nu3goxyu
Skipping already processed user: GingersPanties4U
Skipping already processed user: -a-compte-
Extracting features for CreepyMiddleAgedDude...
Error processing CreepyMiddleAgedDude: received 404 HTTP response
[7/1939] No data found: CreepyMiddleAgedDude
Skipping already processed user: ArashiraMirantrius
Skipping already processed user: gordon_br
Skipping already processed user: TygrargasShadowstaf
Skipping already processed user: blackie_jack
Skipping already processed user: RegretfullyYours
Skipping already processed user: furiouslegume
Skipping already processed user: SirGingerGoat
Skipping already processed user: Tohorooowoaowoaoy
Skipping already processed user: garliccomet
Skipping already processed user: MiranadarAnardana
Extracting features for Lexy_Avocado...
Error processing Lexy_Avocado: received 404 H

In [52]:
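# Reddit API credentials are intentionally not recorded in this notebook.
# Never paste the client ID or client secret here; anything previously committed
# should be treated as leaked and regenerated in the Reddit app settings.
# user agent:       BotDetectorScript by u/Feeling_Sandwich3620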