In [1]:
import os
import re
import time
import math
import pandas as pd
import praw
from typing import List, Tuple, Dict

In [3]:
# -----------------------------
# AUTH (use environment vars)
# -----------------------------
def reddit_client():
    """Build a PRAW ``Reddit`` client from credentials held in environment variables.

    No secret is stored in the notebook itself. Any credential that was ever
    committed in plain text should be treated as compromised and rotated.

    Environment variables read:
        REDDIT_CLIENT_ID:     required, client id of the Reddit app.
        REDDIT_CLIENT_SECRET: required, client secret of the Reddit app.
        REDDIT_USERNAME:      optional, defaults to "the_ronnows".
        REDDIT_PASSWORD:      optional account password.
        REDDIT_USER_AGENT:    optional override of the default user agent.

    Returns:
        The object returned by ``praw.Reddit(...)``.

    Raises:
        RuntimeError: if a required environment variable is missing or empty.
    """
    def _required(name):
        # Fail with an explicit message instead of silently passing None on.
        value = os.environ.get(name)
        if not value:
            raise RuntimeError(
                f"Environment variable {name} must be set to create the Reddit client."
            )
        return value

    return praw.Reddit(
        client_id=_required("REDDIT_CLIENT_ID"),
        client_secret=_required("REDDIT_CLIENT_SECRET"),
        # A descriptive user_agent is required
        user_agent=os.environ.get(
            "REDDIT_USER_AGENT",
            "LETF Scraper for MSc Thesis by u/the_ronnows",
        ),
        username=os.environ.get("REDDIT_USERNAME", "the_ronnows"),
        # NOTE(review): when no password is available PRAW appears to fall back
        # to read-only access -- confirm against the PRAW version in use.
        password=os.environ.get("REDDIT_PASSWORD"),
    )

reddit = reddit_client()

# user.me() can come back as None (the recorded run printed "u/None"), which
# means no user is logged in; report that instead of claiming authentication.
authenticated_user = reddit.user.me()
if authenticated_user is None:
    print("Not authenticated as a user: the client is running in read-only mode.")
else:
    print(f"Authenticated as: u/{authenticated_user}")

Authenticated as: u/None


In [None]:
# --- Configuration ---
# Subreddit whose submissions are searched.
SUBREDDIT_TO_SCRAPE = "LETFs"

# Search terms; a submission matching any one of them is scraped.
KEYWORDS = [
    "UPRO",
    "TQQQ",
    "SSO",
    "QLD",
    "TMF",
    "DBPG",
    "3EUL",
    "3QQQ",
    "leveraged etf",
    "letf",
    "volatility decay",
    "leverage for the long run",
    "HFEA",
    "2x SMA",
    "3x SMA",
    "9sig",
    "200d SMA",
    "leverage rotation",
    "daily leveraged",
]

# Upper bound on the number of submissions requested from the search.
# NOTE(review): an earlier comment stated PRAW caps this at 500 per query --
# confirm against the current PRAW / Reddit API limits.
SUBMISSION_SEARCH_LIMIT = 500

# Column order of the resulting DataFrame / CSV.
DATA_COLUMNS = [
    "comment_id",
    "body",
    "created_utc",
    "score",
    "subreddit",
    "submission_id",
    "permalink",
]

# --- Data Collection ---
# Accumulators filled by the scraping cell: one dict per kept comment, and the
# ids of submissions that have already been handled.
comments_data = list()
processed_submissions = set()

In [26]:
# Collect the comments of every submission matching the keyword search.
# Each keyword is quoted and the terms are joined with 'OR'.
search_query = ' OR '.join(f'"{k}"' for k in KEYWORDS)

# Bodies that mark a deleted or removed comment; such comments are skipped.
PLACEHOLDER_BODIES = ('[deleted]', '[removed]')


def comment_to_record(comment, submission_id):
    """Return the row dict stored for one comment.

    Args:
        comment: object exposing ``id``, ``body``, ``created_utc``, ``score``,
            ``subreddit`` and ``permalink`` attributes.
        submission_id: id of the submission the comment was collected from.

    Returns:
        dict with the keys comment_id, body, created_utc, score, subreddit,
        submission_id and permalink.
    """
    return {
        'comment_id': comment.id,
        'body': comment.body,
        'created_utc': comment.created_utc,
        'score': comment.score,
        'subreddit': str(comment.subreddit),
        'submission_id': submission_id,
        'permalink': comment.permalink,
    }


print(f"Starting scrape of r/{SUBREDDIT_TO_SCRAPE}...")
print(f"Searching for submissions with keywords: {', '.join(KEYWORDS[:3])}...")

subreddit = reddit.subreddit(SUBREDDIT_TO_SCRAPE)

for submission in subreddit.search(search_query, limit=SUBMISSION_SEARCH_LIMIT):
    # Skip submissions already handled (e.g. by an earlier run of this cell).
    if submission.id in processed_submissions:
        continue

    print(f"\n--- Processing Submission ID: {submission.id} ---")
    print(f"Title: {submission.title}")

    # limit=None resolves every 'MoreComments' placeholder so the full tree,
    # including nested replies, is available.
    submission.comments.replace_more(limit=None)

    # Flatten the comment tree once and reuse it for filtering and counting.
    all_comments = submission.comments.list()

    # Build this submission's rows before touching the shared accumulators, so
    # an interruption cannot leave a half-collected submission behind that a
    # re-run would then duplicate.
    submission_records = [
        comment_to_record(comment, submission.id)
        for comment in all_comments
        if hasattr(comment, 'body') and comment.body not in PLACEHOLDER_BODIES
    ]

    comments_data.extend(submission_records)
    processed_submissions.add(submission.id)
    print(f"Found {len(all_comments)} comments. Total collected: {len(comments_data)}")

print(f"\nScraping complete. Collected a total of {len(comments_data)} comments from {len(processed_submissions)} submissions.")

# --- DataFrame Creation and Storage ---
if not comments_data:
    print("\nNo comments were collected. The search might not have returned any relevant submissions.")
else:
    # One row per collected comment, columns in the configured order.
    df_comments = pd.DataFrame(comments_data, columns=DATA_COLUMNS)

    # Destination: <output_dir>/<subreddit name in lower case>_raw.csv
    output_dir = '../data/raw/reddit_data/'
    output_filename = SUBREDDIT_TO_SCRAPE.lower() + '_raw.csv'
    output_path = os.path.join(output_dir, output_filename)

    # Create the directory if needed, then write the CSV without the index.
    os.makedirs(output_dir, exist_ok=True)
    df_comments.to_csv(output_path, index=False)

    print(f"\nSuccessfully saved data to: {output_path}")
    print("\nFirst 5 rows of the collected data:")
    print(df_comments.head())



Starting scrape of r/LETFs...
Searching for submissions with keywords: UPRO, TQQQ, SSO...

--- Processing Submission ID: 1mzsdjb ---
Title: Finally, the holy grail of LETF is incoming: AMUNDI MSCI WORLD (2X) LEVERAGED UCITS ETF
Found 160 comments. Total collected: 160

--- Processing Submission ID: 1nx359x ---
Title: Genuine question: Why do all leveraged etfs show  crazy returns? And why isn't everyone getting rich?
Found 155 comments. Total collected: 312

--- Processing Submission ID: 1lmuybz ---
Title: Simple easy TQQQ strategy using the 200 SMA from QQQ with a few modifications
Found 123 comments. Total collected: 435

--- Processing Submission ID: 1nhye66 ---
Title: SPY 200SMA (+4%/-3%) TQQQ/QQQ Long Term Investment Strategy
Found 77 comments. Total collected: 512

--- Processing Submission ID: 1n02g66 ---
Title: Since Amundi is releasing a 2x Total World LETF, what is your planned portfolio?
Found 89 comments. Total collected: 599

--- Processing Submission ID: 1nikxft ---
Title