# Data Pipeline Prototype

## Importing libraries

In [None]:
# ! pip install pandas praw prawcore python-dotenv pyarrow

In [None]:
import pandas as pd
import praw, prawcore, time, os, sys, functools, random
import datetime as dt
from dotenv import load_dotenv
from typing import List, Dict, Any
from collections.abc import Callable, Iterator
from itertools import product
from collections import deque, namedtuple

In [None]:
# Load .env file for access keys.
# The path is relative, so it resolves against the current working directory.
# NOTE(review): presumably the notebook is run from a directory that sits next to
# `config/` -- confirm before running from elsewhere.
load_dotenv(os.path.join('..', 'config', '.env'))

# Import config.py to access environment variables.
# The config directory is appended to sys.path so `config` is importable as a
# top-level module; this must happen before the import below.
sys.path.append('../config')
from config import PRAW_ID, PRAW_SECRET, PRAW_USER_AGENT, PRAW_USERNAME, PRAW_PASSWORD

## Setting up access to Reddit API
Access keys to Reddit API are stored in a .env file under the config directory of this repository. A template for the .env file is provided in the config directory.

The config.py script assigns the environment variables to the `PRAW_ID`, `PRAW_SECRET`, `PRAW_USER_AGENT`, `PRAW_USERNAME`, and `PRAW_PASSWORD` global variables respectively.  

In [None]:
# Initialize the PRAW client with the credentials imported from config.py.
# The resulting `reddit` object is the module-level handle used by
# stream_submissions_and_comments to look up subreddits.
# NOTE(review): passing username/password presumably makes this an authorized
# (not read-only) instance -- confirm against PRAW's authentication docs.
reddit = praw.Reddit(
    client_id = PRAW_ID,
    client_secret = PRAW_SECRET,
    username = PRAW_USERNAME,
    password = PRAW_PASSWORD,
    user_agent = PRAW_USER_AGENT
)

## Extracting text data

### Utility Functions
The helper functions were designed to extract relevant data and metadata from Reddit submissions and comments, and package the data into a dict of dicts that can be easily parsed into a Pandas DataFrame object for further analysis.

`backoff_on_rate_limit`: This is a decorator factory that builds a custom decorator from the specified backoff parameters (max retries, base delay, cap, jitter). The resulting decorator wraps custom functions that call PRAW methods, such as `fetch_submissions` and `fetch_comments`, which call `subreddit.search()` and `submission.comments.replace_more()` respectively. The decorator implements exponential backoff with optional full jitter to respect Reddit API rate limits while handling transient failures.

__Inputs:__
- Integer value for max retries. When attempts exceed this number, an Exception is raised
- Float for base delay in seconds (i.e. Delay at first failed attempt)
- Float for maximum delay in seconds
- Bool on whether to implement full jitter or not

__Outputs:__
- Decorator to be applied to a PRAW API request wrapper

`parse_comments`: This is a utility function that fetches comments from a given post and formats each comment as a dictionary of dictionaries with key as comment id and value as a dictionary of comment content and metadata (e.g. body, timestamp, upvotes).

__Inputs:__ 
- Submission object from PRAW (i.e. Reddit posts)
- Integer for .replace_more limit parameter, default=0 (i.e. top/parent comments only)

__Output:__
- Dict of comments in the format {comment_id : {data_header: data_value}}

`parse_search_results`: This is a utility function that fetches submissions (posts) from a given subreddit using a predefined search query (i.e. keywords). Submissions are formatted into a dict of dicts with format {submission id : {data_header : data_value}}. This returns a tuple of submission data and comment data.

__Inputs:__ 
- String of Subreddit name
- String of search query
- Integer for limit of submissions yielded by PRAW subreddit search

__Output:__
- Tuple of submission data dict and comment data dict

__Read more:__
1. [API Rate Limits Explained: Best Practices for 2025](https://orq.ai/blog/api-rate-limit)
2. [Exponential Backoff And Jitter](https://aws.amazon.com/blogs/architecture/exponential-backoff-and-jitter/)
3. [Yield Statements vs. Returning Lists in Python](https://community.aws/content/2h01Byx1ytU8357tp2bvcUuJ2j0/yield-statements-vs-returning-lists-in-python)

In [313]:
def backoff_on_rate_limit(max_retries:int=5, 
                        base_delay:float=1.0, 
                        cap_delay:float=60.0, 
                        jitter:bool=True,
                        retry_on:tuple=None) -> Callable:
    """
    Decorator factory that retries a function with exponential backoff.

    The wrapped function is called once and, whenever it raises a retryable
    exception, called again after a delay of ``base_delay * 2 ** attempt``
    seconds capped at ``cap_delay``. With ``jitter`` enabled the actual sleep is
    drawn uniformly from ``[0, delay]`` (full jitter).

    Args:
        max_retries: Maximum number of retries after the first failed call, so
            the function runs at most ``max_retries + 1`` times.
        base_delay: Delay in seconds before the first retry.
        cap_delay: Upper bound in seconds for any single delay.
        jitter: Whether to randomise each delay between 0 and its computed value.
        retry_on: Exception class, or tuple of classes, that triggers a retry.
            When omitted (or empty), ``prawcore.exceptions.ResponseException``
            is used. Any other exception propagates immediately.

    Returns:
        A decorator that adds the retry behaviour to the function it wraps.

    Raises:
        Exception: From the wrapped call once ``max_retries`` retries have all
            failed; the last retryable error is attached as ``__cause__``.
    """
    def decorator(func: Callable):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            # Number of retries already performed.
            attempt = 0
            while True:
                try:
                    return func(*args, **kwargs)
                # The expression is only evaluated while an exception is propagating,
                # so the prawcore default is looked up lazily and only when needed.
                except (retry_on or (prawcore.exceptions.ResponseException,)) as e:
                    # `>=` so that exactly max_retries retries are made; `>` allowed one extra.
                    if attempt >= max_retries:
                        # Chain the original error so its status/traceback is not lost.
                        raise Exception("Max retries exceeded with Reddit API.") from e
                    delay = min(cap_delay, base_delay * 2 ** attempt)
                    if jitter:
                        delay = random.uniform(0, delay)
                    print(f"[WARNING] {e.__class__.__name__} on attempt {attempt+1}, retrying after {delay:.2f}s...")
                    time.sleep(delay)
                    attempt += 1
        return wrapper
    return decorator

@backoff_on_rate_limit()
def fetch_submissions(subreddit:object, query:str, limit:int=100, **kwargs) -> Iterator:
    """
    Run a subreddit search under the rate-limit backoff and return its results.

    Args:
        subreddit: Object exposing a ``search`` method (a PRAW Subreddit in this notebook).
        query: Search query string.
        limit: Maximum number of submissions requested from the search.
        **kwargs: Extra keyword arguments forwarded to ``subreddit.search``.

    Returns:
        An iterator over the submissions returned by the search.
    """
    # PRAW's search hands back a lazy listing generator: the HTTP requests are only
    # issued while it is iterated. Draining it here keeps those requests inside the
    # decorated call, so errors raised while paging are actually retried.
    results = list(subreddit.search(**kwargs, query=query, limit=limit))
    return iter(results)

@backoff_on_rate_limit()
def fetch_comments(submission:object, limit:int=0) -> list:
    """
    Return the flattened comment list of a submission under the rate-limit backoff.

    ``replace_more`` is invoked on the submission's comment forest with the given
    ``limit`` before the forest is flattened with ``list()``.
    """
    forest = submission.comments
    forest.replace_more(limit=limit)
    flattened = forest.list()
    return flattened

In [None]:
def stream_comments(submission:object, limit:int=0) -> Iterator:
    """
    Stream the comments of a submission as flat dictionary records.

    Comments are obtained through ``fetch_comments`` and yielded one at a time,
    so no collection of records is built up inside this function.

    Args:
        submission: Submission whose comments are fetched; its ``id`` is stored
            on every record as ``parent_submission_id``.
        limit: Forwarded to ``fetch_comments`` as its ``limit`` argument.

    Yields:
        Tuples ``("comment", record)`` where ``record`` is a dict with the keys
        ``comment_id``, ``body``, ``score``, ``timestamp``, ``subreddit`` and
        ``parent_submission_id``.
    """
    for comment in fetch_comments(submission, limit=limit):
        record = {
            'comment_id':comment.id,
            'body':comment.body,
            'score':comment.score,
            'timestamp':comment.created_utc,
            'subreddit':comment.subreddit_name_prefixed,
            'parent_submission_id':submission.id
        }
        # The "comment" tag lets consumers tell these apart from submission records
        yield "comment", record

In [None]:
def stream_submissions_and_comments(subreddit_name:str, query:str, limit:int=50, **search_kwargs):
    """
    Search one subreddit and stream every hit together with its comments.

    For each submission returned by ``fetch_submissions`` a record dict is built,
    then all ("comment", record) pairs produced by ``stream_comments`` are yielded,
    and finally the ("submission", record) pair for the submission itself.
    Extra keyword arguments are forwarded to ``fetch_submissions``.
    """
    target = reddit.subreddit(subreddit_name)
    hits = fetch_submissions(**search_kwargs, subreddit=target, query=query, limit=limit)

    for submission in hits:
        # Snapshot the submission fields before its comments are requested
        submission_record = {
            'submission_id':submission.id,
            'title':submission.title,
            'selftext':submission.selftext,
            'score':submission.score,
            'upvote_ratio':submission.upvote_ratio,
            'timestamp':submission.created_utc,
            'subreddit':submission.subreddit_name_prefixed,
            'num_comments':submission.num_comments
        }
        # Comments of this submission come first ...
        yield from stream_comments(submission)
        # ... followed by the submission record itself
        yield "submission", submission_record

### Provide an initial list of search queries and Subreddits

To scrape the relevant text data from Reddit, I created a small list of queries covering diverse topics relevant to buying affordable used vehicles. The queries involve location-specific, model-specific, and thematic keywords to ensure that the search covers as much ground as possible. Chosen subreddits have > 1e5 subscribers to ensure that search queries will yield a significant number of results per API request.

With a 10x10 query and subreddit array, I expect at least an initial 100 requests for the subreddit search yielding 100x50 submissions at most.

Fetching the comments involves significantly more requests, as each submission requires 1 request to yield the CommentForest. At 50 submissions per search, fetching the comments for all 100 searches requires 5,000 requests.

__Expected Minimum API Requests__
|Search Requests|Comment Fetch Requests|Total Requests|
|:----------|:----------|:----------|
|100      |5,000  |5,100|

As such, a single batch job covering all query-subreddit combinations will issue at least 5,100 API requests in a single go, which wildly exceeds the Reddit API fair use policy (i.e. cap requests to 100/min averaged over a 10-minute sliding window). To address this issue, batched processing will be implemented to ensure the average request rate stays under safe rate limits.

### Utility Functions

`parse_txt_file`: Parses text files containing data separated by newlines. Returns a list. Used for containerizing search_queries and subreddit strings into separate text files that can be easily mutated without modifying source code.

__Input:__
- String for the path of text file, with each item separated by a newline

__Output:__
- List (e.g. search queries, subreddit names)

`aggregate_search_results`: This function is a wrapper for the `parse_search_results` call and calls the inner function for each subreddit-query pair formed from the input list arguments. Requests are tracked at every iteration and compared against maximum requests per minute. If expected total requests go beyond rate limit, program execution is paused for at least a minute to ensure that requests are within safe rate limits. The number of submissions requested are also randomized per search pair to reduce predictability of scraping pattern.

__Inputs:__ 
- List of subreddit name strings
- List of search query strings
- Int of maximum requests per minute, also determines upper bound of search result limit
- Int of minimum requests, which is the floor of search result limit
- List of float values denoting delay in seconds for long delay (interval between search pairs); minimum of 60s

__Output:__
- Tuple of aggregated submissions dict and comments dict

__Read more:__
1. [Rate Limiter - Sliding Window Counter](https://medium.com/@avocadi/rate-limiter-sliding-window-counter-7ec08dbe21d6)

In [316]:
def parse_txt_file(file_path:str) -> List[str]:
    """
    Parse a text file that holds one item per line.

    Surrounding whitespace is stripped from every line. Lines that are empty
    after stripping, and lines whose first non-blank character is ``#``, are
    skipped, so blank padding and indented comments never end up in the result.

    Args:
        file_path: Path of the UTF-8 text file to read.

    Returns:
        The remaining items in file order (an empty list for an empty file).
    """
    # Explicit encoding so the result does not depend on the platform default
    with open(file_path, 'r', encoding='utf-8') as f:
        stripped_lines = (line.strip() for line in f)
        return [item for item in stripped_lines if item and not item.startswith('#')]

In [None]:
def stream_aggregate_results(subreddits:List[str], 
                             queries:List[str],
                             max_requests:int=100, 
                             min_requests:int=50,
                             jitter:List[float] = None,
                             **search_kwargs):
    """
    Stream submission and comment records for every subreddit/query pair.

    For each pair a random submission limit between ``min_requests`` and
    ``max_requests`` is drawn, the records produced by
    ``stream_submissions_and_comments`` are yielded, and execution then pauses
    until at least one minute has passed since the pair was started, plus a
    random jitter. A summary is printed once every pair has been processed.

    This is a generator: nothing is fetched (and arguments are not validated)
    until it is iterated.

    Args:
        subreddits: List of subreddit names.
        queries: List of search query strings.
        max_requests: Upper bound for the random per-pair submission limit.
        min_requests: Lower bound for the random per-pair submission limit.
        jitter: Two values ``[low, high]`` in seconds bounding the random extra
            pause after each pair. Defaults to ``[1.0, 10.0]``.
        **search_kwargs: Forwarded to ``stream_submissions_and_comments``.

    Yields:
        The ``(kind, record)`` tuples produced by ``stream_submissions_and_comments``.

    Raises:
        TypeError: If ``subreddits`` or ``queries`` is not a list (a bare string
            would otherwise be iterated character by character).
    """
    # Explicit raises instead of assert: asserts are stripped under `python -O`
    if not isinstance(subreddits, list):
        raise TypeError("Argument 'subreddits' expects a list of subreddit names.")
    if not isinstance(queries, list):
        raise TypeError("Argument 'queries' expects a list of search queries names.")
    jitter_low, jitter_high = (1.0, 10.0) if jitter is None else jitter
    
    # Running log of the cumulative submission limits requested so far
    Trace = namedtuple('Trace', ['timestamp','total_requests'])
    trace_requests = []
    total_requests = 0
    
    for subreddit, query in product(subreddits, queries):
        # Random number of requests per iteration to reduce predictability
        submission_limit = int(random.uniform(min_requests, max_requests))
        
        # Monotonic clock: unaffected by wall-clock adjustments while measuring elapsed time
        start_time = time.monotonic()
        
        # The inner call is a generator; it only does work when iterated, so its
        # records are re-yielded here instead of discarding the generator object.
        yield from stream_submissions_and_comments(**search_kwargs, subreddit_name=subreddit, 
                                                   query=query, limit=submission_limit)
        
        # Wait out whatever is left of the one-minute window. When the batch already
        # took longer than a minute nothing is left, so only the jitter is applied.
        elapsed = time.monotonic() - start_time
        delay = max(0.0, 60.0 - elapsed) + random.uniform(jitter_low, jitter_high)
        time.sleep(delay)
        
        # The submission limit is used as the request estimate for this pair
        total_requests += submission_limit
        trace_requests.append(Trace(dt.datetime.now(), total_requests))
    
    # Built outside the f-string: a backslash inside an f-string expression is a
    # syntax error before Python 3.12, and Trace tuples cannot be summed or joined directly.
    trace_lines = "\n".join(f"{entry.timestamp.isoformat()} - {entry.total_requests}" for entry in trace_requests)
    print(f'Finished streaming data.\nTotal requests made: {total_requests};\nTrace:\n{trace_lines}')
        

In [None]:
def delay_next_execution(period:float=60.0, jitter:List[float]=None):
    """
    Decorator factory that paces successive runs of a generator function.

    The decorated generator yields everything the original yields. Once the
    original is exhausted, and if less than ``period`` seconds have passed since
    iteration started, the wrapper sleeps for the remaining time plus a random
    jitter before finishing. If ``period`` has already elapsed no sleep happens.
    The pause only occurs when the generator is consumed to the end.

    Args:
        period: Minimum number of seconds between the start of the run and the
            point where the exhausted generator returns control.
        jitter: Two values ``[low, high]`` in seconds bounding the random extra
            sleep added to the remaining time. Defaults to ``[0.1, 5.0]``.

    Returns:
        A decorator for generator functions.
    """
    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            # Monotonic clock so wall-clock adjustments cannot distort the measurement
            started = time.monotonic()
            yield from func(*args, **kwargs)
            # Keep the fractional part: truncating to whole seconds could end the
            # pause almost a full second early and break the `period` guarantee.
            remaining = period - (time.monotonic() - started)
            if remaining > 0:
                low, high = (0.1, 5.0) if jitter is None else jitter
                time.sleep(remaining + random.uniform(low, high))
        return wrapper
    return decorator

In [169]:
# Scratch check: exhaust the `foo` generator five times in a row and print each result.
# NOTE(review): `foo` is not defined in any cell visible here; judging by the recorded
# output it is presumably a small generator wrapped with a delay decorator -- confirm.
for _ in range(5):
    print(list(foo()))

Started at 20:03:26
Ended at 20:03:26
Elapsed 2.26s before next batch.
[1, 2, 3, 4, 5]
Started at 20:03:28
Ended at 20:03:28
Elapsed 1.05s before next batch.
[1, 2, 3, 4, 5]
Started at 20:03:29
Ended at 20:03:29
Elapsed 0.75s before next batch.
[1, 2, 3, 4, 5]
Started at 20:03:30
Ended at 20:03:30
Elapsed 0.63s before next batch.
[1, 2, 3, 4, 5]
Started at 20:03:31
Ended at 20:03:31
Elapsed 1.66s before next batch.
[1, 2, 3, 4, 5]


In [99]:
# Scratch: reference timestamp for the window experiments below (naive local time).
start_window = dt.datetime.now()

In [103]:
# Scratch: end of a one-minute window measured from start_window.
window = (start_window + dt.timedelta(minutes=1))

In [116]:
# Scratch: distance between now and the window end.
# `.seconds` is only the seconds component of the timedelta (days and microseconds
# are dropped), and abs() hides whether the window end is still ahead or already past.
abs(window - dt.datetime.now()).seconds

270

In [111]:
# Scratch: time elapsed since the window end, without abs().
# If the window end were still in the future the timedelta would be negative, which
# Python normalises to days=-1 with a large positive `.seconds` -- use
# total_seconds() when a signed duration is needed.
(dt.datetime.now() - window).seconds

113

In [65]:
# Scratch fixture: pair each sample timestamp with its request count and queue
# the resulting Trace records in arrival order.
total_requests = [500,20,30,65,32]
timestamp = [600,650,670,700,710]
Trace = namedtuple('Trace',['timestamp','requests'])
traces = deque(Trace(moment, count) for moment, count in zip(timestamp, total_requests))

In [69]:
# Scratch: bounds of a ten-minute look-back window ending now.
current_window = dt.datetime.now()
previous_window = current_window - dt.timedelta(minutes=10)

In [81]:
current_window

datetime.datetime(2025, 6, 25, 18, 42, 1, 971572)

In [84]:
# Scratch: check that a later timestamp compares greater than an earlier one.
# `window1` is assigned in the cell shown after this one; per the execution
# counts (82 before 84) that cell was run first.
window1 > current_window

True

In [82]:
# Scratch: fresh timestamp used by the `window1 > current_window` comparison.
window1 = dt.datetime.now()

In [74]:
previous_window

datetime.datetime(2025, 6, 25, 18, 32, 1, 971572)

In [63]:
# Scratch: drain traces from the oldest end, accumulating their request counts,
# until the running total reaches max_requests or the deque is empty.
# The check happens before each pop, so the total can overshoot max_requests by
# the size of the last trace consumed. Note that popleft() empties `traces`.
window_requests = 0
max_requests = 1000
while traces and window_requests < max_requests:
    window_requests += traces.popleft().requests

In [64]:
window_requests

647

In [42]:
# NOTE(review): `register_time` and `timestamps` are not defined in any cell visible
# here; presumably this records the current time in the `timestamps` deque -- confirm.
register_time(timestamps)

In [46]:
# NOTE(review): `compute_sleep_window` is not defined in any cell visible here;
# presumably it derives a sleep duration from the recorded timestamps -- confirm.
compute_sleep_window(timestamps)

1

In [24]:
timestamps

deque([datetime.datetime(2025, 6, 25, 17, 52, 48, 516068),
       datetime.datetime(2025, 6, 25, 17, 53, 13, 343613)])

In [None]:
# TODO: unfinished stub -- the definition has no parameter list or body, so this
# cell is not valid Python and raises a SyntaxError if executed.
def sliding_window

In [338]:
# Parse text files containing search queries and subreddit names.
# Both paths are relative to the current working directory; each file is read
# with parse_txt_file, which returns its items as a list of strings.
search_queries = parse_txt_file("../src/search_queries.txt")
subreddits = parse_txt_file("../src/subreddits.txt")

### Fetching and parsing search results from Reddit used car communities

In [339]:
# Keep only the first two queries and subreddits for a small trial run.
# This rebinds the lists, so later cells see the truncated versions.
search_queries = search_queries[:2]
subreddits = subreddits[:2]
print('-------Search Pairs-------')
# Preview every subreddit/query combination that will be searched
for (subreddit, query) in product(subreddits, search_queries):
    print(subreddit,"-",query)

-------Search Pairs-------
CarsAustralia - affordable reliable used cars under 15k Australia
CarsAustralia - affordable reliable used cars under 10k USA
UsedCars - affordable reliable used cars under 15k Australia
UsedCars - affordable reliable used cars under 10k USA


In [344]:
%%time
# Fetch search results and parse to dict of dicts
# NOTE(review): `aggregate_search_results` is not defined in any cell visible here
# (the visible notebook defines `stream_aggregate_results` instead); this call
# presumably relies on an earlier version of the function -- confirm before re-running.
submission_data, comment_data = aggregate_search_results(subreddits=subreddits, queries=search_queries)

Cooldown triggered: sleeping for 109.40741414375071s to avoid rate limit.
Cooldown triggered: sleeping for 71.57586797057071s to avoid rate limit.
Cooldown triggered: sleeping for 65.00197525949584s to avoid rate limit.
CPU times: user 2.8 s, sys: 314 ms, total: 3.11 s
Wall time: 7min 44s


## Storing the scraped data

### Formatting to a Pandas DataFrame

In [348]:
# Build one DataFrame per record type. With orient='index' each outer key
# becomes a row label and the keys of the inner dicts become the columns.
# NOTE(review): assumes both inputs are dicts of dicts keyed by id -- confirm
# against whatever produced submission_data / comment_data.
submission_df = pd.DataFrame.from_dict(submission_data, orient='index')
comment_df = pd.DataFrame.from_dict(comment_data, orient='index')

### Exporting DataFrame to a Parquet file for efficient storage

In [349]:
# Write both tables as gzip-compressed Parquet files using the pyarrow engine.
# Target paths are relative to the current working directory.
# NOTE(review): presumably the ../data directory must already exist -- confirm.
submission_df.to_parquet(os.path.join("..","data","submission_data.parquet"), 
                         engine='pyarrow',
                         compression='gzip')

comment_df.to_parquet(os.path.join("..","data","comment_data.parquet"),
                      engine='pyarrow',
                      compression='gzip')

## Exploratory Data Analysis