In [1]:
import os
import time
import csv
from datetime import datetime, timezone

import pandas as pd
import praw
from dotenv import load_dotenv
from prawcore.exceptions import RequestException, ResponseException

In [2]:
threads_df = pd.read_csv('reddit_threads.csv', header=None, names=['number', 'match', 'thread'])
thread_dict = {i+1: [row['number'], row['match'], row['thread']] for i, row in threads_df.iterrows()}

In [3]:
load_dotenv()
reddit = praw.Reddit(
    client_id=os.getenv('CLIENT_ID'),
    client_secret=os.getenv('CLIENT_SECRET'),
    user_agent=os.getenv('USER_AGENT'),
)

In [4]:

def get_comments(submission_url, filename, retries=10, initial_wait=5):
    os.makedirs('reddit_threads', exist_ok=True)
    filepath = os.path.join('reddit_threads', filename)
    
    with open(filepath, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['Timestamp', 'Username', 'Comment', 'Upvotes'])
        
        submission = reddit.submission(url=submission_url)
        submission.comment_sort = 'old'
        
        wait_time = initial_wait
        
        for attempt in range(retries):
            try:
                submission.comments.replace_more(limit=None)
                
                for top_level_comment in submission.comments:
                    writer.writerow([
                        datetime.fromtimestamp(top_level_comment.created_utc, timezone.utc).isoformat(),
                        top_level_comment.author,
                        top_level_comment.body,
                        top_level_comment.score
                    ])
                break
            
            except (RequestException, ResponseException) as e:
                if '429' in str(e):
                    print(f"Rate limited. Retrying in {wait_time} seconds... (Attempt {attempt + 1}/{retries})")
                    time.sleep(wait_time)
                    wait_time *= 2
                else:
                    raise
        else:
            raise Exception(f"Failed to fetch comments from {submission_url} after {retries} attempts.")


In [5]:
for match, (match_number, match_teams, thread_url) in thread_dict.items():
    print(f"Processing Match {match_number}: {match_teams}")
    
    try:
        filename = f'{match_number}.csv'
        get_comments(thread_url, filename)

    except Exception as e:
        print(f"Error {match_number}: {match_teams}")
        print(e)
    
    print(f"Finished processing Match {match_number}: {match_teams}\n")


Processing Match 1216492: MI vs CSK
Finished processing Match 1216492: MI vs CSK

Processing Match 1216493: DC vs KXIP
Finished processing Match 1216493: DC vs KXIP

Processing Match 1216494: RCB vs SRH
Finished processing Match 1216494: RCB vs SRH

Processing Match 1216495: RR vs CSK
Finished processing Match 1216495: RR vs CSK

Processing Match 1216496: MI vs KKR
Finished processing Match 1216496: MI vs KKR

Processing Match 1216497: KXIP vs RCB
Finished processing Match 1216497: KXIP vs RCB

Processing Match 1216498: DC vs CSK
Finished processing Match 1216498: DC vs CSK

Processing Match 1216499: SRH vs KKR
Finished processing Match 1216499: SRH vs KKR

Processing Match 1216500: KXIP vs RR
Finished processing Match 1216500: KXIP vs RR

Processing Match 1216501: RCB vs MI
Rate limited. Retrying in 5 seconds... (Attempt 1/10)
Error 1216501: RCB vs MI
received 503 HTTP response
Finished processing Match 1216501: RCB vs MI

Processing Match 1216502: SRH vs DC
Finished processing Match 