In [8]:
import os
import time
import csv
from datetime import datetime, timezone

import pandas as pd
import praw
from dotenv import load_dotenv
from prawcore.exceptions import RequestException, ResponseException

In [9]:
# Load the list of thread URLs: the CSV has no header row, so the single
# column is named 'thread' explicitly.
threads_df = pd.read_csv('2024.csv', header=None, names=['thread'])

# Map a 1-based match number to [number as text, display label, thread URL].
# The match number is derived from the row's index label (label + 1).
thread_dict = {
    row_label + 1: [str(row_label + 1), f"Match {row_label + 1}", thread_url]
    for row_label, thread_url in threads_df['thread'].items()
}

In [10]:
# Pull variables from a local .env file into the process environment, then
# build the Reddit client from them. A variable that is not set is passed to
# praw.Reddit as None.
load_dotenv()
reddit = praw.Reddit(**{
    'client_id': os.getenv('CLIENT_ID'),
    'client_secret': os.getenv('CLIENT_SECRET'),
    'user_agent': os.getenv('USER_AGENT'),
})

In [None]:

def get_comments(submission_url, filename, retries=10, initial_wait=5):
    """Save the top-level comments of a Reddit thread to a CSV file.

    The comments are fetched first and only written to
    ``reddit_threads/<filename>`` once the fetch has fully succeeded, so a
    failed download never leaves a header-only or partial CSV behind (callers
    that treat an existing file as "already downloaded" would otherwise skip
    the thread forever).

    Args:
        submission_url: URL of the Reddit submission to read.
        filename: Name of the CSV file to create inside ``reddit_threads``.
        retries: Maximum number of fetch attempts when rate limited.
        initial_wait: Seconds to wait after the first rate-limited attempt;
            the wait doubles after every further rate-limited attempt.

    Raises:
        RequestException, ResponseException: Re-raised immediately when the
            failure is not a rate limit (HTTP 429).
        Exception: When every attempt was rate limited.
    """
    os.makedirs('reddit_threads', exist_ok=True)
    filepath = os.path.join('reddit_threads', filename)

    submission = reddit.submission(url=submission_url)
    submission.comment_sort = 'old'

    wait_time = initial_wait
    last_error = None

    for attempt in range(retries):
        try:
            # Expand every "load more comments" placeholder so that iterating
            # the forest below yields real comments only.
            submission.comments.replace_more(limit=None)

            rows = [
                [
                    datetime.fromtimestamp(comment.created_utc, timezone.utc).isoformat(),
                    comment.body,
                    comment.score,
                ]
                for comment in submission.comments
            ]
            break

        except (RequestException, ResponseException) as e:
            # Prefer the real HTTP status when the exception carries a
            # response; fall back to the message text otherwise.
            response = getattr(e, 'response', None)
            if response is not None:
                rate_limited = getattr(response, 'status_code', None) == 429
            else:
                rate_limited = '429' in str(e)

            if not rate_limited:
                raise

            last_error = e
            # Do not sleep after the final attempt: there is nothing left to
            # retry, and the doubled wait can be very long by then.
            if attempt + 1 < retries:
                print(f"Rate limited. Retrying in {wait_time} seconds... (Attempt {attempt + 1}/{retries})")
                time.sleep(wait_time)
                wait_time *= 2
    else:
        raise Exception(
            f"Failed to fetch comments from {submission_url} after {retries} attempts."
        ) from last_error

    # Write to a temporary sibling file and move it into place so the final
    # path only ever holds a complete CSV.
    temp_path = filepath + '.part'
    try:
        with open(temp_path, 'w', newline='', encoding='utf-8') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow(['Timestamp', 'Comment', 'Upvotes'])
            writer.writerows(rows)
        os.replace(temp_path, filepath)
    except BaseException:
        if os.path.exists(temp_path):
            os.remove(temp_path)
        raise


In [12]:
# Download each thread's comments once. A CSV that already exists is treated
# as a completed download and skipped, so the cell can be re-run after an
# interruption to pick up where it left off.
for match_number, match_teams, thread_url in thread_dict.values():
    filename = f'{match_number}.csv'
    filepath = os.path.join('reddit_threads', filename)

    print(f"Processing Match {match_number}: {match_teams}")

    if os.path.exists(filepath):
        print(f"Skipping - file already exists for Match {match_number}: {match_teams}\n")
        continue

    try:
        get_comments(thread_url, filename)
    except Exception as e:
        print(f"Error {match_number}: {match_teams}")
        print(f"{e}\n")
        # A failed download must not leave a file behind, otherwise the
        # exists-check above would skip this match on the next run.
        if os.path.exists(filepath):
            os.remove(filepath)
        # Skip the success message below for a match that failed.
        continue

    print(f"Finished processing Match {match_number}: {match_teams}\n")


Processing Match 1: Match 1
Skipping - file already exists for Match 1: Match 1

Processing Match 2: Match 2
Skipping - file already exists for Match 2: Match 2

Processing Match 3: Match 3
Skipping - file already exists for Match 3: Match 3

Processing Match 4: Match 4
Skipping - file already exists for Match 4: Match 4

Processing Match 5: Match 5
Rate limited. Retrying in 5 seconds... (Attempt 1/10)
Finished processing Match 5: Match 5

Processing Match 6: Match 6
Rate limited. Retrying in 5 seconds... (Attempt 1/10)
Finished processing Match 6: Match 6

Processing Match 7: Match 7
Rate limited. Retrying in 5 seconds... (Attempt 1/10)
Finished processing Match 7: Match 7

Processing Match 8: Match 8
Rate limited. Retrying in 5 seconds... (Attempt 1/10)
Finished processing Match 8: Match 8

Processing Match 9: Match 9
Finished processing Match 9: Match 9

Processing Match 10: Match 10
Rate limited. Retrying in 5 seconds... (Attempt 1/10)
Finished processing Match 10: Match 10

Proces