---
title: "Data Collection"
format:
    html: 
        code-fold: false
---

{{< include overview.qmd >}} 


{{< include methods.qmd >}} 

# Code

In [None]:
import csv
import json
import praw
import pandas as pd
from datetime import datetime, timezone

# Note: personal API credentials are intentionally left blank here
client_id = client_secret = user_agent = ''

# Build the PRAW client from the (concealed) credentials above
reddit = praw.Reddit(client_id=client_id, client_secret=client_secret, user_agent=user_agent)

# Test if connected
print(reddit.read_only)  # Expected True

# Smoke test against a known subreddit (python); any failure is reported
# instead of raised so the rest of the document can still be evaluated.
try:
    subreddit = reddit.subreddit('python')
    print(f"Successfully connected! Subreddit title: {subreddit.title}")
    print(f"Read-only mode: {reddit.read_only}")  # True Expected
except Exception as err:
    print(f"Connection failed: {err}")

In [None]:
# Load the names of the subreddits to collect from the processed-data CSV
file_path_subreddit = "data/processed-data/subreddits.csv"
df_subreddit = pd.read_csv(file_path_subreddit)
subreddit_list = df_subreddit.loc[:, 'subreddit'].tolist()

# Running totals of collected submissions and comments
submission_count = comment_count = 0

# Accumulates one record (dict) per collected submission
results = []

# Format a Unix epoch timestamp as a UTC year/month label
def convert_timestamp(utc_timestamp):
    """Return the UTC year and month of *utc_timestamp* as ``'YYYY/MM'``.

    The timestamp is interpreted as seconds since the Unix epoch and is
    converted in UTC, so the result does not depend on the local timezone.
    Day and time-of-day information is discarded.
    """
    moment = datetime.fromtimestamp(utc_timestamp, tz=timezone.utc)
    return f"{moment:%Y/%m}"

# Collection limits: posts requested per ranking, comments selected per post,
# and replies followed under each processed comment.
POSTS_PER_RANKING = 10
TOP_COMMENTS_PER_POST = 10
REPLIES_PER_COMMENT = 10


def _reply_count(comment):
    """Return the number of direct replies on *comment* (0 if it has none)."""
    return len(comment.replies) if hasattr(comment, 'replies') else 0


def process_comments(comments, depth=0):
    """Flatten *comments* and their replies into a list of comment records.

    Each comment is emitted immediately before its own replies. At most
    ``REPLIES_PER_COMMENT`` replies are followed under any comment, and
    ``depth`` is counted from the comments passed in (they are depth 0),
    not from the root of the submission thread.

    Defined at module level so that it exists before either ranking loop
    runs. It used to be defined inside the body of the "hot" loop, so a
    subreddit whose hot listing produced no usable post left it undefined
    when the "controversial" loop called it.

    Side effect: increments the module-level ``comment_count`` once per
    emitted record.

    Args:
        comments: iterable of comment objects exposing ``id``, ``body``,
            ``score``, ``created_utc`` and (optionally) ``replies``.
        depth: depth value recorded for the comments in *comments*.

    Returns:
        A list of dicts, one per emitted comment.
    """
    global comment_count
    processed_comments = []

    for comment in comments:
        num_replies = _reply_count(comment)
        processed_comments.append({
            "comment_id": comment.id,
            "body": comment.body,
            "score": comment.score,
            "depth": depth,
            "num_replies": num_replies,
            "created_utc": convert_timestamp(comment.created_utc),
        })
        comment_count += 1

        # Follow only the first few replies to keep each record bounded
        if num_replies > 0:
            replies = list(comment.replies)[:REPLIES_PER_COMMENT]
            processed_comments.extend(process_comments(replies, depth + 1))

    return processed_comments


def _has_meme_flair(submission):
    """Return True when the submission's flair text contains "meme" (any case)."""
    flair = submission.link_flair_text
    return bool(flair) and "meme" in flair.lower()


def _build_submission_record(submission, subreddit_name, sort_type):
    """Return the record for one submission, with an empty comment list."""
    return {
        "subreddit": subreddit_name,
        "post_id": submission.id,
        "title": submission.title,
        "text": submission.selftext,
        "score": submission.score,
        "created_utc": convert_timestamp(submission.created_utc),
        "sort_type": sort_type,
        "comments": [],
    }


def _select_top_comments(submission):
    """Return the submission's comments with the most direct replies.

    NOTE(review): the selection is made over ``submission.comments.list()``,
    which per the PRAW docs is the flattened forest (all depths). A reply
    that is selected here can therefore also be emitted again under its
    parent by ``process_comments``, and is recorded with depth 0. Confirm
    whether that duplication is intended before relying on comment counts.
    """
    # limit=0 drops the "load more comments" placeholders without fetching
    # them (per PRAW docs), so only real comments remain.
    submission.comments.replace_more(limit=0)
    all_comments = submission.comments.list()
    ranked = sorted(all_comments, key=_reply_count, reverse=True)
    return ranked[:TOP_COMMENTS_PER_POST]


# Iterate over each subreddit
for subreddit_name in subreddit_list:
    print(f"Fetching data for subreddit: {subreddit_name}")
    try:
        subreddit = reddit.subreddit(subreddit_name)

        # Store post IDs to avoid collecting a post under both rankings
        seen_post_ids = set()

        # Collect posts by hot ranking first, then by controversial ranking
        for sort_type in ("hot", "controversial"):
            listing = getattr(subreddit, sort_type)(limit=POSTS_PER_RANKING)

            for submission in listing:
                # Skip repeat submission
                if submission.id in seen_post_ids:
                    continue

                # Skip posts with the "meme" flair
                if _has_meme_flair(submission):
                    print(f"Skipping post with 'meme' flair: {submission.title}")
                    continue

                submission_data = _build_submission_record(
                    submission, subreddit_name, sort_type
                )

                seen_post_ids.add(submission.id)
                submission_count += 1

                # Process and store comments for the current submission
                submission_data["comments"] = process_comments(
                    _select_top_comments(submission)
                )
                results.append(submission_data)

    except Exception as e:
        # Best effort: report the failure and move on to the next subreddit
        print(f"Error fetching data for subreddit {subreddit_name}: {e}")

# Print summary of fetched data
print(f"Total posts fetched: {submission_count}")
print(f"Total comments fetched: {comment_count}")

{{< include closing.qmd >}} 