In [1]:
import time
import httpx
import pandas as pd

# Pieces of the listing address: site root, subreddit path, and sort order
base_url = 'https://www.reddit.com'
endpoint = '/r/NoStupidQuestions'
category = '/hot'

# The listing address with a ".json" suffix, used to fetch posts as JSON
url = f'{base_url}{endpoint}{category}.json'
dataset = []          # accumulates the `data` dict of every fetched post
after_post_id = None  # pagination cursor; None until a page has been read

# Fetch up to 5 pages of posts (at most 100 per page), following the
# `after` cursor returned with each page.
for _ in range(5):
    params = {
        'limit': 100,
        # NOTE(review): per Reddit's API docs `t` is a parameter of the
        # top/controversial sorts; with category '/hot' it is presumably
        # ignored -- confirm before relying on a one-year window.
        't': 'year',
        'after': after_post_id
    }
    response = httpx.get(url, params=params, headers={'User-agent': 'Mozilla/5.0'})
    print(f'Fetching "{response.url}"...')

    # A failed page aborts the whole run: later pages depend on its cursor.
    if response.status_code != 200:
        raise Exception('Failed to fetch posts data')

    json_data = response.json()
    posts = json_data['data']['children']
    dataset.extend(post['data'] for post in posts)

    # Update the `after` parameter for pagination
    after_post_id = json_data['data']['after']

    # No cursor means there is no further page. Continuing would repeat the
    # very first request (which is also sent without a cursor) and append
    # duplicate posts to `dataset`.
    if not after_post_id:
        break

    # Short pause between requests to stay polite to the server.
    time.sleep(0.5)

# Initialize a list to store comments (one flat dict per top-level comment)
comments_dataset = []

# Fetch comments for each post. This is best-effort: a post whose comments
# cannot be fetched is reported and skipped so the rest of the run survives.
for post in dataset:
    post_id = post['id']
    comments_url = f'{base_url}{endpoint}/comments/{post_id}.json'

    try:
        response = httpx.get(comments_url, headers={'User-agent': 'Mozilla/5.0'})
    except httpx.RequestError as exc:
        # Transport errors/timeouts would otherwise abort the whole loop and
        # discard everything collected so far.
        print(f'Failed to fetch comments for post {post_id}: {exc}')
        time.sleep(0.5)
        continue
    print(f'Fetching comments from "{response.url}"...')

    # Pause after every request -- including failed ones -- so that error
    # responses do not turn the loop into back-to-back requests.
    time.sleep(0.5)

    if response.status_code != 200:
        print(f'Failed to fetch comments for post {post_id}')
        continue

    comments_data = response.json()

    # Comments are typically in the second element of the response
    if len(comments_data) <= 1:
        continue

    for comment in comments_data[1]['data']['children']:
        # Keep real comments only (Reddit kind "t1"). Other children, such as
        # "more" placeholders, carry no author/body and would otherwise be
        # stored as empty comment rows.
        if comment.get('kind') != 't1':
            continue
        comment_info = comment['data']
        comments_dataset.append({
            'post_id': post_id,
            'comment_id': comment_info['id'],
            'comment_author': comment_info.get('author', ''),
            'comment_body': comment_info.get('body', ''),
            'comment_score': comment_info.get('score', 0)
        })

# Wrap the collected post and comment records in DataFrames
df_posts = pd.DataFrame(data=dataset)
df_comments = pd.DataFrame(data=comments_dataset)

# Write each frame to its own CSV file, leaving out the index column
df_posts.to_csv(path_or_buf='reddit_posts.csv', index=False)
df_comments.to_csv(path_or_buf='reddit_comments.csv', index=False)

print("Data extraction completed and saved to CSV files.")


Fetching "https://www.reddit.com/r/NoStupidQuestions/hot.json?limit=100&t=year&after="...
Fetching "https://www.reddit.com/r/NoStupidQuestions/hot.json?limit=100&t=year&after=t3_1f80xun"...
Fetching "https://www.reddit.com/r/NoStupidQuestions/hot.json?limit=100&t=year&after=t3_1f71hkh"...
Fetching "https://www.reddit.com/r/NoStupidQuestions/hot.json?limit=100&t=year&after=t3_1f7kyf8"...
Fetching "https://www.reddit.com/r/NoStupidQuestions/hot.json?limit=100&t=year&after=t3_1f7imm5"...
Fetching comments from "https://www.reddit.com/r/NoStupidQuestions/comments/1f6bte8.json"...
Fetching comments from "https://www.reddit.com/r/NoStupidQuestions/comments/1f7ikbf.json"...
Fetching comments from "https://www.reddit.com/r/NoStupidQuestions/comments/1f7tqsh.json"...
Fetching comments from "https://www.reddit.com/r/NoStupidQuestions/comments/1f7rj9h.json"...
Fetching comments from "https://www.reddit.com/r/NoStupidQuestions/comments/1f7jeeg.json"...
Fetching comments from "https://www.reddit.co