In [8]:
import praw
from collections import defaultdict
from datetime import datetime
import pandas as pd

In [9]:
import logging

# Tag placed on the handler installed by this cell so that a later re-run can
# recognise and replace it instead of stacking a second copy.
_PRAW_DEBUG_HANDLER_NAME = "praw_debug_stream"


def enable_praw_debug_logging(logger_names=("praw", "prawcore")):
    """Send DEBUG-level records from the named loggers to one stream handler.

    The call is idempotent: any handler previously installed by this function
    is detached before the new one is attached. Without that, every re-run of
    the cell adds another handler and each log record is printed once per run.

    Args:
        logger_names: Names of the loggers to configure.

    Returns:
        The ``logging.StreamHandler`` that is now attached to every named
        logger.
    """
    stream_handler = logging.StreamHandler()
    stream_handler.setLevel(logging.DEBUG)
    stream_handler.name = _PRAW_DEBUG_HANDLER_NAME
    for logger_name in logger_names:
        target = logging.getLogger(logger_name)
        target.setLevel(logging.DEBUG)
        # Copy the handler list before mutating it.
        for stale in list(target.handlers):
            if getattr(stale, "name", None) == _PRAW_DEBUG_HANDLER_NAME:
                target.removeHandler(stale)
        target.addHandler(stream_handler)
    return stream_handler


# NOTE: handlers attached by runs of the earlier, non-idempotent version of this
# cell carry no tag; restart the kernel once to get rid of them.
handler = enable_praw_debug_logging()

In [10]:
import os

# SECURITY: the Reddit app credentials used to be written here as string
# literals. Anything that was committed or shared in that form should be treated
# as compromised and regenerated. They are now read from the environment so the
# notebook file itself never contains them.
try:
    client_id = os.environ["REDDIT_CLIENT_ID"]
    client_secret = os.environ["REDDIT_CLIENT_SECRET"]
except KeyError as missing_var:
    # Fail here with an actionable message rather than later inside the client.
    raise RuntimeError(
        f"Environment variable {missing_var.args[0]} is not set; export "
        "REDDIT_CLIENT_ID and REDDIT_CLIENT_SECRET before starting the kernel."
    ) from missing_var

In [11]:
# Create the PRAW client from the app credentials defined in the previous cell.
# Only client id, client secret and user agent are passed.
reddit = praw.Reddit(
    user_agent='Windows:reddit_bitcoin_scraper:v1 (by /u/Critical_Product_103)',
    client_secret=client_secret,
    client_id=client_id,
)

In [12]:
# Subreddit names that the notebook searches as one combined listing.
subreddits_of_interest = [
    "bitcoin",
    "btc",
    "cryptocurrency",
    "cryptomarkets",
]

In [13]:
# Combine the subreddit names into a single "+"-separated string
# (e.g. "bitcoin+btc+..."), which is what gets passed to reddit.subreddit().
query = str.join("+", subreddits_of_interest)

In [14]:
# Handle for the combined subreddits; `query` must still hold the "+"-joined
# subreddit names when this cell runs.
subreddits = reddit.subreddit(display_name=query)

In [15]:
# Search terms for the submission search.
# NOTE(review): this rebinds `query`, which an earlier cell used for the
# "+"-joined subreddit names. Re-running the `reddit.subreddit(query)` cell after
# this one would pass the search terms as a subreddit name.
query = " OR ".join(("bitcoin", "btc"))

In [17]:
# Tunables for this cell, named so they are easy to find and change.
SEARCH_LIMIT = 5               # number of search results to process
MAX_COMMENTS_PER_POST = 200    # comment bodies kept per submission
BODY_PREVIEW_CHARS = 500       # characters of the body shown when printing
COMMENT_PREVIEW_COUNT = 5      # comments shown per post when printing
LINK_POST_PLACEHOLDER = "No text body (link post)"


def _utc_date_key(created_utc):
    """Format an epoch-seconds timestamp as a 'YYYY-MM-DD' string in UTC."""
    return pd.to_datetime(created_utc, unit='s', utc=True).strftime('%Y-%m-%d')


def _build_post_record(submission, max_comments=MAX_COMMENTS_PER_POST):
    """Collect title, body and comment bodies of one submission into a dict.

    Args:
        submission: Search result exposing ``title``, ``selftext`` and
            ``comments`` (with ``replace_more`` and ``list``).
        max_comments: Upper bound on the number of comment bodies kept.

    Returns:
        dict with keys ``"title"``, ``"body"`` and ``"comments"``; ``"body"``
        falls back to a placeholder string when ``selftext`` is empty.
    """
    # limit=None asks PRAW to resolve every "more comments" placeholder, which
    # can cost many extra API requests on large threads.
    submission.comments.replace_more(limit=None)

    # Keep the first `max_comments` entries of the list that comments.list()
    # returns.
    comment_bodies = [
        comment.body for comment in submission.comments.list()[:max_comments]
    ]
    return {
        "title": submission.title,
        "body": submission.selftext or LINK_POST_PLACEHOLDER,
        "comments": comment_bodies,
    }


# Maps a 'YYYY-MM-DD' date string to the list of post records created that day.
posts_by_date = defaultdict(list)

for submission in subreddits.search(query, limit=SEARCH_LIMIT):
    # The record is built before touching the dict, so a failure while fetching
    # comments cannot leave an empty list behind for that date.
    date_key = _utc_date_key(submission.created_utc)
    record = _build_post_record(submission)
    posts_by_date[date_key].append(record)

# Print a compact preview. The loop variables are named differently from the
# fetch loop so the submission object is not shadowed by a plain dict.
for date_key, records in posts_by_date.items():
    print(f"Date: {date_key}")
    for position, record in enumerate(records, start=1):
        print(f"  Post {position}: {record['title']}")
        print(f"    Body: {record['body'][:BODY_PREVIEW_CHARS]}")
        print(f"    Top {len(record['comments'])} Comments: {record['comments'][:COMMENT_PREVIEW_COUNT]}")
    print("-" * 80)

Fetching: GET https://oauth.reddit.com/r/bitcoin+btc+cryptocurrency+cryptomarkets/search/ at 1741525450.3914897
Fetching: GET https://oauth.reddit.com/r/bitcoin+btc+cryptocurrency+cryptomarkets/search/ at 1741525450.3914897
Data: None
Data: None
Params: {'limit': 5,
 'q': 'bitcoin OR btc',
 'raw_json': 1,
 'restrict_sr': True,
 'sort': 'relevance',
 'syntax': 'lucene',
 't': 'all'}
Params: {'limit': 5,
 'q': 'bitcoin OR btc',
 'raw_json': 1,
 'restrict_sr': True,
 'sort': 'relevance',
 'syntax': 'lucene',
 't': 'all'}
Response: 200 (5277 bytes) (rst-348:rem-973.0:used-27 ratelimit) at 1741525450.9032567
Response: 200 (5277 bytes) (rst-348:rem-973.0:used-27 ratelimit) at 1741525450.9032567
Fetching: GET https://oauth.reddit.com/comments/1h6ykx6/ at 1741525450.9040217
Fetching: GET https://oauth.reddit.com/comments/1h6ykx6/ at 1741525450.9040217
Data: None
Data: None
Params: {'limit': 2048, 'raw_json': 1, 'sort': 'confidence'}
Params: {'limit': 2048, 'raw_json': 1, 'sort': 'confidence'}


Date: 2024-12-05
  Post 1: Bitcoin Hit 100k
    Body: No text body (link post)
    Top 200 Comments: ["Congrats on the most upvoted $100k post OP! Let's get this sucker to #1 on r/all!\n\n7 years ago, we hit #1 on r/all when [we passed $10k.](https://www.reddit.com/r/Bitcoin/comments/7g9cd3/its_official_1_bitcoin_10000_usd/)\n\n$1 million in 2031?\n\nEdit: We made it to #1 on r/all!!! Congrats to all the holders!!!", 'Dec 4th, 2024!  We were here.', 'This is history! We were all here. Lfg!', 'And it’s passed 101,000 already 🙂', 'I’m laying on the same futon I was when I watched mtgox prices crash to $91 and the bear whale tore up 30k btc sell wall']
--------------------------------------------------------------------------------
Date: 2025-01-27
  Post 1: 3 BTC to quit? 🤔
    Body: No text body (link post)
    Top 200 Comments: ['In a heart beat.', 'I would then I’d just go get another job and pocket those 3 bitcoin until they hit 1 mill each. I’m an electrician by trade so I don’t nee

In [18]:
# Flatten the date -> posts mapping into one row per post, then build the frame.
flattened_rows = (
    dict(date=day, title=entry["title"], body=entry["body"], comments=entry["comments"])
    for day, entries in posts_by_date.items()
    for entry in entries
)
df = pd.DataFrame(list(flattened_rows))


In [19]:
# Write the frame to a Parquet file (pyarrow engine), leaving the index out.
df.to_parquet(
    "sample.parquet",
    index=False,
    engine="pyarrow",
)

In [20]:
# Show only the first rows so the output stays small if the search limit grows.
df.head()

Unnamed: 0,date,title,body,comments
0,2024-12-05,Bitcoin Hit 100k,No text body (link post),[Congrats on the most upvoted $100k post OP! L...
1,2025-01-27,3 BTC to quit? 🤔,No text body (link post),"[In a heart beat., I would then I’d just go ge..."
2,2025-02-15,Me In 2009 Instead of Buying Bitcoin (BTC),No text body (link post),[No one was buying bitcoin in 2009. I bought i...
3,2024-11-21,Sold .16 BTC to pay off my car! 🧘🐋🥂,"Was contemplating doing this transaction, but ...",[This is the real purpose of bitcoin. Freedom!...
4,2025-02-20,This 12-year-old Kid “Erik Finman” in Idaho bo...,No text body (link post),[Teen Bitcoin Expert\n\nQualifications: Gamble...


In [21]:
# Snapshot of the client's rate-limit counters; the recorded output shows the
# keys 'remaining', 'reset_timestamp' and 'used'.
rate_limit_info = reddit.auth.limits
print(f"{rate_limit_info}")

{'remaining': 960.0, 'reset_timestamp': 1741526399.834681, 'used': 40}


In [22]:
# Show when the rate-limit window resets, in Singapore local time.
# `reset_timestamp` is read as epoch seconds in UTC and then converted to
# Asia/Singapore for display.
reset_epoch = rate_limit_info['reset_timestamp']
if reset_epoch is None:
    # PRAW documents that the limit values stay None until a request has been
    # made; formatting a missing timestamp would fail.
    reset_time = None
    print("Rate limit resets at (SGT): unknown (no request has been made yet)")
else:
    reset_time = pd.to_datetime(reset_epoch, unit="s", utc=True).tz_convert("Asia/Singapore")
    print("Rate limit resets at (SGT):", reset_time.strftime('%Y-%m-%d %H:%M:%S %Z'))

Rate limit resets at (SGT): 2025-03-09 21:19:59 +08
