## Reddit Scraping (legal way)

Register app @ https://ssl.reddit.com/prefs/apps/ to scrape data through reddit's API

Objective of scraping: Trends 

Key words: 
- Communities other than Keurig: brew, bean, Nespresso, nespresso, keurig, Keurig, espresso, machine, waste, sustainability, kcups, pod, reusable, traditional, drip, french press, single-serve, single serve, French press, nitro, cold brew, roast, specialty

- Keurig community: Pods, cup, reusable 


Features of posts to be retained:
- Title 
- Post 
- Date 
- URL 
- Score  
- Comments (all comments) 
- Number of comments 

Note:
1. Fill in the relevant client id, secret and user agent
2. Replace the relevant subreddit name 
3. Rename the lists and file name as necessary!

## Espresso extraction try 2

In [5]:
import praw
import csv
import time
from datetime import datetime
import pandas as pd

# Authenticate with Reddit
reddit = praw.Reddit(
    client_id="AxA88TAw9r4lzQrZqUk8hQ",
    client_secret="HUyuEV1pOkn2qnpmGrRxFqc6xMlR4w",
    user_agent="python:espresso (by /u/Fit-Blood1919)"
)

# Access the subreddit
subreddit = reddit.subreddit("espresso")

# List of keywords
keywords = [
    "brew", "bean", "nespresso", "keurig", "espresso", "machine",
    "waste", "sustainability", "kcups", "pod", "reusable",
    "traditional", "drip", "french press", "single-serve",
    "single serve", "k-cup", "roast", "specialty", "French press",
    "nitro", "cold brew"
]

# Files for saving progress
priority_file = "partial_priority_espresso_posts.csv"
non_priority_file = "partial_non_priority_espresso_posts.csv"

# Load previous progress if available
try:
    priority_posts = pd.read_csv(priority_file).to_dict('records')
    non_priority_posts = pd.read_csv(non_priority_file).to_dict('records')
    post_ids = {post['id'] for post in priority_posts + non_priority_posts}
    print(f"Resuming from saved progress: {len(priority_posts)} priority posts, {len(non_priority_posts)} non-priority posts.")
except FileNotFoundError:
    priority_posts, non_priority_posts, post_ids = [], [], set()
    print("No saved progress found. Starting fresh.")

# Helper functions
def contains_keywords(text):
    return any(keyword.lower() in text.lower() for keyword in keywords)

def save_to_csv(file_name, posts):
    header = ['id', 'title', 'body', 'upvotes', 'url', 'created_time', 'num_comments', 'comments', 'contains_keywords']
    with open(file_name, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=header)
        writer.writeheader()
        writer.writerows(posts)

def process_posts(posts, priority_only=False):
    for post in posts:
        if post.id not in post_ids:
            try:
                # Expand comments
                post.comments.replace_more(limit=None)
                comment_bodies = [comment.body for comment in post.comments.list()]
                all_comments = " | ".join(comment_bodies)

                # Check for keywords
                has_keywords = (
                    contains_keywords(post.title)
                    or contains_keywords(post.selftext)
                    or any(contains_keywords(comment) for comment in comment_bodies)
                )

                post_data = {
                    'id': post.id,
                    'title': post.title,
                    'body': post.selftext,
                    'upvotes': post.score,
                    'url': post.url,
                    'created_time': datetime.utcfromtimestamp(post.created_utc).strftime('%Y-%m-%d %H:%M:%S'),
                    'comments': all_comments,
                    'num_comments': len(comment_bodies),
                    'contains_keywords': has_keywords
                }

                # Save to appropriate list
                if has_keywords:
                    priority_posts.append(post_data)
                elif not priority_only:
                    non_priority_posts.append(post_data)

                post_ids.add(post.id)
            except Exception as e:
                print(f"Error processing post {post.id}: {e}")

# Parameters
fetch_limit = 3000
posts_per_request = 100
pause_time = 2
max_posts_to_screen = 2000
checkpoint_interval = 500

# Fetch posts
try:
    screened_posts_count = len(post_ids)  # Start from previously screened count
    print("Starting scraping...")

    # Stage 1: Priority posts
    while len(priority_posts) < fetch_limit and screened_posts_count < max_posts_to_screen:
        posts = subreddit.hot(limit=posts_per_request)
        process_posts(posts, priority_only=True)
        screened_posts_count += posts_per_request

        if screened_posts_count % checkpoint_interval == 0:
            save_to_csv(priority_file, priority_posts)
            print(f"Checkpoint: {screened_posts_count} posts screened. Priority posts: {len(priority_posts)}.")
        time.sleep(pause_time)

    # Stage 2: Additional non-priority posts if needed
    if len(priority_posts) < fetch_limit:
        print("Switching to non-priority posts...")
        while len(priority_posts) + len(non_priority_posts) < fetch_limit:
            posts = subreddit.hot(limit=posts_per_request)
            process_posts(posts, priority_only=False)

            if len(post_ids) % checkpoint_interval == 0:
                save_to_csv(priority_file, priority_posts)
                save_to_csv(non_priority_file, non_priority_posts)
                print(f"Checkpoint: Saved {len(priority_posts)} priority and {len(non_priority_posts)} non-priority posts.")
            time.sleep(pause_time)

except KeyboardInterrupt:
    print("Script interrupted. Saving progress...")
    save_to_csv(priority_file, priority_posts)
    save_to_csv(non_priority_file, non_priority_posts)
    print("Progress saved. Exiting.")

# Final Save
save_to_csv("espresso_priority_posts_3000.csv", priority_posts)
print(f"Saved {len(priority_posts)} priority posts.")

save_to_csv("espresso_all_posts_3000.csv", priority_posts + non_priority_posts)
print(f"Saved {len(priority_posts) + len(non_priority_posts)} total posts.")


Resuming from saved progress: 194 priority posts, 0 non-priority posts.
Starting scraping...


  'created_time': datetime.utcfromtimestamp(post.created_utc).strftime('%Y-%m-%d %H:%M:%S'),


Switching to non-priority posts...
Script interrupted. Saving progress...
Progress saved. Exiting.
Saved 252 priority posts.
Saved 268 total posts.


# Nespresso subreddit

In [6]:
import praw
import csv
import time
from datetime import datetime
import pandas as pd

# Authenticate with Reddit
reddit = praw.Reddit(
    client_id="-tX-5xECMB1d2cgNSs2ysg",
    client_secret="sQlh4Z2M9BBkCwvGFAS08N_6_0Xr2g",
    user_agent="python:nespresso (by /u/Fit-Blood1919)"
)

# Access the subreddit
subreddit = reddit.subreddit("nespresso")

# List of keywords
keywords = [
    "brew", "bean", "nespresso", "keurig", "espresso", "machine",
    "waste", "sustainability", "kcups", "pod", "reusable",
    "traditional", "drip", "french press", "single-serve",
    "single serve", "k-cup", "roast", "specialty", "French press",
    "nitro", "cold brew"
]

# Files for saving progress
priority_file = "partial_priority_nespresso_posts.csv"
non_priority_file = "partial_non_priority_nespresso_posts.csv"

# Load previous progress if available
try:
    priority_posts = pd.read_csv(priority_file).to_dict('records')
    non_priority_posts = pd.read_csv(non_priority_file).to_dict('records')
    post_ids = {post['id'] for post in priority_posts + non_priority_posts}
    print(f"Resuming from saved progress: {len(priority_posts)} priority posts, {len(non_priority_posts)} non-priority posts.")
except FileNotFoundError:
    priority_posts, non_priority_posts, post_ids = [], [], set()
    print("No saved progress found. Starting fresh.")

# Helper functions
def contains_keywords(text):
    return any(keyword.lower() in text.lower() for keyword in keywords)

def save_to_csv(file_name, posts):
    header = ['id', 'title', 'body', 'upvotes', 'url', 'created_time', 'num_comments', 'comments', 'contains_keywords']
    with open(file_name, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=header)
        writer.writeheader()
        writer.writerows(posts)

def process_posts(posts, priority_only=False):
    for post in posts:
        if post.id not in post_ids:
            try:
                # Expand comments
                post.comments.replace_more(limit=None)
                comment_bodies = [comment.body for comment in post.comments.list()]
                all_comments = " | ".join(comment_bodies)

                # Check for keywords
                has_keywords = (
                    contains_keywords(post.title)
                    or contains_keywords(post.selftext)
                    or any(contains_keywords(comment) for comment in comment_bodies)
                )

                post_data = {
                    'id': post.id,
                    'title': post.title,
                    'body': post.selftext,
                    'upvotes': post.score,
                    'url': post.url,
                    'created_time': datetime.utcfromtimestamp(post.created_utc).strftime('%Y-%m-%d %H:%M:%S'),
                    'comments': all_comments,
                    'num_comments': len(comment_bodies),
                    'contains_keywords': has_keywords
                }

                # Save to appropriate list
                if has_keywords:
                    priority_posts.append(post_data)
                elif not priority_only:
                    non_priority_posts.append(post_data)

                post_ids.add(post.id)
            except Exception as e:
                print(f"Error processing post {post.id}: {e}")

# Parameters
fetch_limit = 2000
posts_per_request = 100
pause_time = 2
max_posts_to_screen = 100000
checkpoint_interval = 500

# Fetch posts
try:
    screened_posts_count = len(post_ids)  # Start from previously screened count
    print("Starting scraping...")

    # Stage 1: Priority posts
    while len(priority_posts) < 500 and screened_posts_count < max_posts_to_screen:
        posts = subreddit.hot(limit=posts_per_request)
        process_posts(posts, priority_only=True)
        screened_posts_count += posts_per_request

        if screened_posts_count % checkpoint_interval == 0:
            save_to_csv(priority_file, priority_posts)
            print(f"Checkpoint: {screened_posts_count} posts screened. Priority posts: {len(priority_posts)}.")
        time.sleep(pause_time)

    # Stage 2: Additional non-priority posts if needed
    if len(priority_posts) < fetch_limit:
        print("Switching to non-priority posts...")
        while len(priority_posts) + len(non_priority_posts) < fetch_limit:
            posts = subreddit.hot(limit=posts_per_request)
            process_posts(posts, priority_only=False)

            if len(post_ids) % checkpoint_interval == 0:
                save_to_csv(priority_file, priority_posts)
                save_to_csv(non_priority_file, non_priority_posts)
                print(f"Checkpoint: Saved {len(priority_posts)} priority and {len(non_priority_posts)} non-priority posts.")
            time.sleep(pause_time)

except KeyboardInterrupt:
    print("Script interrupted. Saving progress...")
    save_to_csv(priority_file, priority_posts)
    save_to_csv(non_priority_file, non_priority_posts)
    print("Progress saved. Exiting.")

# Final Save
save_to_csv("nespresso_priority_posts_3000.csv", priority_posts)
print(f"Saved {len(priority_posts)} priority posts.")

save_to_csv("nespresso_all_posts_3000.csv", priority_posts + non_priority_posts)
print(f"Saved {len(priority_posts) + len(non_priority_posts)} total posts.")


No saved progress found. Starting fresh.
Starting scraping...


  'created_time': datetime.utcfromtimestamp(post.created_utc).strftime('%Y-%m-%d %H:%M:%S'),


Checkpoint: 500 posts screened. Priority posts: 90.
Checkpoint: 1000 posts screened. Priority posts: 90.
Checkpoint: 1500 posts screened. Priority posts: 90.
Checkpoint: 2000 posts screened. Priority posts: 90.
Checkpoint: 2500 posts screened. Priority posts: 90.
Checkpoint: 3000 posts screened. Priority posts: 90.
Checkpoint: 3500 posts screened. Priority posts: 90.
Checkpoint: 4000 posts screened. Priority posts: 90.
Checkpoint: 4500 posts screened. Priority posts: 90.
Checkpoint: 5000 posts screened. Priority posts: 90.
Checkpoint: 5500 posts screened. Priority posts: 90.
Checkpoint: 6000 posts screened. Priority posts: 90.
Checkpoint: 6500 posts screened. Priority posts: 90.
Checkpoint: 7000 posts screened. Priority posts: 90.
Checkpoint: 7500 posts screened. Priority posts: 90.
Checkpoint: 8000 posts screened. Priority posts: 90.
Checkpoint: 8500 posts screened. Priority posts: 90.
Checkpoint: 9000 posts screened. Priority posts: 90.
Checkpoint: 9500 posts screened. Priority posts

# James Hoffman (content creator)

In [8]:
import praw
import csv
import time
from datetime import datetime
import pandas as pd

# Authenticate with Reddit
reddit = praw.Reddit(
    client_id="xXTkGHZ2iN_dO47QiHTA7w",
    client_secret="3wVSZ7sNP5wezm0kKxW7Wtu67DnA6w",
    user_agent="python:JamesHoffman (by /u/Fit-Blood1919)"
)

# Access the subreddit
subreddit = reddit.subreddit("JamesHoffmann")

# List of keywords
keywords = [
    "brew", "bean", "nespresso", "keurig", "espresso", "machine",
    "waste", "sustainability", "kcups", "pod", "reusable",
    "traditional", "drip", "french press", "single-serve",
    "single serve", "k-cup", "roast", "specialty", "French press",
    "nitro", "cold brew"
]

# Files for saving progress
priority_file = "partial_priority_jameshoff_posts.csv"
non_priority_file = "partial_non_priority_jameshoff_posts.csv"

# Load previous progress if available
try:
    priority_posts = pd.read_csv(priority_file).to_dict('records')
    non_priority_posts = pd.read_csv(non_priority_file).to_dict('records')
    post_ids = {post['id'] for post in priority_posts + non_priority_posts}
    print(f"Resuming from saved progress: {len(priority_posts)} priority posts, {len(non_priority_posts)} non-priority posts.")
except FileNotFoundError:
    priority_posts, non_priority_posts, post_ids = [], [], set()
    print("No saved progress found. Starting fresh.")

# Helper functions
def contains_keywords(text):
    return any(keyword.lower() in text.lower() for keyword in keywords)

def save_to_csv(file_name, posts):
    header = ['id', 'title', 'body', 'upvotes', 'url', 'created_time', 'num_comments', 'comments', 'contains_keywords']
    with open(file_name, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=header)
        writer.writeheader()
        writer.writerows(posts)

def process_posts(posts, priority_only=False):
    for post in posts:
        if post.id not in post_ids:
            try:
                # Expand comments
                post.comments.replace_more(limit=None)
                comment_bodies = [comment.body for comment in post.comments.list()]
                all_comments = " | ".join(comment_bodies)

                # Check for keywords
                has_keywords = (
                    contains_keywords(post.title)
                    or contains_keywords(post.selftext)
                    or any(contains_keywords(comment) for comment in comment_bodies)
                )

                post_data = {
                    'id': post.id,
                    'title': post.title,
                    'body': post.selftext,
                    'upvotes': post.score,
                    'url': post.url,
                    'created_time': datetime.utcfromtimestamp(post.created_utc).strftime('%Y-%m-%d %H:%M:%S'),
                    'comments': all_comments,
                    'num_comments': len(comment_bodies),
                    'contains_keywords': has_keywords
                }

                # Save to appropriate list
                if has_keywords:
                    priority_posts.append(post_data)
                elif not priority_only:
                    non_priority_posts.append(post_data)

                post_ids.add(post.id)
            except Exception as e:
                print(f"Error processing post {post.id}: {e}")

# Parameters
fetch_limit = 3000
posts_per_request = 100
pause_time = 2
max_posts_to_screen = 100000
checkpoint_interval = 500

# Fetch posts
try:
    screened_posts_count = len(post_ids)  # Start from previously screened count
    print("Starting scraping...")

    # Stage 1: Priority posts
    while len(priority_posts) < fetch_limit and screened_posts_count < max_posts_to_screen:
        posts = subreddit.hot(limit=posts_per_request)
        process_posts(posts, priority_only=True)
        screened_posts_count += posts_per_request

        if screened_posts_count % checkpoint_interval == 0:
            save_to_csv(priority_file, priority_posts)
            print(f"Checkpoint: {screened_posts_count} posts screened. Priority posts: {len(priority_posts)}.")
        time.sleep(pause_time)

    # Stage 2: Additional non-priority posts if needed
    if len(priority_posts) < fetch_limit:
        print("Switching to non-priority posts...")
        while len(priority_posts) + len(non_priority_posts) < fetch_limit:
            posts = subreddit.hot(limit=posts_per_request)
            process_posts(posts, priority_only=False)

            if len(post_ids) % checkpoint_interval == 0:
                save_to_csv(priority_file, priority_posts)
                save_to_csv(non_priority_file, non_priority_posts)
                print(f"Checkpoint: Saved {len(priority_posts)} priority and {len(non_priority_posts)} non-priority posts.")
            time.sleep(pause_time)

except KeyboardInterrupt:
    print("Script interrupted. Saving progress...")
    save_to_csv(priority_file, priority_posts)
    save_to_csv(non_priority_file, non_priority_posts)
    print("Progress saved. Exiting.")

# Final Save
save_to_csv("jameshoff_priority_posts_3000.csv", priority_posts)
print(f"Saved {len(priority_posts)} priority posts.")

save_to_csv("jameshoff_all_posts_3000.csv", priority_posts + non_priority_posts)
print(f"Saved {len(priority_posts) + len(non_priority_posts)} total posts.")


Resuming from saved progress: 86 priority posts, 0 non-priority posts.
Starting scraping...


  'created_time': datetime.utcfromtimestamp(post.created_utc).strftime('%Y-%m-%d %H:%M:%S'),


Switching to non-priority posts...
Script interrupted. Saving progress...
Progress saved. Exiting.
Saved 89 priority posts.
Saved 89 total posts.


# Pourover webscrape

In [9]:
import praw
import csv
import time
from datetime import datetime
import pandas as pd

# Authenticate with Reddit
reddit = praw.Reddit(
    client_id="59_GDgLd2eBJ7RQTe4HgFQ",
    client_secret="Wsp5z2g7gos8M38vTU74uWA_zn4smg",
    user_agent="python:pourover (by /u/granola-cookies)"
)

# Access the subreddit
subreddit = reddit.subreddit("pourover")

# List of keywords
keywords = [
    "brew", "bean", "nespresso", "keurig", "espresso", "machine",
    "waste", "sustainability", "kcups", "pod", "reusable",
    "traditional", "drip", "french press", "single-serve",
    "single serve", "k-cup", "roast", "specialty", "French press",
    "nitro", "cold brew"
]

# Files for saving progress
priority_file = "partial_priority_pourover_posts.csv"
non_priority_file = "partial_non_priority_pourover_posts.csv"

# Load previous progress if available
try:
    priority_posts = pd.read_csv(priority_file).to_dict('records')
    non_priority_posts = pd.read_csv(non_priority_file).to_dict('records')
    post_ids = {post['id'] for post in priority_posts + non_priority_posts}
    print(f"Resuming from saved progress: {len(priority_posts)} priority posts, {len(non_priority_posts)} non-priority posts.")
except FileNotFoundError:
    priority_posts, non_priority_posts, post_ids = [], [], set()
    print("No saved progress found. Starting fresh.")

# Helper functions
def contains_keywords(text):
    return any(keyword.lower() in text.lower() for keyword in keywords)

def save_to_csv(file_name, posts):
    header = ['id', 'title', 'body', 'upvotes', 'url', 'created_time', 'num_comments', 'comments', 'contains_keywords']
    with open(file_name, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=header)
        writer.writeheader()
        writer.writerows(posts)

def process_posts(posts, priority_only=False):
    for post in posts:
        if post.id not in post_ids:
            try:
                # Expand comments
                post.comments.replace_more(limit=None)
                comment_bodies = [comment.body for comment in post.comments.list()]
                all_comments = " | ".join(comment_bodies)

                # Check for keywords
                has_keywords = (
                    contains_keywords(post.title)
                    or contains_keywords(post.selftext)
                    or any(contains_keywords(comment) for comment in comment_bodies)
                )

                post_data = {
                    'id': post.id,
                    'title': post.title,
                    'body': post.selftext,
                    'upvotes': post.score,
                    'url': post.url,
                    'created_time': datetime.utcfromtimestamp(post.created_utc).strftime('%Y-%m-%d %H:%M:%S'),
                    'comments': all_comments,
                    'num_comments': len(comment_bodies),
                    'contains_keywords': has_keywords
                }

                # Save to appropriate list
                if has_keywords:
                    priority_posts.append(post_data)
                elif not priority_only:
                    non_priority_posts.append(post_data)

                post_ids.add(post.id)
            except Exception as e:
                print(f"Error processing post {post.id}: {e}")

# Parameters
fetch_limit = 3000
posts_per_request = 100
pause_time = 2
max_posts_to_screen = 100000
checkpoint_interval = 500

# Fetch posts
try:
    screened_posts_count = len(post_ids)  # Start from previously screened count
    print("Starting scraping...")

    # Stage 1: Priority posts
    while len(priority_posts) < fetch_limit and screened_posts_count < max_posts_to_screen:
        posts = subreddit.hot(limit=posts_per_request)
        process_posts(posts, priority_only=True)
        screened_posts_count += posts_per_request

        if screened_posts_count % checkpoint_interval == 0:
            save_to_csv(priority_file, priority_posts)
            print(f"Checkpoint: {screened_posts_count} posts screened. Priority posts: {len(priority_posts)}.")
        time.sleep(pause_time)

    # Stage 2: Additional non-priority posts if needed
    if len(priority_posts) < fetch_limit:
        print("Switching to non-priority posts...")
        while len(priority_posts) + len(non_priority_posts) < fetch_limit:
            posts = subreddit.hot(limit=posts_per_request)
            process_posts(posts, priority_only=False)

            if len(post_ids) % checkpoint_interval == 0:
                save_to_csv(priority_file, priority_posts)
                save_to_csv(non_priority_file, non_priority_posts)
                print(f"Checkpoint: Saved {len(priority_posts)} priority and {len(non_priority_posts)} non-priority posts.")
            time.sleep(pause_time)

except KeyboardInterrupt:
    print("Script interrupted. Saving progress...")
    save_to_csv(priority_file, priority_posts)
    save_to_csv(non_priority_file, non_priority_posts)
    print("Progress saved. Exiting.")

# Final Save
save_to_csv("pourover_priority_posts_3000.csv", priority_posts)
print(f"Saved {len(priority_posts)} priority posts.")

save_to_csv("pourover_all_posts_3000.csv", priority_posts + non_priority_posts)
print(f"Saved {len(priority_posts) + len(non_priority_posts)} total posts.")


No saved progress found. Starting fresh.
Starting scraping...


  'created_time': datetime.utcfromtimestamp(post.created_utc).strftime('%Y-%m-%d %H:%M:%S'),


Checkpoint: 500 posts screened. Priority posts: 96.
Checkpoint: 1000 posts screened. Priority posts: 96.
Checkpoint: 1500 posts screened. Priority posts: 96.
Checkpoint: 2000 posts screened. Priority posts: 99.
Checkpoint: 2500 posts screened. Priority posts: 99.
Checkpoint: 3000 posts screened. Priority posts: 100.
Checkpoint: 3500 posts screened. Priority posts: 100.
Checkpoint: 4000 posts screened. Priority posts: 100.
Checkpoint: 4500 posts screened. Priority posts: 100.
Checkpoint: 5000 posts screened. Priority posts: 100.
Checkpoint: 5500 posts screened. Priority posts: 100.
Checkpoint: 6000 posts screened. Priority posts: 100.
Checkpoint: 6500 posts screened. Priority posts: 100.
Checkpoint: 7000 posts screened. Priority posts: 100.
Checkpoint: 7500 posts screened. Priority posts: 100.
Checkpoint: 8000 posts screened. Priority posts: 100.
Checkpoint: 8500 posts screened. Priority posts: 100.
Checkpoint: 9000 posts screened. Priority posts: 100.
Checkpoint: 9500 posts screened. P

Pushshift access is now restricted to moderators. No longer an option for webscrape. However, if we want old data from pushshift, code template from
https://github.com/Watchful1/PushshiftDumps/tree/master 

#### Test code with separate subreddit

In [8]:
import praw
import csv
import time
from datetime import datetime

# List of 2 different API credentials
api_credentials = [
    {
        "client_id": "RwYC7skcILM5paQFQd50fQ",
        "client_secret": "T_Qo40-37JL3cPu5MunSsMWICOf8DQ",
        "user_agent": "python:climbingv1.0 (by /u/granola-cookies)"
    },
    {
        "client_id": "DPNTcZUVX6yOxJ30K5L_RQ",
        "client_secret": "rw4ah7szqMrd7wpnI8zk8Q_wkiXYTQ",
        "user_agent": "python:climbingv2.0 (by /u/granola-cookies)"
    }
]

# Initialize subreddit
subreddit = "Coffee"
keywords = ["brew", "bean", "Keurig", "espresso", "machine", "waste", "sustainability", "kcups", "pod", 
            "reusable", "traditional", "drip", "French press", "single-serve"]

# List to store posts
coffee_posts = []
post_ids = set()  # To track unique post IDs and avoid duplicates

# Variable to track pagination (after parameter)
after = None

# Function to check if a post/comment contains any of the keywords
def contains_keywords(text):
    return any(keyword.lower() in text.lower() for keyword in keywords)

# Function to process and add posts
def process_posts(posts, include_non_keyword=False):
    global coffee_posts, post_ids  # Make sure these lists are updated globally
    for post in posts:
        if post.id not in post_ids:  # Check for duplicates
            try:
                # Check if the post title or body contains any of the keywords
                match = contains_keywords(post.title) or contains_keywords(post.selftext)

                # If it's a keyword match, include it, otherwise check if we're including non-matching posts
                if match or (include_non_keyword and not match):
                    post.comments.replace_more(limit=None)  # Expand comments
                    comment_bodies = [comment.body for comment in post.comments.list()]
                    all_comments = " | ".join(comment_bodies)

                    # Check if any comment contains the keywords (optional)
                    if any(contains_keywords(comment) for comment in comment_bodies):
                        coffee_posts.append({
                            'title': post.title,
                            'body': post.selftext,
                            'upvotes': post.score,
                            'url': post.url,
                            'created_time': datetime.utcfromtimestamp(post.created_utc).strftime('%Y-%m-%d %H:%M:%S'),
                            'comments': all_comments,
                            'num_comments': len(comment_bodies)
                        })
                        post_ids.add(post.id)  # Mark the post as processed
            except Exception as e:
                print(f"Error processing post {post.id}: {e}")

# Function to switch between API credentials
def get_reddit_instance(credentials):
    return praw.Reddit(
        client_id=credentials["client_id"],
        client_secret=credentials["client_secret"],
        user_agent=credentials["user_agent"]
    )

# Fetch posts using different API credentials and pagination
def fetch_posts():
    global after, current_api_index  # Ensure these are updated globally
    target_post_count = 2000  # Set the target post count to 2000

    while len(coffee_posts) < target_post_count:
        try:
            reddit = get_reddit_instance(api_credentials[current_api_index])
            subreddit_instance = reddit.subreddit(subreddit)

            # Search for posts using a broad query (will filter by keywords later)
            subreddit_posts = list(subreddit_instance.search(' '.join(keywords), limit=100, params={"after": after}))

            # Process the fetched posts (filter by keywords first)
            process_posts(subreddit_posts, include_non_keyword=True)  # Process posts without keywords if necessary

            # Update 'after' for pagination
            if subreddit_posts:
                after = subreddit_posts[-1].name
            else:
                break  # Stop if no more posts

            # Rotate API credentials (to prevent hitting rate limits)
            current_api_index = (current_api_index + 1) % len(api_credentials)

            time.sleep(1)  # Add delay to prevent hitting the rate limit
        except praw.exceptions.APIException as e:
            if "TooManyRequests" in str(e):
                print("Rate limit exceeded. Waiting for 60 seconds...")
                time.sleep(60)
            else:
                raise
        except Exception as e:
            print(f"Unexpected error: {e}")
            break

# Initialize current_api_index globally
current_api_index = 0  # Set initial API index to 0 (use first API credentials)

# Fetch posts with the main function
fetch_posts()

# Save the posts data to a CSV file
csv_file = 'coffee_posts.csv'
header = ['title', 'body', 'upvotes', 'url', 'created_time', 'num_comments', 'comments']

# Open the CSV file for writing
with open(csv_file, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.DictWriter(file, fieldnames=header)
    writer.writeheader()  # Write header
    for post in coffee_posts:
        writer.writerow(post)  # Write each post's data

print(f"Fetched {len(coffee_posts)} posts and saved them to {csv_file}.")


  'created_time': datetime.utcfromtimestamp(post.created_utc).strftime('%Y-%m-%d %H:%M:%S'),


Fetched 214 posts and saved them to coffee_posts.csv.
