# **Reddit API Test**

* Add count of posts per subreddit

In [30]:
import praw
import pandas as pd
import time
import json
import sys
import asyncpraw

In [31]:
# Make the directory that contains credentials.py importable.
# The location can be overridden with the CREDENTIALS_DIR environment variable;
# when it is not set, the original local path is used as the fallback.
import os

CREDENTIALS_DIR = os.environ.get(
    "CREDENTIALS_DIR",
    'c:\\Users\\3leso\\Documents\\Elena\\Uni\\MasterThesis',
)
# Guard against appending the same entry again when the cell is re-run.
if CREDENTIALS_DIR not in sys.path:
    sys.path.append(CREDENTIALS_DIR)
from credentials import CLIENT_ID, CLIENT_SECRET, USER_AGENT

In [32]:
# Load the full author table from CSV and display it.
# The rendered frame has the columns "Unnamed: 0", "author" and "count".
users_all = pd.read_csv("output/users_all.csv")
users_all

Unnamed: 0,author,count
0,Qu1nlan,137739
1,english06,121853
2,therealdanhill,88413
3,Anxa,58476
4,lotrouble,33609
...,...,...
804509,fasulo_,1
804510,fastwall,1
804511,LastWalker,1
804512,Last_Account_Ever,1


In [33]:
# Load the authors whose Reddit activity will be pulled and display the frame.
# The "author" column of this frame is what is handed to process_users further down.
users_to_pull = pd.read_csv("output/users_to_pull.csv")
users_to_pull

Unnamed: 0,author,count
0,Qu1nlan,137739
1,english06,121853
2,therealdanhill,88413
3,Anxa,58438
4,lotrouble,33609
...,...,...
576,ted5011c,30
577,Poop__Pirates,30
578,BeKindToEachOther6,30
579,UnpopularUrsula,30


***
***
Do You Need to Handle Rate Limits When Using PRAW?

No, you don't need to manually handle rate limits when using PRAW. PRAW automatically respects Reddit's API rate limits and will pause or retry requests as needed. However, understanding how PRAW manages rate limits and how Reddit enforces them is important, especially for large-scale data collection like your case with 776,000 users.
How PRAW Handles Rate Limits

    Automatic Handling:

        PRAW automatically waits and retries when it encounters rate limit errors (e.g., "You're doing that too much. Try again in X seconds").

        You can configure the ratelimit_seconds parameter to set the maximum time PRAW will wait before raising an exception. For example:

        ```python
        reddit = praw.Reddit(client_id="YOUR_CLIENT_ID",
                             client_secret="YOUR_CLIENT_SECRET",
                             user_agent="YOUR_USER_AGENT",
                             ratelimit_seconds=300)
        ```

    Batch Requests:

        PRAW often bundles multiple objects (e.g., submissions or comments) into a single request, which helps optimize API usage.

    Rate Limit Information:

        You can access rate limit details via reddit.auth.limits, which provides information such as remaining requests and reset timestamps.

Reddit's API Rate Limits

    Authenticated Requests (OAuth):

        100 requests per minute per OAuth client ID.

        Averaged over a 10-minute window, allowing bursts of up to 1,000 requests in 10 minutes.

Unauthenticated Requests:

    Limited to 10 requests per minute.

Special Rate Limits:

    Reddit may enforce additional limits for certain actions (e.g., commenting, banning users), which are not documented but are handled by PRAW.

Handling 776,000 Users

Given the scale of your task, here’s how you can efficiently collect data while staying within rate limits:
Steps to Optimize Your Workflow

    Use OAuth Authentication:

        Ensure your app is authenticated with OAuth to get the higher rate limit (100 requests/minute).

    Track Progress:

        Use a counter to keep track of processed users and log progress periodically.

    Parallel Processing:

        If possible, split the task across multiple machines or threads using different OAuth client IDs to increase throughput.

    Pause on Rate Limits:

        Let PRAW handle rate limits automatically, but monitor reddit.auth.limits for real-time feedback on remaining requests.

    Retry Logic:

        Implement retry logic with exponential backoff if you encounter API errors or unexpected delays.

Example Code for Large-Scale Data Collection

Here’s a simplified example of how you might process users while respecting rate limits:


***
***

In [36]:


# Authenticate with Reddit API
def authenticate():
    """Create the synchronous PRAW client.

    Uses the first entry of CLIENT_ID, CLIENT_SECRET and USER_AGENT and sets
    ``ratelimit_seconds`` to 300.

    Returns:
        The ``praw.Reddit`` instance.
    """
    settings = {
        "client_id": CLIENT_ID[0],
        "client_secret": CLIENT_SECRET[0],
        "user_agent": USER_AGENT[0],
        "ratelimit_seconds": 300,
    }
    return praw.Reddit(**settings)

# Fetch subreddits for a user
def fetch_user_subreddits(username, reddit):
    """Count how often a user posted or commented in each subreddit.

    Args:
        username: Name of the Reddit account to look up.
        reddit: Client exposing ``redditor(username)``; the returned object's
            ``submissions.new(limit=None)`` and ``comments.new(limit=None)``
            listings are iterated in that order.

    Returns:
        dict mapping subreddit display name to the combined number of
        submissions and comments. An empty dict is returned when any error
        occurs, so the result is always a dict.
    """
    subreddit_counts = {}
    try:
        redditor = reddit.redditor(username)

        # Submissions first, then comments; both feed the same tally.
        for listing_name in ("submissions", "comments"):
            listing = getattr(redditor, listing_name)
            for item in listing.new(limit=None):
                subreddit_name = item.subreddit.display_name
                subreddit_counts[subreddit_name] = subreddit_counts.get(subreddit_name, 0) + 1

    except Exception as e:
        # Best-effort: a failing account (e.g. the 403/404 responses seen in the
        # run output) must not abort the whole crawl.
        print(f"Error fetching data for user {username}: {e}")
        # Return an empty dict rather than a list so callers always get one type.
        return {}

    return subreddit_counts




# Process users in batches
def process_users(user_list, reddit, output_file):
    """Fetch subreddit counts for every user, checkpointing to JSON after each one.

    Args:
        user_list: Sized iterable of usernames (``len()`` is used for the
            progress message).
        reddit: Client object passed through to ``fetch_user_subreddits``.
        output_file: Path of the JSON file that receives the accumulated results.

    Returns:
        dict mapping each username to the value returned by
        ``fetch_user_subreddits`` for that user.
    """
    import os  # local import: only needed for the checkpoint handling below

    # Make sure the destination directory exists before the first checkpoint.
    os.makedirs(os.path.dirname(output_file) or '.', exist_ok=True)
    checkpoint_tmp = f"{output_file}.tmp"

    user_dict = {}
    total = len(user_list)

    for processed_count, username in enumerate(user_list, start=1):
        user_dict[username] = fetch_user_subreddits(username, reddit)

        # Checkpoint after every user. Write to a temporary file and swap it in
        # atomically so an interruption mid-write cannot truncate the results
        # collected so far.
        with open(checkpoint_tmp, 'w') as f:
            json.dump(user_dict, f)
        os.replace(checkpoint_tmp, output_file)

        # Log progress
        print(f"Processed {processed_count}/{total} users.")

    return user_dict


In [37]:

# Main execution
if __name__ == "__main__":
    # Build the synchronous client, then crawl every author in users_to_pull.
    reddit = authenticate()
    process_users(
        users_to_pull['author'],
        reddit,
        output_file='output/user_data_top100.json',
    )

Processed 1/581 users.
Processed 2/581 users.
Processed 3/581 users.
Processed 4/581 users.
Error fetching data for user lotrouble: received 403 HTTP response
Processed 5/581 users.
Processed 6/581 users.
Processed 7/581 users.
Processed 8/581 users.
Processed 9/581 users.


  if {"kind", "data"}.issubset(data) and data["kind"] in self.parsers:
Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0x0000025C3D77E150>
Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0x0000025C43B04980>


Processed 10/581 users.
Processed 11/581 users.
Processed 12/581 users.
Processed 13/581 users.
Processed 14/581 users.
Processed 15/581 users.
Processed 16/581 users.
Processed 17/581 users.
Processed 18/581 users.
Processed 19/581 users.
Error fetching data for user dubalubdub: received 404 HTTP response
Processed 20/581 users.
Processed 21/581 users.
Error fetching data for user Enjolras55: received 403 HTTP response
Processed 22/581 users.
Processed 23/581 users.
Processed 24/581 users.
Processed 25/581 users.
Processed 26/581 users.
Processed 27/581 users.
Processed 28/581 users.
Processed 29/581 users.
Processed 30/581 users.
Processed 31/581 users.
Processed 32/581 users.
Processed 33/581 users.
Processed 34/581 users.
Processed 35/581 users.
Processed 36/581 users.
Processed 37/581 users.
Processed 38/581 users.
Processed 39/581 users.
Processed 40/581 users.
Processed 41/581 users.
Error fetching data for user Gnome_Sane: received 403 HTTP response
Processed 42/581 users.
Proc

In [28]:

# Reload previously saved results from disk into user_dict.
# NOTE(review): this reads output/user_data.json, the path written by the async
# process_users; the synchronous run in this notebook writes
# output/user_data_top100.json instead - confirm which file is intended here.
with open('output/user_data.json','r') as f:
    user_dict = json.load(f)

In [29]:
user_dict

{'Qu1nlan': {'IAmA': 78,
  'TwoXChromosomes': 71,
  'politics': 1437,
  'ainbow': 5,
  'LGBTnews': 4,
  'LegendaryTales': 4,
  'starwarsblackseries': 66,
  'TrueSTL': 35,
  'KingdomHearts': 9,
  'transgender': 4,
  'rootgame': 5,
  '2meirl4meirl': 14,
  'zelda': 1,
  'Libraries': 2,
  'bannedbooks': 2,
  'Fremont': 57,
  'Journalism': 2,
  'lgbt': 3,
  'Muppets': 4,
  'GooglePixel': 3,
  'missoula': 3,
  'MontanaPolitics': 2,
  'illinois': 4,
  'eeaao': 7,
  'BlackPeopleTwitter': 1,
  'duolingo': 16,
  'StarWars': 29,
  'BestOfReports': 1,
  '2meirl42meirl4meirl': 14,
  'Pennsylvania': 3,
  'IowaPolitics': 1,
  'Iowa': 1,
  'itookapicture': 3,
  'andymewba': 1,
  'massachusetts': 3,
  'KeepOurNetFree': 1,
  'netneutrality': 1,
  'inthenews': 1,
  'SandersForPresident': 2,
  'CBD': 1,
  'newyorkcity': 1,
  'assassinscreed': 15,
  'rtms': 1,
  'IndianCountry': 2,
  'ContraPoints': 1,
  'PS4Deals': 8,
  'sanfrancisco': 4,
  'SmashBrosUltimate': 1,
  'TrollCoping': 4,
  'PokemonGoSnap': 1,

***
***

Key Considerations

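    Time Estimate: At 100 requests/minute, processing 776,000 users would take approximately 129 hours (if each user requires one request). Note that the code in this notebook pages through both the submissions and the comments of every user with limit=None, so each user costs at least two requests and the real runtime will be a multiple of this estimate. Parallelization can reduce this significantly.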

    Ethical Compliance: Ensure you're collecting only publicly available data and adhering to Reddit's API terms of use.

    Monitoring: Use logging or monitoring tools to track progress and detect issues during long-running tasks.

PRAW's automatic rate limit handling makes it well-suited for large-scale data collection tasks like yours!

***
***

In [18]:
!pip install nest_asyncio



In [23]:
import asyncpraw
import asyncio
import json
from collections import defaultdict
from pathlib import Path
import nest_asyncio

# ========== Async Authentication ==========
def authenticate_async():
    """Create the Async PRAW client.

    Uses the first entry of CLIENT_ID, CLIENT_SECRET and USER_AGENT.

    Returns:
        The ``asyncpraw.Reddit`` instance.
    """
    credentials = {
        "client_id": CLIENT_ID[0],
        "client_secret": CLIENT_SECRET[0],
        "user_agent": USER_AGENT[0],
    }
    return asyncpraw.Reddit(**credentials)



# ========== Fetch a user's subreddit activity ==========
async def fetch_user_subreddits(username, reddit):
    """Tally a user's submissions and comments per subreddit (async variant).

    Args:
        username: Name of the Reddit account to look up.
        reddit: Client whose awaitable ``redditor(username)`` yields an object
            with ``submissions.new`` and ``comments.new`` async listings.

    Returns:
        Tuple ``(username, counts)`` where ``counts`` maps subreddit display
        name to the number of items seen. If an error occurs, it is printed and
        whatever was counted up to that point is returned.
    """
    tally = {}
    try:
        account = await reddit.redditor(username)

        # Walk submissions first, then comments, adding both to the same tally.
        for listing_name in ("submissions", "comments"):
            listing = getattr(account, listing_name)
            async for entry in listing.new(limit=None):
                label = entry.subreddit.display_name
                tally[label] = tally.get(label, 0) + 1

    except Exception as e:
        print(f"Error fetching data for user {username}: {e}")

    return username, tally



# ========== Process users concurrently ==========
async def process_users(user_list, reddit, max_concurrent=20, output_file='output/user_data.json'):
    """Fetch subreddit counts for many users concurrently, checkpointing to JSON.

    Args:
        user_list: Sized iterable of usernames. Pass a sequence or a single
            DataFrame column (e.g. ``df['author']``); iterating a whole pandas
            DataFrame yields its column labels, not its rows.
        reddit: Client object passed through to ``fetch_user_subreddits``.
        max_concurrent: Maximum number of users fetched at the same time.
        output_file: Path of the JSON file that receives the accumulated
            results. Defaults to the previously hard-coded location.

    Returns:
        dict mapping each username to its subreddit counts, filled in
        completion order.
    """
    output_path = Path(output_file)
    # Make sure the destination directory exists before the first checkpoint.
    output_path.parent.mkdir(parents=True, exist_ok=True)
    tmp_path = output_path.with_name(output_path.name + '.tmp')

    semaphore = asyncio.Semaphore(max_concurrent)
    user_dict = {}
    total = len(user_list)

    async def limited_fetch(username):
        # The semaphore caps how many fetches are in flight at once.
        async with semaphore:
            return await fetch_user_subreddits(username, reddit)

    tasks = [limited_fetch(user) for user in user_list]
    for i, task in enumerate(asyncio.as_completed(tasks), 1):
        username, counts = await task
        user_dict[username] = counts

        # Save partial results. Write to a temporary file and swap it in so an
        # interruption mid-write cannot truncate the results collected so far.
        with open(tmp_path, 'w') as f:
            json.dump(user_dict, f)
        tmp_path.replace(output_path)

        print(f"Processed {i}/{total} users.")

    return user_dict

# ========== Main async runner ==========
# def run_async_main(user_list):
#     reddit = authenticate_async()
#     Path('output').mkdir(exist_ok=True)
#     return asyncio.run(process_users(user_list, reddit))


In [24]:
nest_asyncio.apply()
reddit = authenticate_async()
results = await process_users(users_to_pull, reddit)

Error fetching data for user author: received 403 HTTP response
Processed 1/10000 users.


Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0x0000025C4386BFE0>


Processed 2/10000 users.


In [25]:
results

{'author': {},
 'count': {'hockey': 118,
  'EdmontonOilers': 184,
  'Outdoors': 1,
  'CanadianPL': 2,
  'nba': 1,
  'AskWomen': 1,
  'CFL': 6,
  'buffalobills': 4,
  'Edmonton': 4,
  'MLS': 2,
  'RogueCompany': 1,
  'RocketLeagueExchange': 3,
  'nfl': 4,
  'Chargers': 4,
  'CalgaryFlames': 1,
  'fuckcars': 1,
  'alberta': 1,
  'TheYouShow': 18,
  'distantsocializing': 30,
  'TheArtistStudio': 11,
  'TheGamerLounge': 2,
  'AnimalsOnReddit': 6,
  'RedditSessions': 3,
  'whereintheworld': 3}}