# Gotta Catch 'Em All

In [None]:
import requests
from bs4 import BeautifulSoup
import yaml
import time
from tqdm import tqdm

# Scrape Reddit's paginated "best communities" listing and append the parsed
# entries of every page to a timestamped YAML file (one YAML document per page).
PAGES_TO_SCRAPE = 800
DIVS_PER_PAGE = 250          # entries assumed per page when computing number_count
SLEEP_SECONDS = 0            # pause between requests; raise this to be gentler on the server
REQUEST_TIMEOUT_SECONDS = 30  # without a timeout a stalled connection would block forever
now = time.strftime("%Y%m%d_%H%M%S")
FILENAME = f"subreddits_all_{now}.yaml"

# Selectors are exact class strings taken from the page markup; if the markup
# changes they simply stop matching and the affected fields come back as None.
CARD_CLASS = "flex flex-wrap justify-center py-[0.75rem]"
RANK_CLASS = "flex flex-col font-bold justify-center items-center text-12 w-2xl m-0 truncate"
NAME_CLASS = "m-0 font-bold text-12 text-current truncate max-w-[11rem]"

# Loop-invariant, so built once instead of on every iteration.
headers = {'User-Agent': 'Mozilla/5.0'}

for page in tqdm(range(1, PAGES_TO_SCRAPE+1)):
    url = f"https://www.reddit.com/best/communities/{page}/"
    try:
        response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT_SECONDS)
        # Fail loudly on 4xx/5xx instead of parsing an error page into an empty result.
        response.raise_for_status()
    except requests.RequestException as exc:
        # Best effort: report the failed page and carry on with the next one.
        tqdm.write(f"page {page}: request failed ({exc}); skipping")
        time.sleep(SLEEP_SECONDS)
        continue
    html = response.text

    soup = BeautifulSoup(html, "html.parser")

    items = []
    for idiv, div in enumerate(soup.find_all("div", class_=CARD_CLASS), start=1):
        # Rank label exactly as displayed on the page.
        number_tag = div.find("h6", class_=RANK_CLASS)
        number = number_tag.text.strip() if number_tag else None

        a_tag = div.find("a", class_=NAME_CLASS)
        subreddit = a_tag.text.strip() if a_tag else None

        # The description is taken from the second <h6> inside the card.
        desc_tags = div.find_all("h6")
        description = desc_tags[1].text.strip() if len(desc_tags) > 1 else None

        # Member count is read from the element's "number" attribute; guard
        # against the attribute being absent so int(None) cannot abort the run.
        faceplate_tag = div.find("faceplate-number")
        raw_members = faceplate_tag.get("number") if faceplate_tag else None
        members = int(raw_members) if raw_members is not None else None

        items.append({
            "number_read": number,
            # Running position derived from page and in-page index.
            "number_count": (page-1)*DIVS_PER_PAGE + idiv,
            "subreddit": subreddit,
            "description": description,
            "members": members
        })

    # Append after each page so an interrupted run keeps everything scraped so
    # far; explicit_start emits a '---' marker, making each page its own document.
    with open(FILENAME, "a", encoding="utf-8") as f:
        yaml.dump(items, f, allow_unicode=True, explicit_start=True)

    time.sleep(SLEEP_SECONDS)

 80%|████████  | 804/1000 [41:50<54:14, 16.61s/it]  

In [14]:
import yaml

filename = "subreddits_all_20250607_170000.yaml"

# The file is read as a stream of YAML documents. Each non-empty document is
# expected to be a list of entries; they are flattened into one list, and
# empty documents are skipped.
with open(filename, "r", encoding="utf-8") as f:
    all_items = [
        entry
        for page_doc in yaml.safe_load_all(f)
        if page_doc
        for entry in page_doc
    ]

# Quick sanity check: total count plus the last three entries as a sample.
print(f"Loaded {len(all_items)} subreddits")
print(all_items[-3:])

Loaded 144163 subreddits
[{'description': 'Welcome to Ayahuasca_Circle_US', 'members': 116, 'number_count': 178248, 'number_read': '178K+', 'subreddit': 'r/Ayahuasca_Circle_US'}, {'description': 'Loves to mix hunting and adventure!', 'members': 116, 'number_count': 178249, 'number_read': '178K+', 'subreddit': 'r/backpackhunting'}, {'description': '#aiparcatcaunbou acum pe Reddit .\nPorcǎim toți șoferii necivilizați și obraznici.\nHaideți să creștem împreună!', 'members': 116, 'number_count': 178250, 'number_read': '178K+', 'subreddit': 'r/BadParkingRomania'}]


In [19]:
import pandas as pd
# Load the reference table and derive a lowercase "r/<name>" label for every
# row of its "name" column, for comparison against scraped subreddit names.
ita_df = pd.read_csv("subreddits.csv")
ita_subreddits = ["r/" + name.lower() for name in ita_df["name"]]
ita_df

Unnamed: 0,name,tag,language,created,subscribers,reason,created_utc,description
0,Abruzzo,regione,ita,,799.0,,2013-04-08 23:06:11,
1,AmericansInItaly,community,,,9537.0,,2021-08-25 00:24:41,
2,AndroidItalia,hobby,ita,,1166.0,,2016-10-24 08:55:40,Comunità aperta italiana dedicata al mondo And...
3,AnimeItaly,hobby,ita,,156260.0,,2016-09-28 08:40:31,"Benvenuto su /r/AnimeItaly , il nuovo subreddi..."
4,Aosta,provincia,ita,,16.0,,2021-11-29 08:25:18,aosta
...,...,...,...,...,...,...,...,...
132,Universitaly,community,,,111451.0,,2020-07-22 14:02:32,Questo è un subreddit dedicato agli universita...
133,Veneto,regione,ita,,1076.0,,2013-05-20 21:16:25,🇮🇹 Il subreddit del Veneto e delle loro città....
134,Venezia,provincia,ita,,67204.0,,2011-10-28 13:40:20,**Tutto quanto riguarda Venezia** **Everythin...
135,xxItaly,community,ita,,16920.0,,2019-06-08 06:55:37,**Benvenute e benvenuti alla community delle r...


In [25]:
# Print every scraped subreddit whose name hints at Italy but which is not yet
# present in the reference list (ita_subreddits), together with its member
# count, URL and description.
first_less=False  # not read in this cell; kept in case a later cell relies on it
known_ita = set(ita_subreddits)  # build once: O(1) lookups instead of a list scan per item
for item in all_items:
    sub = item["subreddit"]
    if not sub:
        # The name can be None when it could not be parsed; nothing to match.
        continue
    subl = sub.lower()

    # NOTE(review): the case-sensitive "ITA" test also matches unrelated names
    # such as r/AITAH; consider tightening it if those hits are unwanted.
    looks_italian = "italy" in subl or "italia" in subl or "ITA" in sub
    if looks_italian and subl not in known_ita:
        print(">>>", sub, item["members"], "https://www.reddit.com/" + sub)
        print(item["description"])
        print()

>>> r/AITAH 5878143 https://www.reddit.com/r/AITAH
This is a community like r/AmITheAsshole, except unlike that subreddit you can post interpersonal conflicts. Anything that's AITA including relationships, hypotheticals, even posting about Scar from the Lion King and trying to convince redditors that he was not the AH. AI scenarios/posts are not welcome or tolerated. An elaboration on our rules can be found here: https://www.reddit.com/r/AITAH/wiki/index

>>> r/AITA_WIBTA_PUBLIC 142077 https://www.reddit.com/r/AITA_WIBTA_PUBLIC
post your stories inquiring if you are or would be the asshole. the subject matter is not restricted, so you can post what you really want to talk about. Feel free to share your honest opinion in the comments, just be kind to each other... Are you the asshole?

>>> r/AITASims 76447 https://www.reddit.com/r/AITASims
r/AITASims is a community where Sims can go to share their stories and ask for advise, and most notably ask if they are the llama.

>>> r/ViaggiITA 4