# Gotta Catch 'Em All

In [None]:
import requests
from bs4 import BeautifulSoup
import yaml
import time
from tqdm import tqdm

# --- Scraper configuration ---
PAGES_TO_SCRAPE = 800
DIVS_PER_PAGE = 250   # rows assumed per listing page; used to derive number_count
SLEEP_SECONDS = 0     # pause between pages
REQUEST_TIMEOUT = 30  # seconds; without a timeout a stalled connection blocks the loop forever
now = time.strftime("%Y%m%d_%H%M%S")
FILENAME = f"subreddits_all_{now}.yaml"

# Loop-invariant request headers, built once instead of on every page.
headers = {'User-Agent': 'Mozilla/5.0'}

# Class strings used to locate the elements of one listing row in the page markup.
ROW_CLASS = "flex flex-wrap justify-center py-[0.75rem]"
RANK_CLASS = "flex flex-col font-bold justify-center items-center text-12 w-2xl m-0 truncate"
NAME_CLASS = "m-0 font-bold text-12 text-current truncate max-w-[11rem]"

# Pages that could not be fetched or yielded no rows, so gaps in the dump are visible.
failed_pages = []

for page in tqdm(range(1, PAGES_TO_SCRAPE+1)):
    url = f"https://www.reddit.com/best/communities/{page}/"
    try:
        response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
        # An error status would otherwise be parsed as an empty page and silently lost.
        response.raise_for_status()
    except requests.RequestException as exc:
        tqdm.write(f"page {page}: request failed ({exc}); skipping")
        failed_pages.append(page)
        time.sleep(SLEEP_SECONDS)
        continue
    html = response.text

    soup = BeautifulSoup(html, "html.parser")

    items = []
    for idiv, div in enumerate(soup.find_all("div", class_=ROW_CLASS), start=1):
        # Rank label exactly as shown on the page (kept as text).
        number_tag = div.find("h6", class_=RANK_CLASS)
        number = number_tag.text.strip() if number_tag else None

        a_tag = div.find("a", class_=NAME_CLASS)
        subreddit = a_tag.text.strip() if a_tag else None

        # The description is taken from the second <h6> of the row, when there is one.
        desc_tags = div.find_all("h6")
        description = desc_tags[1].text.strip() if len(desc_tags) > 1 else None

        # Member count comes from the "number" attribute; the tag or the attribute may be absent.
        faceplate_tag = div.find("faceplate-number")
        raw_members = faceplate_tag.get("number") if faceplate_tag else None
        members = int(raw_members) if raw_members is not None else None

        items.append({
            "number_read": number,
            # Position computed from page and row index, independent of the label read above.
            "number_count": (page-1)*DIVS_PER_PAGE + idiv,
            "subreddit": subreddit,
            "description": description,
            "members": members
        })

    if items:
        # Append one YAML document per page so progress survives an interrupted run.
        with open(FILENAME, "a", encoding="utf-8") as f:
            yaml.dump(items, f, allow_unicode=True, explicit_start=True)
    else:
        tqdm.write(f"page {page}: no rows found in the response")
        failed_pages.append(page)

    time.sleep(SLEEP_SECONDS)

if failed_pages:
    print(f"{len(failed_pages)} page(s) produced no data: {failed_pages}")

 80%|████████  | 804/1000 [41:50<54:14, 16.61s/it]  

In [1]:
import yaml

# Dump produced by an earlier scraping run.
filename = "subreddits_all_20250607_170000.yaml"

# The dump holds one YAML document per scraped page; flatten every non-empty
# document into a single list of subreddit entries.
with open(filename, "r", encoding="utf-8") as f:
    all_items = [
        entry
        for page_doc in yaml.safe_load_all(f)
        if page_doc
        for entry in page_doc
    ]

# Sanity check: total count plus the tail of the list.
print(f"Loaded {len(all_items)} subreddits")
print(all_items[-3:])  # Show last 3 items as a sample

Loaded 144163 subreddits
[{'description': 'Welcome to Ayahuasca_Circle_US', 'members': 116, 'number_count': 178248, 'number_read': '178K+', 'subreddit': 'r/Ayahuasca_Circle_US'}, {'description': 'Loves to mix hunting and adventure!', 'members': 116, 'number_count': 178249, 'number_read': '178K+', 'subreddit': 'r/backpackhunting'}, {'description': '#aiparcatcaunbou acum pe Reddit .\nPorcǎim toți șoferii necivilizați și obraznici.\nHaideți să creștem împreună!', 'members': 116, 'number_count': 178250, 'number_read': '178K+', 'subreddit': 'r/BadParkingRomania'}]


In [15]:
# Walk the entries in order: report where r/italy sits, and stop at the first
# entry whose member count is below 500.
for position, entry in enumerate(all_items):
    is_italy = entry["subreddit"].lower() == "r/italy"
    if is_italy:
        print(position, entry)

    below_threshold = entry["members"] < 500
    if below_threshold:
        print(position, entry)
        break

# Size of the whole list when rendered as a string (shown as the cell output).
len(str(all_items))

760 {'description': 'Reddit Italy - Italia\n\nWelcome everyone! This is a place to post and discuss anything related to Italy. We also speak English!', 'members': 1092489, 'number_count': 761, 'number_read': '761', 'subreddit': 'r/italy'}


35540276

In [7]:
import pandas as pd
# Table of subreddits read from the local CSV; only its "name" column is used below.
ita_df = pd.read_csv("subreddits.csv")

# Lower-cased "r/<name>" form of every listed name, for comparison with scraped names.
ita_subreddits = ["r/" + name.lower() for name in ita_df["name"]]

ita_df

Unnamed: 0,name,tag,language,created,subscribers,reason,created_utc,description
0,Abruzzo,regione,ita,,799.0,,2013-04-08 23:06:11,
1,ACMilan,squadra,eng,,74504.0,,2010-04-08 19:11:21,#####Welcome to /r/ACMilan! Please be sure to ...
2,ADO_Italy,fanclub,ita,,345.0,,2024-10-10 17:43:20,"Un subreddit dedicato ad Ado, la talentuosa ca..."
3,AlfaRomeoGiulia,fanclub,eng,,2390.0,,2019-10-01 13:01:13,/r/AlfaRomeoGiulia is a place for Giulia owner...
4,AmericansInItaly,community,,,9537.0,,2021-08-25 00:24:41,
...,...,...,...,...,...,...,...,...
199,ViaFrancigena,fanclub,eng,,561.0,,2018-07-22 20:41:01,
200,ViaggiITA,community,ita,,45659.0,,2024-06-26 07:50:26,Il subreddit per Italiani per parlare di viagg...
201,wallstreetbetsitalia,community,ita,,510.0,,2021-01-28 00:23:57,Gruppo italiano di wallstreetbets
202,xxItaly,community,ita,,16920.0,,2019-06-08 06:55:37,**Benvenute e benvenuti alla community delle r...


In [8]:
# NOTE(review): `first_less` is never read in this cell; kept in case another cell uses it.
first_less = False

# Set copy of the known names: O(1) membership tests instead of scanning the list per entry.
known_subreddits = set(ita_subreddits)

# Print every scraped entry that looks Italy-related by keyword and is not already
# in the known list.
for item in all_items:
    # The scraper stores None when a tag was missing; treat that as empty text
    # instead of crashing on .lower().
    sub = item["subreddit"] or ""
    if not sub:
        continue  # an entry without a name cannot be listed or linked
    subl = sub.lower()
    descl = (item["description"] or "").lower()

    looks_italian = (
        "italy" in subl
        or "italy" in descl
        or "italia" in subl
        or "italia" in descl
        or "benvenut" in descl
        # Suffix checks are case-sensitive on purpose, so they use the original name.
        or sub.endswith(("ITA", "_IT"))
    )

    if looks_italian and subl not in known_subreddits:
        print(">>>", sub, item["members"], "https://www.reddit.com/" + sub)
        print(item["description"])
        print()

>>> r/2westerneurope4u 255451 https://www.reddit.com/r/2westerneurope4u
Ironic ultranationalistic memes about Western European countries (Netherlands, Luxembourg, Germany, France, Italy, Spain, Portugal, Greece, UK, Ireland, Denmark (incl. Greenland), Norway, Sweden, Finland, Iceland, Switzerland, Austria, Flanders & Wallonia) 

You will learn more about European culture here than anywhere else on Reddit.

>>> r/Crossdress_Expression 59340 https://www.reddit.com/r/Crossdress_Expression
Cross-dressing is, among other things, a form of artistic self-expression for many.  When we cross-dress, we aspire to be someone other than who we are in our normal lives and fantasy definitely plays a role for many of us as well.  This sub welcomes all your lovely photos that embody who you are when you are dressed.  We won't scrutinize you for using filters.  Disclose it if you wish.  I've labeled this NSFW but not x-rated.  No exposed genitalia or other nudity please.  Only tasteful pics.

>>> r/tour

In [9]:
import os
import openai
from dotenv import load_dotenv, find_dotenv

# Locate a .env file and load it, overriding variables already set in the environment.
dotenv_path = find_dotenv()
load_dotenv(dotenv_path, override=True)

print(openai.__version__)
openai.api_key = os.getenv('OPENAI_API_KEY')

# Fail with a clear message instead of a TypeError from slicing None.
if not openai.api_key:
    raise RuntimeError("OPENAI_API_KEY is not set; define it in the environment or in a .env file")

# Never echo any part of the key or an absolute local path: cell output is saved
# with the notebook and ends up wherever the notebook is shared.
env_source = os.path.basename(dotenv_path) if dotenv_path else "process environment"
print(f"Loaded OPENAI_API_KEY ({len(openai.api_key)} characters) from {env_source}")

1.75.0
Loaded OPENAI_API_KEY=<redacted> from Path <redacted>/.env


In [19]:
from openai import OpenAI 
from pydantic import BaseModel
client = OpenAI()

# Number of subreddits sent to the model per request.
n_include = 1000
# Only the first MAX_ITEMS entries are classified. With n_include = 1000 this covers
# the same batches as the former `if i > 70000: break` check (batch starts 0..71000).
# NOTE(review): the reason for this cut-off is not recorded here; confirm before changing it.
MAX_ITEMS = 72_000

# The literal previously opened with four quotes, which put a stray '"' at the
# start of the prompt; the text is otherwise unchanged.
system_content = f"""You are given a list of {n_include} reddit channels names and descriptions. 
Just output the names of the channels among these {n_include} that are related to Italy, Italian culture, or the Italian language. 
The names you output MUST BE in the user prompt, not invented or based on your knowledge.
If none are found, just output an empty list."""

# Structured-output schema: the model returns just a list of channel names.
class FoundChan(BaseModel):
    found_names: list[str]

gpt_found_list = []
for i in range(0, min(len(all_items), MAX_ITEMS), n_include):
    batch = all_items[i:i+n_include]
    # One "name|description" record per line. Descriptions can contain newlines,
    # which would split a record across lines, so their whitespace is collapsed first.
    user_content = "\n".join(
        f"{item['subreddit']}|{' '.join((item['description'] or '').split())}"
        for item in batch
    )
    completion = client.beta.chat.completions.parse(
        model="gpt-4.1-mini",
        messages=[
            {"role": "system", "content": system_content},
            {"role": "user", "content": user_content}
        ],
        response_format=FoundChan,
    )
    foundchani = completion.choices[0].message.parsed
    if foundchani is None:
        # No parsed payload came back (for example a refusal); report it and move on.
        print(f"{i}: <no parsed response>")
        continue
    gpt_found_list += foundchani.found_names
    print(f"{i}: {foundchani}")

0: found_names=['r/italy']
1000: found_names=['r/seriea']
2000: found_names=['r/Italy']
3000: found_names=['r/ItalianFood', 'r/mediterraneandiet']
4000: found_names=['r/napoli', 'r/rome', 'r/italianlearning', 'r/ItalyTravel', 'r/rome', 'r/Calcio', 'r/Ferrari']
5000: found_names=['r/askitaly', 'r/catalunya', 'r/italy', 'r/bologna', 'r/torino']
6000: found_names=['r/ACMilan']
7000: found_names=['r/Ducati', 'r/Juve', 'r/Venezia', 'r/sicily']
8000: found_names=['r/AlfaRomeo']
9000: found_names=['r/Italian', 'r/Relazioni', 'r/ViaggiITA', 'r/FCInterMilan']
10000: found_names=[]
11000: found_names=['r/neapolitanpizza']
12000: found_names=['r/ItalianGreyhounds', 'r/cafebrasil', 'r/VictoriaDeAngelis']
13000: found_names=['r/ItalyPhotos', 'r/chiliZ', 'r/callmebyyourname']
14000: found_names=['r/florence', 'r/ASRoma']
15000: found_names=['r/Italianscamads', 'r/sscnapoli', 'r/RomeTotalWar', 'r/PandabuyItalia']
16000: found_names=['r/SegheSuVIP_ITA']
17000: found_names=[]
18000: found_names=['r/jur

In [21]:
import yaml
import time
# Save the names collected from the model into a new, timestamped YAML file.
now = time.strftime("%Y%m%d_%H%M%S")
gptfound_path = f"subreddit_gptfound_{now}.yaml"
with open(gptfound_path, "w", encoding="utf-8") as out:
    yaml.dump(gpt_found_list, out, allow_unicode=True, explicit_start=True)

In [27]:
# Lower-cased names: the known list (from the CSV) and every scraped entry.
ita_subreddits = [f"r/{x.lower()}" for x in ita_df['name']]
# The scraper may have stored None as a name; map that to "" instead of crashing on .lower().
all_subreddits = [(item["subreddit"] or "").lower() for item in all_items]

# Lookup structures built once, replacing repeated linear scans over the full lists.
known_subreddits = set(ita_subreddits)
first_item_by_name = {}
for name, item in zip(all_subreddits, all_items):
    # setdefault keeps the first occurrence, matching what list.index() returned.
    first_item_by_name.setdefault(name, item)

# Order-preserving de-duplication of the model's answers.
gpt_found_list_unique = list(dict.fromkeys(gpt_found_list))

# Show each model-suggested subreddit that is not already known, together with
# its scraped record; names the model returned that were never scraped are flagged.
for rname in gpt_found_list_unique:
    key = rname.lower()
    if key in known_subreddits:
        continue
    print("https://www.reddit.com/" + rname, rname[2:], end=" ")
    scraped = first_item_by_name.get(key)
    if scraped is not None:
        print(scraped)
    else:
        print("<<<WARNING>>> Not found in all_items")

https://www.reddit.com/r/Ferrari Ferrari {'description': 'Subreddit dedicated to everything that comes out of Maranello, for both Scuderia Ferrari and the factory road cars. \n\nHigh resolution photos, news items, articles, motorsports, if it concerns Ferrari, it belongs here!', 'members': 129358, 'number_count': 4307, 'number_read': '4K+', 'subreddit': 'r/Ferrari'}
https://www.reddit.com/r/catalunya catalunya {'description': "La porta d'entrada a reddit en llengua catalana. Notícies, preguntes, debat i més sobre Catalunya, els Països Catalans i el món.", 'members': 104784, 'number_count': 5123, 'number_read': '5K+', 'subreddit': 'r/catalunya'}
https://www.reddit.com/r/Ducati Ducati {'description': 'Anything and everything about the bikes from Bologna.', 'members': 68148, 'number_count': 7106, 'number_read': '7K+', 'subreddit': 'r/Ducati'}
https://www.reddit.com/r/AlfaRomeo AlfaRomeo {'description': 'A subreddit for the real Alfisti.', 'members': 56159, 'number_count': 8161, 'number_re