In [9]:
import os
import re
import requests
from bs4 import BeautifulSoup, NavigableString, Tag
from urllib.parse import urljoin
from tqdm import tqdm

In [2]:
# Community Guidelines page that this notebook fetches and parses.
URL = 'https://www.youtube.com/intl/ALL/howyoutubeworks/policies/community-guidelines/'

In [3]:
def get_topic_html(base_url: str, timeout: float = 30.0) -> str:
    """Fetch a page over HTTP and return its body decoded as UTF-8.

    Args:
        base_url: URL to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` may block indefinitely on an unresponsive
            host, which would hang the notebook.

    Returns:
        The response body as text.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    response = requests.get(base_url, timeout=timeout)
    # Force UTF-8 instead of relying on the charset derived from the headers.
    response.encoding = 'utf-8'
    response.raise_for_status()
    return response.text

# Fetch the page once (network request) so the raw HTML can be inspected below.
test1 = get_topic_html(URL)

In [4]:
# Show only the beginning of the page; displaying the full HTML floods the cell output.
test1[:500]



In [5]:
def parse_community_guidelines(html, base_domain="https://www.youtube.com"):
    """Extract topic sections and their policy links from the guidelines page.

    Args:
        html: HTML source of the page.
        base_domain: Base URL used to resolve relative ``href`` values.

    Returns:
        A list of ``{"title": str, "links": [{"name": str, "url": str}, ...]}``
        dicts in document order. Titles are not de-duplicated here; a section
        without a title ``<span>`` is skipped, and a section without a ``<ul>``
        yields an empty ``links`` list.
    """
    soup = BeautifulSoup(html, "html.parser")

    # Passing a list to class_ matches divs that carry either class.
    resource_sections = soup.find_all("div", class_=["ytr-resources", "ytr-resources-grid"])
    topic_sections = []

    for resource in resource_sections:
        cells = resource.find_all("div", class_="ytr-resources-grid__cell")
        if not cells:
            # No grid cells inside: treat the container itself as one section.
            cells = [resource]

        for section in cells:
            title_tag = section.find("span", class_="lb-font-weight-600 lb-font-subhead")
            if not title_tag:
                continue
            title = title_tag.get_text(strip=True)

            links = []
            ul_tag = section.find("ul")
            if ul_tag:
                for li in ul_tag.find_all("li", class_="lb-font-paragraph"):
                    a_tag = li.find("a", class_="ytr-link")
                    if not a_tag:
                        continue
                    href = a_tag.get("href")
                    if not href:
                        # urljoin(base, None) returns the base itself, which
                        # would record a bogus link to the site root.
                        continue
                    links.append({
                        "name": a_tag.get_text(strip=True),
                        "url": urljoin(base_domain, href),
                    })

            topic_sections.append({
                "title": title,
                "links": links
            })

    return topic_sections

In [6]:
# Fetch the guidelines page and extract every topic section with its links.
html = get_topic_html("https://www.youtube.com/intl/ALL/howyoutubeworks/policies/community-guidelines/#community-guidelines")
topics = parse_community_guidelines(html)

# Keep only the first section seen for each title, preserving page order.
unique_titles = set()
unique_topic_sections = []

for topic in topics:
    if topic['title'] in unique_titles:
        continue
    unique_titles.add(topic['title'])
    unique_topic_sections.append(topic)

# One header line per topic followed by one "name -> url" line per link.
for topic in unique_topic_sections:
    report_lines = [f"--- {topic['title']}"]
    report_lines.extend(f"{link['name']} -> {link['url']}" for link in topic['links'])
    print("\n".join(report_lines))

--- Spam & deceptive practices
Fake engagement -> https://support.google.com/youtube/answer/3399767?hl=en&ref_topic=9282365
Impersonation -> https://support.google.com/youtube/answer/2801947?hl=en&ref_topic=9282365
External links -> https://support.google.com/youtube/answer/9054257?hl=en&ref_topic=9282365
Spam, deceptive practices & scams -> https://support.google.com/youtube/answer/2801973?hl=en&ref_topic=9282365
Playlists -> https://support.google.com/youtube/answer/9713446?hl=en&ref_topic=9282365
Additional policies -> https://support.google.com/youtube/answer/2801981?hl=en&ref_topic=9282365
--- Sensitive content
Child safety -> https://support.google.com/youtube/answer/2801999?hl=en&ref_topic=9282679
Thumbnails -> https://support.google.com/youtube/answer/9229980?hl=en&ref_topic=9282679
Nudity and sexual content -> https://support.google.com/youtube/answer/2802002?hl=en&ref_topic=9282679
Suicide and self-harm -> https://support.google.com/youtube/answer/2802245?hl=en&ref_topic=9282

In [7]:
# Number of topic sections left after de-duplicating by title.
len(unique_topic_sections)

7

In [11]:
def sanitize_filename(name):
    """Return ``name`` made safe for use as a file or folder name.

    Each of the characters backslash, slash, colon, asterisk, question mark,
    double quote, less-than, greater-than and pipe is replaced by an
    underscore; leading and trailing whitespace is then removed.
    """
    reserved = '\\/:*?"<>|'
    replacements = str.maketrans(reserved, "_" * len(reserved))
    return name.translate(replacements).strip()

def save_markdown(folder, filename, content):
    """Write ``content`` to ``<folder>/<filename>.md`` as UTF-8 text.

    The folder (including missing parents) is created when it does not exist,
    and an existing file with the same name is overwritten.
    """
    os.makedirs(folder, exist_ok=True)
    target_path = os.path.join(folder, "{}.md".format(filename))
    with open(target_path, mode="w", encoding="utf-8") as handle:
        handle.write(content)

def get_text_with_links(tag):
    """Render the inline content of a BeautifulSoup tag as Markdown text.

    Links become ``[text](href)`` (``#`` when ``href`` is missing), ``<strong>``
    becomes ``**text**`` and ``<br>`` becomes a newline. Every text fragment is
    stripped and the non-empty fragments are joined with single spaces.

    The tree is walked child by child rather than through ``tag.descendants``:
    ``descendants`` yields an ``<a>``/``<strong>`` element and then its own
    text nodes again, which made link and bold text appear twice in the output.

    Args:
        tag: The element whose content should be rendered.

    Returns:
        The rendered text; an empty string when there is nothing to render.
    """
    fragments = []

    for child in tag.children:
        if isinstance(child, NavigableString):
            fragments.append(str(child).strip())
        elif isinstance(child, Tag):
            if child.name == "a":
                href = child.get("href", "#")
                text = child.get_text(strip=True)
                # The link text is emitted here, so do not descend into it.
                fragments.append(f"[{text}]({href})")
            elif child.name == "strong":
                # Recurse so that links nested inside bold text are kept.
                inner = get_text_with_links(child)
                if inner:
                    fragments.append(f"**{inner}**")
            elif child.name == "br":
                fragments.append("\n")
            else:
                # Any other wrapper (span, em, nested list, ...) contributes
                # whatever its own children render to.
                fragments.append(get_text_with_links(child))

    return " ".join(filter(None, fragments))


def download_article(url, timeout=30.0):
    """Download a page and convert its main content block to Markdown.

    Only the direct children of the first ``<div class="cc">`` are converted.
    Headings ``h1``-``h4``, paragraphs, ordered and unordered lists are
    rendered; ``div`` children are processed recursively; any other element is
    ignored.

    Args:
        url: Address of the article to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` may block indefinitely on an unresponsive
            host.

    Returns:
        The Markdown text with blocks separated by blank lines, or the literal
        string ``"Not found"`` when the page has no ``div.cc`` element.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    content_div = soup.find("div", class_="cc")
    if not content_div:
        return "Not found"

    def parse_element(el):
        """Return the Markdown for one element, or None when it is not handled."""
        if el.name in ["h1", "h2", "h3", "h4"]:
            # The digit in the tag name is the heading depth.
            level = int(el.name[1])
            text = get_text_with_links(el)
            return f"{'#' * level} {text}"
        elif el.name == "p":
            return get_text_with_links(el)
        elif el.name == "ul":
            return "\n".join(f"- {get_text_with_links(li)}" for li in el.find_all("li", recursive=False))
        elif el.name == "ol":
            return "\n".join(f"{i+1}. {get_text_with_links(li)}" for i, li in enumerate(el.find_all("li", recursive=False)))
        elif el.name in ["div"]:
            # Wrapper div: convert its direct children and drop the empty ones.
            children = [parse_element(child) for child in el.find_all(recursive=False)]
            return "\n\n".join(filter(None, children))
        return None

    blocks = (parse_element(el) for el in content_div.find_all(recursive=False))
    return "\n\n".join(block for block in blocks if block)


def crawl_and_save(base_url, save_root="backend/app/data"):
    """Crawl every topic linked from ``base_url`` and store each article as Markdown.

    Topics are de-duplicated by title (first occurrence wins). Each topic gets
    its own folder under ``save_root``; each linked article is written to a
    ``.md`` file named after the link text with spaces replaced by underscores.
    A failure on one article is printed and the crawl moves on to the next.
    """
    print(f"Start crawling: {base_url}")
    page_html = get_topic_html(base_url)

    # dicts keep insertion order, so setdefault retains the first section per title.
    first_by_title = {}
    for section in parse_community_guidelines(page_html):
        first_by_title.setdefault(section['title'], section)

    for topic in first_by_title.values():
        title = topic["title"]
        links = topic["links"]
        topic_folder = os.path.join(save_root, sanitize_filename(title))
        print(f"\n--- Topic: {topic['title']} ({len(topic['links'])} links) ---")

        for link_info in tqdm(links, desc=title):
            try:
                markdown = download_article(link_info["url"])
                stem = sanitize_filename(link_info["name"]).replace(" ", "_")
                save_markdown(topic_folder, stem, markdown)
            except Exception as e:
                # Best effort: report the failure and continue with the next link.
                print(f"Error downloading {link_info['url']}: {e}")

In [12]:
if __name__ == "__main__":
    # Crawl all topics from the Community Guidelines page into the default folder.
    url = "https://www.youtube.com/intl/ALL/howyoutubeworks/policies/community-guidelines/#community-guidelines"
    crawl_and_save(url)

Start crawling: https://www.youtube.com/intl/ALL/howyoutubeworks/policies/community-guidelines/#community-guidelines

--- Topic: Spam & deceptive practices (6 links) ---


Spam & deceptive practices: 100%|██████████| 6/6 [00:11<00:00,  1.90s/it]



--- Topic: Sensitive content (5 links) ---


Sensitive content: 100%|██████████| 5/5 [00:08<00:00,  1.65s/it]



--- Topic: Violent or dangerous content (5 links) ---


Violent or dangerous content: 100%|██████████| 5/5 [00:09<00:00,  1.91s/it]



--- Topic: Regulated goods (2 links) ---


Regulated goods: 100%|██████████| 2/2 [00:03<00:00,  1.68s/it]



--- Topic: Misinformation (3 links) ---


Misinformation: 100%|██████████| 3/3 [00:06<00:00,  2.03s/it]



--- Topic: Educational, Documentary, Scientific, and Artistic (EDSA) content (1 links) ---


Educational, Documentary, Scientific, and Artistic (EDSA) content: 100%|██████████| 1/1 [00:02<00:00,  2.11s/it]



--- Topic: Resources (2 links) ---


Resources: 100%|██████████| 2/2 [00:02<00:00,  1.20s/it]
