In [3]:
import requests
from urllib.parse import urlparse, parse_qs
import xml.etree.ElementTree as ET
from collections import defaultdict

def fetch_sitemap(url):
    """Fetch the sitemap XML content from the specified URL."""
    response = requests.get(url)
    response.raise_for_status()  # Raise an exception for HTTP errors
    return response.text

def parse_sitemap(xml_content):
    """Parse the sitemap XML and extract URLs."""
    namespace = {'sitemap': 'http://www.sitemaps.org/schemas/sitemap/0.9'}
    root = ET.fromstring(xml_content)
    urls = [url.find('sitemap:loc', namespace).text for url in root.findall('sitemap:url', namespace)]
    return urls

def categorize_urls(urls):
    """Break down URLs into categories and subcategories."""
    categories = defaultdict(lambda: defaultdict(set))
    for url in urls:
        parsed_url = urlparse(url)
        path_segments = parsed_url.path.strip('/').split('/')
        if len(path_segments) > 1:
            category, subcategories = path_segments[0], path_segments[1:]
            categories[category][subcategories[0]].add(url)
        elif path_segments:
            categories[path_segments[0]][''].add(url)
    return categories

def main(sitemap_url):
    xml_content = fetch_sitemap(sitemap_url)
    urls = parse_sitemap(xml_content)
    categories = categorize_urls(urls)
    for category, subcategories in categories.items():
        print(f"Category: {category}")
        for subcategory, urls in subcategories.items():
            print(f"  Subcategory: {subcategory} - URLs: {len(urls)}")
            for url in urls:
                print(f"    {url}")

sitemap_url = 'https://www.mnemonic.io/sitemap.xml'  
main(sitemap_url)


Category: company
  Subcategory: whats-new - URLs: 76
    https://www.mnemonic.io/company/whats-new/2021/argus-for-mobile-is-now-available/
    https://www.mnemonic.io/company/whats-new/2017/mnemonic-joins-the-norwegian-national-security-authoritys-quality-scheme-for-incident-response/
    https://www.mnemonic.io/company/whats-new/2019/mnemonic-joins-the-board-of-the-cloud-security-alliance/
    https://www.mnemonic.io/company/whats-new/2018/ferd-invests-in-mnemonic/
    https://www.mnemonic.io/company/whats-new/2018/mnemonic-named-one-of-europes-best-workplaces/
    https://www.mnemonic.io/company/whats-new/2021/mnemonic-joins-the-dutch-cybersecurity-interest-group-cyberveilig-nederland/
    https://www.mnemonic.io/company/whats-new/2017/mnemonic-listed-as-notable-threat-intelligence-service-provider-in-gartner-report/
    https://www.mnemonic.io/company/whats-new/2018/
    https://www.mnemonic.io/company/whats-new/2020/information-on-covid-19/
    https://www.mnemonic.io/company/what