In [25]:
import requests
from urllib.parse import urljoin

def find_sitemap(url, timeout=10):
  """Locate a website's sitemap via well-known paths and robots.txt.

  Candidates are tried in this order: ``/sitemap.xml``, ``/sitemap_index.xml``,
  then every ``Sitemap:`` entry found in the site's ``/robots.txt``. The first
  candidate that answers with HTTP 200 and an XML content type is returned.

  Args:
      url (str): The base URL of the website.
      timeout (float): Seconds to wait for each HTTP request before giving up.

  Returns:
      str: The URL of the sitemap if found, otherwise None.
  """

  # Common sitemap locations
  sitemap_urls = [
      urljoin(url, '/sitemap.xml'),
      urljoin(url, '/sitemap_index.xml'),
  ]

  # Check robots.txt for sitemap references
  robots_url = urljoin(url, '/robots.txt')
  try:
    response = requests.get(robots_url, timeout=timeout)
    if response.status_code == 200:
      for line in response.text.splitlines():
        # Split on the FIRST colon only: the value is itself a URL that
        # contains colons ("https://..."), so split(':')[1] would keep
        # nothing but the scheme.
        field, separator, value = line.partition(':')
        if not separator or field.strip().lower() != 'sitemap':
          continue
        value = value.strip()
        if not value:
          continue
        # Resolve against robots.txt in case the entry is not absolute
        candidate = urljoin(robots_url, value)
        if candidate not in sitemap_urls:
          sitemap_urls.append(candidate)
  except requests.exceptions.RequestException:
    pass  # robots.txt is optional; fall back to the common locations

  # NOTE: the former "Google search operators" probe was removed. It passed a
  # search query (not an HTTP URL) to requests.get and then called the
  # non-existent str.iterlinks(), so it could never contribute a candidate.

  # Check each potential sitemap URL
  for sitemap_url in sitemap_urls:
    try:
      response = requests.get(sitemap_url, timeout=timeout)
    except requests.exceptions.RequestException:
      continue  # Unreachable candidate; try the next one
    if response.status_code != 200:
      continue
    # The header may be absent or carry parameters ("text/xml; charset=UTF-8")
    media_type = response.headers.get('Content-Type', '').split(';')[0].strip().lower()
    if media_type in ('text/xml', 'application/xml'):
      return sitemap_url

  return None

# Example usage: probe one site and report the outcome
website_url = "https://www.artificialintelligence-news.com/"
sitemap_url = find_sitemap(website_url)

# find_sitemap returns None when no candidate matched, which is falsy here
print(f"Sitemap found at: {sitemap_url}" if sitemap_url else "Sitemap not found")


Sitemap found at: https://www.artificialintelligence-news.com/sitemap.xml


In [4]:
import os
import requests
from urllib.parse import urlparse
import xml.etree.ElementTree as ET  # For parsing XML content of sitemaps

# Headers for each request.
# Module-level dict holding a desktop-browser User-Agent string; download_sitemap
# reads it directly and also forwards it to check_if_xml.
headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
}

def find_potential_sitemap_urls(xml_content):
    """Extract URLs from sitemap XML that look like they point at further sitemaps.

    Every ``<loc>`` element in the sitemaps.org namespace is inspected, at any
    depth, so entries of both ``<sitemapindex>`` and ``<urlset>`` documents are
    considered. A URL is kept when it ends in ``.xml`` or contains the word
    ``sitemap`` (both checks are case-insensitive).

    Args:
        xml_content (str | bytes): The XML content of a sitemap.

    Returns:
        list: A list of potential sitemap URLs based on naming heuristics,
        in document order and with surrounding whitespace removed.

    Raises:
        xml.etree.ElementTree.ParseError: If ``xml_content`` is not well-formed XML.
    """
    root = ET.fromstring(xml_content)
    namespace = {'sitemap': 'http://www.sitemaps.org/schemas/sitemap/0.9'}

    likely_xml_urls = []
    # 'sitemap:' below is the namespace prefix, not a parent element: this
    # XPath matches <loc> anywhere in the document.
    for loc in root.iterfind('.//sitemap:loc', namespaces=namespace):
        # An empty <loc/> has text None; pretty-printed files pad the URL
        # with newlines and spaces.
        candidate = (loc.text or '').strip()
        if not candidate:
            continue
        lowered = candidate.lower()
        if lowered.endswith('.xml') or 'sitemap' in lowered:
            likely_xml_urls.append(candidate)

    return likely_xml_urls

def check_if_xml(urls, headers, timeout=10):
    """Check which of the given URLs are served as XML by making network requests.

    A URL is accepted when the server answers with HTTP 200 and a content type
    of ``text/xml`` or ``application/xml`` (parameters such as ``charset`` are
    ignored). URLs that cannot be fetched are reported and skipped.

    Args:
        urls (list): A list of URLs to check.
        headers (dict): Request headers.
        timeout (float): Seconds to wait for each request before giving up.

    Returns:
        list: The subset of ``urls`` confirmed to be XML, in input order.
    """
    xml_urls = []
    for url in urls:
        try:
            # stream=True: only the status line and headers are needed, so the
            # body is not downloaded; the with-block releases the connection.
            with requests.get(url, headers=headers, timeout=timeout, stream=True) as response:
                status_code = response.status_code
                content_type = response.headers.get('content-type', '')
        except requests.exceptions.RequestException as e:
            print(f"Error fetching URL {url}: {e}")
            continue

        # Drop parameters like "; charset=UTF-8" before comparing
        media_type = content_type.split(';')[0].strip().lower()
        if status_code == 200 and media_type in ('text/xml', 'application/xml'):
            xml_urls.append(url)

    return xml_urls

def download_sitemap(url, save_dir, downloaded_urls, timeout=10):
    """Download a sitemap, save it under ``save_dir`` and recurse into child sitemaps.

    The file is named after the last path segment of the URL (``index`` when the
    path has none), so two URLs sharing that segment write to the same file.
    Requests are sent with the module-level ``headers`` dict. Any failure
    (network, file system, XML parsing) is printed rather than raised, so one
    bad sitemap does not stop the rest of the crawl.

    Args:
        url (str): The URL of the sitemap to download.
        save_dir (str): The directory to save the downloaded sitemap.
        downloaded_urls (set): A set of URLs already downloaded to prevent
            duplicates. Updated in place.
        timeout (float): Seconds to wait for the download request before giving up.

    Returns:
        None
    """
    if url in downloaded_urls:
        print(f"Skipping already downloaded sitemap: {url}")
        return  # Avoid re-downloading sitemaps

    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        if response.status_code != 200:
            print(f"Failed to download sitemap. Status code: {response.status_code}")
            return

        # Name the local file after the last segment of the URL path
        filename = os.path.basename(urlparse(url).path) or 'index'
        os.makedirs(save_dir, exist_ok=True)
        save_path = os.path.join(save_dir, filename)

        with open(save_path, 'wb') as f:
            f.write(response.content)
        print(f"Sitemap downloaded successfully to: {save_path}")

        # Record the URL before recursing so sitemaps that reference each
        # other cannot cause endless recursion.
        downloaded_urls.add(url)

        # Pass the raw bytes on: the XML parser then applies the encoding the
        # document declares, instead of this function assuming UTF-8.
        child_potential_sitemap_urls = find_potential_sitemap_urls(response.content)
        child_sitemap_urls = check_if_xml(child_potential_sitemap_urls, headers=headers)
        for child_url in child_sitemap_urls:
            download_sitemap(child_url, save_dir, downloaded_urls, timeout=timeout)

    except Exception as e:
        # Deliberately broad: the crawl is best-effort
        print(f"An error occurred: {e}")

sitemap_dir = "example_sitemaps_new"

# Create the output directory if it doesn't exist; exist_ok=True keeps this
# cell safe to re-run once the directory is already there.
# NOTE(review): exit(1) is a script-style abort; inside a notebook kernel it may
# behave differently from a plain interpreter — confirm this is what is wanted.
try:
    os.makedirs(sitemap_dir, exist_ok=True)
except OSError as e:
    print(f"Error creating directory: {e}")
    exit(1)

# URL of the root sitemap the recursive download starts from
sitemap_url = "https://www.mytimerecovery.com/sitemap.xml"

downloaded_urls = set()  # To track downloaded URLs (mutated by download_sitemap)
download_sitemap(sitemap_url, sitemap_dir, downloaded_urls)


Sitemap downloaded successfully to: example_sitemaps_new\sitemap.xml
Sitemap downloaded successfully to: example_sitemaps_new\post-sitemap.xml
Sitemap downloaded successfully to: example_sitemaps_new\page-sitemap.xml
Sitemap downloaded successfully to: example_sitemaps_new\jet-menu-sitemap.xml
Sitemap downloaded successfully to: example_sitemaps_new\treatments-sitemap.xml
Sitemap downloaded successfully to: example_sitemaps_new\mental-health-sitemap.xml
Sitemap downloaded successfully to: example_sitemaps_new\california-sitemap.xml
Sitemap downloaded successfully to: example_sitemaps_new\team-sitemap.xml
Sitemap downloaded successfully to: example_sitemaps_new\therapy-sitemap.xml
Sitemap downloaded successfully to: example_sitemaps_new\program-sitemap.xml
Sitemap downloaded successfully to: example_sitemaps_new\category-sitemap.xml
Sitemap downloaded successfully to: example_sitemaps_new\author-sitemap.xml


In [3]:
import requests

# URL of the sitemap
sitemap_url = "https://www.mytimerecovery.com/sitemap.xml"
# Headers for each request (desktop-browser User-Agent string)
headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
}

# Make a single GET request for the sitemap
response_sitemap = requests.get(sitemap_url, headers=headers)


# Report the HTTP status code of the response
print("Sitemap Status:", response_sitemap.status_code)


# Print the raw response body as text, whatever the status code was
print("Sitemap Content:", response_sitemap.text)


Sitemap Status: 403
Sitemap Content: <html>
<head><title>403 Forbidden</title></head>
<body>
<center><h1>403 Forbidden</h1></center>
<hr><center>nginx</center>
</body>
</html>



In [33]:
import xml.etree.ElementTree as ET

# Parse the XML file.
# The path uses a forward slash, which Python accepts on every platform. The
# earlier backslash form relied on '\l' being an unrecognized escape sequence
# and only resolved to the intended file on Windows.
tree = ET.parse('example_sitemaps/llamaindex_sitemap.xml')

# Get the root element
root = tree.getroot()

# Print the root element tag and attributes
print("Root Element:", root.tag)
print("Attributes:", root.attrib)

# Print the tag and attributes of each direct child of the root element
print("Children of Root Element:")
for child in root:
    print(child.tag, child.attrib)

# For every element (at any depth) that carries an href attribute,
# print its tag together with the href value
for element in root.iter():
    if 'href' in element.attrib:
        print(element.tag, ":", element.attrib['href'])


Root Element: {http://www.sitemaps.org/schemas/sitemap/0.9}urlset
Attributes: {}
Children of Root Element:
{http://www.sitemaps.org/schemas/sitemap/0.9}url {}
{http://www.sitemaps.org/schemas/sitemap/0.9}url {}
{http://www.sitemaps.org/schemas/sitemap/0.9}url {}
{http://www.sitemaps.org/schemas/sitemap/0.9}url {}
{http://www.sitemaps.org/schemas/sitemap/0.9}url {}
{http://www.sitemaps.org/schemas/sitemap/0.9}url {}
{http://www.sitemaps.org/schemas/sitemap/0.9}url {}
{http://www.sitemaps.org/schemas/sitemap/0.9}url {}
{http://www.sitemaps.org/schemas/sitemap/0.9}url {}
{http://www.sitemaps.org/schemas/sitemap/0.9}url {}
{http://www.sitemaps.org/schemas/sitemap/0.9}url {}
{http://www.sitemaps.org/schemas/sitemap/0.9}url {}
{http://www.sitemaps.org/schemas/sitemap/0.9}url {}
{http://www.sitemaps.org/schemas/sitemap/0.9}url {}
{http://www.sitemaps.org/schemas/sitemap/0.9}url {}
{http://www.sitemaps.org/schemas/sitemap/0.9}url {}
{http://www.sitemaps.org/schemas/sitemap/0.9}url {}
{http://w

In [1]:
import xml.etree.ElementTree as ET
import requests

def parse_sitemap(url, headers=None, timeout=10):
    """Fetch a sitemap or sitemap index and print every location it lists.

    For a ``<sitemapindex>`` root, each ``<sitemap><loc>`` is printed as
    ``Found sitemap: ...``; for any other root, each ``<url><loc>`` is printed
    as ``Found URL: ...``. Entries without a usable ``<loc>`` are skipped.
    Nested sitemaps are reported but not followed.

    Args:
        url (str): URL of the sitemap (or sitemap index) to fetch.
        headers (dict | None): Optional request headers, e.g. a User-Agent.
        timeout (float): Seconds to wait for the request before giving up.

    Returns:
        None

    Raises:
        requests.exceptions.HTTPError: If the server answers with a 4xx/5xx status.
        xml.etree.ElementTree.ParseError: If the response body is not well-formed XML.
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    # Fail on HTTP error statuses instead of trying to parse an error page
    response.raise_for_status()

    # Parse the XML content (bytes, so the declared encoding is honoured)
    root = ET.fromstring(response.content)

    # Dynamically find the namespace: qualified tags look like "{uri}name"
    namespace = ''
    if root.tag.startswith('{'):
        namespace = root.tag.split('}')[0] + '}'

    # A sitemap index lists <sitemap> entries; an actual sitemap lists <url> entries
    if root.tag == f'{namespace}sitemapindex':
        entry_tag, label = 'sitemap', 'Found sitemap'
        # Optionally, call parse_sitemap(loc) recursively below to parse nested sitemaps
    else:
        entry_tag, label = 'url', 'Found URL'

    for entry in root.findall(f'{namespace}{entry_tag}'):
        # findtext returns None when <loc> is missing and '' when it is empty
        loc = (entry.findtext(f'{namespace}loc') or '').strip()
        if not loc:
            continue
        print(f'{label}: {loc}')

# Example usage: list the entries of one sitemap index.
# parse_sitemap calls raise_for_status(), so an HTTP error status surfaces here as an exception.
sitemap_url = 'https://www.mytimerecovery.com/sitemap_index.xml'
parse_sitemap(sitemap_url)


HTTPError: 403 Client Error: Forbidden for url: https://www.mytimerecovery.com/sitemap_index.xml

In [6]:
import requests

# Query parameters for the scraping API request.
# NOTE(review): the API key is hardcoded in the notebook; consider loading it
# from an environment variable so it is not saved with the file.
payload = {
    'api_key': 'm',
    'url': 'https://example.com',
    'render': True,
    'country_code': 'us',
    'device_type': 'desktop',
}
r = requests.get('https://api.scraperapi.com/', params=payload)
print(r.text)


<!DOCTYPE html><html><head>
    <title>Example Domain</title>

    <meta charset="utf-8">
    <meta http-equiv="Content-type" content="text/html; charset=utf-8">
    <meta name="viewport" content="width=device-width, initial-scale=1">
    <style type="text/css">
    body {
        background-color: #f0f0f2;
        margin: 0;
        padding: 0;
        font-family: -apple-system, system-ui, BlinkMacSystemFont, "Segoe UI", "Open Sans", "Helvetica Neue", Helvetica, Arial, sans-serif;
        
    }
    div {
        width: 600px;
        margin: 5em auto;
        padding: 2em;
        background-color: #fdfdff;
        border-radius: 0.5em;
        box-shadow: 2px 3px 7px 2px rgba(0,0,0,0.02);
    }
    a:link, a:visited {
        color: #38488f;
        text-decoration: none;
    }
    @media (max-width: 700px) {
        div {
            margin: 0 auto;
            width: auto;
        }
    }
    </style>    
</head>

<body>
<div>
    <h1>Example Domain</h1>
    <p>This domain is for