In [25]:
import requests
from urllib.parse import urljoin

def find_sitemap(url):
  """Searches for a website's sitemap using various methods.

  Args:
      url (str): The base URL of the website.

  Returns:
      str: The URL of the sitemap if found, otherwise None.
  """

  # Common sitemap locations
  sitemap_urls = [
      urljoin(url, '/sitemap.xml'),
      urljoin(url, '/sitemap_index.xml'),
  ]

  # Check robots.txt for sitemap reference
  robots_url = urljoin(url, '/robots.txt')
  try:
    response = requests.get(robots_url)
    if response.status_code == 200:
      for line in response.text.splitlines():
        if line.lower().startswith('sitemap:'):
          sitemap_urls.append(line.split(':')[1].strip())
  except requests.exceptions.RequestException:
    pass  # Ignore errors fetching robots.txt

  # Try Google search operators (less reliable)
  search_url = f"site:{url} filetype:xml inurl:sitemap"
  try:
    response = requests.get(search_url)
    if response.status_code == 200:
      for link in response.text.iterlinks():
        if link[0].endswith('.xml'):
          sitemap_urls.append(link[0])
  except requests.exceptions.RequestException:
    pass  # Ignore errors with Google search

  # Check each potential sitemap URL
  for sitemap_url in sitemap_urls:
    try:
      response = requests.get(sitemap_url)
      if response.status_code == 200 and response.headers['Content-Type'].startswith('text/xml'):
        return sitemap_url
    except requests.exceptions.RequestException:
      pass  # Ignore errors fetching potential sitemaps

  return None

# Example usage
website_url = "https://www.artificialintelligence-news.com/"
sitemap_url = find_sitemap(website_url)

if sitemap_url:
  print(f"Sitemap found at: {sitemap_url}")
else:
  print("Sitemap not found")


Sitemap found at: https://www.artificialintelligence-news.com/sitemap.xml


In [26]:
import os
import requests
from urllib.parse import urlparse

def download_sitemap(url, save_dir):
  """Downloads the sitemap from the given URL and saves it to the specified directory.

  Args:
      url (str): The URL of the sitemap to download.
      save_dir (str): The directory to save the downloaded sitemap.

  Returns:
      str: The path to the saved sitemap file, or None if an error occurred.
  """

  try:
    response = requests.get(url)

    if response.status_code == 200:
      # Extract filename from URL
      parsed_url = urlparse(url)
      print(parsed_url)
      filename_components = parsed_url.netloc.split('.')
      filename = '_'.join(filename_components)
      if not filename:  # Handle cases where URL path ends with '/'
        filename = os.path.basename(parsed_url.path[:-1])  # Get path segment before trailing '/'

      # Create filename with appropriate extension (assuming XML for sitemaps)
      sitemap_filename = f"{filename}_sitemap.xml"

      # Create save path
      save_path = os.path.join(save_dir, sitemap_filename)

      with open(save_path, 'wb') as f:
        f.write(response.content)
      print(f"Sitemap downloaded successfully to: {save_path}")
      return save_path  # Return the path for potential use

    else:
      print(f"Failed to download sitemap. Status code: {response.status_code}")
      return None  # Indicate failure

  except Exception as e:
    print(f"An error occurred: {e}")
    return None  # Indicate failure

sitemap_dir = "example_sitemaps"

# Create the directory if it doesn't exist (improved error handling)
try:
  os.makedirs(sitemap_dir, exist_ok=True)  # Suppress errors if directory already exists
except OSError as e:
  print(f"Error creating directory: {e}")
  exit(1)  # Exit with an error code if directory creation fails

url = "https://www.artificialintelligence-news.com/sitemap.xml"

saved_sitemap_path = download_sitemap(url, sitemap_dir)

if saved_sitemap_path:
  print(f"You can access the downloaded sitemap at: {saved_sitemap_path}")


ParseResult(scheme='https', netloc='www.artificialintelligence-news.com', path='/sitemap.xml', params='', query='', fragment='')
Sitemap downloaded successfully to: example_sitemaps\www_artificialintelligence-news_com_sitemap.xml
You can access the downloaded sitemap at: example_sitemaps\www_artificialintelligence-news_com_sitemap.xml


In [33]:
import xml.etree.ElementTree as ET

# Parse the XML file
tree = ET.parse('example_sitemaps\llamaindex_sitemap.xml')

# Get the root element
root = tree.getroot()

# Print the root element tag and attributes
print("Root Element:", root.tag)
print("Attributes:", root.attrib)

# Print the children of the root element
print("Children of Root Element:")
for child in root:
    print(child.tag, child.attrib)

# Print the text content of elements along with the href attribute
for element in root.iter():
    if 'href' in element.attrib:
        print(element.tag, ":", element.attrib['href'])


Root Element: {http://www.sitemaps.org/schemas/sitemap/0.9}urlset
Attributes: {}
Children of Root Element:
{http://www.sitemaps.org/schemas/sitemap/0.9}url {}
{http://www.sitemaps.org/schemas/sitemap/0.9}url {}
{http://www.sitemaps.org/schemas/sitemap/0.9}url {}
{http://www.sitemaps.org/schemas/sitemap/0.9}url {}
{http://www.sitemaps.org/schemas/sitemap/0.9}url {}
{http://www.sitemaps.org/schemas/sitemap/0.9}url {}
{http://www.sitemaps.org/schemas/sitemap/0.9}url {}
{http://www.sitemaps.org/schemas/sitemap/0.9}url {}
{http://www.sitemaps.org/schemas/sitemap/0.9}url {}
{http://www.sitemaps.org/schemas/sitemap/0.9}url {}
{http://www.sitemaps.org/schemas/sitemap/0.9}url {}
{http://www.sitemaps.org/schemas/sitemap/0.9}url {}
{http://www.sitemaps.org/schemas/sitemap/0.9}url {}
{http://www.sitemaps.org/schemas/sitemap/0.9}url {}
{http://www.sitemaps.org/schemas/sitemap/0.9}url {}
{http://www.sitemaps.org/schemas/sitemap/0.9}url {}
{http://www.sitemaps.org/schemas/sitemap/0.9}url {}
{http://w

In [42]:
import xml.etree.ElementTree as ET
import requests

def parse_sitemap(url):
    response = requests.get(url)
    # Ensure the response is in XML format
    response.raise_for_status()

    # Parse the XML content
    root = ET.fromstring(response.content)

    # Dynamically find the namespace
    namespace = ''
    if root.tag.startswith('{'):
        namespace = root.tag.split('}')[0] + '}'

    # Check if this is a sitemap index or an actual sitemap
    if root.tag == f'{namespace}sitemapindex':
        # If it's a sitemap index, iterate through each sitemap listed and repeat
        for sitemap in root.findall(f'{namespace}sitemap'):
            loc = sitemap.find(f'{namespace}loc').text
            print(f'Found sitemap: {loc}')
            # Optionally, you can call parse_sitemap(loc) recursively to parse nested sitemaps
    else:
        # If it's an actual sitemap, iterate through each URL listed
        for url in root.findall(f'{namespace}url'):
            loc = url.find(f'{namespace}loc').text
            print(f'Found URL: {loc}')

# Example usage
sitemap_url = 'https://www.artificialintelligence-news.com/sitemap_index.xml'
parse_sitemap(sitemap_url)


Found sitemap: https://www.artificialintelligence-news.com/post-sitemap.xml
Found sitemap: https://www.artificialintelligence-news.com/post-sitemap2.xml
Found sitemap: https://www.artificialintelligence-news.com/page-sitemap.xml
Found sitemap: https://www.artificialintelligence-news.com/events-sitemap.xml
Found sitemap: https://www.artificialintelligence-news.com/resources-sitemap.xml
Found sitemap: https://www.artificialintelligence-news.com/videos-sitemap.xml
Found sitemap: https://www.artificialintelligence-news.com/category-sitemap.xml
Found sitemap: https://www.artificialintelligence-news.com/post_tag-sitemap.xml
Found sitemap: https://www.artificialintelligence-news.com/post_tag-sitemap2.xml
Found sitemap: https://www.artificialintelligence-news.com/post_tag-sitemap3.xml
Found sitemap: https://www.artificialintelligence-news.com/author-sitemap.xml
