In [68]:
import pandas as pd
import requests
from io import StringIO
from xml.etree import ElementTree as ET

def fetch_and_parse_xml(url):
    """Fetch XML content from a URL and parse it into a DataFrame."""
    response = requests.get(url)
    response.raise_for_status()
    
    # Parse XML content
    root = ET.fromstring(response.content)
    data = []
    columns = []
    
    # Define the namespaces (if any)
    namespaces = {'ns': 'http://www.sitemaps.org/schemas/sitemap/0.9'}
    
    # Extract the desired fields
    data = []
    for url in root.findall('ns:url', namespaces):
        loc = url.find('ns:loc', namespaces).text if url.find('ns:loc', namespaces) is not None else None
        lastmod = url.find('ns:lastmod', namespaces).text if url.find('ns:lastmod', namespaces) is not None else None
        changefreq = url.find('ns:changefreq', namespaces).text if url.find('ns:changefreq', namespaces) is not None else None
        priority = url.find('ns:priority', namespaces).text if url.find('ns:priority', namespaces) is not None else None
        
        data.append({
            'loc': loc,
            'lastmod': lastmod,
            'changefreq': changefreq,
            'priority': priority
        })
    
    # Convert to DataFrame
    df = pd.DataFrame(data)
    return df

def process_xml_files(urls):
    """Process a list of XML file URLs and combine them into a single DataFrame."""
    combined_df = pd.DataFrame()
    for url in urls:
        try:
            # Extract file name from URL
            file_name = url.split("/")[-1]
            # Parse XML and add the file name as a new column
            df = fetch_and_parse_xml(url)
            df['source_file'] = file_name
            # Concatenate with the combined DataFrame
            combined_df = pd.concat([combined_df, df], ignore_index=True)
        except Exception as e:
            print(f"Error processing {url}: {e}")
    
    return combined_df

# # Example usage
# urls = [
#     "https://example.com/file1.xml",
#     "https://example.com/file2.xml",
#     "https://example.com/file3.xml"
# ]

# result_df = process_xml_files(urls)

# # Display the result
# print(result_df)

In [57]:
import re
import pandas as pd

# Load text content
text_content = """
User-agent: Brightbot 1.0
Disallow: /

User-agent: *

Disallow: /au-institutional-iframe*
Disallow: /pensions-planning-iframe*
Disallow: /capstone*

Disallow: /*.dl$

Disallow: /*sign-on.oauth
Disallow: /*sign-on.saml
Disallow: /*sign-on-popup.saml
Disallow: /user-platform/
Disallow: /userplatform/
Disallow: /search/
Disallow: /*?truepdf*
Disallow: /*?norepdf*
  
Disallow: /X/
Disallow: /compliance/
  
Disallow: /ref/docs

Disallow: /api-tester/

Disallow: /ee$
Disallow: /gr$
Disallow: /is$
Disallow: /ie$
Disallow: /lv$
Disallow: /li$
Disallow: /lt$
Disallow: /mt$
Disallow: /cy$
Disallow: /gg$
Disallow: /gi$
Disallow: /im$
Disallow: /je$

Disallow: /ee/
Disallow: /gr/
Disallow: /is/
Disallow: /ie/
Disallow: /lv/
Disallow: /li/
Disallow: /lt/
Disallow: /mt/
Disallow: /cy/
Disallow: /gg/
Disallow: /gi/
Disallow: /im/
Disallow: /je/

Disallow: /us/financial-professionals/*.pdf$

SITEMAP: https://www.blackrock.com/ae/sitemap.xml
SITEMAP: https://www.blackrock.com/ae/product-sitemap.xml
SITEMAP: https://www.blackrock.com/aladdin/sitemap.xml
SITEMAP: https://www.blackrock.com/americas-offshore/sitemap.xml
SITEMAP: https://www.blackrock.com/americas-offshore/product-sitemap.xml
SITEMAP: https://www.blackrock.com/at/sitemap.xml
SITEMAP: https://www.blackrock.com/at/product-sitemap.xml
SITEMAP: https://www.blackrock.com/au/sitemap.xml
SITEMAP: https://www.blackrock.com/au/product-sitemap.xml
SITEMAP: https://www.blackrock.com/be/sitemap.xml
SITEMAP: https://www.blackrock.com/be/product-sitemap.xml
SITEMAP: https://www.blackrock.com/br/sitemap.xml
SITEMAP: https://www.blackrock.com/br/product-sitemap.xml
SITEMAP: https://www.blackrock.com/ca/sitemap.xml
SITEMAP: https://www.blackrock.com/ca/product-sitemap.xml
SITEMAP: https://www.blackrock.com/cash/sitemap.xml
SITEMAP: https://www.blackrock.com/cash/product-sitemap.xml
SITEMAP: https://www.blackrock.com/ch/sitemap.xml
SITEMAP: https://www.blackrock.com/ch/product-sitemap.xml
SITEMAP: https://www.blackrock.com/cl/sitemap.xml
SITEMAP: https://www.blackrock.com/cl/product-sitemap.xml
SITEMAP: https://www.blackrock.com/cn/sitemap.xml
SITEMAP: https://www.blackrock.com/cn/product-sitemap.xml
SITEMAP: https://www.blackrock.com/co/sitemap.xml
SITEMAP: https://www.blackrock.com/co/product-sitemap.xml
SITEMAP: https://www.blackrock.com/corporate/sitemap.xml
SITEMAP: https://www.blackrock.com/cz/sitemap.xml
SITEMAP: https://www.blackrock.com/cz/product-sitemap.xml
SITEMAP: https://www.blackrock.com/de/sitemap.xml
SITEMAP: https://www.blackrock.com/de/product-sitemap.xml
SITEMAP: https://www.blackrock.com/dk/sitemap.xml
SITEMAP: https://www.blackrock.com/dk/product-sitemap.xml
SITEMAP: https://www.blackrock.com/es/sitemap.xml
SITEMAP: https://www.blackrock.com/es/product-sitemap.xml
SITEMAP: https://www.blackrock.com/fi/sitemap.xml
SITEMAP: https://www.blackrock.com/fi/product-sitemap.xml
SITEMAP: https://www.blackrock.com/financial-markets-advisory/sitemap.xml
SITEMAP: https://www.blackrock.com/fr/sitemap.xml
SITEMAP: https://www.blackrock.com/fr/product-sitemap.xml
SITEMAP: https://www.blackrock.com/hk/sitemap.xml
SITEMAP: https://www.blackrock.com/hk/product-sitemap.xml
SITEMAP: https://www.blackrock.com/hu/sitemap.xml
SITEMAP: https://www.blackrock.com/hu/product-sitemap.xml
SITEMAP: https://www.blackrock.com/il/sitemap.xml
SITEMAP: https://www.blackrock.com/il/product-sitemap.xml
SITEMAP: https://www.blackrock.com/institutions/sitemap.xml
SITEMAP: https://www.blackrock.com/it/sitemap.xml
SITEMAP: https://www.blackrock.com/it/product-sitemap.xml
SITEMAP: https://www.blackrock.com/jp/sitemap.xml
SITEMAP: https://www.blackrock.com/jp/product-sitemap.xml
SITEMAP: https://www.blackrock.com/kiid/sitemap.xml
SITEMAP: https://www.blackrock.com/kr/sitemap.xml
SITEMAP: https://www.blackrock.com/lu/sitemap.xml
SITEMAP: https://www.blackrock.com/lu/product-sitemap.xml
SITEMAP: https://www.blackrock.com/mx/sitemap.xml
SITEMAP: https://www.blackrock.com/mx/product-sitemap.xml
SITEMAP: https://www.blackrock.com/nl/sitemap.xml
SITEMAP: https://www.blackrock.com/nl/product-sitemap.xml
SITEMAP: https://www.blackrock.com/no/sitemap.xml
SITEMAP: https://www.blackrock.com/no/product-sitemap.xml
SITEMAP: https://www.blackrock.com/pl/sitemap.xml
SITEMAP: https://www.blackrock.com/pl/product-sitemap.xml
SITEMAP: https://www.blackrock.com/pt/sitemap.xml
SITEMAP: https://www.blackrock.com/pt/product-sitemap.xml
SITEMAP: https://www.blackrock.com/sa/sitemap.xml
SITEMAP: https://www.blackrock.com/se/sitemap.xml
SITEMAP: https://www.blackrock.com/se/product-sitemap.xml
SITEMAP: https://www.blackrock.com/sg/sitemap.xml
SITEMAP: https://www.blackrock.com/sg/product-sitemap.xml
SITEMAP: https://www.blackrock.com/sk/sitemap.xml
SITEMAP: https://www.blackrock.com/sk/product-sitemap.xml
SITEMAP: https://www.blackrock.com/tw/sitemap.xml
SITEMAP: https://www.blackrock.com/tw/product-sitemap.xml
SITEMAP: https://www.blackrock.com/uk/sitemap.xml
SITEMAP: https://www.blackrock.com/uk/product-sitemap.xml
SITEMAP: https://www.blackrock.com/us/individual/sitemap.xml
SITEMAP: https://www.blackrock.com/us/individual/product-sitemap.xml
SITEMAP: https://www.blackrock.com/us/financial-professionals/sitemap.xml
SITEMAP: https://www.blackrock.com/us/financial-professionals/product-sitemap.xml
SITEMAP: https://www.blackrock.com/za/sitemap.xml
SITEMAP: https://www.blackrock.com/za/product-sitemap.xml
"""

# Step 1: Extract lines containing "SITEMAP" and "Product Sitemap"
sitemap_lines = [
    line.strip() for line in text_content.splitlines() if "SITEMAP" in line and "product-sitemap" in line.lower()
]

# Step 2: Extract the two-letter codes between slashes
def extract_two_letter_codes(url):
    return re.findall(r'/([a-z]{2})/', url)

# Create a DataFrame with the extracted sitemaps and codes
sitemap_data = []
for line in sitemap_lines:
    url = line.split(": ", 1)[-1]  # Extract the URL part
    codes = extract_two_letter_codes(url)
    sitemap_data.append({"url": url, "codes": codes})

sitemap_df = pd.DataFrame(sitemap_data)

# Get unique two-letter codes from the text
all_codes = set(
    code for line in text_content.splitlines() for code in re.findall(r'/([a-z]{2})(/|$)', line)
)

# Step 3: Generate new URLs in the correct format
def generate_new_urls_in_format(codes):
    base_url = "https://www.blackrock.com/{code}/product-sitemap.xml"
    return [base_url.format(code=code[0]) for code in codes]

# Generate new sitemap URLs
formatted_urls = generate_new_urls_in_format(all_codes)

# Convert to a DataFrame for saving or output
generated_urls_df = pd.DataFrame(formatted_urls, columns=["Generated URLs"])

# Save to a file or display
generated_urls_df.to_csv("formatted_generated_sitemaps.csv", index=False)

print("Generated URLs:")
print(generated_urls_df.head())

Generated URLs:
                                     Generated URLs
0  https://www.blackrock.com/mt/product-sitemap.xml
1  https://www.blackrock.com/gg/product-sitemap.xml
2  https://www.blackrock.com/li/product-sitemap.xml
3  https://www.blackrock.com/uk/product-sitemap.xml
4  https://www.blackrock.com/il/product-sitemap.xml


In [73]:
pd.set_option('display.max_colwidth', None)
urls=generated_urls_df['Generated URLs'].tolist()
result_df = process_xml_files(urls)
result_df

Error processing https://www.blackrock.com/sa/product-sitemap.xml: 500 Server Error: Internal Server Error for url: https://www.blackrock.com/sa/product-sitemap.xml


Unnamed: 0,loc,lastmod,changefreq,priority,source_file
0,https://www.blackrock.com/mt/products/228272/blackrock-global-allocation-a2-eur-fund,2024-11-20,WEEKLY,0.5,product-sitemap.xml
1,https://www.blackrock.com/mt/products/228273/blackrock-global-allocation-hedged-a2-eur-fund,2024-11-20,WEEKLY,0.5,product-sitemap.xml
2,https://www.blackrock.com/mt/products/228296/blackrock-global-allocation-d2-eur-fund,2024-11-20,WEEKLY,0.5,product-sitemap.xml
3,https://www.blackrock.com/mt/products/228321/blackrock-world-agriculture-hedged-a2-eur-fund,2024-11-20,WEEKLY,0.5,product-sitemap.xml
4,https://www.blackrock.com/mt/products/228340/blackrock-continental-european-flex-d2rf-eur-fund,2024-11-20,WEEKLY,0.5,product-sitemap.xml
...,...,...,...,...,...
148726,https://www.blackrock.com/za/professionals/products/310486/ishares-world-equity-index-fund-lu,2024-11-20,WEEKLY,0.5,product-sitemap.xml
148727,https://www.blackrock.com/za/professionals/products/311981/ishares-emerging-markets-government-bond-index-fund-lu,2024-11-20,WEEKLY,0.5,product-sitemap.xml
148728,https://www.blackrock.com/za/professionals/products/314317/blackrock-multi-theme-equity-fund,2024-11-20,WEEKLY,0.5,product-sitemap.xml
148729,https://www.blackrock.com/za/professionals/products/314324/blackrock-multi-theme-equity-fund,2024-11-20,WEEKLY,0.5,product-sitemap.xml


In [86]:
# result_df.to_pickle("allXMLCombined.pkl")
result_df=pd.read_pickle("allXMLCombined.pkl")
result_df["countryCode"]=result_df["loc"].str.split("/").str[3]
result_df["audienceCode"]=result_df["loc"].str.split("/").str[4]
eu_country_codes = [
    "at", "be", "bg", "hr", "cy", "cz", "dk", "ee", "fi", "fr","li","lt","mt","no","cy",
    "de", "el", "hu", "ie", "it", "lv", "lt", "lu", "mt", "nl","je"
    "pl", "pt", "ro", "sk", "si", "es", "se","ch","gr","uk","gi","gg","is","im"
]
result_dfEU=result_df[result_df.countryCode.isin(eu_country_codes)]
result_dfEU

result_df.groupby(by=["countryCode","audienceCode"]).count()
result_df.groupby(by=["countryCode"]).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,loc,lastmod,changefreq,priority,source_file
countryCode,audienceCode,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ae,intermediaries,2604,2604,2604,2604,2604
at,privatanleger,3955,3955,3955,3955,3955
at,professionelle-anleger,3955,3955,3955,3955,3955
au,products,113,113,113,113,113
be,individual,4454,4454,4454,4454,4454
...,...,...,...,...,...,...
us,financial-professionals,1032,1032,1032,1032,1032
us,partner,1032,1032,1032,1032,1032
us,professional-investors,1032,1032,1032,1032,1032
za,individual,124,124,124,124,124


Unnamed: 0,loc,lastmod,changefreq,priority,source_file,countryCode,audienceCode
0,https://www.blackrock.com/mt/products/228272/blackrock-global-allocation-a2-eur-fund,2024-11-20,WEEKLY,0.5,product-sitemap.xml,mt,products
1,https://www.blackrock.com/mt/products/228273/blackrock-global-allocation-hedged-a2-eur-fund,2024-11-20,WEEKLY,0.5,product-sitemap.xml,mt,products
2,https://www.blackrock.com/mt/products/228296/blackrock-global-allocation-d2-eur-fund,2024-11-20,WEEKLY,0.5,product-sitemap.xml,mt,products
3,https://www.blackrock.com/mt/products/228321/blackrock-world-agriculture-hedged-a2-eur-fund,2024-11-20,WEEKLY,0.5,product-sitemap.xml,mt,products
4,https://www.blackrock.com/mt/products/228340/blackrock-continental-european-flex-d2rf-eur-fund,2024-11-20,WEEKLY,0.5,product-sitemap.xml,mt,products
...,...,...,...,...,...,...,...
146925,https://www.blackrock.com/fr/particuliers/products/339819/ishares-developed-world-index-fund-ie,2024-11-20,WEEKLY,0.5,product-sitemap.xml,fr,particuliers
146926,https://www.blackrock.com/fr/particuliers/products/339841/ishares-short-duration-corp-bond-ucits-etf,2024-11-20,WEEKLY,0.5,product-sitemap.xml,fr,particuliers
146927,https://www.blackrock.com/fr/particuliers/products/339857/blackrock-systematic-china-a-share-opportunities-fund,2024-11-20,WEEKLY,0.5,product-sitemap.xml,fr,particuliers
146928,https://www.blackrock.com/fr/particuliers/products/339866/blackrock-japan-flexible-equity-fund,2024-11-20,WEEKLY,0.5,product-sitemap.xml,fr,particuliers
