In [None]:
import re
from urllib.parse import quote

import pandas as pd
import yfinance as yf
from bs4 import BeautifulSoup

In [None]:
# Extract product id, product url, ticker and fund name from the downloaded
# iShares ETF overview page (product_id_ishare.html).
# The product url is needed later for downloading the up-to-date holdings file.
website_header = "https://www.ishares.com"

# Context manager so the file handle is closed as soon as the page is read.
with open("product_id_ishare.html", encoding="utf-8") as html_file:
    html = html_file.read()
soup = BeautifulSoup(html, "html.parser")

# Only table rows that contain at least one product link are relevant.
rows = soup.select("tr:has(td.links a[href*='/produkte/'])")

records = []

for row in rows:
    links = row.select("td.links a")

    if len(links) < 2:
        continue

    product_url = links[0].get("href")

    # The row selector only guarantees that ONE link of the row points to a
    # product page, so each of the first two hrefs may lack a numeric id.
    id_matches = [
        re.search(r"/produkte/(\d+)/", a.get("href") or "") for a in links[:2]
    ]
    if not all(id_matches):
        print(f"skipping row without product id: {links[0].get_text(strip=True)}")
        continue

    id1, id2 = (match.group(1) for match in id_matches)
    ticker = links[0].get_text(strip=True)
    fund_name = links[1].get_text(strip=True)
    records.append(
        {
            "product_id": id2,
            'product_url': website_header + product_url,
            "ticker": ticker,
            "fund_name": fund_name,
            # True when both links of the row carry the same product id
            "flag": id1 == id2,
        }
    )

# Explicit columns keep the frame usable even when no row was found.
records_df = pd.DataFrame(
    records, columns=["product_id", "product_url", "ticker", "fund_name", "flag"]
)
records_df['ticker'] = records_df['ticker'].str.strip()

In [None]:
# Load the selected ETF list and keep the 120 largest equity ETFs.
ETF_file = pd.read_excel("iShares-Germany_filtered_ETF_descending.xlsx")
mask_etf = ETF_file["Fondstyp"] == "ETF"
mask_asset_class = ETF_file["Anlageklasse"] == "Aktien"
filtered_etf = (
    ETF_file[mask_etf & mask_asset_class]
    .reset_index()
    .sort_values(by="Fondsvermögen", ascending=False)
    # select top 120
    .head(120)
    # assign() returns a new frame, so no value is written into a slice of
    # another frame (which is what triggers SettingWithCopyWarning).
    .assign(Ticker=lambda frame: frame["Ticker"].str.strip())
)

# Tickers are already stripped above; missing values are left out of the list.
tickers = filtered_etf["Ticker"].dropna().tolist()


In [None]:
# prepare ticker names for yfinance price download
# exchange suffixes, tried in exactly this order
suffixes = [".DE", ".L", ".SW", ".MI", ".PA", ".AS"]

def find_valid_ticker(base):
    """
    Return the first ``base + suffix`` symbol for which yfinance delivers a
    non-empty 5-day price frame, or None when no suffix yields data.
    """
    candidates = (base + suffix for suffix in suffixes)
    # The generator is lazy: downloads stop at the first symbol with data.
    return next(
        (
            symbol
            for symbol in candidates
            if not yf.download(symbol, period="5d", progress=False).empty
        ),
        None,
    )

# Map every selected ticker to a symbol that yfinance can download.
valid_map = {}
invalid = []

for t in tickers:
    # A ticker that already contains a dot is used unchanged; all others are
    # probed against the known exchange suffixes.
    real = t if "." in t else find_valid_ticker(t)

    if real:
        valid_map[t] = real
    else:
        invalid.append(t)

print(valid_map)
print("still invalid:", invalid)

In [None]:
# download prices from yfinance (three-year period, all resolved symbols at once)
valid_tickers = [*valid_map.values()]
data = yf.download(valid_tickers, period="3y")

# MultiIndex columns: selecting "Close" gives one column per symbol.
# Flat columns (single ticker case): select with a list to keep a DataFrame.
adj_close = data["Close"] if isinstance(data.columns, pd.MultiIndex) else data[["Close"]]

In [None]:
# save prices in long format: 'Date' stays an id column, every other column
# is unpivoted into rows
pd.melt(adj_close.reset_index(), id_vars=['Date']).to_csv('prices.csv')


In [4]:
# check if all selected tickers are available in the downloaded html ishare file
# The lookup set is built once instead of rebuilding a list on every iteration.
known_tickers = set(records_df['ticker'])
manual_list = []
for tick in tickers:
    if tick not in known_tickers:
        print(f'???{tick} not found')
        manual_list.append(tick)

print(manual_list)
print(len(manual_list))

[]
0


In [None]:
# records_df['duplicated'] = records_df['ticker'].duplicated()
# TODO  handle the duplicates

In [None]:
# keep only the rows whose ticker belongs to the selected ETFs
records_df = records_df.loc[
    lambda frame: frame['ticker'].isin(filtered_etf['Ticker'])
]

In [6]:
records_df.shape[0]  # row count; TODO: handle the duplicates

121

In [7]:
import re

def generate_ishares_download_link(product_url, fund_name, magic_string="1535604580385"):
    """
    Generate the link for downloading the holdings info of a specified ETF.

    Parameters
    ----------
    product_url : str
        Absolute URL of the ETF product page; a trailing slash is tolerated.
    fund_name : str
        Fund name; its whitespace-separated words are upper-cased and joined
        with '-' to form the requested file name.
    magic_string : str, optional
        Path component placed in front of ``.ajax``.
        NOTE(review): presumably the id of the holdings component on the
        iShares product page - confirm.

    Returns
    -------
    str
        ``<product_url>/<magic_string>.ajax?fileType=xls&fileName=<NAME>_fund&dataType=fund``
    """
    # Avoid a double slash when the product url already ends with '/'.
    base_url = f"{product_url.rstrip('/')}/{magic_string}.ajax"
    # str.split() without arguments already discards surrounding whitespace.
    file_name = "-".join(word.upper() for word in fund_name.split())
    # Percent-encode the name: a raw '&' (e.g. "S&P") would otherwise be read
    # as a query-string separator and cut the fileName parameter short.
    params = f"?fileType=xls&fileName={quote(file_name, safe='')}_fund&dataType=fund"

    return base_url + params

# Extend records_df by the holdings download url of every selected ETF.
# The url is derived from each row of records_df itself. Building the urls from
# the unfiltered `records` list and merging them back on product_id multiplies
# rows whenever a product_id occurs more than once.
# Dropping a possibly existing column keeps the cell re-runnable; the fresh
# 0..n-1 index matches what a merge would have produced.
records_df = records_df.drop(columns="download_url", errors="ignore").reset_index(drop=True)

xls_file_url_list = [
    {
        'download_url': generate_ishares_download_link(product_url, fund_name),
        'product_id': product_id,
    }
    for product_url, fund_name, product_id in zip(
        records_df['product_url'], records_df['fund_name'], records_df['product_id']
    )
]

# Insert as first column so the column order stays download_url, product_id, ...
records_df.insert(
    0, 'download_url', [entry['download_url'] for entry in xls_file_url_list]
)

In [8]:
def aggregate_etf_data(df, ticker):
    """
    Aggregate the holdings of one ETF by sector and by location.

    Parameters
    ----------
    df : pandas.DataFrame
        Holdings table with the columns 'Gewichtung (%)', 'Sektor' and
        'Standort'. The frame is NOT modified.
    ticker : str
        Ticker written into the 'Ticker' column of both result tables.

    Returns
    -------
    (industry_df, location_df) : tuple of pandas.DataFrame
        Columns ['Sector', 'Weight', 'Ticker'] and
        ['Location', 'Weight', 'Ticker']. 'Weight' is the summed weight as a
        fraction (percentage / 100).

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    """
    # 1. convert 'Gewichtung (%)', e.g. 1.66856 (percent) --> 0.0166856;
    #    values that cannot be parsed become NaN and do not add to the sums.
    weights = pd.to_numeric(df['Gewichtung (%)'], errors='coerce') / 100

    # Work on a private frame so the caller's holdings table keeps its columns.
    work = df[['Sektor', 'Standort']].assign(Weight_Decimal=weights)

    def _sum_weights_by(group_col, label):
        # Sum the weights per group and label the result for this ticker.
        agg = work.groupby(group_col)['Weight_Decimal'].sum().reset_index()
        agg.columns = [label, 'Weight']
        agg['Ticker'] = ticker
        return agg

    # 2. industry / 3. country
    return _sum_weights_by('Sektor', 'Sector'), _sum_weights_by('Standort', 'Location')

In [None]:
import xml.etree.ElementTree as ET
import pandas as pd
from io import StringIO


def parse_holdings_xls(xml_path):
    """
    Read a holdings download (an XML spreadsheet saved as .xls) into a DataFrame.

    Everything in front of the first ``<?xml`` marker is discarded. In the
    first worksheet table, the row containing the cell 'Emittententicker' is
    used as header; every later row with at least one non-empty cell becomes
    a data row.

    Parameters
    ----------
    xml_path : str
        Path of the downloaded file.

    Returns
    -------
    pandas.DataFrame
        One row per data row, all values as text (or None for empty cells).
        The frame is empty and has no columns when no header row was found.

    Raises
    ------
    ValueError
        If the file contains no XML document or no worksheet table.
    xml.etree.ElementTree.ParseError
        If the XML is still not well-formed after sanitising.
    """
    # ===== 1. read in file =====
    with open(xml_path, "r", encoding="utf-8", errors="ignore") as f:
        content = f.read()

    # ===== 2. locate the XML document =====
    xml_start = content.find("<?xml")
    if xml_start == -1:
        raise ValueError("XML not found in file")

    clean_xml = content[xml_start:]

    # Escape every '&' that does not start an entity / character reference
    # (e.g. "S&P", "AT&T"); a bare ampersand makes the parser stop with
    # "not well-formed (invalid token)".
    # NOTE(review): presumably the cause of the parse failures seen for some
    # funds - confirm against a failing download.
    clean_xml = re.sub(
        r"&(?!(?:[A-Za-z][A-Za-z0-9]*|#[0-9]+|#x[0-9A-Fa-f]+);)", "&amp;", clean_xml
    )

    # ===== 3. decode XML =====
    root = ET.fromstring(clean_xml)

    ns = {"ss": "urn:schemas-microsoft-com:office:spreadsheet"}

    table = root.find(".//ss:Worksheet/ss:Table", ns)
    if table is None:
        raise ValueError("no worksheet table found in file")

    header = None
    data = []
    start = False

    for row in table.findall("ss:Row", ns):
        # Text of every cell; None for cells without a Data element.
        values = [
            (d.text if d is not None else None)
            for d in (cell.find("ss:Data", ns) for cell in row.findall("ss:Cell", ns))
        ]

        # get table header
        if "Emittententicker" in values:
            header = values
            start = True
            continue

        # Rows in front of the header and completely empty rows are skipped.
        if start and any(v not in (None, "") for v in values):
            data.append(values)

    df = pd.DataFrame(data, columns=header)

    # print(df.head())
    print(f"\nsuccessfully extracted {len(df)} rows")
    return df
    

In [None]:
import requests
import random
import time

# Module-level collectors filled by retrieve_holding_info_from_ishare().
master_industry_list = []  # one sector aggregation frame per processed ETF
master_location_list = []  # one location aggregation frame per processed ETF
df = []  # parsed holdings frame of every successful download
manual_inspect = []  # tickers whose download, parsing or aggregation failed


def retrieve_holding_info_from_ishare(download_url, referer_url, ticker, timeout=30):
    """
    Download the holdings file of one ETF, parse it and aggregate it.

    On success the parsed holdings are appended to ``df`` and the sector /
    location aggregations to ``master_industry_list`` / ``master_location_list``.
    On any failure the ticker is appended to ``manual_inspect``.

    Parameters
    ----------
    download_url : str
        URL of the holdings file.
    referer_url : str
        Product page URL, sent as ``Referer`` header.
    ticker : str
        Ticker used for logging and for labelling the aggregations.
    timeout : float, optional
        Seconds to wait for the server before giving up; without a timeout a
        stalled connection blocks the whole download loop.

    Returns
    -------
    None
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        "Referer": referer_url,  # present the request as coming from the product page
        "Accept": "text/csv,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
    }

    try:
        # use session to avoid 403 blockage; the context manager releases the
        # connection once the response body has been read
        with requests.Session() as session:
            response = session.get(download_url, headers=headers, timeout=timeout)
        # random pause between requests
        time.sleep(random.randint(1, 3))

        # Every non-200 answer (403, 404, 429, 5xx, ...) is logged and flagged,
        # so no ticker disappears silently.
        if response.status_code != 200:
            print(f"status code: {response.status_code} for ticker {ticker}")
            manual_inspect.append(ticker)
            return None

        # The file is overwritten on every call; it only serves as parser input.
        with open("holdings_info.xls", "wb") as f:
            f.write(response.content)
        holdings_df = parse_holdings_xls("holdings_info.xls")
        df.append(holdings_df)
        try:
            ind_df, loc_df = aggregate_etf_data(holdings_df, ticker)

            master_industry_list.append(ind_df)
            master_location_list.append(loc_df)
            print(f"✅ {ticker} agg successful")
        except Exception as e:
            print(f"❌ {ticker} agg failed: {e}")
            manual_inspect.append(ticker)

        return None
    except Exception as e:
        # Best effort: network and parse errors must not stop the batch.
        print(f"somthing's wrong...: {e}")
        manual_inspect.append(ticker)
        return None


# Download and aggregate the holdings of every ETF in records_df.
# Iterating over the column values avoids label-based `series[i]` lookups,
# which only work while records_df carries a default 0..n-1 index.
for download_url, referer_url, ticker in zip(
    records_df['download_url'], records_df["product_url"], records_df["ticker"]
):
    retrieve_holding_info_from_ishare(download_url, referer_url, ticker)

# agg to industry/location tables: stack the per-ETF frames, then strip
# surrounding whitespace from the group labels
final_industry_master = pd.concat(master_industry_list, ignore_index=True)
final_location_master = pd.concat(master_location_list, ignore_index=True)
final_industry_master["Sector"] = final_industry_master["Sector"].map(
    lambda label: label.strip()
)
final_location_master["Location"] = final_location_master["Location"].map(
    lambda label: label.strip()
)




成功提取 310 行
✅ IQQ0 聚合完成
运行出错: not well-formed (invalid token): line 42, column 42
status code: 403 for ticker IUSC
status code: 403 for ticker DAXEX

成功提取 137 行
✅ EUNY 聚合完成

成功提取 134 行
✅ IUSK 聚合完成

成功提取 636 行
✅ IUSL 聚合完成

成功提取 68 行
✅ EXI2 聚合完成

成功提取 304 行
✅ EXSI 聚合完成

成功提取 55 行
✅ EUN2 聚合完成

成功提取 55 行
✅ EXW1 聚合完成

成功提取 33 行
✅ EXX1 聚合完成

成功提取 35 行
✅ IQQA 聚合完成

成功提取 107 行
✅ IUSZ 聚合完成

成功提取 57 行
✅ IQQC 聚合完成

成功提取 57 行
✅ IQQD 聚合完成

成功提取 300 行
✅ IQQI 聚合完成
status code: 403 for ticker EXS3

成功提取 480 行
✅ IQQF 聚合完成

成功提取 1860 行
✅ IUSQ 聚合完成

成功提取 103 行
✅ IUSC 聚合完成

成功提取 1612 行
✅ IQQE 聚合完成

成功提取 1113 行
✅ EUNM 聚合完成

成功提取 417 行
✅ IQQY 聚合完成

成功提取 433 行
✅ EUNK 聚合完成

成功提取 341 行
✅ IQQU 聚合完成

成功提取 188 行
✅ IQQJ 聚合完成

成功提取 965 行
✅ EUNN 聚合完成

成功提取 637 行
✅ IQQN 聚合完成

成功提取 1336 行
✅ IQQW 聚合完成

成功提取 1392 行
✅ EUNL 聚合完成

成功提取 1366 行
✅ IBCH 聚合完成
status code: 403 for ticker EXXT
status code: 403 for ticker EXX7
status code: 403 for ticker IUSA
status code: 403 for ticker IBCF

成功提取 88 行
✅ IS0E 聚合完成
运行出错: not well-f

In [11]:
# number of entries flagged for manual inspection (a ticker is appended on every failure, so it may occur more than once)
len(manual_inspect)

24

In [13]:
# index labels of all rows whose ticker was flagged for manual inspection
selected_indices = records_df.index[records_df['ticker'].isin(manual_inspect)].tolist()

In [None]:
# try again for those in manual_inspect
for idx in selected_indices:
    # one label-based lookup delivers all three values of the row
    download_url, referer_url, ticker = records_df.loc[
        idx, ['download_url', "product_url", "ticker"]
    ]
    retrieve_holding_info_from_ishare(download_url, referer_url, ticker)

运行出错: not well-formed (invalid token): line 42, column 42
status code: 403 for ticker IUSC
status code: 403 for ticker DAXEX
status code: 403 for ticker EXS3

成功提取 103 行
✅ IUSC 聚合完成
status code: 403 for ticker EXXT
status code: 403 for ticker EXX7


KeyboardInterrupt: 

In [None]:
# for idx in selected_indices:
#     download_url = records_df.loc[idx,'download_url']
#     referer_url = records_df.loc[idx,"product_url"]
#     ticker = records_df.loc[idx,"ticker"]
#     retrieve_holding_info_from_ishare(download_url, referer_url, ticker)

运行出错: not well-formed (invalid token): line 42, column 42
status code: 403 for ticker IUSC
status code: 403 for ticker DAXEX
status code: 403 for ticker EXS3

成功提取 103 行
✅ IUSC 聚合完成
status code: 403 for ticker EXXT
status code: 403 for ticker EXX7
status code: 403 for ticker IUSA
status code: 403 for ticker IBCF
运行出错: not well-formed (invalid token): line 5178, column 195
运行出错: not well-formed (invalid token): line 3438, column 195
运行出错: not well-formed (invalid token): line 4522, column 194
运行出错: not well-formed (invalid token): line 42, column 37
运行出错: not well-formed (invalid token): line 42, column 42
运行出错: not well-formed (invalid token): line 42, column 37
运行出错: not well-formed (invalid token): line 42, column 37
运行出错: not well-formed (invalid token): line 42, column 37
运行出错: not well-formed (invalid token): line 42, column 46

成功提取 0 行
❌ QDVB 聚合失败: 'Gewichtung (%)'
运行出错: not well-formed (invalid token): line 42, column 37
运行出错: not well-formed (invalid token): line 42, column 37

In [15]:
# get TER info
import requests
from bs4 import BeautifulSoup
import time
import os
headers = {
    "User-Agent": "Mozilla/5.0"
}
import re

ter_list = []  # collected {'ISIN', 'TER'} records
manual_ter_lst = []  # ISINs whose TER could not be determined

def get_ter(isin):
    """
    Look up the TER of one ETF on its justETF profile page.

    Parameters
    ----------
    isin : str
        ISIN of the ETF.

    Returns
    -------
    dict or None
        ``{'ISIN': isin, 'TER': <float, in percent>}`` on success. None when
        the request fails, the TER element is not found exactly once, or its
        text contains no percentage; in these cases the ISIN is appended to
        ``manual_ter_lst``.
    """
    url = f"https://www.justetf.com/en/etf-profile.html?isin={isin}"

    try:
        # The timeout keeps a stalled connection from blocking the whole scrape.
        rq = requests.get(url, headers=headers, timeout=30)
    except requests.RequestException as e:
        print(f'ISIN:{isin} request failed: {e}')
        manual_ter_lst.append(isin)
        return None

    if rq.status_code == 403:
        print(isin)
    soup = BeautifulSoup(rq.text, "html.parser")

    # find the element that holds the TER value
    labels = soup.find_all(
        "div",
        class_="val bold",
        attrs={"data-testid": "etf-profile-header_ter-value"}
    )
    if len(labels) == 1:
        # Search the whole text so that "0.20%", "0.20 % p.a." and "0%" are all
        # accepted; unexpected text no longer raises on a failed match.
        ter_match = re.search(r'(\d+(?:\.\d+)?)\s*%', labels[0].get_text(strip=True))
        if ter_match:
            print("ISIN:", isin, "TER:", ter_match.group(0))
            return {'ISIN': isin, "TER": float(ter_match.group(1))}

    print(f'ISIN:{isin} not found')
    manual_ter_lst.append(isin)
    return None
        
    
# Scrape the TER of every selected ETF unless a cached ter_info.csv exists.
if not os.path.exists('ter_info.csv'):
    for isin in filtered_etf['ISIN'].tolist():
        ter_record = get_ter(isin)
        # get_ter returns None for ISINs it could not resolve; a None entry
        # cannot be turned into a row of the ISIN/TER table.
        if ter_record is not None:
            ter_list.append(ter_record)
        time.sleep(2)

    # Explicit columns keep the later merge on 'ISIN' working even when no
    # TER could be scraped at all.
    ter_df = pd.DataFrame(ter_list, columns=['ISIN', 'TER'])
    # NOTE(review): the cache file is never written here, so this branch runs
    # again on every execution unless ter_info.csv was created by hand.
    # ter_df.to_csv('ter_info.csv')
else:
    ter_df = pd.read_csv('ter_info.csv')

In [16]:
# attach the ticker to every TER record (right join: every row of ter_df is kept)
ter_with_ticker_df = filtered_etf[['Ticker', 'ISIN']].merge(ter_df, on='ISIN', how='right')

In [None]:
# add the ISIN/TER columns to the location table (left join on the ticker)
final_location_master = final_location_master.merge(
    ter_with_ticker_df, on='Ticker', how='left'
)
# save result
final_industry_master.to_csv("master_industry_table.csv", index=False)
final_location_master.to_csv("master_location_table.csv", index=False)
