The idea of this optimizer is to find the optimal portfolio in terms of low correlation, high past returns, and a diverse country and sector (industry branch) distribution.

TODO:
- add ETF composition chart
- add the top 10 most valuable stocks (largest holdings) of each ETF
- add annual returns: YTD and past 5 years
- add weights to enable portfolio analysis
- clean up code, still super messy

In [None]:
# save as etf_correlation.py and run: python etf_correlation.py
import os
import time
from datetime import datetime, timedelta, timezone

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import yfinance as yf

In [None]:
# --- user input: ISINs of the ETFs to compare, each paired with a short description
_isin_notes = (
    ("IE00B1XNHC34", "Global Clean Energy"),
    ("IE00B4L5Y983", "MSCI World"),
    ("IE00B4K48X80", "MSCI Europe"),
    ("IE00BKM4GZ66", "MSCI Emerging Markets"),
)
# only the ISIN codes are used downstream; the descriptions are for the reader
isins = [code for code, _note in _isin_notes]

In [None]:

# Yahoo symbol patterns tried for each ISIN, in this order; the first one that returns data wins.
# Add or reorder suffixes (e.g. '.DE', '.L', '.SW', '.AS') to prefer a specific exchange.
variants = ["{isin}.IR", "{isin}.L", "{isin}.DE", "{isin}.AS", "{isin}", "{isin}.SW"]

# ISIN -> Yahoo symbol that returned price data (filled by fetch_one, reused for the holdings lookup)
fetched_symbol_for_isin = {}

# Price window: the ~5 years (5 * 365 days) ending today in UTC, as YYYY-MM-DD strings.
# datetime.utcnow() is deprecated since Python 3.12, so take a timezone-aware "now" instead.
end = datetime.now(timezone.utc).date()
start = end - timedelta(days=365 * 5)
start_str = start.isoformat()
end_str = end.isoformat()

def fetch_one(isin):
    """Download the closing-price history for one ISIN from Yahoo Finance.

    Every pattern in the module-level ``variants`` list is formatted with the
    ISIN and tried in order for the ``start_str``..``end_str`` window. The first
    symbol that yields usable closing prices wins and is recorded in
    ``fetched_symbol_for_isin``.

    Args:
        isin: ISIN code of the ETF.

    Returns:
        A single-column DataFrame of closing prices whose column is named after
        the ISIN, or ``None`` when no symbol variant produced data.
    """
    for v in variants:
        symbol = v.format(isin=isin)
        try:
            df = yf.download(symbol, start=start_str, end=end_str, progress=False, auto_adjust=True, threads=True)
        except Exception as e:
            # best effort: report the failure, pause briefly, then try the next variant
            print(f"Failed {symbol}: {e}")
            time.sleep(0.5)
            continue

        if df is None or df.empty:
            # no data - try next
            continue

        # Some yfinance releases return two-level columns even for a single
        # symbol; collapse them to the level that holds the field names so the
        # 'Close' selection and the rename below work on flat labels.
        if isinstance(df.columns, pd.MultiIndex):
            field_level = next(
                (lvl for lvl in range(df.columns.nlevels)
                 if 'Close' in df.columns.get_level_values(lvl)),
                0,
            )
            df.columns = df.columns.get_level_values(field_level)

        if 'Close' not in df.columns:
            print(f"Failed {symbol}: no 'Close' column in downloaded data")
            continue

        # keep only the closing price and drop days without a value
        closes = df[['Close']].dropna()
        if closes.empty:
            # every close was NaN - treat like "no data" and try the next variant
            continue
        closes = closes.rename(columns={'Close': isin})

        # remember which Yahoo symbol returned data for this ISIN (useful for holdings lookup)
        fetched_symbol_for_isin[isin] = symbol
        print(f"Fetched {symbol} rows={len(closes)}")
        return closes

    print(f"Unable to fetch any variant for ISIN {isin}")
    return None

# download every requested ISIN; ISINs for which no symbol variant worked are skipped
frames = []
for isin in isins:
    df = fetch_one(isin)
    if df is None:
        continue
    frames.append(df)

# nothing could be downloaded at all -> stop here, the analysis below needs price data
if len(frames) == 0:
    raise SystemExit("No data fetched. Try running in Colab / local network, or provide price CSVs.")

In [None]:

# align by date (inner join) to keep only the days on which every ETF has a price
prices = pd.concat(frames, axis=1, join='inner').sort_index()

# compute daily log returns; the first row has no predecessor and is dropped
returns = np.log(prices / prices.shift(1)).dropna()

# A correlation needs at least two return observations. The inner join above can
# leave fewer when the price histories barely overlap, which would otherwise
# produce an all-NaN matrix without any warning.
if len(returns) < 2:
    raise SystemExit("Not enough overlapping price history across the ETFs to compute correlations.")

# correlation matrix (Pearson on daily log returns)
corr = returns.corr()
print(corr)

# Readable ETF names keyed by ISIN (adjust or extend as needed)
isin_to_name = dict(
    IE00B1XNHC34="Global Clean Energy",
    IE00B4L5Y983="MSCI World",
    IE00B4K48X80="MSCI Europe",
    IE00BKM4GZ66="MSCI Emerging Markets",
)


def _label_for(column):
    """Return the readable name for a correlation-matrix column.

    Lookup order: exact ISIN match, then the part before the first '.'
    (e.g. 'IE00B1XNHC34.L'), then the first known ISIN contained in the
    column. The column itself is returned when nothing matches.
    """
    found = isin_to_name.get(column)
    if found is None and '.' in column:
        found = isin_to_name.get(column.split('.')[0])
    if found is None:
        found = next(
            (readable for known_isin, readable in isin_to_name.items() if known_isin in column),
            None,
        )
    return column if found is None else found


# One label per column, in column order
labels = [_label_for(column) for column in corr.columns]

# Relabelled copy used only for plotting; `corr` keeps its ISIN labels
corr_display = corr.copy()
corr_display.index = labels
corr_display.columns = labels

# Heatmap of the correlation matrix on a fixed [-1, 1] colour scale
tick_positions = range(len(labels))
plt.figure(figsize=(7,6))
plt.imshow(corr_display.values, interpolation='nearest', cmap='coolwarm', vmin=-1, vmax=1)
plt.colorbar()
plt.xticks(tick_positions, labels, rotation=45, ha='right')
plt.yticks(tick_positions, labels)
plt.title("Daily log-return correlation (last ~5 years)")

# Write every coefficient (two decimals) into its cell; x is the column, y is the row
for (row_pos, col_pos), coefficient in np.ndenumerate(corr_display.values):
    # pick the text colour for contrast against the cell colour
    text_color = 'white' if abs(coefficient) > 0.5 else 'black'
    plt.text(col_pos, row_pos, f"{coefficient:.2f}", ha='center', va='center', color=text_color, fontsize=10)

plt.tight_layout()
plt.show()

In [None]:
# get stock holdings for the ETFs by scraping the Yahoo Finance holdings page with pandas.read_html
from urllib.parse import quote_plus

# substrings (lower-case) that mark a column as the holding's name or as its weight
_NAME_HINTS = ('symbol', 'ticker', 'name')
_WEIGHT_HINTS = ('weight', '%', 'holding')


def _parse_percent(value):
    """Convert a cell such as '1.23%' or '1,234.5' to float; NaN when it is not numeric."""
    if pd.isna(value):
        return np.nan
    cleaned = str(value).replace('%', '').replace(',', '').strip()
    try:
        return float(cleaned)
    except ValueError:
        return np.nan


def _holdings_from_table(tbl):
    """Extract a holdings frame from one scraped table, or None if it has no weight-like column."""
    lowered = [str(c).lower() for c in tbl.columns]
    weight_positions = [
        pos for pos, low in enumerate(lowered)
        if any(hint in low for hint in _WEIGHT_HINTS)
    ]
    if not weight_positions:
        return None

    # first column that looks like a symbol/name, otherwise the first column
    name_pos = next(
        (pos for pos, low in enumerate(lowered) if any(hint in low for hint in _NAME_HINTS)),
        0,
    )
    # Prefer a weight column that is not also the name column (a header such as
    # 'Holding Name' matches both hint sets); fall back to the first weight column.
    pct_pos = next((pos for pos in weight_positions if pos != name_pos), weight_positions[0])

    # select by position so duplicate column labels cannot yield more than two columns
    out = tbl.iloc[:, [name_pos, pct_pos]].copy()
    out.columns = ['SymbolOrName', 'HoldingPercent']
    out['HoldingPercent'] = out['HoldingPercent'].map(_parse_percent)
    return out.set_index('SymbolOrName')


def fetch_holdings(isin):
    """Scrape the holdings table of an ETF from its Yahoo Finance holdings page.

    The Yahoo symbol recorded in ``fetched_symbol_for_isin`` is used when
    available; otherwise the ISIN itself is used as the symbol.

    Args:
        isin: ISIN code of the ETF.

    Returns:
        A DataFrame indexed by 'SymbolOrName' with one float column
        'HoldingPercent' (NaN where a cell is not numeric), taken from the first
        table that has a weight-like column. ``None`` when the page cannot be
        read or no such table exists.
    """
    # prefer the exact Yahoo symbol we fetched earlier (if available)
    symbol = fetched_symbol_for_isin.get(isin, isin)
    url = f"https://finance.yahoo.com/quote/{quote_plus(symbol)}/holdings?p={quote_plus(symbol)}"
    try:
        tables = pd.read_html(url)
    except Exception as e:
        # best effort: any failure to load or parse the page is reported and treated as "no data"
        print(f"Failed to read holdings page for {symbol}: {e}")
        return None

    for tbl in tables:
        holdings = _holdings_from_table(tbl)
        if holdings is not None:
            return holdings
    return None

# show the first ten holdings rows of every ETF, or a notice when none could be scraped
for isin in isins:
    holdings = fetch_holdings(isin)
    if holdings is None:
        print(f"No holdings data for ISIN {isin}")
        continue
    print(f"\nTop holdings for ISIN {isin}:")
    print(holdings.head(10))