See [README.md](https://github.com/druce/portfolio_optimization/blob/master/README.md) for discussion, environment setup


In [None]:
import os
import sys
from multiprocessing import Pool
from datetime import datetime
import time 
import requests
import dotenv

import numpy as np
import pandas as pd
import pandas_datareader as pdr
import xlrd

import scipy
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster, leaves_list
from scipy.spatial.distance import squareform

import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.pyplot as plt

import openbb
from openbb import obb
from openbb_core.app.model.obbject import OBBject

# https://www.cvxpy.org/install/index.html
import cvxpy as cp

# https://riskfolio-lib.readthedocs.io/en/latest/
import riskfolio as rp

# fix the global NumPy RNG seed so random draws are repeatable
np.random.seed(2347)

# report interpreter and library versions, one aligned row per package
# (xlrd is imported above but intentionally not reported)
python_version = ".".join(str(part) for part in sys.version_info[:3])
for pkg_name, pkg_version in [
    ('python', python_version),
    ("numpy", np.__version__),
    ("scipy", scipy.__version__),
    ("pandas", pd.__version__),
    ("pandas-datareader", pdr.__version__),
    ("seaborn", sns.__version__),
    ("matplotlib", matplotlib.__version__),
    ("cvxpy", cp.__version__),
    ("openbb", obb.system.version),
    ("riskfolio", rp.__version__),
]:
    print("%-20s %s" % (pkg_name, pkg_version))


# Get data

In [None]:
# load spreadsheet from Damodaran website into pandas dataframe
import ssl

data_xls = 'https://www.stern.nyu.edu/~adamodar/pc/datasets/histretSP.xls'
data_sheet = "Returns by year"
# these will change as rows get added on Damodaran website
skiprows = range(19)
skipfooter = 13

# Workaround for certificate errors on this download.
# NOTE(security): this disables TLS certificate verification, so it is scoped
# to this one request and the default context factory is restored afterwards
# instead of staying disabled for the rest of the session.
_default_https_context = ssl._create_default_https_context
ssl._create_default_https_context = ssl._create_unverified_context
try:
    download_df = pd.read_excel(data_xls,
                                sheet_name=data_sheet,
                                skiprows=skiprows,
                                skipfooter=skipfooter)
finally:
    ssl._create_default_https_context = _default_https_context
download_df


In [None]:
# use the integer year both as the "Year" column and as the row index
year_as_int = download_df["Year"].astype(int)
download_df = download_df.assign(Year=year_as_int).set_index(year_as_int)
download_df


In [None]:
# download GDP for correlation matrix
series = ['GDPCA']

fred_gdp = pdr.data.DataReader(series, 'fred', start='1926-12-31').reset_index()
# index the rows by calendar year so they line up with the year-indexed returns
gdp_download = fred_gdp.set_index(pd.DatetimeIndex(fred_gdp['DATE']).year)
# year-over-year growth of the GDPCA level series
gdp_download['GDP'] = gdp_download['GDPCA'].pct_change()
# hand-entered growth figures for the earliest years
# https://fortunly.com/statistics/us-gdp-by-year-guide/#gref
for gdp_year, gdp_growth in {1928: 0.0110, 1929: 0.0652}.items():
    gdp_download.loc[gdp_year, 'GDP'] = gdp_growth
gdp_download = gdp_download.sort_index()
gdp_download.to_csv('gdp_fred.csv')

gdp_download

In [None]:
# Copy the downloaded sheet and echo its column names.
real_data_df = download_df.copy()
real_data_df.columns

In [None]:
# use caution to grab real return columns, not nominal, column names are similar
# check values vs. sheet
column_aliases = {
    "Real Estate": "Real Estate (nominal)",
    'Inflation Rate': 'CPI',
    'S&P 500 (includes dividends)2': 'S&P',
    "US Small cap (bottom decile)22": "Small Caps",
    '!0-year T.Bonds': 'T-Notes',
    '3-month T. Bill (Real)': 'T-Bills',
    'Baa Corp Bonds': 'Baa Corps',
    'Real Estate3': 'Real Estate',
}
# the plain "Real Estate" column is dropped first, so "Real Estate3" can take its name
real_data_df = (
    download_df.copy()
    .drop(columns=["Real Estate"])
    .rename(columns=column_aliases)
)

# GDP growth is aligned on the year index
real_data_df["GDP"] = gdp_download['GDP']

# filter and reorder
keep_cols = [
    'GDP',
    'CPI',
    'S&P',
    'Small Caps',
    'T-Bills',
    'T-Notes',
    'Baa Corps',
    'Real Estate',
    'Gold',
]
real_data_df = real_data_df[keep_cols]
real_data_df

In [None]:
# geometric mean (annualized compound) return of each series:
# the n-th root of the final growth factor, minus 1.
# (Subtracting 1 from the growth factor before taking the root is not a
# compound rate and turns into NaN whenever the growth factor is below 1.)
cumreturns = (1 + real_data_df).cumprod()
cumreturns.iloc[-1] ** (1 / len(cumreturns)) - 1


In [None]:
# check gold data v MacroTrends
# gdf = pd.read_csv('MacroTrends_Data_Download.csv')
# gdf["Year"] = pd.to_datetime(gdf['date'])
# gdf = gdf.loc[gdf["Year"].dt.month ==12]
# gdf["Year"] = gdf["Year"].dt.year
# gdf = gdf.set_index("Year", drop=True)
# gdf['real_pct'] = gdf["real"].pct_change()
# gdf['nom_pct'] = gdf["nominal"].pct_change()
# gdf["damodaran_real_pct"]=download_df["Gold"]
# gdf["damodaran_nom_pct"]=download_df["Gold*"]
# gdf["real_diff"] = gdf["damodaran_real_pct"]-gdf["real_pct"]
# gdf["nom_diff"] = gdf["damodaran_nom_pct"]-gdf["nom_pct"]
# gdf[["nom_diff", 'real_diff']].plot()

# Visualize

In [None]:
# Notebook display setting: applies to every later DataFrame/Series echo.
pd.set_option('display.max_rows', None)  # display all rows without truncation


In [None]:
# make every column float before computing correlations
real_data_df = real_data_df.astype(float)

# compute correlation matrix
corr_all = real_data_df.corr()
my_cmap = sns.diverging_palette(10, 220, sep=80, n=50)
sns.heatmap(corr_all, annot=True, fmt=".02f", cmap=my_cmap);


In [None]:
# drop CPI, GDP which are not assets
# errors='ignore' keeps the cell safe to re-run after the columns are already
# gone, without a bare `except` that would also hide unrelated failures
real_data_df = real_data_df.drop(columns=['CPI', 'GDP'], errors='ignore')

df = real_data_df.copy()
df.head()


In [None]:
# Line chart of every column in df; the trailing ';' suppresses the notebook echo.
df.plot.line();


In [None]:
# correlation heatmap restricted to the asset columns
asset_corr = df.corr()
my_cmap = sns.diverging_palette(10, 220, sep=80, n=50)
sns.heatmap(asset_corr, annot=True, fmt=".02f", cmap=my_cmap);


In [None]:
# plot historical cumulative growth
# running product of (1 + return) for every asset
df2 = (1 + df).cumprod()

df2.plot.line();


In [None]:
# plot historical cumulative growth since 1970
# same running product, restricted to index labels from 1970 onward
df2 = (1 + df.loc[1970:]).cumprod()

df2.plot.line();


In [None]:
# Asset names in df column order; reused below to label tables and weight columns.
labels = list(df.columns)
labels


# Long-only optimization

## 1928 - present 

In [None]:
# arithmetic means (per-column average of the returns in df)
df.mean()

In [None]:
# geometric mean: n-th root of the final growth factor, minus 1
# (the root must be taken of the growth factor itself, not of growth factor - 1)
cumreturns = (1 + df).cumprod()
cumreturns.iloc[-1] ** (1 / len(cumreturns)) - 1
# difference due to volatility and compounding and maybe divergences from IID log normal distribution


In [None]:
# sample covariance matrix of the asset returns (np.cov expects one row per asset)
Sigma = np.cov(df.T)
# number of assets
n = len(Sigma)
# average returns
mu = df.mean().to_numpy()
# asset STDs
asset_vols = np.sqrt(np.diag(Sigma))

# variable to optimize over - portfolio weights
w = cp.Variable(n)

# objectives to optimize, as cvxpy expressions of w
# portfolio return
ret = mu @ w
# portfolio variance (named `vol`; its square root is the standard deviation)
vol = cp.quad_form(w, Sigma)

# per-asset summary table: mean real return and standard deviation
z = pd.DataFrame([mu, asset_vols], columns=labels).assign(rows=['real return', 'vol'])
z.set_index('rows')

In [None]:
# Echo the cvxpy weight variable.
w

In [None]:
# Echo the covariance matrix.
Sigma

In [None]:
# Solve max return portfolio (corner solution)
# should be 100% highest return asset
long_only = [cp.sum(w) == 1,   # weights sum to 1
             w >= 0]           # no short positions
prob = cp.Problem(cp.Maximize(ret), long_only)
prob.solve()

# return and variance at this corner bound the frontier sweep further down
maxret, maxretvol = ret.value, vol.value
wts = [float(f"{wt:0.4f}") for wt in w.value]
print("Max return portfolio weights")
pd.DataFrame([wts], columns=labels)
# all stocks which is highest return asset

In [None]:
# solve min vol portfolio (other corner solution)
# should be mostly T-bills but there is variance in t-bills so it diversifies
long_only = [cp.sum(w) == 1,   # weights sum to 1
             w >= 0]           # each weight >= 0
prob = cp.Problem(cp.Minimize(vol), long_only)
prob.solve()

# variance and return at the minimum-variance corner
minvol, minvolret = vol.value, ret.value
# round to not get x.xxxxE-22
wts = [float(f"{wt:0.6f}") for wt in w.value]
print("Min vol portfolio weights")
pd.DataFrame([wts], columns=labels)
# mostly t-bills and real estate

In [None]:
# %%time
# solve points in between
# sweep a variance cap from the min-vol corner to the max-return corner and
# maximize return under each cap

# the cap is a cvxpy Parameter, so one Problem is built once and re-solved
# this allows the solver to reuse previous work
vol_limit = cp.Parameter(nonneg=True)
capped_long_only = [cp.sum(w) == 1,
                    w >= 0,
                    vol <= vol_limit]
prob = cp.Problem(cp.Maximize(ret), capped_long_only)


# kept as a function so the sweep can also be mapped over a process pool
def solve_vl(vl_val):
    """Solve the frontier problem under variance cap vl_val.

    Returns (portfolio return, portfolio standard deviation, weight vector).
    """
    vol_limit.value = vl_val
    prob.solve()
    return (ret.value, np.sqrt(vol.value), w.value)


# number of points on the frontier
NPOINTS = 200
# the constraint is in variance space: space the grid evenly in std, then square
vl_vals = np.square(np.linspace(np.sqrt(minvol), np.sqrt(maxretvol), NPOINTS))

# iterate in-process
results_dict = {vl_val: solve_vl(vl_val) for vl_val in vl_vals}

# parallel implementation
# NPROCESSES = 8
# pool = Pool(processes = NPROCESSES)
# result_values = pool.map(solve_vl, vl_vals)
# results_dict = dict(zip(vl_vals, result_values))


In [None]:
# tabulate the frontier: one row per variance cap, in sweep order
frontier_caps = list(results_dict)
ret_df = pd.DataFrame({'i': range(len(frontier_caps)), 'vol': frontier_caps})
ret_df['return'] = [results_dict[cap][0] for cap in ret_df['vol']]
ret_df['std'] = [results_dict[cap][1] for cap in ret_df['vol']]
# one weight column per asset, taken positionally from the solver's weight vector
for pos, colname in enumerate(labels):
    ret_df[colname] = [results_dict[cap][2][pos] for cap in ret_df['vol']]


In [None]:
# Echo the frontier table (variance cap, return, std and weights per row).
ret_df

In [None]:
# plot efficient frontier
def plot_efrontier(ret_df, df,
                   xlabel="Standard Deviation of Real Returns",
                   ylabel="Real Return",
                   title=None):

    Sigma = np.cov(df.transpose())
    n = Sigma.shape[0]
    mu = df.mean().values
    asset_vols = np.sqrt(Sigma.diagonal())

    risk_free_rate = 0  # availability of any risk-free real rate in this context is debatable 
    ret_df["Sharpe"] = (ret_df["return"] - risk_free_rate) / ret_df["std"]
    
    max_sharpe_index = ret_df["Sharpe"].argmax()  
#     print(max_sharpe_index)
    max_sharpe_return = ret_df.iloc[max_sharpe_index]["return"]
#     print(max_sharpe_return)
    max_sharpe_std = ret_df.iloc[max_sharpe_index]["std"]
#     print(max_sharpe_std)
    max_sharpe_ratio = ret_df.iloc[max_sharpe_index]["Sharpe"]
    
    asset_names = [t for t in ['TIPS', 'T-Bills', 'Real Estate', 'T-Notes', 'Baa Corps', 'Gold', 'S&P', 'Small Caps', 'shorts'] if t in ret_df.columns]
    
    mean_wts = ret_df[asset_names].mean() # average weights over all efficient portolios
    temp_ret_df = df[asset_names]         # historical returns
    avg_ret = temp_ret_df @ mean_wts.values
    avg_ret_mean = avg_ret.mean()
    avg_ret_std = avg_ret.std()
    
    plt.figure(figsize=(8, 4.5))

    # plot the data
    plt.plot(ret_df['std'], ret_df['return'])
    # Force both axes to start at 0
    plt.xlim(left=0, right=max(asset_vols))
    plt.ylim(bottom=min(0, min(mu)))
    
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plot_title = "Risk vs. Real Return,  %d-%d" % (df.index[0], df.index[-1]) if title is None else title
    plt.title(plot_title)

    # plot the markers
    plt.scatter(asset_vols, mu)
    xoffset = 0.0025
    yoffset = 0.0015
    labels = df.columns
    for i, label in enumerate(labels):
        plt.annotate(label, xy=(asset_vols[i]+xoffset, mu[i]+yoffset), xycoords='data',
                     horizontalalignment='left', verticalalignment='top',
                    )
    plt.scatter([max_sharpe_std], [max_sharpe_return])
    plt.annotate("Max Sharpe", xy=(max_sharpe_std+xoffset, max_sharpe_return+yoffset), xycoords='data',
                 horizontalalignment='left', verticalalignment='top',
                )
    plt.scatter([avg_ret_std], [avg_ret_mean])
    plt.annotate("EF Avg Wts", xy=(avg_ret_std+xoffset, avg_ret_mean+yoffset), xycoords='data',
                 horizontalalignment='left', verticalalignment='top',
                )
    plt.show()
    

    print("Max Sharpe Portfolio:")
    print(f"Real Return:  {100*max_sharpe_return:3.2f}%")
    print(f"SD:           {100*max_sharpe_std:3.2f}%")
    print(f"Sharpe Ratio: {max_sharpe_ratio:3.3f}")

    for col in asset_names:
        print(f"{col}: {100*ret_df.iloc[max_sharpe_index][col]:3.1f}%")
  
    print()
    print("Average over efficient frontier:")
    print(f"Real Return:  {100*avg_ret_mean:3.2f}%")
    print(f"SD:           {100*avg_ret_std:3.2f}%")
    print(f"Sharpe Ratio: {avg_ret_mean/avg_ret_std:3.3f}")
    for col in asset_names:
        print(f"{col}: {100*mean_wts[col]:3.1f}%")
            
    return max_sharpe_return, max_sharpe_std, avg_ret_mean, avg_ret_std
        
max_sharpe_return, max_sharpe_std, avg_ret_mean, avg_ret_std = plot_efrontier(ret_df, df)


In [None]:
# stacked area chart of weights vs. returns
# for given vol constraint and corresponding real return, show portfolio weights
def transition_map(ret_df, labels, startyear, endyear, max_sharpe_return=None, avg_ret_mean=None, ylim=1):
    """Stacked area chart of portfolio weights against portfolio real return.

    ret_df            -- frontier table with a 'return' column and one weight
                         column per entry of labels.
    labels            -- asset column names, stacked bottom-to-top in this order.
    startyear/endyear -- shown in the chart title.
    max_sharpe_return -- if given, a dashed vertical line is drawn at this return.
    avg_ret_mean      -- accepted for call compatibility; not drawn, because it
                         does not correspond to a portfolio on the map.
    ylim              -- top of the weight axis (use > 1 for leveraged portfolios).
    """
    x = ret_df['return']
    # absolute values so shorts don't create chaos
    y_list = [ret_df[label].abs() for label in labels]
    # the first seven colours are the original palette; the extras keep charts
    # with more than seven assets from repeating a colour
    # (matplotlib cycles through `colors` when there are more layers than colours)
    pal = ['red', 'lightgreen', 'darkgreen', 'navy', 'cyan', 'violet', 'gold',
           'orange', 'gray', 'brown']

    fig = plt.figure(figsize=(8, 4.5))
    ax1 = fig.add_subplot(111)

    ax1.stackplot(x, y_list, labels=labels, colors=pal)
    ax1.set_xlim((ret_df['return'].iloc[0], ret_df['return'].iloc[-1]))
    ax1.set_ylim((0, ylim))
    ax1.set_xlabel("Portfolio Real Return")
    ax1.set_ylabel("Portfolio Weight")
    ax1.legend(loc='lower right')
#     return/std relationship is not linear, can't have both axes
#     ax2 = ax1.twiny()
#     ax2.set_xlim((ret_df['std'].iloc[0], ret_df['std'].iloc[-1]))
#     ax2.set_xlabel('Portfolio Vol')

    if max_sharpe_return is not None:
        ax1.axvline(max_sharpe_return, color='black', linestyle='--', linewidth=1)

# don't draw line for avg_ret_mean, doesn't correspond to a portfolio in the transition map
#     if avg_ret_mean is not None:
#         ax1.axvline(avg_ret_mean, color='black', linestyle='--', linewidth=1)

    plt.title("Optimal Portfolio Transition Map, %d-%d" % (startyear, endyear), y=1.16)

transition_map(ret_df, labels=df.columns, startyear=df.index[0], endyear=df.index[-1], max_sharpe_return=max_sharpe_return, avg_ret_mean=avg_ret_mean)


## 1967 - present (more inflationary era including post gold standard)

In [None]:
# Restrict the sample to index labels from 1967 onward (label-based slice) and chart it.
df = real_data_df.loc[1967:]
df.plot.line();


In [None]:
# covariance of the asset returns for the current sample (one row per asset for np.cov)
Sigma = np.cov(df.T)
# number of assets
n = len(Sigma)
# average returns
mu = df.mean().to_numpy()
# asset STDs
asset_vols = np.sqrt(np.diag(Sigma))

# variable to optimize over - portfolio weights
w = cp.Variable(n)

# portfolio return
ret = mu @ w
# portfolio variance (named `vol`)
vol = cp.quad_form(w, Sigma)

# per-asset summary table
z = pd.DataFrame([mu, asset_vols], columns=labels).assign(rows=['real return', 'vol'])
z.set_index('rows')

In [None]:
# Solve max return portfolio (corner solution)
long_only = [cp.sum(w) == 1,   # fully invested
             w >= 0]           # no shorts
prob = cp.Problem(cp.Maximize(ret), long_only)
prob.solve()

# return and variance at the max-return corner
maxret, maxretvol = ret.value, vol.value
wts = [float(f"{wt:0.4f}") for wt in w.value]
print("Max return portfolio weights")
pd.DataFrame([wts], columns=labels)


In [None]:
# solve min vol portfolio (other corner solution)
long_only = [cp.sum(w) == 1,   # fully invested
             w >= 0]           # no shorts
prob = cp.Problem(cp.Minimize(vol), long_only)
prob.solve()

# variance and return at the minimum-variance corner
minvol, minvolret = vol.value, ret.value
wts = [float(f"{wt:0.4f}") for wt in w.value]
print("Min vol portfolio weights")
pd.DataFrame([wts], columns=labels)


In [None]:
%%time
# solve points in between
# maximize return subject to volatility constraints between minimum volatility and max return volatility

# the variance cap is a cvxpy Parameter so the same Problem is re-solved at each point
# this allows the solver to reuse previous work
vol_limit = cp.Parameter(nonneg=True)
capped_long_only = [cp.sum(w) == 1,
                    w >= 0,
                    vol <= vol_limit]
prob = cp.Problem(cp.Maximize(ret), capped_long_only)


# kept as a function so the sweep can also be mapped over a process pool
def solve_vl(vl_val):
    """Solve under variance cap vl_val; return (return, std, weights)."""
    vol_limit.value = vl_val
    prob.solve()
    return (ret.value, np.sqrt(vol.value), w.value)


# number of points on the frontier
NPOINTS = 200
# constraint is in variance space: even grid in std, then squared
vl_vals = np.square(np.linspace(np.sqrt(minvol), np.sqrt(maxretvol), NPOINTS))

# iterate in-process
results_dict = {vl_val: solve_vl(vl_val) for vl_val in vl_vals}

# parallel implementation
# NPROCESSES = 8
# pool = Pool(processes = NPROCESSES)
# result_values = pool.map(solve_vl, vl_vals)
# results_dict = dict(zip(vl_vals, result_values))


In [None]:
# frontier table: one row per variance cap, in sweep order
frontier_caps = list(results_dict)
ret_df = pd.DataFrame({'i': range(len(frontier_caps)), 'vol': frontier_caps})
ret_df['return'] = [results_dict[cap][0] for cap in ret_df['vol']]
ret_df['std'] = [results_dict[cap][1] for cap in ret_df['vol']]
# one weight column per asset
for pos, colname in enumerate(labels):
    ret_df[colname] = [results_dict[cap][2][pos] for cap in ret_df['vol']]
# ret_df

In [None]:
# Plot the frontier for the current sample and keep the max-Sharpe and average-weight statistics.
max_sharpe_return, max_sharpe_std, avg_ret_mean, avg_ret_std = plot_efrontier(ret_df, df)


In [None]:
# Weight transition map for the current sample, with the max-Sharpe return marked.
transition_map(ret_df, labels=df.columns, startyear=df.index[0], endyear=df.index[-1], max_sharpe_return=max_sharpe_return, avg_ret_mean=avg_ret_mean)


## 1983 - present (era of globalization, post-big inflation)


In [None]:
# Restrict the sample to index labels from 1983 onward (label-based slice) and chart it.
df = real_data_df.loc[1983:]
df.plot.line();

In [None]:
# covariance of the asset returns for the current sample (one row per asset for np.cov)
Sigma = np.cov(df.T)
# number of assets
n = len(Sigma)
# average returns
mu = df.mean().to_numpy()
# asset STDs
asset_vols = np.sqrt(np.diag(Sigma))

# variable to optimize over - portfolio weights
w = cp.Variable(n)

# portfolio return
ret = mu @ w
# portfolio variance (named `vol`)
vol = cp.quad_form(w, Sigma)

# per-asset summary table
z = pd.DataFrame([mu, asset_vols], columns=labels).assign(rows=['real return', 'vol'])
z.set_index('rows')

In [None]:
# Solve max return portfolio (corner solution)
long_only = [cp.sum(w) == 1,   # fully invested
             w >= 0]           # no shorts
prob = cp.Problem(cp.Maximize(ret), long_only)
prob.solve()

# return and variance at the max-return corner
maxret, maxretvol = ret.value, vol.value
wts = [float(f"{wt:0.4f}") for wt in w.value]
print("Max return portfolio weights")
pd.DataFrame([wts], columns=labels)


In [None]:
# solve min vol portfolio (other corner solution)
long_only = [cp.sum(w) == 1,   # fully invested
             w >= 0]           # no shorts
prob = cp.Problem(cp.Minimize(vol), long_only)
prob.solve()

# variance and return at the minimum-variance corner
minvol, minvolret = vol.value, ret.value
wts = [float(f"{wt:0.4f}") for wt in w.value]
print("Min vol portfolio weights")
pd.DataFrame([wts], columns=labels)


In [None]:
%%time
# solve points in between
# maximize return subject to volatility constraints between minimum volatility and max return volatility

# the variance cap is a cvxpy Parameter so the same Problem is re-solved at each point
# this allows the solver to reuse previous work
vol_limit = cp.Parameter(nonneg=True)
capped_long_only = [cp.sum(w) == 1,
                    w >= 0,
                    vol <= vol_limit]
prob = cp.Problem(cp.Maximize(ret), capped_long_only)


# kept as a function so the sweep can also be mapped over a process pool
def solve_vl(vl_val):
    """Solve under variance cap vl_val; return (return, std, weights)."""
    vol_limit.value = vl_val
    prob.solve()
    return (ret.value, np.sqrt(vol.value), w.value)


# number of points on the frontier
NPOINTS = 200
# constraint is in variance space: even grid in std, then squared
vl_vals = np.square(np.linspace(np.sqrt(minvol), np.sqrt(maxretvol), NPOINTS))

# iterate in-process
results_dict = {vl_val: solve_vl(vl_val) for vl_val in vl_vals}

# parallel implementation
# NPROCESSES = 8
# pool = Pool(processes = NPROCESSES)
# result_values = pool.map(solve_vl, vl_vals)
# results_dict = dict(zip(vl_vals, result_values))


In [None]:
# frontier table: one row per variance cap, in sweep order
frontier_caps = list(results_dict)
ret_df = pd.DataFrame({'i': range(len(frontier_caps)), 'vol': frontier_caps})
ret_df['return'] = [results_dict[cap][0] for cap in ret_df['vol']]
ret_df['std'] = [results_dict[cap][1] for cap in ret_df['vol']]
# one weight column per asset
for pos, colname in enumerate(labels):
    ret_df[colname] = [results_dict[cap][2][pos] for cap in ret_df['vol']]
# ret_df


In [None]:
# Plot the frontier for the current sample and keep the max-Sharpe and average-weight statistics.
max_sharpe_return, max_sharpe_std, avg_ret_mean, avg_ret_std = plot_efrontier(ret_df, df)


In [None]:
# Weight transition map for the current sample, with the max-Sharpe return marked.
transition_map(ret_df, labels=df.columns, startyear=df.index[0], endyear=df.index[-1], max_sharpe_return=max_sharpe_return, avg_ret_mean=avg_ret_mean)


## Add a risk-free asset
- The efficient frontier above does not include a risk-free asset: when we inflation-adjust T-bill returns, we get volatility and fluctuation in returns, including periods of negative real returns.
- However TIPS are available which offer a guaranteed real pre-tax return. They are issued at a real rate, the principal gets adjusted for inflation, and if there is deflation you can't get back less than par. So when you buy TIPS you are guaranteed a positive real pre-tax return
- TIPS offer an inflation hedge and a safe real return, so they might dominate gold. There isn't a great theoretical argument gold should increase in value faster than inflation in the long run (gold bugs might disagree but in a fiat world, that's my story and I'm sticking to it). I could see reasonable arguments why gold should maintain its real value if supply is fixed, and there should be demand for gold when there is inflation and people lose faith in monetary authorities because it is currency-like and supply is relatively fixed, so gold offers an inflation hedge. 
- TIPS total returns are only available for approximately the last 25 years. You can model the TIPS yield as the yield on similar nominal Treasuries less inflation expectations. Hypothetically, there might be a sound way to model historical inflation expectations using recent inflation trends, gold, steepness of yield curve etc. And from there, model what TIPS total returns would theoretically have been based on Treasury total returns and changes in inflation expectations, but that is a challenge. We could also say that the best inflation hedge was gold up to 2000 and TIPS thereafter and use VIPSX OR TIP, but that is a kinky Franken-asset.
- We could also say that given the existence of TIPS, a risk-free 0 real yield asset is available. Worst case TIPS return is 0, if auction rate is 0. Or you could buy TIPS and donate any return over 0, and you are guaranteed return of principal plus inflation. You could argue that it wasn't available and if it had been then it would have modified other returns. If my aunt had wheels she'd be a bicycle.
- Let's posit that we are justified in adding a risk-free TIPS asset, with a constant zero return.
- In the real world you would get a positive real return on TIPS with some fluctuations, real TIPS should dominate the risk-free asset. So this model might underweight TIPS.


In [None]:
# full sample plus a synthetic risk-free asset: TIPS modeled as a constant 0 real return
# reorder  for chart
chart_order = ['S&P', 'Small Caps', 'T-Notes', 'Baa Corps', 'TIPS', 'Real Estate', 'Gold', 'T-Bills']
df = real_data_df.loc[1928:].assign(TIPS=0)[chart_order]
labels = df.columns
df.plot.line();

In [None]:
# covariance of the asset returns including the constant TIPS column
Sigma = np.cov(df.T)

# number of assets
n = len(Sigma)
# average returns
mu = df.mean().to_numpy()
# asset STDs
asset_vols = np.sqrt(np.diag(Sigma))

# variable to optimize over - portfolio weights
w = cp.Variable(n)

# portfolio return
ret = mu @ w
# portfolio variance (named `vol`)
vol = cp.quad_form(w, Sigma)

# per-asset summary table
z = pd.DataFrame([mu, asset_vols], columns=labels).assign(rows=['real return', 'vol'])
z.set_index('rows')

In [None]:
# Solve max return portfolio (corner solution)
long_only = [cp.sum(w) == 1,   # fully invested
             w >= 0]           # no shorts
prob = cp.Problem(cp.Maximize(ret), long_only)
prob.solve()

# return and variance at the max-return corner
maxret, maxretvol = ret.value, vol.value
wts = [float(f"{wt:0.4f}") for wt in w.value]
print("Max return portfolio weights")
pd.DataFrame([wts], columns=labels)


In [None]:
# solve min vol portfolio (other corner solution)
long_only = [cp.sum(w) == 1,   # fully invested
             w >= 0]           # no shorts
prob = cp.Problem(cp.Minimize(vol), long_only)
prob.solve()

# variance and return at the minimum-variance corner
minvol, minvolret = vol.value, ret.value
wts = [float(f"{wt:0.4f}") for wt in w.value]
print("Min vol portfolio weights")
pd.DataFrame([wts], columns=labels)



In [None]:
%%time
# solve points in between
# maximize return subject to volatility constraints between minimum volatility and max return volatility

# the variance cap is a cvxpy Parameter so the same Problem is re-solved at each point
# this allows the solver to reuse previous work
vol_limit = cp.Parameter(nonneg=True)
capped_long_only = [cp.sum(w) == 1,
                    w >= 0,
                    vol <= vol_limit]
prob = cp.Problem(cp.Maximize(ret), capped_long_only)


# kept as a function so the sweep can also be mapped over a process pool
def solve_vl(vl_val):
    """Solve under variance cap vl_val; return (return, std, weights)."""
    vol_limit.value = vl_val
    prob.solve()
    return (ret.value, np.sqrt(vol.value), w.value)


# number of points on the frontier
NPOINTS = 200
# constraint is in variance space: even grid in std, then squared
vl_vals = np.square(np.linspace(np.sqrt(minvol), np.sqrt(maxretvol), NPOINTS))

# iterate in-process
results_dict = {vl_val: solve_vl(vl_val) for vl_val in vl_vals}

# parallel implementation
# NPROCESSES = 8
# pool = Pool(processes = NPROCESSES)
# result_values = pool.map(solve_vl, vl_vals)
# results_dict = dict(zip(vl_vals, result_values))


In [None]:
# frontier table: one row per variance cap, in sweep order
frontier_caps = list(results_dict)
ret_df = pd.DataFrame({'i': range(len(frontier_caps)), 'vol': frontier_caps})
ret_df['return'] = [results_dict[cap][0] for cap in ret_df['vol']]
ret_df['std'] = [results_dict[cap][1] for cap in ret_df['vol']]
# one weight column per asset
for pos, colname in enumerate(labels):
    ret_df[colname] = [results_dict[cap][2][pos] for cap in ret_df['vol']]
# ret_df



In [None]:
# Plot the frontier for the TIPS-augmented universe and keep the summary statistics.
max_sharpe_return, max_sharpe_std, avg_ret_mean, avg_ret_std = plot_efrontier(ret_df, df)


In [None]:
# Weight transition map for the TIPS-augmented universe, with the max-Sharpe return marked.
transition_map(ret_df, labels=df.columns, startyear=df.index[0], endyear=df.index[-1], max_sharpe_return=max_sharpe_return, avg_ret_mean=avg_ret_mean)


In [None]:
# midwit regularization - take the mean of all optimal portfolios at any level of risk
asset_cols = ['S&P', 'Small Caps', 'T-Notes',
              'Baa Corps', 'TIPS', 'Real Estate', 'Gold', 'T-Bills']
regularized = ret_df[asset_cols].mean()
# show six decimals for this one display only
with pd.option_context('display.float_format', '{:.6f}'.format):
    display(regularized)


# Long/short optimization with leverage constraint

In [None]:
# Scratch, top-level version of the construction that the target_corr()
# function in the following cell wraps: build a series whose correlation with
# x1 is target_corr. Note that it draws from the global NumPy RNG.
x1 = real_data_df['S&P'].values
n = len(x1)
target_corr = 0.6
# angle whose cosine is the requested correlation
theta = np.arccos(target_corr)
mean_return = -0.05
# independent random draw, same length as x1
x2 = np.random.normal(0, x1.std(), size=n)  
# center so actual mean = 0
X = pd.DataFrame({'x1': x1 - x1.mean(),
                  'x2': x2 - x2.mean()
                 })
# identity matrix
Id = np.diag(np.ones(n))
# QR factorization 
Q, R = np.linalg.qr(X[['x1']])
# P projects onto the direction of x1; (Id - P) removes that component from x2
P = Q @ Q.T
x2o = (Id - P) @ X[['x2']]
Xc2 = pd.DataFrame({'x1': X['x1'], 'x2': x2o['x2']})
# divide by l2 norm
Y = Xc2 / np.sqrt(np.sum(np.square(Xc2), axis=0))
# orthogonal unit vectors combined with weight cot(theta) give correlation cos(theta) with x1
retval = Y['x2'] + (1/np.tan(theta)) * Y['x1'] + mean_return


In [None]:
# in order to include shorts, we need an asset with a poor expected return
# assume I can identify bad stocks, they are highly correlated with S&P but return negative 5%
# create a synthetic 'shorts' asset
# (the correlation and mean return are passed as arguments where the series is
# generated; no module-level `n` / `target_corr` values are needed here)

def target_corr(x1, target_corr, mean_return):
    """Return a random series whose sample correlation with x1 is target_corr.

    x1          -- 1-D array-like of observations.
    target_corr -- requested correlation, strictly between -1 and 1. (Inside
                   the function this parameter shadows the function's own name.)
    mean_return -- mean of the returned series.

    Returns a pandas Series of len(x1) values (indexed like x1 if x1 has an
    index). Draws len(x1) normals from the global NumPy RNG. The scale of the
    result is fixed by the unit-norm construction, not matched to x1.

    Raises ValueError if target_corr is not strictly between -1 and 1, where
    the construction divides by tan(arccos(target_corr)) and degenerates.
    """
    if not -1 < target_corr < 1:
        raise ValueError("target_corr must be strictly between -1 and 1")

    base = np.asarray(x1, dtype=float)
    n = len(base)
    theta = np.arccos(target_corr)

    noise = np.random.normal(0, base.std(), size=n)
    # center so actual mean = 0
    x1c = base - base.mean()
    x2c = noise - noise.mean()
    # remove the component of the noise along x1 (rank-1 projection; no need
    # to materialize n x n identity / projection matrices)
    x2o = x2c - x1c * (x1c @ x2c) / (x1c @ x1c)
    # divide by l2 norm
    y1 = x1c / np.linalg.norm(x1c)
    y2 = x2o / np.linalg.norm(x2o)
    # orthogonal unit vectors mixed with weight cot(theta) -> correlation cos(theta)
    values = y2 + y1 / np.tan(theta) + mean_return
    return pd.Series(values, index=getattr(x1, "index", None))

# synthetic short candidate: requested correlation 0.9 with the S&P, mean return -0.05
sp_returns = real_data_df['S&P']
shorts = target_corr(sp_returns.values, 0.9, -0.05)
print("mean return %.04f" % shorts.mean())
print("vol %.04f" % shorts.std())
# sanity check: the off-diagonal entry should match the requested correlation
np.corrcoef(shorts, sp_returns)



In [None]:
labels = ['S&P', 'Small Caps', 'Real Estate', 'T-Bills', 'T-Notes', 'Gold', 'Baa Corps', 'shorts']

df = real_data_df.copy()
df['shorts'] = shorts.to_list()
# Put the columns in `labels` order. Sigma, mu and the solver's weight vector
# follow df's column order, while the summary table and the ret_df weight
# columns are named positionally from `labels`; if the two orders differ,
# weights and statistics are attributed to the wrong assets.
df = df[labels]

df[['S&P', 'shorts']].plot.line();

In [None]:
# covariance of the asset returns including the synthetic shorts column
Sigma = np.cov(df.T)
# number of assets
n = len(Sigma)
# average returns
mu = df.mean().to_numpy()
# asset STDs
asset_vols = np.sqrt(np.diag(Sigma))

# variable to optimize over - portfolio weights
w = cp.Variable(n)

# portfolio return
ret = mu @ w
# portfolio variance (named `vol`)
vol = cp.quad_form(w, Sigma)

# per-asset summary table
z = pd.DataFrame([mu, asset_vols], columns=labels).assign(rows=['real return', 'vol'])
z.set_index('rows')

In [None]:
# Correlation heatmap of the current df, which now includes the synthetic 'shorts' column.
my_cmap = sns.diverging_palette(10, 220, sep=80, n=50)
sns.heatmap(df.corr(), annot=True, fmt=".02f", cmap=my_cmap);


In [None]:
# Solve max return portfolio (corner solution)
# update constraints for leverage scenario
# sum of weights == 1 as before, net long 100%
# remove w >= 0 constraint
# new constraint on gross exposure <= 1.5, otherwise optimal weights are unbounded (go infinity long S&P, infinity short stonks)

prob = cp.Problem(cp.Maximize(ret),
                  [cp.norm1(w) <= 1.5,  # gross exposure
                   cp.sum(w) == 1]      # net exposure
                 )
prob.solve()
wts = [float('%0.4f' % v) for v in w.value]
# `vol` is the quadratic form w' Sigma w, i.e. a variance; it is stored
# unchanged because the frontier sweep takes its square root itself
maxretvol = vol.value
maxret = ret.value
# report the standard deviation so the printed "vol" is in return units
print("Max return portfolio weights (return=%.4f, vol=%.4f)" % (maxret, np.sqrt(maxretvol)))
pd.DataFrame([wts], columns=labels)


In [None]:
# solve min vol portfolio (other corner solution)
prob = cp.Problem(cp.Minimize(vol),
                  [cp.norm1(w) <= 1.5,  # gross exposure
                   cp.sum(w) == 1]      # net exposure
                 )
prob.solve()
wts = [float('%0.4f' % v) for v in w.value]

# `vol` is the quadratic form w' Sigma w, i.e. a variance; it is stored
# unchanged because the frontier sweep takes its square root itself
minvol = vol.value
minvolret = ret.value
# report the standard deviation so the printed "vol" is in return units
print("Min vol portfolio weights (return=%.4f, vol=%.4f)" % (minvolret, np.sqrt(minvol)))
pd.DataFrame([wts], columns=labels)


In [None]:
%%time
# solve points in between
# maximize return subject to volatility constraints between minimum volatility and max return volatility

# the variance cap is a cvxpy Parameter so the same Problem is re-solved at each point
# this allows the solver to reuse previous work
vol_limit = cp.Parameter(nonneg=True)
capped_long_short = [cp.norm1(w) <= 1.5,   # gross exposure
                     cp.sum(w) == 1,       # net exposure
                     vol <= vol_limit]
prob = cp.Problem(cp.Maximize(ret), capped_long_short)


# kept as a function so the sweep can also be mapped over a process pool
def solve_vl(vl_val):
    """Solve under variance cap vl_val; return (return, std, weights)."""
    vol_limit.value = vl_val
    prob.solve()
    return (ret.value, np.sqrt(vol.value), w.value)


# number of points on the frontier
NPOINTS = 200
# constraint is in variance space: even grid in std, then squared
vl_vals = np.square(np.linspace(np.sqrt(minvol), np.sqrt(maxretvol), NPOINTS))

# iterate in-process
results_dict = {vl_val: solve_vl(vl_val) for vl_val in vl_vals}

# parallel implementation
# NPROCESSES = 8
# pool = Pool(processes = NPROCESSES)
# result_values = pool.map(solve_vl, vl_vals)
# results_dict = dict(zip(vl_vals, result_values))


In [None]:
# Build one row per frontier point from results_dict, whose keys are the limits passed
# to solve_vl and whose values are (return, std, weights) tuples.
ret_df = pd.DataFrame(enumerate(results_dict.keys()))
# NOTE: the 'vol' column holds the results_dict keys (the limits on the quadratic form),
# i.e. variances; the standard deviation is in the 'std' column below
ret_df.columns=['i', 'vol']
ret_df['return'] = [results_dict[v][0] for v in ret_df['vol']]
ret_df['std'] = [results_dict[v][1] for v in ret_df['vol']]
# one column per asset, holding that asset's weight at each frontier point
for i, colname in enumerate(labels):
    ret_df[colname]=[results_dict[v][2][i] for v in ret_df['vol']]
# ret_df


In [None]:
max_sharpe_return, max_sharpe_std, avg_ret_mean, avg_ret_std = plot_efrontier(ret_df, df)



In [None]:
transition_map(ret_df, labels=df.columns, startyear=df.index[0], endyear=df.index[-1], max_sharpe_return=max_sharpe_return, avg_ret_mean=avg_ret_mean, ylim=1.5)
# these are absolute values for gross exposure , not net exposure. 
# left looks weird because < 1.5 gross exposure but also additional Treasury shorts

In [None]:
# net exposure always 100%
ret_df[labels].sum(axis=1)


In [None]:
# gross exposure varies
ret_df[labels].abs().sum(axis=1)


In [None]:
# min risk portfolio actually shorts t-bills and t-notes 
ret_df.iloc[0]

# Alternative approaches

- If we thought returns were really IID log normally distributed and covariance stationary, then this optimization approach would generate an optimal forward-looking efficient frontier.
- But we can see that there are regime changes so covariance stationarity isn't a very good assumption, nor is log normality.
- Since we might be overfitting to a particular past regime, we might want to regularize, which is to say back off from the model's maximum in a systematic way towards a more diversified solution robust to regime changes, while still near optimal.
- A few regularization approaches with thanks to [Roman Rubsamen and PortfolioOptimizer.io](https://portfoliooptimizer.io):
    - [Near optimal portfolios](https://portfoliooptimizer.io/blog/mean-variance-optimization-in-practice-well-diversified-near-efficient-portfolios/) One approach is, first find the highest Sharpe portfolio. Then we can say, find the lowest risk portfolio with no more than e.g. a 0.05 drop in Sharpe ratio. Since this portfolio is more diversified, i.e. most diversified within 0.05 of maximum Sharpe, it should be more robust out-of-sample.
    - [Subset resampled portfolios](https://portfoliooptimizer.io/blog/mean-variance-optimization-in-practice-subset-resampling-based-efficient-portfolios/), Suppose we have 6 assets, do 6 optimizations, dropping one asset each time, then average all the portfolios. Similar to random forest, an ensemble of slightly weakened models performs better out of sample than a single overfitted model.
    - [Michaud resampling](https://docs.portfoliooptimizer.io/index.html#post-/portfolios/analysis/mean-variance/efficient-frontier/resampling-based) and [MCOS](https://github.com/enjine-com/mcos/tree/master). Do Monte Carlo simulations where we perturb the return forecasts and covariances randomly each time, and average all the resulting portfolios. 
    - [Hierarchical Risk Parity](https://papers.ssrn.com/sol3/papers.cfm?abstract_id=2708678) and related methods like Nested Cluster Optimization. Create a tree of assets clustered by similarity. Then starting at the bottom at each non-leaf create a risk parity portfolio of assets under the node, and recursively climb the tree to get a global portfolio. If you create a portfolio of t-bonds and notes and bills and corporates (NCO) and MBS and munis, we will get some of each, even if one is dominated under MV optimization. Same if you do US stocks and various international markets with different market caps and geographies. Then as you combine clusters, all asset classes are represented, whereas a global optimization might omit some assets. Vanilla HRP actually ignores returns and correlations, uses only variances, and creates a minimum risk portfolio assuming no covariance at each level.
    - Or the naive approach above, where we just average over the entire efficient frontier. One could do something more systematic and test how much regularization works best out of sample. One could average over e.g. all the efficient frontiers for 10- or 20-year periods or something.
- To sum up
    - If you do know your future returns and covariances, the efficient frontier and MV optimization give an optimal answer
    - And also in that case, if you can invest or borrow at the risk free rate (or close), you can do better with the Sharpe-optimal portfolio plus leverage or deleverage vs. moving left or right along the frontier due to its convexity properties
    - However, you probably don't have really good forecasts, and even really smart and sophisticated investors get burned by leverage. For instance before the financial crisis, Citibank and Harvard both decided they should be taking more risk and leverage and got burned.
    - Also there is the ['equity premium puzzle'](https://en.wikipedia.org/wiki/Equity_premium_puzzle). Historically taking equity risk has been well compensated. Conventional wisdom is that if you have a long term horizon and are able to weather swings in equity returns without selling low, then you should hold more equity than suggested by these Sharpe-optimal portfolios. See this [paper](https://www.nber.org/system/files/working_papers/w10483/w10483.pdf), do a web search for 'equity premium puzzle' or have a conversation with your favorite advanced AI about what various eminent professors of finance have said about it.
    - For these reasons you probably want to back off from MV optimal portfolios in the direction of more diversification and robustness to regime change, and if you prefer more risk and return than the Sharpe portfolio, you may want to move along the curve instead of using leverage.
    - If you can perturb forecasts in a way that models past regime changes well, Michaud resampling seems like a sound approach. I would need to do more work to understand that and parameterize it though.
    - A simple average of portfolios along the frontier is directionally similar and not massively suboptimal, and easier to explain and offers more diversification. It's near optimal without the 'near'. So it could be considered a 'not-too-far-from-optimal asset allocation for midwits', or a naive base case. It may be the simplest, dumbest asset allocation that might possibly work, as a starting point.


# Hierarchical risk parity

- Using the correlation matrix we can cluster assets into similar clusters and put equal risk into each cluster
  - Use agglomerative clustering to make a binary tree of assets
  - Start at individual assets, combine the 2 most correlated into a cluster
  - Continue iteratively combining the most correlated assets or clusters into bigger clusters, until you arrive at the root.
  - Now starting at the leaf nodes, allocate each asset to its parent cluster in inverse proportion to variance, and continue iteratively up the tree.
  - A heuristic way to create a low variance portfolio, but unlike the minimum-variance optimization it's not very sensitive to small changes in correlations and doesn't use a return forecast. Kind of a poor man's minimum variance portfolio.
    

In [None]:
# 1928-present
df = real_data_df.loc[1928:].copy()
labels = df.columns


In [None]:
# show returns and vols
# sample covariance of the asset columns (np.cov expects one variable per row)
Sigma = np.cov(df.T)
# number of assets
n = len(Sigma)
# average returns
mu = df.mean().to_numpy()
# asset STDs
asset_vols = np.sqrt(np.diag(Sigma))

# variable to optimize over - portfolio weights
w = cp.Variable(n)

# objectives to optimize
# portfolio return
ret = mu @ w
# portfolio variance w' Sigma w (named `vol`, but in variance units)
vol = cp.quad_form(w, Sigma)

# per-asset summary table; the last expression is displayed as the cell output
z = pd.DataFrame([mu, asset_vols], columns=labels).assign(rows=['real return', 'vol'])
z.set_index('rows')

In [None]:
# Building a Riskfolio portfolio object
port = rp.HCPortfolio(returns=df)

# Estimate optimal portfolio:
# the settings below are kept as named variables and passed through to port.optimization

model='HRP' 
codependence = 'pearson' # Correlation matrix used to group assets in clusters
rm = 'MV' # Risk measure used, this time will be variance
rf = 0 # Risk free rate
linkage_method = 'single' # Linkage method used to build clusters
max_k = 10 # Max number of clusters used in two difference gap statistic, only for HERC model
leaf_order = True # Consider optimal order of leaves in dendrogram

# NOTE: this rebinds `w` (previously the cvxpy weight variable) to the riskfolio result
w = port.optimization(model=model,
                      codependence=codependence,
                      rm=rm,
                      rf=rf,
                      linkage=linkage_method,
                      max_k=max_k,
                      leaf_order=leaf_order)
display(w.T)
# concentrated but should be less concentrated than minimum variance portfolio which is:
# 	S&P	Real Estate	T-Bills	T-Notes	Gold	Baa Corps
# 	0.001076	0.361574	0.599897	-0.0	0.037453	-0.0


In [None]:
# Nested Cluster Optimization
# re-slice from 1928 on and refresh the label list used by later cells
df = df.loc[1928:]
labels = df.columns

# Building the portfolio object
port = rp.HCPortfolio(returns=df)

# Estimate optimal portfolio:
# same settings as the HRP run, only the model name changes

model='NCO' 
codependence = 'pearson' # Correlation matrix used to group assets in clusters
rm = 'MV' # Risk measure used, this time will be variance
rf = 0 # Risk free rate
linkage_method = 'single' # Linkage method used to build clusters
# NOTE(review): the "only for HERC" remark was carried over from the HRP cell; confirm whether NCO also uses max_k
max_k = 10 # Max number of clusters used in two difference gap statistic, only for HERC model
leaf_order = True # Consider optimal order of leaves in dendrogram

w = port.optimization(model=model,
                      codependence=codependence,
                      rm=rm,
                      rf=rf,
                      linkage=linkage_method,
                      max_k=max_k,
                      leaf_order=leaf_order)
display(w.T)  # more concentrated than HRP

In [None]:
# compute HRP ourselves instead using riskfolio module

def cov_to_corr(cov):
    """Return the correlation matrix implied by a covariance matrix.

    Every entry ``cov[i, j]`` is divided by ``sd[i] * sd[j]``, where ``sd`` is
    the square root of the diagonal. Entries within 1e-9 of zero are set to
    exactly 0 and the result is clipped to [-1, 1] to absorb rounding noise.

    Raises ValueError if ``cov`` is not symmetric.
    """
    is_symmetric = np.allclose(cov, cov.T)
    if not is_symmetric:
        raise ValueError("Covariance matrix is not symmetric")

    # standard deviations from the diagonal, then the sd[i]*sd[j] scaling grid
    std_devs = np.sqrt(np.diag(cov))
    scale = np.outer(std_devs, std_devs)
    corr = cov / scale

    # snap numerically-tiny correlations to an exact zero
    near_zero = np.isclose(corr, 0, atol=1e-9)
    corr[near_zero] = 0

    # a correlation can never leave [-1, 1]
    return np.clip(corr, -1, 1)

def get_correlation_dist(corr):
    """Convert a correlation matrix to a correlation-distance matrix.

    The distance is ``sqrt((1 - corr) / 2)``, so a correlation of 1 maps to 0
    and a correlation of -1 maps to 1. The diagonal is forced to exactly 0.

    Parameters
    ----------
    corr : pandas.DataFrame or array-like
        Square correlation matrix.

    Returns
    -------
    pandas.DataFrame or numpy.ndarray
        Distance matrix; a DataFrame with the same labels when ``corr`` is a
        DataFrame, otherwise an ndarray.
    """
    values = np.asarray(corr, dtype=float)
    # clip BEFORE the square root: a correlation a hair above 1 would otherwise give NaN
    dist = np.sqrt(np.clip((1 - values) / 2, 0, None))
    # `dist` is a freshly computed array, so filling it in place cannot touch the input
    np.fill_diagonal(dist, 0)
    if isinstance(corr, pd.DataFrame):
        return pd.DataFrame(dist, index=corr.index, columns=corr.columns)
    return dist

def show_dendrogram(linkage, labels):
    """Draw and show a dendrogram for a hierarchical-clustering linkage.

    ``linkage`` is handed unchanged to ``dendrogram`` and ``labels`` supplies
    the leaf labels. Nothing is returned; the figure is displayed.
    """
    dendrogram_options = {
        'labels': labels,
        'orientation': 'top',
        'leaf_rotation': 90,
        'leaf_font_size': 10,
        'show_contracted': True,
    }

    plt.figure(figsize=(12, 6))
    dendrogram(linkage, **dendrogram_options)

    # titles and axis captions
    plt.title('Hierarchical Clustering of Asset Returns')
    plt.xlabel('Assets')
    plt.ylabel('Distance')

    plt.tight_layout()
    plt.show()

# Calculate covariance matrix
covariance_matrix = df.cov()
# use Ledoit-Wolf robust covariance when more than 30 or so assets (cols = 30% of rows)
# Ledoit-Wolf is a cross between full covariance matrix and the one that would be generated by a single-factor beta model
# lw = LedoitWolf()
# covariance_matrix = pd.DataFrame(lw.fit(df).covariance_, index=df.columns, columns=df.columns)

# covariance -> correlation -> correlation distance
correlation_matrix = cov_to_corr(covariance_matrix)
distance_matrix = get_correlation_dist(correlation_matrix)
# 'ward' method tends to create balanced clusters. original de Prado paper used single linkage
# riskfolio uses ward
# NOTE(review): the riskfolio cells in this notebook pass linkage_method = 'single' - confirm which is intended
# need to study this more, you want to balance max between-cluster distance and min within-cluster distance
# Ward's method: minimizes distance within clusters (need to understand how it avoids single element clusters with 0 distance)
# Single linkage (nearest point): minimum distance between any 2 members
# Complete linkage (furthest point): maximum distance between any members (ok but once you put farthest points in distinct clusters how do you determine the others)
# Average linkage: average distance between all members

# squareform turns the square distance matrix into the condensed vector form passed to linkage
link = linkage(squareform(distance_matrix), method='ward')
show_dendrogram(link, df.columns.tolist())

In [None]:
leaves_list(link)

In [None]:
df.columns[leaves_list(link)]

In [None]:
correlation_matrix

In [None]:
ordered_indices = leaves_list(link)
# reorder everything
correlation_matrix = correlation_matrix.iloc[ordered_indices, ordered_indices]
covariance_matrix = covariance_matrix.iloc[ordered_indices, ordered_indices]
distance_matrix = distance_matrix.iloc[ordered_indices, ordered_indices]
df = df.iloc[:, ordered_indices].copy()


In [None]:
correlation_matrix

In [None]:
my_cmap = sns.diverging_palette(10, 220, sep=80, n=50)
sns.heatmap(correlation_matrix, annot=True, fmt=".02f", cmap=my_cmap);
# can't help feeling corps should come before T-Notes, seems more correlated with S&P so WTF

In [None]:
# Example linkage array (4 original observations, numbered 0-3)
# [[1, 3, 0.3, 2],    # Merge observations 1 & 3, distance=0.3, size=2 - create observation 4
#  [0, 4, 0.4, 3],    # Merge observations 0 & 4 (from step 1), distance=0.4, size=3 - create observation 5
#  [2, 5, 0.6, 4]]    # Merge observations 2 & 5 (from step 2), distance=0.6, size=4

# create a linkage array based on bisection
# basically throw out the tree and just make one based on the sort order that falls out of the linkage array
# this is how it was done in the original paper but frankly, we already have a tree that contains good info 
# so I don't really understand why we wouldn't just use that.
# writing this code so we can go either way
# this will merge corps and T-notes, then stocks and gold, then merge both clusters.

def recursive_bisection(corr_matrix):
    """
    Perform recursive bisection on correlation matrix and return linkage-like structure
    Returns a list of rows with format similar to scipy.cluster.hierarchy.linkage:
    [[cluster1, cluster2, distance, size], ...]
    Really doesn't matter that the values are correlations, just uses the row/column order

    The positions 0..n-1 are halved repeatedly, breadth first. For each split
    the first id in the row refers to the second half and the second id to the
    first half; ``distance`` is 1 minus the average correlation between the
    two halves. Rows are returned bottom-up (root last), so row ``i``
    describes cluster ``n + i``.

    Parameters
    ----------
    corr_matrix : pandas.DataFrame
        Correlation matrix, already in the desired asset order.

    Returns
    -------
    list of list
        ``n - 1`` rows; an empty list for fewer than two assets.
    """
    # local import so the function does not depend on the notebook's import cell
    from collections import deque

    n_children = len(corr_matrix)
    # rows collected top-down (root first), using temporary ids for merged clusters
    bisection_links = []
    # merged (non-root) clusters get temporary ids -1, -2, ... in the order they are found
    last_index = 0

    def temp_id(cluster):
        """Return the leaf position for a singleton, else the next negative temporary id."""
        nonlocal last_index
        if len(cluster) == 1:
            return cluster[0]
        last_index -= 1
        return last_index

    # Start with all assets in one cluster
    queue = deque([list(range(n_children))])

    while queue:
        current_cluster = queue.popleft()
        if len(current_cluster) <= 1:
            continue

        # Split cluster: cluster1 is the second half, cluster2 the first half
        split = len(current_cluster) // 2
        cluster1, cluster2 = current_cluster[split:], current_cluster[:split]

        # Calculate distance (1 - avg correlation between clusters)
        avg_corr = corr_matrix.iloc[cluster1, cluster2].mean().mean()
        distance = 1 - avg_corr

        # ids must be drawn in this order (cluster1 first) to match the queue order below
        cluster1_index = temp_id(cluster1)
        cluster2_index = temp_id(cluster2)
        bisection_links.append([cluster1_index, cluster2_index, distance, len(current_cluster)])

        # Add new clusters to queue if they're larger than 1
        queue.extend(c for c in (cluster1, cluster2) if len(c) > 1)

    # temporary id -1 becomes the highest non-root cluster number, -2 the next lower, ...
    n_new_clusters = -last_index
    highest_cluster_number = n_children + n_new_clusters - 1

    def final_id(index):
        """Map a temporary negative id to its final cluster number; leaf ids pass through."""
        return highest_cluster_number + index + 1 if index < 0 else index

    # reverse so that children come before the cluster that merges them
    return [[final_id(a), final_id(b), distance, size]
            for a, b, distance, size in reversed(bisection_links)]

bisection_links = recursive_bisection(correlation_matrix)

bisection_links

In [None]:
# Create a 12x12 DataFrame of random numbers between -1 and 1
# random_df = pd.DataFrame(np.random.uniform(-1, 1, size=(12, 12)))

def recursive_bisection(corr_matrix):
    """
    Perform recursive bisection on ordered correlation matrix and return linkage-like structure.
    Returns array with format similar to scipy.cluster.hierarchy.linkage:
    [[cluster1, cluster2, distance, size], ...]

    The positions 0..n-1 are split in half (first half = left, second half =
    right), each half is processed recursively, and a row is appended once
    both halves are done, so children always precede their parent. Row ``i``
    describes cluster ``n + i``. ``distance`` is 1 minus the average
    correlation between the two halves.

    Parameters
    ----------
    corr_matrix : pandas.DataFrame
        Correlation matrix, already in the desired asset order.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(n - 1, 4)``; shape ``(0, 4)`` when there are
        fewer than two assets.
    """
    n = len(corr_matrix)
    bisection_links = []

    def process_cluster(indices):
        """Append the merge rows for ``indices`` (non-empty) and return its cluster id."""
        if len(indices) == 1:
            # a single asset is its own cluster; its id is its position
            return indices[0]

        # Split cluster into two parts based on current order
        mid = len(indices) // 2
        left_indices, right_indices = indices[:mid], indices[mid:]

        # Process sub-clusters first so their rows (and ids) come before this one
        left_id = process_cluster(left_indices)
        right_id = process_cluster(right_indices)

        # Calculate distance (1 - avg correlation between clusters)
        avg_corr = corr_matrix.iloc[left_indices, right_indices].mean().mean()
        bisection_links.append([left_id, right_id, 1 - avg_corr, len(indices)])

        # the row just appended sits at position len - 1, which is cluster n + position
        return n + len(bisection_links) - 1

    # with fewer than two assets there is nothing to merge
    if n >= 2:
        process_cluster(list(range(n)))

    # reshape keeps the 4-column layout even when there are no rows
    return np.array(bisection_links, dtype=float).reshape(-1, 4)

# Example usage:
bisection_links = recursive_bisection(correlation_matrix)  # use the already ordered correlation matrix
print("Bisection linkage structure:")
print(bisection_links)


In [None]:
def calculate_hrp_weights(link, cov_matrix):
    """
    Calculate HRP portfolio weights using linkage structure

    Every asset starts with weight 1. For each merge row the two child
    clusters split the allocation in inverse proportion to their variances
    (each measured with equal weights inside the cluster), and the weights of
    the assets in each child are scaled accordingly. The result is normalized
    to sum to 1.

    Parameters:
    -----------
    link : numpy.ndarray or sequence of rows
        Linkage matrix with shape (n-1, 4) where n is number of assets;
        columns are (cluster1, cluster2, distance, size). Row i creates
        cluster n + i. A list of lists is accepted.
    cov_matrix : pandas.DataFrame
        Covariance matrix of asset returns

    Returns:
    --------
    pandas.Series
        Portfolio weights indexed by asset names

    Raises:
    -------
    TypeError
        If cov_matrix is not a DataFrame.
    ValueError
        If cov_matrix is not symmetric, link does not have 4 columns, or a
        merged cluster's size disagrees with the size column.
    """
    # Input validation
    if not isinstance(cov_matrix, pd.DataFrame):
        raise TypeError("cov_matrix must be a pandas DataFrame")
    if not np.allclose(cov_matrix, cov_matrix.T):
        raise ValueError("cov_matrix must be symmetric")
    # accept list-of-lists linkages as well as arrays
    link = np.asarray(link, dtype=float)
    if link.size == 0:
        # no merges at all (e.g. a single asset): keep the 4-column layout
        link = link.reshape(0, 4)
    if link.ndim != 2 or link.shape[1] != 4:
        raise ValueError("link must have 4 columns (cluster1, cluster2, distance, size)")

    cov_values = cov_matrix.to_numpy(dtype=float)

    def cluster_variance(cluster_assets):
        """Calculate variance of a cluster using equal weights"""
        cluster_cov = cov_values[np.ix_(cluster_assets, cluster_assets)]
        w = np.full(len(cluster_assets), 1.0 / len(cluster_assets))
        return max(0.0, float(w @ cluster_cov @ w))  # numerical stability check, force non-neg

    n = len(cov_matrix)
    weights = np.ones(n)

    # Initialize clusters dictionary: at start, each asset is in its own cluster
    clusters = {i: [i] for i in range(n)}

    # Process each merge from the linkage
    for i, row in enumerate(link):
        cluster1_assets = clusters[int(row[0])]
        cluster2_assets = clusters[int(row[1])]

        var1 = cluster_variance(cluster1_assets)
        var2 = cluster_variance(cluster2_assets)

        # share given to cluster 1 shrinks as its variance grows;
        # when both variances are zero there is no basis to prefer either side
        total_var = var1 + var2
        alpha = 0.5 if total_var == 0 else 1 - var1 / total_var

        weights[cluster1_assets] *= alpha
        weights[cluster2_assets] *= 1 - alpha

        # Store the merged cluster under the id that later rows refer to
        merged = cluster1_assets + cluster2_assets
        clusters[n + i] = merged

        # Verify cluster size matches linkage info
        if len(merged) != int(row[3]):
            raise ValueError(f"Cluster size mismatch at step {i}")

    # Normalize weights
    weights = weights / np.sum(weights)

    return pd.Series(weights, index=cov_matrix.index)

calculate_hrp_weights(bisection_links, covariance_matrix)

# doesn't match riskfolio
# S&P	T-Notes	Gold	Baa Corps
# weights	0.110835	0.311695	0.167578	0.409892

# matches pretty close if leaf_order = False
# 	S&P	T-Notes	Gold	Baa Corps
# weights	0.182752	0.225961	0.222854	0.368433

# TODO : look up how riskfolio enforces leaf order

In [None]:
# redo this without bisecting, just use the tree created in the agglomerative clustering step.
# df is used as it stands at this point, so the matrices below follow its current column order

# Calculate covariance matrix
covariance_matrix = df.cov()
# use Ledoit-Wolf robust covariance when more than 30 or so assets (cols = 30% of rows)
# Ledoit-Wolf is a cross between full covariance matrix and the one that would be generated by a single-factor beta model
# lw = LedoitWolf()
# covariance_matrix = pd.DataFrame(lw.fit(df).covariance_, index=df.columns, columns=df.columns)

# covariance -> correlation -> correlation distance
correlation_matrix = cov_to_corr(covariance_matrix)
distance_matrix = get_correlation_dist(correlation_matrix)
# 'ward' method tends to create balanced clusters. original paper used single linkage
# riskfolio uses ward
# need to study this more, you want to balance max between-cluster distance and min within-cluster distance
# Ward's method: minimizes distance within clusters (need to understand how it avoids single element clusters with 0 distance)
# Single linkage (nearest point): minimum distance between any 2 members
# Complete linkage (furthest point): maximum distance between any members (ok but once you put farthest points in distinct clusters how do you determine the others)
# Average linkage: average distance between all members

# squareform turns the square distance matrix into the condensed vector form passed to linkage
link = linkage(squareform(distance_matrix), method='ward')
show_dendrogram(link, df.columns.tolist())


In [None]:
# weights from the agglomerative tree itself (scipy linkage) rather than the bisection tree
hrp_weights = calculate_hrp_weights(link, covariance_matrix)
hrp_weights

# more gold because merge is 2,3,4 instead of 2,2,4

In [None]:
def ret_table(df, wts):
    """Print and return mean, volatility and Sharpe ratio of a weighted portfolio.

    The portfolio return series is ``df[cols] @ wts`` where ``cols`` are the
    labels in ``wts.index``. The Sharpe ratio is mean divided by standard
    deviation (no risk-free rate is subtracted).

    Parameters
    ----------
    df : pandas.DataFrame
        Per-period asset returns, one column per asset.
    wts : pandas.Series
        Portfolio weights indexed by column names of ``df``.

    Returns
    -------
    tuple
        ``(mu, sd, sharpe)``.
    """
    cols = wts.index.tolist()
    returns = df[cols] @ wts
    mu = returns.mean()
    sd = returns.std()
    sharpe = mu / sd
    print(f"Mean return: {100*mu:3.2f}%")
    print(f"Vol:         {100*sd:3.2f}%")
    # the Sharpe ratio is a plain ratio, so it is printed without a percent sign
    print(f"Sharpe:      {sharpe:3.3f}")

    return mu, sd, sharpe

ret_table(df, hrp_weights)


# Max Sharpe using portfoliooptimizer.io

In [None]:
# load API key for portfoliooptimizer.io
dotenv.load_dotenv()


In [None]:
# 1928-present
df = real_data_df.loc[1928:].copy()
labels = df.columns
n_years, n_assets = df.shape


In [None]:
mu = df.mean().to_list()
covmatrix = df.cov().values


In [None]:
# API endpoint and headers
BASEURL = "https://api.portfoliooptimizer.io/v1"
ENDPOINT = "portfolio/optimization/maximum-sharpe-ratio"
url = f"{BASEURL}/{ENDPOINT}"
print(url)

# The key is read from the environment (loaded from .env by dotenv.load_dotenv()).
# A stray ")" inside the f-string used to be appended to the token; it is removed here.
# NOTE(review): confirm the auth scheme against the Portfolio Optimizer docs - the API
# may expect the key in an "X-API-Key" header rather than "Authorization: Bearer".
headers = {
    "Content-Type": "application/json",
    "Authorization": f'Bearer {os.getenv("PORTFOLIO_OPT_APIKEY")}'
}

# Payload: number of assets, mean returns and covariance matrix as plain lists
data = {
    "assets": n_assets,
    "assetsReturns": mu,
    "assetsCovarianceMatrix": covmatrix.tolist(),
    "riskFreeRate": 0
}

# Make the POST request
response = requests.post(url, json=data, headers=headers)

# Check and print the response
if response.status_code == 200:
    print("Response data:", response.json())
else:
    print("Error:", response.status_code, response.text)
   

In [None]:
# parse the JSON body with requests itself (no separate json import needed)
wts = response.json()['assetsWeights']
# one row per asset, in the same order as the columns sent in the request
wts_df = pd.DataFrame({'Asset': df.columns.to_list(), 'Weight': wts}).set_index("Asset")
wts_df

# pretty close to above, note that we slice into 200 variances so discretization will impact it a little 

# Max Sharpe Portfolio:
# Real Return: 3.63%
# SD:          6.16%
# T-Bills: 0.0%
# Real Estate: 39.3%
# T-Notes: 0.0%
# Baa Corps: 33.3%
# Gold: 11.9%
# S&P: 13.5%
# Small Caps: 2.0%

In [None]:
ret_table(df, wts_df["Weight"])


# Near optimal diversified

[Near optimal portfolios](https://portfoliooptimizer.io/blog/mean-variance-optimization-in-practice-well-diversified-near-efficient-portfolios/) 

First find the highest Sharpe portfolio. Then, find the lowest risk portfolio with no more than e.g. a 0.05 drop in Sharpe ratio. Since this portfolio is more diversified, i.e. most diversified within 0.05 of maximum Sharpe, it should be more robust out-of-sample.


In [None]:
# 1928-present
df = real_data_df.loc[1928:].copy()
labels = df.columns
n_years, n_assets = df.shape


In [None]:
mu = df.mean().to_list()
covmatrix = df.cov().values


In [None]:
# docs
# https://docs.portfoliooptimizer.io/index.html#post-/portfolio/optimization/maximum-sharpe-ratio/diversified

# API endpoint and headers
BASEURL = "https://api.portfoliooptimizer.io/v1"
ENDPOINT = "portfolio/optimization/maximum-sharpe-ratio/diversified"
url = f"{BASEURL}/{ENDPOINT}"
print(url)

# The key is read from the environment (loaded from .env by dotenv.load_dotenv()).
# A stray ")" inside the f-string used to be appended to the token; it is removed here.
# NOTE(review): confirm the auth scheme against the Portfolio Optimizer docs - the API
# may expect the key in an "X-API-Key" header rather than "Authorization: Bearer".
headers = {
    "Content-Type": "application/json",
    "Authorization": f'Bearer {os.getenv("PORTFOLIO_OPT_APIKEY")}'
}

# Payload
# I'm surprised there is no parameter for how 'near' is considered 'near optimal'

data = {
    "assets": n_assets,
    "assetsReturns": mu,
    "assetsCovarianceMatrix": covmatrix.tolist(),
    "riskFreeRate": 0
}

# Make the POST request
time.sleep(1) # can max out rate limit when run end to end

response = requests.post(url, json=data, headers=headers)

# Check and print the response
if response.status_code == 200:
    print("Response data:", response.json())
else:
    print("Error:", response.status_code, response.text)
    

In [None]:
# parse the JSON body with requests itself (no separate json import needed)
wts = response.json()['assetsWeights']
# one row per asset, in the same order as the columns sent in the request
wts_df = pd.DataFrame({'Asset': df.columns.to_list(), 'Weight': wts}).set_index("Asset")
wts_df

In [None]:
ret_table(df, wts_df["Weight"])


# Subset Resampling

[Subset resampled portfolios](https://portfoliooptimizer.io/blog/mean-variance-optimization-in-practice-subset-resampling-based-efficient-portfolios/), Suppose we have 6 assets, do 6 optimizations, dropping one asset each time, then average all the portfolios. Similar to random forest, an ensemble of slightly weakened models is better than a single overfitted model.
    

In [None]:
# 1928-present
df = real_data_df.loc[1928:].copy()
df["TIPS"] = 0
labels = df.columns
n_years, n_assets = df.shape


In [None]:
mu = df.mean().to_list()
covmatrix = df.cov().values


In [None]:
# API endpoint and headers
BASEURL = "https://api.portfoliooptimizer.io/v1"
ENDPOINT = "portfolio/optimization/maximum-sharpe-ratio/subset-resampling-based"
url = f"{BASEURL}/{ENDPOINT}"
print(url)

# The key is read from the environment (loaded from .env by dotenv.load_dotenv()).
# A stray ")" inside the f-string used to be appended to the token; it is removed here.
# NOTE(review): confirm the auth scheme against the Portfolio Optimizer docs - the API
# may expect the key in an "X-API-Key" header rather than "Authorization: Bearer".
headers = {
    "Content-Type": "application/json",
    "Authorization": f'Bearer {os.getenv("PORTFOLIO_OPT_APIKEY")}'
}

# Payload: number of assets, mean returns and covariance matrix as plain lists
data = {
    "assets": n_assets,
    "assetsReturns": mu,
    "assetsCovarianceMatrix": covmatrix.tolist(),
    "riskFreeRate": 0
}

# Make the POST request
time.sleep(1) # can max out rate limit when run end to end
response = requests.post(url, json=data, headers=headers)

# Check and print the response
if response.status_code == 200:
    print("Response data:", response.json())
else:
    print("Error:", response.status_code, response.text)
    

In [None]:
# parse the JSON body with requests itself (no separate json import needed)
wts = response.json()['assetsWeights']
# one row per asset, in the same order as the columns sent in the request
wts_df = pd.DataFrame({'Asset': df.columns.to_list(), 'Weight': wts}).set_index("Asset")
wts_df


In [None]:
ret_table(df, wts_df["Weight"])


# Michaud Resampling


[Michaud resampling](https://docs.portfoliooptimizer.io/index.html#post-/portfolios/analysis/mean-variance/efficient-frontier/resampling-based) and [MCOS](https://github.com/enjine-com/mcos/tree/master). Do Monte Carlo simulations where we perturb the return forecasts and covariances randomly each time, and average all the resulting portfolios. 


In [None]:
# 1928-present
df = real_data_df.loc[1928:].copy()
df["TIPS"] = 0
labels = df.columns
n_years, n_assets = df.shape


In [None]:
mu = df.mean().to_list()
covmatrix = df.cov().values


In [None]:
# make intervals for mus , maybe should be a multiple of sds, not sure
# half-width of each interval: one third of the mean return, but never less than 0.01
mu_adjust = [max(0.01, m / 3) for m in mu]
# (lower, upper) pair per asset, centred on its mean return
mu_intervals = [(m - adj, m + adj) for m, adj in zip(mu, mu_adjust)]
mu_intervals



In [None]:
# API endpoint and headers
BASEURL = "https://api.portfoliooptimizer.io/v1"
# NOTE(review): the other request cells in this notebook use the singular "portfolio/..."
# prefix - confirm that the plural "portfolios/" is the intended path for this endpoint
ENDPOINT = "portfolios/optimization/maximum-sharpe-ratio/resampling-based"
url = f"{BASEURL}/{ENDPOINT}"
print(url)

# The key is read from the environment (loaded from .env by dotenv.load_dotenv()).
# A stray ")" inside the f-string used to be appended to the token; it is removed here.
# NOTE(review): confirm the auth scheme against the Portfolio Optimizer docs - the API
# may expect the key in an "X-API-Key" header rather than "Authorization: Bearer".
headers = {
    "Content-Type": "application/json",
    "Authorization": f'Bearer {os.getenv("PORTFOLIO_OPT_APIKEY")}'
}

# Payload: as for the other requests, plus a (lower, upper) uncertainty interval per asset return
data = {
    "assets": n_assets,
    "assetsReturns": mu,
    "assetsReturnsUncertaintyIntervals": mu_intervals,
    "assetsCovarianceMatrix": covmatrix.tolist(),
    "riskFreeRate": 0,
#     "assetsCorrelationMatrixUncertaintyLevel": 0.25,
#     "portfolios": 100
}

# Make the POST request
time.sleep(1) # can max out rate limit when run end to end

response = requests.post(url, json=data, headers=headers)

# Check and print the response
if response.status_code == 200:
    print("Response data:", response.json())
else:
    print("Error:", response.status_code, response.text)
    

In [None]:
# parse the JSON body with requests itself (no separate json import needed)
wts = response.json()['assetsWeights']
# one row per asset, in the same order as the columns sent in the request
wts_df = pd.DataFrame({'Asset': df.columns.to_list(), 'Weight': wts}).set_index("Asset")
wts_df

In [None]:
ret_table(df, wts_df["Weight"])


# Factors

Optimal portfolio after decomposing returns into risk factors with PCA/SVD.


In [None]:
# we do this experiment with a synthetic portfolio
# NOTE: the random draws below consume the numpy global RNG in order, so reordering
# these statements changes the generated data
# number of stocks
n = 1000 
# historical mean returns for each stock: normal draws with mean 0.1 and sd 0.2
mu = np.random.normal(0.1, 0.2, n)

# number of factors
m = 10

# factor covariance matrix - random matrix A, then A.T @ A, which is symmetric
SigmaFactor = np.random.randn(m, m)/4
SigmaFactor = SigmaFactor.T @ SigmaFactor

# factor loadings, determine volatility and covariances between stocks
F = np.random.randn(n, m)
# idiosyncratic risk of each stock: diagonal matrix of uniform(0, 0.9) draws
D = np.diag(np.random.uniform(0, 0.9, size=n))

# histogram of the simulated mean returns (100 bins)
count, bins, ignored = plt.hist(mu, 100, density=True, align='mid')


In [None]:
w = cp.Variable(n)         # what we solve for: weight for each stock
ret = mu.T @ w             # portfolio return expression (the objective to maximize)
f = F.T @ w                # portfolio factor loading
Lmax = cp.Parameter()      # leverage constraint
# portfolio risk: factor risk + idiosyncratic risk (both quadratic forms, i.e. variance-like)
risk = cp.quad_form(f, SigmaFactor) + cp.quad_form(w, D)
# weights sum to 1; sum of absolute weights is capped by Lmax
prob = cp.Problem(cp.Maximize(ret), 
                  [cp.sum(w) == 1, 
                   cp.norm(w, 1) <= Lmax])

# Solve the factor model problem.
Lmax.value = 2
prob.solve(verbose=True)

# NOTE: the "vol" printed below is risk.value, the quadratic-form value, not its square root
maxretvol = risk.value
maxret = ret.value
print("Max return portfolio (return=%.4f, vol=%.4f)" % (maxret, maxretvol))


In [None]:
# solve min vol portfolio (other corner solution)
# same constraints as the max return problem, but minimizing `risk`

prob = cp.Problem(cp.Minimize(risk),
                  [cp.sum(w) == 1, 
                   cp.norm(w, 1) <= Lmax])
# this problem is solved with the OSQP solver explicitly
prob.solve(solver=cp.OSQP)

# minvol is risk.value, the quadratic-form value, not its square root
minvol = risk.value
minvolret = ret.value
print("Min vol portfolio (return=%.4f, risk=%.4f)" % (minvolret, minvol))


In [None]:
%%time
# solve points in between
# maximize return subject to volatility constraints between minimum volatility and max return volatility
# might have to run a couple of times to get a solution

# specify a Parameter variable instead of creating new Problem at each iteration
# this allows the solver to reuse previous work
vol_limit = cp.Parameter(nonneg=True)

prob = cp.Problem(cp.Maximize(ret),
                  [cp.sum(w) == 1, 
                   cp.norm(w, 1) <= Lmax,
                   risk <= vol_limit]
                 )

# define function so we can solve many in parallel
def solve_vl(vl_val):
    """Solve the factor-model max-return problem for one risk cap.

    Assigns ``vl_val`` to the shared ``vol_limit`` parameter, re-solves the
    notebook-level ``prob`` and reads the solution off the shared cvxpy
    expressions ``ret``, ``risk`` and ``w``.

    Parameters
    ----------
    vl_val : float
        Upper bound applied to ``risk`` (a sum of quadratic forms).

    Returns
    -------
    tuple
        ``(ret.value, np.sqrt(risk.value), w.value)``: the portfolio return,
        the square root of the achieved risk, and the weight vector.
    """
    vol_limit.value = vl_val
    # the value returned by solve() is not needed; results are read from the expressions
    prob.solve(verbose=False)
    return (ret.value, np.sqrt(risk.value), w.value)

# number of points on the frontier
NPOINTS = 200
# vol constraint is in variance space: take the square root of minvol and
# maxretvol, space the grid evenly between them, then square the values
vl_vals = np.square(np.linspace(np.sqrt(minvol), np.sqrt(maxretvol), NPOINTS))

# iterate in-process: map each risk cap to its (return, std, weights) tuple
results_dict = {vl_val: solve_vl(vl_val) for vl_val in vl_vals}

# parallel implementation
# NPROCESSES = 8
# pool = Pool(processes = NPROCESSES)
# result_values = pool.map(solve_vl, vl_vals)
# results_dict = dict(zip(vl_vals, result_values))


In [None]:
# Tabulate the sweep: one row per risk cap, with its position in the sweep
# ('i'), the cap itself ('var') and the first two entries of the result
# tuple stored for that cap ('return' and 'std').
ret_df = pd.DataFrame(list(enumerate(results_dict)), columns=['i', 'var'])
frontier_points = [results_dict[v] for v in ret_df['var']]
ret_df['return'] = [point[0] for point in frontier_points]
ret_df['std'] = [point[1] for point in frontier_points]
# ret_df


In [None]:
# plot frontier: standard deviation on the x axis, return on the y axis
x, y = ret_df['std'], ret_df['return']

plt.figure(figsize=(8, 4.5))
#plt.scatter(asset_vols, mu)
plt.title("Risk vs. Return")
plt.xlabel("Standard Deviation of Returns")
plt.ylabel("Return")

# plot the data (trailing semicolon keeps the notebook from echoing the
# return value of plt.plot)
plt.plot(x, y);


### Or compute factors with historical data

In [None]:
# login to openbb with email and password
# (both are read from the OPENBB_USER / OPENBB_PW environment variables;
# a missing variable raises KeyError)
obb.account.login(
    email=os.environ['OPENBB_USER'],
    password=os.environ['OPENBB_PW'],
    remember_me=True,
)

# probably a way to get S&P components from OpenBB but this didn't give tickers, have to map lei to ticker, might need a paid provider sub
# response = obb.etf.holdings(symbol='VOO', provider='sec')
# response.results[0]


In [None]:
# S&P component tickers, hard-coded here (list originally taken from Wikipedia)
ticker_list = ['A',
 'AAPL',
 'ABBV',
 'ABNB',
 'ABT',
 'ACGL',
 'ACN',
 'ADBE',
 'ADI',
 'ADM',
 'ADP',
 'ADSK',
 'AEE',
 'AEP',
 'AES',
 'AFL',
 'AIG',
 'AIZ',
 'AJG',
 'AKAM',
 'ALB',
 'ALGN',
 'ALL',
 'ALLE',
 'AMAT',
 'AMCR',
 'AMD',
 'AME',
 'AMGN',
 'AMP',
 'AMT',
 'AMZN',
 'ANET',
 'ANSS',
 'AON',
 'AOS',
 'APA',
 'APD',
 'APH',
 'APO',
 'APTV',
 'ARE',
 'ATO',
 'AVB',
 'AVGO',
 'AVY',
 'AWK',
 'AXON',
 'AXP',
 'AZO',
 'BA',
 'BAC',
 'BALL',
 'BAX',
 'BBY',
 'BDX',
 'BEN',
 'BF-B',
 'BG',
 'BIIB',
 'BK',
 'BKNG',
 'BKR',
 'BLDR',
 'BLK',
 'BMY',
 'BR',
 'BRK-B',
 'BRO',
 'BSX',
 'BWA',
 'BX',
 'BXP',
 'C',
 'CAG',
 'CAH',
 'CARR',
 'CAT',
 'CB',
 'CBOE',
 'CBRE',
 'CCI',
 'CCL',
 'CDNS',
 'CDW',
 'CE',
 'CEG',
 'CF',
 'CFG',
 'CHD',
 'CHRW',
 'CHTR',
 'CI',
 'CINF',
 'CL',
 'CLX',
 'CMCSA',
 'CME',
 'CMG',
 'CMI',
 'CMS',
 'CNC',
 'CNP',
 'COF',
 'COO',
 'COP',
 'COR',
 'COST',
 'CPAY',
 'CPB',
 'CPRT',
 'CPT',
 'CRL',
 'CRM',
 'CRWD',
 'CSCO',
 'CSGP',
 'CSX',
 'CTAS',
 'CTRA',
 'CTSH',
 'CTVA',
 'CVS',
 'CVX',
 'CZR',
 'D',
 'DAL',
 'DAY',
 'DD',
 'DE',
 'DECK',
 'DELL',
 'DFS',
 'DG',
 'DGX',
 'DHI',
 'DHR',
 'DIS',
 'DLR',
 'DLTR',
 'DOC',
 'DOV',
 'DOW',
 'DPZ',
 'DRI',
 'DTE',
 'DUK',
 'DVA',
 'DVN',
 'DXCM',
 'EA',
 'EBAY',
 'ECL',
 'ED',
 'EFX',
 'EG',
 'EIX',
 'EL',
 'ELV',
 'EMN',
 'EMR',
 'ENPH',
 'EOG',
 'EPAM',
 'EQIX',
 'EQR',
 'EQT',
 'ERIE',
 'ES',
 'ESS',
 'ETN',
 'ETR',
 'EVRG',
 'EW',
 'EXC',
 'EXPD',
 'EXPE',
 'EXR',
 'F',
 'FANG',
 'FAST',
 'FCX',
 'FDS',
 'FDX',
 'FE',
 'FFIV',
 'FI',
 'FICO',
 'FIS',
 'FITB',
 'FMC',
 'FOX',
 'FOXA',
 'FRT',
 'FSLR',
 'FTNT',
 'FTV',
 'GD',
 'GDDY',
 'GE',
 'GEHC',
 'GEN',
 'GEV',
 'GILD',
 'GIS',
 'GL',
 'GLW',
 'GM',
 'GNRC',
 'GOOG',
 'GOOGL',
 'GPC',
 'GPN',
 'GRMN',
 'GS',
 'GWW',
 'HAL',
 'HAS',
 'HBAN',
 'HCA',
 'HD',
 'HES',
 'HIG',
 'HII',
 'HLT',
 'HOLX',
 'HON',
 'HPE',
 'HPQ',
 'HRL',
 'HSIC',
 'HST',
 'HSY',
 'HUBB',
 'HUM',
 'HWM',
 'IBM',
 'ICE',
 'IDXX',
 'IEX',
 'IFF',
 'INCY',
 'INTC',
 'INTU',
 'INVH',
 'IP',
 'IPG',
 'IQV',
 'IR',
 'IRM',
 'ISRG',
 'IT',
 'ITW',
 'IVZ',
 'J',
 'JBHT',
 'JBL',
 'JCI',
 'JKHY',
 'JNJ',
 'JNPR',
 'JPM',
 'K',
 'KDP',
 'KEY',
 'KEYS',
 'KHC',
 'KIM',
 'KKR',
 'KLAC',
 'KMB',
 'KMI',
 'KMX',
 'KO',
 'KR',
 'KVUE',
 'L',
 'LDOS',
 'LEN',
 'LH',
 'LHX',
 'LII',
 'LIN',
 'LKQ',
 'LLY',
 'LMT',
 'LNT',
 'LOW',
 'LRCX',
 'LULU',
 'LUV',
 'LVS',
 'LW',
 'LYB',
 'LYV',
 'MA',
 'MAA',
 'MAR',
 'MAS',
 'MCD',
 'MCHP',
 'MCK',
 'MCO',
 'MDLZ',
 'MDT',
 'MET',
 'META',
 'MGM',
 'MHK',
 'MKC',
 'MKTX',
 'MLM',
 'MMC',
 'MMM',
 'MNST',
 'MO',
 'MOH',
 'MOS',
 'MPC',
 'MPWR',
 'MRK',
 'MRNA',
 'MS',
 'MSCI',
 'MSFT',
 'MSI',
 'MTB',
 'MTCH',
 'MTD',
 'MU',
 'NCLH',
 'NDAQ',
 'NDSN',
 'NEE',
 'NEM',
 'NFLX',
 'NI',
 'NKE',
 'NOC',
 'NOW',
 'NRG',
 'NSC',
 'NTAP',
 'NTRS',
 'NUE',
 'NVDA',
 'NVR',
 'NWS',
 'NWSA',
 'NXPI',
 'O',
 'ODFL',
 'OKE',
 'OMC',
 'ON',
 'ORCL',
 'ORLY',
 'OTIS',
 'OXY',
 'PANW',
 'PARA',
 'PAYC',
 'PAYX',
 'PCAR',
 'PCG',
 'PEG',
 'PEP',
 'PFE',
 'PFG',
 'PG',
 'PGR',
 'PH',
 'PHM',
 'PKG',
 'PLD',
 'PLTR',
 'PM',
 'PNC',
 'PNR',
 'PNW',
 'PODD',
 'POOL',
 'PPG',
 'PPL',
 'PRU',
 'PSA',
 'PSX',
 'PTC',
 'PWR',
 'PYPL',
 'QCOM',
 'RCL',
 'REG',
 'REGN',
 'RF',
 'RJF',
 'RL',
 'RMD',
 'ROK',
 'ROL',
 'ROP',
 'ROST',
 'RSG',
 'RTX',
 'RVTY',
 'SBAC',
 'SBUX',
 'SCHW',
 'SHW',
 'SJM',
 'SLB',
 'SMCI',
 'SNA',
 'SNPS',
 'SO',
 'SOLV',
 'SPG',
 'SPGI',
 'SRE',
 'STE',
 'STLD',
 'STT',
 'STX',
 'STZ',
 'SW',
 'SWK',
 'SWKS',
 'SYF',
 'SYK',
 'SYY',
 'T',
 'TAP',
 'TDG',
 'TDY',
 'TECH',
 'TEL',
 'TER',
 'TFC',
 'TFX',
 'TGT',
 'TJX',
 'TMO',
 'TMUS',
 'TPL',
 'TPR',
 'TRGP',
 'TRMB',
 'TROW',
 'TRV',
 'TSCO',
 'TSLA',
 'TSN',
 'TT',
 'TTWO',
 'TXN',
 'TXT',
 'TYL',
 'UAL',
 'UBER',
 'UDR',
 'UHS',
 'ULTA',
 'UNH',
 'UNP',
 'UPS',
 'URI',
 'USB',
 'V',
 'VICI',
 'VLO',
 'VLTO',
 'VMC',
 'VRSK',
 'VRSN',
 'VRTX',
 'VST',
 'VTR',
 'VTRS',
 'VZ',
 'WAB',
 'WAT',
 'WBA',
 'WBD',
 'WDAY',
 'WDC',
 'WEC',
 'WELL',
 'WFC',
 'WM',
 'WMB',
 'WMT',
 'WRB',
 'WST',
 'WTW',
 'WY',
 'WYNN',
 'XEL',
 'XOM',
 'XYL',
 'YUM',
 'ZBH',
 'ZBRA',
 'ZTS']

In [None]:
# download historical returns from Yahoo Finance
def years_ago(when, years):
    """Return midnight on the calendar date `years` years before `when`.

    Args:
        when: a datetime (only its year, month and day are used).
        years: number of whole years to go back.

    Returns:
        datetime at 00:00 on the same month/day in year `when.year - years`.
        Feb 29 maps to Feb 28 when the target year is not a leap year.

    Raises:
        ValueError: if the target year is outside the range datetime supports.
    """
    target_year = when.year - years
    try:
        return datetime(year=target_year, month=when.month, day=when.day)
    except ValueError:
        # the only month/day that is valid in one year and invalid in
        # another is Feb 29; anything else is a genuine error
        if (when.month, when.day) != (2, 29):
            raise
        return datetime(year=target_year, month=2, day=28)


# request window: the ten years up to today
todays_date = datetime.today()
start_date = years_ago(todays_date, 10)
start_date_str = start_date.strftime("%Y-%m-%d")
# single-symbol request (the full ticker list is downloaded in the next cell)
symbol = "BF-B"
df = (
    obb.equity.price.historical(
        symbol=symbol,
        start_date=start_date,
        provider="yfinance",
        adjustment='splits_and_dividends',
    )
    .to_df()
)
df.head()

In [None]:
# download all tickers in one request and convert the response to a DataFrame
df = (
    obb.equity.price.historical(
        symbol=ticker_list,
        start_date=start_date,
        provider="yfinance",
        adjustment='splits_and_dividends',
    )
    .to_df()
)
# too big to display
# df 


In [None]:
# keep only the symbol and close columns; copy so later edits do not touch
# the downloaded frame
df = df.loc[:, ["symbol", "close"]].copy()
df.head()


In [None]:
# fix index: move it into an ordinary column, then parse the "date" column
# as datetimes
df = df.reset_index().assign(date=lambda frame: pd.to_datetime(frame["date"]))
df.head()


In [None]:
# long -> wide: one row per date, one column per symbol, cells hold the close
df_pivot = df.pivot(
    values="close",
    index="date",
    columns=["symbol"],
)
df_pivot.head()


In [None]:
# save the wide close-price table to df_pivot.pkl (relative path) as a pickle
df_pivot.to_pickle("df_pivot.pkl")


In [None]:
# (number of dates, number of symbols) in the wide price table
df_pivot.shape


In [None]:
# change from daily prices to monthly returns
dfx = (
    df_pivot
    .resample('ME')                 # month-end buckets
    .last()                         # last price observed in each month
    # fill_method=None: do not forward-fill gaps before differencing. The
    # implicit default ('pad') is deprecated in pandas and would turn a
    # missing month into a fabricated 0% return instead of a NaN.
    .pct_change(fill_method=None)
    .dropna(axis=0, how='all')      # drop rows where every symbol is NaN
    .dropna(axis=1)                 # drop columns with missing data
)




In [None]:
# (number of monthly rows, number of symbols kept after dropping columns with gaps)
dfx.shape


In [None]:
from sklearn.decomposition import PCA

# PCA with default settings (no n_components given), fitted on the monthly
# returns; pca_result holds the transformed data returned by fit_transform
pca = PCA()
pca_result = pca.fit_transform(dfx)


In [None]:
# chart eigenvalues: share of total variance explained by each component
explained_variance_ratio = pca.explained_variance_ratio_

# running total of explained variance against the number of components kept
component_counts = range(1, len(explained_variance_ratio) + 1)
cumulative_ratio = np.cumsum(explained_variance_ratio)

# Create elbow plot
plt.figure(figsize=(10, 6))
plt.plot(component_counts, cumulative_ratio, 'bo-')
plt.title('Elbow Plot of PCA Components')
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance Ratio')
plt.grid(True)
plt.show()


In [None]:
# scree plot with individual variance explained: one point per principal
# component, numbered from 1
component_index = range(1, len(explained_variance_ratio) + 1)

plt.figure(figsize=(10, 6))
plt.plot(component_index, explained_variance_ratio, 'bo-')
plt.title('Scree Plot of PCA Components')
plt.xlabel('Principal Component')
plt.ylabel('Explained Variance Ratio')
plt.grid(True)
plt.show()


In [None]:
# Round trip through the PCA: project the returns onto the components, then
# map the projections back to the original columns.
components = pca.fit_transform(dfx)
reconstructed = pca.inverse_transform(components)

# Calculate reconstruction error: element-wise absolute gap between the
# original returns and the round-tripped values, then its mean
difference = (dfx - reconstructed).abs()
reconstruction_error = np.mean(difference)

print("Reconstruction Error (Mean Absolute Difference):", reconstruction_error, sep="\n")


In [None]:
# Same reconstruction done by hand: scores times the component matrix, plus
# the per-column mean stored on the fitted PCA.
reconstructed_data = components @ pca.components_ + pca.mean_
reconstructed_df = pd.DataFrame(reconstructed_data, columns=dfx.columns)

# Calculate reconstruction error
# (dfx's index is reset so both frames align on the same positional index)
reconstruction_error = np.mean((dfx.reset_index(drop=True) - reconstructed_df).abs())
print("Reconstruction Error (Mean Absolute Difference):", reconstruction_error)


In [None]:
# shape of the array returned by pca.fit_transform(dfx)
# NOTE(review): the original note here read "reconstruct but with 20 columns";
# this cell only inspects the shape, no reduced reconstruction is performed
components.shape


In [None]:
# shape of the fitted component matrix
pca.components_.shape


In [None]:
# could use e.g. 20 factors that explain ~ 70% of variation
# Loadings table: one row per symbol, one column per principal component
# (PC1, PC2, ...); displayed transposed so components run down the rows.
pc_labels = [f'PC{i+1}' for i in range(pca.n_components_)]
loadings = pd.DataFrame(pca.components_.T, index=dfx.columns, columns=pc_labels)
loadings.T

In [None]:
# rest left as an exercise for the reader for now, may revisit later
# compute efficient frontier of portfolios of top 20 factors
# compute max sharpe portfolio 
# back out individual stock weights
# compute backward looking performance, vol, sharpe