In [1]:
import numpy as np
import numpy.linalg as LA
import pandas as pd
import yfinance as yf

import cvxpy as cp

In [2]:
# Read the GICS sector table (first CSV column becomes the index) and collect,
# for each of the five sectors used in this notebook, the index labels of the
# rows assigned to that sector.
df = pd.read_csv("data/sp500_gics_sectors.csv", index_col=0)


def _tickers_in_sector(sector_name):
    """Return the index labels of `df` whose "GICS Sector" equals `sector_name`."""
    in_sector = df["GICS Sector"] == sector_name
    return list(df[in_sector].index)


information_technology_stocks = _tickers_in_sector("Information Technology")
healthcare_stocks = _tickers_in_sector("Health Care")
financials_stocks = _tickers_in_sector("Financials")
consumer_discretionary_stocks = _tickers_in_sector("Consumer Discretionary")
industrials_stocks = _tickers_in_sector("Industrials")

# Report how many tickers each sector contains.
for label, members in (
    ("Number of information technology stocks:", information_technology_stocks),
    ("Number of healthcare stocks:", healthcare_stocks),
    ("Number of financials stocks:", financials_stocks),
    ("Number of consumer discretionary stocks:", consumer_discretionary_stocks),
    ("Number of industrials stocks:", industrials_stocks),
):
    print(label, len(members))

Number of information technology stocks: 68
Number of healthcare stocks: 60
Number of financials stocks: 75
Number of consumer discretionary stocks: 50
Number of industrials stocks: 79


In [3]:
# Historical prices: by default read the cached CSV; set DOWNLOAD_DATA = True
# to refresh the cache from Yahoo Finance instead.
DOWNLOAD_DATA = False
if not DOWNLOAD_DATA:
    data = pd.read_csv("data/sp500_prices.csv", index_col=0, parse_dates=True)
else:
    tickers = (
        information_technology_stocks
        + healthcare_stocks
        + financials_stocks
        + consumer_discretionary_stocks
        + industrials_stocks
    )
    # only the close prices are kept and written to the cache
    data = yf.download(tickers, start="2020-01-01", end="2025-01-01")["Close"]
    data.to_csv("data/sp500_prices.csv")

# Drop every column that has at least one missing price, then turn prices into
# row-over-row percentage changes (the leading NaN row is removed by dropna).
data = data.dropna(axis="columns", how="any")
returns = data.pct_change().dropna()



In [4]:
# Restrict every sector list to tickers that are still columns of `returns`
# (columns with missing prices were dropped earlier).
def _with_price_data(tickers):
    """Return, in the original order, the tickers that are columns of `returns`."""
    return [ticker for ticker in tickers if ticker in returns.columns]


information_technology_stocks = _with_price_data(information_technology_stocks)
healthcare_stocks = _with_price_data(healthcare_stocks)
financials_stocks = _with_price_data(financials_stocks)
consumer_discretionary_stocks = _with_price_data(consumer_discretionary_stocks)
industrials_stocks = _with_price_data(industrials_stocks)

for label, members in (
    ("Number of information technology stocks:", information_technology_stocks),
    ("Number of healthcare stocks:", healthcare_stocks),
    ("Number of financials stocks:", financials_stocks),
    ("Number of consumer discretionary stocks:", consumer_discretionary_stocks),
    ("Number of industrials stocks:", industrials_stocks),
):
    print(label, len(members))

# Keep only the columns of the five sectors, ordered sector by sector.
returns = returns[
    information_technology_stocks
    + healthcare_stocks
    + financials_stocks
    + consumer_discretionary_stocks
    + industrials_stocks
]


def _column_positions(members):
    """Return the integer positions of the columns of `returns` listed in `members`."""
    return [pos for pos, ticker in enumerate(returns.columns) if ticker in members]


idxs_info_tech = _column_positions(information_technology_stocks)
idxs_healthcare = _column_positions(healthcare_stocks)
idxs_financials = _column_positions(financials_stocks)
idxs_consumer_discretionary = _column_positions(consumer_discretionary_stocks)
idxs_industrials = _column_positions(industrials_stocks)

# One list of column positions per sector.
groups = [
    idxs_info_tech,
    idxs_healthcare,
    idxs_financials,
    idxs_consumer_discretionary,
    idxs_industrials,
]

Number of information technology stocks: 66
Number of healthcare stocks: 58
Number of financials stocks: 72
Number of consumer discretionary stocks: 48
Number of industrials stocks: 75


In [5]:
# Sample covariance of the returns, shrunk toward a scaled identity:
#   Sigma <- alpha * Sigma + (1 - alpha) * (trace(Sigma) / n) * I
Sigma = returns.cov().values
n = Sigma.shape[0]
alpha = 0.8
ridge = (1 - alpha) * np.trace(Sigma) / n  # weight placed on the identity
Sigma = alpha * Sigma + ridge * np.eye(n)

# Eigenvalue range of the regularized matrix.
evals = np.linalg.eigvalsh(Sigma)
print("smallest and largest eigenvalues of Sigma:", min(evals), max(evals))

smallest and largest eigenvalues of Sigma: 0.0001224821758387878 0.06136787272333727


In [6]:
# specify risk contributions: b[k] is the target for the k-th entry of `groups`
# (info tech, health care, financials, consumer discretionary, industrials, in
# that order); the formulations below compare each group's share of total
# portfolio risk against b[k]. The five targets sum to 1.
b = np.array([0.3, 0.25, 0.20, 0.15, 0.10])

In [None]:
# formulation 1
# With v = w' Sigma w and R_k = sum_{i in group k} w_i * (Sigma w)_i, minimize
#   sum_k [ R_k^2 / v  +  b_k^2 * v  -  2 * b_k * R_k ]
# which is the expansion of sum_k ( R_k / sqrt(v) - b_k * sqrt(v) )^2.
w = cp.Variable((n, ), nonneg=True, name='w')
t = cp.Variable((n, ), name='t')  # tied to Sigma @ w by the equality constraint
constraints = [cp.sum(w) == 1, t == Sigma @ w]
w.value = np.ones(n) / n  # equal-weight initial value for w

# The portfolio variance does not depend on the group, so build it once and
# reuse it instead of constructing a new quad_form in every iteration.
portfolio_variance = cp.quad_form(w, Sigma)

term1, term2, term3 = 0, 0, 0
for k, g in enumerate(groups):
    group_risk = cp.sum(cp.multiply(w[g], t[g]))
    term1 += cp.square(group_risk) / portfolio_variance
    term2 += (b[k] ** 2) * portfolio_variance  # b[k] is a scalar; no norm needed
    term3 += - 2 * b[k] * group_risk
obj = cp.Minimize(term1 + term2 + term3)
problem = cp.Problem(obj, constraints)
problem.solve(solver=cp.IPOPT, nlp=True, verbose=True, derivative_test='none')

# Per-asset risk contributions at the solution, normalized to sum to one and
# then aggregated per group.
risk_contributions_form1 = w.value * (Sigma @ w.value) 
risk_contributions_form1 /= np.sum(risk_contributions_form1)
risk_contributions_form1 = np.array([np.sum(risk_contributions_form1[g]) for g in groups])

# Report the result in the same way as the other formulations.
print("Target risk contributions:", b)
print("Risk contributions from formulation 1:", risk_contributions_form1)

(CVXPY) Oct 29 02:53:39 PM: Your problem has 638 variables, 320 constraints, and 0 parameters.
(CVXPY) Oct 29 02:53:39 PM: It is compliant with the following grammars: 
(CVXPY) Oct 29 02:53:39 PM: (If you need to solve this problem multiple times, but with different data, consider using parameters.)
(CVXPY) Oct 29 02:53:39 PM: CVXPY will first compile your problem; then, it will invoke a numerical solver to obtain a solution.
(CVXPY) Oct 29 02:53:39 PM: Your problem is compiled with the CPP canonicalization backend.


                                     CVXPY                                     
                             v1.7.0.dev0+0.5870139                             
This is Ipopt version 3.11.9, running with linear solver mumps.
NOTE: Other linear solvers might be more efficient (see Ipopt documentation).

Number of nonzeros in equality constraint Jacobian...:   107209
Number of nonzeros in inequality constraint Jacobian.:        0
Number of nonzeros in Lagrangian Hessian.............:    51688

Total number of variables............................:     1929
                     variables with only lower bounds:      329
                variables with lower and upper bounds:        0
                     variables with only upper bounds:        0
Total number of equality constraints.................:     1611
Total number of inequality constraints...............:        0
        inequality constraints with only lower bounds:        0
   inequality constraints with lower and upper bounds:  

np.float64(2.5397429114209363e-14)

In [19]:
# formulation 2
# Minimize sum_k ( R_k / (w' Sigma w) - b_k )^2, where the auxiliary variable
# t is constrained to equal Sigma @ w and R_k = sum_{i in group k} w_i * t_i.
w = cp.Variable((n, ), nonneg=True, name='w')
t = cp.Variable((n, ), name='t')
constraints = [cp.sum(w) == 1, t == Sigma @ w]
w.value = np.ones(n) / n  # equal-weight initial value for w

# One squared deviation per group; their sum is the objective.
penalties = []
for k, g in enumerate(groups):
    group_risk = cp.sum(cp.multiply(w[g], t[g]))
    risk_share = group_risk / cp.quad_form(w, Sigma)
    penalties.append(cp.square(risk_share - b[k]))
obj = sum(penalties)

problem = cp.Problem(cp.Minimize(obj), constraints)
problem.solve(solver=cp.IPOPT, nlp=True, verbose=True, derivative_test='none')

# Per-asset risk contributions at the solution, normalized to sum to one and
# then aggregated per group.
per_asset_risk = w.value * (Sigma @ w.value)
per_asset_risk = per_asset_risk / np.sum(per_asset_risk)
risk_contributions_form2 = np.array([np.sum(per_asset_risk[g]) for g in groups])

print("Target risk contributions:", b)
print("Risk contributions from formulation 2:", risk_contributions_form2)

(CVXPY) Oct 29 03:22:32 PM: Your problem has 638 variables, 320 constraints, and 0 parameters.
(CVXPY) Oct 29 03:22:32 PM: It is compliant with the following grammars: 
(CVXPY) Oct 29 03:22:32 PM: (If you need to solve this problem multiple times, but with different data, consider using parameters.)
(CVXPY) Oct 29 03:22:32 PM: CVXPY will first compile your problem; then, it will invoke a numerical solver to obtain a solution.
(CVXPY) Oct 29 03:22:32 PM: Your problem is compiled with the CPP canonicalization backend.


                                     CVXPY                                     
                             v1.7.0.dev0+0.5870139                             
This is Ipopt version 3.11.9, running with linear solver mumps.
NOTE: Other linear solvers might be more efficient (see Ipopt documentation).

Number of nonzeros in equality constraint Jacobian...:   105933
Number of nonzeros in inequality constraint Jacobian.:        0
Number of nonzeros in Lagrangian Hessian.............:    51369

Total number of variables............................:     1291
                     variables with only lower bounds:      324
                variables with lower and upper bounds:        0
                     variables with only upper bounds:        0
Total number of equality constraints.................:      973
Total number of inequality constraints...............:        0
        inequality constraints with only lower bounds:        0
   inequality constraints with lower and upper bounds:  

In [None]:
# formulation 3
# Same objective as formulation 2, but without the auxiliary variable:
# Sigma @ w is written directly inside the objective.
w = cp.Variable((n, ), nonneg=True, name='w')
constraints = [cp.sum(w) == 1]
w.value = np.ones(n) / n  # equal-weight initial value for w

# One squared deviation per group; their sum is the objective.
penalties = []
for k, g in enumerate(groups):
    group_risk = cp.sum(cp.multiply(w[g], (Sigma @ w)[g]))
    risk_share = group_risk / cp.quad_form(w, Sigma)
    penalties.append(cp.square(risk_share - b[k]))
obj = sum(penalties)

problem = cp.Problem(cp.Minimize(obj), constraints)
problem.solve(solver=cp.IPOPT, nlp=True, verbose=True, derivative_test='none')

# Per-asset risk contributions at the solution, normalized to sum to one and
# then aggregated per group.
per_asset_risk = w.value * (Sigma @ w.value)
per_asset_risk = per_asset_risk / np.sum(per_asset_risk)
risk_contributions_form3 = np.array([np.sum(per_asset_risk[g]) for g in groups])

print("Target risk contributions:", b)
print("Risk contributions from formulation 3:", risk_contributions_form3)

(CVXPY) Oct 29 03:31:20 PM: Your problem has 319 variables, 1 constraints, and 0 parameters.
(CVXPY) Oct 29 03:31:20 PM: It is compliant with the following grammars: 
(CVXPY) Oct 29 03:31:20 PM: (If you need to solve this problem multiple times, but with different data, consider using parameters.)
(CVXPY) Oct 29 03:31:20 PM: CVXPY will first compile your problem; then, it will invoke a numerical solver to obtain a solution.
(CVXPY) Oct 29 03:31:20 PM: Your problem is compiled with the CPP canonicalization backend.


                                     CVXPY                                     
                             v1.7.0.dev0+0.5870139                             
This is Ipopt version 3.11.9, running with linear solver mumps.
NOTE: Other linear solvers might be more efficient (see Ipopt documentation).

Number of nonzeros in equality constraint Jacobian...:   105295
Number of nonzeros in inequality constraint Jacobian.:        0
Number of nonzeros in Lagrangian Hessian.............:    51369

Total number of variables............................:      972
                     variables with only lower bounds:      324
                variables with lower and upper bounds:        0
                     variables with only upper bounds:        0
Total number of equality constraints.................:      654
Total number of inequality constraints...............:        0
        inequality constraints with only lower bounds:        0
   inequality constraints with lower and upper bounds:  

In [25]:
# function for statistical factor model via PCA on correlation matrix
def PCFA_via_corr(sigma, k):
    """Fit a k-factor model ``sigma ~ F @ F.T + diag(d)`` via the correlation matrix.

    The covariance is rescaled to a correlation matrix, the k leading
    eigenpairs of that matrix give the factor loadings, and the diagonal of
    the remainder gives the residual variances. Both are then scaled back to
    covariance units.

    Parameters
    ----------
    sigma : (n, n) array
        Symmetric covariance matrix; its diagonal is used as the variances.
    k : int
        Number of leading eigenpairs (factors) to keep.

    Returns
    -------
    F : (n, k) array
        Factor loadings in covariance units.
    d : (n,) array
        Diagonal residual variances, so that ``diag(F @ F.T) + d`` equals
        ``diag(sigma)`` up to rounding.
    """
    std = np.sqrt(np.diag(sigma)).reshape(-1, 1)  # column of volatilities
    inv_std = 1 / std
    corr = inv_std * sigma * inv_std.T

    # eigh returns eigenvalues in ascending order; reverse to take the k largest
    eigvals, eigvecs = LA.eigh(corr)
    top_vals = eigvals[::-1][0:k]
    top_vecs = eigvecs[:, ::-1][:, 0:k]

    # rank-k part of the correlation matrix and what is left on its diagonal
    loadings = top_vecs @ np.diag(np.sqrt(top_vals))
    residual = np.diag(corr - loadings @ loadings.T)

    # undo the correlation scaling
    F = std * loadings
    d = (np.squeeze(std) ** 2) * residual

    return F, d

In [32]:
# with factor model: approximate Sigma by F @ F.T + diag(d) using 10 factors
F, d = PCFA_via_corr(Sigma, k=10)

# formulation 4: formulation 3 written with the factor-model covariance
w = cp.Variable((n, ), nonneg=True, name='w')
y = cp.Variable((F.shape[1], ), name='y')  # tied to F.T @ w by the equality constraint
constraints = [cp.sum(w) == 1, F.T @ w == y]
w.value = np.ones(n) / n  # equal-weight initial value for w

# (F F^T + diag(d)) w  =  F y + d * w
marginal_risk = F @ y + cp.multiply(d, w)
# w' (F F^T + diag(d)) w  =  ||y||^2 + sum_i d_i * w_i^2.
# The diagonal term has to be part of the denominator: with ||y||^2 alone the
# group shares would sum to more than one and could never match b exactly.
portfolio_variance = cp.sum_squares(y) + cp.sum(cp.multiply(d, cp.square(w)))

obj = 0
for k, g in enumerate(groups):
    group_risk = cp.sum(cp.multiply(w[g], marginal_risk[g]))
    obj += cp.square(group_risk / portfolio_variance - b[k])

problem = cp.Problem(cp.Minimize(obj), constraints)
problem.solve(solver=cp.IPOPT, nlp=True, verbose=True, derivative_test='none')

# Risk contributions are evaluated under the factor-model covariance, i.e. the
# same model the objective was built from (not the original Sigma).
risk_contributions_form4 = w.value * ((F @ F.T + np.diag(d)) @ w.value) 
risk_contributions_form4 /= np.sum(risk_contributions_form4)
risk_contributions_form4 = np.array([np.sum(risk_contributions_form4[g]) for g in groups])

print("Target risk contributions:", b)
print("Risk contributions from formulation 4:", risk_contributions_form4)


(CVXPY) Oct 29 03:41:52 PM: Your problem has 329 variables, 11 constraints, and 0 parameters.
(CVXPY) Oct 29 03:41:52 PM: It is compliant with the following grammars: 
(CVXPY) Oct 29 03:41:52 PM: (If you need to solve this problem multiple times, but with different data, consider using parameters.)
(CVXPY) Oct 29 03:41:52 PM: CVXPY will first compile your problem; then, it will invoke a numerical solver to obtain a solution.
(CVXPY) Oct 29 03:41:52 PM: Your problem is compiled with the CPP canonicalization backend.


                                     CVXPY                                     
                             v1.7.0.dev0+0.5870139                             
This is Ipopt version 3.11.9, running with linear solver mumps.
NOTE: Other linear solvers might be more efficient (see Ipopt documentation).

Number of nonzeros in equality constraint Jacobian...:     8698
Number of nonzeros in inequality constraint Jacobian.:        0
Number of nonzeros in Lagrangian Hessian.............:      339

Total number of variables............................:      982
                     variables with only lower bounds:      324
                variables with lower and upper bounds:        0
                     variables with only upper bounds:        0
Total number of equality constraints.................:      664
Total number of inequality constraints...............:        0
        inequality constraints with only lower bounds:        0
   inequality constraints with lower and upper bounds:  