In [1]:
import numpy as np
import numpy.linalg as LA
import pandas as pd
import yfinance as yf

import cvxpy as cp

In [2]:
# Load the GICS sector information from the CSV file
df = pd.read_csv("sp500_gics_sectors.csv", index_col=0)
information_technology_stocks = list(df[df["GICS Sector"] == "Information Technology"].index)
healthcare_stocks = list(df[df["GICS Sector"] == "Health Care"].index)
financials_stocks = list(df[df["GICS Sector"] == "Financials"].index)
consumer_discretionary_stocks = list(df[df["GICS Sector"] == "Consumer Discretionary"].index)
industrials_stocks = list(df[df["GICS Sector"] == "Industrials"].index)

print("Number of information technology stocks:", len(information_technology_stocks))
print("Number of healthcare stocks:", len(healthcare_stocks))
print("Number of financials stocks:", len(financials_stocks))
print("Number of consumer discretionary stocks:", len(consumer_discretionary_stocks))
print("Number of industrials stocks:", len(industrials_stocks))

Number of information technology stocks: 68
Number of healthcare stocks: 60
Number of financials stocks: 75
Number of consumer discretionary stocks: 50
Number of industrials stocks: 79


In [None]:
# download return data
tickers = information_technology_stocks + healthcare_stocks + financials_stocks + \
          consumer_discretionary_stocks + industrials_stocks
data = yf.download(tickers, start="2020-01-01", end="2025-01-01")
data = data.dropna(axis=1, how='any')
returns = data['Close'].pct_change().dropna()


  data = yf.download(tickers, start="2020-01-01", end="2025-01-01")
[*********************100%***********************]  332 of 332 completed

1 Failed download:
['BRK.B']: YFTzMissingError('possibly delisted; no timezone found')


In [None]:
# remove sector indices of stocks with no data
information_technology_stocks = [stock for stock in information_technology_stocks if
                                 stock in returns.columns]
healthcare_stocks = [stock for stock in healthcare_stocks if stock in returns.columns]
financials_stocks = [stock for stock in financials_stocks if stock in returns.columns]
consumer_discretionary_stocks = [stock for stock in consumer_discretionary_stocks if 
                                 stock in returns.columns]
industrials_stocks = [stock for stock in industrials_stocks if stock in returns.columns]

print("Number of information technology stocks:", len(information_technology_stocks))
print("Number of healthcare stocks:", len(healthcare_stocks))
print("Number of financials stocks:", len(financials_stocks))
print("Number of consumer discretionary stocks:", len(consumer_discretionary_stocks))
print("Number of industrials stocks:", len(industrials_stocks))

# choose the stocks in the sectors 
returns = returns[information_technology_stocks + healthcare_stocks + financials_stocks + 
                   consumer_discretionary_stocks + industrials_stocks]

idxs_info_tech = [i for i, stock in enumerate(returns.columns) if stock in 
                  information_technology_stocks]
idxs_healthcare = [i for i, stock in enumerate(returns.columns) if stock in healthcare_stocks]
idxs_financials = [i for i, stock in enumerate(returns.columns) if stock in financials_stocks]
idxs_consumer_discretionary = [i for i, stock in enumerate(returns.columns) if stock in 
                               consumer_discretionary_stocks]
idxs_industrials = [i for i, stock in enumerate(returns.columns) if stock in industrials_stocks]

groups = [idxs_info_tech, idxs_healthcare, idxs_financials, idxs_consumer_discretionary, 
          idxs_industrials]

Number of information technology stocks: 66
Number of healthcare stocks: 58
Number of financials stocks: 72
Number of consumer discretionary stocks: 48
Number of industrials stocks: 75


In [5]:
# compute the covariance matrix and regularize it
Sigma = returns.cov().values
n = Sigma.shape[0]
alpha = 0.8
Sigma = alpha * Sigma + (1-alpha) * np.trace(Sigma) / n * np.eye(n)
evals = np.linalg.eigvalsh(Sigma)
print("smallest and largest eigenvalues of Sigma:", min(evals), max(evals))

smallest and largest eigenvalues of Sigma: 0.00012248218116280238 0.06136787550647771


In [6]:
# specify risk contributions
b = np.ones(len(groups)) / len(groups)

In [7]:
n

319

In [7]:
term1 = 0
term2 = 0
term3 = 0

w = cp.Variable((n, ), nonneg=True, name='w')
t = cp.Variable((n, ), name='t')
constraints = [cp.sum(w) == 1, t == Sigma @ w]

w.value = np.random.rand(n)
w.value = w.value / np.sum(w.value)
#w.value = np.ones(n) / n

for k, g in enumerate(groups):
    term1 += cp.square(cp.sum(cp.multiply(w[g], t[g]))) /  cp.quad_form(w, Sigma)
    term2 += (LA.norm(b[k]) ** 2) * cp.quad_form(w, Sigma)
    term3 += - 2 * b[k] * cp.sum(cp.multiply(w[g], t[g]))
obj = cp.Minimize(term1 + term2 + term3)
problem = cp.Problem(obj, constraints)
problem.solve(solver=cp.IPOPT, nlp=True, verbose=True, derivative_test='none')

(CVXPY) Oct 03 04:04:30 PM: Your problem has 638 variables, 320 constraints, and 0 parameters.
(CVXPY) Oct 03 04:04:30 PM: It is compliant with the following grammars: 
(CVXPY) Oct 03 04:04:30 PM: (If you need to solve this problem multiple times, but with different data, consider using parameters.)
(CVXPY) Oct 03 04:04:30 PM: CVXPY will first compile your problem; then, it will invoke a numerical solver to obtain a solution.
(CVXPY) Oct 03 04:04:30 PM: Your problem is compiled with the CPP canonicalization backend.


                                     CVXPY                                     
                             v1.7.0.dev0+0.5870139                             

******************************************************************************
This program contains Ipopt, a library for large-scale nonlinear optimization.
 Ipopt is released as open source code under the Eclipse Public License (EPL).
         For more information visit http://projects.coin-or.org/Ipopt
******************************************************************************

This is Ipopt version 3.11.9, running with linear solver mumps.
NOTE: Other linear solvers might be more efficient (see Ipopt documentation).

Number of nonzeros in equality constraint Jacobian...:   107209
Number of nonzeros in inequality constraint Jacobian.:        0
Number of nonzeros in Lagrangian Hessian.............:    51688

Total number of variables............................:     1929
                     variables with only lower bound

np.float64(8.602175232980819e-15)

In [8]:
risk_contributions = w.value * (Sigma @ w.value) 
risk_contributions /= np.sum(risk_contributions)
risk_contributions = np.array([np.sum(risk_contributions[g]) for g in groups])

In [5]:
import pandas as pd
import yfinance as yf

# Step 1: Download the list of S&P 500 companies and their GICS sectors from Wikipedia
url = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"
tables = pd.read_html(url)
df = tables[0]

# Step 2: Filter the dataframe to include only the 'Symbol' and 'GICS Sector' columns
#df = df[['Symbol', 'GICS Sector']]

# Step 3: Download historical adjusted closing prices for the top 400 stocks from 2020-2025
#tickers = df['Symbol'].head(400).tolist()
#data = yf.download(tickers, start='2020-01-01', end='2025-12-31')['Adj Close']

# Step 4: Remove any stocks with missing data
#data_clean = data.dropna(axis=1, how='any')

# Step 5: Merge the cleaned data with the GICS sector information
#sector_info = df.set_index('Symbol').loc[data_clean.columns]
#sector_info = sector_info.rename(columns={'GICS Sector': 'Sector'})

# Step 6: Save the GICS sector information to a CSV file
#sector_info.to_csv('sp500_gics_sectors.csv')

# Display the first few rows of the cleaned data and sector information
#print(data_clean.head())
#print(sector_info.head())


HTTPError: HTTP Error 403: Forbidden

In [4]:
url

'https://en.wikipedia.org/wiki/List_of_S%26P_500_companies'