In [4]:
import numpy as np
import pandas as pd
import sqlite3
import matplotlib.pyplot as plt
from scipy.spatial.distance import squareform
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster
from scipy.optimize import minimize
import warnings
warnings.filterwarnings('ignore')

In [5]:
# SQLite file holding the tables read by the following cells; the path is
# relative, so it is resolved against the kernel's working directory.
db_name = 'sp500_data.db'

# NOTE: despite the name, `engine` is a plain sqlite3 connection object.
# It is closed again once the last table has been read.
engine = sqlite3.connect(database=db_name)

In [7]:
# Pull the whole `returns` table into a DataFrame, using the Date column
# (parsed to datetimes) as the index.
returns_query = "SELECT * FROM returns"
returns = pd.read_sql(returns_query, engine, index_col='Date', parse_dates=['Date'])

# Quick sanity report: table dimensions and the dates of the first/last rows.
# NOTE(review): the "date range" is simply the first and last row as returned
# by the query, which presumes the table comes back in date order - confirm.
print(f"Returns loaded: {returns.shape}")
print(f"Date range: {returns.index[0].date()} to {returns.index[-1].date()}")


Returns loaded: (391, 503)
Date range: 2024-03-28 to 2025-10-17


In [8]:
# Pull the `sp500_returns` table, again indexed by the parsed Date column.
# NOTE(review): the recorded output shows 751 rows here versus 391 rows in
# `returns`, so the two frames presumably cover different date spans - align
# them on the index before comparing the two.
sp500_query = "SELECT * FROM sp500_returns"
sp500_returns = pd.read_sql(sp500_query, engine, index_col='Date', parse_dates=['Date'])
print(f"S&P 500 returns loaded: {sp500_returns.shape}\n")

# Last read from the database: release the connection. Any cell that reads
# through `engine` needs the connection cell to be run again after this.
engine.close()

S&P 500 returns loaded: (751, 1)



In [9]:
# Pairwise Pearson correlation between the columns of `returns`
# (Pearson is pandas' default method; it is only spelled out here).
correlation_matrix = returns.corr(method='pearson')

# Convert similarity to dissimilarity: correlation +1 maps to distance 0,
# correlation -1 maps to distance 2.
distance_matrix = 1.0 - correlation_matrix

In [11]:
# linkage() below is fed the condensed form of the distance matrix, i.e. one
# entry per unordered pair of columns, so flatten the square matrix first.
condensed_dist = squareform(distance_matrix.to_numpy())

# For n columns the condensed vector holds n * (n - 1) / 2 pairwise distances.
print(f"length {len(condensed_dist)}\n")

length 126253



In [17]:
# Agglomerative (hierarchical) clustering on the condensed distances using
# average linkage.
Z = linkage(condensed_dist, method='average')

# Flatten the hierarchy into flat clusters, asking for (at most) n_clusters.
n_clusters = 5
cluster_labels = fcluster(Z, t=n_clusters, criterion='maxclust')

# Report how many members each cluster received, ordered by cluster id.
# NOTE(review): in the recorded run one cluster holds 97.2% of all stocks and
# another holds a single stock, so the per-cluster portfolios built from these
# labels are very unevenly diversified.
label_series = pd.Series(cluster_labels)
cluster_sizes = label_series.value_counts().sort_index()
for cluster_id, size in cluster_sizes.items():
    print(f"  Cluster {cluster_id}: {size:3d} stocks ({size/len(cluster_labels)*100:.1f}%)")
print("-" * 40)
print(f"  Total: {len(cluster_labels)} stocks\n")

  Cluster 1:   7 stocks (1.4%)
  Cluster 2: 489 stocks (97.2%)
  Cluster 3:   2 stocks (0.4%)
  Cluster 4:   4 stocks (0.8%)
  Cluster 5:   1 stocks (0.2%)
----------------------------------------
  Total: 503 stocks



In [None]:
# Create one equally weighted portfolio per non-empty cluster
print("Creating equally weighted portfolios for each cluster...")
portfolio_returns_dict = {}

for cluster_id in range(1, n_clusters + 1):
    # Boolean mask over the columns of `returns`: True where the stock was
    # assigned to this cluster.
    cluster_mask = cluster_labels == cluster_id

    # The 'maxclust' criterion only promises AT MOST n_clusters clusters, so a
    # label in 1..n_clusters may have no members. Averaging over zero columns
    # would produce an all-NaN portfolio, which in turn makes the covariance
    # matrix NaN and the optimisation meaningless - skip such labels instead.
    if not cluster_mask.any():
        print(f"  Cluster {cluster_id} has no members; skipping")
        continue

    # Get stocks in this cluster
    cluster_stocks = returns.columns[cluster_mask].tolist()

    # Equal weighting == plain cross-sectional mean of the member returns for
    # each date (row).
    cluster_portfolio = returns[cluster_stocks].mean(axis=1)
    portfolio_returns_dict[f'Portfolio_{cluster_id}'] = cluster_portfolio

Creating equally weighted portfolios for each cluster...


In [21]:
# Assemble the per-cluster return series into one frame: one column per
# portfolio, rows aligned on the shared date index.
portfolio_returns = pd.DataFrame(data=portfolio_returns_dict)

# Sanity report: dimensions and the dates of the first/last rows.
print(f"\nPortfolio returns DataFrame created: {portfolio_returns.shape}")
print(f"Date range: {portfolio_returns.index[0].date()} to {portfolio_returns.index[-1].date()}")

# Descriptive statistics (count, mean, std, quantiles) for each portfolio.
print("\nPortfolio returns summary:")
summary_stats = portfolio_returns.describe()
print(summary_stats)


Portfolio returns DataFrame created: (391, 5)
Date range: 2024-03-28 to 2025-10-17

Portfolio returns summary:
       Portfolio_1  Portfolio_2  Portfolio_3  Portfolio_4  Portfolio_5
count   391.000000   391.000000   391.000000   391.000000   391.000000
mean     -0.000581     0.000520    -0.000448     0.000445     0.001164
std       0.017880     0.010213     0.023210     0.010254     0.012218
min      -0.130694    -0.057503    -0.211917    -0.051042    -0.025229
25%      -0.008049    -0.003854    -0.010421    -0.004678    -0.001835
50%       0.000153     0.000325    -0.000699     0.000906     0.000124
75%       0.008845     0.006070     0.010344     0.005648     0.001996
max       0.049030     0.085946     0.109273     0.045552     0.162274


In [23]:
# Annualisation factor: 252 trading days per year.
TRADING_DAYS_PER_YEAR = 252

# Scale the per-period sample mean and sample covariance of the portfolio
# returns by the annualisation factor.
mu = portfolio_returns.mean() * TRADING_DAYS_PER_YEAR
cov_matrix = portfolio_returns.cov() * TRADING_DAYS_PER_YEAR
print(cov_matrix)

             Portfolio_1  Portfolio_2  Portfolio_3  Portfolio_4  Portfolio_5
Portfolio_1     0.080561     0.011309     0.007905     0.008282     0.001263
Portfolio_2     0.011309     0.026283     0.010692     0.002217    -0.000284
Portfolio_3     0.007905     0.010692     0.135756     0.009531     0.000078
Portfolio_4     0.008282     0.002217     0.009531     0.026497     0.003884
Portfolio_5     0.001263    -0.000284     0.000078     0.003884     0.037622


In [25]:
# Risk-free rate (assume 3% annually)
rf = 0.03


def negative_sharpe(weights, mu, cov_matrix, rf):
    """Return the negated Sharpe ratio of a weighted portfolio.

    The sign is flipped so that a minimiser can be used to maximise the
    Sharpe ratio.

    Parameters
    ----------
    weights : array-like
        Portfolio weights, one per asset.
    mu : array-like
        Expected return of each asset, in the same order as ``weights``.
    cov_matrix : array-like
        Square covariance matrix of the asset returns, in the same order.
    rf : float
        Risk-free rate, in the same units as ``mu``.

    Returns
    -------
    float
        ``-(w @ mu - rf) / sqrt(w @ cov @ w)``, or ``np.inf`` when the
        portfolio variance is not strictly positive (zero, negative through
        rounding or a non-PSD matrix, or NaN), so that such points are never
        preferred by the minimiser.
    """
    portfolio_return = np.dot(weights, mu)
    portfolio_variance = np.dot(weights, np.dot(cov_matrix, weights))

    # Guard BEFORE taking the square root: a negative or NaN variance would
    # otherwise turn into a NaN objective value. `not (v > 0)` is deliberately
    # used instead of `v <= 0` because it is also true for NaN.
    if not portfolio_variance > 0:
        return np.inf

    portfolio_std = np.sqrt(portfolio_variance)
    sharpe_ratio = (portfolio_return - rf) / portfolio_std
    return -sharpe_ratio

# Number of portfolios being weighted (one weight per entry of `mu`).
n_assets = len(mu)


def _weights_sum_to_one(w):
    """Equality-constraint residual: zero exactly when the weights sum to 1."""
    return np.sum(w) - 1


# Fully-invested constraint, expressed in the dict form expected by minimize().
constraints = {'type': 'eq', 'fun': _weights_sum_to_one}

# Long-only: every weight is confined to the interval [0, 1].
bounds = ((0, 1),) * n_assets

# Starting point for the solver: spread the capital evenly.
w0 = np.array([1 / n_assets for _ in range(n_assets)])

# Maximise the Sharpe ratio by minimising its negative with SLSQP, subject to
# the bounds and the sum-to-one constraint above.
result = minimize(
    negative_sharpe,
    w0,
    args=(mu, cov_matrix, rf),
    method='SLSQP',
    bounds=bounds,
    constraints=constraints,
)


In [26]:
# minimize() does not raise when the solver fails to converge; it only reports
# the failure through `result.success` / `result.message`. Without this check
# the last iterate would be silently treated as the optimum. An exception is
# used rather than a warning because warnings are suppressed in the import
# cell of this notebook.
if not result.success:
    raise RuntimeError(
        f"Sharpe-ratio optimisation did not converge: {result.message}"
    )

# Optimal weights, one per entry of `mu`.
optimal_weights = result.x

# The objective is the NEGATIVE Sharpe ratio, so flip the sign back.
optimal_sharpe = -result.fun

# Return, variance and standard deviation of the optimal portfolio, computed
# from the same `mu` and `cov_matrix` that were passed to the optimiser.
optimal_return = np.dot(optimal_weights, mu)
optimal_variance = np.dot(optimal_weights, np.dot(cov_matrix, optimal_weights))
optimal_std = np.sqrt(optimal_variance)
