In [39]:
import pandas as pd
import numpy as np
import statsmodels.api as sm

# Excel workbook holding the ETF price sheets read further down ("Adj HYG", "Adj SPY").
# Relative path, so it is resolved against the kernel's current working directory.
ETF_FILE = "Workshop Data.xlsx"
# Excel workbook holding the index sheets read further down
# ("10yUST Yields", "HY Index", "IG Index").
IDX_FILE = "Indexes and Spreads Data 01.09.xlsx"


In [40]:
def read_sheet(path, sheet_name, date_col="Date"):
    """Load one Excel sheet as a DataFrame indexed by date, oldest row first.

    Args:
        path: workbook location, handed to ``pandas.read_excel`` unchanged.
        sheet_name: sheet selector, handed to ``pandas.read_excel`` unchanged.
        date_col: label of the column holding the dates (default ``"Date"``).

    Returns:
        The sheet with ``date_col`` parsed by ``pandas.to_datetime``, rows
        sorted ascending on it, and that column moved into the index.
    """
    sheet = pd.read_excel(path, sheet_name=sheet_name)
    # Parse first so the sort below is chronological rather than lexical.
    sheet[date_col] = pd.to_datetime(sheet[date_col])
    chronological = sheet.sort_values(date_col)
    return chronological.set_index(date_col)

def detect_price_col(df):
    """Pick the column of ``df`` to treat as the price series.

    Well-known price headers are tried in priority order; if none is present
    the last numeric column of the frame is used instead.

    Returns:
        The chosen column label.

    Raises:
        ValueError: no known header matched and ``df`` has no numeric column.
    """
    known_headers = ("Adj Close", "Close", "PX_LAST", "Last Price")
    hit = next((name for name in known_headers if name in df.columns), None)
    if hit is not None:
        return hit

    # Fallback: rightmost numeric column.
    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    if len(numeric_cols) == 0:
        raise ValueError("No numeric price column found.")
    return numeric_cols[-1]

def zscore(df):
    """Standardize ``df``: subtract the mean, then divide by the population
    standard deviation (``ddof=0``)."""
    centered = df - df.mean()
    spread = df.std(ddof=0)
    return centered / spread

def pca_2pcs(X_std: pd.DataFrame):
    """
    Two-component PCA via eigen-decomposition of the sample covariance matrix.

    The input is expected to be standardized already: scores are the plain
    projection ``X @ eigenvectors`` with no re-centering, and the sign of each
    component is whatever ``np.linalg.eigh`` returns (no sign normalization).

    Returns:
      pc_scores: DataFrame (index = X_std.index) with columns PC1, PC2
      loadings:  DataFrame (index = X_std.columns) with the PC1, PC2 eigenvectors
      evr:       1-D array, share of total variance for every component,
                 largest first
    """
    component_names = ["PC1", "PC2"]

    data = X_std.values
    # rowvar=False: each column is a variable, each row an observation
    covariance = np.cov(data, rowvar=False)
    variances, directions = np.linalg.eigh(covariance)

    # reorder so the highest-variance component comes first
    order = np.argsort(variances)[::-1]
    variances, directions = variances[order], directions[:, order]

    evr = variances / variances.sum()

    # project onto every component, then keep only the leading two
    projected = data @ directions
    pc_scores = pd.DataFrame(
        projected[:, :2], index=X_std.index, columns=component_names
    )
    loadings = pd.DataFrame(
        directions[:, :2], index=X_std.columns, columns=component_names
    )

    return pc_scores, loadings, evr


In [None]:
# Target: HYG simple return (adjusted HYG price -> period-over-period pct change)
hyg = read_sheet(ETF_FILE, "Adj HYG", "Date")
HYG_PX_COL = detect_price_col(hyg)  # label of the price column in the sheet
# fill_method=None: never forward-fill a missing price before taking the change.
# The legacy default ('pad') turns a missing price into a fabricated 0% return
# and is deprecated since pandas 2.1, so relying on it makes results depend on
# the installed pandas version. With None, gaps stay NaN and are removed by the
# dropna() applied when the factor frame is assembled.
R_HYG = hyg[HYG_PX_COL].astype(float).pct_change(fill_method=None).rename("R_HYG")

In [None]:
# Equity factor: SPY simple return (adjusted SPY price -> period-over-period pct change)
spy = read_sheet(ETF_FILE, "Adj SPY", "Date")
SPY_PX_COL = detect_price_col(spy)  # label of the price column in the sheet
# fill_method=None: never forward-fill a missing price before taking the change.
# The legacy default ('pad') turns a missing price into a fabricated 0% return
# and is deprecated since pandas 2.1, so relying on it makes results depend on
# the installed pandas version. With None, gaps stay NaN and are removed by the
# dropna() applied when the factor frame is assembled.
F_EQ = spy[SPY_PX_COL].astype(float).pct_change(fill_method=None).rename("F_EQ_SPY")

In [None]:
# Rates + spread shocks: first differences over the same t-1 -> t window as the ETF returns

UST_SHEET = "10yUST Yields"
HY_SHEET  = "HY Index"
IG_SHEET  = "IG Index"

UST_YIELD_COL = "PX_LAST"
OAS_COL = "OAS_SOVEREIGN_CURVE"

ust = read_sheet(IDX_FILE, UST_SHEET, "Date")
hy  = read_sheet(IDX_FILE, HY_SHEET,  "Date")
ig  = read_sheet(IDX_FILE, IG_SHEET,  "Date")

# Align the LEVELS of all five series on their common dates before taking any
# change. Differencing each series on its own calendar and intersecting the
# dates afterwards lets one row mix windows of different length: after a date
# that is missing in only one source, that source's change spans two dates
# while the others span one. Working from the aligned levels guarantees that
# every column of a row covers exactly the same t-1 -> t interval.
levels_aligned = pd.concat(
    [
        hyg[HYG_PX_COL].astype(float).rename("HYG_PX"),
        spy[SPY_PX_COL].astype(float).rename("SPY_PX"),
        ust[UST_YIELD_COL].astype(float).rename("UST_10Y"),
        hy[OAS_COL].astype(float).rename("HY_OAS"),
        ig[OAS_COL].astype(float).rename("IG_OAS"),
    ],
    axis=1,
).dropna()

F_RATES = levels_aligned["UST_10Y"].diff().rename("F_RATES_D10Y")
F_HY    = levels_aligned["HY_OAS"].diff().rename("F_HY_DSPREAD")
F_IG    = levels_aligned["IG_OAS"].diff().rename("F_IG_DSPREAD")

# ETF returns are recomputed here from the aligned prices (rather than reusing
# the own-calendar R_HYG / F_EQ series) so they share the window of the shocks.
# fill_method=None: no forward-filling; the aligned levels contain no gaps.
raw = pd.concat(
    [
        levels_aligned["HYG_PX"].pct_change(fill_method=None).rename("R_HYG"),
        levels_aligned["SPY_PX"].pct_change(fill_method=None).rename("F_EQ_SPY"),
        F_RATES,
        F_HY,
        F_IG,
    ],
    axis=1,
).dropna()  # removes the first aligned date, which has no previous observation

raw.head(), raw.shape

(               R_HYG  F_EQ_SPY  F_RATES_D10Y  F_HY_DSPREAD  F_IG_DSPREAD
 Date                                                                    
 2021-01-05  0.000344  0.006887        0.0417     -0.027032      0.008711
 2021-01-06 -0.001836  0.005979        0.0806     -0.098733     -0.005081
 2021-01-07  0.002874  0.014857        0.0440     -0.018072     -0.025786
 2021-01-08  0.001490  0.005698        0.0358     -0.019746     -0.009177
 2021-01-11 -0.005379 -0.006741        0.0307      0.041273      0.000662,
 (1254, 5))

In [44]:
# Factor matrix fed to the PCA: the four candidate shocks taken from `raw`
Xcand = raw.loc[:, ["F_EQ_SPY", "F_RATES_D10Y", "F_HY_DSPREAD", "F_IG_DSPREAD"]]

# Put every factor on a common scale first so none dominates through its units
Xstd = zscore(Xcand).dropna()

pc_scores, loadings, evr = pca_2pcs(Xstd)

print("Explained variance ratios (first 4 PCs):")
# Xcand has exactly four columns, so evr holds four ratios
for rank, ratio in enumerate(evr[:4], start=1):
    print(f"PC{rank}: {ratio:.4f}")

print("\nPCA Loadings (factor contributions to PCs):")
print(loadings)

Explained variance ratios (first 4 PCs):
PC1: 0.5685
PC2: 0.2626
PC3: 0.1107
PC4: 0.0582

PCA Loadings (factor contributions to PCs):
                   PC1       PC2
F_EQ_SPY     -0.497914  0.426855
F_RATES_D10Y -0.190739 -0.902265
F_HY_DSPREAD  0.609056  0.060646
F_IG_DSPREAD  0.587155  0.005966


In [45]:
# Regress the HYG return on the two leading principal components.
# pc_scores carries Xstd's index, so the column-wise concat lines rows up by
# date; dropna() discards any date that is missing on either side.
data_pc = pd.concat([raw["R_HYG"], pc_scores], axis=1).dropna()

Y = data_pc.loc[:, "R_HYG"]
# sm.OLS does not add an intercept by itself, hence the explicit constant column
X = sm.add_constant(data_pc.loc[:, ["PC1", "PC2"]])

model_pc = sm.OLS(endog=Y, exog=X).fit()

print("\n================ HYG REGRESSION ON PC1 + PC2 ================\n")
print(model_pc.summary())




                            OLS Regression Results                            
Dep. Variable:                  R_HYG   R-squared:                       0.723
Model:                            OLS   Adj. R-squared:                  0.722
Method:                 Least Squares   F-statistic:                     1629.
Date:                Sun, 11 Jan 2026   Prob (F-statistic):               0.00
Time:                        23:43:41   Log-Likelihood:                 5725.0
No. Observations:                1254   AIC:                        -1.144e+04
Df Residuals:                    1251   BIC:                        -1.143e+04
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const      -4.244e-05   7.12e-05     -0.596      0