In [1]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import r2_score, mean_squared_error

# Display options
pd.set_option("display.max_columns", 50)
pd.set_option("display.width", 140)


In [2]:
ETF_FILE = "Workshop Data.xlsx"
IDX_FILE = "Indexes and Spreads Data 01.09.xlsx"
VOL_FILE = "MOVE Vix prices.xlsx"


In [None]:
def read_sheet(path: str, sheet: str) -> pd.DataFrame:
    df = pd.read_excel(path, sheet_name=sheet)
    # Standardize Date column name
    if "Date" not in df.columns:
        # try common variants
        for c in df.columns:
            if str(c).lower().strip() in ["date", "dates"]:
                df = df.rename(columns={c: "Date"})
                break
    df["Date"] = pd.to_datetime(df["Date"])
    df = df.sort_values("Date").reset_index(drop=True)
    return df

adj_hyg = read_sheet(ETF_FILE, "Adj HYG")
adj_spy = read_sheet(ETF_FILE, "Adj SPY")

y10 = read_sheet(IDX_FILE, "10yUST Yields")
hy = read_sheet(IDX_FILE, "HY Index")
ig = read_sheet(IDX_FILE, "IG Index")

move = read_sheet(VOL_FILE, "MOVE Index")
vix = read_sheet(VOL_FILE, "VIX")

(adj_hyg.head(2), y10.head(2), hy.head(2), ig.head(2), vix.head(2))


In [None]:
def make_features(
    adj_hyg: pd.DataFrame,
    adj_spy: pd.DataFrame,
    y10: pd.DataFrame,
    hy: pd.DataFrame,
    ig: pd.DataFrame,
    move: pd.DataFrame,
    vix: pd.DataFrame,
) -> pd.DataFrame:
    out = pd.DataFrame()

    # Target: HYG return from adjusted close
    out["Date"] = adj_hyg["Date"]
    out["r_hyg"] = adj_hyg["Adj. Close"].pct_change()

    # Equity factor proxy: SPY returns
    spy = adj_spy[["Date", "Adj. Close"]].rename(columns={"Adj. Close": "spy_adj_close"})
    spy["r_spy"] = spy["spy_adj_close"].pct_change()
    out = out.merge(spy[["Date", "r_spy"]], on="Date", how="left")

    # Rates: 10Y yield daily change (level shock)
    y10_ = y10[["Date", "PX_LAST"]].rename(columns={"PX_LAST": "y10"})
    y10_["dy10"] = y10_["y10"].diff()
    out = out.merge(y10_[["Date", "dy10"]], on="Date", how="left")

    # Credit level: HY OAS daily change
    hy_ = hy[["Date", "OAS_SOVEREIGN_CURVE"]].rename(columns={"OAS_SOVEREIGN_CURVE": "hy_oas"})
    hy_["d_hy_oas"] = hy_["hy_oas"].diff()
    out = out.merge(hy_[["Date", "d_hy_oas"]], on="Date", how="left")

    # IG OAS daily change
    ig_ = ig[["Date", "OAS_SOVEREIGN_CURVE"]].rename(columns={"OAS_SOVEREIGN_CURVE": "ig_oas"})
    ig_["d_ig_oas"] = ig_["ig_oas"].diff()
    out = out.merge(ig_[["Date", "d_ig_oas"]], on="Date", how="left")

    # Credit rotation: (HY - IG) OAS change
    # Note: align on dates first
    rot = hy_[["Date", "hy_oas"]].merge(ig_[["Date", "ig_oas"]], on="Date", how="inner")
    rot["hy_minus_ig"] = rot["hy_oas"] - rot["ig_oas"]
    rot["d_hy_minus_ig"] = rot["hy_minus_ig"].diff()
    out = out.merge(rot[["Date", "d_hy_minus_ig"]], on="Date", how="left")

    # Liquidity: change in log HYG volume
    vol = adj_hyg[["Date", "Volume"]].copy()
    vol["dlog_hyg_vol"] = np.log(vol["Volume"].replace(0, np.nan)).diff()
    out = out.merge(vol[["Date", "dlog_hyg_vol"]], on="Date", how="left")

    # Volatility proxies: daily changes
    vix_ = vix.rename(columns={"PX_LAST": "vix"}).copy()
    vix_["dvix"] = vix_["vix"].diff()
    move_ = move.rename(columns={"PX_LAST": "move"}).copy()
    move_["dmove"] = move_["move"].diff()

    out = out.merge(vix_[["Date", "dvix"]], on="Date", how="left")
    out = out.merge(move_[["Date", "dmove"]], on="Date", how="left")

    # Clean up
    out = out.sort_values("Date").reset_index(drop=True)
    return out

df = make_features(adj_hyg, adj_spy, y10, hy, ig, move, vix)

# Drop the first row (pct_change/diff NaNs) and any remaining missing dates across series
df_model = df.dropna().copy()
df_model.head()


Unnamed: 0,Date,r_hyg,r_spy,dy10,d_hy_oas,d_ig_oas,d_hy_minus_ig,dlog_hyg_vol,dvix,dmove
200,2021-01-11,inf,inf,0.0307,0.041273,0.000662,0.040611,-0.024777,2.52,2.96
201,2021-01-12,0.001266,0.000211,-0.0169,0.033598,0.000282,0.033316,0.445702,-0.75,2.35
202,2021-01-13,0.002989,0.002693,-0.0459,-0.017457,-0.015203,-0.002254,0.133542,-1.12,-4.52
203,2021-01-14,-0.000229,-0.003502,0.046,-0.069708,-0.005403,-0.064305,-0.627066,1.04,0.63
204,2021-01-15,-0.001605,-0.007293,-0.0457,0.025426,0.008244,0.017182,0.635923,1.09,-1.09


In [None]:
factor_cols = ["r_spy", "dy10", "d_hy_oas", "d_ig_oas", "d_hy_minus_ig", "dlog_hyg_vol", "dvix", "dmove"]

# cleaning 
X_df = df_model[factor_cols].apply(pd.to_numeric, errors="coerce").replace([np.inf, -np.inf], np.nan)
y_s  = pd.to_numeric(df_model["r_hyg"], errors="coerce")
xy = pd.concat([X_df, y_s.rename("r_hyg")], axis=1).dropna()
X = xy[factor_cols].values
y = xy["r_hyg"].values

scaler = StandardScaler()
Xz = scaler.fit_transform(X)

pca = PCA()
X_pcs_all = pca.fit_transform(Xz)

explained = pd.Series(pca.explained_variance_ratio_, index=[f"PC{i+1}" for i in range(X_pcs_all.shape[1])])
explained_cum = explained.cumsum()

loadings = pd.DataFrame(
    pca.components_.T,
    index=factor_cols,
    columns=[f"PC{i+1}" for i in range(X_pcs_all.shape[1])]
)

print("Explained variance ratio:")
display(explained.to_frame("var_ratio").head(8))
print("\nCumulative explained variance:")
display(explained_cum.to_frame("cum_var").head(8))

print("\nLoadings (first 2 PCs):")
display(loadings[["PC1", "PC2"]].sort_values("PC1", key=lambda s: s.abs(), ascending=False))


Explained variance ratio:


Unnamed: 0,var_ratio
PC1,0.467687
PC2,0.161495
PC3,0.121901
PC4,0.101209
PC5,0.076406
PC6,0.046646
PC7,0.024657
PC8,0.0



Cumulative explained variance:


Unnamed: 0,cum_var
PC1,0.467687
PC2,0.629181
PC3,0.751082
PC4,0.852291
PC5,0.928697
PC6,0.975343
PC7,1.0
PC8,1.0



Loadings (first 2 PCs):


Unnamed: 0,PC1,PC2
d_hy_oas,0.473328,-0.243041
d_hy_minus_ig,0.4498,-0.261944
d_ig_oas,0.420306,-0.095764
r_spy,-0.403219,-0.289073
dvix,0.381939,0.305723
dmove,0.251378,0.376422
dlog_hyg_vol,0.122767,0.149417
dy10,-0.101284,0.722563


In [None]:
# PC1 and PC2
X_pcs = X_pcs_all[:, :2]
dates = df_model["Date"].values

# Time-based split: last 20% as test
split = int(0.8 * len(df_model))
X_train, X_test = X_pcs[:split], X_pcs[split:]
y_train, y_test = y[:split], y[split:]

lin = LinearRegression()
lin.fit(X_train, y_train)

yhat_train = lin.predict(X_train)
yhat_test = lin.predict(X_test)

results = {
    "train_r2": r2_score(y_train, yhat_train),
    "test_r2": r2_score(y_test, yhat_test),
    "train_rmse": np.sqrt(mean_squared_error(y_train, yhat_train)),
    "test_rmse": np.sqrt(mean_squared_error(y_test, yhat_test)),
    "intercept": float(lin.intercept_),
    "beta_pc1": float(lin.coef_[0]),
    "beta_pc2": float(lin.coef_[1]),
}
pd.Series(results).to_frame("value")


Unnamed: 0,value
train_r2,0.626989
test_r2,0.626396
train_rmse,0.00308
test_rmse,0.002206
intercept,-4e-05
beta_pc1,-0.001644
beta_pc2,-0.002226


In [None]:
# Map PC betas to z-scored factor exposures:
# y â‰ˆ a + b1*PC1 + b2*PC2
# PCk = sum_j loading_{j,k} * z_j
# => exposure to z_j is sum_k b_k * loading_{j,k}
beta = pd.Series(lin.coef_, index=["PC1", "PC2"])
exposure_z = loadings[["PC1","PC2"]].dot(beta)

# Convert z-exposure to exposure in original units:
# z_j = (x_j - mean_j) / std_j => coefficient on x_j is exposure_z / std_j
stds = pd.Series(scaler.scale_, index=factor_cols)
exposure_units = exposure_z / stds

exposure = pd.DataFrame({
    "coef_on_z": exposure_z,
    "coef_on_units": exposure_units
}).sort_values("coef_on_z", key=lambda s: s.abs(), ascending=False)

exposure


Unnamed: 0,coef_on_z,coef_on_units
dy10,-0.001442,-0.023148
dvix,-0.001308,-0.000712
r_spy,0.001306,0.120807
dmove,-0.001251,-0.000249
dlog_hyg_vol,-0.000534,-0.001533
d_ig_oas,-0.000478,-0.025508
d_hy_oas,-0.000237,-0.002769
d_hy_minus_ig,-0.000156,-0.002153


In [None]:
window = 252
betas = []
r2s = []

for end in range(window, len(df_model)):
    start = end - window
    Xw = X_pcs[start:end]
    yw = y[start:end]

    m = LinearRegression().fit(Xw, yw)
    ywh = m.predict(Xw)

    dt = df_model.index[end]   # <-- use index position
    betas.append([dt, m.intercept_, m.coef_[0], m.coef_[1]])
    r2s.append([dt, r2_score(yw, ywh)])

betas = pd.DataFrame(betas, columns=["Date", "alpha", "beta_pc1", "beta_pc2"]).set_index("Date")
r2s = pd.DataFrame(r2s, columns=["Date", "rolling_r2"]).set_index("Date")

betas.tail(), r2s.tail()


(         alpha  beta_pc1  beta_pc2
 Date                              
 1450 -0.000007 -0.001171 -0.001604
 1451  0.000006 -0.001172 -0.001595
 1452 -0.000006 -0.001178 -0.001584
 1453 -0.000009 -0.001178 -0.001586
 1454 -0.000017 -0.001177 -0.001591,
       rolling_r2
 Date            
 1450    0.737538
 1451    0.735729
 1452    0.737969
 1453    0.738211
 1454    0.739813)

In [None]:
# OLS Regression 
import statsmodels.api as sm

# X_pcs is (n_samples, n_components); we use the first 2 PCs
X_ols = sm.add_constant(X_pcs[:, :2]) 
ols_model = sm.OLS(y, X_ols).fit()

print(ols_model.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.630
Model:                            OLS   Adj. R-squared:                  0.630
Method:                 Least Squares   F-statistic:                     1058.
Date:                Wed, 14 Jan 2026   Prob (F-statistic):          5.80e-269
Time:                        16:11:47   Log-Likelihood:                 5502.0
No. Observations:                1245   AIC:                        -1.100e+04
Df Residuals:                    1242   BIC:                        -1.098e+04
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const      -4.616e-05   8.27e-05     -0.558      0.5