In [1]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import r2_score, mean_squared_error

import statsmodels.api as sm

pd.set_option("display.max_columns", 100)
pd.set_option("display.width", 160)

WORKSHOP_FILE = "Workshop Data.xlsx"
INDEX_FILE = "Indexes and Spreads Data 01.09.xlsx"
VOL_FILE = "MOVE Vix prices.xlsx"
SP500_FILE = "SP500 Index data.xlsx"

In [2]:
# Helpers 
def _coerce_datetime(s):
    return pd.to_datetime(s, errors="coerce")

def first_present(df, candidates):
    """Return the first column name that exists in df from candidates; else None."""
    for c in candidates:
        if c in df.columns:
            return c
    return None

def require_col(df, candidates, label):
    c = first_present(df, candidates)
    if c is None:
        raise KeyError(f"Could not find {label} column. Tried: {candidates}. Available: {list(df.columns)[:30]}...")
    return c

def safe_pct_change(x):
    x = pd.to_numeric(x, errors="coerce")
    out = x.pct_change()
    out = out.replace([np.inf, -np.inf], np.nan)
    return out

def safe_diff(x):
    x = pd.to_numeric(x, errors="coerce")
    out = x.diff()
    out = out.replace([np.inf, -np.inf], np.nan)
    return out

def safe_dlog(x):
    x = pd.to_numeric(x, errors="coerce")
    x = x.where(x > 0, np.nan)  # log-safe
    out = np.log(x).diff()
    out = out.replace([np.inf, -np.inf], np.nan)
    return out

In [3]:
workshop = pd.read_excel(WORKSHOP_FILE, sheet_name=0)
indexes = pd.read_excel(INDEX_FILE, sheet_name=0)

vix_df = pd.read_excel(VOL_FILE, sheet_name="VIX")
move_df = pd.read_excel(VOL_FILE, sheet_name="MOVE") if "MOVE" in pd.ExcelFile(VOL_FILE).sheet_names else None

sp500 = pd.read_excel(SP500_FILE, sheet_name="SPY")

for d in (workshop, indexes, vix_df, sp500):
    if "Date" in d.columns:
        d["Date"] = _coerce_datetime(d["Date"])

if move_df is not None and "Date" in move_df.columns:
    move_df["Date"] = _coerce_datetime(move_df["Date"])

# Renamed
vix_px = require_col(vix_df, ["PX_LAST", "VIX", "Close"], "VIX level")
vix_df = vix_df.rename(columns={vix_px: "VIX_PX_LAST"})[["Date", "VIX_PX_LAST"]]

spx_px = require_col(sp500, ["PX_LAST", "SP500_PX_LAST", "Close"], "SP500 level")
sp500 = sp500.rename(columns={spx_px: "SP500_PX_LAST"})[["Date", "SP500_PX_LAST"]]

if move_df is not None:
    move_px = require_col(move_df, ["PX_LAST", "MOVE", "Close"], "MOVE level")
    move_df = move_df.rename(columns={move_px: "MOVE_PX_LAST"})[["Date", "MOVE_PX_LAST"]]


In [4]:
# Merge to a unified panel 
df = workshop.merge(indexes, on="Date", how="inner").merge(vix_df, on="Date", how="inner").merge(sp500, on="Date", how="inner")
if move_df is not None:
    df = df.merge(move_df, on="Date", how="inner")

df = df.sort_values("Date").reset_index(drop=True)

In [5]:
# Target + factor engineering

hyg_px_col = require_col(
    df,
    [
        "HYG_PX_LAST", "HYG PX_LAST", "HYG",                 
        "HYG Position", "HYG Position Value", "HYG Close",   
        "HYG Total Return", "HYG Market Value"
    ],
    "HYG price/value proxy"
)
df["r_hyg"] = safe_pct_change(df[hyg_px_col])

# Equity factor: S&P 500 index return
df["r_sp500"] = safe_pct_change(df["SP500_PX_LAST"])

# Vol factors
df["dvix"] = safe_diff(df["VIX_PX_LAST"])
df["r_vix"] = safe_pct_change(df["VIX_PX_LAST"])

if "MOVE_PX_LAST" in df.columns:
    df["dmove"] = safe_diff(df["MOVE_PX_LAST"])
    df["r_move"] = safe_pct_change(df["MOVE_PX_LAST"])

# Rates / credit spreads
teny = first_present(df, ["USGG10YR Index", "10Y_Yield", "US10Y", "DGS10", "UST_10Y", "10Y", "US 10Y", "YAS_BOND_YLD"])
twoy = first_present(df, ["USGG2YR Index", "2Y_Yield", "US2Y", "DGS2", "UST_2Y", "2Y", "US 2Y"])
if teny is not None:
    df["d_10y"] = safe_diff(df[teny])
if twoy is not None:
    df["d_2y"] = safe_diff(df[twoy])
if teny is not None and twoy is not None:
    df["d_curve_2s10s"] = safe_diff(df[teny] - df[twoy])

# Credit spreads
hy_oas = first_present(df, ["LF98OAS Index", "HYOAS", "HY_OAS", "HY OAS", "HY_Spread", "US High Yield OAS", "YAS_ISPREAD", "YAS_YLD_SPREAD"])
ig_oas = first_present(df, ["LF97OAS Index", "IGOAS", "IG_OAS", "IG OAS", "IG_Spread", "US Investment Grade OAS"])

if hy_oas is not None:
    df["d_hy_oas"] = safe_diff(df[hy_oas])
if ig_oas is not None:
    df["d_ig_oas"] = safe_diff(df[ig_oas])
if hy_oas is not None and ig_oas is not None:
    df["d_hy_ig_oas"] = safe_diff(df[hy_oas] - df[ig_oas])

# Liquidity proxy (volume)
hyg_vol_col = first_present(df, ["HYG_VOLUME", "HYG Volume", "HYG_VOLUME_SHARES", "HYG_VOL", "HYG_Volume"])
if hyg_vol_col is not None:
    df["dlog_hyg_vol"] = safe_dlog(df[hyg_vol_col])

candidate_features = [
    "r_sp500", "dvix", "r_vix",
    "dmove", "r_move",
    "d_10y", "d_2y", "d_curve_2s10s",
    "d_hy_oas", "d_ig_oas", "d_hy_ig_oas",
    "dlog_hyg_vol",
]
features = [c for c in candidate_features if c in df.columns]

df_model = df[["Date", "r_hyg"] + features].set_index("Date")

print("Using HYG proxy column:", hyg_px_col)
print("Using features:", features)
df_model.tail()

Using HYG proxy column: HYG Position
Using features: ['r_sp500', 'dvix', 'r_vix', 'd_10y', 'd_hy_oas']


Unnamed: 0_level_0,r_hyg,r_sp500,dvix,r_vix,d_10y,d_hy_oas
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2025-12-24,0.001864,0.003222,-0.53,-0.037857,-0.0429,-1.9881
2025-12-26,-0.000496,-0.000304,0.13,0.009651,0.0067,0.7108
2025-12-29,0.000372,-0.003492,0.6,0.044118,-0.0137,-0.6129
2025-12-30,0.000992,-0.001376,0.13,0.009155,0.0133,0.1862
2025-12-31,-0.000991,-0.007358,0.62,0.043266,0.054,1.6156


In [6]:
# cleaning
X_df = df_model[features].apply(pd.to_numeric, errors="coerce").replace([np.inf, -np.inf], np.nan)
y_s = pd.to_numeric(df_model["r_hyg"], errors="coerce").replace([np.inf, -np.inf], np.nan)

mask_y = y_s.notna()
X_df = X_df.loc[mask_y]
y = y_s.loc[mask_y].values

print("Rows after y alignment:", len(y))

# Pipeline: median impute -> standardize -> PCA
pca = PCA(n_components=min(5, len(features)))  # keep up to 5 PCs for diagnostics; use first 2 for the main model
pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
    ("pca", pca),
])

X_pcs_all = pipe.fit_transform(X_df.values)

# Keep first 2 PCs for the regression model
X_pcs = X_pcs_all[:, :2]

Rows after y alignment: 1249


In [7]:
# PCA loadings (interpretation) 
# Convert PCA components into a readable table
components = pipe.named_steps["pca"].components_
loading = pd.DataFrame(components[:5], columns=features, index=[f"PC{i+1}" for i in range(min(5, components.shape[0]))])

# Loadings are in standardized factor space
display(loading.T.sort_values("PC1", ascending=False).head(15))
display(loading.T.sort_values("PC2", ascending=False).head(15))

Unnamed: 0,PC1,PC2,PC3,PC4,PC5
dvix,0.515996,-0.318363,-0.150118,0.284652,0.727208
r_vix,0.505532,-0.342726,-0.212079,0.336978,-0.68443
d_hy_oas,0.395113,0.386436,0.800113,0.230357,-0.036179
d_10y,0.259773,0.793763,-0.540365,0.101598,0.011859
r_sp500,-0.504568,0.042316,-0.017661,0.861413,0.035715


Unnamed: 0,PC1,PC2,PC3,PC4,PC5
d_10y,0.259773,0.793763,-0.540365,0.101598,0.011859
d_hy_oas,0.395113,0.386436,0.800113,0.230357,-0.036179
r_sp500,-0.504568,0.042316,-0.017661,0.861413,0.035715
dvix,0.515996,-0.318363,-0.150118,0.284652,0.727208
r_vix,0.505532,-0.342726,-0.212079,0.336978,-0.68443


In [8]:
# Linear Regression
lr = LinearRegression().fit(X_pcs, y)
yhat = lr.predict(X_pcs)

rmse = float(np.sqrt(mean_squared_error(y, yhat)))
print("In-sample R2:", round(r2_score(y, yhat), 4))
print("In-sample RMSE:", rmse)
print("alpha:", lr.intercept_)
print("betas (PC1, PC2):", lr.coef_)

In-sample R2: 0.6541
In-sample RMSE: 0.00282628498066784
alpha: -4.843076805254232e-05
betas (PC1, PC2): [-0.00199035 -0.0015973 ]


In [9]:
# Map PC betas back to original factors
beta_pc = lr.coef_[:2]                      
load_2 = components[:2, :]                 
exposure_std = beta_pc @ load_2          

exp_tbl = pd.DataFrame({"exposure_in_std_units": exposure_std}, index=features).sort_values("exposure_in_std_units", ascending=False)
display(exp_tbl)

Unnamed: 0,exposure_in_std_units
r_sp500,0.000937
r_vix,-0.000459
dvix,-0.000518
d_hy_oas,-0.001404
d_10y,-0.001785


In [10]:
# OLS regression
X_ols = sm.add_constant(X_pcs)  # const + PC1 + PC2
ols = sm.OLS(y, X_ols).fit()
print(ols.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.654
Model:                            OLS   Adj. R-squared:                  0.654
Method:                 Least Squares   F-statistic:                     1178.
Date:                Thu, 15 Jan 2026   Prob (F-statistic):          5.76e-288
Time:                        23:58:00   Log-Likelihood:                 5557.9
No. Observations:                1249   AIC:                        -1.111e+04
Df Residuals:                    1246   BIC:                        -1.109e+04
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const      -4.843e-05   8.01e-05     -0.605      0.5

In [11]:
ridge = Ridge(alpha=1.0).fit(X_pcs, y)
yr = ridge.predict(X_pcs)
print("Ridge R2:", round(r2_score(y, yr), 4))
print("Ridge betas (PC1, PC2):", ridge.coef_)

Ridge R2: 0.6541
Ridge betas (PC1, PC2): [-0.00198985 -0.00159602]


In [12]:
# Rolling window stability (1y)
window = 252  # ~1 trading year

betas = []
r2s = []

# Use a date index aligned to X_df / y
dates = X_df.index.to_numpy()

for end in range(window, len(y)):
    start = end - window
    Xw = X_pcs[start:end]
    yw = y[start:end]

    m = LinearRegression().fit(Xw, yw)
    ywh = m.predict(Xw)

    dt = dates[end]
    betas.append([dt, m.intercept_, m.coef_[0], m.coef_[1]])
    r2s.append([dt, r2_score(yw, ywh)])

betas = pd.DataFrame(betas, columns=["Date", "alpha", "beta_pc1", "beta_pc2"]).set_index("Date")
r2s = pd.DataFrame(r2s, columns=["Date", "rolling_r2"]).set_index("Date")

display(betas.tail())
display(r2s.tail())

Unnamed: 0_level_0,alpha,beta_pc1,beta_pc2
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2025-12-24,1.1e-05,-0.001589,-0.000967
2025-12-26,5e-06,-0.001584,-0.000976
2025-12-29,2.3e-05,-0.001589,-0.000959
2025-12-30,2.8e-05,-0.001591,-0.00096
2025-12-31,2.5e-05,-0.00159,-0.000953


Unnamed: 0_level_0,rolling_r2
Date,Unnamed: 1_level_1
2025-12-24,0.757803
2025-12-26,0.755918
2025-12-29,0.761184
2025-12-30,0.760651
2025-12-31,0.761309


In [13]:
# Approx daily contribution

# Recreate imputed+scaled factors
X_imp = pipe.named_steps["imputer"].transform(X_df.values)
Xz = pipe.named_steps["scaler"].transform(X_imp)

contrib = pd.DataFrame(Xz * exposure_std, index=X_df.index, columns=features)

top = exp_tbl.index[:6].tolist()

contrib[top].tail()

Unnamed: 0_level_0,r_sp500,r_vix,dvix,d_hy_oas,d_10y
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2025-12-24,0.000236,0.000227,0.000147,0.00077,0.001269
2025-12-26,-7.3e-05,-3.9e-05,-3.9e-05,-0.000269,-0.000127
2025-12-29,-0.000352,-0.000232,-0.000171,0.00024,0.000447
2025-12-30,-0.000167,-3.6e-05,-3.9e-05,-6.7e-05,-0.000312
2025-12-31,-0.000691,-0.000227,-0.000176,-0.000617,-0.001457
