In [1]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import r2_score, mean_squared_error

import statsmodels.api as sm

pd.set_option("display.max_columns", 100)
pd.set_option("display.width", 160)

WORKSHOP_FILE = "Workshop Data.xlsx"
INDEX_FILE = "Indexes and Spreads Data 01.09.xlsx"
VOL_FILE = "MOVE Vix prices.xlsx"
SP500_FILE = "SP500 Index data.xlsx"

In [2]:
# Helpers 
def _coerce_datetime(s):
    return pd.to_datetime(s, errors="coerce")

def first_present(df, candidates):
    """Return the first column name that exists in df from candidates; else None."""
    for c in candidates:
        if c in df.columns:
            return c
    return None

def require_col(df, candidates, label):
    c = first_present(df, candidates)
    if c is None:
        raise KeyError(f"Could not find {label} column. Tried: {candidates}. Available: {list(df.columns)[:30]}...")
    return c

def safe_pct_change(x):
    x = pd.to_numeric(x, errors="coerce")
    out = x.pct_change()
    out = out.replace([np.inf, -np.inf], np.nan)
    return out

def safe_diff(x):
    x = pd.to_numeric(x, errors="coerce")
    out = x.diff()
    out = out.replace([np.inf, -np.inf], np.nan)
    return out

def safe_dlog(x):
    x = pd.to_numeric(x, errors="coerce")
    x = x.where(x > 0, np.nan)  # log-safe
    out = np.log(x).diff()
    out = out.replace([np.inf, -np.inf], np.nan)
    return out

In [3]:
workshop = pd.read_excel(WORKSHOP_FILE, sheet_name=0)
indexes = pd.read_excel(INDEX_FILE, sheet_name=0)

vix_df = pd.read_excel(VOL_FILE, sheet_name="VIX")
move_df = pd.read_excel(VOL_FILE, sheet_name="MOVE") if "MOVE" in pd.ExcelFile(VOL_FILE).sheet_names else None

sp500 = pd.read_excel(SP500_FILE, sheet_name="SPY")

for d in (workshop, indexes, vix_df, sp500):
    if "Date" in d.columns:
        d["Date"] = _coerce_datetime(d["Date"])

if move_df is not None and "Date" in move_df.columns:
    move_df["Date"] = _coerce_datetime(move_df["Date"])

# Renamed
vix_px = require_col(vix_df, ["PX_LAST", "VIX", "Close"], "VIX level")
vix_df = vix_df.rename(columns={vix_px: "VIX_PX_LAST"})[["Date", "VIX_PX_LAST"]]

spx_px = require_col(sp500, ["PX_LAST", "SP500_PX_LAST", "Close"], "SP500 level")
# Keep volume too if it exists (useful for SPY notebook)
vol_col = first_present(sp500, ["PX_VOLUME", "VOLUME", "Volume", "VOL"])
keep_cols = ["Date", spx_px] + ([vol_col] if vol_col is not None else [])
sp500 = sp500[keep_cols].rename(columns={spx_px: "SP500_PX_LAST", vol_col: "SP500_PX_VOLUME" if vol_col is not None else vol_col})

if move_df is not None:
    move_px = require_col(move_df, ["PX_LAST", "MOVE", "Close"], "MOVE level")
    move_df = move_df.rename(columns={move_px: "MOVE_PX_LAST"})[["Date", "MOVE_PX_LAST"]]


In [4]:
# Merge to a unified panel 
df = workshop.merge(indexes, on="Date", how="inner").merge(vix_df, on="Date", how="inner").merge(sp500, on="Date", how="inner")
if move_df is not None:
    df = df.merge(move_df, on="Date", how="inner")

df = df.sort_values("Date").reset_index(drop=True)

In [5]:
# Target + factor engineering

lqd_px_col = require_col(
    df,
    ['LQD_PX_LAST', 'LQD PX_LAST', 'LQD', 'LQD Position', 'LQD Position Value', 'LQD Close', 'LQD Total Return', 'LQD Market Value'],
    "LQD price/value proxy"
)
df["r_lqd"] = safe_pct_change(df[lqd_px_col])

# Equity factor: S&P 500 index return
df["r_sp500"] = safe_pct_change(df["SP500_PX_LAST"])

# Vol factors
df["dvix"] = safe_diff(df["VIX_PX_LAST"])
df["r_vix"] = safe_pct_change(df["VIX_PX_LAST"])

if "MOVE_PX_LAST" in df.columns:
    df["dmove"] = safe_diff(df["MOVE_PX_LAST"])
    df["r_move"] = safe_pct_change(df["MOVE_PX_LAST"])

# Rates / credit spreads
teny = first_present(df, ["USGG10YR Index", "10Y_Yield", "US10Y", "DGS10", "UST_10Y", "10Y", "US 10Y", "YAS_BOND_YLD"])
twoy = first_present(df, ["USGG2YR Index", "2Y_Yield", "US2Y", "DGS2", "UST_2Y", "2Y", "US 2Y"])
if teny is not None:
    df["d_10y"] = safe_diff(df[teny])
if twoy is not None:
    df["d_2y"] = safe_diff(df[twoy])
if teny is not None and twoy is not None:
    df["d_curve_2s10s"] = safe_diff(df[teny] - df[twoy])

# Credit spreads
hy_oas = first_present(df, ["LF98OAS Index", "HYOAS", "HY_OAS", "HY OAS", "HY_Spread", "US High Yield OAS", "YAS_ISPREAD", "YAS_YLD_SPREAD"])
ig_oas = first_present(df, ["LF97OAS Index", "IGOAS", "IG_OAS", "IG OAS", "IG_Spread", "US Investment Grade OAS"])

if hy_oas is not None:
    df["d_hy_oas"] = safe_diff(df[hy_oas])
if ig_oas is not None:
    df["d_ig_oas"] = safe_diff(df[ig_oas])
if hy_oas is not None and ig_oas is not None:
    df["d_hy_ig_oas"] = safe_diff(df[hy_oas] - df[ig_oas])

# Liquidity proxy (volume)
lqd_vol_col = first_present(df, ['LQD_VOLUME', 'LQD Volume', 'LQD_VOLUME_SHARES', 'LQD_VOL', 'LQD_Volume'])
if lqd_vol_col is not None:
    df["dlog_lqd_vol"] = safe_dlog(df[lqd_vol_col])

# Also, SP500_PX_VOLUME may exist from SP500 file; keep as optional market volume factor
if "SP500_PX_VOLUME" in df.columns:
    df["dlog_sp500_vol"] = safe_dlog(df["SP500_PX_VOLUME"])

candidate_features = [
    "r_sp500", "dvix", "r_vix",
    "dmove", "r_move",
    "d_10y", "d_2y", "d_curve_2s10s",
    "d_hy_oas", "d_ig_oas", "d_hy_ig_oas",
    "dlog_lqd_vol",
    "dlog_sp500_vol",
]
features = [c for c in candidate_features if c in df.columns]

df_model = df[["Date", "r_lqd"] + features].set_index("Date")

print("Using LQD proxy column:", lqd_px_col)
print("Using features:", features)
df_model.tail()


Using LQD proxy column: LQD Position
Using features: ['r_sp500', 'dvix', 'r_vix', 'd_10y', 'd_hy_oas', 'dlog_sp500_vol']


Unnamed: 0_level_0,r_lqd,r_sp500,dvix,r_vix,d_10y,d_hy_oas,dlog_sp500_vol
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2025-12-24,0.003902,0.003222,-0.53,-0.037857,-0.0429,-1.9881,-0.828539
2025-12-26,-9e-05,-0.000304,0.13,0.009651,0.0067,0.7108,0.35592
2025-12-29,0.001356,-0.003492,0.6,0.044118,-0.0137,-0.6129,0.360544
2025-12-30,-0.001174,-0.001376,0.13,0.009155,0.0133,0.1862,-0.1761
2025-12-31,-0.004248,-0.007358,0.62,0.043266,0.054,1.6156,0.12142


In [6]:
# cleaning
X_df = df_model[features].apply(pd.to_numeric, errors="coerce").replace([np.inf, -np.inf], np.nan)
y_s = pd.to_numeric(df_model["r_lqd"], errors="coerce").replace([np.inf, -np.inf], np.nan)

mask_y = y_s.notna()
X_df = X_df.loc[mask_y]
y = y_s.loc[mask_y].values

print("Rows after y alignment:", len(y))

# Pipeline: median impute -> standardize -> PCA
pca = PCA(n_components=min(5, len(features)))  # keep up to 5 PCs for diagnostics; use first 2 for the main model
pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
    ("pca", pca),
])

X_pcs_all = pipe.fit_transform(X_df.values)

# Keep first 2 PCs for the regression model
X_pcs = X_pcs_all[:, :2]

Rows after y alignment: 1249


In [7]:
# PCA loadings (interpretation) 
# Convert PCA components into a readable table
components = pipe.named_steps["pca"].components_
loading = pd.DataFrame(components[:5], columns=features, index=[f"PC{i+1}" for i in range(min(5, components.shape[0]))])

# Loadings are in standardized factor space
display(loading.T.sort_values("PC1", ascending=False).head(15))
display(loading.T.sort_values("PC2", ascending=False).head(15))

Unnamed: 0,PC1,PC2,PC3,PC4,PC5
dvix,0.514946,0.191291,-0.260528,-0.150671,0.282643
r_vix,0.505094,0.221469,-0.261375,-0.212501,0.336349
d_hy_oas,0.392187,-0.308607,0.236509,0.80028,0.230843
d_10y,0.255968,-0.592542,0.530336,-0.539783,0.102908
dlog_sp500_vol,0.084557,0.684058,0.724171,0.004173,0.016603
r_sp500,-0.50322,-0.008931,0.046821,-0.017636,0.861875


Unnamed: 0,PC1,PC2,PC3,PC4,PC5
dlog_sp500_vol,0.084557,0.684058,0.724171,0.004173,0.016603
r_vix,0.505094,0.221469,-0.261375,-0.212501,0.336349
dvix,0.514946,0.191291,-0.260528,-0.150671,0.282643
r_sp500,-0.50322,-0.008931,0.046821,-0.017636,0.861875
d_hy_oas,0.392187,-0.308607,0.236509,0.80028,0.230843
d_10y,0.255968,-0.592542,0.530336,-0.539783,0.102908


In [8]:
# Linear Regression
lr = LinearRegression().fit(X_pcs, y)
yhat = lr.predict(X_pcs)

rmse = float(np.sqrt(mean_squared_error(y, yhat)))
print("In-sample R2:", round(r2_score(y, yhat), 4))
print("In-sample RMSE:", rmse)
print("alpha:", lr.intercept_)
print("betas (PC1, PC2):", lr.coef_)

In-sample R2: 0.5694
In-sample RMSE: 0.003611908626155255
alpha: -0.00014855647154470326
betas (PC1, PC2): [-0.00138118  0.00321126]


In [9]:
# Map PC betas back to original factors
beta_pc = lr.coef_[:2]                      
load_2 = components[:2, :]                 
exposure_std = beta_pc @ load_2          

exp_tbl = pd.DataFrame({"exposure_in_std_units": exposure_std}, index=features).sort_values("exposure_in_std_units", ascending=False)
display(exp_tbl)

Unnamed: 0,exposure_in_std_units
dlog_sp500_vol,0.00208
r_sp500,0.000666
r_vix,1.4e-05
dvix,-9.7e-05
d_hy_oas,-0.001533
d_10y,-0.002256


In [10]:
# OLS regression
X_ols = sm.add_constant(X_pcs)  # const + PC1 + PC2
ols = sm.OLS(y, X_ols).fit()
print(ols.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.569
Model:                            OLS   Adj. R-squared:                  0.569
Method:                 Least Squares   F-statistic:                     823.9
Date:                Fri, 16 Jan 2026   Prob (F-statistic):          1.03e-228
Time:                        00:10:02   Log-Likelihood:                 5251.5
No. Observations:                1249   AIC:                        -1.050e+04
Df Residuals:                    1246   BIC:                        -1.048e+04
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         -0.0001      0.000     -1.452      0.1

In [11]:
ridge = Ridge(alpha=1.0).fit(X_pcs, y)
yr = ridge.predict(X_pcs)
print("Ridge R2:", round(r2_score(y, yr), 4))
print("Ridge betas (PC1, PC2):", ridge.coef_)

Ridge R2: 0.5694
Ridge betas (PC1, PC2): [-0.00138084  0.00320888]


In [12]:
# Rolling window stability (1y)
window = 252  # ~1 trading year

betas = []
r2s = []

# Use a date index aligned to X_df / y
dates = X_df.index.to_numpy()

for end in range(window, len(y)):
    start = end - window
    Xw = X_pcs[start:end]
    yw = y[start:end]

    m = LinearRegression().fit(Xw, yw)
    ywh = m.predict(Xw)

    dt = dates[end]
    betas.append([dt, m.intercept_, m.coef_[0], m.coef_[1]])
    r2s.append([dt, r2_score(yw, ywh)])

betas = pd.DataFrame(betas, columns=["Date", "alpha", "beta_pc1", "beta_pc2"]).set_index("Date")
r2s = pd.DataFrame(r2s, columns=["Date", "rolling_r2"]).set_index("Date")

display(betas.tail())
display(r2s.tail())

Unnamed: 0_level_0,alpha,beta_pc1,beta_pc2
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2025-12-24,2.3e-05,-0.001029,0.002379
2025-12-26,6.4e-05,-0.00105,0.002378
2025-12-29,3.9e-05,-0.001046,0.002445
2025-12-30,1.7e-05,-0.001036,0.00247
2025-12-31,2.3e-05,-0.001036,0.002475


Unnamed: 0_level_0,rolling_r2
Date,Unnamed: 1_level_1
2025-12-24,0.495943
2025-12-26,0.49144
2025-12-29,0.493973
2025-12-30,0.496899
2025-12-31,0.497383


In [13]:
# Approx daily contribution

# Recreate imputed+scaled factors
X_imp = pipe.named_steps["imputer"].transform(X_df.values)
Xz = pipe.named_steps["scaler"].transform(X_imp)

contrib = pd.DataFrame(Xz * exposure_std, index=X_df.index, columns=features)

top = exp_tbl.index[:6].tolist()

contrib[top].tail()

Unnamed: 0_level_0,dlog_sp500_vol,r_sp500,r_vix,dvix,d_hy_oas,d_10y
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2025-12-24,-0.005836,0.000168,-7e-06,2.7e-05,0.00084,0.001604
2025-12-26,0.002508,-5.2e-05,1e-06,-7e-06,-0.000294,-0.00016
2025-12-29,0.00254,-0.000251,7e-06,-3.2e-05,0.000262,0.000565
2025-12-30,-0.00124,-0.000119,1e-06,-7e-06,-7.3e-05,-0.000395
2025-12-31,0.000856,-0.000492,7e-06,-3.3e-05,-0.000674,-0.001842
