In [3]:
# ============================================================
# Imports
# ============================================================

import pandas as pd
import numpy as np
import statsmodels.api as sm

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.pipeline import Pipeline

import warnings
warnings.filterwarnings("ignore")

# ============================================================
# Utility Functions
# ============================================================

def log_return(series):
    """Return the first difference of the natural log of ``series``.

    The leading element of the result has no predecessor and is therefore
    missing (NaN).
    """
    log_levels = np.log(series)
    return log_levels.diff()

def safe_diff(series):
    """Return the one-period change of ``series`` (value minus prior value).

    This is a thin wrapper around ``.diff``; the first element is NaN.
    """
    change = series.diff(periods=1)
    return change

# ============================================================
# Data Loading Helper
# ============================================================

def load_excel_series(path, sheet, column=None):
    """Load one worksheet, indexed by its ``Date`` column and sorted by it.

    Parameters
    ----------
    path : str or path-like
        Workbook location, forwarded to ``pandas.read_excel``.
    sheet : str or int
        Worksheet selector, forwarded as ``sheet_name``.
    column : hashable, optional
        Label of the single column to return. When ``None`` (the default)
        the whole frame is returned.

    Returns
    -------
    pandas.Series or pandas.DataFrame
        The selected column, or the full sheet when ``column`` is ``None``.

    Raises
    ------
    KeyError
        If the sheet has no ``Date`` column, or ``column`` is not present.
    """
    df = pd.read_excel(path, sheet_name=sheet)
    df = df.set_index("Date").sort_index()
    # Test against None explicitly: a falsy label (e.g. integer column 0) is
    # still a legitimate column selection and must not return the whole frame.
    if column is None:
        return df
    return df[column]

# ============================================================
# Load Data
# ============================================================

# Sheets read from "Workshop Data.xlsx": the full "Portfolio" sheet and the
# "Volume" column of the "Adj LQD" sheet.
portfolio = load_excel_series(path="Workshop Data.xlsx", sheet="Portfolio")
lqd_vol = load_excel_series(
    path="Workshop Data.xlsx", sheet="Adj LQD", column="Volume"
)

# Sheets read in full from "Indexes and Spreads Data 01.09.xlsx".
ig_index = load_excel_series(
    path="Indexes and Spreads Data 01.09.xlsx", sheet="IG Index"
)
ust10 = load_excel_series(
    path="Indexes and Spreads Data 01.09.xlsx", sheet="10yUST Yields"
)
lqd_yas = load_excel_series(
    path="Indexes and Spreads Data 01.09.xlsx", sheet="LQD"
)

# ============================================================
# LQD Total Return Construction
# ============================================================

# Only derive the total-return level when the sheet does not already supply it.
if "TotalReturnsLQD" not in portfolio.columns:
    # Reverse, accumulate, reverse back: every row receives the sum of the
    # dividends from that row through the last row of the frame.
    # NOTE(review): `portfolio` is sorted ascending by Date when loaded, so this
    # is a sum of current and *future* dividends rather than dividends received
    # to date — confirm this is the intended construction.
    portfolio["LQDCumDiv"] = (
        portfolio["LQD Dividends"].iloc[::-1].cumsum().iloc[::-1]
    )
    portfolio["TotalReturnsLQD"] = portfolio["LQD Position"].add(
        portfolio["LQDCumDiv"]
    )

# ============================================================
# Dependent Variable (LQD Return)
# ============================================================

# Simple (arithmetic) period-over-period change of the total-return level.
lqd_return = portfolio["TotalReturnsLQD"].pct_change()

# ============================================================
# Equity Risk Proxy (Optional)
# ============================================================

# NOTE(review): the -10 divisor presumably turns the value of a short
# 10-share SPY position into a positive per-share price — confirm against
# the "Portfolio" sheet before relying on it.
equity_px = portfolio["SPY Position"].div(-10)
equity_return = log_return(equity_px)

# ============================================================
# Duration (IG – Long Duration)
# ============================================================

lqd_duration = lqd_yas["YAS_MOD_DUR"]

# ============================================================
# Factor Construction (Economically Minimal)
# ============================================================

# Credit: one-period change in the IG index "OAS_SOVEREIGN_CURVE" field
credit = safe_diff(ig_index["OAS_SOVEREIGN_CURVE"])

# Rates: duration-adjusted Treasury move (sign flipped so that a rise in the
# 10y "PX_LAST" series maps to a negative factor value)
d_ust10 = safe_diff(ust10["PX_LAST"])
rate = -lqd_duration * d_ust10

# Liquidity: log change in LQD volume
liquidity = log_return(lqd_vol)

# Assemble factor matrix; pandas aligns the four series on the union of
# their indexes, leaving NaN where a series has no observation.
factors = pd.DataFrame({
    "Rate": rate,
    "Credit": credit,
    "Equity": equity_return,
    "Liquidity": liquidity
})

# Align and clean.
# dropna() alone does not remove +/-inf (e.g. the log of a zero volume), so
# convert infinities to NaN first and drop them together with the gaps.
factors = factors.replace([np.inf, -np.inf], np.nan).dropna()

# reindex (rather than .loc) tolerates factor dates that are absent from the
# target; those rows become NaN and are removed by the finite-value mask below.
lqd_return = lqd_return.reindex(factors.index)

# The target can itself be missing or infinite on a date where every factor is
# valid; keep only rows where it is finite so the downstream fits never see
# NaN/inf in y.
_target_ok = np.isfinite(lqd_return)
factors = factors.loc[_target_ok]
lqd_return = lqd_return.loc[_target_ok]

# ============================================================
# --------------------- OLS REGRESSION -----------------------
# ============================================================

# Target and design matrix; add_constant supplies the intercept column.
y = lqd_return
X_ols = sm.add_constant(factors)

ols_model = sm.OLS(endog=y, exog=X_ols).fit()

print("\n================ OLS Regression (LQD) =================")
print(ols_model.summary())

# Diagnostics: conditioning of the (unscaled) design matrix, in-sample fit,
# and the Durbin-Watson statistic of the residuals.
print("\nOLS Diagnostics")
print(f"Condition Number : {np.linalg.cond(X_ols):.2f}")
print(f"R²              : {ols_model.rsquared:.3f}")
print(f"Adj R²          : {ols_model.rsquared_adj:.3f}")
print(
    "Durbin-Watson   :",
    sm.stats.stattools.durbin_watson(ols_model.resid),
)

# ============================================================
# --------------------- RIDGE REGRESSION ---------------------
# ============================================================

# Standardize the factors, then fit an L2-penalized linear model on them.
ridge_pipeline = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("ridge", Ridge(alpha=10.0)),
])

# fit() returns the fitted pipeline, so the in-sample prediction is chained.
ridge_pred = ridge_pipeline.fit(factors, y).predict(factors)

# Both metrics are in-sample: predictions are made on the fitting data.
print("\n================ Ridge Regression ====================")
print(f"R²  : {r2_score(y, ridge_pred):.3f}")
print(f"MSE : {mean_squared_error(y, ridge_pred):.6f}")

# Coefficients apply to the standardized factors produced by the scaler step.
ridge_betas = pd.Series(
    data=ridge_pipeline.named_steps["ridge"].coef_,
    index=factors.columns,
)

print("\nRidge Betas:")
display(ridge_betas)

# ============================================================
# ------------------ RANDOM FOREST MODEL ---------------------
# ============================================================

# Shallow, heavily regularized forest with a fixed seed for repeatable output.
rf_model = RandomForestRegressor(
    random_state=42,
    n_estimators=500,
    max_depth=5,
    min_samples_leaf=20,
)

# fit() returns the estimator itself, so fitting and predicting are chained.
rf_pred = rf_model.fit(factors, y).predict(factors)

# Both metrics are in-sample: predictions are made on the fitting data.
print("\n================ Random Forest =======================")
print(f"R²  : {r2_score(y, rf_pred):.3f}")
print(f"MSE : {mean_squared_error(y, rf_pred):.6f}")

# Label each importance with its factor name, then rank from largest down.
rf_importance = pd.Series(
    data=rf_model.feature_importances_,
    index=factors.columns,
)
rf_importance = rf_importance.sort_values(ascending=False)

print("\nRandom Forest Feature Importance:")
display(rf_importance)

# ============================================================
# ---------------- FACTOR ATTRIBUTION (OLS) -----------------
# ============================================================

# Slope coefficients only; the intercept is left out of the attribution, so
# "Total" below is the sum of the factor contributions, not the fitted value.
betas = ols_model.params.drop(labels="const")

# Column-wise scaling: each factor column is multiplied by its own beta.
attribution = factors * betas
attribution = attribution.assign(Total=attribution.sum(axis="columns"))

print("\nLatest OLS Factor Attribution (LQD):")
display(attribution.tail())

# ============================================================
# ----------------- FACTOR CORRELATION ----------------------
# ============================================================

# Pairwise correlation of the raw (unscaled) factor columns.
print("\nFactor Correlation Matrix:")
display(factors.corr())



                            OLS Regression Results                            
Dep. Variable:        TotalReturnsLQD   R-squared:                       0.850
Model:                            OLS   Adj. R-squared:                  0.849
Method:                 Least Squares   F-statistic:                     1755.
Date:                Wed, 14 Jan 2026   Prob (F-statistic):               0.00
Time:                        18:02:37   Log-Likelihood:                 6015.5
No. Observations:                1244   AIC:                        -1.202e+04
Df Residuals:                    1239   BIC:                        -1.200e+04
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         -0.0001   5.47e-05     -1.923      0.

Rate         0.004546
Credit      -0.001683
Equity      -0.000038
Liquidity    0.000025
dtype: float64


R²  : 0.837
MSE : 0.000004

Random Forest Feature Importance:


Rate         0.892962
Credit       0.104926
Equity       0.001614
Liquidity    0.000498
dtype: float64


Latest OLS Factor Attribution (LQD):


Unnamed: 0_level_0,Rate,Credit,Equity,Liquidity,Total
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2025-12-24,0.002077,0.000631,-7e-06,-6.7e-05,0.002633
2025-12-26,0.00041,0.000319,-2.5e-05,3.7e-05,0.000741
2025-12-29,0.001239,-0.000726,-2.2e-05,4.2e-05,0.000534
2025-12-30,-0.000827,0.000287,1.2e-05,9e-06,-0.00052
2025-12-31,-0.003177,-0.000513,2e-06,-8e-06,-0.003696



Factor Correlation Matrix:


Unnamed: 0,Rate,Credit,Equity,Liquidity
Rate,1.0,0.192887,0.021479,0.034206
Credit,0.192887,1.0,-0.015776,0.108033
Equity,0.021479,-0.015776,1.0,-0.003292
Liquidity,0.034206,0.108033,-0.003292,1.0


In [None]:
# ============================================================
# PCA on Cross-Asset LQD Factors
# ============================================================

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA


# ------------------------------------------------------------
# Standardize Factors
# ------------------------------------------------------------

scaler = StandardScaler()
X_scaled = scaler.fit_transform(factors)

# ------------------------------------------------------------
# Run PCA
# ------------------------------------------------------------

pca = PCA()
X_pca = pca.fit_transform(X_scaled)

# PCA results
pca_components = pd.DataFrame(
    pca.components_,
    columns=factors.columns,
    index=[f"PC{i+1}" for i in range(len(factors.columns))]
)

explained_variance = pd.Series(
    pca.explained_variance_ratio_,
    index=pca_components.index
)

print("\n================ PCA Explained Variance ================")
display(explained_variance)

print("\n================ PCA Loadings ==========================")
display(pca_components)

# ------------------------------------------------------------
# Scree Table
# ------------------------------------------------------------

pca_summary = pd.DataFrame({
    "Explained Variance": explained_variance,
    "Cumulative Variance": explained_variance.cumsum()
})

print("\n================ PCA Summary ===========================")
display(pca_summary)

# ------------------------------------------------------------
# Project Factors into PCA Space
# ------------------------------------------------------------

# Keep the first three principal-component scores, labelled and date-indexed
# like the original factor matrix.
pca_factors = pd.DataFrame(
    data=X_pca[:, :3],
    columns=["PC1", "PC2", "PC3"],
    index=factors.index,
)

# ------------------------------------------------------------
# Regression on Principal Components
# ------------------------------------------------------------

# Regress the LQD return on the retained components plus an intercept.
X_pc_ols = sm.add_constant(pca_factors)
pc_model = sm.OLS(
    endog=lqd_return.loc[pca_factors.index],
    exog=X_pc_ols,
).fit()

print("\n================ PCA Regression (LQD) ==================")
print(pc_model.summary())





PC1    0.309197
PC2    0.251656
PC3    0.241791
PC4    0.197355
dtype: float64




Unnamed: 0,Rate,Credit,Equity,Liquidity
PC1,0.613231,0.681456,0.00467,0.399428
PC2,0.213548,-0.05399,0.943707,-0.246777
PC3,-0.427209,-0.111145,0.310461,0.841875
PC4,-0.629156,0.721353,0.114056,-0.266092





Unnamed: 0,Explained Variance,Cumulative Variance
PC1,0.309197,0.309197
PC2,0.251656,0.560853
PC3,0.241791,0.802645
PC4,0.197355,1.0



                            OLS Regression Results                            
Dep. Variable:        TotalReturnsLQD   R-squared:                       0.304
Model:                            OLS   Adj. R-squared:                  0.302
Method:                 Least Squares   F-statistic:                     180.3
Date:                Wed, 14 Jan 2026   Prob (F-statistic):           5.01e-97
Time:                        18:17:26   Log-Likelihood:                 5060.9
No. Observations:                1244   AIC:                        -1.011e+04
Df Residuals:                    1240   BIC:                        -1.009e+04
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         -0.0003      0.000     -2.257      0.