### Factor Modeling
 

* Rolling frequency
    * Daily
    * Weekly
    * Monthly

In [2]:
# ============================================================
# Imports
# ============================================================

import pandas as pd
import numpy as np
import statsmodels.api as sm

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.pipeline import Pipeline

import warnings
warnings.filterwarnings("ignore")

# ============================================================
# Utility Functions
# ============================================================

def log_return(series):
    return np.log(series).diff()

def safe_diff(series):
    return series.diff()

# ============================================================
# Data Loading Helper
# ============================================================

def load_excel_series(path, sheet, column=None):
    df = pd.read_excel(path, sheet_name=sheet)
    df = df.set_index("Date").sort_index()
    return df[column] if column else df

# ============================================================
# Load Data
# ============================================================

portfolio = load_excel_series("Workshop Data.xlsx", "Portfolio")
lqd_vol   = load_excel_series("Workshop Data.xlsx", "Adj LQD", "Volume")

ig_index  = load_excel_series("Indexes and Spreads Data 01.09.xlsx", "IG Index")
ust10     = load_excel_series("Indexes and Spreads Data 01.09.xlsx", "10yUST Yields")
lqd_yas   = load_excel_series("Indexes and Spreads Data 01.09.xlsx", "LQD")

# ============================================================
# LQD Total Return Construction
# ============================================================

if "TotalReturnsLQD" not in portfolio.columns:
    portfolio["LQDCumDiv"] = portfolio["LQD Dividends"][::-1].cumsum()[::-1]
    portfolio["TotalReturnsLQD"] = (
        portfolio["LQD Position"] + portfolio["LQDCumDiv"]
    )

# ============================================================
# Equity Risk Proxy 
# ============================================================

equity_px = portfolio["SPY Position"] / -10
equity_return = log_return(equity_px)

# ============================================================
# Duration (IG – Long Duration)
# ============================================================

lqd_duration = lqd_yas["YAS_MOD_DUR"]

# ============================================================
# Rolling Frequency LQD Cross-Asset Model
# ============================================================

FREQUENCIES = {
    "Daily": "D",
    "Weekly": "W-FRI",
    "Monthly": "M"
}

results = {}

# ============================================================
# Loop Over Frequencies
# ============================================================

for label, freq in FREQUENCIES.items():
    print(f"\n\n================ {label.upper()} MODEL =================")

    # --------------------------------------------------------
    # Resample Raw Inputs
    # --------------------------------------------------------

    lqd_ret = portfolio["TotalReturnsLQD"].resample(freq).last().pct_change()

    equity_px_rs = equity_px.resample(freq).last()
    equity_ret = log_return(equity_px_rs)

    lqd_vol_rs = lqd_vol.resample(freq).sum()
    liquidity = log_return(lqd_vol_rs)

    ig_oas = ig_index["OAS_SOVEREIGN_CURVE"].resample(freq).last()
    credit = safe_diff(ig_oas)

    ust10_rs = ust10["PX_LAST"].resample(freq).last()
    d_ust10 = safe_diff(ust10_rs)

    duration = lqd_duration.resample(freq).last()
    rate = -duration * d_ust10

    # --------------------------------------------------------
    # Factor Matrix
    # --------------------------------------------------------

    factors_rs = pd.DataFrame({
        "Rate": rate,
        "Credit": credit,
        "Equity": equity_ret,
        "Liquidity": liquidity
    }).dropna()

    y = lqd_ret.loc[factors_rs.index]

    # --------------------------------------------------------
    # ---------------------- OLS ------------------------------
    # --------------------------------------------------------

    X_ols = sm.add_constant(factors_rs)
    ols = sm.OLS(y, X_ols).fit()

    print(f"\n--- OLS ({label}) ---")
    print(f"R²     : {ols.rsquared:.3f}")
    print(f"Adj R² : {ols.rsquared_adj:.3f}")
    print("DW     :", sm.stats.stattools.durbin_watson(ols.resid))

    # --------------------------------------------------------
    # --------------------- RIDGE -----------------------------
    # --------------------------------------------------------

    ridge = Pipeline([
        ("scaler", StandardScaler()),
        ("ridge", Ridge(alpha=10.0))
    ])

    ridge.fit(factors_rs, y)
    ridge_pred = ridge.predict(factors_rs)

    ridge_r2 = r2_score(y, ridge_pred)

    # --------------------------------------------------------
    # ------------------ RANDOM FOREST ------------------------
    # --------------------------------------------------------

    rf = RandomForestRegressor(
        n_estimators=500,
        max_depth=5,
        min_samples_leaf=20,
        random_state=42
    )

    rf.fit(factors_rs, y)
    rf_pred = rf.predict(factors_rs)

    rf_r2 = r2_score(y, rf_pred)

    # --------------------------------------------------------
    # Store Results
    # --------------------------------------------------------

    results[label] = {
        "OLS_R2": ols.rsquared,
        "Ridge_R2": ridge_r2,
        "RF_R2": rf_r2,
        "OLS_Betas": ols.params.drop("const"),
        "Ridge_Betas": pd.Series(
            ridge.named_steps["ridge"].coef_,
            index=factors_rs.columns
        ),
        "RF_Importance": pd.Series(
            rf.feature_importances_,
            index=factors_rs.columns
        ).sort_values(ascending=False)
    }

    print(f"Ridge R² : {ridge_r2:.3f}")
    print(f"RF R²    : {rf_r2:.3f}")

# ============================================================
# Summary Tables
# ============================================================

summary_r2 = pd.DataFrame({
    freq: {
        "OLS": results[freq]["OLS_R2"],
        "Ridge": results[freq]["Ridge_R2"],
        "RF": results[freq]["RF_R2"]
    }
    for freq in results
}).T

print("\n================ R² SUMMARY =================")
display(summary_r2)

print("\n================ OLS BETAS ==================")
display(pd.DataFrame({k: v["OLS_Betas"] for k, v in results.items()}))

print("\n========== RANDOM FOREST IMPORTANCE =========")
display(pd.DataFrame({k: v["RF_Importance"] for k, v in results.items()}))





--- OLS (Daily) ---
R²     : 0.854
Adj R² : 0.854
DW     : 2.5194549442315592
Ridge R² : 0.854
RF R²    : 0.826



--- OLS (Weekly) ---
R²     : 0.938
Adj R² : 0.937
DW     : 2.3285071615463795
Ridge R² : 0.936
RF R²    : 0.805



--- OLS (Monthly) ---
R²     : 0.976
Adj R² : 0.975
DW     : 2.3185687656390908
Ridge R² : 0.961
RF R²    : 0.189



Unnamed: 0,OLS,Ridge,RF
Daily,0.854275,0.854144,0.825893
Weekly,0.937581,0.936019,0.805403
Monthly,0.976275,0.960558,0.188593





Unnamed: 0,Daily,Weekly,Monthly
Rate,0.008442,0.008225,0.008243
Credit,-0.096574,-0.092213,-0.07615
Equity,-0.009323,0.011004,0.043324
Liquidity,5.8e-05,0.000253,0.005515





Unnamed: 0,Daily,Weekly,Monthly
Credit,0.113809,0.0728,0.0
Equity,0.002023,7e-05,0.0
Liquidity,0.000732,0.000988,0.0
Rate,0.883436,0.926142,1.0


### PCA Implementation

In [3]:
# Import libraries
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# ============================================================
# Utility Functions
# ============================================================

def log_return(series):
    return np.log(series).diff()

def safe_diff(series):
    return series.diff()

# ============================================================
# Data Loading Helper
# ============================================================

def load_excel_series(path, sheet, column=None):
    df = pd.read_excel(path, sheet_name=sheet)
    df = df.set_index("Date").sort_index()
    return df[column] if column else df

# ============================================================
# Load Data
# ============================================================

portfolio = load_excel_series("Workshop Data.xlsx", "Portfolio")
lqd_vol   = load_excel_series("Workshop Data.xlsx", "Adj LQD", "Volume")

ig_index  = load_excel_series("Indexes and Spreads Data 01.09.xlsx", "IG Index")
ust10     = load_excel_series("Indexes and Spreads Data 01.09.xlsx", "10yUST Yields")
lqd_yas   = load_excel_series("Indexes and Spreads Data 01.09.xlsx", "LQD")

# ============================================================
# LQD Total Return Construction
# ============================================================

if "TotalReturnsLQD" not in portfolio.columns:
    portfolio["LQDCumDiv"] = portfolio["LQD Dividends"][::-1].cumsum()[::-1]
    portfolio["TotalReturnsLQD"] = (
        portfolio["LQD Position"] + portfolio["LQDCumDiv"]
    )

# Equity proxy
equity_px = portfolio["SPY Position"] / -10

# ============================================================
# Frequencies
# ============================================================

FREQUENCIES = {
    "Daily": "D",
    "Weekly": "W-FRI",
    "Monthly": "M"
}

# ============================================================
# PCA Per Frequency
# ============================================================

pca_results = {}

for label, freq in FREQUENCIES.items():

    print(f"\n\n================ PCA ({label}) =================")

    # --------------------------------------------------------
    # Resample & Construct Variables
    # --------------------------------------------------------

    y = portfolio["TotalReturnsLQD"].resample(freq).last().pct_change()

    equity_ret = log_return(equity_px.resample(freq).last())
    liquidity = log_return(lqd_vol.resample(freq).sum())

    credit = safe_diff(
        ig_index["OAS_SOVEREIGN_CURVE"].resample(freq).last()
    )

    d_ust10 = safe_diff(
        ust10["PX_LAST"].resample(freq).last()
    )

    duration = lqd_yas["YAS_MOD_DUR"].resample(freq).last()
    rate = -duration * d_ust10

    # --------------------------------------------------------
    # Factor Matrix
    # --------------------------------------------------------

    factors = pd.DataFrame({
        "Rate": rate,
        "Credit": credit,
        "Equity": equity_ret,
        "Liquidity": liquidity
    }).dropna()

    y = y.loc[factors.index]

    # --------------------------------------------------------
    # Standardization
    # --------------------------------------------------------

    scaler = StandardScaler()
    factors_scaled = scaler.fit_transform(factors)

    # --------------------------------------------------------
    # PCA Fit
    # --------------------------------------------------------

    pca = PCA()
    pcs = pca.fit_transform(factors_scaled)

    pc_names = [f"PC{i+1}" for i in range(pcs.shape[1])]

    pcs_df = pd.DataFrame(
        pcs,
        index=factors.index,
        columns=pc_names
    )

    # --------------------------------------------------------
    # Explained Variance
    # --------------------------------------------------------

    explained_var = pd.Series(
        pca.explained_variance_ratio_,
        index=pc_names
    )

    print("\nExplained Variance (Cumulative):")
    display(explained_var.cumsum())

    # --------------------------------------------------------
    # PCA Loadings 
    # --------------------------------------------------------

    loadings = pd.DataFrame(
        pca.components_.T,
        index=factors.columns,
        columns=pc_names
    )

    print("\nPCA Loadings:")
    display(loadings)

    # --------------------------------------------------------
    # PCA Regression (First K PCs)
    # --------------------------------------------------------

    K = 2  # typically enough
    X_pca = sm.add_constant(pcs_df.iloc[:, :K])

    pca_model = sm.OLS(y, X_pca).fit()

    print("\nPCA Regression Summary:")
    print(pca_model.summary())

    # --------------------------------------------------------
    # Store Results
    # --------------------------------------------------------

    pca_results[label] = {
        "Explained_Variance": explained_var,
        "Loadings": loadings,
        "PCA_R2": pca_model.rsquared,
        "Model": pca_model
    }

# ============================================================
# Summary Across Frequencies
# ============================================================

print("\n================ PCA R² BY FREQUENCY ================")
display(
    pd.Series(
        {k: v["PCA_R2"] for k, v in pca_results.items()},
        name="PCA R²"
    )
)





Explained Variance (Cumulative):


PC1    0.319283
PC2    0.572220
PC3    0.815352
PC4    1.000000
dtype: float64


PCA Loadings:


Unnamed: 0,PC1,PC2,PC3,PC4
Rate,0.63544,0.135891,-0.407836,-0.64142
Credit,0.684415,-0.160179,-0.095525,0.704836
Equity,0.018486,0.97758,0.027498,0.207939
Liquidity,0.357002,0.014586,0.907628,-0.220335



PCA Regression Summary:
                            OLS Regression Results                            
Dep. Variable:        TotalReturnsLQD   R-squared:                       0.181
Model:                            OLS   Adj. R-squared:                  0.179
Method:                 Least Squares   F-statistic:                     106.9
Date:                Fri, 16 Jan 2026   Prob (F-statistic):           1.14e-42
Time:                        15:13:50   Log-Likelihood:                 3887.1
No. Observations:                 971   AIC:                            -7768.
Df Residuals:                     968   BIC:                            -7754.
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       7.928e-05      

PC1    0.317180
PC2    0.599298
PC3    0.811414
PC4    1.000000
dtype: float64


PCA Loadings:


Unnamed: 0,PC1,PC2,PC3,PC4
Rate,0.442617,0.504721,0.725769,-0.150354
Credit,0.673175,-0.167504,-0.148027,0.704887
Equity,0.229747,0.688938,-0.65945,-0.194183
Liquidity,0.546023,-0.492507,-0.128351,-0.665448



PCA Regression Summary:
                            OLS Regression Results                            
Dep. Variable:        TotalReturnsLQD   R-squared:                       0.355
Model:                            OLS   Adj. R-squared:                  0.350
Method:                 Least Squares   F-statistic:                     70.46
Date:                Fri, 16 Jan 2026   Prob (F-statistic):           4.18e-25
Time:                        15:13:50   Log-Likelihood:                 875.63
No. Observations:                 259   AIC:                            -1745.
Df Residuals:                     256   BIC:                            -1735.
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         -0.0013      

PC1    0.461212
PC2    0.695454
PC3    0.884540
PC4    1.000000
dtype: float64


PCA Loadings:


Unnamed: 0,PC1,PC2,PC3,PC4
Rate,-0.457016,0.304593,0.834764,-0.0391
Credit,0.60212,0.034943,0.35046,0.716525
Equity,-0.570217,0.30192,-0.391622,0.655995
Liquidity,0.321622,0.902688,-0.164255,-0.233952



PCA Regression Summary:
                            OLS Regression Results                            
Dep. Variable:        TotalReturnsLQD   R-squared:                       0.726
Model:                            OLS   Adj. R-squared:                  0.716
Method:                 Least Squares   F-statistic:                     74.04
Date:                Fri, 16 Jan 2026   Prob (F-statistic):           1.88e-16
Time:                        15:13:50   Log-Likelihood:                 172.31
No. Observations:                  59   AIC:                            -338.6
Df Residuals:                      56   BIC:                            -332.4
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         -0.0055      

Daily      0.180889
Weekly     0.355029
Monthly    0.725606
Name: PCA R², dtype: float64