### Factor Modeling
 

* Rolling frequency
    * Daily
    * Weekly
    * Monthly

In [2]:
# ============================================================
# Imports
# ============================================================

import pandas as pd
import numpy as np
import statsmodels.api as sm

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.pipeline import Pipeline

import warnings
warnings.filterwarnings("ignore")

# ============================================================
# Utility Functions
# ============================================================

def log_return(series):
    return np.log(series).diff()

def safe_diff(series):
    return series.diff()

# ============================================================
# Data Loading Helper
# ============================================================

def load_excel_series(path, sheet, column=None):
    df = pd.read_excel(path, sheet_name=sheet)
    df = df.set_index("Date").sort_index()
    return df[column] if column else df

# ============================================================
# Load Data
# ============================================================

portfolio = load_excel_series("Workshop Data.xlsx", "Portfolio")
lqd_vol   = load_excel_series("Workshop Data.xlsx", "Adj LQD", "Volume")

ig_index  = load_excel_series("Indexes and Spreads Data 01.09.xlsx", "IG Index")
ust10     = load_excel_series("Indexes and Spreads Data 01.09.xlsx", "10yUST Yields")
lqd_yas   = load_excel_series("Indexes and Spreads Data 01.09.xlsx", "LQD")

# ============================================================
# LQD Total Return Construction
# ============================================================

if "TotalReturnsLQD" not in portfolio.columns:
    portfolio["LQDCumDiv"] = portfolio["LQD Dividends"][::-1].cumsum()[::-1]
    portfolio["TotalReturnsLQD"] = (
        portfolio["LQD Position"] + portfolio["LQDCumDiv"]
    )

# ============================================================
# Equity Risk Proxy 
# ============================================================

equity_px = portfolio["SPY Position"] / -10
equity_return = log_return(equity_px)

# ============================================================
# Duration (IG – Long Duration)
# ============================================================

lqd_duration = lqd_yas["YAS_MOD_DUR"]

# ============================================================
# Rolling Frequency LQD Cross-Asset Model
# ============================================================

FREQUENCIES = {
    "Daily": "D",
    "Weekly": "W-FRI",
    "Monthly": "M"
}

results = {}

# ============================================================
# Loop Over Frequencies
# ============================================================

for label, freq in FREQUENCIES.items():
    print(f"\n\n================ {label.upper()} MODEL =================")

    # --------------------------------------------------------
    # Resample Raw Inputs
    # --------------------------------------------------------

    lqd_ret = portfolio["TotalReturnsLQD"].resample(freq).last().pct_change()

    equity_px_rs = equity_px.resample(freq).last()
    equity_ret = log_return(equity_px_rs)

    lqd_vol_rs = lqd_vol.resample(freq).sum()
    liquidity = log_return(lqd_vol_rs)

    ig_oas = ig_index["OAS_SOVEREIGN_CURVE"].resample(freq).last()
    credit = safe_diff(ig_oas)

    ust10_rs = ust10["PX_LAST"].resample(freq).last()
    d_ust10 = safe_diff(ust10_rs)

    duration = lqd_duration.resample(freq).last()
    rate = -duration * d_ust10

    # --------------------------------------------------------
    # Factor Matrix
    # --------------------------------------------------------

    factors_rs = pd.DataFrame({
        "Rate": rate,
        "Credit": credit,
        "Equity": equity_ret,
        "Liquidity": liquidity
    }).dropna()

    y = lqd_ret.loc[factors_rs.index]

    # --------------------------------------------------------
    # ---------------------- OLS ------------------------------
    # --------------------------------------------------------

    X_ols = sm.add_constant(factors_rs)
    ols = sm.OLS(y, X_ols).fit()

    print(f"\n--- OLS ({label}) ---")
    print(f"R²     : {ols.rsquared:.3f}")
    print(f"Adj R² : {ols.rsquared_adj:.3f}")
    print("DW     :", sm.stats.stattools.durbin_watson(ols.resid))

    # --------------------------------------------------------
    # --------------------- RIDGE -----------------------------
    # --------------------------------------------------------

    ridge = Pipeline([
        ("scaler", StandardScaler()),
        ("ridge", Ridge(alpha=10.0))
    ])

    ridge.fit(factors_rs, y)
    ridge_pred = ridge.predict(factors_rs)

    ridge_r2 = r2_score(y, ridge_pred)

    # --------------------------------------------------------
    # ------------------ RANDOM FOREST ------------------------
    # --------------------------------------------------------

    rf = RandomForestRegressor(
        n_estimators=500,
        max_depth=5,
        min_samples_leaf=20,
        random_state=42
    )

    rf.fit(factors_rs, y)
    rf_pred = rf.predict(factors_rs)

    rf_r2 = r2_score(y, rf_pred)

    # --------------------------------------------------------
    # Store Results
    # --------------------------------------------------------

    results[label] = {
        "OLS_R2": ols.rsquared,
        "Ridge_R2": ridge_r2,
        "RF_R2": rf_r2,
        "OLS_Betas": ols.params.drop("const"),
        "Ridge_Betas": pd.Series(
            ridge.named_steps["ridge"].coef_,
            index=factors_rs.columns
        ),
        "RF_Importance": pd.Series(
            rf.feature_importances_,
            index=factors_rs.columns
        ).sort_values(ascending=False)
    }

    print(f"Ridge R² : {ridge_r2:.3f}")
    print(f"RF R²    : {rf_r2:.3f}")

# ============================================================
# Summary Tables
# ============================================================

summary_r2 = pd.DataFrame({
    freq: {
        "OLS": results[freq]["OLS_R2"],
        "Ridge": results[freq]["Ridge_R2"],
        "RF": results[freq]["RF_R2"]
    }
    for freq in results
}).T

print("\n================ R² SUMMARY =================")
display(summary_r2)

print("\n================ OLS BETAS ==================")
display(pd.DataFrame({k: v["OLS_Betas"] for k, v in results.items()}))

print("\n========== RANDOM FOREST IMPORTANCE =========")
display(pd.DataFrame({k: v["RF_Importance"] for k, v in results.items()}))





--- OLS (Daily) ---
R²     : 0.880
Adj R² : 0.879
DW     : 2.4143504263167364
Ridge R² : 0.880
RF R²    : 0.839



--- OLS (Weekly) ---
R²     : 0.940
Adj R² : 0.939
DW     : 2.2729795045310572
Ridge R² : 0.938
RF R²    : 0.806



--- OLS (Monthly) ---
R²     : 0.977
Adj R² : 0.976
DW     : 2.3565752012316192
Ridge R² : 0.962
RF R²    : 0.189



Unnamed: 0,OLS,Ridge,RF
Daily,0.879856,0.879745,0.838962
Weekly,0.939868,0.938238,0.806184
Monthly,0.977411,0.96211,0.188593





Unnamed: 0,Daily,Weekly,Monthly
Rate,0.0081,0.008151,0.007944
Credit,-0.067832,-0.083946,-0.069159
Equity,0.084682,0.030285,0.0651
Liquidity,0.000125,0.000344,0.006139





Unnamed: 0,Daily,Weekly,Monthly
Rate,0.866134,0.923913,1.0
Credit,0.082216,0.068888,0.0
Equity,0.051191,0.00623,0.0
Liquidity,0.000459,0.000969,0.0


### PCA Implementation

In [3]:
# Import libraries
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# ============================================================
# Utility Functions
# ============================================================

def log_return(series):
    return np.log(series).diff()

def safe_diff(series):
    return series.diff()

# ============================================================
# Data Loading Helper
# ============================================================

def load_excel_series(path, sheet, column=None):
    df = pd.read_excel(path, sheet_name=sheet)
    df = df.set_index("Date").sort_index()
    return df[column] if column else df

# ============================================================
# Load Data
# ============================================================

portfolio = load_excel_series("Workshop Data.xlsx", "Portfolio")
lqd_vol   = load_excel_series("Workshop Data.xlsx", "Adj LQD", "Volume")

ig_index  = load_excel_series("Indexes and Spreads Data 01.09.xlsx", "IG Index")
ust10     = load_excel_series("Indexes and Spreads Data 01.09.xlsx", "10yUST Yields")
lqd_yas   = load_excel_series("Indexes and Spreads Data 01.09.xlsx", "LQD")

# ============================================================
# LQD Total Return Construction
# ============================================================

if "TotalReturnsLQD" not in portfolio.columns:
    portfolio["LQDCumDiv"] = portfolio["LQD Dividends"][::-1].cumsum()[::-1]
    portfolio["TotalReturnsLQD"] = (
        portfolio["LQD Position"] + portfolio["LQDCumDiv"]
    )

# Equity proxy
equity_px = portfolio["SPY Position"] / -10

# ============================================================
# Frequencies
# ============================================================

FREQUENCIES = {
    "Daily": "D",
    "Weekly": "W-FRI",
    "Monthly": "M"
}

# ============================================================
# PCA Per Frequency
# ============================================================

pca_results = {}

for label, freq in FREQUENCIES.items():

    print(f"\n\n================ PCA ({label}) =================")

    # --------------------------------------------------------
    # Resample & Construct Variables
    # --------------------------------------------------------

    y = portfolio["TotalReturnsLQD"].resample(freq).last().pct_change()

    equity_ret = log_return(equity_px.resample(freq).last())
    liquidity = log_return(lqd_vol.resample(freq).sum())

    credit = safe_diff(
        ig_index["OAS_SOVEREIGN_CURVE"].resample(freq).last()
    )

    d_ust10 = safe_diff(
        ust10["PX_LAST"].resample(freq).last()
    )

    duration = lqd_yas["YAS_MOD_DUR"].resample(freq).last()
    rate = -duration * d_ust10

    # --------------------------------------------------------
    # Factor Matrix
    # --------------------------------------------------------

    factors = pd.DataFrame({
        "Rate": rate,
        "Credit": credit,
        "Equity": equity_ret,
        "Liquidity": liquidity
    }).dropna()

    y = y.loc[factors.index]

    # --------------------------------------------------------
    # Standardization
    # --------------------------------------------------------

    scaler = StandardScaler()
    factors_scaled = scaler.fit_transform(factors)

    # --------------------------------------------------------
    # PCA Fit
    # --------------------------------------------------------

    pca = PCA()
    pcs = pca.fit_transform(factors_scaled)

    pc_names = [f"PC{i+1}" for i in range(pcs.shape[1])]

    pcs_df = pd.DataFrame(
        pcs,
        index=factors.index,
        columns=pc_names
    )

    # --------------------------------------------------------
    # Explained Variance
    # --------------------------------------------------------

    explained_var = pd.Series(
        pca.explained_variance_ratio_,
        index=pc_names
    )

    print("\nExplained Variance (Cumulative):")
    display(explained_var.cumsum())

    # --------------------------------------------------------
    # PCA Loadings 
    # --------------------------------------------------------

    loadings = pd.DataFrame(
        pca.components_.T,
        index=factors.columns,
        columns=pc_names
    )

    print("\nPCA Loadings:")
    display(loadings)

    # --------------------------------------------------------
    # PCA Regression (First K PCs)
    # --------------------------------------------------------

    K = 2  # typically enough
    X_pca = sm.add_constant(pcs_df.iloc[:, :K])

    pca_model = sm.OLS(y, X_pca).fit()

    print("\nPCA Regression Summary:")
    print(pca_model.summary())

    # --------------------------------------------------------
    # Store Results
    # --------------------------------------------------------

    pca_results[label] = {
        "Explained_Variance": explained_var,
        "Loadings": loadings,
        "PCA_R2": pca_model.rsquared,
        "Model": pca_model
    }

# ============================================================
# Summary Across Frequencies
# ============================================================

print("\n================ PCA R² BY FREQUENCY ================")
display(
    pd.Series(
        {k: v["PCA_R2"] for k, v in pca_results.items()},
        name="PCA R²"
    )
)





Explained Variance (Cumulative):


PC1    0.396344
PC2    0.657222
PC3    0.898658
PC4    1.000000
dtype: float64


PCA Loadings:


Unnamed: 0,PC1,PC2,PC3,PC4
Rate,0.237063,0.899839,-0.14391,-0.336719
Credit,0.700534,0.049964,-0.157787,0.694161
Equity,-0.631046,0.424973,0.130067,0.635816
Liquidity,0.234168,0.084799,0.968233,-0.022336



PCA Regression Summary:
                            OLS Regression Results                            
Dep. Variable:        TotalReturnsLQD   R-squared:                       0.824
Model:                            OLS   Adj. R-squared:                  0.824
Method:                 Least Squares   F-statistic:                     2267.
Date:                Fri, 16 Jan 2026   Prob (F-statistic):               0.00
Time:                        20:45:00   Log-Likelihood:                 4633.9
No. Observations:                 971   AIC:                            -9262.
Df Residuals:                     968   BIC:                            -9247.
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       7.928e-05   6.5

PC1    0.422206
PC2    0.682821
PC3    0.901386
PC4    1.000000
dtype: float64


PCA Loadings:


Unnamed: 0,PC1,PC2,PC3,PC4
Rate,0.019655,0.957925,-0.147389,-0.245498
Credit,0.657783,0.121526,-0.273642,0.691139
Equity,-0.644339,0.228409,0.267566,0.679018
Liquidity,0.389566,0.124259,0.912033,-0.031513



PCA Regression Summary:
                            OLS Regression Results                            
Dep. Variable:        TotalReturnsLQD   R-squared:                       0.860
Model:                            OLS   Adj. R-squared:                  0.858
Method:                 Least Squares   F-statistic:                     783.4
Date:                Fri, 16 Jan 2026   Prob (F-statistic):          7.57e-110
Time:                        20:45:01   Log-Likelihood:                 1073.0
No. Observations:                 259   AIC:                            -2140.
Df Residuals:                     256   BIC:                            -2129.
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         -0.0013      

PC1    0.531068
PC2    0.763816
PC3    0.934629
PC4    1.000000
dtype: float64


PCA Loadings:


Unnamed: 0,PC1,PC2,PC3,PC4
Rate,0.472059,0.423893,0.693956,-0.340441
Credit,-0.558169,0.089015,0.60201,0.564011
Equity,0.616067,0.17673,-0.158577,0.751054
Liquidity,-0.293376,0.883831,-0.36175,-0.043706



PCA Regression Summary:
                            OLS Regression Results                            
Dep. Variable:        TotalReturnsLQD   R-squared:                       0.859
Model:                            OLS   Adj. R-squared:                  0.854
Method:                 Least Squares   F-statistic:                     170.6
Date:                Fri, 16 Jan 2026   Prob (F-statistic):           1.51e-24
Time:                        20:45:01   Log-Likelihood:                 191.95
No. Observations:                  59   AIC:                            -377.9
Df Residuals:                      56   BIC:                            -371.7
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         -0.0055      

Daily      0.824052
Weekly     0.859559
Monthly    0.858997
Name: PCA R², dtype: float64