### Factor Modeling
 

* Rolling frequency
    * Daily
    * Weekly
    * Monthly

In [1]:
# ============================================================
# Imports
# ============================================================

import pandas as pd
import numpy as np
import statsmodels.api as sm

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.pipeline import Pipeline

import warnings
warnings.filterwarnings("ignore")

# ============================================================
# Utility Functions
# ============================================================

def log_return(series):
    return np.log(series).diff()

def safe_diff(series):
    return series.diff()

# ============================================================
# Data Loading Helper
# ============================================================

def load_excel_series(path, sheet, column=None):
    df = pd.read_excel(path, sheet_name=sheet)
    df = df.set_index("Date").sort_index()
    return df[column] if column else df

# ============================================================
# Load Data
# ============================================================

portfolio = load_excel_series("Workshop Data.xlsx", "Portfolio")
hyg_vol   = load_excel_series("Workshop Data.xlsx", "Adj HYG", "Volume")

hy_index  = load_excel_series("Indexes and Spreads Data 01.09.xlsx", "IG Index")
ust10     = load_excel_series("Indexes and Spreads Data 01.09.xlsx", "10yUST Yields")
hyg_yas   = load_excel_series("Indexes and Spreads Data 01.09.xlsx", "HYG")

# ============================================================
# LQD Total Return Construction
# ============================================================

if "TotalReturnsHYG" not in portfolio.columns:
    portfolio["HYGCumDiv"] = portfolio["HYG Dividends"][::-1].cumsum()[::-1]
    portfolio["TotalReturnsHYG"] = (
        portfolio["HYG Position"] + portfolio["HYGCumDiv"]
    )

# ============================================================
# Equity Risk Proxy 
# ============================================================

equity_px = portfolio["SPY Position"] / -10
equity_return = log_return(equity_px)

# ============================================================
# Duration (IG – Long Duration)
# ============================================================

hyg_duration = hyg_yas["YAS_MOD_DUR"]

# ============================================================
# Rolling Frequency HYG Cross-Asset Model
# ============================================================

FREQUENCIES = {
    "Daily": "D",
    "Weekly": "W-FRI",
    "Monthly": "M"
}

results = {}

# ============================================================
# Loop Over Frequencies
# ============================================================

for label, freq in FREQUENCIES.items():
    print(f"\n\n================ {label.upper()} MODEL =================")

    # --------------------------------------------------------
    # Resample Raw Inputs
    # --------------------------------------------------------

    hyg_ret = portfolio["TotalReturnsHYG"].resample(freq).last().pct_change()

    equity_px_rs = equity_px.resample(freq).last()
    equity_ret = log_return(equity_px_rs)

    hyg_vol_rs = hyg_vol.resample(freq).sum()
    liquidity = log_return(hyg_vol_rs)

    hy_oas = hy_index["OAS_SOVEREIGN_CURVE"].resample(freq).last()
    credit = safe_diff(hy_oas)

    ust10_rs = ust10["PX_LAST"].resample(freq).last()
    d_ust10 = safe_diff(ust10_rs)

    duration = hyg_duration.resample(freq).last()
    rate = -duration * d_ust10

    # --------------------------------------------------------
    # Factor Matrix
    # --------------------------------------------------------

    factors_rs = pd.DataFrame({
        "Rate": rate,
        "Credit": credit,
        "Equity": equity_ret,
        "Liquidity": liquidity
    }).dropna()

    y = hyg_ret.loc[factors_rs.index]

    # --------------------------------------------------------
    # ---------------------- OLS ------------------------------
    # --------------------------------------------------------

    X_ols = sm.add_constant(factors_rs)
    ols = sm.OLS(y, X_ols).fit()

    print(f"\n--- OLS ({label}) ---")
    print(f"R²     : {ols.rsquared:.3f}")
    print(f"Adj R² : {ols.rsquared_adj:.3f}")
    print("DW     :", sm.stats.stattools.durbin_watson(ols.resid))

    # --------------------------------------------------------
    # --------------------- RIDGE -----------------------------
    # --------------------------------------------------------

    ridge = Pipeline([
        ("scaler", StandardScaler()),
        ("ridge", Ridge(alpha=10.0))
    ])

    ridge.fit(factors_rs, y)
    ridge_pred = ridge.predict(factors_rs)

    ridge_r2 = r2_score(y, ridge_pred)

    # --------------------------------------------------------
    # ------------------ RANDOM FOREST ------------------------
    # --------------------------------------------------------

    rf = RandomForestRegressor(
        n_estimators=500,
        max_depth=5,
        min_samples_leaf=20,
        random_state=42
    )

    rf.fit(factors_rs, y)
    rf_pred = rf.predict(factors_rs)

    rf_r2 = r2_score(y, rf_pred)

    # --------------------------------------------------------
    # Store Results
    # --------------------------------------------------------

    results[label] = {
        "OLS_R2": ols.rsquared,
        "Ridge_R2": ridge_r2,
        "RF_R2": rf_r2,
        "OLS_Betas": ols.params.drop("const"),
        "Ridge_Betas": pd.Series(
            ridge.named_steps["ridge"].coef_,
            index=factors_rs.columns
        ),
        "RF_Importance": pd.Series(
            rf.feature_importances_,
            index=factors_rs.columns
        ).sort_values(ascending=False)
    }

    print(f"Ridge R² : {ridge_r2:.3f}") 
    print(f"RF R²    : {rf_r2:.3f}")

# ============================================================
# Summary Tables
# ============================================================

summary_r2 = pd.DataFrame({
    freq: {
        "OLS": results[freq]["OLS_R2"],
        "Ridge": results[freq]["Ridge_R2"],
        "RF": results[freq]["RF_R2"]
    }
    for freq in results
}).T

print("\n================ R² SUMMARY =================")
display(summary_r2)

print("\n================ OLS BETAS ==================")
display(pd.DataFrame({k: v["OLS_Betas"] for k, v in results.items()}))

print("\n========== RANDOM FOREST IMPORTANCE =========")
display(pd.DataFrame({k: v["RF_Importance"] for k, v in results.items()}))





--- OLS (Daily) ---
R²     : 0.504
Adj R² : 0.502
DW     : 2.156322182864488
Ridge R² : 0.504
RF R²    : 0.515



--- OLS (Weekly) ---
R²     : 0.553
Adj R² : 0.546
DW     : 2.3756692482149813
Ridge R² : 0.552
RF R²    : 0.506



--- OLS (Monthly) ---
R²     : 0.784
Adj R² : 0.768
DW     : 2.6137263117007583
Ridge R² : 0.774
RF R²    : 0.101



Unnamed: 0,OLS,Ridge,RF
Daily,0.50406,0.503974,0.514656
Weekly,0.553249,0.552344,0.505803
Monthly,0.783627,0.7745,0.10089





Unnamed: 0,Daily,Weekly,Monthly
Rate,0.009399,0.00835,0.009276
Credit,-0.130342,-0.104055,-0.076181
Equity,-0.004399,0.000264,0.078894
Liquidity,-0.000822,-0.001694,0.00417





Unnamed: 0,Daily,Weekly,Monthly
Credit,0.50341,0.558709,0.020619
Equity,0.011272,0.006532,0.402062
Liquidity,0.060307,0.00613,0.0
Rate,0.425011,0.428629,0.57732


### PCA Implementation

In [2]:
# Import libraries
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# ============================================================
# Utility Functions
# ============================================================

def log_return(series):
    return np.log(series).diff()

def safe_diff(series):
    return series.diff()

# ============================================================
# Data Loading Helper
# ============================================================

def load_excel_series(path, sheet, column=None):
    df = pd.read_excel(path, sheet_name=sheet)
    df = df.set_index("Date").sort_index()
    return df[column] if column else df

# ============================================================
# Load Data
# ============================================================

portfolio = load_excel_series("Workshop Data.xlsx", "Portfolio")
hyg_vol   = load_excel_series("Workshop Data.xlsx", "Adj HYG", "Volume")

hy_index  = load_excel_series("Indexes and Spreads Data 01.09.xlsx", "HY Index")
ust10     = load_excel_series("Indexes and Spreads Data 01.09.xlsx", "10yUST Yields")
hyg_yas   = load_excel_series("Indexes and Spreads Data 01.09.xlsx", "HYG")

# ============================================================
# HYG Total Return Construction
# ============================================================

if "TotalReturnsHYG" not in portfolio.columns:
    portfolio["HYGCumDiv"] = portfolio["HYG Dividends"][::-1].cumsum()[::-1]
    portfolio["TotalReturnsHYG"] = (
        portfolio["HYG Position"] + portfolio["HYGCumDiv"]
    )

# Equity proxy
equity_px = portfolio["SPY Position"] / -10

# ============================================================
# Frequencies
# ============================================================

FREQUENCIES = {
    "Daily": "D",
    "Weekly": "W-FRI",
    "Monthly": "M"
}

# ============================================================
# PCA Per Frequency
# ============================================================

pca_results = {}

for label, freq in FREQUENCIES.items():

    print(f"\n\n================ PCA ({label}) =================")

    # --------------------------------------------------------
    # Resample & Construct Variables
    # --------------------------------------------------------

    y = portfolio["TotalReturnsHYG"].resample(freq).last().pct_change()

    equity_ret = log_return(equity_px.resample(freq).last())
    liquidity = log_return(hyg_vol.resample(freq).sum())

    credit = safe_diff(
        hy_index["OAS_SOVEREIGN_CURVE"].resample(freq).last()
    )

    d_ust10 = safe_diff(
        ust10["PX_LAST"].resample(freq).last()
    )

    duration = hyg_yas["YAS_MOD_DUR"].resample(freq).last()
    rate = -duration * d_ust10

    # --------------------------------------------------------
    # Factor Matrix
    # --------------------------------------------------------

    factors = pd.DataFrame({
        "Rate": rate,
        "Credit": credit,
        "Equity": equity_ret,
        "Liquidity": liquidity
    }).dropna()

    y = y.loc[factors.index]

    # --------------------------------------------------------
    # Standardization
    # --------------------------------------------------------

    scaler = StandardScaler()
    factors_scaled = scaler.fit_transform(factors)

    # --------------------------------------------------------
    # PCA Fit
    # --------------------------------------------------------

    pca = PCA()
    pcs = pca.fit_transform(factors_scaled)

    pc_names = [f"PC{i+1}" for i in range(pcs.shape[1])]

    pcs_df = pd.DataFrame(
        pcs,
        index=factors.index,
        columns=pc_names
    )

    # --------------------------------------------------------
    # Explained Variance
    # --------------------------------------------------------

    explained_var = pd.Series(
        pca.explained_variance_ratio_,
        index=pc_names
    )

    print("\nExplained Variance (Cumulative):")
    display(explained_var.cumsum())

    # --------------------------------------------------------
    # PCA Loadings 
    # --------------------------------------------------------

    loadings = pd.DataFrame(
        pca.components_.T,
        index=factors.columns,
        columns=pc_names
    )

    print("\nPCA Loadings:")
    display(loadings)

    # --------------------------------------------------------
    # PCA Regression (First K PCs)
    # --------------------------------------------------------

    K = 2  # typically enough
    X_pca = sm.add_constant(pcs_df.iloc[:, :K])

    pca_model = sm.OLS(y, X_pca).fit()

    print("\nPCA Regression Summary:")
    print(pca_model.summary())

    # --------------------------------------------------------
    # Store Results
    # --------------------------------------------------------

    pca_results[label] = {
        "Explained_Variance": explained_var,
        "Loadings": loadings,
        "PCA_R2": pca_model.rsquared,
        "Model": pca_model
    }

# ============================================================
# Summary Across Frequencies
# ============================================================

print("\n================ PCA R² BY FREQUENCY ================")
display(
    pd.Series(
        {k: v["PCA_R2"] for k, v in pca_results.items()},
        name="PCA R²"
    )
)





Explained Variance (Cumulative):


PC1    0.333900
PC2    0.596638
PC3    0.830769
PC4    1.000000
dtype: float64


PCA Loadings:


Unnamed: 0,PC1,PC2,PC3,PC4
Rate,0.66111,0.269463,-0.175899,-0.677778
Credit,0.696728,-0.001696,-0.113316,0.708327
Equity,-0.019538,0.766246,0.63057,0.12193
Liquidity,0.27771,-0.583313,0.747396,-0.154993



PCA Regression Summary:
                            OLS Regression Results                            
Dep. Variable:        TotalReturnsHYG   R-squared:                       0.046
Model:                            OLS   Adj. R-squared:                  0.044
Method:                 Least Squares   F-statistic:                     23.40
Date:                Fri, 16 Jan 2026   Prob (F-statistic):           1.19e-10
Time:                        15:15:08   Log-Likelihood:                 3975.0
No. Observations:                 971   AIC:                            -7944.
Df Residuals:                     968   BIC:                            -7929.
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       7.895e-06      

PC1    0.323919
PC2    0.602101
PC3    0.818102
PC4    1.000000
dtype: float64


PCA Loadings:


Unnamed: 0,PC1,PC2,PC3,PC4
Rate,0.308008,0.618167,0.723184,0.002202
Credit,0.670063,-0.200688,-0.115986,0.705186
Equity,0.231357,0.681397,-0.680565,-0.137852
Liquidity,0.634524,-0.336588,0.019581,-0.695488



PCA Regression Summary:
                            OLS Regression Results                            
Dep. Variable:        TotalReturnsHYG   R-squared:                       0.390
Model:                            OLS   Adj. R-squared:                  0.385
Method:                 Least Squares   F-statistic:                     81.73
Date:                Fri, 16 Jan 2026   Prob (F-statistic):           3.55e-28
Time:                        15:15:08   Log-Likelihood:                 931.75
No. Observations:                 259   AIC:                            -1857.
Df Residuals:                     256   BIC:                            -1847.
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         -0.0011      

PC1    0.495066
PC2    0.714672
PC3    0.909691
PC4    1.000000
dtype: float64


PCA Loadings:


Unnamed: 0,PC1,PC2,PC3,PC4
Rate,-0.441738,-0.000312,0.877947,-0.184601
Credit,0.584639,-0.104637,0.436255,0.675966
Equity,-0.558198,0.492915,-0.143635,0.651782
Liquidity,0.389205,0.863763,0.135132,-0.290125



PCA Regression Summary:
                            OLS Regression Results                            
Dep. Variable:        TotalReturnsHYG   R-squared:                       0.758
Model:                            OLS   Adj. R-squared:                  0.749
Method:                 Least Squares   F-statistic:                     87.49
Date:                Fri, 16 Jan 2026   Prob (F-statistic):           5.87e-18
Time:                        15:15:08   Log-Likelihood:                 193.89
No. Observations:                  59   AIC:                            -381.8
Df Residuals:                      56   BIC:                            -375.6
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         -0.0047      

Daily      0.046121
Weekly     0.389691
Monthly    0.757564
Name: PCA R², dtype: float64