### Factor Modeling
 

* Rolling frequency
    * Daily
    * Weekly
    * Monthly

In [2]:
# ============================================================
# Imports
# ============================================================

import pandas as pd
import numpy as np
import statsmodels.api as sm

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.pipeline import Pipeline

import warnings
warnings.filterwarnings("ignore")

# ============================================================
# Utility Functions
# ============================================================

def log_return(series):
    return np.log(series).diff()

def safe_diff(series):
    return series.diff()

# ============================================================
# Data Loading Helper
# ============================================================

def load_excel_series(path, sheet, column=None):
    df = pd.read_excel(path, sheet_name=sheet)
    df = df.set_index("Date").sort_index()
    return df[column] if column else df

# ============================================================
# Load Data
# ============================================================

portfolio = load_excel_series("Workshop Data.xlsx", "Portfolio")
hyg_vol   = load_excel_series("Workshop Data.xlsx", "Adj HYG", "Volume")

hy_index  = load_excel_series("Indexes and Spreads Data 01.09.xlsx", "IG Index")
ust10     = load_excel_series("Indexes and Spreads Data 01.09.xlsx", "10yUST Yields")
hyg_yas   = load_excel_series("Indexes and Spreads Data 01.09.xlsx", "HYG")

# ============================================================
# LQD Total Return Construction
# ============================================================

if "TotalReturnsHYG" not in portfolio.columns:
    portfolio["HYGCumDiv"] = portfolio["HYG Dividends"][::-1].cumsum()[::-1]
    portfolio["TotalReturnsHYG"] = (
        portfolio["HYG Position"] + portfolio["HYGCumDiv"]
    )

# ============================================================
# Equity Risk Proxy 
# ============================================================

equity_px = portfolio["SPY Position"] / -10
equity_return = log_return(equity_px)

# ============================================================
# Duration (IG – Long Duration)
# ============================================================

hyg_duration = hyg_yas["YAS_MOD_DUR"]

# ============================================================
# Rolling Frequency HYG Cross-Asset Model
# ============================================================

FREQUENCIES = {
    "Daily": "D",
    "Weekly": "W-FRI",
    "Monthly": "M"
}

results = {}

# ============================================================
# Loop Over Frequencies
# ============================================================

for label, freq in FREQUENCIES.items():
    print(f"\n\n================ {label.upper()} MODEL =================")

    # --------------------------------------------------------
    # Resample Raw Inputs
    # --------------------------------------------------------

    hyg_ret = portfolio["TotalReturnsHYG"].resample(freq).last().pct_change()

    equity_px_rs = equity_px.resample(freq).last()
    equity_ret = log_return(equity_px_rs)

    hyg_vol_rs = hyg_vol.resample(freq).sum()
    liquidity = log_return(hyg_vol_rs)

    hy_oas = hy_index["OAS_SOVEREIGN_CURVE"].resample(freq).last()
    credit = safe_diff(hy_oas)

    ust10_rs = ust10["PX_LAST"].resample(freq).last()
    d_ust10 = safe_diff(ust10_rs)

    duration = hyg_duration.resample(freq).last()
    rate = -duration * d_ust10

    # --------------------------------------------------------
    # Factor Matrix
    # --------------------------------------------------------

    factors_rs = pd.DataFrame({
        "Rate": rate,
        "Credit": credit,
        "Equity": equity_ret,
        "Liquidity": liquidity
    }).dropna()

    y = hyg_ret.loc[factors_rs.index]

    # --------------------------------------------------------
    # ---------------------- OLS ------------------------------
    # --------------------------------------------------------

    X_ols = sm.add_constant(factors_rs)
    ols = sm.OLS(y, X_ols).fit()

    print(f"\n--- OLS ({label}) ---")
    print(f"R²     : {ols.rsquared:.3f}")
    print(f"Adj R² : {ols.rsquared_adj:.3f}")
    print("DW     :", sm.stats.stattools.durbin_watson(ols.resid))

    # --------------------------------------------------------
    # --------------------- RIDGE -----------------------------
    # --------------------------------------------------------

    ridge = Pipeline([
        ("scaler", StandardScaler()),
        ("ridge", Ridge(alpha=10.0))
    ])

    ridge.fit(factors_rs, y)
    ridge_pred = ridge.predict(factors_rs)

    ridge_r2 = r2_score(y, ridge_pred)

    # --------------------------------------------------------
    # ------------------ RANDOM FOREST ------------------------
    # --------------------------------------------------------

    rf = RandomForestRegressor(
        n_estimators=500,
        max_depth=5,
        min_samples_leaf=20,
        random_state=42
    )

    rf.fit(factors_rs, y)
    rf_pred = rf.predict(factors_rs)

    rf_r2 = r2_score(y, rf_pred)

    # --------------------------------------------------------
    # Store Results
    # --------------------------------------------------------

    results[label] = {
        "OLS_R2": ols.rsquared,
        "Ridge_R2": ridge_r2,
        "RF_R2": rf_r2,
        "OLS_Betas": ols.params.drop("const"),
        "Ridge_Betas": pd.Series(
            ridge.named_steps["ridge"].coef_,
            index=factors_rs.columns
        ),
        "RF_Importance": pd.Series(
            rf.feature_importances_,
            index=factors_rs.columns
        ).sort_values(ascending=False)
    }

    print(f"Ridge R² : {ridge_r2:.3f}") 
    print(f"RF R²    : {rf_r2:.3f}")

# ============================================================
# Summary Tables
# ============================================================

summary_r2 = pd.DataFrame({
    freq: {
        "OLS": results[freq]["OLS_R2"],
        "Ridge": results[freq]["Ridge_R2"],
        "RF": results[freq]["RF_R2"]
    }
    for freq in results
}).T

print("\n================ R² SUMMARY =================")
display(summary_r2)

print("\n================ OLS BETAS ==================")
display(pd.DataFrame({k: v["OLS_Betas"] for k, v in results.items()}))

print("\n========== RANDOM FOREST IMPORTANCE =========")
display(pd.DataFrame({k: v["RF_Importance"] for k, v in results.items()}))





--- OLS (Daily) ---
R²     : 0.758
Adj R² : 0.757
DW     : 1.917279720927901
Ridge R² : 0.758
RF R²    : 0.702



--- OLS (Weekly) ---
R²     : 0.729
Adj R² : 0.725
DW     : 2.135117143098064
Ridge R² : 0.729
RF R²    : 0.618



--- OLS (Monthly) ---
R²     : 0.846
Adj R² : 0.834
DW     : 2.7111814165438495
Ridge R² : 0.839
RF R²    : 0.118



Unnamed: 0,OLS,Ridge,RF
Daily,0.758076,0.758022,0.70195
Weekly,0.729157,0.728593,0.617605
Monthly,0.845683,0.838936,0.117889





Unnamed: 0,Daily,Weekly,Monthly
Rate,0.007306,0.006652,0.006851
Credit,-0.055562,-0.052387,-0.046578
Equity,0.225696,0.199028,0.194707
Liquidity,-9.3e-05,8e-05,0.005978





Unnamed: 0,Daily,Weekly,Monthly
Equity,0.766815,0.80783,0.762887
Rate,0.178636,0.155409,0.237113
Credit,0.044889,0.035701,0.0
Liquidity,0.00966,0.00106,0.0


### PCA Implementation

In [3]:
# Import libraries
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# ============================================================
# Utility Functions
# ============================================================

def log_return(series):
    return np.log(series).diff()

def safe_diff(series):
    return series.diff()

# ============================================================
# Data Loading Helper
# ============================================================

def load_excel_series(path, sheet, column=None):
    df = pd.read_excel(path, sheet_name=sheet)
    df = df.set_index("Date").sort_index()
    return df[column] if column else df

# ============================================================
# Load Data
# ============================================================

portfolio = load_excel_series("Workshop Data.xlsx", "Portfolio")
hyg_vol   = load_excel_series("Workshop Data.xlsx", "Adj HYG", "Volume")

hy_index  = load_excel_series("Indexes and Spreads Data 01.09.xlsx", "HY Index")
ust10     = load_excel_series("Indexes and Spreads Data 01.09.xlsx", "10yUST Yields")
hyg_yas   = load_excel_series("Indexes and Spreads Data 01.09.xlsx", "HYG")

# ============================================================
# HYG Total Return Construction
# ============================================================

if "TotalReturnsHYG" not in portfolio.columns:
    portfolio["HYGCumDiv"] = portfolio["HYG Dividends"][::-1].cumsum()[::-1]
    portfolio["TotalReturnsHYG"] = (
        portfolio["HYG Position"] + portfolio["HYGCumDiv"]
    )

# Equity proxy
equity_px = portfolio["SPY Position"] / -10

# ============================================================
# Frequencies
# ============================================================

FREQUENCIES = {
    "Daily": "D",
    "Weekly": "W-FRI",
    "Monthly": "M"
}

# ============================================================
# PCA Per Frequency
# ============================================================

pca_results = {}

for label, freq in FREQUENCIES.items():

    print(f"\n\n================ PCA ({label}) =================")

    # --------------------------------------------------------
    # Resample & Construct Variables
    # --------------------------------------------------------

    y = portfolio["TotalReturnsHYG"].resample(freq).last().pct_change()

    equity_ret = log_return(equity_px.resample(freq).last())
    liquidity = log_return(hyg_vol.resample(freq).sum())

    credit = safe_diff(
        hy_index["OAS_SOVEREIGN_CURVE"].resample(freq).last()
    )

    d_ust10 = safe_diff(
        ust10["PX_LAST"].resample(freq).last()
    )

    duration = hyg_yas["YAS_MOD_DUR"].resample(freq).last()
    rate = -duration * d_ust10

    # --------------------------------------------------------
    # Factor Matrix
    # --------------------------------------------------------

    factors = pd.DataFrame({
        "Rate": rate,
        "Credit": credit,
        "Equity": equity_ret,
        "Liquidity": liquidity
    }).dropna()

    y = y.loc[factors.index]

    # --------------------------------------------------------
    # Standardization
    # --------------------------------------------------------

    scaler = StandardScaler()
    factors_scaled = scaler.fit_transform(factors)

    # --------------------------------------------------------
    # PCA Fit
    # --------------------------------------------------------

    pca = PCA()
    pcs = pca.fit_transform(factors_scaled)

    pc_names = [f"PC{i+1}" for i in range(pcs.shape[1])]

    pcs_df = pd.DataFrame(
        pcs,
        index=factors.index,
        columns=pc_names
    )

    # --------------------------------------------------------
    # Explained Variance
    # --------------------------------------------------------

    explained_var = pd.Series(
        pca.explained_variance_ratio_,
        index=pc_names
    )

    print("\nExplained Variance (Cumulative):")
    display(explained_var.cumsum())

    # --------------------------------------------------------
    # PCA Loadings 
    # --------------------------------------------------------

    loadings = pd.DataFrame(
        pca.components_.T,
        index=factors.columns,
        columns=pc_names
    )

    print("\nPCA Loadings:")
    display(loadings)

    # --------------------------------------------------------
    # PCA Regression (First K PCs)
    # --------------------------------------------------------

    K = 2  # typically enough
    X_pca = sm.add_constant(pcs_df.iloc[:, :K])

    pca_model = sm.OLS(y, X_pca).fit()

    print("\nPCA Regression Summary:")
    print(pca_model.summary())

    # --------------------------------------------------------
    # Store Results
    # --------------------------------------------------------

    pca_results[label] = {
        "Explained_Variance": explained_var,
        "Loadings": loadings,
        "PCA_R2": pca_model.rsquared,
        "Model": pca_model
    }

# ============================================================
# Summary Across Frequencies
# ============================================================

print("\n================ PCA R² BY FREQUENCY ================")
display(
    pd.Series(
        {k: v["PCA_R2"] for k, v in pca_results.items()},
        name="PCA R²"
    )
)





Explained Variance (Cumulative):


PC1    0.416551
PC2    0.684158
PC3    0.918909
PC4    1.000000
dtype: float64


PCA Loadings:


Unnamed: 0,PC1,PC2,PC3,PC4
Rate,0.2675,0.856383,0.229816,-0.377143
Credit,0.69044,0.136666,-0.189348,0.684663
Equity,-0.618657,0.392986,0.276067,0.621781
Liquidity,0.262678,-0.305766,0.913848,0.04887



PCA Regression Summary:
                            OLS Regression Results                            
Dep. Variable:        TotalReturnsHYG   R-squared:                       0.672
Model:                            OLS   Adj. R-squared:                  0.671
Method:                 Least Squares   F-statistic:                     991.5
Date:                Fri, 16 Jan 2026   Prob (F-statistic):          4.92e-235
Time:                        20:44:34   Log-Likelihood:                 4493.2
No. Observations:                 971   AIC:                            -8980.
Df Residuals:                     968   BIC:                            -8966.
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       7.895e-06   7.6

PC1    0.462141
PC2    0.721556
PC3    0.930378
PC4    1.000000
dtype: float64


PCA Loadings:


Unnamed: 0,PC1,PC2,PC3,PC4
Rate,-0.039263,0.945935,-0.267332,-0.17944
Credit,0.653048,0.076317,-0.292071,0.69455
Equity,-0.650338,0.175135,0.24776,0.696422
Liquidity,0.386067,0.262127,0.884219,-0.01997



PCA Regression Summary:
                            OLS Regression Results                            
Dep. Variable:        TotalReturnsHYG   R-squared:                       0.782
Model:                            OLS   Adj. R-squared:                  0.781
Method:                 Least Squares   F-statistic:                     460.0
Date:                Fri, 16 Jan 2026   Prob (F-statistic):           1.74e-85
Time:                        20:44:35   Log-Likelihood:                 1065.3
No. Observations:                 259   AIC:                            -2125.
Df Residuals:                     256   BIC:                            -2114.
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         -0.0011      

PC1    0.578081
PC2    0.784020
PC3    0.967784
PC4    1.000000
dtype: float64


PCA Loadings:


Unnamed: 0,PC1,PC2,PC3,PC4
Rate,0.442018,0.522038,0.648781,-0.333435
Credit,-0.548386,0.118569,0.581161,0.589462
Equity,0.609745,0.215081,-0.211844,0.732853
Liquidity,-0.363461,0.816797,-0.443234,-0.065437



PCA Regression Summary:
                            OLS Regression Results                            
Dep. Variable:        TotalReturnsHYG   R-squared:                       0.895
Model:                            OLS   Adj. R-squared:                  0.891
Method:                 Least Squares   F-statistic:                     238.7
Date:                Fri, 16 Jan 2026   Prob (F-statistic):           3.90e-28
Time:                        20:44:35   Log-Likelihood:                 218.58
No. Observations:                  59   AIC:                            -431.2
Df Residuals:                      56   BIC:                            -424.9
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         -0.0047      

Daily      0.671986
Weekly     0.782320
Monthly    0.895017
Name: PCA R², dtype: float64