In [None]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import r2_score, mean_squared_error

import statsmodels.api as sm

pd.set_option("display.max_columns", 100)
pd.set_option("display.width", 160)

WORKSHOP_FILE = "Workshop Data.xlsx"
INDEX_FILE = "Indexes and Spreads Data 01.09.xlsx"
VOL_FILE = "MOVE Vix prices.xlsx"
SP500_FILE = "SP500 Index data.xlsx"

# How many PCA components to use in the main regression
N_PCS_MAIN = 3

In [2]:
# Helpers 
def _coerce_datetime(s):
    return pd.to_datetime(s, errors="coerce")

def first_present(df, candidates):
    """Return the first column name that exists in df from candidates; else None."""
    for c in candidates:
        if c in df.columns:
            return c
    return None

def require_col(df, candidates, label):
    c = first_present(df, candidates)
    if c is None:
        raise KeyError(f"Could not find {label} column. Tried: {candidates}. Available: {list(df.columns)[:30]}...")
    return c

def safe_pct_change(x):
    x = pd.to_numeric(x, errors="coerce")
    out = x.pct_change()
    out = out.replace([np.inf, -np.inf], np.nan)
    return out

def safe_diff(x):
    x = pd.to_numeric(x, errors="coerce")
    out = x.diff()
    out = out.replace([np.inf, -np.inf], np.nan)
    return out

def safe_dlog(x):
    x = pd.to_numeric(x, errors="coerce")
    x = x.where(x > 0, np.nan)  # log-safe
    out = np.log(x).diff()
    out = out.replace([np.inf, -np.inf], np.nan)
    return out

In [3]:
# Load target ETF data (updated workbook uses one sheet per ticker)
workshop = pd.read_excel(WORKSHOP_FILE, sheet_name="HYG")

# Build an index/spread panel by merging all index sheets on Date
_xl = pd.ExcelFile(INDEX_FILE)
idx_parts = []
for _s in _xl.sheet_names:
    _d = pd.read_excel(INDEX_FILE, sheet_name=_s)
    if "Date" in _d.columns:
        _d["Date"] = _coerce_datetime(_d["Date"])
    # Prefix columns to avoid collisions, then keep Date + prefixed cols
    _pref = _s.replace(" ", "_")
    _cols = [c for c in _d.columns if c != "Date"]
    _d = _d.rename(columns={c: f"{_pref}__{c}" for c in _cols})
    idx_parts.append(_d)

indexes = idx_parts[0]
for _d in idx_parts[1:]:
    indexes = indexes.merge(_d, on="Date", how="outer")

# Standardize a few commonly-used columns expected downstream
# Target yield proxy (if available)
if "HYG__YAS_BOND_YLD" in indexes.columns:
    indexes["YAS_BOND_YLD"] = indexes["HYG__YAS_BOND_YLD"]

# 10Y yield proxy
if "10yUST_Yields__PX_LAST" in indexes.columns:
    indexes["USGG10YR Index"] = indexes["10yUST_Yields__PX_LAST"]

# OAS proxies
if "IBOXHY-HYG__OAS_SOVEREIGN_CURVE" in indexes.columns:
    indexes["HYOAS"] = indexes["IBOXHY-HYG__OAS_SOVEREIGN_CURVE"]
if "HY_Index__OAS_SOVEREIGN_CURVE" in indexes.columns:
    indexes["HY_OAS"] = indexes["HY_Index__OAS_SOVEREIGN_CURVE"]

if "IBOXIG-LQD__OAS_SOVEREIGN_CURVE" in indexes.columns:
    indexes["IGOAS"] = indexes["IBOXIG-LQD__OAS_SOVEREIGN_CURVE"]
if "IG_Index__OAS_SOVEREIGN_CURVE" in indexes.columns:
    indexes["IG_OAS"] = indexes["IG_Index__OAS_SOVEREIGN_CURVE"]

# Vol + SPX inputs
vix_df = pd.read_excel(VOL_FILE, sheet_name="VIX")
move_df = pd.read_excel(VOL_FILE, sheet_name="MOVE") if "MOVE" in pd.ExcelFile(VOL_FILE).sheet_names else None

sp500 = pd.read_excel(SP500_FILE, sheet_name="SPY")

for d in (workshop, indexes, vix_df, sp500):
    if "Date" in d.columns:
        d["Date"] = _coerce_datetime(d["Date"])

if move_df is not None and "Date" in move_df.columns:
    move_df["Date"] = _coerce_datetime(move_df["Date"])

# Standardize target price column name to <TICKER>_PX_LAST
_px = require_col(workshop, ["PX_LAST", "Adj Close", "Close", "Last", "Last Price"], "HYG close")
workshop = workshop.rename(columns={_px: "HYG_PX_LAST"})

# Renamed
vix_px = require_col(vix_df, ["PX_LAST", "VIX", "Close"], "VIX level")
vix_df = vix_df.rename(columns={vix_px: "VIX_PX_LAST"})[["Date", "VIX_PX_LAST"]]

spx_px = require_col(sp500, ["PX_LAST", "SP500_PX_LAST", "Close"], "SP500 level")
sp500 = sp500.rename(columns={spx_px: "SP500_PX_LAST"})[["Date", "SP500_PX_LAST"]]

if move_df is not None:
    move_px = require_col(move_df, ["PX_LAST", "MOVE", "Close"], "MOVE level")
    move_df = move_df.rename(columns={move_px: "MOVE_PX_LAST"})[["Date", "MOVE_PX_LAST"]]


In [4]:
# Merge to a unified panel 
df = workshop.merge(indexes, on="Date", how="inner").merge(vix_df, on="Date", how="inner").merge(sp500, on="Date", how="inner")
if move_df is not None:
    df = df.merge(move_df, on="Date", how="inner")

df = df.sort_values("Date").reset_index(drop=True)

In [5]:
# Target + factor engineering

# Prefer the standardized HYG_PX_LAST column if present (avoids accidentally grabbing a constant position/value column)
hyg_px_col = 'HYG_PX_LAST' if 'HYG_PX_LAST' in df.columns else require_col(
    df,
    ['HYG_PX_LAST', 'HYG_PX_LAST', 'HYG PX_LAST', 'HYG', 'HYG Position', 'HYG Position Value', 'HYG Close', 'HYG Total Return', 'HYG Market Value'],
    'HYG price/value proxy'
)
df["r_hyg"] = safe_pct_change(df[hyg_px_col])

# Equity factor: S&P 500 index return
df["r_sp500"] = safe_pct_change(df["SP500_PX_LAST"])

# Vol factors
df["dvix"] = safe_diff(df["VIX_PX_LAST"])
df["r_vix"] = safe_pct_change(df["VIX_PX_LAST"])

if "MOVE_PX_LAST" in df.columns:
    df["dmove"] = safe_diff(df["MOVE_PX_LAST"])
    df["r_move"] = safe_pct_change(df["MOVE_PX_LAST"])

# Rates / credit spreads
teny = first_present(df, ["USGG10YR Index", "10Y_Yield", "US10Y", "DGS10", "UST_10Y", "10Y", "US 10Y", "YAS_BOND_YLD"])
twoy = first_present(df, ["USGG2YR Index", "2Y_Yield", "US2Y", "DGS2", "UST_2Y", "2Y", "US 2Y"])
if teny is not None:
    df["d_10y"] = safe_diff(df[teny])
if twoy is not None:
    df["d_2y"] = safe_diff(df[twoy])
if teny is not None and twoy is not None:
    df["d_curve_2s10s"] = safe_diff(df[teny] - df[twoy])

# Credit spreads
hy_oas = first_present(df, ["LF98OAS Index", "HYOAS", "HY_OAS", "HY OAS", "HY_Spread", "US High Yield OAS", "YAS_ISPREAD", "YAS_YLD_SPREAD"])
ig_oas = first_present(df, ["LF97OAS Index", "IGOAS", "IG_OAS", "IG OAS", "IG_Spread", "US Investment Grade OAS"])

if hy_oas is not None:
    df["d_hy_oas"] = safe_diff(df[hy_oas])
if ig_oas is not None:
    df["d_ig_oas"] = safe_diff(df[ig_oas])
if hy_oas is not None and ig_oas is not None:
    df["d_hy_ig_oas"] = safe_diff(df[hy_oas] - df[ig_oas])

# Liquidity proxy (volume)
hyg_vol_col = first_present(df, ["HYG_VOLUME", "HYG Volume", "HYG_VOLUME_SHARES", "HYG_VOL", "HYG_Volume"])
if hyg_vol_col is not None:
    df["dlog_hyg_vol"] = safe_dlog(df[hyg_vol_col])

candidate_features = [
    "r_sp500", "dvix", "r_vix",
    "dmove", "r_move",
    "d_10y", "d_2y", "d_curve_2s10s",
    "d_hy_oas", "d_ig_oas", "d_hy_ig_oas",
    "dlog_hyg_vol",
]
features = [c for c in candidate_features if c in df.columns]

df_model = df[["Date", "r_hyg"] + features].set_index("Date")

print("Using HYG proxy column:", hyg_px_col)
print("Using features:", features)
df_model.tail()

Using HYG proxy column: HYG_PX_LAST
Using features: ['r_sp500', 'dvix', 'r_vix', 'd_10y', 'd_hy_oas', 'd_ig_oas', 'd_hy_ig_oas']


Unnamed: 0_level_0,r_hyg,r_sp500,dvix,r_vix,d_10y,d_hy_oas,d_ig_oas,d_hy_ig_oas
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2026-01-02,0.000496,0.001895,-0.44,-0.029431,0.0237,-1.40119,0.576166,-1.977356
2026-01-05,0.002603,0.006354,0.39,0.026878,-0.0295,0.056807,0.178982,-0.122175
2026-01-06,0.000247,0.006197,-0.15,-0.010067,0.0118,-1.413961,-0.877332,-0.536629
2026-01-07,-0.000247,-0.00344,0.63,0.042712,-0.0255,0.307285,1.265985,-0.9587
2026-01-08,0.000556,7.7e-05,0.07,0.004551,0.0197,-3.91275,0.087868,-4.000618


In [6]:
# === Multi-horizon returns config ===

# Choose the return horizons (in trading days) you want to analyze
RETURN_HORIZONS = [1, 5, 20]        # 1d, 1w, ~1m

# Choose rolling regression window sizes (in trading days)
ROLL_WINDOWS = [60, 120, 252]       # ~3m, ~6m, ~1y

def _compound_return(r: 'pd.Series', h: int) -> 'pd.Series':
    """Compound daily simple returns to an h-day simple return.
    Uses log(1+r) summation to be numerically stable.
    """
    r = pd.to_numeric(r, errors='coerce')
    lr = np.log1p(r)
    out = np.expm1(lr.rolling(h).sum())
    return out

def _sum_diff(x: 'pd.Series', h: int) -> 'pd.Series':
    x = pd.to_numeric(x, errors='coerce')
    return x.rolling(h).sum()

def make_horizon_frame(df_model: 'pd.DataFrame', target_col: str, feature_cols: list[str], h: int) -> tuple['pd.Series', 'pd.DataFrame']:
    """Return (y, X) at horizon h.

    Convention:
      - columns starting with 'r_' are treated as daily simple returns and are compounded
      - all other columns are treated as daily deltas and are summed
    """
    y = _compound_return(df_model[target_col], h)

    Xh = {}
    for c in feature_cols:
        if c.startswith('r_'):
            Xh[c] = _compound_return(df_model[c], h)
        else:
            Xh[c] = _sum_diff(df_model[c], h)

    Xh = pd.DataFrame(Xh, index=df_model.index)
    return y, Xh

# Target is the first column of df_model (set earlier in the notebook)
TARGET_RET_COL = df_model.columns[0]


In [7]:
# cleaning
X_df = df_model[features].apply(pd.to_numeric, errors="coerce").replace([np.inf, -np.inf], np.nan)
y_s = pd.to_numeric(df_model["r_hyg"], errors="coerce").replace([np.inf, -np.inf], np.nan)

mask_y = y_s.notna()
X_df = X_df.loc[mask_y]
y = y_s.loc[mask_y].values

print("Rows after y alignment:", len(y))

# Pipeline: median impute -> standardize -> PCA
pca = PCA(n_components=min(5, len(features)))  # keep up to 5 PCs for diagnostics; use first N_PCS_MAIN for the main model
pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
    ("pca", pca),
])

X_pcs_all = pipe.fit_transform(X_df.values)

# Keep first N_PCS_MAIN PCs for the regression model
X_pcs = X_pcs_all[:, :N_PCS_MAIN]

Rows after y alignment: 1259


In [8]:
# PCA loadings (interpretation)
# Convert PCA components into a readable table
components = pipe.named_steps["pca"].components_

n_show = min(max(5, N_PCS_MAIN), components.shape[0])
loading = pd.DataFrame(
    components[:n_show],
    columns=features,
    index=[f"PC{i+1}" for i in range(n_show)]
)

# Loadings are in standardized factor space
for i in range(min(N_PCS_MAIN, n_show)):
    pc = f"PC{i+1}"
    display(loading.T.sort_values(pc, ascending=False).head(15))


Unnamed: 0,PC1,PC2,PC3,PC4,PC5
dvix,0.539718,0.145311,0.047324,-0.218342,0.330046
r_vix,0.534411,0.143927,0.05398,-0.219265,0.417579
d_ig_oas,0.356586,-0.023419,-0.325827,0.854775,0.03958
d_10y,-0.012388,0.008792,0.930085,0.354299,0.095119
d_hy_oas,-0.110874,0.695629,-0.074908,0.182993,0.0137
d_hy_ig_oas,-0.193068,0.677157,0.00766,-0.04957,-0.021092
r_sp500,-0.496219,-0.122891,-0.133997,0.110177,0.839911


Unnamed: 0,PC1,PC2,PC3,PC4,PC5
d_hy_oas,-0.110874,0.695629,-0.074908,0.182993,0.0137
d_hy_ig_oas,-0.193068,0.677157,0.00766,-0.04957,-0.021092
dvix,0.539718,0.145311,0.047324,-0.218342,0.330046
r_vix,0.534411,0.143927,0.05398,-0.219265,0.417579
d_10y,-0.012388,0.008792,0.930085,0.354299,0.095119
d_ig_oas,0.356586,-0.023419,-0.325827,0.854775,0.03958
r_sp500,-0.496219,-0.122891,-0.133997,0.110177,0.839911


Unnamed: 0,PC1,PC2,PC3,PC4,PC5
d_10y,-0.012388,0.008792,0.930085,0.354299,0.095119
r_vix,0.534411,0.143927,0.05398,-0.219265,0.417579
dvix,0.539718,0.145311,0.047324,-0.218342,0.330046
d_hy_ig_oas,-0.193068,0.677157,0.00766,-0.04957,-0.021092
d_hy_oas,-0.110874,0.695629,-0.074908,0.182993,0.0137
r_sp500,-0.496219,-0.122891,-0.133997,0.110177,0.839911
d_ig_oas,0.356586,-0.023419,-0.325827,0.854775,0.03958


In [9]:
# Linear Regression
lr = LinearRegression().fit(X_pcs, y)
yhat = lr.predict(X_pcs)

rmse = float(np.sqrt(mean_squared_error(y, yhat)))
print("In-sample R2:", round(r2_score(y, yhat), 4))
print("In-sample RMSE:", rmse)
print("alpha:", lr.intercept_)
print("betas (PCs):", lr.coef_)

In-sample R2: 0.5843
In-sample RMSE: 0.0030880580337914604
alpha: -4.711631299525291e-05
betas (PCs): [-0.00173835 -0.00043766 -0.00195295]


In [10]:
# Map PC betas back to original factors
beta_pc = lr.coef_[:N_PCS_MAIN]                      
load_k = components[:N_PCS_MAIN, :]                 
exposure_std = beta_pc @ load_k          

exp_tbl = pd.DataFrame({"exposure_in_std_units": exposure_std}, index=features).sort_values("exposure_in_std_units", ascending=False)
display(exp_tbl)

Unnamed: 0,exposure_in_std_units
r_sp500,0.001178
d_hy_oas,3.5e-05
d_ig_oas,2.7e-05
d_hy_ig_oas,2.4e-05
dvix,-0.001094
r_vix,-0.001097
d_10y,-0.001799


In [11]:
# OLS regression
X_ols = sm.add_constant(X_pcs)  # const + PCs
ols = sm.OLS(y, X_ols).fit()
print(ols.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.584
Model:                            OLS   Adj. R-squared:                  0.583
Method:                 Least Squares   F-statistic:                     588.0
Date:                Fri, 16 Jan 2026   Prob (F-statistic):          1.30e-238
Time:                        20:19:57   Log-Likelihood:                 5490.8
No. Observations:                1259   AIC:                        -1.097e+04
Df Residuals:                    1255   BIC:                        -1.095e+04
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const      -4.712e-05   8.72e-05     -0.541      0.5

In [12]:
ridge = Ridge(alpha=1.0).fit(X_pcs, y)
yr = ridge.predict(X_pcs)
print("Ridge R2:", round(r2_score(y, yr), 4))
print("Ridge betas (PCs):", ridge.coef_)

Ridge R2: 0.5843
Ridge betas (PCs): [-0.00173788 -0.00043748 -0.00195147]


In [13]:
# Rolling window stability (1y)
window = 252  # ~1 trading year

betas = []
r2s = []

# Use a date index aligned to X_df / y
dates = X_df.index.to_numpy()

for end in range(window, len(y)):
    start = end - window
    Xw = X_pcs[start:end]
    yw = y[start:end]

    m = LinearRegression().fit(Xw, yw)
    ywh = m.predict(Xw)

    dt = dates[end]
    # store alpha + betas
    row = [dt, float(m.intercept_)] + [float(b) for b in m.coef_]
    betas.append(row)
    r2s.append([dt, float(r2_score(yw, ywh))])

beta_cols = ["alpha"] + [f"beta_pc{i+1}" for i in range(X_pcs.shape[1])]
betas = pd.DataFrame(betas, columns=["Date"] + beta_cols).set_index("Date")
r2s = pd.DataFrame(r2s, columns=["Date", "rolling_r2"]).set_index("Date")

display(betas.tail())
display(r2s.tail())


Unnamed: 0_level_0,alpha,beta_pc1,beta_pc2,beta_pc3
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2026-01-02,4.7e-05,-0.001421,-0.000507,-0.001231
2026-01-05,4e-05,-0.001424,-0.000505,-0.001214
2026-01-06,4.5e-05,-0.001425,-0.000504,-0.001222
2026-01-07,3.1e-05,-0.001427,-0.0005,-0.00122
2026-01-08,3.6e-05,-0.001427,-0.000499,-0.00122


Unnamed: 0_level_0,rolling_r2
Date,Unnamed: 1_level_1
2026-01-02,0.744176
2026-01-05,0.745617
2026-01-06,0.74504
2026-01-07,0.747201
2026-01-08,0.747169


In [14]:
# Approx daily contribution

# Recreate imputed+scaled factors
X_imp = pipe.named_steps["imputer"].transform(X_df.values)
Xz = pipe.named_steps["scaler"].transform(X_imp)

contrib = pd.DataFrame(Xz * exposure_std, index=X_df.index, columns=features)

top = exp_tbl.index[:6].tolist()

contrib[top].tail()

Unnamed: 0_level_0,r_sp500,d_hy_oas,d_ig_oas,d_hy_ig_oas,dvix,r_vix
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2026-01-02,0.000148,-7e-06,1.2e-05,-7.153861e-06,0.000256,0.000431
2026-01-05,0.000641,1e-06,5e-06,5.391643e-08,-0.000237,-0.000326
2026-01-06,0.000624,-7e-06,-1.3e-05,-1.556327e-06,8.4e-05,0.000171
2026-01-07,-0.000441,3e-06,2.4e-05,-3.196163e-06,-0.00038,-0.000538
2026-01-08,-5.3e-05,-2.1e-05,4e-06,-1.501467e-05,-4.7e-05,-2.6e-05


In [15]:
# === Multi-horizon + multi-window PCA regression ===

def run_pca_model(y: 'pd.Series', X: 'pd.DataFrame', n_pcs: int = 3):
    # Clean
    Xc = X.apply(pd.to_numeric, errors='coerce').replace([np.inf, -np.inf], np.nan)
    yc = pd.to_numeric(y, errors='coerce').replace([np.inf, -np.inf], np.nan)

    mask = yc.notna()
    Xc = Xc.loc[mask]
    yc = yc.loc[mask]

    # Drop rows with all-NaN X (imputer will handle partial NaNs)
    all_nan = Xc.isna().all(axis=1)
    Xc = Xc.loc[~all_nan]
    yc = yc.loc[~all_nan]

    pca = PCA(n_components=min(max(n_pcs, 2), Xc.shape[1]))
    pipe = Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
        ('pca', pca),
    ])

    X_pcs = pipe.fit_transform(Xc.values)
    X_use = X_pcs[:, :n_pcs]

    lr = LinearRegression().fit(X_use, yc.values)
    yhat = lr.predict(X_use)

    r2 = float(r2_score(yc.values, yhat))
    rmse = float(np.sqrt(mean_squared_error(yc.values, yhat)))

    return {
        'pipe': pipe,
        'X_index': Xc.index,
        'X_pcs': X_use,
        'y': yc.values,
        'lr': lr,
        'r2': r2,
        'rmse': rmse,
    }


def rolling_lr(X_pcs: 'np.ndarray', y: 'np.ndarray', dates, window: int):
    betas = []
    r2s = []

    for end in range(window, len(y)):
        start = end - window
        Xw = X_pcs[start:end]
        yw = y[start:end]

        m = LinearRegression().fit(Xw, yw)
        ywh = m.predict(Xw)

        dt = dates[end]
        # store alpha + betas
        row = [dt, float(m.intercept_)] + [float(b) for b in m.coef_]
        betas.append(row)
        r2s.append([dt, float(r2_score(yw, ywh))])

    beta_cols = ['alpha'] + [f'beta_pc{i+1}' for i in range(X_pcs.shape[1])]
    betas = pd.DataFrame(betas, columns=['Date'] + beta_cols).set_index('Date')
    r2s = pd.DataFrame(r2s, columns=['Date', 'rolling_r2']).set_index('Date')
    return betas, r2s


summary_rows = []
rolling_outputs = {}

for h in RETURN_HORIZONS:
    y_h, X_h = make_horizon_frame(df_model, TARGET_RET_COL, features, h)

    out = run_pca_model(y_h, X_h, n_pcs=N_PCS_MAIN)
    dates = out['X_index'].to_numpy()

    # Rolling windows
    roll = {}
    for w in ROLL_WINDOWS:
        if len(out['y']) <= w + 5:
            continue
        betas_w, r2s_w = rolling_lr(out['X_pcs'], out['y'], dates, w)
        roll[w] = {'betas': betas_w, 'r2s': r2s_w}

    rolling_outputs[h] = {
        'in_sample_r2': out['r2'],
        'rmse': out['rmse'],
        'alpha': float(out['lr'].intercept_),
        'betas_pc': [float(x) for x in out['lr'].coef_],
        'roll': roll,
    }

    summary_rows.append({
        'horizon_days': h,
        'in_sample_r2': out['r2'],
        'rmse': out['rmse'],
        'alpha': float(out['lr'].intercept_),
        'beta_pc1': float(out['lr'].coef_[0]) if len(out['lr'].coef_)>0 else np.nan,
        'beta_pc2': float(out['lr'].coef_[1]) if len(out['lr'].coef_)>1 else np.nan,
        'n_obs': int(len(out['y'])),
    })

summary = pd.DataFrame(summary_rows).sort_values('horizon_days')
summary

# Example: view a rolling R2 series
# rolling_outputs[5]['roll'][252]['r2s'].plot(title='Rolling R2 (5d horizon, 252d window)')


Unnamed: 0,horizon_days,in_sample_r2,rmse,alpha,beta_pc1,beta_pc2,n_obs
0,1,0.584309,0.003088,-4.7e-05,-0.001738,-0.000438,1259
1,5,0.604846,0.006258,-0.00025,-0.003527,0.000666,1255
2,20,0.722601,0.009666,-0.001082,-0.006054,0.006654,1240
