# Empirical Homogeneity Test

## Notebook Setup

In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import norm
import torch
import sys 
# Add the parent directory to the import search path so the `utils` import
# on the next line can be resolved (presumably the package lives one level
# above this notebook -- verify against the repository layout).
sys.path.append('../')
from utils import utils
# NOTE(review): the bare expression below displays the interpreter path as the
# cell output; the saved output contains a local absolute path, so consider
# clearing it before sharing the notebook.
sys.executable

'/Users/fanghema/Desktop/aaSTAT_5200/STAT_5200_final_project/env/bin/python'

In [2]:
# Read the processed panel; the first CSV column becomes a parsed date index.
data = pd.read_csv('../data/processed/data_extended.csv', index_col=0, parse_dates=True)

# Factor columns of the largest specification used in this notebook.
factors = ['Mkt-RF', 'SMB', 'HML', 'RMW', 'CMA']

# Every remaining column except 'RF' is treated as a test asset. This must be
# computed before 'Quarter' is added below, otherwise 'Quarter' would be
# picked up as an asset.
non_asset_columns = ['RF', *factors]
assets = [name for name in data.columns if name not in non_asset_columns]

# Calendar-quarter label of each row, derived from the date index.
data['Quarter'] = data.index.to_period("Q")

## Set up empirical testing parameters

In [3]:
# Specification grid: every combination of factor set, R and sample window is
# estimated once and its three test statistics are stored in `results`.
factor_options = [
    ['Mkt-RF'],
    ['Mkt-RF', 'SMB', 'HML'],
    ['Mkt-RF', 'SMB', 'HML', 'RMW', 'CMA'],
]
# Forwarded unchanged as `R` to utils.iterative_convergence / utils.estimate_avar.
R_options = [1, 2, 5]
# (start, end) date strings; both endpoints are excluded by the slice below.
sample_period_options = [
    ('1963-01-01', '2025-12-31'),
    ('1963-01-01', '1983-01-01'),
    ('1973-01-01', '1993-01-01'),
    ('1983-01-01', '2003-01-01'),
    ('1993-01-01', '2013-01-01'),
    ('2003-01-01', '2023-01-01'),
]

# One row per specification. dtype=float keeps the statistic columns numeric
# instead of the object dtype an initially empty frame would otherwise get.
results = pd.DataFrame(
    index=pd.MultiIndex.from_product([
        list(map(tuple, factor_options)),   # lists are unhashable -> use tuples as labels
        R_options,
        sample_period_options
    ]),
    columns=['gamma_a_lam', 'gamma_a', 'gamma_lam'],
    dtype=float,
)

n_specifications = results.shape[0]
print(f"Total combinations: {n_specifications}")
counter = 0

# The loop variable is deliberately NOT called `factors`: that name holds the
# notebook-level factor list defined in the data-loading cell and must not be
# overwritten by this loop.
for factor_set in factor_options:
    K = len(factor_set)
    for R in R_options:
        for sample_period in sample_period_options:
            print(f"Processing {counter}/{n_specifications}: {factor_set} - {R} - {sample_period}")

            # Rows strictly inside the window (both boundary dates excluded).
            period_start, period_end = sample_period
            in_window = (data.index > period_start) & (data.index < period_end)
            data_slice = data.loc[in_window]

            beta_loading, returns_df, realized_covariance, residuals = utils.calculate_factor_loading(
                data_slice,
                factors=factor_set,
                assets=assets
            )

            # Per-quarter sums of the asset columns, arranged as (asset, quarter).
            excess_returns = returns_df.groupby("Quarter").sum()[assets].T.values

            # beta_loading is indexed by (level 0, level 1) pairs that are looked
            # up below as (asset, factor); its columns form the third axis.
            industries = beta_loading.index.get_level_values(0).unique().tolist()
            # Not used below; retained because cells outside this view may read it.
            factors_names = beta_loading.index.get_level_values(1).unique().tolist()

            N = len(industries)
            T = beta_loading.shape[1]

            # Repack the loadings into a dense (N, K, T) array.
            beta_hat_np = np.zeros((N, K, T))
            for i, asset in enumerate(industries):
                for j, factor in enumerate(factor_set):
                    beta_hat_np[i, j, :] = beta_loading.loc[(asset, factor)].values

            eta, G, beta_star, objective = utils.iterative_convergence(
                beta_hat_np,
                excess_returns,
                N=N,
                K=K,
                R=R,
                T=T,
                n_iter=2000
            )

            avar = utils.estimate_avar(
                beta_hat=beta_hat_np,
                excess_returns=excess_returns,
                eta=eta,
                G=G,
                beta_star=beta_star,
                realized_covariance=realized_covariance,
                residuals=residuals,
                N=N,
                K=K,
                R=R,
                T=T,
            )

            # The three tests take identical inputs.
            test_inputs = dict(eta=eta, avar=avar, N=N, K=K, T=T)
            gamma_a_lambda = utils.full_homogeneity_test(**test_inputs)
            gamma_a = utils.intercept_homogeneity_test(**test_inputs)
            gamma_lambda = utils.slope_homogeneity_test(**test_inputs)

            print("Test statistics")
            print(f"gamma_a_lam: {gamma_a_lambda}")
            print(f"gamma_a: {gamma_a}")
            print(f"gamma_lam: {gamma_lambda}")

            # Values are written in the column order of `results`.
            results.loc[(
                tuple(factor_set), R, sample_period
            )] = np.asarray([
                gamma_a_lambda,
                gamma_a,
                gamma_lambda
            ])
            counter += 1
            print("===========================")




Total combinations: 54
Processing 0/54: ['Mkt-RF'] - 1 - ('1963-01-01', '2025-12-31')
Test statistics
gamma_a_lam: 44.21111236587071
gamma_a: -4.711196651553204
gamma_lam: -4.793084033287493
Processing 1/54: ['Mkt-RF'] - 1 - ('1963-01-01', '1983-01-01')
Test statistics
gamma_a_lam: -18.905327443833862
gamma_a: -5.2896303561338796
gamma_lam: -4.821143164301194
Processing 2/54: ['Mkt-RF'] - 1 - ('1973-01-01', '1993-01-01')
Test statistics
gamma_a_lam: 33.9508921756276
gamma_a: -4.781014953428597
gamma_lam: -4.788556878926963
Processing 3/54: ['Mkt-RF'] - 1 - ('1983-01-01', '2003-01-01')
Test statistics
gamma_a_lam: -30.28037437797454
gamma_a: -4.868809063244337
gamma_lam: -4.796938571070532
Processing 4/54: ['Mkt-RF'] - 1 - ('1993-01-01', '2013-01-01')
Test statistics
gamma_a_lam: -11.290816819833184
gamma_a: -5.328032305874369
gamma_lam: -4.735038730421992
Processing 5/54: ['Mkt-RF'] - 1 - ('2003-01-01', '2023-01-01')
Test statistics
gamma_a_lam: 19.99001539075309
gamma_a: -3.9943088285

In [4]:
# Display the raw results frame (one row per factor set / R / sample window).
results

Unnamed: 0,Unnamed: 1,Unnamed: 2,gamma_a_lam,gamma_a,gamma_lam
"(Mkt-RF,)",1,"(1963-01-01, 2025-12-31)",44.211112,-4.711197,-4.793084
"(Mkt-RF,)",1,"(1963-01-01, 1983-01-01)",-18.905327,-5.28963,-4.821143
"(Mkt-RF,)",1,"(1973-01-01, 1993-01-01)",33.950892,-4.781015,-4.788557
"(Mkt-RF,)",1,"(1983-01-01, 2003-01-01)",-30.280374,-4.868809,-4.796939
"(Mkt-RF,)",1,"(1993-01-01, 2013-01-01)",-11.290817,-5.328032,-4.735039
"(Mkt-RF,)",1,"(2003-01-01, 2023-01-01)",19.990015,-3.994309,-4.327817
"(Mkt-RF,)",2,"(1963-01-01, 2025-12-31)",-2.393524,-5.371134,-4.798328
"(Mkt-RF,)",2,"(1963-01-01, 1983-01-01)",-10.400544,-5.024345,-4.756385
"(Mkt-RF,)",2,"(1973-01-01, 1993-01-01)",-26.768827,-4.858048,-4.796836
"(Mkt-RF,)",2,"(1983-01-01, 2003-01-01)",-69.184168,-4.286288,-5.418729


In [5]:
def clean_results_index(results):
    """Return a copy of ``results`` with a compact, named three-level index.

    The incoming index is expected to have three levels whose entries are
    (a sized factor collection, R, a ``(start, end)`` pair). They are mapped to:

    * ``K``      -- ``len()`` of the factor collection,
    * ``R``      -- carried over unchanged,
    * ``Period`` -- first four characters of ``str(start)`` and ``str(end)``
      joined by an en dash, e.g. ``"1963–1983"``.

    The input frame itself is left untouched.
    """
    factor_sets, r_values, windows = (
        results.index.get_level_values(level) for level in range(3)
    )

    factor_counts = list(map(len, factor_sets))

    period_labels = []
    for window_start, window_end in windows:
        period_labels.append(f"{str(window_start)[:4]}–{str(window_end)[:4]}")

    cleaned = results.copy()
    cleaned.index = pd.MultiIndex.from_arrays(
        [factor_counts, r_values, period_labels],
        names=["K", "R", "Period"],
    )
    return cleaned

# Re-index the raw results by (K, R, Period); `results` itself is not modified
# because clean_results_index works on a copy.
cleaned_results = clean_results_index(results)
cleaned_results

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,gamma_a_lam,gamma_a,gamma_lam
K,R,Period,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,1,1963–2025,44.211112,-4.711197,-4.793084
1,1,1963–1983,-18.905327,-5.28963,-4.821143
1,1,1973–1993,33.950892,-4.781015,-4.788557
1,1,1983–2003,-30.280374,-4.868809,-4.796939
1,1,1993–2013,-11.290817,-5.328032,-4.735039
1,1,2003–2023,19.990015,-3.994309,-4.327817
1,2,1963–2025,-2.393524,-5.371134,-4.798328
1,2,1963–1983,-10.400544,-5.024345,-4.756385
1,2,1973–1993,-26.768827,-4.858048,-4.796836
1,2,1983–2003,-69.184168,-4.286288,-5.418729


In [6]:
# One wide table per statistic: index levels 0 and 1 (K, R) are moved into the
# columns, leaving the remaining level (Period) as the row index.
table_gamma_a_lam, table_gamma_a, table_gamma_lam = (
    cleaned_results[statistic].unstack(level=[0, 1])
    for statistic in ("gamma_a_lam", "gamma_a", "gamma_lam")
)

In [7]:
def add_p_values(table):
    """Interleave every row of test statistics with a row of p-values.

    Parameters
    ----------
    table : pandas.DataFrame
        One row per period, one column per specification. Entries are coerced
        with ``pd.to_numeric(errors="coerce")``, so non-numeric cells become NaN.

    Returns
    -------
    pandas.DataFrame
        Same columns as ``table``. The row index is a two-level MultiIndex
        ``(Period, Type)``: each input row appears twice, first as
        ``Type == "$\\gamma$"`` (the statistics) and then as ``Type == "$p$"``
        (two-sided p-values ``2 * P(Z > |stat|)`` for standard normal ``Z``).

    Notes
    -----
    NOTE(review): the p-values treat each statistic as standard normal under
    the null and use a two-sided rejection region -- confirm this matches the
    tests implemented in ``utils``.
    """
    table_numeric = table.apply(pd.to_numeric, errors="coerce")

    rows = []
    labels = []
    for period, row in table_numeric.iterrows():
        stats = row.to_numpy(dtype=float)

        rows.append(stats)
        # Raw string: "\g" is not a valid escape sequence in a normal literal.
        labels.append((period, r"$\gamma$"))

        # norm.sf is the upper tail evaluated directly; 1 - norm.cdf(x) cancels
        # to exactly 0.0 once cdf(x) rounds to 1.0 (already around |x| ~ 8.3).
        rows.append(2 * norm.sf(np.abs(stats)))
        labels.append((period, "$p$"))

    multi_index = pd.MultiIndex.from_tuples(labels, names=["Period", "Type"])
    return pd.DataFrame(rows, index=multi_index, columns=table_numeric.columns)

In [8]:
# Display the pivoted joint-test statistics (rows: Period, columns: K x R).
table_gamma_a_lam

K,1,1,1,3,3,3,5,5,5
R,1,2,5,1,2,5,1,2,5
Period,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
1963–1983,-18.905327,-10.400544,95.15651,14.379654,-4.723983,10.363631,-9.161988,10.672162,16.214659
1963–2025,44.211112,-2.393524,2441.879741,-8.878353,-5.887936,35.925008,-21.836013,-6.509332,-5.660618
1973–1993,33.950892,-26.768827,104.736811,-11.072314,-8.510655,-7.707587,-9.829076,-8.747067,31.601607
1983–2003,-30.280374,-69.184168,-156.717643,1.940018,46.647262,622.593239,-16.469659,-6.374145,0.815493
1993–2013,-11.290817,-62.272731,55907.879882,2.891161,-4.09878,51.992027,-3.234603,-3.990302,246.166635
2003–2023,19.990015,-4.486292,5177.387251,2.690405,9.907518,302.643083,19.384427,15.214341,41.825536


In [9]:
# Build the statistic/p-value tables directly from `cleaned_results` instead of
# overwriting each pivoted table with a function of itself. On a fresh run the
# result is the same, but re-running this cell no longer applies
# add_p_values to its own output (which silently produced p-values of p-values).
table_gamma_a_lam = add_p_values(cleaned_results["gamma_a_lam"].unstack(level=[0, 1]))
table_gamma_a = add_p_values(cleaned_results["gamma_a"].unstack(level=[0, 1]))
table_gamma_lam = add_p_values(cleaned_results["gamma_lam"].unstack(level=[0, 1]))

In [10]:
def _statistic_table_to_latex(table, caption, label):
    """Render one statistic/p-value table as a LaTeX ``table`` environment.

    Numbers are formatted once, with two decimals, by ``float_format``. The
    table is intentionally not pre-rounded: rounding to three decimals first
    and then formatting to two rounds twice (e.g. 0.0046 -> 0.005 -> "0.01").
    ``escape=False`` keeps the ``$...$`` index labels as LaTeX math.
    """
    return table.to_latex(
        multirow=True,
        multicolumn=True,
        index=True,
        escape=False,
        caption=caption,
        label=label,
        float_format="%.2f",
    )


latex_a_lam = _statistic_table_to_latex(
    table_gamma_a_lam,
    caption="Joint Homogeneity Test ($\\Gamma_{\\alpha,\\lambda}$) with p-values",
    label="tab:gamma_a_lam_with_p",
)

latex_a = _statistic_table_to_latex(
    table_gamma_a,
    caption="Intercept Homogeneity Test ($\\Gamma_{\\alpha}$) with p-values",
    label="tab:gamma_a",
)

latex_lam = _statistic_table_to_latex(
    table_gamma_lam,
    caption="Slope Homogeneity Test ($\\Gamma_{\\lambda}$) with p-values",
    label="tab:gamma_lam",
)

# print (not rich display) so the raw LaTeX source can be copied verbatim.
print(latex_a_lam)
print(latex_a)
print(latex_lam)

\begin{table}
\caption{Joint Homogeneity Test ($\Gamma_{\alpha,\lambda}$) with p-values}
\label{tab:gamma_a_lam_with_p}
\begin{tabular}{llrrrrrrrrr}
\toprule
 & K & \multicolumn{3}{r}{1} & \multicolumn{3}{r}{3} & \multicolumn{3}{r}{5} \\
 & R & 1 & 2 & 5 & 1 & 2 & 5 & 1 & 2 & 5 \\
Period & Type &  &  &  &  &  &  &  &  &  \\
\midrule
\multirow[t]{2}{*}{1963–1983} & $\gamma$ & -18.91 & -10.40 & 95.16 & 14.38 & -4.72 & 10.36 & -9.16 & 10.67 & 16.21 \\
 & $p$ & 0.00 & 0.00 & 0.00 & 0.00 & 0.00 & 0.00 & 0.00 & 0.00 & 0.00 \\
\cline{1-11}
\multirow[t]{2}{*}{1963–2025} & $\gamma$ & 44.21 & -2.39 & 2441.88 & -8.88 & -5.89 & 35.92 & -21.84 & -6.51 & -5.66 \\
 & $p$ & 0.00 & 0.02 & 0.00 & 0.00 & 0.00 & 0.00 & 0.00 & 0.00 & 0.00 \\
\cline{1-11}
\multirow[t]{2}{*}{1973–1993} & $\gamma$ & 33.95 & -26.77 & 104.74 & -11.07 & -8.51 & -7.71 & -9.83 & -8.75 & 31.60 \\
 & $p$ & 0.00 & 0.00 & 0.00 & 0.00 & 0.00 & 0.00 & 0.00 & 0.00 & 0.00 \\
\cline{1-11}
\multirow[t]{2}{*}{1983–2003} & $\gamma$ & -30.28 &