In [18]:
import pickle
import numpy as np
import pandas as pd
from tqdm import tqdm
from pathlib import Path
from scipy.stats import norm
from settings import settings
from sklearn.utils import resample
from scipy.stats import multivariate_normal
from statsmodels.stats.multitest import multipletests
from sklearn.linear_model import LinearRegression, Lasso

## Define Parameters

In [19]:
# Relative directory that holds the parquet/pickle inputs read below and the
# cached tangency-portfolio result written/read at the end of the notebook.
data_path = Path("result")

In [20]:
# Market returns by region; the `region`, `eom` and `market` columns are used
# when the tangency-portfolio inputs are assembled.
regional_mkt_ret = pd.read_parquet(data_path / "regional_mkt_ret.parquet")

# Cluster assignment per characteristic (joined on `characteristic`, grouped
# by `hcl_label` further down).
cluster_labels = pd.read_parquet(data_path / "cluster_labels.parquet")

# Factor metadata from the "details" sheet (columns A:N): rows without an
# `abr_jkp` value are dropped and three columns are renamed to the names the
# rest of the notebook uses.
char_info = (
    pd.read_excel("Factor Details.xlsx", sheet_name="details", usecols="A:N")
    .dropna(subset=["abr_jkp"])
    .rename(
        columns={
            "abr_jkp": "characteristic",
            "in-sample period": "date_range",
            "group": "hxz_group",
        }
    )
)

In [21]:
# Load the pickled estimation results. Later cells index `eb_est` by sample
# name ("us", "all", "world", "developed", ...).
# NOTE(review): unpickling can execute arbitrary code, so this file must come
# from a trusted source.
with open(data_path / "eb_est.pkl", "rb") as f:
    eb_est = pickle.load(f)

## FDR (False Discovery Rate) Testing

In [22]:
def fdr_sim(t_low, a_vec, a_cov, n_sim=10000, seed=1):
    """Simulate sign-error rates of the "significant" alphas over a grid of t-cutoffs.

    Alphas are drawn ``n_sim`` times from ``N(a_vec, a_cov)``.  For every
    candidate cutoff, the factors whose t-statistic
    (``a_vec / sqrt(diag(a_cov))``) is at least the cutoff are treated as
    discoveries.  A discovery counts as false in a given draw when the sign of
    the simulated alpha differs from the sign of the corresponding ``a_vec``
    entry.

    Parameters
    ----------
    t_low : float
        Only t-statistics strictly greater than this value are used as cutoffs.
    a_vec : array-like of shape (k,)
        Alpha point estimates; a pandas Series, NumPy array or list.
    a_cov : array-like of shape (k, k)
        Covariance matrix of the alphas.
    n_sim : int, default 10000
        Number of simulated alpha vectors.
    seed : int, default 1
        Passed to ``np.random.seed``, i.e. it reseeds NumPy's global generator.

    Returns
    -------
    pandas.DataFrame
        One row per cutoff with columns ``t_cutoff``, ``n_sig`` (number of
        discoveries), ``fdr`` (mean share of wrong-signed discoveries across
        draws) and ``fwr`` (share of draws with at least one wrong-signed
        discovery).  The largest qualifying t-statistic is not used as a
        cutoff.  The frame is empty (but keeps its columns) when no cutoff
        remains.
    """
    np.random.seed(seed)

    # Work on plain arrays so Series, ndarray and list inputs behave the same.
    alpha = np.asarray(a_vec, dtype=float).ravel()
    cov = np.asarray(a_cov, dtype=float)
    t_all = alpha / np.sqrt(np.diag(cov))

    # Ascending cutoffs above t_low, excluding the largest one.
    t_steps = np.sort(t_all[t_all > t_low])[:-1]

    sims = multivariate_normal.rvs(mean=alpha, cov=cov, size=n_sim)
    # rvs squeezes singleton dimensions (n_sim == 1 or a single factor);
    # restore the (n_sim, k) layout the column selection below relies on.
    sims = np.reshape(sims, (n_sim, alpha.size))

    # Loop-invariant: where does a simulated sign disagree with the estimate?
    wrong_sign = np.sign(sims) != np.sign(alpha)

    results = []
    for t in tqdm(t_steps):
        # Significant alphas under this t-cutoff
        sig = t_all >= t
        sims_fdr = wrong_sign[:, sig].mean(axis=1)

        results.append(
            {
                "t_cutoff": t,
                "n_sig": np.sum(sig),
                "fdr": np.mean(sims_fdr),
                "fwr": np.mean(sims_fdr > 0),
            }
        )

    return pd.DataFrame(results, columns=["t_cutoff", "n_sig", "fdr", "fwr"])

In [23]:
# Sign-error simulation for the "us" estimates: with t_low=0 every positive
# t-statistic except the largest one is used as a cutoff.
model_fdr = fdr_sim(
    t_low=0,
    a_vec=eb_est["us"]["factor_mean"].copy(),
    a_cov=eb_est["us"]["factor_cov"].copy(),
    n_sim=10000,
    # Tip: lower n_sim (e.g. 10) for a quick smoke test.
    seed=settings["seed"],
)

100%|██████████| 141/141 [00:00<00:00, 9591.10it/s]


## Multiple Testing

In [24]:
def adjust_pvalues(group):
    """Return ``group`` with its size and multiple-testing-adjusted p-values added.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows belonging to one test family; must contain a ``p_ols`` column of
        unadjusted p-values.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is left untouched) with the extra columns
        ``n`` (number of rows in the group), ``p_bonf`` (Bonferroni),
        ``p_holm`` (Holm), ``p_bh`` (Benjamini-Hochberg) and ``p_by``
        (Benjamini-Yekutieli), in that order.
    """
    raw_p = group["p_ols"]
    correction_by_column = {
        "p_bonf": "bonferroni",
        "p_holm": "holm",
        "p_bh": "fdr_bh",
        "p_by": "fdr_by",
    }
    # multipletests returns (reject, corrected p-values, ...); keep the p-values.
    adjusted = {
        column: multipletests(raw_p, method=method)[1]
        for column, method in correction_by_column.items()
    }
    # assign() builds a new frame: mutating the object handed over by
    # groupby.apply is not supported by pandas.
    return group.assign(n=len(group), **adjusted)

In [25]:
def multiple_testing(eb_all, eb_world=None):
    """Build a long table of raw and multiple-testing-adjusted OLS p-values.

    Parameters
    ----------
    eb_all : mapping
        Must provide a ``"factors"`` DataFrame with the columns ``region``,
        ``char_reg``, ``ols_est`` and ``ols_se``.
    eb_world : mapping, optional
        Same layout as ``eb_all``; when given (and truthy) its factors are
        appended to those of ``eb_all``.

    Returns
    -------
    pandas.DataFrame
        One row per factor and method with the columns ``n``, ``region``,
        ``char_reg``, ``ols_est``, ``t_ols``, ``ols_se``, ``method``
        (OLS / Bonferroni / Holm / BH / BY), ``p`` and ``mt_adj``
        (the strings "None", "FWR" or "FDR").  P-values are two-sided under
        the standard normal and are adjusted separately within each region.
    """
    id_columns = ["n", "region", "char_reg", "ols_est", "t_ols", "ols_se"]
    p_columns = ["p_ols", "p_bonf", "p_holm", "p_bh", "p_by"]
    method_labels = {
        "p_ols": "OLS",
        "p_bonf": "Bonferroni",
        "p_holm": "Holm",
        "p_bh": "BH",
        "p_by": "BY",
    }
    adjustment_kind = {
        "OLS": "None",
        "BH": "FDR",
        "BY": "FDR",
        "Bonferroni": "FWR",
        "Holm": "FWR",
    }

    if eb_world:
        source = pd.concat([eb_all["factors"], eb_world["factors"]])
    else:
        source = eb_all["factors"]

    # set_index returns a new frame, so the caller's data is never modified.
    factors = source.set_index("region")
    t_stat = factors["ols_est"] / factors["ols_se"]
    factors = factors.assign(t_ols=t_stat, p_ols=2 * norm.sf(np.abs(t_stat)))

    # Adjust the p-values region by region, then turn `region` back into a column.
    adjusted = (
        factors.groupby("region", group_keys=False)
        .apply(adjust_pvalues)
        .reset_index()
    )

    # Wide -> long: one row per (factor, p-value flavour).
    long_table = adjusted.loc[:, id_columns + p_columns].melt(
        id_vars=id_columns,
        value_vars=p_columns,
        var_name="method",
        value_name="p",
    )
    long_table["method"] = long_table["method"].map(method_labels)
    long_table["mt_adj"] = long_table["method"].map(adjustment_kind)
    return long_table





            

In [26]:
# Raw and adjusted p-values for the factors in eb_est["all"] plus eb_est["world"].
mt = multiple_testing(eb_all=eb_est["all"].copy(), eb_world=eb_est["world"].copy())

In [27]:
# Show the long-format p-value table (pandas truncates the rendering of large frames).
mt 

Unnamed: 0,n,region,char_reg,ols_est,t_ols,ols_se,method,p,mt_adj
0,153,developed,age__developed,0.408229,2.916042,0.139994,OLS,3.545028e-03,
1,153,developed,aliq_at__developed,0.907554,6.399943,0.141807,OLS,1.554353e-10,
2,153,developed,aliq_mat__developed,0.029489,0.206205,0.143006,OLS,8.366305e-01,
3,153,developed,ami_126d__developed,0.185177,1.210297,0.153002,OLS,2.261650e-01,
4,153,developed,at_be__developed,0.120585,0.850471,0.141786,OLS,3.950632e-01,
...,...,...,...,...,...,...,...,...,...
3055,153,world,turnover_var_126d__world,-0.139822,-1.629332,0.085816,BY,6.978802e-01,FDR
3056,153,world,z_score__world,0.014332,0.131466,0.109016,BY,1.000000e+00,FDR
3057,153,world,zero_trades_126d__world,0.427679,4.974097,0.085981,BY,1.042127e-05,FDR
3058,153,world,zero_trades_21d__world,0.196193,2.285974,0.085825,BY,1.619147e-01,FDR


## Tangency Portfolios 

In [None]:
def bootstrap_tpf(data, n_boots=100, shorting=True, seed=1):
    """Bootstrap the weights obtained by regressing a vector of ones on scaled returns.

    In every iteration the rows of ``data`` are resampled with replacement,
    each column is divided by its standard deviation, a no-intercept
    regression of a constant 1 on the scaled columns is fitted, and the
    coefficients are rescaled to sum to one.

    Parameters
    ----------
    data : pandas.DataFrame
        One column per asset/portfolio, one row per observation.
    n_boots : int, default 100
        Number of bootstrap iterations; must be at least 1.
    shorting : bool, default True
        ``True`` fits an unconstrained ``LinearRegression``.  ``False`` fits
        ``Lasso(alpha=0, positive=True)`` so that coefficients are
        non-negative; warnings raised while fitting are suppressed for the
        duration of this call only.
    seed : int, default 1
        Seeds NumPy's global generator; iteration ``i`` resamples with
        ``random_state=seed + i``.

    Returns
    -------
    pandas.DataFrame
        ``n_boots`` rows with a ``bootstrap_iteration`` column followed by one
        weight column per column of ``data`` (columns axis named ``"term"``).

    Raises
    ------
    ValueError
        If ``n_boots`` is smaller than 1.
    """
    import warnings  # local import: the notebook's import cell does not load it

    if n_boots < 1:
        raise ValueError("n_boots must be a positive integer")

    np.random.seed(seed)

    def boot_func(df):
        # Normalize each column by its standard deviation
        df_normalized = df.apply(lambda x: x / np.std(x), axis=0)

        if shorting:
            # Plain least squares without intercept
            model = LinearRegression(fit_intercept=False)
        else:
            # alpha=0 switches the penalty off; positive=True forbids negative weights
            model = Lasso(alpha=0, fit_intercept=False, positive=True)
        model.fit(df_normalized, np.ones(df_normalized.shape[0]))

        # Rescale the coefficients so the weights sum to one
        return model.coef_ / np.sum(model.coef_)

    weights = []
    # Silence warnings only while this function runs instead of installing a
    # permanent, process-wide "ignore" filter.
    with warnings.catch_warnings():
        if not shorting:
            warnings.simplefilter("ignore")
        for i in tqdm(range(n_boots)):
            # Resample the data with replacement
            bootstrapped_data = resample(data, replace=True, random_state=seed + i)
            weights.append(boot_func(bootstrapped_data))

    result = pd.DataFrame(
        np.vstack(weights),
        columns=pd.Index(data.columns, name="term"),
        index=pd.RangeIndex(n_boots, name="bootstrap_iteration"),
    )
    return result.reset_index()

In [None]:
def tpf_cluster(data, mkt_region, orig_sig, min_date, n_boots, shorting, seed):
    """Bootstrap weights for cluster-level portfolios plus the regional market.

    Relies on the notebook-level frames ``regional_mkt_ret``,
    ``cluster_labels`` and ``char_info`` and on ``bootstrap_tpf``.

    Parameters
    ----------
    data : pandas.DataFrame
        Long returns with the columns ``characteristic``, ``eom`` and ``ret``.
    mkt_region : str
        Region whose market return is added as the ``Market`` column; also
        written to the ``market_region`` column of the result.
    orig_sig : bool
        ``True`` keeps only characteristics whose ``significance`` flag in
        ``char_info`` equals 1; ``False`` keeps flags 1 and 0.
    min_date
        Rows with ``eom`` earlier than this value are dropped.
    n_boots, shorting, seed
        Forwarded unchanged to ``bootstrap_tpf``.

    Returns
    -------
    pandas.DataFrame
        The output of ``bootstrap_tpf`` with an added ``market_region`` column.
    """
    print(f"Run: {mkt_region}")

    orig_sig_values = [1] if orig_sig else [1, 0]

    # Market return of the requested region
    in_region = regional_mkt_ret['region'] == mkt_region
    market_ret = regional_mkt_ret[in_region].reset_index()

    # Attach the cluster label and the significance flag to every return row
    sig_flags = char_info[['characteristic', 'significance']].rename(columns={"significance": "orig_sig"})
    labelled = (
        data
        .merge(cluster_labels, on="characteristic", how="left")
        .merge(sig_flags, on="characteristic", how="left")
    )

    # Average return of the factors within each cluster, per date
    selected = labelled[labelled["orig_sig"].isin(orig_sig_values)]
    cluster_pf = selected.groupby(['hcl_label', 'eom'], as_index=False).agg(ret=('ret', 'mean'))

    # One column per cluster from min_date onwards, plus the market return
    recent = cluster_pf[cluster_pf["eom"] >= min_date]
    wide = recent.pivot(index='eom', columns='hcl_label', values='ret').reset_index()
    tpf_data = (
        wide
        .merge(market_ret[['eom', 'market']], on='eom', how='left')
        .rename(columns={'market': 'Market'})
    )

    tpf_bootstrap = bootstrap_tpf(tpf_data.drop(columns=['eom']), n_boots=n_boots, shorting=shorting, seed=seed)
    tpf_bootstrap['market_region'] = mkt_region
    return tpf_bootstrap

## Create Tangency Portfolio

In [None]:
# True: recompute the bootstrapped tangency-portfolio weights and write them to
# parquet. False: load the previously saved parquet file instead.
is_tpf_save = False

In [None]:
if is_tpf_save:
    # Settings shared by every tpf_cluster call in this cell.
    _tpf_start = settings['tpf']['start']
    _tpf_kwargs = dict(
        orig_sig=True,
        n_boots=settings['tpf']['bs_samples'],
        shorting=settings['tpf']['shorting'],
        seed=settings['seed'],
    )

    # Regions
    tpf_world = tpf_cluster(eb_est['world']['input']['long'], mkt_region='world',
                            min_date=_tpf_start['world'], **_tpf_kwargs)
    tpf_us = tpf_cluster(eb_est['us']['input']['long'], mkt_region='us',
                         min_date=_tpf_start['us'], **_tpf_kwargs)
    tpf_dev = tpf_cluster(eb_est['developed']['input']['long'], mkt_region='developed',
                          min_date=_tpf_start['developed'], **_tpf_kwargs)
    tpf_emer = tpf_cluster(eb_est['emerging']['input']['long'], mkt_region='emerging',
                           min_date=_tpf_start['emerging'], **_tpf_kwargs)

    # Size Groups (US market return, one run per size bucket)
    tpf_size = pd.concat([
        tpf_cluster(eb_est[f'us_{size}']['input']['long'], mkt_region='us',
                    min_date=_tpf_start['size_grps'], **_tpf_kwargs).assign(size_grp=size)
        for size in ["mega", "large", "small", "micro", "nano"]
    ])

    # Stack everything and cache it for later runs
    tpf_all = pd.concat([tpf_world, tpf_us, tpf_dev, tpf_emer, tpf_size])
    tpf_all.to_parquet(data_path / "tpf_all.parquet")
else:
    # Reuse the cached result of an earlier run
    tpf_all = pd.read_parquet(data_path / "tpf_all.parquet")

In [None]:
# Show the stacked bootstrap weights (pandas truncates the rendering of large frames).
tpf_all

term,bootstrap_iteration,Accruals,Debt Issuance,Investment,Low Leverage,Low Risk,Momentum,Profit Growth,Profitability,Quality,Seasonality,Short-Term Reversal,Size,Value,Market,market_region,size_grp
0,0,0.203477,0.018953,0.000000,0.014192,0.000000,0.005828,0.041738,0.103680,0.074951,0.093575,0.043726,0.169193,0.013058,0.217630,world,
1,1,0.199051,0.000000,0.000000,0.065819,0.040643,0.034408,0.042118,0.055944,0.088506,0.154788,0.020580,0.052583,0.031170,0.214390,world,
2,2,0.200437,0.024690,0.043566,0.017079,0.011148,0.028756,0.056915,0.099490,0.131541,0.013290,0.052610,0.115965,0.000000,0.204511,world,
3,3,0.175737,0.007459,0.009726,0.013111,0.000000,0.018854,0.000000,0.085401,0.122826,0.086774,0.054496,0.195674,0.000000,0.229943,world,
4,4,0.223260,0.000000,0.017462,0.000000,0.000000,0.061096,0.114575,0.133800,0.063055,0.062306,0.001710,0.089337,0.018840,0.214559,world,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,9995,0.086764,0.048995,0.000000,0.147381,0.196917,0.000000,0.000000,0.251051,0.106386,0.085911,0.000000,0.000000,0.022725,0.053870,us,nano
9996,9996,0.062650,0.049223,0.034240,0.137314,0.171394,0.000000,0.019935,0.232337,0.079086,0.065684,0.000000,0.008109,0.068070,0.071959,us,nano
9997,9997,0.059563,0.013788,0.006275,0.092014,0.109767,0.000000,0.072037,0.272587,0.038309,0.163228,0.000000,0.000000,0.111850,0.060581,us,nano
9998,9998,0.022660,0.073454,0.036773,0.158111,0.230811,0.000000,0.000000,0.165315,0.116690,0.000000,0.000000,0.000000,0.110878,0.085309,us,nano
