In [65]:
import pickle
import numpy as np 
import pandas as pd 
from tqdm import tqdm
from pathlib import Path 
from settings import settings 
from sklearn.utils import resample
from scipy.optimize import minimize
from scipy.stats import multivariate_normal

## Define the search list

In [46]:
# Each entry maps a run name to [region(s), portfolio type, layers] plus,
# for the "cmp" size-group runs, a fourth element naming the size group.
# Insertion order is kept identical to the hand-written literal.
search_list = {
    **{region: [region, "hml", 2] for region in ("us", "developed", "emerging")},
    "all": [["us", "developed", "emerging"], "hml", 3],
    **{region: [region, "hml", 2] for region in ("world", "world_ex_us")},
    **{
        f"us_{size_grp}": ["us", "cmp", 2, size_grp]
        for size_grp in ("mega", "large", "small", "micro", "nano")
    },
}


In [47]:
# Directory holding the parquet inputs read below (relative to the working directory)
data_path = Path("result")

In [48]:
# Load the three parquet inputs (size-group portfolios, regional portfolios,
# cluster labels) in the same order as before.
regional_pfs_cmp, regional_pfs, cluster_labels = (
    pd.read_parquet(data_path / file_name)
    for file_name in (
        "regional_pfs_cmp.parquet",
        "regional_pfs.parquet",
        "cluster_labels.parquet",
    )
)

## Define the empirical Bayes estimation

In [49]:
def eb_prepare(data, scale_alphas, overlapping):
    """Build market-neutral factor returns in long and wide form.

    Parameters
    ----------
    data : pandas.DataFrame
        Long panel with at least the columns ``region``, ``characteristic``,
        ``eom``, ``ret`` and ``mkt_vw_exc``. The caller's frame is never
        modified.
    scale_alphas : bool
        If true, the wide frame holds ``ret_neu_scaled``; otherwise ``ret_neu``.
    overlapping : bool
        If true, apply the row filter in the ``overlapping`` branch below
        before the returns are neutralised.

    Returns
    -------
    dict
        ``"long"``: the (possibly filtered) panel with the added columns
        ``ret_neu``, ``ret_neu_scaled`` and ``name_wide``;
        ``"wide"``: a pivot with ``eom`` as index and ``name_wide`` as columns.
    """
    group_keys = ['region', 'characteristic']

    # Copy first: the helper column below must never leak into the caller's
    # frame (and writing into a filtered slice triggers SettingWithCopyWarning).
    data = data.copy()

    if overlapping:
        data['obs'] = data.groupby(group_keys)['region'].transform('size')
        # NOTE(review): `obs` is constant within a group, so idxmax() returns a
        # single label per (region, characteristic) and this keeps one row per
        # group. That looks unintended -- possibly the goal was to keep the
        # factors with the most observations. Behaviour left unchanged; confirm.
        data = data.loc[data.groupby(group_keys)['obs'].idxmax()]
        data = data.drop(columns=['obs'])

    # Market-neutral return per group: ret - beta * mkt, with
    # beta = cov(ret, mkt) / var(mkt). The result is written back by row
    # position, so it is correct whatever the row order or index of `data`
    # (groupby.apply(...).values returns rows in group order, which only
    # matches when the input is pre-sorted by the group keys).
    ret_neu = np.full(len(data), np.nan)
    for positions in data.groupby(group_keys).indices.values():
        grp = data.iloc[positions]
        beta = grp['ret'].cov(grp['mkt_vw_exc']) / grp['mkt_vw_exc'].var()
        ret_neu[positions] = (grp['ret'] - beta * grp['mkt_vw_exc']).to_numpy()

    # Express the neutralised return in percent
    data['ret_neu'] = ret_neu * 100

    # Rescale every series to a per-period standard deviation of
    # sqrt(10**2 / 12) -- presumably 10% annualised volatility on monthly
    # data; confirm.
    scaling_fct = np.sqrt(10**2 / 12) / data.groupby(group_keys)['ret_neu'].transform('std')
    data['ret_neu_scaled'] = data['ret_neu'] * scaling_fct
    data['name_wide'] = data['characteristic'] + '__' + data['region']

    value_col = 'ret_neu_scaled' if scale_alphas else 'ret_neu'
    data_wide = data.pivot(index='eom', columns='name_wide', values=value_col)
    return {
        "long": data,
        "wide": data_wide
    }


def block_cluster_func(cor_mat: pd.DataFrame, cl_labels: pd.DataFrame):
    """Replace pairwise correlations by their cluster-block averages.

    Parameters
    ----------
    cor_mat : pandas.DataFrame
        Correlation matrix whose row and column labels have the form
        ``"<characteristic>__<region>"``. Not modified.
    cl_labels : pandas.DataFrame
        Frame with the columns ``characteristic`` and ``hcl_label`` giving
        the cluster of every characteristic. Not modified.

    Returns
    -------
    pandas.DataFrame
        Matrix with the same labels, in the same order as ``cor_mat``.
        Entries whose row and column label coincide are 1; every other entry
        is the mean correlation over all distinct-label pairs that share the
        same unordered (cluster, region) pair. The axes are named ``name1``
        (rows) and ``name2`` (columns).
    """
    label_map = cl_labels[['characteristic', 'hcl_label']]

    # Long format: one row per (row label, column label) pair
    cor_long = (
        cor_mat.rename_axis(index='index')
        .reset_index()
        .melt(id_vars='index', var_name='char2', value_name='cor')
    )

    # Split each label into its characteristic name and its region
    cor_long[['char2', 'region2']] = cor_long['char2'].str.split('__', expand=True)
    cor_long[['char1', 'region1']] = cor_long['index'].str.split('__', expand=True)

    # Attach the cluster label of each characteristic
    cor_long = cor_long.merge(label_map.rename(columns={'hcl_label': 'hcl1'}), left_on='char1', right_on='characteristic', how='left')
    cor_long = cor_long.merge(label_map.rename(columns={'hcl_label': 'hcl2'}), left_on='char2', right_on='characteristic', how='left')

    # Combine each characteristic's cluster with its region
    cor_long['hclreg1'] = cor_long['hcl1'] + '__' + cor_long['region1']
    cor_long['hclreg2'] = cor_long['hcl2'] + '__' + cor_long['region2']

    # Order-independent key for the pair of cluster-region blocks. A plain
    # zip gives the same keys as a row-wise DataFrame.apply without building
    # a Series for each of the n**2 rows.
    cor_long['hcl_pair'] = [
        '_x_'.join(sorted(pair))
        for pair in zip(cor_long['hclreg1'], cor_long['hclreg2'])
    ]
    cor_long['name1'] = cor_long['char1'] + '__' + cor_long['region1']
    cor_long['name2'] = cor_long['char2'] + '__' + cor_long['region2']

    # Average correlation within each block pair, excluding the diagonal
    off_diagonal = cor_long['name1'] != cor_long['name2']
    cluster_wise_cor = (
        cor_long[off_diagonal]
        .groupby('hcl_pair')['cor']
        .mean()
        .reset_index(name='cor_avg')
    )
    cor_long = cor_long.merge(cluster_wise_cor, on='hcl_pair', how='left')
    cor_long['cor_avg'] = np.where(off_diagonal, cor_long['cor_avg'], 1)

    block_cor = cor_long.pivot(index='name1', columns='name2', values='cor_avg')

    # pivot() sorts the labels; restore the caller's ordering so that code
    # combining this matrix positionally with other per-factor vectors stays
    # aligned with the input.
    block_cor = block_cor.reindex(index=cor_mat.index, columns=cor_mat.columns)
    return block_cor.rename_axis(index='name1', columns='name2')
    

In [50]:
eb_est = {}
for key, x in search_list.items():
    # x = [region(s), portfolio type, layers, optional size group]
    print(f"Region: {x[0]}")
    regions = x[0]
    
    # Select the appropriate data
    if x[1] == "cmp":
        base_data = regional_pfs_cmp[regional_pfs_cmp['size_grp'] == x[3]].copy()
    elif x[1] == "hml":
        base_data = regional_pfs.copy()
    else:
        # Without this branch an unknown type would silently reuse the
        # previous iteration's base_data (or fail later with a NameError).
        raise ValueError(f"Unknown portfolio type {x[1]!r} in search_list entry {key!r}")

    # A single region is given as a plain string; normalise to a list for isin()
    if isinstance(regions, str):
        regions = [regions]

    # Restrict to the configured sample window and the requested region(s)
    data = base_data[(base_data['eom'] >= settings['start_date']) & (base_data['eom'] <= settings['end_date']) & (base_data['region'].isin(regions))]
    data = eb_prepare(data, scale_alphas=settings['eb']['scale_alpha'], overlapping=settings['eb']['overlapping'])
    # Stop after the first entry: the step-by-step cells that follow read
    # `x` and `data` from this iteration. NOTE(review): `eb_est` is never
    # filled in this cell.
    break

Region: us


In [51]:
# Pull the empirical-Bayes options out of the settings mapping in one go
min_obs, fix_alpha, bs_cov, shrinkage, cor_type, bs_samples = (
    settings['eb'][option]
    for option in ('min_obs', 'fix_alpha', 'bs_cov', 'shrinkage', 'cor_type', 'bs_samples')
)
# Third element of the search_list entry selected by the loop above
layers = x[2]
seed = settings['seed']
# No covariance matrix or priors supplied up front
sigma = priors = None

### Step By Step: Empirical Bayes

In [52]:
# Seed NumPy's global random number generator for the steps that follow
np.random.seed(seed)

In [53]:
# Independent copy of the "wide" frame returned by eb_prepare
y_raw = data["wide"].copy()

In [54]:
# Enforce the minimum number of non-missing observations per column
obs = y_raw.count()
y = y_raw[obs.index[obs >= min_obs]]
n_fcts = y.shape[1]

In [55]:
# Column-wise mean of the retained columns
y_mean = y.mean()

In [114]:
# Build the covariance matrix `sigma` of the column means unless one was
# supplied. NOTE: once this cell has run, `sigma` is no longer None, so a
# second run takes the `else` branch unless `sigma` is reset first.
if sigma is None:
    if bs_cov:
        # Bootstrap: resample rows with replacement and record the column
        # means of every draw
        bs_samples_list = []
        for i in tqdm(range(bs_samples)):
            sample = resample(y, replace=True)
            bs_samples_list.append(sample.mean())

        bs_full = pd.DataFrame(bs_samples_list)
        bs_full_cov = bs_full.cov()

        alpha_sd = pd.Series(np.sqrt(np.diag(bs_full_cov)), index=y_mean.index)
        alpha_cor = bs_full.corr()
    else:
        # np.nanstd uses ddof=0 and the divisor is the total number of rows,
        # not the per-column count of non-missing values.
        y_sd = pd.Series(np.nanstd(y, axis=0), index=y.columns)
        alpha_sd = y_sd / np.sqrt(y.shape[0])
        alpha_cor = y.corr()

    # Shrink the correlation matrix towards the identity
    alpha_cor_shrunk = alpha_cor * (1 - shrinkage) + np.eye(n_fcts) * shrinkage
    if cor_type == "sample":
        alpha_cor_adj = alpha_cor_shrunk
    elif cor_type == "block_clusters":
        # The parameter is named `cl_labels`; passing `cluster_labels=...`
        # raised a TypeError.
        alpha_cor_adj = block_cluster_func(alpha_cor_shrunk, cl_labels=cluster_labels)
    else:
        raise ValueError(f"Unknown cor_type: {cor_type!r}")

    # sigma = diag(sd) @ cor @ diag(sd), computed elementwise and aligned by
    # label, so it does not depend on the two objects sharing one ordering
    # and a single NaN correlation no longer spreads over whole rows.
    sigma_labels = alpha_cor_adj.columns
    sd_aligned = alpha_sd.reindex(sigma_labels).to_numpy()
    cor_aligned = alpha_cor_adj.reindex(index=sigma_labels).to_numpy()
    __corr = sd_aligned[:, None] * cor_aligned * sd_aligned[None, :]
    sigma = pd.DataFrame(__corr, index=sigma_labels, columns=sigma_labels)
else:
    # A covariance matrix was supplied: recover the standard deviations from
    # its diagonal (the previous assignment went to a misspelled name).
    alpha_sd = np.sqrt(np.diag(sigma))

100%|██████████| 10000/10000 [00:05<00:00, 1797.12it/s]


In [128]:
# Display the column means
y_mean

name_wide
age__us                 -0.153475
aliq_at__us              0.312448
aliq_mat__us            -0.352743
ami_126d__us             0.050272
at_be__us               -0.158214
                           ...   
turnover_var_126d__us   -0.128605
z_score__us             -0.003803
zero_trades_126d__us     0.376804
zero_trades_21d__us      0.189812
zero_trades_252d__us     0.440058
Length: 153, dtype: float64