In [1]:
import numpy as np 
import pandas as pd 
from pathlib import Path 
from settings import settings 
from scipy.linalg import eigh
from scipy.stats import bootstrap
from sklearn.utils import resample

## Define the search list

In [2]:
# Run configuration, keyed by run name. Each value is a list:
#   [region or list of regions, portfolio type ("hml" / "cmp"), layers, optional size group]
# Insertion order matters: it is the order in which the runs are executed below.
search_list = {
    region: [region, "hml", 2] for region in ("us", "developed", "emerging")
}
# The pooled run uses all three regions at once and one extra layer.
search_list["all"] = [["us", "developed", "emerging"], "hml", 3]
search_list.update(
    {region: [region, "hml", 2] for region in ("world", "world_ex_us")}
)
# US portfolios split by size group.
search_list.update(
    {
        f"us_{size}": ["us", "cmp", 2, size]
        for size in ("mega", "large", "small", "micro", "nano")
    }
)


In [3]:
# Directory (relative to the notebook's working directory) holding the parquet inputs read below.
data_path = Path("result")

In [4]:
# Read the three parquet inputs from `data_path`, in the same order as before:
# size-group portfolios, regional portfolios, and the characteristic -> cluster mapping.
regional_pfs_cmp, regional_pfs, cluster_labels = map(
    pd.read_parquet,
    (
        data_path / "regional_pfs_cmp.parquet",
        data_path / "regional_pfs.parquet",
        data_path / "cluster_labels.parquet",
    ),
)

## Define the empirical Bayes estimation

In [5]:
def eb_prepare(data, scale_alphas, overlapping):
    """Market-neutralise factor returns and reshape them for the empirical Bayes step.

    Parameters
    ----------
    data : pd.DataFrame
        Long frame with at least the columns 'region', 'characteristic',
        'eom', 'ret' and 'mkt_vw_exc'. The caller's frame is never modified.
    scale_alphas : bool
        If True the wide output holds 'ret_neu_scaled', otherwise 'ret_neu'.
    overlapping : bool
        If True, apply the row filter described in the NOTE below before
        neutralising.

    Returns
    -------
    dict
        {"long": long frame with 'ret_neu', 'ret_neu_scaled' and 'name_wide'
        added, "wide": 'eom' x 'name_wide' pivot of the selected return column}.
    """
    group_keys = ['region', 'characteristic']

    # Work on a private copy up front so helper columns never leak into the
    # caller's frame (previously 'obs' was written before the copy was taken).
    data = data.copy()

    if overlapping:
        data['obs'] = data.groupby(group_keys)['region'].transform('size')
        # 'obs' is constant within a group, so idxmax returns the first row
        # label of every group and this keeps exactly one row per
        # (region, characteristic).
        # NOTE(review): that looks unintended for an "overlapping sample"
        # filter -- confirm the intended rule; behaviour is kept as it was.
        data = data.loc[data.groupby(group_keys)['obs'].idxmax()]
        data = data.drop(columns=['obs'])

    # Residual return after removing the market component, per group:
    #   ret - cov(ret, mkt) / var(mkt) * mkt
    # An explicit loop is used instead of groupby.apply because apply stacks
    # Series results into a wide DataFrame when there is only a single group.
    pieces = []
    for _, grp in data.groupby(group_keys):
        beta = grp['ret'].cov(grp['mkt_vw_exc']) / grp['mkt_vw_exc'].var()
        pieces.append(grp['ret'] - beta * grp['mkt_vw_exc'])
    ret_neu = pd.concat(pieces) if pieces else pd.Series(dtype=float)
    # Assignment aligns on the row index; the factor 100 rescales the returns.
    data['ret_neu'] = ret_neu * 100

    # Rescale every series to the same standard deviation, sqrt(10**2 / 12).
    target_sd = np.sqrt(10**2 / 12)
    scaling_fct = target_sd / data.groupby(group_keys)['ret_neu'].transform('std')
    data['ret_neu_scaled'] = data['ret_neu'] * scaling_fct
    data['name_wide'] = data['characteristic'] + '__' + data['region']

    value_col = 'ret_neu_scaled' if scale_alphas else 'ret_neu'
    data_wide = data.pivot(index='eom', columns='name_wide', values=value_col)
    return {
        "long": data,
        "wide": data_wide
    }


def block_cluster_func(cor_mat: pd.DataFrame, cl_labels: pd.DataFrame):
    """Replace each correlation by the average of its (cluster, region) block.

    Parameters
    ----------
    cor_mat : pd.DataFrame
        Square correlation matrix whose row and column labels have the form
        '<characteristic>__<region>'.
    cl_labels : pd.DataFrame
        Mapping with the columns 'characteristic' and 'hcl_label'.

    Returns
    -------
    pd.DataFrame
        Matrix with the same labels, in the same order as ``cor_mat``. The
        diagonal is 1; every off-diagonal entry is the mean correlation of all
        distinct factor pairs that share its unordered pair of
        '<cluster>__<region>' blocks. Row/column axes are named
        'name1'/'name2'.
    """
    labels = cl_labels[['characteristic', 'hcl_label']]

    # Long format: one row per (row factor, column factor) pair.
    cor_long = (
        cor_mat.rename_axis("index")
        .reset_index()
        .melt(id_vars='index', var_name='char2', value_name='cor')
    )
    # Split each label into factor name and region.
    cor_long[['char2', 'region2']] = cor_long['char2'].str.split('__', expand=True)
    cor_long[['char1', 'region1']] = cor_long['index'].str.split('__', expand=True)

    # Attach the cluster label of each side of the pair.
    for side in ('1', '2'):
        cor_long = cor_long.merge(
            labels.rename(columns={'hcl_label': 'hcl' + side}),
            left_on='char' + side,
            right_on='characteristic',
            how='left',
        ).drop(columns='characteristic')

    # Combine each factor's cluster with its region.
    cor_long['hclreg1'] = cor_long['hcl1'] + '__' + cor_long['region1']
    cor_long['hclreg2'] = cor_long['hcl2'] + '__' + cor_long['region2']

    # Order-independent block identifier. A plain zip is used instead of a
    # row-wise DataFrame.apply: same result, far less overhead on n*n rows.
    cor_long['hcl_pair'] = [
        '_x_'.join(sorted(pair))
        for pair in zip(cor_long['hclreg1'], cor_long['hclreg2'])
    ]
    cor_long['name1'] = cor_long['char1'] + '__' + cor_long['region1']
    cor_long['name2'] = cor_long['char2'] + '__' + cor_long['region2']

    # Mean correlation inside each block, excluding a factor with itself.
    on_diagonal = cor_long['name1'] == cor_long['name2']
    cluster_wise_cor = (
        cor_long[~on_diagonal]
        .groupby('hcl_pair')['cor']
        .mean()
        .reset_index(name='cor_avg')
    )
    cor_long = cor_long.merge(cluster_wise_cor, on='hcl_pair', how='left')
    cor_long['cor_avg'] = np.where(
        cor_long['name1'] == cor_long['name2'], 1, cor_long['cor_avg']
    )

    block_cor = cor_long.pivot(index='name1', columns='name2', values='cor_avg')
    # pivot sorts labels alphabetically; restore the caller's ordering so the
    # result can be combined positionally with other per-factor vectors.
    block_cor = block_cor.reindex(index=cor_mat.index, columns=cor_mat.columns)
    return block_cor.rename_axis(index='name1', columns='name2')
    

In [6]:
def emp_bayes(
        data: pd.DataFrame,
        cluster_labels: pd.DataFrame, 
        min_obs=5*12, 
        fix_alpha=False,
        bs_cov=False,
        cor_type="sample",
        shrinkage=0,
        layers=3,
        bs_samples=10000,
        seed=None,
        priors=None,
        sigma=None,
        plot=True
    ):
    """Build the alpha covariance matrix and starting values for the EB model.

    Parameters
    ----------
    data : dict
        Mapping with a 'wide' entry: a DataFrame of returns whose columns are
        named '<characteristic>__<region>' (only 'wide' is read here).
    cluster_labels : pd.DataFrame
        Mapping with the columns 'characteristic' and 'hcl_label'.
    min_obs : int
        Columns of 'wide' with fewer non-missing values are dropped.
    fix_alpha : bool
        If True, 'sd_all' is computed around zero instead of the sample mean.
    bs_cov : bool
        If True, standard deviations and correlations of the mean returns are
        estimated by a row bootstrap; otherwise from the sample directly.
    cor_type : {"sample", "block_clusters"}
        "block_clusters" averages correlations through ``block_cluster_func``.
    shrinkage : float
        Weight put on the identity matrix when shrinking the correlations.
    bs_samples : int
        Number of bootstrap draws (only used when ``bs_cov`` is True).
    seed : int or None
        Seed of the bootstrap random generator; None draws fresh entropy.
    sigma : pd.DataFrame or None
        Pre-computed covariance matrix. When given, no correlation matrices
        are estimated and the corresponding outputs are None.
    layers, priors, plot
        Accepted for interface compatibility; not used by this function.

    Returns
    -------
    dict
        Keys: 'sigma', 'alpha_sd', 'alpha_cor_shrunk', 'alpha_cor_adj',
        'starting_values', 'sd_all'.

    Raises
    ------
    ValueError
        If ``sigma`` is None and ``cor_type`` is not a supported value.
    """
    y_raw = data['wide']
    n_obs_per_col = y_raw.notna().sum()
    y = y_raw.loc[:, n_obs_per_col[n_obs_per_col >= min_obs].index]

    # Number of factors kept after the min_obs filter.
    n_fcts = y.shape[1]
    # Time-series mean return of every kept factor.
    y_mean = np.nanmean(y.to_numpy(), axis=0)

    if sigma is None:
        if cor_type not in ("sample", "block_clusters"):
            raise ValueError(
                f"cor_type must be 'sample' or 'block_clusters', got {cor_type!r}"
            )

        if bs_cov:
            # Seeded generator so the bootstrap is reproducible for a given seed.
            rng = np.random.default_rng(seed)
            n_rows = len(y)
            # Sample rows with replacement and record the column means.
            bs_means = pd.DataFrame([
                y.iloc[rng.integers(0, n_rows, size=n_rows)].mean()
                for _ in range(bs_samples)
            ])
            alpha_sd = bs_means.std()
            alpha_cor = bs_means.corr()
        else:
            alpha_sd = y.std() / np.sqrt(y.shape[0])
            alpha_cor = y.corr()

        # Shrink the correlations toward the identity matrix.
        alpha_cor_shrunk = alpha_cor * (1 - shrinkage) + np.eye(n_fcts) * shrinkage

        # Optional block averaging of the correlations.
        if cor_type == "sample":
            alpha_cor_adj = alpha_cor_shrunk.copy()
        else:
            alpha_cor_adj = block_cluster_func(alpha_cor_shrunk, cluster_labels)

        # The product below is positional, so make sure the correlation matrix
        # is ordered exactly like alpha_sd (i.e. like the columns of y).
        if not (alpha_cor_adj.index.equals(y.columns)
                and alpha_cor_adj.columns.equals(y.columns)):
            alpha_cor_adj = alpha_cor_adj.reindex(index=y.columns, columns=y.columns)

        sd_diag = np.diag(np.asarray(alpha_sd))
        sigma = pd.DataFrame(
            sd_diag @ alpha_cor_adj.values @ sd_diag,
            index=alpha_cor_adj.columns,
            columns=alpha_cor_adj.columns,
        )
    else:
        alpha_sd = np.sqrt(np.diag(sigma))
        # Nothing was estimated; keep the return dict well-defined.
        alpha_cor_shrunk = None
        alpha_cor_adj = None

    # One row per kept factor; 'value' is the factor's mean return. The names
    # come from the filtered columns so they always match y_mean in length.
    cm = pd.DataFrame({'char_reg': y.columns, 'value': y_mean})
    cm['characteristic'] = cm['char_reg'].str.split('__').str[0]
    cm = cm.merge(cluster_labels, on="characteristic", how="left")

    # NOTE(review): both aggregation stages group by 'hcl_label', so the second
    # stage sees one row per cluster and 'cl_sd' (std of a single value) is
    # always NaN. If the first stage was meant to group by cluster and
    # characteristic, confirm and adjust; behaviour is kept as it was.
    starting_values = cm.groupby('hcl_label').agg(
            # Number of factors in the cluster
            n_s=('value', 'size'),
            # Mean of the factor mean returns in the cluster
            signal_mean=('value', 'mean'),
            # Standard deviation of the factor mean returns in the cluster
            signal_sd=('value', 'std')
        ).groupby('hcl_label').agg(
            n_c=('n_s', 'sum'),
            cl_mean=('signal_mean', 'mean'),
            cl_sd=('signal_mean', 'std'),
            cl_signal_within=('signal_sd', 'mean')
        ).reset_index()

    if fix_alpha:
        # Dispersion around zero rather than around the sample mean.
        sd_all = np.sqrt(np.sum(y_mean ** 2) / (len(y_mean) - 1))
    else:
        sd_all = np.std(y_mean)

    return {
        "sigma": sigma, 
        "alpha_sd": alpha_sd, 
        'alpha_cor_shrunk': alpha_cor_shrunk,
        'alpha_cor_adj': alpha_cor_adj,
        'starting_values': starting_values,
        'sd_all': sd_all, 
    } 


In [7]:
# Echo the run configuration so the keys and specs iterated over in the next cell are visible.
search_list

{'us': ['us', 'hml', 2],
 'developed': ['developed', 'hml', 2],
 'emerging': ['emerging', 'hml', 2],
 'all': [['us', 'developed', 'emerging'], 'hml', 3],
 'world': ['world', 'hml', 2],
 'world_ex_us': ['world_ex_us', 'hml', 2],
 'us_mega': ['us', 'cmp', 2, 'mega'],
 'us_large': ['us', 'cmp', 2, 'large'],
 'us_small': ['us', 'cmp', 2, 'small'],
 'us_micro': ['us', 'cmp', 2, 'micro'],
 'us_nano': ['us', 'cmp', 2, 'nano']}

In [8]:
# Run the empirical Bayes preparation for every entry of `search_list` and
# collect the results in `eb_est`, keyed by run name.
eb_est = {}
for key, x in search_list.items():
    print(f"Region: {x[0]}")
    # x[0] is either a single region name or a list of region names.
    regions = [x[0]] if isinstance(x[0], str) else x[0]

    # Select the appropriate data
    if x[1] == "cmp":
        # Size-group portfolios: x[3] names the size group to keep.
        base_data = regional_pfs_cmp[regional_pfs_cmp['size_grp'] == x[3]]
    elif x[1] == "hml":
        base_data = regional_pfs
    else:
        # Fail loudly instead of silently reusing base_data from the previous run.
        raise ValueError(f"Unknown portfolio type {x[1]!r} for search_list entry {key!r}")

    in_sample = (
        (base_data['eom'] >= settings['start_date'])
        & (base_data['eom'] <= settings['end_date'])
        & (base_data['region'].isin(regions))
    )
    # Copy only the filtered rows: cheaper than copying the whole source frame,
    # and the frame handed to eb_prepare is an independent object, not a slice.
    data = base_data[in_sample].copy()
    data = eb_prepare(data, scale_alphas=settings['eb']['scale_alpha'], overlapping=settings['eb']['overlapping'])
    op = emp_bayes(
        data=data, 
        cluster_labels=cluster_labels, 
        min_obs=settings['eb']['min_obs'], 
        fix_alpha=settings['eb']['fix_alpha'], 
        bs_cov=settings['eb']['bs_cov'],
        layers=x[2], 
        shrinkage=settings['eb']['shrinkage'], 
        cor_type=settings['eb']['cor_type'], 
        bs_samples=settings['eb']['bs_samples'], 
        seed=settings['seed'], 
        sigma = None,
    )
    eb_est[key] = op

Region: us
