In [None]:
import os

import pandas as pd
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt
    
from glob import glob
from tqdm import tqdm

In [None]:
# Identifier of the portfolio being analysed. It names the CSV read from Factor_port
# and the sub-directory used under result; the token before the first '_' (e.g. 'ViT')
# is the column taken from that CSV as the model's own factor.
model = 'ViT_Cap_VW'  # or 'CNN_Cap_VW'

In [None]:
# Load the factor-portfolio returns for the selected model; the first CSV column
# becomes a datetime index.
# os.path.join keeps the path portable: the previous raw string r'Factor_port\\'
# produced a doubled backslash that only resolves on Windows.
port = pd.read_csv(os.path.join('Factor_port', model + '.csv'), index_col=0)
port.index = pd.to_datetime(port.index)

In [None]:
# Sample window; both end points are inclusive.
start_date = pd.Timestamp('2001-01-01')
end_date = pd.Timestamp('2024-12-31')

# Keep only the portfolio rows dated inside the window.
port = port.loc[lambda frame: (frame.index >= start_date) & (frame.index <= end_date)]

In [None]:
# Load the pre-processed factor tables and the test portfolios, each indexed by date
# and restricted to [start_date, end_date] (label slicing, both ends inclusive).
# Paths are built with os.path.join so they do not depend on the Windows separator.
data_dir = os.path.join('data', 'processed_kelly')

# ff3 / ff5: the date is the first CSV column.
ff3 = pd.read_csv(os.path.join(data_dir, 'ff3.csv'), index_col=0)
ff3.index = pd.to_datetime(ff3.index)
ff3 = ff3.loc[start_date:end_date]

ff5 = pd.read_csv(os.path.join(data_dir, 'ff5.csv'), index_col=0)
ff5.index = pd.to_datetime(ff5.index)
ff5 = ff5.loc[start_date:end_date]

# q5 / test_portfolios: the date lives in a column named 'date'; the index name is
# cleared after slicing.
q5 = pd.read_csv(os.path.join(data_dir, 'q5.csv')).set_index('date')
q5.index = pd.to_datetime(q5.index)
q5 = q5.loc[start_date:end_date]
q5.index.name = None

test_portfolios = pd.read_csv(os.path.join(data_dir, 'test_portfolios.csv')).set_index('date')
test_portfolios.index = pd.to_datetime(test_portfolios.index)
test_portfolios = test_portfolios.loc[start_date:end_date]
test_portfolios.index.name = None

In [None]:
# Load the ff_mom table (date in the first CSV column) and restrict it to the
# sample window. os.path.join avoids the Windows-only backslash path.
mom = pd.read_csv(os.path.join('data', 'processed_kelly', 'ff_mom.csv'), index_col=0)
mom.index = pd.to_datetime(mom.index)
mom = mom.loc[start_date:end_date]

In [None]:
# Load the kelly_factor table, index it by its 'date' column and restrict it to the
# sample window. set_index is chained (no inplace mutation) and the path is built with
# os.path.join so it is not tied to the Windows separator.
kelly_factors = pd.read_csv(
    os.path.join('data', 'processed_kelly', 'kelly_factor.csv')
).set_index('date')
kelly_factors.index = pd.to_datetime(kelly_factors.index)
kelly_factors = kelly_factors.loc[start_date:end_date]
kelly_factors.index.name = None

In [None]:
# Give every factor / test-asset table the portfolio's dates as its index so that all
# frames share one index. The assignment is purely positional: row k of each table
# receives port's k-th date.
# NOTE(review): this assumes each table has the same rows, in the same order, as
# `port` — confirm; a date mismatch with equal row counts would go unnoticed.
for _frame in (ff3, ff5, q5, test_portfolios, mom, kelly_factors):
    _frame.index = port.index

# 'RF' column of the ff3 table.
rf = ff3['RF']

In [None]:
# Assemble the factor zoo, one column per factor. Column order matters downstream:
# every column but the last is used as a candidate factor, and the last column is the
# model's own factor (the column of `port` named by the token before the first '_').
# kelly_factors and the model factor are multiplied by 100 and rounded to 4 decimals;
# the ff5 and q5 columns are used as loaded (presumably already in percent — TODO confirm).
factor_zoo = pd.concat(
    [
        np.round(kelly_factors * 100, 4),
        ff5[['Mkt-RF', 'SMB', 'HML', 'RMW', 'CMA']],
        q5[['R_ME', 'R_IA', 'R_ROE']],
        np.round(port[[model.split('_')[0]]] * 100, 4),
    ],
    axis='columns',
)

In [None]:
# Make sure the per-model output directory result/<model> exists.
# os.path.join replaces the raw string r'result\\', which produced a doubled backslash
# and only resolved on Windows.
os.makedirs(os.path.join('result', model), exist_ok=True)

In [None]:
# === univariate beta calculation ===
# For every candidate factor (all factor_zoo columns except the last), regress each
# test portfolio's returns on a constant plus that single factor and keep the slope.

beta_k_lst = []
for factor_name in tqdm(factor_zoo.columns[:-1]):
    # Build the [const, factor] design once per factor. The previous version rebound
    # the loop-level regressor to a constant-augmented frame part-way through the asset
    # loop, which only kept working because add_constant skips an existing constant.
    design = sm.add_constant(factor_zoo[factor_name])
    betas = []

    for asset in test_portfolios.columns:
        y_i = test_portfolios[asset]
        # Fit only on the rows where this portfolio has a return; when nothing is
        # missing the mask is all True and the full sample is used.
        observed = y_i.notna().to_numpy()
        fit = sm.OLS(y_i[observed], design.loc[observed]).fit()
        betas.append(fit.params[factor_name])

    beta_k_lst.append(betas)

# Saved layout: rows = test portfolios (positional), columns = factor names.
pd.DataFrame(beta_k_lst, index=list(factor_zoo.columns[:-1])).T.to_csv('data/beta_k.csv')

In [None]:
# Reload the univariate betas written to data/beta_k.csv (first CSV column = row index).
# NOTE: this rebinds beta_k_lst from the in-memory list of lists to a DataFrame.
# os.path.join keeps the read path consistent with the forward-slash path used when
# the file is written, instead of a Windows-only backslash literal.
beta_k_lst = pd.read_csv(os.path.join('data', 'beta_k.csv'), index_col=0)

In [None]:
# Split the factor zoo into NumPy arrays (copies, detached from the DataFrame):
#   h_t – every column except the last, shape (n_rows, n_columns - 1)
#   g_t – the last column only, kept 2-D with shape (n_rows, 1)
h_t = factor_zoo.iloc[:, :-1].to_numpy(copy=True)
g_t = factor_zoo.iloc[:, [-1]].to_numpy(copy=True)

In [None]:
from sklearn.linear_model import Lasso
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

import warnings
from sklearn.exceptions import ConvergenceWarning
# Suppress scikit-learn ConvergenceWarning messages for the rest of the session.
# NOTE(review): this also hides genuine non-convergence of the Lasso fits in this
# notebook — confirm that is acceptable.
warnings.filterwarnings("ignore", category=ConvergenceWarning)

In [None]:
# Per-factor weights derived from the saved univariate betas: the mean squared beta of
# each factor (averaged over the rows of beta_k.csv), rescaled so the weights average 1.
# os.path.join replaces the Windows-only backslash path literal.
beta_k_df = pd.read_csv(os.path.join('data', 'beta_k.csv'), index_col=0)
mean_betas = (beta_k_df ** 2).mean(axis=0)
penalty = mean_betas / mean_betas.mean()

In [None]:
best_lambda1_candidate = []
best_lambda1_1serule_candidate = []

# Gather the first-LASSO error files saved under result/<model>/ (any file whose name
# contains 'lasso1'). The pattern is built with os.path.join so it also matches on
# non-Windows systems; sorting makes the row order of seed_mse reproducible.
lasso1_paths = sorted(glob(os.path.join('result', model, '*lasso1*')))
if not lasso1_paths:
    # Fail with a clear message instead of pd.concat's "No objects to concatenate".
    raise FileNotFoundError(
        "No '*lasso1*' result files found in " + os.path.join('result', model)
    )

# Each file is reduced to its column-wise mean; seed_mse then holds one row per file
# and one column per CSV column.
seed_mses = [pd.read_csv(path).mean(axis=0) for path in lasso1_paths]
seed_mse = pd.concat(seed_mses, axis=1).T

In [None]:
# Choose the first-LASSO lambda index from the errors collected in seed_mse.
# Column labels are cast to int and later used to index the lambda grid, so they are
# assumed to be grid positions.
mean_mse = seed_mse.mean()

# Column with the lowest mean error (first one on ties).
min_label = mean_mse.idxmin()
seed_mse_argmin_idx = int(min_label)

# Threshold = minimum mean error + standard error at that same column. Both terms are
# looked up by label, so a label/position mismatch cannot select the wrong column.
# NOTE(review): 200 is hard-coded — presumably the number of runs behind each column;
# confirm (len(seed_mse) may be the intended count).
threshold = mean_mse.loc[min_label] + (seed_mse.std() / np.sqrt(200)).loc[min_label]

# Highest grid index whose mean error is within the threshold. Labels read back from
# CSV are strings, so they are compared as integers; compared as strings, '9' would
# rank above '85' and the wrong index could be returned.
# NOTE(review): on the decreasing lambda grid used in this notebook the highest index
# is the smallest lambda — confirm this is the intended direction of the rule.
first_lambda_idx = max(int(label) for label in mean_mse[mean_mse <= threshold].index)

In [None]:
best_lambda2_candidate = []
best_lambda2_1serule_candidate = []

# Gather the second-LASSO error files saved under result/<model>/ (any file whose name
# contains 'lasso2'). The previous pattern used '\*' inside a normal string literal —
# an invalid escape sequence — and Windows-only separators; os.path.join fixes both.
# Sorting makes the row order of seed_mse reproducible.
lasso2_paths = sorted(glob(os.path.join('result', model, '*lasso2*')))
if not lasso2_paths:
    # Fail with a clear message instead of pd.concat's "No objects to concatenate".
    raise FileNotFoundError(
        "No '*lasso2*' result files found in " + os.path.join('result', model)
    )

# Each file is reduced to its column-wise mean; seed_mse then holds one row per file
# and one column per CSV column. NOTE: this rebinds seed_mse / seed_mses.
seed_mses = [pd.read_csv(path).mean(axis=0) for path in lasso2_paths]
seed_mse = pd.concat(seed_mses, axis=1).T

In [None]:
# Choose the second-LASSO lambda index from the errors currently held in seed_mse.
# Column labels are cast to int and later used to index the lambda grid, so they are
# assumed to be grid positions.
mean_mse = seed_mse.mean()

# Column with the lowest mean error (first one on ties).
min_label = mean_mse.idxmin()
seed_mse_argmin_idx = int(min_label)

# Threshold = minimum mean error + standard error at that same column. Both terms are
# looked up by label, so a label/position mismatch cannot select the wrong column.
# NOTE(review): 200 is hard-coded — presumably the number of runs behind each column;
# confirm (len(seed_mse) may be the intended count).
threshold = mean_mse.loc[min_label] + (seed_mse.std() / np.sqrt(200)).loc[min_label]

# Highest grid index whose mean error is within the threshold. Labels read back from
# CSV are strings, so they are compared as integers; compared as strings, '9' would
# rank above '85' and the wrong index could be returned.
# NOTE(review): on the decreasing lambda grid used in this notebook the highest index
# is the smallest lambda — confirm this is the intended direction of the rule.
second_lambda_idx = max(int(label) for label in mean_mse[mean_mse <= threshold].index)

In [None]:
# Penalty grids for the two LASSO stages: NumLambda values whose logs are evenly
# spaced from 0 down to -35, i.e. lambdas running from exp(0) = 1 to exp(-35).
NumLambda = 100

_log_lambda_grid = np.linspace(0, -35, NumLambda)
lambdas1 = np.exp(_log_lambda_grid)
lambdas2 = np.exp(_log_lambda_grid)

In [None]:
# Cross-sectional LASSO fits at the selected grid positions
# (lambdas1[first_lambda_idx] and lambdas2[second_lambda_idx]).

# Covariance of every test portfolio with (a) each candidate factor in h_t and
# (b) the model factor in g_t, using only the rows where the portfolio return exists.
first_lasso_cov_lst = []
second_lasso_cov_lst = []
for idx in range(test_portfolios.shape[1]):
    returns = test_portfolios.iloc[:, idx]
    not_nan_idx = returns.notna().to_numpy()
    y_i = returns[not_nan_idx]
    # np.cov puts y_i in row 0, so [0, 1:] is cov(y_i, each column of the factor block).
    first_lasso_cov_lst.append(np.cov(y_i, h_t[not_nan_idx].T, ddof=1)[0, 1:])
    second_lasso_cov_lst.append(np.cov(y_i, g_t[not_nan_idx].T, ddof=1)[0, 1:])

first_lasso_cov = np.array(first_lasso_cov_lst)    # (n_portfolios, n_candidates)
second_lasso_cov = np.array(second_lasso_cov_lst)  # (n_portfolios, 1)

# Average return of each test portfolio as a column vector.
mean_ri = np.mean(test_portfolios, axis=0).values.reshape(-1, 1)

# Design matrix shared by both fits: candidate covariances scaled column-wise by penalty.
weighted_cov = first_lasso_cov * penalty.values

# First LASSO: average returns on the weighted covariances.
lasso1 = Lasso(alpha=lambdas1[first_lambda_idx], fit_intercept=True, max_iter=10000, tol=1e-5)
lasso1.fit(weighted_cov, mean_ri)
lasso1_pred = lasso1.predict(weighted_cov)
# Flatten both sides before subtracting: a 1-D prediction minus the (n, 1) target would
# broadcast to an n x n matrix and the printed value would not be the fit's MSE.
print(np.mean((np.ravel(lasso1_pred) - np.ravel(mean_ri)) ** 2))

# Second LASSO: covariances with the model factor on the same weighted covariances.
lasso2 = Lasso(alpha=lambdas2[second_lambda_idx], fit_intercept=True, max_iter=10000, tol=1e-5)
lasso2.fit(weighted_cov, second_lasso_cov)
lasso2_pred = lasso2.predict(weighted_cov)
print(np.mean((np.ravel(lasso2_pred) - np.ravel(second_lasso_cov)) ** 2))

In [None]:
# final ols
# Cross-sectional regression of average test-portfolio returns on the covariances with
# the factors kept by either LASSO plus the model's own factor (always the last regressor).

y3 = test_portfolios.mean()

# Union of both selections as a boolean mask: each factor appears once and in
# factor_zoo column order (a set union gave a different order from run to run).
candidate_names = factor_zoo.columns[:-1]
selected = (lasso1.coef_ != 0) | (lasso2.coef_ != 0)
feat_names = list(candidate_names[selected]) + [model.split('_')[0]]

# Covariance of each test portfolio with every selected factor, using only the rows
# where the portfolio return exists.
factor_values = factor_zoo[feat_names].to_numpy()
ols_cov = []
for idx in range(test_portfolios.shape[1]):
    y_i = test_portfolios.iloc[:, idx].values
    not_nan_idx = ~np.isnan(y_i)
    # np.cov puts y_i in row 0, so [0, 1:] is cov(y_i, each selected factor).
    ols_cov.append(np.cov(y_i[not_nan_idx], factor_values[not_nan_idx].T, ddof=1)[0, 1:])

# Index the regressors like y3 so statsmodels accepts the labelled frame; the summary
# then reports factor names instead of x1, x2, ...
x = pd.DataFrame(np.array(ols_cov), index=y3.index, columns=feat_names)
x = sm.add_constant(x)
var_names = list(x.columns)
ols_model = sm.OLS(y3, x).fit()

In [None]:
# Candidate factors with a non-zero coefficient in lasso1.
factor_zoo.columns[:-1][lasso1.coef_ != 0].tolist()

In [None]:
# Candidate factors with a non-zero coefficient in lasso2.
factor_zoo.columns[:-1][lasso2.coef_ != 0].tolist()

In [None]:
# Print the current feat_names list (the regressor names built for the final OLS).
print(feat_names)

In [None]:
# Display the statsmodels summary table of the fitted OLS model.
ols_model.summary()

In [None]:
# robust check
# Re-run the two LASSO fits and the final OLS on a 15 x 15 grid of penalties around the
# selected ones and record the t-statistic of the model's own factor each time.
# NOTE: this cell rebinds lasso1, lasso2, feat_names, x, var_names, y3 and ols_model, so
# cells that display them show the last grid point after this cell has run.

# Shifts applied to log(lambda), in hundredths: -0.35, -0.30, ..., +0.35.
lambda_shifts = range(-35, 36, 5)
t_stat_matrix = np.zeros((len(lambda_shifts), len(lambda_shifts)))

# Loop-invariant inputs, computed once.
second_lasso_cov = np.array(second_lasso_cov_lst)
weighted_cov = first_lasso_cov * penalty.values
y3 = test_portfolios.mean()
candidate_names = factor_zoo.columns[:-1]
target_name = model.split('_')[0]

for i_index, i in enumerate(tqdm(lambda_shifts)):
    # The first LASSO depends only on i, so it is fitted once per row of the grid.
    lambda1 = np.exp(np.log(lambdas1[first_lambda_idx]) + i / 100)
    lasso1 = Lasso(alpha=lambda1, fit_intercept=True, max_iter=10000, tol=1e-5)
    lasso1.fit(weighted_cov, mean_ri)

    for j_index, j in enumerate(lambda_shifts):
        lambda2 = np.exp(np.log(lambdas2[second_lambda_idx]) + j / 100)
        lasso2 = Lasso(alpha=lambda2, fit_intercept=True, max_iter=10000, tol=1e-5)
        lasso2.fit(weighted_cov, second_lasso_cov)

        # final ols
        # Union of both selections as a mask, so a factor kept by both LASSOs enters the
        # regression once. Concatenating the two lists duplicated such factors and put
        # identical columns into the design matrix.
        selected = (lasso1.coef_ != 0) | (lasso2.coef_ != 0)
        feat_names = list(candidate_names[selected]) + [target_name]

        factor_values = factor_zoo[feat_names].to_numpy()
        ols_cov = []
        for idx in range(test_portfolios.shape[1]):
            y_i = test_portfolios.iloc[:, idx].values
            not_nan_idx = ~np.isnan(y_i)
            # np.cov puts y_i in row 0, so [0, 1:] is cov(y_i, each selected factor).
            ols_cov.append(
                np.cov(y_i[not_nan_idx], factor_values[not_nan_idx].T, ddof=1)[0, 1:]
            )

        # Index the regressors like y3 and pass the labelled frame itself. The variable
        # names used to be passed as the third positional argument of sm.OLS, which is
        # its `missing` option, not a list of names.
        x = pd.DataFrame(np.array(ols_cov), index=y3.index, columns=feat_names)
        x = sm.add_constant(x)
        var_names = list(x.columns)
        ols_model = sm.OLS(y3, x).fit()

        # The model's own factor is always the last regressor.
        t_stat_matrix[i_index, j_index] = ols_model.tvalues.iloc[-1]

# Rows follow the lambda1 shifts, columns the lambda2 shifts.
pd.DataFrame(t_stat_matrix).to_csv(os.path.join('result', model + '_t_stat_v2.csv'))