In [2]:
import pandas as pd
import numpy as np

### Prepare raw data

In [3]:
from frozen.data.etl.dataload import DataLoadManager, DatabaseTypes
from frozen.basis import FrozenConfig
from frozen.utils import Universe

# Loader handle created for the DuckDB database type.
dataloader = DataLoadManager(DatabaseTypes.DUCKDB)
config = FrozenConfig()
# `pool` of the configured Universe; presumably the list of tickers to load -- TODO confirm.
universe = Universe(config).pool

In [4]:
# Load the daily price/volume panel for the universe, 2023-01-01 .. 2024-01-01.
data = dataloader.load_volume_price(
    "stock_daily_real",
    universe=universe,
    start_date="20230101",
    end_date="20240101",
    multiindex=True,
)
# Overwrite pct_chg with the NEXT row's value for the same ticker (shift(-1)),
# divided by 100. The last row of every ticker therefore becomes NaN.
next_pct_chg = data.groupby("ticker")["pct_chg"].shift(-1)
data["pct_chg"] = next_pct_chg / 100

In [5]:
# Number of distinct values in the "ticker" index level, i.e. how many stocks were loaded.
data.index.get_level_values("ticker").nunique()

241

In [6]:
# Display the prepared panel. Note that pct_chg was overwritten above with the
# next row's value per ticker, divided by 100.
data

Unnamed: 0_level_0,Unnamed: 1_level_0,open,high,low,close,pre_close,change,pct_chg,volume,amount
ticker,trade_date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
000001.SZ,2023-01-03,13.20,13.85,13.05,13.77,13.16,0.61,0.0399,2194127.94,2971546.989
000001.SZ,2023-01-04,13.71,14.42,13.63,14.32,13.77,0.55,0.0112,2189682.53,3110729.449
000001.SZ,2023-01-05,14.40,14.74,14.37,14.48,14.32,0.16,0.0097,1665425.18,2417272.356
000001.SZ,2023-01-06,14.50,14.72,14.48,14.62,14.48,0.14,0.0123,1195744.71,1747915.169
000001.SZ,2023-01-09,14.75,14.88,14.52,14.80,14.62,0.18,-0.0243,1057659.11,1561368.487
...,...,...,...,...,...,...,...,...,...,...
603993.SH,2023-12-25,4.90,4.95,4.87,4.93,4.90,0.03,-0.0020,761156.76,373549.544
603993.SH,2023-12-26,4.93,4.94,4.89,4.92,4.93,-0.01,0.0325,566923.36,278576.405
603993.SH,2023-12-27,4.91,5.12,4.91,5.08,4.92,0.16,0.0157,1519665.96,765831.347
603993.SH,2023-12-28,5.11,5.25,5.10,5.16,5.08,0.08,0.0078,1634153.79,846612.482


### Transform data to 3D

Both `make_3D` and `make_XY` work: each turns the long panel into a 3D feature array plus a 2D target array.

In [7]:
def make_3D(data, X_feature, y_feature):
    '''
    Reshape a long (ticker, trade_date) panel into dense numpy arrays.

    Args:
        data: pd.DataFrame whose index has levels named 'ticker' and
            'trade_date' (in either order).
        X_feature: a column name, or a list of column names, used as features.
        y_feature: name of the target column.

    Returns:
        X: np.ndarray of shape (n_dates, n_features, n_tickers).
        y: np.ndarray of shape (n_dates, n_tickers), dtype double.

        Dates and tickers are sorted along their axes. (date, ticker) pairs
        that are absent from ``data`` are NaN in both arrays.
    '''
    # Accept a single column name as well as a list, so X always has a
    # feature axis (a bare Series has no second dimension to reshape on).
    feature_cols = [X_feature] if isinstance(X_feature, str) else list(X_feature)

    X_raw = data[feature_cols]
    y_raw = data[y_feature]

    # One explicit, sorted (date, ticker) grid shared by X and y keeps the two
    # arrays aligned cell by cell.
    dates = X_raw.index.get_level_values('trade_date').unique().sort_values()
    tickers = X_raw.index.get_level_values('ticker').unique().sort_values()
    grid = pd.MultiIndex.from_product([dates, tickers], names=['trade_date', 'ticker'])
    n_dates, n_tickers = len(dates), len(tickers)

    # reorder_levels works by name, so it does not depend on the incoming
    # level order of the index.
    X_on_grid = X_raw.reorder_levels(['trade_date', 'ticker']).reindex(grid)
    y_on_grid = y_raw.reorder_levels(['trade_date', 'ticker']).reindex(grid)

    # Rows are ordered date-major, ticker-minor: reshape to
    # (date, ticker, feature), then move the feature axis to the middle.
    X = X_on_grid.to_numpy().reshape(n_dates, n_tickers, len(feature_cols))
    X = np.transpose(X, (0, 2, 1))
    y = y_on_grid.to_numpy(dtype=np.double).reshape(n_dates, n_tickers)

    return X, y

In [8]:
# Build (X, y) from the open/close columns with pct_chg as the target.
# NOTE(review): `data3d` differs from the later `data3D` only by letter case.
data3d = make_3D(data, ['open', 'close'], 'pct_chg')

In [9]:
# Inspect the (X, y) tuple returned by make_3D.
data3d

(array([[[ 13.2 ,  18.1 ,  25.88, ...,  55.02, 102.46,   4.53],
         [ 13.77,  18.23,  25.93, ...,  53.8 , 103.2 ,   4.63]],
 
        [[ 13.71,  18.25,  26.06, ...,  53.78, 102.69,   4.61],
         [ 14.32,  19.07,  26.44, ...,  54.12, 100.41,   4.57]],
 
        [[ 14.4 ,  19.  ,  26.42, ...,  54.4 , 101.84,   4.59],
         [ 14.48,  19.33,  26.44, ...,  54.35, 102.66,   4.6 ]],
 
        ...,
 
        [[  9.1 ,  10.17,  24.63, ...,  35.42,  89.25,   4.91],
         [  9.12,  10.17,  24.64, ...,  35.8 ,  90.83,   5.08]],
 
        [[  9.11,  10.15,  24.54, ...,  35.62,  90.74,   5.11],
         [  9.45,  10.52,  26.3 , ...,  36.56,  92.5 ,   5.16]],
 
        [[  9.42,  10.48,  26.25, ...,  36.49,  92.3 ,   5.13],
         [  9.39,  10.46,  26.48, ...,  37.55,  92.39,   5.2 ]]]),
 array([[ 0.0399,  0.0461,  0.0197, ...,  0.0059, -0.027 , -0.013 ],
        [ 0.0112,  0.0136,  0.    , ...,  0.0042,  0.0224,  0.0066],
        [ 0.0097, -0.0057, -0.0166, ..., -0.0077,  0.0021,  0

In [10]:
def make_XY(data, index_name, column_name, Y):
    '''
    Pivot a long panel into a 3D feature array and a 2D target array.

    Args:
        data: input pd.DataFrame with shape [trade_dates * ticker, features];
            ``index_name`` and ``column_name`` must name columns or index
            levels of it.
        index_name: key that becomes axis 0 of the outputs (trade dates)
        column_name: key that becomes the last axis of the outputs (stock codes)
        Y: name of the column holding the prediction target

    Returns: X, Y, feature_names
        X: np.ndarray, shape (n_dates, n_features, n_tickers), dtype double
        Y: np.ndarray, shape (n_dates, n_tickers), dtype double
        feature_names: pd.Index of feature names, in the same order as
            axis 1 of X
    '''

    table = data.pivot_table(index=[index_name], columns=[column_name], sort=False, dropna=False)

    # Read names and tickers from the columns as they are actually laid out.
    # ``columns.levels`` is not a safe source: levels can be stored in a
    # different (e.g. sorted) order than the columns appear in, which would
    # attach the wrong name to each slice of X.
    names_in_order = table.columns.get_level_values(0).unique()
    tickers = table.columns.get_level_values(1).unique()
    feature_names = names_in_order.drop([Y,])

    y = table[Y].reindex(columns=tickers).to_numpy(dtype=np.double)

    if len(feature_names) == 0:
        # Only the target column was present: return an empty feature axis.
        X = np.empty((len(table.index), 0, len(tickers)), dtype=np.double)
        return X, y, feature_names

    # One (n_dates, n_tickers) slab per feature, stacked on axis 1, so the
    # feature axis order is feature_names by construction.
    X = np.stack(
        [table[name].reindex(columns=tickers).to_numpy(dtype=np.double) for name in feature_names],
        axis=1,
    )

    return X, y, feature_names

In [11]:
# Pivot by trade_date x ticker with pct_chg as the target; the result is the
# tuple (X, y, feature names) consumed by the factor-mining cells.
data3D = make_XY(data, 'trade_date', 'ticker', 'pct_chg')

In [12]:
# Inspect the tuple returned by make_XY.
data3D

(array([[[ 1.32000000e+01,  1.81000000e+01,  2.58800000e+01, ...,
           5.50200000e+01,  1.02460000e+02,  4.53000000e+00],
         [ 1.38500000e+01,  1.85000000e+01,  2.61800000e+01, ...,
           5.56000000e+01,  1.03360000e+02,  4.65000000e+00],
         [ 1.30500000e+01,  1.78800000e+01,  2.54800000e+01, ...,
           5.35200000e+01,  1.00200000e+02,  4.40000000e+00],
         ...,
         [ 6.10000000e-01,  3.00000000e-02,  7.00000000e-02, ...,
          -1.18000000e+00,  7.30000000e-01,  8.00000000e-02],
         [ 2.19412794e+06,  6.36399630e+05,  2.27325830e+05, ...,
           2.63101400e+04,  6.75978900e+04,  8.59006850e+05],
         [ 2.97154699e+06,  1.15894528e+06,  5.89696754e+05, ...,
           1.42157173e+05,  6.90555464e+05,  3.91788469e+05]],
 
        [[ 1.37100000e+01,  1.82500000e+01,  2.60600000e+01, ...,
           5.37800000e+01,  1.02690000e+02,  4.61000000e+00],
         [ 1.44200000e+01,  1.92800000e+01,  2.67200000e+01, ...,
           5.48600000

In [13]:
# import joblib

In [14]:
# joblib.dump(data3D, 'data3D.pkl')

### Factor Mining

In [15]:
# data3D = joblib.load('./factor_research/data3D.pkl')

In [16]:
# Shapes of the first two entries of data3D (feature array, then target array).
for arr in data3D[:2]:
    print(arr.shape)

(242, 8, 241)
(242, 241)


In [17]:
from frozen.factor.extensions.gplearn3d import genetic

In [18]:
# Unpack the (X, y, feature names) tuple; the transformer takes names as a plain list.
X_train, y_train, feature_index = data3D
feature_names = feature_index.tolist()

In [19]:
# Weight 1.0 for the first `max_samples` fraction of rows along axis 0 of
# X_train, 0.0 for the remainder.
# NOTE(review): sample_weight is not passed to the fit call visible in this
# notebook -- confirm whether it is still needed.
max_samples = 0.8
num_div = int(X_train.shape[0] * max_samples)
sample_weight = (np.arange(X_train.shape[0]) < num_div).astype(np.float64)

In [20]:
# Start from every function name registered in the genetic module, then drop
# the ones listed in remove_list.
function_set_all = list(genetic._all_func_dictionary.keys())
remove_list = [
    'log', 'abs', 'sqrt', 'delay',
    'tan', 'sin', 'cos', 'neg',
    'dynamic_ts_std', 'dynamic_ts_max', 'dynamic_ts_min',
    'dynamic_ts_argmax', 'dynamic_ts_argmin',
]
function_set = [
    func_name
    for func_name in function_set_all
    if func_name not in remove_list
]

In [21]:
# Configure the symbolic transformer; nothing is fitted in this cell.
# NOTE(review): the keyword names mirror gplearn's SymbolicTransformer; their
# semantics presumably match -- confirm in frozen.factor.extensions.gplearn3d.
gp_sample = genetic.SymbolicTransformer(
    generations=2,
    population_size=100,
    tournament_size=20,
    init_depth=(1, 4),
    hall_of_fame=10,
    n_components=10,
    function_set=function_set,
    metric="rank_ic",
    const_range=None,
    p_crossover=0.4,
    p_hoist_mutation=0.001,
    p_subtree_mutation=0.01,
    p_point_mutation=0.01,
    p_point_replace=0.4,
    parsimony_coefficient="auto",
    feature_names=feature_names,
    max_samples=0.8,
    verbose=1,
    random_state=0,
    n_jobs=8,
)

In [22]:
# Fit the transformer on the 3D feature array and 2D target array.
# NOTE(review): the meaning of baseline=0.05 is defined inside fit_3D and is not visible here.
gp_sample.fit_3D(X_train, y_train, baseline=0.05, need_parallel=True)

    |   Population Average    |             Best Individual              |
---- ------------------------- ------------------------------------------ ----------
 Gen   Length          Fitness   Length          Fitness      OOB Fitness  Time Left
   0     6.44        0.0264588        3        0.0703974       0.00316986     13.79s
   1     6.50        0.0562264        6         0.082541       0.00806607      0.00s


0,1,2
,population_size,100
,hall_of_fame,10
,n_components,10
,generations,2
,tournament_size,20
,stopping_criteria,1.0
,const_range,
,init_depth,"(1, ...)"
,init_method,'half and half'
,function_set,"['add', 'sub', ...]"


### Visualize Result

In [23]:
# Take the first entry of the fitted transformer's `_total_program` and show
# its expression string.
target_fac = gp_sample._total_program[0]
str(target_fac)

'div(rank(inv(open)), mul(dynamic_ts_mean(amount,57), dynamic_ts_cov(low, low,40)))'

In [24]:
# Print the program tree as Graphviz DOT source (the `digraph program {...}` text).
print(target_fac.export_graphviz())

digraph program {
node [style=filled]
0 [label="div", fillcolor="#136ed4"] ;
1 [label="rank", fillcolor="#136ed4"] ;
2 [label="inv", fillcolor="#136ed4"] ;
3 [label="open", fillcolor="#60a6f6"] ;
2 -> 3 ;
1 -> 2 ;
4 [label="mul", fillcolor="#136ed4"] ;
5 [label="dynamic_ts_mean", fillcolor="#136ed4"] ;
6 [label="amount", fillcolor="#60a6f6"] ;
5 -> 6 ;
7 [label="dynamic_ts_cov", fillcolor="#136ed4"] ;
8 [label="low", fillcolor="#60a6f6"] ;
9 [label="low", fillcolor="#60a6f6"] ;
7 -> 9 ;
7 -> 8 ;
4 -> 7 ;
4 -> 5 ;
0 -> 4 ;
0 -> 1 ;
}
