In [1]:
import os
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.utils import *
from gpquant.SymbolicRegressor import SymbolicRegressor
from gpquant.Function import function_map

In [2]:
inst_dir = '../qlib/data/cn_data/instruments/csi300.txt'
csi = pd.read_csv(inst_dir, sep='\t', header=None, names=['symbol', 'start', 'end'])
print(csi)

       symbol       start         end
0    SZ000001  2005-01-01  2024-01-29
1    SZ000002  2005-01-01  2024-01-29
2    SZ000063  2005-01-01  2024-01-29
3    SZ000069  2005-01-01  2024-01-29
4    SZ000100  2009-01-05  2024-01-29
..        ...         ...         ...
956  SH600501  2005-01-01  2008-06-30
957  SH600662  2005-01-01  2008-06-30
958  SH600754  2005-01-01  2008-06-30
959  SH600849  2005-01-01  2008-06-30
960  SH600088  2005-01-01  2008-06-30

[961 rows x 3 columns]


In [3]:
csi.start = pd.to_datetime(csi.start)
csi

Unnamed: 0,symbol,start,end
0,SZ000001,2005-01-01,2024-01-29
1,SZ000002,2005-01-01,2024-01-29
2,SZ000063,2005-01-01,2024-01-29
3,SZ000069,2005-01-01,2024-01-29
4,SZ000100,2009-01-05,2024-01-29
...,...,...,...
956,SH600501,2005-01-01,2008-06-30
957,SH600662,2005-01-01,2008-06-30
958,SH600754,2005-01-01,2008-06-30
959,SH600849,2005-01-01,2008-06-30


In [4]:
csi300_universe = csi[csi.start >= '2016-01-01'].symbol
csi300_universe = list(csi300_universe.values)
csi300_universe

['SZ000301',
 'SZ000408',
 'SZ000596',
 'SZ000661',
 'SZ000708',
 'SZ000733',
 'SZ000786',
 'SZ000792',
 'SZ000800',
 'SZ000877',
 'SZ000938',
 'SZ000977',
 'SZ001289',
 'SZ002001',
 'SZ002027',
 'SZ002049',
 'SZ002050',
 'SZ002074',
 'SZ002129',
 'SZ002179',
 'SZ002180',
 'SZ002271',
 'SZ002311',
 'SZ002352',
 'SZ002371',
 'SZ002410',
 'SZ002459',
 'SZ002460',
 'SZ002466',
 'SZ002493',
 'SZ002555',
 'SZ002601',
 'SZ002648',
 'SZ002709',
 'SZ002714',
 'SZ002812',
 'SZ002821',
 'SZ002841',
 'SZ002916',
 'SZ002920',
 'SZ002938',
 'SZ003816',
 'SZ300014',
 'SZ300015',
 'SZ300033',
 'SZ300122',
 'SZ300142',
 'SZ300223',
 'SZ300274',
 'SZ300316',
 'SZ300347',
 'SZ300408',
 'SZ300413',
 'SZ300433',
 'SZ300450',
 'SZ300454',
 'SZ300496',
 'SZ300498',
 'SZ300628',
 'SZ300661',
 'SZ300750',
 'SZ300751',
 'SZ300759',
 'SZ300760',
 'SZ300763',
 'SZ300782',
 'SZ300896',
 'SZ300919',
 'SZ300957',
 'SZ300979',
 'SZ300999',
 'SH600011',
 'SH600025',
 'SH600039',
 'SH600061',
 'SH600089',
 'SH600132',

In [5]:
# Specify the directory containing your CSV files
data_dir = '../qlib/data/source/cn_data'


# Initialize an empty list to store individual DataFrames
dfs = []

# Loop through each CSV file and read it into a DataFrame, then add it to the list
for symbol in csi300_universe:
    file_path = os.path.join(data_dir, symbol.lower() + '.csv')
    if os.path.exists(file_path):
        df = pd.read_csv(file_path)
        dfs.append(df)

# Use pd.concat to concatenate all DataFrames in the list into a single DataFrame
df = pd.concat(dfs, ignore_index=True)

# Display the combined DataFrame
print(df)

          symbol        date  open  high   low  close     volume  adjclose  \
0       sz000301  2016-01-04  6.43  6.49  5.80   5.81   30277556  5.260915   
1       sz000301  2016-01-05  5.51  5.94  5.37   5.72   34900153  5.179421   
2       sz000301  2016-01-06  5.79  5.94  5.72   5.93   23855168  5.369575   
3       sz000301  2016-01-07  5.80  5.80  5.34   5.34    7206400  4.835334   
4       sz000301  2016-01-08  5.56  5.60  5.09   5.45   25607354  4.934938   
...          ...         ...   ...   ...   ...    ...        ...       ...   
533914  sh600022  2023-12-25  1.35  1.37  1.35   1.35   23335790  1.350000   
533915  sh600022  2023-12-26  1.36  1.36  1.34   1.35   28741140  1.350000   
533916  sh600022  2023-12-27  1.34  1.36  1.33   1.35   37983083  1.350000   
533917  sh600022  2023-12-28  1.35  1.41  1.34   1.40  121610210  1.400000   
533918  sh600022  2023-12-29  1.38  1.39  1.35   1.37   91236421  1.370000   

        dividends  splits  
0             0.0     NaN  
1      

In [6]:
df = df.iloc[:, :7]
df.date = pd.to_datetime(df.date)
df

Unnamed: 0,symbol,date,open,high,low,close,volume
0,sz000301,2016-01-04,6.43,6.49,5.80,5.81,30277556
1,sz000301,2016-01-05,5.51,5.94,5.37,5.72,34900153
2,sz000301,2016-01-06,5.79,5.94,5.72,5.93,23855168
3,sz000301,2016-01-07,5.80,5.80,5.34,5.34,7206400
4,sz000301,2016-01-08,5.56,5.60,5.09,5.45,25607354
...,...,...,...,...,...,...,...
533914,sh600022,2023-12-25,1.35,1.37,1.35,1.35,23335790
533915,sh600022,2023-12-26,1.36,1.36,1.34,1.35,28741140
533916,sh600022,2023-12-27,1.34,1.36,1.33,1.35,37983083
533917,sh600022,2023-12-28,1.35,1.41,1.34,1.40,121610210


In [7]:
df['close_1d'] = df.groupby('symbol')['close'].shift(-1)
df = df.dropna().sort_values('date').reset_index(drop=True)
df

Unnamed: 0,symbol,date,open,high,low,close,volume,close_1d
0,sz000301,2016-01-04,6.430000,6.490000,5.800000,5.810000,30277556,5.720000
1,sz002468,2016-01-04,43.040001,43.040001,43.040001,43.040001,1002060,47.340000
2,sh601991,2016-01-04,5.130000,5.150000,4.700000,4.780000,15246520,4.780000
3,sh600655,2016-01-04,16.129999,16.150000,14.570000,14.790000,15725721,15.420000
4,sz002426,2016-01-04,10.480000,10.480000,9.384000,9.456000,26751850,9.080000
...,...,...,...,...,...,...,...,...
533628,sz000553,2023-12-29,7.000000,7.060000,6.900000,6.940000,6414100,10.700000
533629,sz000008,2023-12-29,2.350000,2.380000,2.330000,2.360000,29055600,10.610000
533630,sz002466,2023-12-29,56.299999,56.950001,55.770000,55.790001,36955672,25.007053
533631,sz000555,2023-12-29,11.180000,11.360000,11.160000,11.290000,8857500,37.889999


In [8]:
df_train = df[df.date <= '2020-12-31']
df_test = df[df.date >= '2021-01-01']
y_train = df_train['close_1d']
y_test = df_test['close_1d']

In [9]:
len(df_train)

311385

In [10]:
# Step 3
sr = SymbolicRegressor(
    population_size=2000,
    tournament_size=20,
    generations=20,
    stopping_criteria=0.01,
    p_crossover=0.7,
    p_subtree_mutate=0.1,
    p_hoist_mutate=0.1,
    p_point_mutate=0.05,
    init_depth=(6, 8),
    init_method="half and half",
    function_set=list(function_map.keys()),
    variable_set=["open", "high", "low", "close", "volume"],
    const_range=(0, 1),
    ts_const_range=(1, 10),
    build_preference=[0.75, 0.75],
    metric="sharpe ratio",
    parsimony_coefficient=0.01,
)

sr.fit(df_train, y_train)

# Step 4
print(sr.best_estimator)

------------Generation  1------------
best estimator: inv(open)
best fitness: -0.019991939291818816


ValueError: All-NaN slice encountered

In [4]:
import gpquant.Function as f 

In [32]:
x = np.random.rand(100000)
z = pd.Series(x)

In [15]:
%%time
f._ts_mean(x, 9)

CPU times: user 1.92 ms, sys: 3.63 ms, total: 5.54 ms
Wall time: 4.27 ms


array([       nan,        nan,        nan, ..., 0.44518209, 0.48789865,
       0.48027122])