In [1]:
import qlib
from pathlib import Path
from qlib.constant import REG_CN  # 'cn'
from qlib.data import D
from qlib.data.filter import ExpressionDFilter,NameDFilter

# Filesystem locations used by the rest of the notebook.

# Kernel working directory, e.g. D:/github/qlib/playGround/jupyter
path_cur = Path.cwd()

# Project root: two directory levels above the working directory
path_project_root = path_cur.parent.parent

# Data directory (handed to qlib.init as provider_uri)
path_data = path_project_root.joinpath('.qlib', 'qlib_bin')


In [2]:
# Qlib has to be initialised through qlib.init before any data can be read.
qlib.init(
    default_conf="client",
    region=REG_CN,
    provider_uri=path_data,
)


[136708:MainThread](2023-03-08 10:53:35,112) INFO - qlib.Initialization - [config.py:415] - default_conf: client.
[136708:MainThread](2023-03-08 10:53:36,682) INFO - qlib.Initialization - [__init__.py:74] - qlib successfully initialized based on client settings.
[136708:MainThread](2023-03-08 10:53:36,683) INFO - qlib.Initialization - [__init__.py:76] - data_path={'__DEFAULT_FREQ': WindowsPath('D:/github/qlib/.qlib/qlib_bin')}


In [3]:
# Read the trading calendar for the chosen frequency and date range, and show
# its first two entries to check what the data covers.
start = '2018-01-01'
end = '2023-02-28'
cal_d = D.calendar(start_time=start, end_time=end, freq='day')
print(cal_d[:2])

# Build a stock pool from the instruments data: `market` is the name of the
# instruments file from which the individual stock codes are read.
# NOTE(review): presumably a config dict shaped like
# {'market': 'csi300', 'filter_pipe': []} — confirm against D.instruments.
mkt_csi300 = D.instruments(market='csi300')
mkt_csi300_list = D.list_instruments(
    instruments=mkt_csi300,
    start_time=start,
    end_time=end,
    as_list=True,
)

# A filter API is available for narrowing the pool, for example:
# expressionDFilter = ExpressionDFilter(rule_expression='$close>2000')
# instruments = D.instruments(market='csi300', filter_pipe=[expressionDFilter])
# D.list_instruments(instruments=instruments, start_time='2020-01-01', end_time='2020-09-25', as_list=True)


[Timestamp('2018-01-02 00:00:00') Timestamp('2018-01-03 00:00:00')]


In [4]:
# # 加载数据
# fields = ['$open','$close','$high','$low', '$volume', 'Ref($close, 1)', 'Mean($close, 3)', '$high-$low']
# data = D.features(mkt_csi300, fields, start_time=start, end_time=end, freq='day')
# data

```python
# 对象初始化
class_x_config = {
        "class": "Class_X", # 类名
        "module_path": "qlib.xx.xx.xx", # 该类所在的路径
        # 该对象的超参数
        "kwargs": {
            'param_1': 'value_1',
            'param_2': 'value_2',
            ...
            'param_n': 'value_n',
        },
}
```

In [5]:
# Training configuration.
# Qlib expects objects to be described by plain dicts that follow the
# {"class", "module_path", "kwargs"} convention.

# Arguments for the data handler.
data_handler_config = {
    "start_time": '2018-01-01',
    "end_time": '2023-02-28',
    # The fit window below is the same date range as the "train" segment.
    "fit_start_time": '2018-01-01',
    "fit_end_time": "2020-12-31",
    "instruments": "csi300",  # trade the CSI 300 constituents over this period
}

# Training task: which model to build and which dataset feeds it.
task = {
    # Model settings
    "model": {
        "class": "XGBModel",  # the original code used LGBM; switched to XGB here
        "module_path": "qlib.contrib.model.xgboost",
        # Model hyper-parameters
        "kwargs": {
            # XGBoost has no "loss" parameter: the previous `"loss": "mse"`
            # entry was ignored and triggered the warning
            # `Parameters: { "loss" } are not used.` during training.
            # Squared-error regression is spelled as an objective instead.
            "objective": "reg:squarederror",
            "colsample_bytree": 0.8879,
            "learning_rate": 0.0421,
            "max_depth": 8,
            # NOTE(review): GPU training on device 0; presumably needs a
            # CUDA-enabled XGBoost build — confirm for the target machine.
            'tree_method': 'gpu_hist',
            'gpu_id': 0,
        },
    },

    # Settings of the dataset class that supplies data to the model
    "dataset": {
        "class": "DatasetH",
        "module_path": "qlib.data.dataset",
        "kwargs": {
            "handler": {
                "class": "Alpha158",
                "module_path": "qlib.contrib.data.handler",
                "kwargs": data_handler_config,  # reuse the handler arguments defined above
            },
            "segments": {
                "train": ("2018-01-01", "2020-12-31"),  # training period
                "valid": ("2021-01-01", "2021-12-31"),  # validation period
                "test": ("2022-01-01", "2023-02-28"),   # test period
            },
        },
    },
}


In [7]:
# Instantiate the model and the dataset from their config dicts.
from qlib.utils import init_instance_by_config

model_cfg, dataset_cfg = task["model"], task["dataset"]
model = init_instance_by_config(model_cfg)
# Building the dataset is the slow step (see the timer lines in the output).
dataset = init_instance_by_config(dataset_cfg)


[136708:MainThread](2023-03-08 11:21:32,604) INFO - qlib.timer - [log.py:128] - Time cost: 81.118s | Loading data Done
[136708:MainThread](2023-03-08 11:21:32,898) INFO - qlib.timer - [log.py:128] - Time cost: 0.111s | DropnaLabel Done
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[cols] = df[cols].groupby("datetime", group_keys=False).apply(self.zscore_func)
[136708:MainThread](2023-03-08 11:21:34,357) INFO - qlib.timer - [log.py:128] - Time cost: 1.457s | CSZScoreNorm Done
[136708:MainThread](2023-03-08 11:21:34,378) INFO - qlib.timer - [log.py:128] - Time cost: 1.773s | fit & process data Done
[136708:MainThread](2023-03-08 11:21:34,380) INFO - qlib.timer - [log.py:128] - Time cost: 82.895s | Init data Done


In [8]:
# Model training: start an experiment and fit the model inside it.
from qlib.workflow import R
from qlib.utils import flatten_dict

# Per the original author's notes: R.start() creates an experiment instance
# (class Experiment), which in turn creates a recorder instance (class
# Recorder) when it is initialised. The recorder can be thought of as a
# container that records and stores the variables and models of the run.
# Both experiment_id and the recorder id are needed to identify a recorder
# uniquely.
with R.start(experiment_id='1', experiment_name="train_demo"):
    flat_params = flatten_dict(task)
    R.log_params(**flat_params)
    model.fit(dataset)  # train the model
    R.save_objects(trained_model=model)
    # Keep the recorder id so this recorder can be looked up again later.
    recorder = R.get_recorder()
    rid = recorder.id

[136708:MainThread](2023-03-08 11:24:34,323) INFO - qlib.workflow - [exp.py:258] - Experiment 1 starts running ...
[136708:MainThread](2023-03-08 11:24:34,793) INFO - qlib.workflow - [recorder.py:341] - Recorder d5ab2c0f0fd240e59fcafe0606a43639 starts running under Experiment 1 ...


Parameters: { "loss" } are not used.

[0]	train-rmse:1.10660	valid-rmse:1.10729
[20]	train-rmse:1.00820	valid-rmse:1.01874
[40]	train-rmse:0.98434	valid-rmse:1.00209
[60]	train-rmse:0.97396	valid-rmse:0.99931
[80]	train-rmse:0.96659	valid-rmse:0.99914
[100]	train-rmse:0.95956	valid-rmse:0.99958
[115]	train-rmse:0.95390	valid-rmse:0.99994


[136708:MainThread](2023-03-08 11:24:50,808) INFO - qlib.timer - [log.py:128] - Time cost: 0.000s | waiting `async_log` Done


In [15]:
# Open a run and print the id of the experiment it is attached to.
with R.start(experiment_id='0', experiment_name="train_modal_111"):
    active_exp = R.get_exp()
    print(active_exp.id)

[136708:MainThread](2023-03-08 11:45:22,503) INFO - qlib.workflow - [exp.py:258] - Experiment 0 starts running ...
[136708:MainThread](2023-03-08 11:45:22,567) INFO - qlib.workflow - [recorder.py:341] - Recorder 76faec62fcc6407cb1b3c34de32f6d94 starts running under Experiment 0 ...
[136708:MainThread](2023-03-08 11:45:23,048) INFO - qlib.timer - [log.py:128] - Time cost: 0.004s | waiting `async_log` Done


0
