File to compute the baselines with skitlearn

In [1]:
from master_bert import MASTERModel
import pickle
import numpy as np
import time

from utils import load_all_csv_data_with_market_indexes, load_all_csv_data_without_index, csvs_to_qlib_df, PandasDataLoader
# Please install qlib first before load the data.

# Qlib
# import qlib
# from qlib.config import REG_US           # S&P 500 is a US market
# qlib.init(provider_uri=".", region=REG_US)   # provider_uri just needs to exist





# ------------------------------------------------------------
# 1.  Init Qlib and build *one* handler
import qlib, pandas as pd, numpy as np, torch
qlib.init()                               # client mode is fine

from qlib.data.dataset.loader import StaticDataLoader
from qlib.data.dataset.handler import DataHandlerLP
from qlib.data.dataset import TSDatasetH          # <-- here
from qlib.data.dataset.processor import (
    DropnaProcessor, CSZScoreNorm, DropnaLabel,
)

# your tensor, names, dates exactly as before  ----------------
# stock_tensor, stock_names, feature_names = load_all_csv_data_without_index()
stock_tensor, stock_names, feature_names = load_all_csv_data_with_market_indexes()
N, T, K   = stock_tensor.shape
print("Shape: ", stock_tensor.shape)
# dates     = pd.read_csv("data/enriched/market_indexes_aggregated.csv")["Date"]
dates = pd.to_datetime(                     # <-- NEW
    pd.read_csv("data/enriched/market_indexes_aggregated.csv")["Date"]
)

# tensor ➜ tidy multi-index frame --------------------------------
def tensor_to_df(tensor, inst, feats, dt_index):
    flat = tensor.numpy().reshape(N * T, K)
    idx  = pd.MultiIndex.from_product([dt_index, inst],
                                      names=["datetime", "instrument"])
    cols = pd.MultiIndex.from_product([["feature"], feats])
    return pd.DataFrame(flat, index=idx, columns=cols)

df_raw = tensor_to_df(stock_tensor, stock_names, feature_names, dates)

# optional: build a forward-return label
df_raw[("label", "FWD_RET")] = (
    df_raw[("feature", "Adjusted Close")]
      .groupby("instrument").shift(-1) / df_raw[("feature", "Adjusted Close")] - 1
)

last_date = dates.iloc[-1]
df_raw = df_raw.drop(index=last_date, level="datetime")

# handler with learn / infer processors ------------------------
proc_feat = [
    {"class": "DropnaProcessor", "kwargs": {"fields_group": "feature"}},
    # {"class": "CSZScoreNorm",   "kwargs": {"fields_group": "feature"}},
]

# proc_feat = [
#     {"class": "CSZScoreNorm",   "kwargs": {"fields_group": "feature"}},
# ]

# proc_feat = [
#     {"class": "Fillna",          # <— correct name
#      "kwargs": {"fields_group": "feature", "fill_value": 0}},  # zero-fill; choose ffill/bfill/etc. if you like
#     {"class": "CSZScoreNorm",
#      "kwargs": {"fields_group": "feature"}},
# ]

proc_label = [{"class": "DropnaLabel"}]

handler = DataHandlerLP(
    data_loader      = StaticDataLoader(df_raw),
    infer_processors = proc_feat,          # what the model will see later
    learn_processors = proc_feat + proc_label,
)
handler.fit_process_data()                 # learn z-scores, etc.

# ------------------------------------------------------------
# 2.  Attach time splits in a TSDatasetH
split = {
    "train": (dates.iloc[8],              dates.iloc[int(T*0.8) - 1]),
    "valid": (dates.iloc[int(T*0.8)],     dates.iloc[int(T*0.9) - 1]),
    "test" : (dates.iloc[int(T*0.9)],     dates.iloc[-2]),
}

ts_ds = TSDatasetH(
    handler  = handler,
    segments = split,
    step_len = 8,          # same window the MASTER code expects
)

dl_train = ts_ds.prepare("train")   # ➜ TSDataSampler
dl_valid = ts_ds.prepare("valid")
dl_test  = ts_ds.prepare("test")





print(len(dl_train), len(dl_valid), len(dl_test))
#  → continue with your for-loop over seeds exactly as before
# ------------------------------------------------------------



[1397053:MainThread](2025-05-16 11:44:05,268) INFO - qlib.Initialization - [config.py:420] - default_conf: client.
[1397053:MainThread](2025-05-16 11:44:05,731) INFO - qlib.Initialization - [__init__.py:74] - qlib successfully initialized based on client settings.
[1397053:MainThread](2025-05-16 11:44:05,732) INFO - qlib.Initialization - [__init__.py:76] - data_path={'__DEFAULT_FREQ': PosixPath('/home/gabrielecarrino/.qlib/qlib_data/cn_data')}


Shape:  torch.Size([336, 3764, 276])


[1397053:MainThread](2025-05-16 11:44:28,302) INFO - qlib.timer - [log.py:127] - Time cost: 0.255s | Loading data Done
[1397053:MainThread](2025-05-16 11:44:30,465) INFO - qlib.timer - [log.py:127] - Time cost: 2.163s | DropnaProcessor Done
[1397053:MainThread](2025-05-16 11:44:30,939) INFO - qlib.timer - [log.py:127] - Time cost: 0.473s | DropnaProcessor Done
[1397053:MainThread](2025-05-16 11:44:31,094) INFO - qlib.timer - [log.py:127] - Time cost: 0.154s | DropnaLabel Done
[1397053:MainThread](2025-05-16 11:44:31,094) INFO - qlib.timer - [log.py:127] - Time cost: 2.792s | fit & process data Done
[1397053:MainThread](2025-05-16 11:44:31,095) INFO - qlib.timer - [log.py:127] - Time cost: 3.048s | Init data Done
[1397053:MainThread](2025-05-16 11:44:31,588) INFO - qlib.timer - [log.py:127] - Time cost: 0.493s | DropnaProcessor Done
[1397053:MainThread](2025-05-16 11:44:32,068) INFO - qlib.timer - [log.py:127] - Time cost: 0.477s | DropnaProcessor Done
[1397053:MainThread](2025-05-16 11

1009008 126336 126336


In [2]:
# grab the very first sample
sample = dl_train[0]

# this will print something like (step_len, num_features)
print("Sample shape:", sample.shape)

# so the number of features is the second entry:
print("Number of features:", sample.shape[1])

Sample shape: (8, 277)
Number of features: 277


Here we have a different datapoint for every label, in their implementation they are all the same!

In [3]:
first_element = dl_train[0]
print(first_element)

[[ 5.7980000e+01  5.8220001e+01  8.0932000e+06 ...  9.6427292e-01
   1.0698729e-01  3.8490832e-01]
 [ 7.7120003e+01  7.8599998e+01  6.7514000e+06 ...  8.2667828e-01
   1.3438544e-01  3.1991422e-01]
 [ 9.7320000e+01  9.7360001e+01  4.6345000e+06 ...  1.0012949e+00
   1.6744927e-01  3.9645553e-02]
 ...
 [ 1.2356000e+02  1.2379000e+02  3.4945000e+06 ...  1.2975066e+00
   2.0720080e-01  2.8916955e-01]
 [ 1.5328999e+02  1.5406000e+02  3.5224000e+06 ...  1.0743347e+00
   2.1872924e-01  2.5094354e-01]
 [ 1.8578999e+02  1.8774001e+02  3.7459000e+06 ...  1.0631595e+00
   1.7036489e-01 -1.0065502e-01]]


In [4]:
sample = dl_test[0]    # The first 6 samples have nan!!!
print(type(sample))
print(len(sample))
print(sample)

<class 'numpy.ndarray'>
8
[[ 1.4500000e+01  1.4530000e+01  7.7107000e+06 ...  1.1503834e+00
   2.8200623e-01 -2.4905902e-01]
 [ 1.0460000e+01  1.0640000e+01  1.1397400e+07 ...  1.2143085e+00
   1.9997996e-01  1.0013368e+00]
 [ 1.9920000e+01  2.0340000e+01  6.9461000e+06 ...  1.0403929e+00
   1.2858492e-01 -6.8107516e-01]
 ...
 [ 1.0622500e+01  1.0790000e+01  1.2730000e+07 ...  8.4158045e-01
   1.2026708e-01  5.7698143e-01]
 [ 1.6605000e+01  1.6767500e+01  1.0376400e+07 ...  7.4245542e-01
   1.6340670e-01  5.3569841e-01]
 [ 2.4805000e+01  2.4980000e+01  7.5204000e+06 ...  1.0416052e+00
   1.3953993e-01  1.8432951e-01]]


In [5]:
features = sample[:, :-1]
labels = sample[:, -1]
print("Features shape:", features.shape)
print("Labels shape:", labels.shape)
print("First row of features:", features[0])
print("First label:", labels[0])

Features shape: (8, 276)
Labels shape: (8,)
First row of features: [ 1.45000000e+01  1.45299997e+01  7.71070000e+06  1.50699997e+01
  1.50200005e+01  1.26955366e+01  1.46073332e+01  1.51610775e+01
  1.40535889e+01  5.53744376e-01  1.40457401e+01  1.54320002e+01
  1.70082417e+01  3.02771360e+08 -2.14152725e+06  4.29076729e+01
  7.53898472e-02  1.48924246e-01  1.44072866e+01  1.00000000e+00
  0.00000000e+00 -2.09020591e+00  1.11653082e+09  1.10882010e+09
  1.11653082e+09  1.11277414e+09  1.13844608e+09  1.00000000e+00
  0.00000000e+00 -1.44737184e+00  7.14285736e+01  0.00000000e+00
 -7.14285736e+01  5.54216146e-01  1.44488277e+01  1.47480001e+01
  1.50471725e+01  4.05712843e+00  9.54587817e-01 -6.54285848e-02
  8.59650850e-01  8.38291855e+01  6.38180122e+01  8.83142567e+00
  0.00000000e+00  0.00000000e+00 -1.12872338e+00 -9.74532664e-01
 -9.77415979e-01 -8.07134330e-01  1.07116723e+00 -5.47241688e+00
  4.15305557e+01  1.73431263e+01  1.53119192e+01 -3.00337404e-01
 -2.25640488e+01 -2.279

In [6]:
print(stock_tensor.shape)

torch.Size([336, 3764, 276])


In [7]:
# Drop all rows in df_raw where the datetime is the last date in 'dates'
# last_date = dates.iloc[-1]
# df_raw = df_raw.drop(index=last_date, level="datetime")

In [8]:
df_raw

Unnamed: 0_level_0,Unnamed: 1_level_0,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,label
Unnamed: 0_level_1,Unnamed: 1_level_1,Low,Open,Volume,High,Close,Adjusted Close,ABER_ZG_5_15,ABER_SG_5_15,ABER_XG_5_15,ABER_ATR_5_15,...,volume_std_5_rel_forbes2000,ret_mean_10_forbes2000,ret_std_10_forbes2000,volume_mean_10_rel_forbes2000,volume_std_10_rel_forbes2000,ret_mean_30_forbes2000,ret_std_30_forbes2000,volume_mean_30_rel_forbes2000,volume_std_30_rel_forbes2000,FWD_RET
datetime,instrument,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
2008-01-02,MCD,57.889999,59.480000,7858300.0,59.480000,58.099998,37.773758,59.338001,60.475433,58.200565,1.137434,...,0.185704,0.001766,0.012605,0.749271,0.243681,0.000489,0.014268,0.902787,0.242897,-0.057088
2008-01-02,ABC,57.740002,58.400002,6514700.0,58.799999,57.930000,37.663235,58.956665,60.088936,57.824394,1.132271,...,0.216941,0.002752,0.012290,0.853202,0.262097,0.000809,0.014226,1.046181,0.281136,-0.047332
2008-01-02,ABT,56.430000,57.330002,9687500.0,57.470001,57.049999,37.091080,58.468666,59.625454,57.311878,1.156787,...,0.222938,0.000906,0.014950,0.643741,0.224113,0.000046,0.014985,0.748429,0.193738,-0.022991
2008-01-02,TROW,56.869999,57.360001,10784500.0,58.250000,58.029999,37.728233,58.114666,59.286335,56.943001,1.171667,...,0.229717,-0.000375,0.015823,0.583389,0.237961,0.000254,0.014711,0.635112,0.174655,-0.044858
2008-01-02,ED,56.959999,58.439999,10080800.0,58.639999,57.080002,37.110588,57.781334,58.986889,56.575775,1.205556,...,0.174049,-0.001396,0.015556,0.644833,0.276365,-0.000074,0.014652,0.704957,0.183217,-0.001134
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-12-09,CAT,226.570007,227.389999,753900.0,228.919998,227.729996,221.014511,227.876663,231.323135,224.430191,3.446470,...,0.113441,0.002502,0.008105,1.152821,0.113035,0.001218,0.009338,1.131199,0.167672,0.002604
2022-12-09,VTR,226.029999,227.000000,584300.0,227.720001,227.270004,220.568085,227.993332,231.323364,224.663300,3.330039,...,0.163857,0.002720,0.008094,1.245022,0.131018,0.001009,0.009213,1.204786,0.169091,-0.002122
2022-12-09,REG,227.589996,227.589996,968200.0,231.639999,230.710007,223.906662,228.198669,231.598038,224.799301,3.399369,...,0.115597,0.002738,0.008102,1.080670,0.118176,0.001416,0.009151,1.054529,0.148509,-0.011552
2022-12-09,UPS,230.899994,230.899994,923600.0,234.220001,233.830002,226.934647,229.068665,232.475418,225.661926,3.406744,...,0.059297,0.004224,0.007467,1.127058,0.130012,0.001630,0.009228,1.122370,0.158589,-0.031615


In [9]:
# import numpy as np

# # Check for NaNs in all samples of dl_train
# has_nan = False
# for i, sample in enumerate(dl_train):
#     if np.isnan(sample).any():
#         print(f"NaN found in sample {i}")
#         has_nan = True
#         break

# if not has_nan:
#     print("No NaN values found in dl_train.")

To debug go in base_model.py ---> train_epoch(self, data_loader)

It seems that the first prediction and step works correctly. But all the losses after the first one are nan!

- It seems that if I do not perform the gradient step everithing works fine!

In [10]:
from master_bert import MASTERModel
import pickle
import numpy as np
import time


universe = 'sp500'
d_feat = 224
d_model = 256
t_nhead = 4
s_nhead = 2
dropout = 0.5
gate_input_start_index = 224
gate_input_end_index = 276

if universe == 'sp500':
    beta = 5
else:
    raise ValueError("Invalid universe")

n_epoch = 1
lr = 1e-5
GPU = 0
train_stop_loss_thred = 0.95


ic = []
icir = []
ric = []
ricir = []

# Training
######################################################################################
for seed in [0, 1]: #[0, 1, 2, 3, 4]:
    model = MASTERModel(
        d_feat = d_feat, d_model = d_model, t_nhead = t_nhead, s_nhead = s_nhead, T_dropout_rate=dropout, S_dropout_rate=dropout,
        beta=beta, gate_input_end_index=gate_input_end_index, gate_input_start_index=gate_input_start_index,
        n_epochs=n_epoch, lr = lr, GPU = GPU, seed = seed, train_stop_loss_thred = train_stop_loss_thred,
        save_path='model', save_prefix=f'{universe}'
    )

    start = time.time()
    # Train
    model.fit(dl_train, dl_valid)

    print("Model Trained.")

    # Test
    predictions, metrics = model.predict(dl_test)
    
    running_time = time.time()-start
    
    print('Seed: {:d} time cost : {:.2f} sec'.format(seed, running_time))
    print(metrics)

    ic.append(metrics['IC'])
    icir.append(metrics['ICIR'])
    ric.append(metrics['RIC'])
    ricir.append(metrics['RICIR'])
######################################################################################



  self.daily_count = pd.Series(index=self.data_source.get_index()).groupby("datetime").size().values


Feature shape: torch.Size([320, 8, 276])
Label shape: torch.Size([320])
Pred shape: torch.Size([320])
Loss shape: torch.Size([])
Feature: tensor([[[6.5884e+02, 6.7268e+02, 9.1990e+05,  ..., 1.4126e-02,
          1.1726e+00, 1.3655e-01],
         [5.1260e+01, 5.1280e+01, 1.9996e+06,  ..., 4.3236e-02,
          8.9296e-01, 2.2732e-01],
         [4.4040e+01, 4.4220e+01, 1.3726e+06,  ..., 1.8322e-02,
          1.3735e+00, 2.8140e-01],
         ...,
         [6.5410e+01, 6.5410e+01, 7.0690e+05,  ..., 8.8719e-03,
          1.1616e+00, 1.9326e-01],
         [8.8900e+01, 8.9670e+01, 3.8000e+05,  ..., 6.4266e-03,
          1.0729e+00, 1.2068e-01],
         [1.0837e+02, 1.1034e+02, 7.2700e+05,  ..., 8.4678e-03,
          9.7849e-01, 1.5085e-01]],

        [[5.1260e+01, 5.1280e+01, 1.9996e+06,  ..., 4.3236e-02,
          8.9296e-01, 2.2732e-01],
         [4.4040e+01, 4.4220e+01, 1.3726e+06,  ..., 1.8322e-02,
          1.3735e+00, 2.8140e-01],
         [3.8320e+01, 3.8640e+01, 1.0611e+06,  ..., 1.

NameError: name 'exit' is not defined