File to compute the baselines with skitlearn

In [1]:
from master_bert import MASTERModel
import pickle
import numpy as np
import time

from utils import load_all_csv_data_with_market_indexes, load_all_csv_data_without_index, csvs_to_qlib_df, PandasDataLoader
# Please install qlib first before load the data.

# Qlib
# import qlib
# from qlib.config import REG_US           # S&P 500 is a US market
# qlib.init(provider_uri=".", region=REG_US)   # provider_uri just needs to exist





# ------------------------------------------------------------
# 1.  Init Qlib and build *one* handler
import qlib, pandas as pd, numpy as np, torch
qlib.init()                               # client mode is fine

from qlib.data.dataset.loader import StaticDataLoader
from qlib.data.dataset.handler import DataHandlerLP
from qlib.data.dataset import TSDatasetH          # <-- here
from qlib.data.dataset.processor import (
    DropnaProcessor, CSZScoreNorm, DropnaLabel,
)

# your tensor, names, dates exactly as before  ----------------
# stock_tensor, stock_names, feature_names = load_all_csv_data_without_index()
stock_tensor, stock_names, feature_names = load_all_csv_data_with_market_indexes()
N, T, K   = stock_tensor.shape
print("Shape: ", stock_tensor.shape)
# dates     = pd.read_csv("data/enriched/market_indexes_aggregated.csv")["Date"]
dates = pd.to_datetime(                     # <-- NEW
    pd.read_csv("data/enriched/market_indexes_aggregated.csv")["Date"]
)

# tensor ➜ tidy multi-index frame --------------------------------
def tensor_to_df(tensor, inst, feats, dt_index):
    flat = tensor.numpy().reshape(N * T, K)
    idx  = pd.MultiIndex.from_product([dt_index, inst],
                                      names=["datetime", "instrument"])
    cols = pd.MultiIndex.from_product([["feature"], feats])
    return pd.DataFrame(flat, index=idx, columns=cols)

df_raw = tensor_to_df(stock_tensor, stock_names, feature_names, dates)

# optional: build a forward-return label
df_raw[("label", "FWD_RET")] = (
    df_raw[("feature", "Adjusted Close")]
      .groupby("instrument").shift(-1) / df_raw[("feature", "Adjusted Close")] - 1
)

last_date = dates.iloc[-1]
df_raw = df_raw.drop(index=last_date, level="datetime")

# handler with learn / infer processors ------------------------
proc_feat = [
    {"class": "DropnaProcessor", "kwargs": {"fields_group": "feature"}},
    # {"class": "CSZScoreNorm",   "kwargs": {"fields_group": "feature"}}, # slows down debugging
]

# proc_feat = [
#     {"class": "CSZScoreNorm",   "kwargs": {"fields_group": "feature"}},
# ]

# proc_feat = [
#     {"class": "Fillna",          # <— correct name
#      "kwargs": {"fields_group": "feature", "fill_value": 0}},  # zero-fill; choose ffill/bfill/etc. if you like
#     {"class": "CSZScoreNorm",
#      "kwargs": {"fields_group": "feature"}},
# ]

proc_label = [{"class": "DropnaLabel"}]

handler = DataHandlerLP(
    data_loader      = StaticDataLoader(df_raw),
    infer_processors = proc_feat,          # what the model will see later
    learn_processors = proc_feat + proc_label,
)
handler.fit_process_data()                 # learn z-scores, etc.

# ------------------------------------------------------------
# 2.  Attach time splits in a TSDatasetH
split = {
    "train": (dates.iloc[8],              dates.iloc[int(T*0.8) - 1]),
    "valid": (dates.iloc[int(T*0.8)],     dates.iloc[int(T*0.9) - 1]),
    "test" : (dates.iloc[int(T*0.9)],     dates.iloc[-2]),
}

ts_ds = TSDatasetH(
    handler  = handler,
    segments = split,
    step_len = 8,          # same window the MASTER code expects
)

dl_train = ts_ds.prepare("train")   # ➜ TSDataSampler
dl_valid = ts_ds.prepare("valid")
dl_test  = ts_ds.prepare("test")





print(len(dl_train), len(dl_valid), len(dl_test))
#  → continue with your for-loop over seeds exactly as before
# ------------------------------------------------------------



[1416288:MainThread](2025-05-16 12:57:36,888) INFO - qlib.Initialization - [config.py:420] - default_conf: client.
[1416288:MainThread](2025-05-16 12:57:37,342) INFO - qlib.Initialization - [__init__.py:74] - qlib successfully initialized based on client settings.
[1416288:MainThread](2025-05-16 12:57:37,342) INFO - qlib.Initialization - [__init__.py:76] - data_path={'__DEFAULT_FREQ': PosixPath('/home/gabrielecarrino/.qlib/qlib_data/cn_data')}


Shape:  torch.Size([336, 3764, 276])


[1416288:MainThread](2025-05-16 12:58:00,145) INFO - qlib.timer - [log.py:127] - Time cost: 0.258s | Loading data Done
[1416288:MainThread](2025-05-16 12:58:02,358) INFO - qlib.timer - [log.py:127] - Time cost: 2.213s | DropnaProcessor Done
[1416288:MainThread](2025-05-16 12:58:02,848) INFO - qlib.timer - [log.py:127] - Time cost: 0.489s | DropnaProcessor Done
[1416288:MainThread](2025-05-16 12:58:03,006) INFO - qlib.timer - [log.py:127] - Time cost: 0.157s | DropnaLabel Done
[1416288:MainThread](2025-05-16 12:58:03,006) INFO - qlib.timer - [log.py:127] - Time cost: 2.861s | fit & process data Done
[1416288:MainThread](2025-05-16 12:58:03,006) INFO - qlib.timer - [log.py:127] - Time cost: 3.120s | Init data Done
[1416288:MainThread](2025-05-16 12:58:03,534) INFO - qlib.timer - [log.py:127] - Time cost: 0.527s | DropnaProcessor Done
[1416288:MainThread](2025-05-16 12:58:04,034) INFO - qlib.timer - [log.py:127] - Time cost: 0.497s | DropnaProcessor Done
[1416288:MainThread](2025-05-16 12

1009008 126336 126336


In [2]:
# grab the very first sample
sample = dl_train[0]

# this will print something like (step_len, num_features)
print("Sample shape:", sample.shape)

# so the number of features is the second entry:
print("Number of features:", sample.shape[1])

Sample shape: (8, 277)
Number of features: 277


Here we have a different datapoint for every label, in their implementation they are all the same!

In [3]:
first_element = dl_train[0]
print(first_element)

[[ 1.35200005e+01  1.37950001e+01  4.59380000e+06 ...  1.39906311e+00
   2.98428357e-01  4.34369445e-01]
 [ 1.79200001e+01  1.80849991e+01  4.25300000e+06 ...  9.45278168e-01
   1.13923237e-01 -9.42955017e-02]
 [ 1.58500004e+01  1.62549992e+01  6.28600000e+06 ...  7.86304772e-01
   1.02076344e-01  5.09517789e-01]
 ...
 [ 2.93950005e+01  2.97049999e+01  2.00820000e+06 ...  9.80815053e-01
   1.44468978e-01  4.71573234e-01]
 [ 4.18349991e+01  4.24049988e+01  4.12760000e+06 ...  1.35893011e+00
   2.99316883e-01  4.60487604e-02]
 [ 4.25299988e+01  4.28750000e+01  4.96920000e+06 ...  9.88571405e-01
   1.54904172e-01  3.15548420e-01]]


In [4]:
sample = dl_test[0]    # The first 6 samples have nan!!!
print(type(sample))
print(len(sample))
print(sample)

<class 'numpy.ndarray'>
8
[[ 1.3475000e+02  1.3708000e+02  7.8560000e+05 ...  8.5391355e-01
   1.8295179e-01  3.3843672e-01]
 [ 1.7746001e+02  1.7814000e+02  1.4068000e+06 ...  9.5659572e-01
   2.1712799e-01 -1.9274426e-01]
 [ 1.4107001e+02  1.4405000e+02  2.4426000e+06 ...  1.2163464e+00
   2.3522612e-01 -9.7939575e-01]
 ...
 [ 5.8933330e+00  5.9407411e+00  7.5161300e+05 ...  1.3980669e+00
   2.6410228e-01  1.9224274e-01]
 [ 6.9007411e+00  7.0044441e+00  4.6136300e+05 ...  1.0609559e+00
   2.0122029e-01  3.2328629e-01]
 [ 8.8148146e+00  8.8562956e+00  2.0621300e+05 ...  2.5206006e+00
   5.2224374e-01  2.7852380e-01]]


In [5]:
features = sample[:, :-1]
labels = sample[:, -1]
print("Features shape:", features.shape)
print("Labels shape:", labels.shape)
print("First row of features:", features[0])
print("First label:", labels[0])

Features shape: (8, 276)
Labels shape: (8,)
First row of features: [ 1.34750000e+02  1.37080002e+02  7.85600000e+05  1.37720001e+02
  1.37529999e+02  1.27682243e+02  1.41059998e+02  1.44722336e+02
  1.37397659e+02  3.66234207e+00  1.37279831e+02  1.45665497e+02
  1.54169846e+02  3.08719296e+08  4.15086125e+05  2.49176884e+01
  6.73023283e-01  1.30812252e+00  1.51852341e+02  0.00000000e+00
  1.00000000e+00 -3.99008918e+00  2.17535600e+08  2.16750000e+08
  2.17535600e+08  2.17848208e+08  2.18530176e+08  0.00000000e+00
  1.00000000e+00  6.17755294e-01  9.28571396e+01  3.57142868e+01
 -5.71428566e+01  3.70429540e+00  1.32615311e+02  1.40794006e+02
  1.48972687e+02  1.16179457e+01  3.00456822e-01 -5.64826056e-02
  1.51514068e-01  6.27462997e+01  6.46655273e+01 -1.21503281e+02
  0.00000000e+00  1.00000000e+00 -2.17475653e+00 -2.21034503e+00
 -2.25874400e+00 -1.85205448e+00  7.73000538e-01 -5.61526155e+00
  4.13749847e+01  1.44441818e+02  1.46046585e+02  1.52538165e-01
 -2.43435726e+01 -1.740

In [6]:
print(stock_tensor.shape)

torch.Size([336, 3764, 276])


In [7]:
# Drop all rows in df_raw where the datetime is the last date in 'dates'
# last_date = dates.iloc[-1]
# df_raw = df_raw.drop(index=last_date, level="datetime")

In [8]:
df_raw

Unnamed: 0_level_0,Unnamed: 1_level_0,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,label
Unnamed: 0_level_1,Unnamed: 1_level_1,Low,Open,Volume,High,Close,Adjusted Close,ABER_ZG_5_15,ABER_SG_5_15,ABER_XG_5_15,ABER_ATR_5_15,...,volume_std_5_rel_forbes2000,ret_mean_10_forbes2000,ret_std_10_forbes2000,volume_mean_10_rel_forbes2000,volume_std_10_rel_forbes2000,ret_mean_30_forbes2000,ret_std_30_forbes2000,volume_mean_30_rel_forbes2000,volume_std_30_rel_forbes2000,FWD_RET
datetime,instrument,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
2008-01-02,MCHP,15.160000,15.685000,6013800.0,15.745000,15.230000,9.689595,15.824333,16.238293,15.410373,0.413960,...,0.185704,0.001766,0.012605,0.749271,0.243681,0.000489,0.014268,0.902787,0.242897,-0.165484
2008-01-02,AZO,15.070000,15.265000,4631800.0,15.375000,15.230000,9.689595,15.621667,16.028362,15.214971,0.406696,...,0.216941,0.002752,0.012290,0.853202,0.262097,0.000809,0.014226,1.046181,0.281136,-0.194164
2008-01-02,O,14.775000,15.200000,7632200.0,15.215000,14.920000,9.492368,15.422667,15.832583,15.012750,0.409917,...,0.222938,0.000906,0.014950,0.643741,0.224113,0.000046,0.014985,0.748429,0.193738,-0.166353
2008-01-02,SBUX,14.785000,14.885000,7278600.0,15.215000,14.975000,9.527364,15.260000,15.671255,14.848744,0.411256,...,0.229717,-0.000375,0.015823,0.583389,0.237961,0.000254,0.014711,0.635112,0.174655,-0.193604
2008-01-02,NDAQ,14.390000,15.110000,6112200.0,15.110000,14.400000,9.161535,15.039666,15.471505,14.607828,0.431838,...,0.174049,-0.001396,0.015556,0.644833,0.276365,-0.000074,0.014652,0.704957,0.183217,-0.204314
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-12-09,WY,210.179993,211.199997,329100.0,214.309998,211.440002,211.440002,210.942001,215.717331,206.166672,4.775330,...,0.113441,0.002502,0.008105,1.152821,0.113035,0.001218,0.009338,1.131199,0.167672,-0.294079
2022-12-09,MU,210.000000,212.160004,297900.0,212.429993,210.080002,210.080002,211.875336,216.494308,207.256363,4.618974,...,0.163857,0.002720,0.008094,1.245022,0.131018,0.001009,0.009213,1.204786,0.169091,-0.293460
2022-12-09,MRO,207.169998,210.279999,313800.0,211.800003,208.529999,208.529999,211.274658,215.894379,206.654953,4.619709,...,0.115597,0.002738,0.008102,1.080670,0.118176,0.001416,0.009151,1.054529,0.148509,-0.271232
2022-12-09,XEL,206.330002,209.000000,272600.0,209.529999,207.570007,207.570007,210.426666,214.951721,205.901611,4.525062,...,0.059297,0.004224,0.007467,1.127058,0.130012,0.001630,0.009228,1.122370,0.158589,-0.279809


In [9]:
# import numpy as np

# # Check for NaNs in all samples of dl_train
# has_nan = False
# for i, sample in enumerate(dl_train):
#     if np.isnan(sample).any():
#         print(f"NaN found in sample {i}")
#         has_nan = True
#         break

# if not has_nan:
#     print("No NaN values found in dl_train.")

To debug go in base_model.py ---> train_epoch(self, data_loader)

It seems that the first prediction and step works correctly. But all the losses after the first one are nan!

- It seems that if I do not perform the gradient step everithing works fine!
- Now it computes the first losses and then explodes

In [None]:
from master_bert import MASTERModel
import pickle
import numpy as np
import time


universe = 'sp500'
d_feat = 224
d_model = 256
t_nhead = 4
s_nhead = 2
dropout = 0.5
gate_input_start_index = 224
gate_input_end_index = 276

if universe == 'sp500':
    beta = 5
else:
    raise ValueError("Invalid universe")

n_epoch = 1
lr = 1e-5
GPU = 0
train_stop_loss_thred = 0.95


ic = []
icir = []
ric = []
ricir = []

# Training
######################################################################################
for seed in [0, 1]: #[0, 1, 2, 3, 4]:
    model = MASTERModel(
        d_feat = d_feat, d_model = d_model, t_nhead = t_nhead, s_nhead = s_nhead, T_dropout_rate=dropout, S_dropout_rate=dropout,
        beta=beta, gate_input_end_index=gate_input_end_index, gate_input_start_index=gate_input_start_index,
        n_epochs=n_epoch, lr = lr, GPU = GPU, seed = seed, train_stop_loss_thred = train_stop_loss_thred,
        save_path='model', save_prefix=f'{universe}'
    )

    start = time.time()
    # Train
    model.fit(dl_train, dl_valid)

    print("Model Trained.")

    # Test
    predictions, metrics = model.predict(dl_test)
    
    running_time = time.time()-start
    
    print('Seed: {:d} time cost : {:.2f} sec'.format(seed, running_time))
    print(metrics)

    ic.append(metrics['IC'])
    icir.append(metrics['ICIR'])
    ric.append(metrics['RIC'])
    ricir.append(metrics['RICIR'])
######################################################################################



  self.daily_count = pd.Series(index=self.data_source.get_index()).groupby("datetime").size().values


Loss: 1.2440757751464844
Loss: 1.1660102605819702
Loss: 1.1318925619125366
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss:

  self.daily_count = pd.Series(index=self.data_source.get_index()).groupby("datetime").size().values


Seed: 0 time cost : 23.21 sec
{'IC': nan, 'ICIR': nan, 'RIC': nan, 'RICIR': nan}
Loss: 1.2660719156265259
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan


  self.daily_count = pd.Series(index=self.data_source.get_index()).groupby("datetime").size().values


Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
Loss: nan
