File to compute the baselines with skitlearn

In [None]:
from master import MASTERModel
import pickle
import numpy as np
import time

from utils import load_all_csv_data_with_market_indexes, load_all_csv_data_without_index, csvs_to_qlib_df, PandasDataLoader
# Please install qlib first before load the data.

# Qlib
# import qlib
# from qlib.config import REG_US           # S&P 500 is a US market
# qlib.init(provider_uri=".", region=REG_US)   # provider_uri just needs to exist





# ------------------------------------------------------------
# 1.  Init Qlib and build *one* handler
import qlib, pandas as pd, numpy as np, torch
qlib.init()                               # client mode is fine

from qlib.data.dataset.loader import StaticDataLoader
from qlib.data.dataset.handler import DataHandlerLP
from qlib.data.dataset import TSDatasetH          # <-- here
from qlib.data.dataset.processor import (
    DropnaProcessor, CSZScoreNorm, DropnaLabel,
)

# your tensor, names, dates exactly as before  ----------------
# stock_tensor, stock_names, feature_names = load_all_csv_data_without_index()
stock_tensor, stock_names, feature_names = load_all_csv_data_with_market_indexes()
N, T, K   = stock_tensor.shape
print("Shape: ", stock_tensor.shape)
# dates     = pd.read_csv("data/enriched/market_indexes_aggregated.csv")["Date"]
# dates = pd.to_datetime(                     # <-- NEW
#     pd.read_csv("data/enriched/market_indexes_aggregated.csv")["Date"]
# )

dates = pd.to_datetime(                     # <-- NEW
    pd.read_csv("data/normalized/market_indexes_aggregated_normalized.csv")["Date"]
)

# tensor ➜ tidy multi-index frame --------------------------------
def tensor_to_df(tensor, inst, feats, dt_index):
    flat = tensor.numpy().reshape(N * T, K)
    idx  = pd.MultiIndex.from_product([dt_index, inst],
                                      names=["datetime", "instrument"])
    cols = pd.MultiIndex.from_product([["feature"], feats])
    return pd.DataFrame(flat, index=idx, columns=cols)

df_raw = tensor_to_df(stock_tensor, stock_names, feature_names, dates)

# optional: build a forward-return label
df_raw[("label", "FWD_RET")] = (
    df_raw[("feature", "Adjusted Close")]
      .groupby("instrument").shift(-1) / df_raw[("feature", "Adjusted Close")] - 1
)

last_date = dates.iloc[-1]
df_raw = df_raw.drop(index=last_date, level="datetime")

# handler with learn / infer processors ------------------------
proc_feat = [
    {"class": "DropnaProcessor", "kwargs": {"fields_group": "feature"}},
    # {"class": "CSZScoreNorm",   "kwargs": {"fields_group": "feature"}}, # slows down debugging
]

# proc_feat = [
#     {"class": "CSZScoreNorm",   "kwargs": {"fields_group": "feature"}},
# ]

# proc_feat = [
#     {"class": "Fillna",          # <— correct name
#      "kwargs": {"fields_group": "feature", "fill_value": 0}},  # zero-fill; choose ffill/bfill/etc. if you like
#     {"class": "CSZScoreNorm",
#      "kwargs": {"fields_group": "feature"}},
# ]

proc_label = [{"class": "DropnaLabel"}]

handler = DataHandlerLP(
    data_loader      = StaticDataLoader(df_raw),
    infer_processors = proc_feat,          # what the model will see later
    learn_processors = proc_feat + proc_label,
)
handler.fit_process_data()                 # learn z-scores, etc.

# ------------------------------------------------------------
# 2.  Attach time splits in a TSDatasetH
split = {
    "train": (dates.iloc[8],              dates.iloc[int(T*0.8) - 1]),
    "valid": (dates.iloc[int(T*0.8)],     dates.iloc[int(T*0.9) - 1]),
    "test" : (dates.iloc[int(T*0.9)],     dates.iloc[-2]),
}

ts_ds = TSDatasetH(
    handler  = handler,
    segments = split,
    step_len = 8,          # same window the MASTER code expects
)

dl_train = ts_ds.prepare("train")   # ➜ TSDataSampler
dl_valid = ts_ds.prepare("valid")
dl_test  = ts_ds.prepare("test")





print(len(dl_train), len(dl_valid), len(dl_test))
#  → continue with your for-loop over seeds exactly as before
# ------------------------------------------------------------



[1438096:MainThread](2025-05-16 15:04:47,528) INFO - qlib.Initialization - [config.py:420] - default_conf: client.
[1438096:MainThread](2025-05-16 15:04:47,987) INFO - qlib.Initialization - [__init__.py:74] - qlib successfully initialized based on client settings.
[1438096:MainThread](2025-05-16 15:04:47,988) INFO - qlib.Initialization - [__init__.py:76] - data_path={'__DEFAULT_FREQ': PosixPath('/home/gabrielecarrino/.qlib/qlib_data/cn_data')}


Shape:  torch.Size([336, 3764, 276])


[1438096:MainThread](2025-05-16 15:05:11,759) INFO - qlib.timer - [log.py:127] - Time cost: 0.278s | Loading data Done
[1438096:MainThread](2025-05-16 15:05:13,936) INFO - qlib.timer - [log.py:127] - Time cost: 2.176s | DropnaProcessor Done
[1438096:MainThread](2025-05-16 15:05:14,413) INFO - qlib.timer - [log.py:127] - Time cost: 0.476s | DropnaProcessor Done
[1438096:MainThread](2025-05-16 15:05:14,567) INFO - qlib.timer - [log.py:127] - Time cost: 0.153s | DropnaLabel Done
[1438096:MainThread](2025-05-16 15:05:14,567) INFO - qlib.timer - [log.py:127] - Time cost: 2.807s | fit & process data Done
[1438096:MainThread](2025-05-16 15:05:14,567) INFO - qlib.timer - [log.py:127] - Time cost: 3.087s | Init data Done
[1438096:MainThread](2025-05-16 15:05:15,054) INFO - qlib.timer - [log.py:127] - Time cost: 0.486s | DropnaProcessor Done
[1438096:MainThread](2025-05-16 15:05:15,529) INFO - qlib.timer - [log.py:127] - Time cost: 0.473s | DropnaProcessor Done
[1438096:MainThread](2025-05-16 15

1009008 126336 126336


In [2]:
# grab the very first sample
sample = dl_train[0]

# this will print something like (step_len, num_features)
print("Sample shape:", sample.shape)

# so the number of features is the second entry:
print("Number of features:", sample.shape[1])

Sample shape: (8, 277)
Number of features: 277


Here we have a different datapoint for every label, in their implementation they are all the same!

In [3]:
first_element = dl_train[0]
print(first_element)

[[-1.1274107  -1.1401069   0.08576461 ...  0.03444673  0.64926296
  -0.02793372]
 [-1.1024778  -1.1008397   2.3521283  ... -0.08351184 -0.6431154
  -0.41296357]
 [-0.63189065 -0.6386709   0.11537207 ... -0.996254   -1.0149207
  -1.3369789 ]
 ...
 [ 1.0401757   1.0303413  -0.42296115 ... -0.0754156  -0.9047193
   0.82035184]
 [ 1.926937    1.9027898   0.573591   ...  0.04343779  0.61480653
  -0.82167375]
 [ 0.35381794  0.3730829  -0.7679751  ...  0.7165421   0.8302081
  -1.2216039 ]]


In [4]:
sample = dl_test[0]    # The first 6 samples have nan!!!
print(type(sample))
print(len(sample))
print(sample)

<class 'numpy.ndarray'>
8
[[ 1.3453802   1.3354348  -0.90738887 ...  0.6329208   0.5162697
  -0.13017446]
 [ 1.0683975   1.0954674   0.20866002 ...  0.00185446  0.81938523
   0.75431275]
 [ 1.8135395   1.8009751  -0.33644143 ... -0.37910718 -0.49004704
  -1.3958982 ]
 ...
 [-1.0093921  -0.99696684  0.93279123 ... -0.13972935 -0.6144315
  -0.0213353 ]
 [-1.0267966  -1.0397656  -0.8555783  ... -0.3822517  -0.6203415
  -0.5526017 ]
 [-0.38283893 -0.38123897 -0.5314244  ... -0.84692657  0.42645553
  -1.2089746 ]]


In [5]:
features = sample[:, :-1]
labels = sample[:, -1]
print("Features shape:", features.shape)
print("Labels shape:", labels.shape)
print("First row of features:", features[0])
print("First label:", labels[0])

Features shape: (8, 276)
Labels shape: (8,)
First row of features: [ 1.3453802   1.3354348  -0.90738887  1.3361017   1.3582584   1.2878414
  1.3223212   1.3110466   1.3337059   0.5981202   1.2753204   1.2646468
  1.2424563   1.3029484   0.44656128 -0.49137396  0.91966337  0.09104124
  1.2855325   0.85835177 -0.85835177  1.0350862   1.0941504   1.0939856
  1.087367    1.0838307   1.0700519   0.938523   -0.938523    1.1994328
 -1.0155083   1.22494     1.2845001   0.59951156  1.3457342   1.3274622
  1.3084388  -0.32592788  1.1491327   0.957126    0.8839101  -0.0950445
  0.1254988   0.9116329  -0.40120935  0.01263364  1.140706    1.2071276
  1.1166544   1.2826141   0.39299124  0.50488645 -0.93786937  1.2380749
  1.2376891   1.2318883   1.3073003   0.8111312   0.92018175  1.3440307
 -0.93452495  1.3391926   1.2694347   1.2562683   1.2405137   1.0328138
  1.0696777   0.36150318  1.3064371   0.8042285   1.8027447   0.06646705
  1.2483827   0.8478194   1.2697859   1.0435293   1.3344829   1.322

In [6]:
print(stock_tensor.shape)

torch.Size([336, 3764, 276])


In [7]:
# Drop all rows in df_raw where the datetime is the last date in 'dates'
# last_date = dates.iloc[-1]
# df_raw = df_raw.drop(index=last_date, level="datetime")

In [8]:
df_raw

Unnamed: 0_level_0,Unnamed: 1_level_0,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,label
Unnamed: 0_level_1,Unnamed: 1_level_1,Low,Open,Volume,High,Close,Adjusted Close,ABER_ZG_5_15,ABER_SG_5_15,ABER_XG_5_15,ABER_ATR_5_15,...,volume_std_5_rel_forbes2000,ret_mean_10_forbes2000,ret_std_10_forbes2000,volume_mean_10_rel_forbes2000,volume_std_10_rel_forbes2000,ret_mean_30_forbes2000,ret_std_30_forbes2000,volume_mean_30_rel_forbes2000,volume_std_30_rel_forbes2000,FWD_RET
datetime,instrument,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
2008-01-02,MHK,-0.829471,-0.832825,-0.169904,-0.847458,-0.833352,-0.833352,-0.826258,-0.839540,-0.812371,-1.053702,...,0.502463,0.351911,0.141333,-1.357861,0.913220,0.044328,0.318488,-0.588874,0.760018,0.456144
2008-01-02,AMGN,-0.829315,-0.826904,0.054067,-0.843893,-0.843643,-0.843643,-0.831552,-0.845066,-0.817425,-1.067191,...,0.795640,0.607206,0.103637,-0.854547,1.104797,0.199282,0.312864,0.021908,1.267009,0.430066
2008-01-02,EMR,-0.867106,-0.856666,0.884121,-0.874124,-0.877633,-0.877633,-0.841049,-0.853584,-0.827919,-1.027403,...,0.851929,0.129145,0.422229,-1.868913,0.709664,-0.170609,0.413525,-1.246356,0.108239,0.392979
2008-01-02,PXD,-0.858324,-0.873183,-0.110341,-0.870248,-0.858767,-0.858767,-0.848548,-0.861174,-0.835324,-1.035568,...,0.915551,-0.202506,0.526871,-2.161186,0.853716,-0.069668,0.377265,-1.729026,-0.144767,0.483133
2008-01-02,REGN,-0.880905,-0.850589,0.444537,-0.867613,-0.897591,-0.897591,-0.858991,-0.870005,-0.847412,-0.967216,...,0.393072,-0.466791,0.494783,-1.863624,1.253222,-0.228964,0.369428,-1.431522,-0.031253,0.427842
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-12-09,URI,-0.140493,-0.042603,0.020682,-0.049973,-0.169237,0.163510,0.001404,0.001221,0.001586,-0.006696,...,-0.175781,0.542613,-0.397730,0.596429,-0.445849,0.397811,-0.335545,0.384041,-0.237357,-4.630393
2022-12-09,HRL,-0.217174,-0.184257,-0.453389,-0.225626,-0.251307,0.071499,-0.046337,-0.046654,-0.045987,-0.010564,...,0.297413,0.599097,-0.399005,1.042935,-0.258779,0.296674,-0.352078,0.697483,-0.218537,-9.528238
2022-12-09,SWK,-0.242735,-0.265533,-0.257842,-0.244971,-0.207175,0.120976,-0.099097,-0.100111,-0.098009,-0.034893,...,-0.155542,0.603663,-0.398094,0.247020,-0.392366,0.493863,-0.360263,0.057465,-0.491430,-6.210919
2022-12-09,CHD,-0.171475,-0.184257,-0.676882,-0.182293,-0.149107,0.186078,-0.139028,-0.141258,-0.136697,-0.078316,...,-0.683956,0.988395,-0.474151,0.471665,-0.269244,0.597738,-0.350092,0.346434,-0.357777,-4.527653


In [9]:
# import numpy as np

# # Check for NaNs in all samples of dl_train
# has_nan = False
# for i, sample in enumerate(dl_train):
#     if np.isnan(sample).any():
#         print(f"NaN found in sample {i}")
#         has_nan = True
#         break

# if not has_nan:
#     print("No NaN values found in dl_train.")

To debug go in base_model.py ---> train_epoch(self, data_loader)

It seems that the first prediction and step works correctly. But all the losses after the first one are nan!

- It seems that if I do not perform the gradient step everithing works fine!
- Now it computes the first losses and then explodes
- also changed those lines to add stability: 

-- torch.nn.utils.clip_grad_value_(self.model.parameters(), 3.0)

++ torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)


            

In [10]:
from master_bert import MASTERModel
import pickle
import numpy as np
import time


universe = 'sp500'
d_feat = 224
d_model = 256
t_nhead = 4
s_nhead = 2
dropout = 0.5
gate_input_start_index = 224
gate_input_end_index = 276

if universe == 'sp500':
    beta = 5
else:
    raise ValueError("Invalid universe")

n_epoch = 1
lr = 1e-5
GPU = 0
train_stop_loss_thred = 0.95


ic = []
icir = []
ric = []
ricir = []

# Training
######################################################################################
for seed in [0, 1]: #[0, 1, 2, 3, 4]:
    model = MASTERModel(
        d_feat = d_feat, d_model = d_model, t_nhead = t_nhead, s_nhead = s_nhead, T_dropout_rate=dropout, S_dropout_rate=dropout,
        beta=beta, gate_input_end_index=gate_input_end_index, gate_input_start_index=gate_input_start_index,
        n_epochs=n_epoch, lr = lr, GPU = GPU, seed = seed, train_stop_loss_thred = train_stop_loss_thred,
        save_path='model', save_prefix=f'{universe}'
    )

    start = time.time()
    # Train
    model.fit(dl_train, dl_valid)

    print("Model Trained.")

    # Test
    predictions, metrics = model.predict(dl_test)
    
    running_time = time.time()-start
    
    print('Seed: {:d} time cost : {:.2f} sec'.format(seed, running_time))
    print(metrics)

    ic.append(metrics['IC'])
    icir.append(metrics['ICIR'])
    ric.append(metrics['RIC'])
    ricir.append(metrics['RICIR'])
######################################################################################



  self.daily_count = pd.Series(index=self.data_source.get_index()).groupby("datetime").size().values


Loss: 1.6465351581573486
Loss: 1.4128618240356445
Loss: 1.4456347227096558
Loss: 1.4782273769378662
Loss: 1.3939889669418335
Loss: 1.5541136264801025
Loss: 1.2262176275253296
Loss: 1.367738127708435
Loss: 1.336190938949585
Loss: 1.2757395505905151
Loss: 1.1654170751571655
Loss: 1.1613320112228394
Loss: 1.1979951858520508
Loss: 1.0841726064682007
Loss: 1.1914963722229004
Loss: 1.1754536628723145
Loss: 1.074337363243103
Loss: 1.1869957447052002
Loss: 1.1105822324752808
Loss: 1.2213655710220337
Loss: 1.1722211837768555
Loss: 1.1243053674697876
Loss: 1.0997427701950073
Loss: 1.1697496175765991
Loss: 1.1564728021621704
Loss: 1.219059944152832
Loss: 1.2343429327011108
Loss: 1.1792058944702148
Loss: 1.0775591135025024
Loss: 1.1289008855819702
Loss: 1.0992090702056885
Loss: 1.1786965131759644
Loss: 1.1353332996368408
Loss: 1.1065329313278198
Loss: 1.1040585041046143
Loss: 1.1144760847091675
Loss: 1.1213186979293823
Loss: 1.1160811185836792
Loss: 1.0729506015777588
Loss: 1.1434720754623413
Loss

RuntimeError: Parent directory model does not exist.