File to compute the baselines with skitlearn

In [59]:
from master_bert import MASTERModel
import pickle
import numpy as np
import time

from utils import load_all_csv_data_with_market_indexes, load_all_csv_data_without_index, csvs_to_qlib_df, PandasDataLoader
# Please install qlib first before load the data.

# Qlib
# import qlib
# from qlib.config import REG_US           # S&P 500 is a US market
# qlib.init(provider_uri=".", region=REG_US)   # provider_uri just needs to exist





# ------------------------------------------------------------
# 1.  Init Qlib and build *one* handler
import qlib, pandas as pd, numpy as np, torch
qlib.init()                               # client mode is fine

from qlib.data.dataset.loader import StaticDataLoader
from qlib.data.dataset.handler import DataHandlerLP
from qlib.data.dataset import TSDatasetH          # <-- here
from qlib.data.dataset.processor import (
    DropnaProcessor, CSZScoreNorm, DropnaLabel,
)

# your tensor, names, dates exactly as before  ----------------
# stock_tensor, stock_names, feature_names = load_all_csv_data_without_index()
stock_tensor, stock_names, feature_names = load_all_csv_data_with_market_indexes()
N, T, K   = stock_tensor.shape
print("Shape: ", stock_tensor.shape)
# dates     = pd.read_csv("data/enriched/market_indexes_aggregated.csv")["Date"]
dates = pd.to_datetime(                     # <-- NEW
    pd.read_csv("data/enriched/market_indexes_aggregated.csv")["Date"]
)

# tensor ➜ tidy multi-index frame --------------------------------
def tensor_to_df(tensor, inst, feats, dt_index):
    flat = tensor.numpy().reshape(N * T, K)
    idx  = pd.MultiIndex.from_product([dt_index, inst],
                                      names=["datetime", "instrument"])
    cols = pd.MultiIndex.from_product([["feature"], feats])
    return pd.DataFrame(flat, index=idx, columns=cols)

df_raw = tensor_to_df(stock_tensor, stock_names, feature_names, dates)

# optional: build a forward-return label
df_raw[("label", "FWD_RET")] = (
    df_raw[("feature", "Adjusted Close")]
      .groupby("instrument").shift(-1) / df_raw[("feature", "Adjusted Close")] - 1
)

last_date = dates.iloc[-1]
df_raw = df_raw.drop(index=last_date, level="datetime")

# handler with learn / infer processors ------------------------
proc_feat = [
    {"class": "DropnaProcessor", "kwargs": {"fields_group": "feature"}},
    {"class": "CSZScoreNorm",   "kwargs": {"fields_group": "feature"}},
]

# proc_feat = [
#     {"class": "CSZScoreNorm",   "kwargs": {"fields_group": "feature"}},
# ]

# proc_feat = [
#     {"class": "Fillna",          # <— correct name
#      "kwargs": {"fields_group": "feature", "fill_value": 0}},  # zero-fill; choose ffill/bfill/etc. if you like
#     {"class": "CSZScoreNorm",
#      "kwargs": {"fields_group": "feature"}},
# ]

proc_label = [{"class": "DropnaLabel"}]

handler = DataHandlerLP(
    data_loader      = StaticDataLoader(df_raw),
    infer_processors = proc_feat,          # what the model will see later
    learn_processors = proc_feat + proc_label,
)
handler.fit_process_data()                 # learn z-scores, etc.

# ------------------------------------------------------------
# 2.  Attach time splits in a TSDatasetH
split = {
    "train": (dates.iloc[8],              dates.iloc[int(T*0.8) - 1]),
    "valid": (dates.iloc[int(T*0.8)],     dates.iloc[int(T*0.9) - 1]),
    "test" : (dates.iloc[int(T*0.9)],     dates.iloc[-2]),
}

ts_ds = TSDatasetH(
    handler  = handler,
    segments = split,
    step_len = 8,          # same window the MASTER code expects
)

dl_train = ts_ds.prepare("train")   # ➜ TSDataSampler
dl_valid = ts_ds.prepare("valid")
dl_test  = ts_ds.prepare("test")





print(len(dl_train), len(dl_valid), len(dl_test))
#  → continue with your for-loop over seeds exactly as before
# ------------------------------------------------------------



[1340207:MainThread](2025-05-15 20:19:34,702) INFO - qlib.Initialization - [config.py:420] - default_conf: client.
[1340207:MainThread](2025-05-15 20:19:34,703) INFO - qlib.Initialization - [__init__.py:74] - qlib successfully initialized based on client settings.
[1340207:MainThread](2025-05-15 20:19:34,704) INFO - qlib.Initialization - [__init__.py:76] - data_path={'__DEFAULT_FREQ': PosixPath('/home/gabrielecarrino/.qlib/qlib_data/cn_data')}


Shape:  torch.Size([336, 3764, 276])


[1340207:MainThread](2025-05-15 20:19:56,119) INFO - qlib.timer - [log.py:127] - Time cost: 0.249s | Loading data Done
[1340207:MainThread](2025-05-15 20:19:57,995) INFO - qlib.timer - [log.py:127] - Time cost: 0.083s | Fillna Done
[1340207:MainThread](2025-05-15 20:20:31,572) INFO - qlib.timer - [log.py:127] - Time cost: 33.577s | CSZScoreNorm Done
[1340207:MainThread](2025-05-15 20:20:32,371) INFO - qlib.timer - [log.py:127] - Time cost: 0.267s | Fillna Done
[1340207:MainThread](2025-05-15 20:21:08,508) INFO - qlib.timer - [log.py:127] - Time cost: 36.137s | CSZScoreNorm Done
[1340207:MainThread](2025-05-15 20:21:09,024) INFO - qlib.timer - [log.py:127] - Time cost: 0.515s | DropnaLabel Done
[1340207:MainThread](2025-05-15 20:21:09,026) INFO - qlib.timer - [log.py:127] - Time cost: 72.907s | fit & process data Done
[1340207:MainThread](2025-05-15 20:21:09,026) INFO - qlib.timer - [log.py:127] - Time cost: 73.157s | Init data Done
[1340207:MainThread](2025-05-15 20:21:10,951) INFO - q

1009008 126336 126336


In [60]:
# grab the very first sample
sample = dl_train[0]

# this will print something like (step_len, num_features)
print("Sample shape:", sample.shape)

# so the number of features is the second entry:
print("Number of features:", sample.shape[1])

Sample shape: (8, 277)
Number of features: 277


In [61]:
first_element = dl_train[0]
print(first_element)

[[ 0.96814084  0.9395819   0.22628054 ... -1.1720321   0.6685043
  -0.5653904 ]
 [-1.1040634  -1.1261338   0.4410714  ...  1.5659484   2.9029303
   0.53851366]
 [ 1.1706895   1.384864    1.6147026  ...  0.37965342  1.0770955
   0.32320356]
 ...
 [ 1.2229986   1.1834214  -0.09168546 ...  0.13941513  1.1165327
   0.37429821]
 [ 1.0095934   0.9493088  -1.0993657  ...  1.379624   -0.36605775
  -0.00969595]
 [-0.00730161  0.01083715  0.648874   ... -0.15530944  0.06908915
   0.17126465]]


In [62]:
sample = dl_test[0]    # The first 6 samples have nan!!!
print(type(sample))
print(len(sample))
print(sample)

<class 'numpy.ndarray'>
8
[[ 1.1371658   1.2192398   0.06807215 ... -0.40199393 -0.13715328
   0.26629865]
 [ 0.8104964   0.8949515  -0.7865663  ... -0.47122377 -0.08420499
  -0.23058861]
 [-0.97513354 -0.95458597  0.8302189  ...  0.14509037  0.47901008
  -0.9181567 ]
 ...
 [-0.31323737 -0.30636364 -0.5390615  ... -0.5482849  -1.0667828
   0.2168212 ]
 [-0.07971064 -0.06750467  0.3454849  ... -0.74869204 -0.8359446
   0.6296612 ]
 [ 1.502977    1.4761885  -0.62532127 ...  0.21999142  0.30950865
  -0.01254231]]


In [63]:
features = sample[:, :-1]
labels = sample[:, -1]
print("Features shape:", features.shape)
print("Labels shape:", labels.shape)
print("First row of features:", features[0])
print("First label:", labels[0])

Features shape: (8, 276)
Labels shape: (8,)
First row of features: [ 1.1371658   1.2192398   0.06807215  1.1960052   1.1465175   1.1465175
  1.1414465   1.1602948   1.1222199   1.2301251   0.8349727   0.8475883
  0.847749    1.2997905   1.1438606   0.09125762  2.5021372  -0.26306555
  0.81902313  0.823392   -0.823392    1.3333079   0.94988275  0.99541974
  1.017623    0.96200424  0.850138    0.8542722  -0.8542722   0.87507695
 -0.9328872   0.9364311   1.0927722   1.1833836   1.1129961   1.151287
  1.1808496   0.04754018  0.01123824  1.1071308  -0.7511019   0.28471315
  0.41079995  0.66855896 -0.4076399  -0.04157145  1.4958638   1.3655212
  1.2085332   1.2100871  -1.1757846   1.5104822  -1.7810059   0.97323525
  0.9547204   1.1100832   0.88227993  1.1057249   1.251671    1.2443619
  1.079012    1.1831415   0.77510476  0.9248912   1.0595074   0.35886338
  1.0139277   0.35948348  1.0417379   1.4916648   1.8044392   1.323453
  1.4706851   1.0766426   1.1094129   1.1645534   1.1608937   1.1

In [64]:
print(stock_tensor.shape)

torch.Size([336, 3764, 276])


In [65]:
# Drop all rows in df_raw where the datetime is the last date in 'dates'
# last_date = dates.iloc[-1]
# df_raw = df_raw.drop(index=last_date, level="datetime")

In [66]:
df_raw

Unnamed: 0_level_0,Unnamed: 1_level_0,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,label
Unnamed: 0_level_1,Unnamed: 1_level_1,Low,Open,Volume,High,Close,Adjusted Close,ABER_ZG_5_15,ABER_SG_5_15,ABER_XG_5_15,ABER_ATR_5_15,...,volume_std_5_rel_forbes2000,ret_mean_10_forbes2000,ret_std_10_forbes2000,volume_mean_10_rel_forbes2000,volume_std_10_rel_forbes2000,ret_mean_30_forbes2000,ret_std_30_forbes2000,volume_mean_30_rel_forbes2000,volume_std_30_rel_forbes2000,FWD_RET
datetime,instrument,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
2008-01-02,BAC,40.349998,41.529999,30764600.0,41.549999,40.560001,31.315887,41.396000,42.666706,40.125294,1.270706,...,0.185704,0.001766,0.012605,0.749271,0.243681,0.000489,0.014268,0.902787,0.242897,-0.722459
2008-01-02,NDAQ,40.250000,40.630001,22589500.0,40.810001,40.299999,31.115147,41.062000,42.285324,39.838673,1.223326,...,0.216941,0.002752,0.012290,0.853202,0.262097,0.000809,0.014226,1.046181,0.281136,-0.708290
2008-01-02,CAT,39.759998,40.000000,34921700.0,40.290001,39.849998,30.767687,40.734665,41.912437,39.556896,1.177771,...,0.222938,0.000906,0.014950,0.643741,0.224113,0.000046,0.014985,0.748429,0.193738,-0.654649
2008-01-02,DPZ,39.259998,40.049999,31884400.0,40.240002,39.900002,30.806311,40.439335,41.603920,39.274746,1.164587,...,0.229717,-0.000375,0.015823,0.583389,0.237961,0.000254,0.014711,0.635112,0.174655,-0.632794
2008-01-02,UEEC,38.279999,40.180000,42143300.0,40.209999,38.410000,29.655882,40.001331,41.216946,38.785721,1.215614,...,0.174049,-0.001396,0.015556,0.644833,0.276365,-0.000074,0.014652,0.704957,0.183217,-0.599914
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-12-09,RJF,58.160000,58.209999,966900.0,58.799999,58.709999,57.525780,57.799999,58.937862,56.662136,1.137862,...,0.113441,0.002502,0.008105,1.152821,0.113035,0.001218,0.009338,1.131199,0.167672,-0.082846
2022-12-09,PLD,58.240002,58.740002,874000.0,59.189999,59.119999,57.927513,58.236668,59.362003,57.111328,1.125338,...,0.163857,0.002720,0.008094,1.245022,0.131018,0.001009,0.009213,1.204786,0.169091,-0.095594
2022-12-09,KEY,59.049999,59.049999,769200.0,60.209999,60.090000,58.877949,58.700001,59.827648,57.572350,1.127649,...,0.115597,0.002738,0.008102,1.080670,0.118176,0.001416,0.009151,1.054529,0.148509,-0.122592
2022-12-09,LH,60.000000,60.259998,778200.0,60.689999,60.650002,59.426651,59.164665,60.263138,58.066196,1.098472,...,0.059297,0.004224,0.007467,1.127058,0.130012,0.001630,0.009228,1.122370,0.158589,-0.145501


In [67]:
import numpy as np

# Check for NaNs in all samples of dl_train
has_nan = False
for i, sample in enumerate(dl_train):
    if np.isnan(sample).any():
        print(f"NaN found in sample {i}")
        has_nan = True
        break

if not has_nan:
    print("No NaN values found in dl_train.")

NaN found in sample 0
