In [1]:
import pandas as pd
from pathlib import Path
import torch
import sys
sys.path.append('src')

# from models.ae_kan import KANAutoencoder
from src.utils.new_preprocessing import preprocessing_dataset, simple_train_kan, change_hyperparam

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"Total memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB")
    print(f"Memory available: {torch.cuda.mem_get_info()[0] / 1024**3:.1f} GB")

print(f"Device : {device}")

GPU: NVIDIA GeForce RTX 3070 Laptop GPU
Total memory: 8.0 GB
Memory available: 6.9 GB
Device : cuda


In [2]:
data_dir = Path("data/processed/sectors")
sectors_list = [d.name for d in data_dir.iterdir() if d.is_dir() and d.name != 'unknown']

sector_log_returns = {}
for sector in sectors_list:
    returns_path = data_dir / sector / "log_returns.csv"
    df = pd.read_csv(returns_path, index_col=0)
    sector_log_returns[sector] = df.iloc[1:]  # Skip first row with NaN values

In [3]:
X_df = {}
W_df = {}
M_df = {}
data = {}

for sector in sector_log_returns:
    # Preprocess data for each sector
    X_df[sector], W_df[sector], M_df[sector] = preprocessing_dataset(
        log_returns_df=sector_log_returns[sector],
        win=60,
        min_periods=40,
        clip_val=3.0,
        min_valid_per_day=5,
        use_median=True,
        soft_weights=True
    )
    
    # Create tensors for each sector
    tensors = {
        'X': torch.tensor(X_df[sector].values, dtype=torch.float32),
        'W': torch.tensor(W_df[sector].values, dtype=torch.float32),
        'M': torch.tensor(M_df[sector].values, dtype=torch.float32)
    }

    # Split into train/test for each sector
    train_size = int(0.8 * len(tensors['X']))
    data[sector] = {
        'train': {
            'X': tensors['X'][:train_size],
            'W': tensors['W'][:train_size],
            'M': tensors['M'][:train_size]
        },
        'test': {
            'X': tensors['X'][train_size:],
            'W': tensors['W'][train_size:],
            'M': tensors['M'][train_size:]
        }
    }

print(f"\nSector: {sector}")
print(f"Train: {data[sector]['train']['X'].shape[0]} échantillons")
print(f"Test: {data[sector]['test']['X'].shape[0]} échantillons") 
print(f"Dates train: {X_df[sector].index[0]} à {X_df[sector].index[train_size-1]}")
print(f"Dates test: {X_df[sector].index[train_size]} à {X_df[sector].index[-1]}")


Sector: utilities
Train: 2987 échantillons
Test: 747 échantillons
Dates train: 2010-03-03 à 2022-01-10
Dates test: 2022-01-11 à 2024-12-31


In [5]:
log_returns_df = pd.read_csv("data/processed/dataset_log_returns.csv", index_col=0)
log_returns_df = log_returns_df.iloc[1:]

In [6]:
sp500_X_df, sp500_W_df, sp500_M_df = preprocessing_dataset(
    log_returns_df=log_returns_df,  # Utilise tous les returns
    win=60,
    min_periods=40, 
    clip_val=3.0,
    min_valid_per_day=5,
    use_median=True,
    soft_weights=True
)

sp500_tensors = {
    'X': torch.tensor(sp500_X_df.values, dtype=torch.float32),
    'W': torch.tensor(sp500_W_df.values, dtype=torch.float32), 
    'M': torch.tensor(sp500_M_df.values, dtype=torch.float32)
}

# Split train/test
train_size = int(0.8 * len(sp500_tensors['X']))
full_data = {
    'train': {
        'X': sp500_tensors['X'][:train_size],
        'W': sp500_tensors['W'][:train_size],
        'M': sp500_tensors['M'][:train_size]
    },
    'test': {
        'X': sp500_tensors['X'][train_size:],
        'W': sp500_tensors['W'][train_size:],
        'M': sp500_tensors['M'][train_size:]
    }
}

# PCA

In [8]:
sp500_X_df

Unnamed: 0_level_0,A,AAPL,ABT,ACGL,ACN,ADBE,ADI,ADM,ADP,ADSK,...,WSM,WST,WTW,WY,WYNN,XEL,XOM,YUM,ZBH,ZBRA
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2010-03-03,0.176307,0.214508,-1.002671,-0.784434,0.260013,0.294284,-0.259420,-0.019599,0.290195,0.137718,...,0.714586,-0.103920,-0.656116,0.563485,-0.259244,0.093261,0.094907,-0.124138,0.069346,-0.242135
2010-03-04,-0.586759,0.231693,-0.137763,0.030374,0.636821,0.155739,-0.257542,0.517186,-0.612935,-0.502886,...,0.542160,-0.152967,0.435954,-0.397244,-0.447529,-0.911701,-0.063012,0.018323,-0.372245,-1.595846
2010-03-05,1.001215,1.125137,-0.299301,-0.531185,-0.055676,-0.354772,0.082053,0.790601,2.588237,-0.289820,...,0.879064,-0.220492,-0.540799,0.313310,0.212647,-0.223054,0.846528,0.977798,0.512938,-0.613683
2010-03-08,-0.410406,0.016576,0.216081,2.360613,0.775418,-0.079945,0.134045,-0.069078,-0.072619,-0.020488,...,-0.514649,-0.558655,0.083071,0.303825,0.660560,-0.179699,0.104157,1.099242,-0.528098,0.184252
2010-03-09,0.488803,0.888241,0.811326,-1.658440,1.525193,0.193697,-0.412055,-0.095812,1.158743,0.131327,...,-0.360237,0.235009,1.285263,-0.020657,0.377738,-0.088043,0.522996,2.212131,-0.417760,0.060997
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-12-24,0.307425,0.380875,-0.127632,-0.039825,-0.023112,-0.292695,0.413648,-0.028652,0.138817,0.120384,...,-0.407859,-0.202417,-0.169172,0.119537,-0.135992,0.046578,-0.281899,0.448062,-0.152951,0.272624
2024-12-26,-0.111896,0.020888,0.333901,0.214894,-0.302760,0.201616,-0.053040,0.077382,0.080722,-0.356029,...,-0.176964,-0.058574,0.328683,-0.085921,0.251081,-0.161660,0.176920,0.736203,-0.099286,0.082971
2024-12-27,0.333212,-1.072575,0.123520,0.188931,-0.359074,0.145287,0.136232,0.504287,-0.036963,-0.217599,...,0.074991,0.299266,-0.111940,0.159552,0.187835,0.275984,0.567581,-0.157954,0.107513,-1.038315
2024-12-30,0.129877,-0.842167,-1.268980,0.463963,-0.046032,0.606205,-0.577314,0.072116,-0.519967,0.477123,...,0.559781,0.040173,-0.329955,0.525171,-0.634840,-0.157504,0.174249,-0.713402,-0.302145,-0.405026
