In [1]:
import pandas as pd
from pathlib import Path
import torch
import sys
sys.path.append('src')

# from models.ae_kan import KANAutoencoder
from src.utils.new_preprocessing import preprocessing_dataset, simple_train_kan, change_hyperparam

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"Total memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB")
    print(f"Memory available: {torch.cuda.mem_get_info()[0] / 1024**3:.1f} GB")

print(f"Device : {device}")

GPU: NVIDIA GeForce RTX 3070 Laptop GPU
Total memory: 8.0 GB
Memory available: 6.9 GB
Device : cuda


In [2]:
data_dir = Path("data/processed/sectors")
sectors_list = [d.name for d in data_dir.iterdir() if d.is_dir() and d.name != 'unknown']

sector_log_returns = {}
for sector in sectors_list:
    returns_path = data_dir / sector / "log_returns.csv"
    df = pd.read_csv(returns_path, index_col=0)
    sector_log_returns[sector] = df.iloc[1:]  # Skip first row with NaN values

In [3]:
X_df = {}
W_df = {}
M_df = {}
data = {}

for sector in sector_log_returns:
    # Preprocess data for each sector
    X_df[sector], W_df[sector], M_df[sector] = preprocessing_dataset(
        log_returns_df=sector_log_returns[sector],
        win=60,
        min_periods=40,
        clip_val=3.0,
        min_valid_per_day=5,
        use_median=True,
        soft_weights=True
    )
    
    # Create tensors for each sector
    tensors = {
        'X': torch.tensor(X_df[sector].values, dtype=torch.float32),
        'W': torch.tensor(W_df[sector].values, dtype=torch.float32),
        'M': torch.tensor(M_df[sector].values, dtype=torch.float32)
    }

    # Split into train/test for each sector
    train_size = int(0.8 * len(tensors['X']))
    data[sector] = {
        'train': {
            'X': tensors['X'][:train_size],
            'W': tensors['W'][:train_size],
            'M': tensors['M'][:train_size]
        },
        'test': {
            'X': tensors['X'][train_size:],
            'W': tensors['W'][train_size:],
            'M': tensors['M'][train_size:]
        }
    }

print(f"\nSector: {sector}")
print(f"Train: {data[sector]['train']['X'].shape[0]} échantillons")
print(f"Test: {data[sector]['test']['X'].shape[0]} échantillons") 
print(f"Dates train: {X_df[sector].index[0]} à {X_df[sector].index[train_size-1]}")
print(f"Dates test: {X_df[sector].index[train_size]} à {X_df[sector].index[-1]}")


Sector: utilities
Train: 2987 échantillons
Test: 747 échantillons
Dates train: 2010-03-03 à 2022-01-10
Dates test: 2022-01-11 à 2024-12-31


In [4]:
data["utilities"]["train"]["X"]

tensor([[ 0.7095, -0.2273,  1.4788,  ...,  0.6971, -0.0319,  0.1895],
        [ 0.5891, -0.2844, -0.7470,  ..., -0.0982, -0.4259, -0.9280],
        [ 0.3553,  0.0000,  0.2215,  ...,  0.7744, -0.5840, -0.4092],
        ...,
        [-0.0682,  0.0000,  0.2831,  ...,  0.9535, -0.1712, -0.3092],
        [-0.2909,  1.1647, -0.8697,  ...,  0.6001,  0.0765,  0.1190],
        [-0.4511, -0.1024,  0.0603,  ..., -0.0457,  0.0000,  1.2413]])

# PCA

In [None]:
X_df = {}
W_df = {}
M_df = {}
data = {}

for sector in sector_log_returns:
    # Preprocess data for each sector
    X_df[sector], W_df[sector], M_df[sector] = preprocessing_dataset(
        log_returns_df=sector_log_returns[sector],
        win=60,
        min_periods=40,
        clip_val=3.0,
        min_valid_per_day=5,
        use_median=True,
        soft_weights=True
    )
    
    # Create tensors for each sector
    tensors = {
        'X': torch.tensor(X_df[sector].values, dtype=torch.float32),
        'W': torch.tensor(W_df[sector].values, dtype=torch.float32),
        'M': torch.tensor(M_df[sector].values, dtype=torch.float32)
    }

    # Split into train/test for each sector
    train_size = int(0.8 * len(tensors['X']))
    data[sector] = {
        'train': {
            'X': tensors['X'][:train_size],
            'W': tensors['W'][:train_size],
            'M': tensors['M'][:train_size]
        },
        'test': {
            'X': tensors['X'][train_size:],
            'W': tensors['W'][train_size:],
            'M': tensors['M'][train_size:]
        }
    }

print(f"\nSector: {sector}")
print(f"Train: {data[sector]['train']['X'].shape[0]} échantillons")
print(f"Test: {data[sector]['test']['X'].shape[0]} échantillons") 
print(f"Dates train: {X_df[sector].index[0]} à {X_df[sector].index[train_size-1]}")
print(f"Dates test: {X_df[sector].index[train_size]} à {X_df[sector].index[-1]}")

# KAN AE

In [None]:
from src.utils.new_preprocessing import hyperparameter_comparison

sector = "financials"

hyperparams = {
    'hidden_dims_choices': [32, 16],
    'latent_dims': 8,
    
    # BASIS
    'basis_types': 'spline',
    'M_values': 16,
    'poly_degrees': 3,
    'use_silu_choices': True,
    'dropout_rates': 0,
    
    # SKIP LINEAIRE
    'use_global_skip': True,
    'use_skip_choices': False,
    'skip_init_choices': 'identity',
    'skip_gain_values': 0.1,
    'max_skip_gain': 0.3,
    
    # REGULARISATION
    'lambda_alpha_values': 1e-3,
    'lambda_group_values': 1e-4,
    'lambda_tv_values': 1e-5,
    'lambda_poly_decay_values': 1e-6,
    'lambda_skip_l2_values': 1e-3,
    'lambda_reg_values': 1e-4,
    
    # LOSS
    'loss_types': 'huber',
    'huber_deltas': 1.0,
    
    # OPTIMISATION
    'batch_sizes': 64,
    'learning_rates': 0.0002,
    'weight_decays': 0.000001
}