In [8]:
import importlib.metadata
print(importlib.metadata.version("scikit-base"))


0.12.4


In [9]:
from importlib.metadata import distributions

for dist in distributions():
    try:
        name = dist.metadata['Name']
    except KeyError:
        print("Paquete con metadatos incompletos:", dist)


  name = dist.metadata['Name']


In [10]:
!pip cache purge
!pip install --force-reinstall pytorch-forecasting scikit-base


Files removed: 10 (1.9 MB)
Collecting pytorch-forecasting
  Downloading pytorch_forecasting-1.4.0-py3-none-any.whl.metadata (14 kB)
Collecting scikit-base
  Downloading scikit_base-0.12.4-py3-none-any.whl.metadata (8.8 kB)
Collecting numpy<=3.0.0 (from pytorch-forecasting)
  Downloading numpy-2.3.2-cp313-cp313-win_amd64.whl.metadata (60 kB)
Collecting torch!=2.0.1,<3.0.0,>=2.0.0 (from pytorch-forecasting)
  Downloading torch-2.7.1-cp313-cp313-win_amd64.whl.metadata (28 kB)
Collecting lightning<3.0.0,>=2.0.0 (from pytorch-forecasting)
  Downloading lightning-2.5.2-py3-none-any.whl.metadata (38 kB)
Collecting scipy<2.0,>=1.8 (from pytorch-forecasting)
  Downloading scipy-1.16.1-cp313-cp313-win_amd64.whl.metadata (60 kB)
Collecting pandas<3.0.0,>=1.3.0 (from pytorch-forecasting)
  Downloading pandas-2.3.1-cp313-cp313-win_amd64.whl.metadata (19 kB)
Collecting scikit-learn<2.0,>=1.2 (from pytorch-forecasting)
  Downloading scikit_learn-1.7.1-cp313-cp313-win_amd64.whl.metadata (11 kB)
Collec

  You can safely remove it manually.
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
numba 0.61.0 requires numpy<2.2,>=1.24, but you have numpy 2.3.2 which is incompatible.
s3fs 2025.3.2 requires fsspec==2025.3.2.*, but you have fsspec 2025.7.0 which is incompatible.
sklearn-compat 0.1.3 requires scikit-learn<1.7,>=1.2, but you have scikit-learn 1.7.1 which is incompatible.
streamlit 1.45.1 requires packaging<25,>=20, but you have packaging 25.0 which is incompatible.


In [7]:
!pip install pytorch_forecasting --upgrade

import pandas as pd
import numpy as np
from pathlib import Path
import torch
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error
import matplotlib.pyplot as plt
import joblib

from pytorch_forecasting import TimeSeriesDataSet, TemporalFusionTransformer
from pytorch_forecasting.metrics import RMSE
from pytorch_forecasting.data import NaNLabelEncoder
from pytorch_lightning import Trainer
import pytorch_lightning as pl



ValueError: A distribution name is required.

In [None]:
# Paths
FEATURES_DATA_DIR = Path("../data/processed")
MODELS_DIR = Path("../models")
MODELS_DIR.mkdir(exist_ok=True)

# Parameters
MAX_ENCODER_LENGTH = 30  # history length
MAX_PREDICTION_LENGTH = 1  # forecast horizon (1 day ahead)
EPOCHS = 50
BATCH_SIZE = 32
ACCELERATOR = "gpu" if torch.cuda.is_available() else "cpu"

pl.seed_everything(42)

In [None]:
# Loop over all assets
for feature_file in FEATURES_DATA_DIR.glob("*_features.csv"):
    asset_name = feature_file.stem.replace("_features", "")
    print(f"\n=== Training TFT for {asset_name} ===")

    # Load and prepare data
    df = pd.read_csv(feature_file).sort_values("Date").dropna()

    # Keep needed features
    feature_cols = [
        "Return", "Volatility_5d", "Volatility_21d",
        "RSI_14", "MACD", "MACD_signal", "MACD_hist",
        "ATR_14", "SMA_20", "SMA_50", "SMA_200"
    ]
    target_col = "Volatility_5d"

    # Scaling
    scaler_X = StandardScaler()
    df[feature_cols] = scaler_X.fit_transform(df[feature_cols])
    scaler_y = StandardScaler()
    df[target_col] = scaler_y.fit_transform(df[[target_col]])

    # Save scalers
    joblib.dump(scaler_X, MODELS_DIR / f"scaler_X_TFT_{asset_name}.pkl")
    joblib.dump(scaler_y, MODELS_DIR / f"scaler_y_TFT_{asset_name}.pkl")

    # Add time index and group id for TFT
    df["time_idx"] = np.arange(len(df))
    df["asset"] = asset_name

    # Train/test split index
    split_idx = int(len(df) * 0.8)

    # Define TimeSeriesDataSet
    training = TimeSeriesDataSet(
        df.iloc[:split_idx],
        time_idx="time_idx",
        target=target_col,
        group_ids=["asset"],
        min_encoder_length=MAX_ENCODER_LENGTH,
        max_encoder_length=MAX_ENCODER_LENGTH,
        max_prediction_length=MAX_PREDICTION_LENGTH,
        time_varying_known_reals=feature_cols,
        time_varying_unknown_reals=[target_col],
        target_normalizer=None
    )

    validation = TimeSeriesDataSet.from_dataset(training, df.iloc[split_idx:])

    train_dataloader = training.to_dataloader(train=True, batch_size=BATCH_SIZE, num_workers=0)
    val_dataloader = validation.to_dataloader(train=False, batch_size=BATCH_SIZE, num_workers=0)

    # Create TFT model
    tft = TemporalFusionTransformer.from_dataset(
        training,
        learning_rate=1e-3,
        hidden_size=32,
        attention_head_size=4,
        dropout=0.1,
        hidden_continuous_size=16,
        output_size=1,  # 1 target
        loss=RMSE(),
        log_interval=10,
        reduce_on_plateau_patience=4
    )

    trainer = Trainer(
        max_epochs=EPOCHS,
        accelerator=ACCELERATOR,
        devices=1,
        gradient_clip_val=0.1
    )

    # Train model
    trainer.fit(tft, train_dataloaders=train_dataloader, val_dataloaders=val_dataloader)

    # Save model
    tft_path = MODELS_DIR / f"volatility_model_tft_{asset_name}.ckpt"
    trainer.save_checkpoint(str(tft_path))

    # Predictions
    raw_predictions, x = tft.predict(val_dataloader, mode="raw", return_x=True)
    preds = tft.predict(val_dataloader).numpy().flatten()

    # Inverse scaling
    preds = scaler_y.inverse_transform(preds.reshape(-1, 1)).flatten()
    actuals = scaler_y.inverse_transform(x["decoder_target"].numpy().flatten().reshape(-1, 1)).flatten()

    # Metrics
    rmse = np.sqrt(mean_squared_error(actuals, preds))
    mae = mean_absolute_error(actuals, preds)
    print(f"{asset_name} → RMSE: {rmse:.6f}, MAE: {mae:.6f}")

    # Save predictions CSV
    pred_df = pd.DataFrame({
        "Date": df["Date"].iloc[split_idx + MAX_ENCODER_LENGTH:].values[:len(preds)],
        "Actual_Volatility": actuals[:len(preds)],
        "Predicted_Volatility": preds[:len(preds)]
    })
    pred_df.to_csv(MODELS_DIR / f"predictions_TFT_{asset_name}.csv", index=False)

    # Plot
    plt.figure(figsize=(12, 5))
    plt.plot(actuals, label="Actual Volatility")
    plt.plot(preds, label="Predicted Volatility")
    plt.title(f"TFT Volatility Prediction - {asset_name}")
    plt.legend()
    plt.show()