### Setup

In [None]:
import sys

if 'google.colab' in sys.modules:
    %pip install -q -U keras_tuner

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/129.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m129.1/129.1 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# Install epftoolbox straight from its GitHub repository. No tag or commit is
# given, so pip installs whatever the repository's default branch holds.
# NOTE(review): consider appending '@<commit-sha>' to the URL for reproducible runs.
%pip install 'git+https://github.com/jeslago/epftoolbox.git'

In [None]:
%pip install finta

Collecting finta
  Downloading finta-1.3-py3-none-any.whl.metadata (6.4 kB)
Downloading finta-1.3-py3-none-any.whl (29 kB)
Installing collected packages: finta
Successfully installed finta-1.3


In [None]:
%pip install optuna

Collecting optuna
  Downloading optuna-4.5.0-py3-none-any.whl.metadata (17 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.10.1-py3-none-any.whl.metadata (11 kB)
Downloading optuna-4.5.0-py3-none-any.whl (400 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m400.9/400.9 kB[0m [31m15.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.10.1-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, optuna
Successfully installed colorlog-6.10.1 optuna-4.5.0


In [None]:
%pip install optuna-integration[xgboost]

Collecting optuna-integration[xgboost]
  Downloading optuna_integration-4.5.0-py3-none-any.whl.metadata (12 kB)
Downloading optuna_integration-4.5.0-py3-none-any.whl (99 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.1/99.1 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: optuna-integration
Successfully installed optuna-integration-4.5.0


In [None]:
import os
import sys
from pathlib import Path

import finta
import keras_tuner as kt
import optuna
import optuna_integration
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn
import sklearn
import statsmodels
import tensorflow as tf
import tqdm
from google.colab import drive

In [None]:
# Mount Google Drive at /content/drive.
drive.mount('/content/drive')

# --- File names ---------------------------------------------------------
CONFIG_FILENAME = 'config.yaml'
BTC_CLEANED_CSV_FILENAME = 'ohlcv_274_cleaned.csv'
FEDFUNDRATE_CSV_FILENAME = 'fedfunds.csv'
M2SL_CSV_FILENAME = 'm2sl.csv'

# --- Directory layout on the mounted drive --------------------------------
DRIVE_ROOT = Path('/content/drive/MyDrive')
PROJECT_ROOT = DRIVE_ROOT.joinpath('projects/btc')
PROJECT_DATA_DIR = PROJECT_ROOT.joinpath('data')
PROJECT_CLEANED_DIR = PROJECT_DATA_DIR.joinpath('cleaned')
PROJECT_EXTERNAL_DIR = PROJECT_DATA_DIR.joinpath('external')

# --- Full paths to the files used by this notebook ------------------------
CONFIG_FILE_PATH = PROJECT_ROOT.joinpath(CONFIG_FILENAME)
BTC_CLEANED_FILE_PATH = PROJECT_CLEANED_DIR.joinpath(BTC_CLEANED_CSV_FILENAME)
FEDFUNDRATE_CSV_FILE_PATH = PROJECT_EXTERNAL_DIR.joinpath(FEDFUNDRATE_CSV_FILENAME)
M2SL_CSV_FILE_PATH = PROJECT_EXTERNAL_DIR.joinpath(M2SL_CSV_FILENAME)

Mounted at /content/drive


In [None]:
# Put the project directory on sys.path so the `src` package can be imported
# by the following cell.
project_root = os.path.abspath(PROJECT_ROOT)

if project_root not in sys.path:
    sys.path.append(project_root)
    print(f'Project root \'{project_root}\' added to sys.path.')
else:
    # Re-running the cell must not report an addition that did not happen.
    print(f'Project root \'{project_root}\' already in sys.path.')

Project root '/content/drive/MyDrive/projects/btc' added to sys.path.


In [None]:
from src.analysis import *
from src.data_loader import *
from src.data_splitter import *
from src.data_cleaner import *
from src.model import *
from src.preprocessing import *
from src.utils import *
from src.visualization import *

In [None]:
# Print the interpreter and library versions used by this run.
# Packages without a `__version__` line in the original report (finta,
# optuna-integration, tqdm) are looked up through the installed distribution
# metadata, which does not depend on what the module name is currently bound to.
from importlib import metadata as _dist_metadata

print('--- Python Interpreter and Standard Library Versions ---')
print(f'- python: {sys.version}')
print(f'- python info: {sys.version_info.major}.{sys.version_info.minor}.{sys.version_info.micro}')

print('- os, hashlib, pathlib module: (version tied to Python interpreter)')

print('\n--- Third-Party Library Versions ---')
print(f'- finta: {_dist_metadata.version("finta")}')
print(f'- keras_tuner: {kt.__version__}')
print(f'- optuna: {optuna.__version__}')
print(f'- optuna_integration: {_dist_metadata.version("optuna-integration")}')
print(f'- matplotlib: {matplotlib.__version__}')
print(f'- numpy: {np.__version__}')
print(f'- pandas: {pd.__version__}')
print(f'- seaborn: {seaborn.__version__}')
print(f'- sklearn: {sklearn.__version__}')
print(f'- statsmodels: {statsmodels.__version__}')
print(f'- tensorflow: {tf.__version__}')
print(f'- tqdm: {_dist_metadata.version("tqdm")}')

print('\n--- Google Colab Specific Module Versions ---')
print('- google.colab.drive: (version tied to Colab environment)')

--- Python Interpreter and Standard Library Versions ---
- python: 3.12.12 (main, Oct 10 2025, 08:52:57) [GCC 11.4.0]
- python info: 3.12.12
- os, hashlib, pathlib module: (version tied to Python interpreter)

--- Third-Party Library Versions ---
- keras_tuner: 1.4.7
- optuna: 4.5.0
- matplotlib: 3.10.0
- numpy: 2.0.2
- pandas: 2.2.2
- seaborn: 0.13.2
- sklearn: 1.6.1
- statsmodels: 0.14.5
- tensorflow: 2.19.0

--- Google Colab Specific Module Versions ---
- google.colab.drive: (version tied to Colab environment)


In [None]:
# Show pip's metadata (name, version, location, dependencies) for the installed
# epftoolbox package, as a record of what this run used.
%pip show epftoolbox

In [None]:
# Show pip's metadata (name, version, location, dependencies) for the installed
# finta package, as a record of what this run used.
%pip show finta

Name: finta
Version: 1.3
Summary: Common financial technical indicators implemented in Pandas.
Home-page: https://github.com/peerchemist/finta
Author: Peerchemist
Author-email: peerchemist@protonmail.ch
License: LGPLv3+
Location: /usr/local/lib/python3.12/dist-packages
Requires: numpy, pandas
Required-by: 


### Load the cleaned dataset.

In [None]:
# Load the cleaned BTC OHLCV CSV from the project's `cleaned` data directory.
# NOTE(review): 'date' and True are positional; presumably the date column name
# and a parse/index flag — confirm against load_btc_ds's signature.
cleaned_ds = load_btc_ds(BTC_CLEANED_FILE_PATH, 'date', True)


Loading dataset from: ohlcv_274_cleaned.csv...
Dataset loaded successfully.


### Perform linear interpolation to fill the missing values on March 15, 2025, to ensure a fair and correct comparison with other models.

In [None]:
# Fill gaps in the cleaned series before it is resampled.
# NOTE(review): 'min' is presumably the expected sampling frequency (minutes) —
# confirm against fill_time_series_gaps.
imputed_ds = fill_time_series_gaps(cleaned_ds, 'min')

### Resample the cleaned dataset into multiple timeframes.

In [None]:
# Resample the gap-filled data to several timeframes. The result is indexed by
# timeframe name; the 'daily' entry is the one used for splitting below.
resampled_ds = resample_btc_data(imputed_ds)


--- Resampling BTC Data to Multiple Timeframes ---
- Resampling to hourly frequency...
- Resampling to daily frequency...
- Resampling to weekly frequency...
- Resampling to monthly frequency...
--- BTC Data Resampling Complete ---


### Split the resampled datasets into training, validation, and test sets of the target timeframe.

In [None]:
# Split the daily series into training, validation and test sets.
# NOTE(review): 'Day' appears to be a label used in the helper's log output —
# confirm it has no effect on where the split boundaries fall.
train_ds, valid_ds, test_ds = split_btc_ds(resampled_ds['daily'], 'Day')


--- Splitting the BTC Dataset into Training, Validation, and Test Sets (Day) ---
--- BTC Dataset Splitting Complete ---


### Run the ADF test on the training set's log returns at the target frequency to verify stationarity (the ADF test checks for a unit root; it does not, by itself, assess seasonality).

In [None]:
# ADF test on the log returns of the training-set close price. A copy of
# train_ds is passed to the transform so the original split is not handed over.
run_adf_test(
    calculate_log_and_diff(train_ds.copy(), 'close'),
    'log_returns_close',
    'BTC Daily Closing Returns',
)

= ADF Test: BTC Daily Closing Returns =
ADF Statistics: -10.8445
p-value: 0.0000
Critical Values:
	1%:-3.4320
	5%:-2.8623
	10%:-2.5672
Conclusion: The p-value is less than or equal to 0.05. The data is likely stationary and seasonal.



### Prepare the dataset by consolidating data splits, adding a comprehensive suite of features, and then splitting the data back.

In [None]:
# Build the feature-engineered train/validation/test sets from the three raw
# splits, the OHLCV column names, and the two external macro CSV files.
train_prep_ds, valid_prep_ds, test_prep_ds = prepare_feature_ds(
    train_ds,
    valid_ds,
    test_ds,
    ['open', 'high', 'low', 'close', 'volume'],
    FEDFUNDRATE_CSV_FILE_PATH,
    M2SL_CSV_FILE_PATH,
)

= Feature Engineering and Dataset Preparation =
Step 1: Concatenating data splits...
- Combined train, validation, and test sets into a single DataFrame. Full shape: (4922, 5)

Step 2: Engineering new features...
- Calculated log returns for ['open', 'high', 'low', 'close', 'volume'].
- Calculated the high-low price range as a percentage of the open price.
- Calculated Simple Moving Average (SMA) for trend.
- Calculated Relative Strength Index (RSI) for momentum.
- Calculated Average True Range (ATR) for volatility.
- Calculated On-Balance Volume (OBV) for volume pressure.
- Calculated Moving Average Convergence Divergence (MACD).
- Engineered relationship features
- Calculated technical indicators.


  result = getattr(ufunc, method)(*inputs, **kwargs)


- Macro data loaded and prepared.
- Macro data merged.
- Added macroeconomic features.

Step 3: Handling missing and infinite values...
- Replaced 0 infinite values (inf, -inf) with NaN.
- Forward-filled missing values to maintain data continuity.
- Shape before dropping NaNs: (4922, 28)
- Shape after dropping NaNs: (4873, 28)
- Dropped initial rows containing NaNs that resulted from feature calculations.

Step 4: Splitting into final datasets...
- Split the single processed DataFrame back into train, validation, and test sets.
- Train shape: (3888, 28)
- Valid shape: (492, 28)
- Test shape: (493, 28)
= Feature Preparation Complete =


### Fine-tune hyperparameters.
1.   Architectural Hyperparameters
  *   `n_estimators`
      * This is the total number of trees the model will build. More trees allow the model to learn complex patterns, but too many can lead to overfitting and longer training times.
  *   `max_depth`
      * This controls the maximum number of levels in each tree, managing its complexity. Deeper trees can capture more specific patterns but are very likely to overfit.
  *   `min_child_weight`
      * This sets the minimum sum of weights required in a leaf node. It prevents the tree from creating splits based on small, noisy groups of data, thus controlling overfitting.

2.   Training and Optimization Hyperparameters
  *   `learning_rate`
      * This shrinkage parameter scales the contribution of each new tree. A smaller value makes the model more robust and less prone to overfitting but requires more trees (`n_estimators`) to train.
  *   `objective`
      * This defines the mathematical goal, or loss function, that the model tries to minimize. For example, use `reg:pseudohubererror` for regression robust to outliers or `binary:logistic` for binary classification.

3.   Regularization and Stochasticity Hyperparameters
  *   `subsample`
      * This is the fraction of training data (rows) randomly sampled for building each tree. Using a value less than 1.0 (e.g., 0.8) prevents overfitting by making the model less dependent on specific training samples.
  *   `colsample_bytree`
      * This is the fraction of features (columns) randomly sampled for building each tree. This forces the model to find diverse patterns instead of relying on the same few strong features.
  *   `reg_alpha` (L1 Regularization)
      * This applies an L1 regularization penalty (based on absolute weight values) to the leaf weights. It encourages sparsity, potentially shrinking the weights of less useful leaves all the way to zero.
  *   `reg_lambda` (L2 Regularization)
      * This applies an L2 regularization penalty (based on squared weight values) to the leaf weights. It makes the model more conservative by preventing any single tree from having too much influence.

4.   Utility Hyperparameters
  *   `n_jobs`
      * This specifies the number of parallel threads to use for training. Set this to -1 to use all available CPU cores, which significantly speeds up computation without affecting model performance.

### XGBoost Model Tuning (1-Week)

In [None]:
# Seed Python's `random`, NumPy and TensorFlow in one call. tf.random.set_seed
# alone seeds only TensorFlow's global generator.
tf.keras.utils.set_random_seed(42)

In [None]:
# Settings for the 1-week experiment (values are in days, matching the 'Days'
# unit passed to the helpers below).
TARGET_COL = 'close'
FORECAST_HORIZON = 7
INPUT_WINDOW = 7
TARGET_WINDOW = 7

# Every column of the prepared training set is offered to the tuner.
FULL_COLS = train_prep_ds.columns.tolist()

In [None]:
# Naive baseline on the validation set. The target column and horizon come from
# the constants defined in the previous cell, so the baseline cannot drift away
# from the settings used for the XGBoost tuning run.
naive_metrics, naive_pred = train_and_forecast_with_naive_model(
    valid_prep_ds, TARGET_COL, FORECAST_HORIZON, 'Validation', 'Days'
)

= Naive Model Training, Evaluation, and Forecasting on Validation Set (Horizon: 7 Days) =
--- Fitting Naive Model ---

--- Evaluating Naive Model ---
- Mean Absolute Percentage Error (MAPE): 4.7134%
- Directional Accuracy (DA): 44.7699%

--- Generating Final Naive Forecast (Threshold: 0.5%) ---
- Forecast for 2024-02-22: $52003.00
- Directional Signal for 2024-02-22: 1.0


In [None]:
# Tune XGBoost on the validation set for the current horizon; the naive metrics
# are passed along as the benchmark. The result is unpacked in the next cell.
xgb_params_and_metrics = find_best_xgb_params(
    train_prep_ds,
    valid_prep_ds,
    FULL_COLS,
    TARGET_COL,
    FORECAST_HORIZON,
    INPUT_WINDOW,
    TARGET_WINDOW,
    'Days',
    naive_metrics,
)

[I 2025-10-26 13:04:42,293] A new study created in memory with name: no-name-1449312c-8cc6-4406-a359-db88299a5ea6


= XGBoost Model Tuning on Validation Set (Horizon: 7 Days) =

--- Preparing Data for XGBoost ---

Starting hyperparameter tuning for XGBoost model...


[I 2025-10-26 13:04:42,674] Trial 0 finished with value: 0.364401597855321 and parameters: {'max_depth': 5, 'learning_rate': 0.0862735828664018, 'min_child_weight': 8, 'subsample': 0.8795975452591109, 'colsample_bytree': 0.7468055921327309, 'reg_alpha': 6.025215736203858e-05, 'reg_lambda': 1.951722464144947e-05}. Best is trial 0 with value: 0.364401597855321.
[I 2025-10-26 13:04:43,377] Trial 1 finished with value: 0.36406029269803075 and parameters: {'max_depth': 9, 'learning_rate': 0.03027182927734624, 'min_child_weight': 8, 'subsample': 0.7061753482887407, 'colsample_bytree': 0.9909729556485982, 'reg_alpha': 0.1452824663751603, 'reg_lambda': 0.00011526449540315612}. Best is trial 1 with value: 0.36406029269803075.
[I 2025-10-26 13:04:43,746] Trial 2 finished with value: 0.3641436533751608 and parameters: {'max_depth': 4, 'learning_rate': 0.008661333735273127, 'min_child_weight': 4, 'subsample': 0.8574269294896714, 'colsample_bytree': 0.8295835055926347, 'reg_alpha': 0.00028585493941

Best MAE: 0.3626

XGBoost hyperparameter tuning complete:
- max_depth: 8
- learning_rate: 0.02035961462462537
- min_child_weight: 3
- subsample: 0.8122909048600037
- colsample_bytree: 0.756553078833234
- reg_alpha: 0.9151445932854982
- reg_lambda: 0.0003910864887555302
- n_estimators: 1000

--- Fitting XGBoost Model ---


Training: 100%|██████████| 7/7 [08:57<00:00, 76.73s/it]



--- Evaluating XGBoost Model ---


Walk-Forward Validation: 100%|██████████| 485/485 [00:02<00:00, 199.01it/s]


- Mean Absolute Percentage Error (MAPE): 4.9693%
- Directional Accuracy (DA): 47.0103%

--- Selecting Best XGB Model ---
- Naive Model Benchmark → MAPE: 4.7134%, DA: 44.7699%

- Conclusion: No XGB model outperformed the naive model.





In [None]:
# Unpack the tuning result of the 1-week run into its three components.
# NOTE(review): the same three names are rebound by the 1-month section below.
params, metrics, beats_baseline = xgb_params_and_metrics

In [None]:
# Residual diagnostics for the 1-week model, using 14 lags for both the plot
# and the Ljung-Box test.
plot_residuals_analysis(metrics, 'comp', 14, 'XGB', 'Week')
run_ljung_box_test(metrics, 'comp', 14, 'XGB')

# Leave the true-vs-predicted frame as the cell's last expression so Jupyter
# renders it with its rich display instead of plain printed text.
metrics['comp']

Plot saved to 'outputs/plots/xgb_residuals_analysis_week'.
= Ljung-Box Test: XGB =
Lags tested: 14
p-value: 0.0000
Result: Reject the null hypothesis (p < 0.05).
The residuals show evidence of autocorrelation.
               true          pred
2022-10-19  19226.0  19067.761790
2022-10-20  19061.0  19283.354009
2022-10-21  19189.0  19162.664953
2022-10-22  19186.0  18793.044091
2022-10-23  19507.0  19180.504349
...             ...           ...
2024-02-11  48295.0  42559.306358
2024-02-12  49745.0  42461.181790
2024-02-13  49126.0  42531.795076
2024-02-14  51475.0  43593.614702
2024-02-15  52003.0  45140.894331

[485 rows x 2 columns]


### XGBoost Model Tuning (1-Month)

In [None]:
# Seed Python's `random`, NumPy and TensorFlow in one call. tf.random.set_seed
# alone seeds only TensorFlow's global generator.
tf.keras.utils.set_random_seed(42)

In [None]:
# Settings for the 1-month experiment (values are in days, matching the 'Days'
# unit passed to the helpers below).
TARGET_COL = 'close'
FORECAST_HORIZON = 30
INPUT_WINDOW = 30
TARGET_WINDOW = 30

# Every column of the prepared training set is offered to the tuner.
FULL_COLS = train_prep_ds.columns.tolist()

In [None]:
# Naive baseline on the validation set. The target column and horizon come from
# the constants defined in the previous cell, so the baseline cannot drift away
# from the settings used for the XGBoost tuning run.
naive_metrics, naive_pred = train_and_forecast_with_naive_model(
    valid_prep_ds, TARGET_COL, FORECAST_HORIZON, 'Validation', 'Days'
)

= Naive Model Training, Evaluation, and Forecasting on Validation Set (Horizon: 30 Days) =
--- Fitting Naive Model ---

--- Evaluating Naive Model ---
- Mean Absolute Percentage Error (MAPE): 10.5813%
- Directional Accuracy (DA): 54.8611%

--- Generating Final Naive Forecast (Threshold: 0.5%) ---
- Forecast for 2024-03-16: $52003.00
- Directional Signal for 2024-03-16: 1.0


In [None]:
# Tune XGBoost on the validation set for the current horizon; the naive metrics
# are passed along as the benchmark. The result is unpacked in the next cell.
xgb_params_and_metrics = find_best_xgb_params(
    train_prep_ds,
    valid_prep_ds,
    FULL_COLS,
    TARGET_COL,
    FORECAST_HORIZON,
    INPUT_WINDOW,
    TARGET_WINDOW,
    'Days',
    naive_metrics,
)

= XGBoost Model Tuning on Validation Set (Horizon: 30 Days) =

--- Preparing Data for XGBoost ---


[I 2025-10-26 13:17:40,911] A new study created in memory with name: no-name-d9e8f3f2-d226-4b1f-b6e0-77fad5bbcc2b



Starting hyperparameter tuning for XGBoost model...


[I 2025-10-26 13:17:42,428] Trial 0 finished with value: 0.36425451199872544 and parameters: {'max_depth': 5, 'learning_rate': 0.0862735828664018, 'min_child_weight': 8, 'subsample': 0.8795975452591109, 'colsample_bytree': 0.7468055921327309, 'reg_alpha': 6.025215736203858e-05, 'reg_lambda': 1.951722464144947e-05}. Best is trial 0 with value: 0.36425451199872544.
[I 2025-10-26 13:17:48,884] Trial 1 finished with value: 0.36299311078968727 and parameters: {'max_depth': 9, 'learning_rate': 0.03027182927734624, 'min_child_weight': 8, 'subsample': 0.7061753482887407, 'colsample_bytree': 0.9909729556485982, 'reg_alpha': 0.1452824663751603, 'reg_lambda': 0.00011526449540315612}. Best is trial 1 with value: 0.36299311078968727.
[I 2025-10-26 13:17:50,487] Trial 2 finished with value: 0.36181834821027914 and parameters: {'max_depth': 4, 'learning_rate': 0.008661333735273127, 'min_child_weight': 4, 'subsample': 0.8574269294896714, 'colsample_bytree': 0.8295835055926347, 'reg_alpha': 0.000285854

Best MAE: 0.3611

XGBoost hyperparameter tuning complete:
- max_depth: 3
- learning_rate: 0.008993931736681457
- min_child_weight: 1
- subsample: 0.7975990992289793
- colsample_bytree: 0.8166031869068446
- reg_alpha: 0.00022737628102536837
- reg_lambda: 0.13921548533046504
- n_estimators: 1000

--- Fitting XGBoost Model ---


Training: 100%|██████████| 30/30 [17:03<00:00, 34.13s/it]



--- Evaluating XGBoost Model ---


Walk-Forward Validation: 100%|██████████| 462/462 [00:00<00:00, 1245.37it/s]



- Mean Absolute Percentage Error (MAPE): 10.5122%
- Directional Accuracy (DA): 51.5152%

--- Selecting Best XGB Model ---
- Naive Model Benchmark → MAPE: 10.5813%, DA: 54.8611%

- Conclusion: No XGB model outperformed the naive model.


In [None]:
# Unpack the tuning result of the 1-month run into its three components.
# NOTE(review): this rebinds `params`, `metrics` and `beats_baseline`, which the
# 1-week section assigned earlier; the weekly values are no longer reachable
# through these names after this cell.
params, metrics, beats_baseline = xgb_params_and_metrics

In [None]:
# Residual diagnostics for the 1-month model, using 90 lags for both the plot
# and the Ljung-Box test.
plot_residuals_analysis(metrics, 'comp', 90, 'XGB', 'Month')
run_ljung_box_test(metrics, 'comp', 90, 'XGB')

# Leave the true-vs-predicted frame as the cell's last expression so Jupyter
# renders it with its rich display instead of plain printed text.
metrics['comp']

Plot saved to 'outputs/plots/xgb_residuals_analysis_month'.
= Ljung-Box Test: XGB =
Lags tested: 90
p-value: 0.0000
Result: Reject the null hypothesis (p < 0.05).
The residuals show evidence of autocorrelation.
               true          pred
2022-11-11  16806.0  19010.941258
2022-11-12  16842.0  19244.930358
2022-11-13  16499.0  19119.031882
2022-11-14  16590.0  19138.248308
2022-11-15  16721.0  19402.064923
...             ...           ...
2024-02-11  48295.0  43531.230298
2024-02-12  49745.0  42937.461714
2024-02-13  49126.0  43074.027317
2024-02-14  51475.0  42840.808245
2024-02-15  52003.0  43037.613498

[462 rows x 2 columns]


### Save hyperparameters to a YAML file.

In [None]:
# Tuned XGBoost hyperparameters to persist, keyed by model name and horizon.
# The numbers are the values printed by the two tuning runs above.
# NOTE(review): they are transcribed by hand; re-running the tuning does not
# update this cell automatically.
params_to_save = {
    'XGB': {
        # Result of the 1-week tuning run.
        'weekly': dict(
            max_depth=8,
            learning_rate=0.02035961462462537,
            min_child_weight=3,
            subsample=0.8122909048600037,
            colsample_bytree=0.756553078833234,
            reg_alpha=0.9151445932854982,
            reg_lambda=0.0003910864887555302,
            n_estimators=1000,
        ),
        # Result of the 1-month tuning run.
        'monthly': dict(
            max_depth=3,
            learning_rate=0.008993931736681457,
            min_child_weight=1,
            subsample=0.7975990992289793,
            colsample_bytree=0.8166031869068446,
            reg_alpha=0.00022737628102536837,
            reg_lambda=0.13921548533046504,
            n_estimators=1000,
        ),
    },
}

In [None]:
# Write the hyperparameter dictionary to the project's config.yaml on Drive.
# NOTE(review): confirm whether save_config merges into an existing file or
# overwrites it; other sections of config.yaml could be lost if it overwrites.
save_config(params_to_save, CONFIG_FILE_PATH)

Configuration successfully saved to '/content/drive/MyDrive/projects/btc/config.yaml'.
