### Setup

In [1]:
%pip install 'git+https://github.com/jeslago/epftoolbox.git'

Collecting git+https://github.com/jeslago/epftoolbox.git
  Cloning https://github.com/jeslago/epftoolbox.git to /tmp/pip-req-build-d5im6ukt
  Running command git clone --filter=blob:none --quiet https://github.com/jeslago/epftoolbox.git /tmp/pip-req-build-d5im6ukt
  Resolved https://github.com/jeslago/epftoolbox.git to commit 47d6e0629f65ebd19d3c12cb5689dbad0c2ea078
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: epftoolbox
  Building wheel for epftoolbox (setup.py) ... [?25l[?25hdone
  Created wheel for epftoolbox: filename=epftoolbox-1.0-py3-none-any.whl size=58746 sha256=b103243e0c428c8b4582c697b0a40f53422f6cc3f12aeafc2019358e0780d4d1
  Stored in directory: /tmp/pip-ephem-wheel-cache-o271vbod/wheels/f1/d7/c7/feb15c8f047f5b069656a52f3bcde89a7310a311f9cfac292a
Successfully built epftoolbox
Installing collected packages: epftoolbox
Successfully installed epftoolbox-1.0


In [2]:
%pip install finta

Collecting finta
  Downloading finta-1.3-py3-none-any.whl.metadata (6.4 kB)
Downloading finta-1.3-py3-none-any.whl (29 kB)
Installing collected packages: finta
Successfully installed finta-1.3


In [3]:
import os
import sys
import hashlib
from pathlib import Path

import finta
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import statsmodels
import seaborn as sns
from epftoolbox.evaluation import DM
from google.colab import drive
from google.colab import files
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.stats.diagnostic import acorr_ljungbox
from statsmodels.tsa.stattools import adfuller
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

In [5]:
drive.mount('/content/drive')

DRIVE_ROOT = Path('/content/drive/MyDrive')

PROJECT_ROOT = DRIVE_ROOT / 'projects/btc'
PROJECT_DATA_DIR = PROJECT_ROOT / 'data'
PROJECT_BTC_RAW_DIR = PROJECT_DATA_DIR / 'raw'
PROJECT_BTC_CLEANED_DIR = PROJECT_DATA_DIR / 'cleaned'
PROJECT_BTC_METADATA_DIR = PROJECT_DATA_DIR / 'metadata'

BTC_RAW_CSV_FILENAME = 'ohlcv_274_raw.csv'
BTC_CLEANED_CSV_FILENAME = 'ohlcv_274_cleaned.csv'

BTC_RAW_FILE_PATH = PROJECT_BTC_RAW_DIR / BTC_RAW_CSV_FILENAME
BTC_CLEANED_FILE_PATH = PROJECT_BTC_CLEANED_DIR / BTC_CLEANED_CSV_FILENAME

BTC_CHECKSUM_FILE_PATH = PROJECT_BTC_METADATA_DIR / 'checksums.txt'
BTC_VERSIONS_FILE_PATH = PROJECT_BTC_METADATA_DIR / 'versions.yaml'

GITIGNORE_FILE_PATH = PROJECT_ROOT / '.gitignore'

Mounted at /content/drive


In [6]:
project_root = os.path.abspath(PROJECT_ROOT)

if project_root not in sys.path:
  sys.path.append(project_root)

print(f'Project root \'{project_root}\' added to sys.path.')

Project root '/content/drive/MyDrive/projects/btc' added to sys.path.


In [7]:
from src.analysis import *
from src.data_cleaner import *
from src.data_loader import *
from src.data_splitter import *
from src.preprocessing import *
from src.utils import *
from src.visualization import *

In [8]:
print('--- Python Interpreter and Standard Library Versions ---')
print(f'- python: {sys.version}')
print(f'- python info: {sys.version_info.major}.{sys.version_info.minor}.{sys.version_info.micro}')

print(f'- os, hashlib, pathlib module: (version tied to Python interpreter)')

print('\n--- Third-Party Library Versions ---')
print(f'- matplotlib: {matplotlib.__version__}')
print(f'- numpy: {np.__version__}')
print(f'- pandas: {pd.__version__}')
print(f'- seaborn: {sns.__version__}')
print(f'- statsmodels: {statsmodels.__version__}')

print('\n--- Google Colab Specific Module Versions ---')
print(f'- google.colab.drive: (version tied to Colab environment)')

--- Python Interpreter and Standard Library Versions ---
- python: 3.12.12 (main, Oct 10 2025, 08:52:57) [GCC 11.4.0]
- python info: 3.12.12
- os, hashlib, pathlib module: (version tied to Python interpreter)

--- Third-Party Library Versions ---
- matplotlib: 3.10.0
- numpy: 2.0.2
- pandas: 2.2.2
- seaborn: 0.13.2
- statsmodels: 0.14.5

--- Google Colab Specific Module Versions ---
- google.colab.drive: (version tied to Colab environment)


In [9]:
%pip show epftoolbox

Name: epftoolbox
Version: 1.0
Summary: An open-access benchmark and toolbox for electricity price forecasting
Home-page: https://github.com/jeslago/epftoolbox
Author: Jesus Lago
Author-email: jesuslagogarcia@gmail.com
License: GNU AGPLv3
Location: /usr/local/lib/python3.12/dist-packages
Requires: hyperopt, keras, matplotlib, numpy, pandas, scikit-learn, scipy, statsmodels, tensorflow
Required-by: 


In [10]:
%pip show finta

Name: finta
Version: 1.3
Summary: Common financial technical indicators implemented in Pandas.
Home-page: https://github.com/peerchemist/finta
Author: Peerchemist
Author-email: peerchemist@protonmail.ch
License: LGPLv3+
Location: /usr/local/lib/python3.12/dist-packages
Requires: numpy, pandas
Required-by: 


###  Exploratory Data Analysis (EDA).
1.   Data Cleaning and Preparation
  *   Handling Missing Values: Assess the extent of missing data and apply appropriate techniques.
  *   Data Integrity and Validation: Ensure the consistency and correctness of the data by checking for anomalous outliers, and logical inconsistencies.

2.   Descriptive Statistics and Distribution Analysis
  *   Summary Statistics: Calculate and review key descriptive statistics (e.g., mean, median, standard deviation, min, max) to understand the central tendency and dispersion of the data.
  *   Distribution Visualization: Analyze the underlying distribution of key variables, such as price and volume, using histograms, and box plots. This includes examining both the original and log-transformed data to identify skewness and kurtosis.

3.   Time-Series Visualization and Relationship Analysis
  *   Temporal Distribution: Visualize the distribution of price and volume over time to identify any notable shifts or patterns.
  *   Trend Visualization: Plot price and volume trends over time, overlaying moving averages to smooth out short-term fluctuations and highlight long-term movements.
  *   Correlation Analysis: Investigate the relationship between price and volume using scatter plots and calculate rolling correlations to see how their relationship evolves over time.

4.   Time-Series Decomposition and Stationarity Testing
  *   Decomposition: Decompose the time series into its constituent components: trend, seasonality, and residual (noise) to better understand the underlying patterns.
  *   Use Autocorrelation Function (ACF) and Partial Autocorrelation Function (PACF) plots to measure the relationship between a variable and its lagged values, which helps in identifying model parameters.
  *   Stationarity Analysis: Perform statistical tests, such as the Augmented Dickey-Fuller (ADF) test, to determine if the statistical properties of the time series (like mean and variance) are constant over time.

In [11]:
raw_ds = load_btc_ds(BTC_RAW_FILE_PATH)


Loading dataset from: ohlcv_274_raw.csv...
Dataset loaded successfully.


In [12]:
cols = ['timestamp', 'open', 'high', 'low', 'close', 'volume']

In [13]:
prep_ds = raw_ds.set_axis(cols, axis=1)
prep_ds['date'] = pd.to_datetime(prep_ds['timestamp'], unit='s')
prep_ds.set_index('date', inplace=True)

In [14]:
resampled_ds = prep_ds.asfreq('min')
missing_rows = resampled_ds['close'].isna().sum()

In [15]:
resampled_ds

Unnamed: 0_level_0,timestamp,open,high,low,close,volume
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2012-01-01 10:01:00,1.325412e+09,4.58,4.58,4.58,4.58,0.000000
2012-01-01 10:02:00,1.325412e+09,4.58,4.58,4.58,4.58,0.000000
2012-01-01 10:03:00,1.325412e+09,4.58,4.58,4.58,4.58,0.000000
2012-01-01 10:04:00,1.325412e+09,4.58,4.58,4.58,4.58,0.000000
2012-01-01 10:05:00,1.325412e+09,4.58,4.58,4.58,4.58,0.000000
...,...,...,...,...,...,...
2025-06-22 00:49:00,1.750553e+09,102886.00,102904.00,102886.00,102889.00,0.016406
2025-06-22 00:50:00,1.750553e+09,102892.00,102924.00,102869.00,102924.00,0.004449
2025-06-22 00:51:00,1.750553e+09,102923.00,103023.00,102914.00,103023.00,0.018053
2025-06-22 00:52:00,1.750554e+09,103022.00,103047.00,102997.00,103046.00,0.279288


In [None]:
print(f'Missing rows: {missing_rows}')

Missing rows: 1160


In [16]:
cleaned_ds = clean_btc_data(raw_ds)


= Cleaning and Preprocessing BTC Data =
Step 1: Resampling to 1-minute frequency to identify gaps...
- Identified 0.016% missing minutes.
- Removed known data gap (1,440) on 2025-03-15.

Step 2: Verifying data integrity...
- Negative prices found: 0
- High < Low instances: 0
- Minutes with zero volume: 1309625

Step 3: Engineering new features...
- Features engineered: is_zero_volume, pct_change, intra_minute_spread.

Step 4: Identifying and imputing outliers...
- Found 1122 potential outliers based on extreme price moves.
- Found 334 suspicious rows to forward-fill.
- Imputed 334 rows using forward-fill.
= BTC Data Cleaning and Preprocessing Complete =


In [17]:
cleaned_ds

Unnamed: 0_level_0,timestamp,open,high,low,close,volume,is_zero_volume,pct_change,intra_minute_spread
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2012-01-01 10:01:00,1.325412e+09,4.58,4.58,4.58,4.58,0.000000,1,,0.000000
2012-01-01 10:02:00,1.325412e+09,4.58,4.58,4.58,4.58,0.000000,1,0.000000,0.000000
2012-01-01 10:03:00,1.325412e+09,4.58,4.58,4.58,4.58,0.000000,1,0.000000,0.000000
2012-01-01 10:04:00,1.325412e+09,4.58,4.58,4.58,4.58,0.000000,1,0.000000,0.000000
2012-01-01 10:05:00,1.325412e+09,4.58,4.58,4.58,4.58,0.000000,1,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...
2025-06-22 00:49:00,1.750553e+09,102886.00,102904.00,102886.00,102889.00,0.016406,0,-0.000165,0.000175
2025-06-22 00:50:00,1.750553e+09,102892.00,102924.00,102869.00,102924.00,0.004449,0,0.000340,0.000535
2025-06-22 00:51:00,1.750553e+09,102923.00,103023.00,102914.00,103023.00,0.018053,0,0.000962,0.001059
2025-06-22 00:52:00,1.750554e+09,103022.00,103047.00,102997.00,103046.00,0.279288,0,0.000223,0.000485


In [18]:
resampled_ds = resample_btc_data(cleaned_ds)
train_ds, valid_ds, test_ds, = split_btc_ds(resampled_ds['daily'], 'Day')


--- Resampling BTC Data to Multiple Timeframes ---
- Resampling to hourly frequency...
- Resampling to daily frequency...
- Resampling to weekly frequency...
- Resampling to monthly frequency...
--- BTC Data Resampling Complete ---

--- Splitting the BTC Dataset into Training, Validation, and Test Sets (Day) ---
--- BTC Dataset Splitting Complete ---


In [19]:
display_descriptive_statistics(train_ds, 'BTC features')

= Descriptive Statistics: BTC features =
--- Dataset Info ---
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 3936 entries, 2012-01-01 to 2022-10-10
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   open    3936 non-null   float64
 1   high    3936 non-null   float64
 2   low     3936 non-null   float64
 3   close   3936 non-null   float64
 4   volume  3936 non-null   float64
dtypes: float64(5)
memory usage: 184.5 KB

--- Statistical Summary ---
               open          high           low         close         volume
count   3936.000000   3936.000000   3936.000000   3936.000000    3936.000000
mean    9555.451616   9832.381710   9246.435010   9560.352889    8960.158107
std    15174.214710  15593.204466  14677.868028  15174.404496    9626.203170
min        4.380000      4.380000      3.800000      4.380000       0.000000
25%      313.997500    321.517500    300.045000    315.035000    3079.229495
50%     1949.985000   

In [20]:
plot_btc_price_distribution(train_ds, 'price')
plot_btc_volume_distribution(train_ds, 'volume')

plot_btc_price_boxplot(train_ds)
plot_btc_volume_boxplot(train_ds)

plot_btc_price_trend(train_ds)
train_ds.loc[:, 'log_returns_close'] = np.log(train_ds['close']).diff()
train_ds.loc[:, 'rolling_volatility'] = train_ds['log_returns_close'].rolling(window=30).std() * np.sqrt(365)
plot_btc_price_rollig_volatility(train_ds)
plot_btc_volume_trend(train_ds)

plot_btc_price_and_volume_corr(train_ds)
plot_btc_price_and_volume(train_ds)
plot_autocorrelation(train_ds, 'close', 40, 'BTC Daily Closing Price', 'btc_price')
plot_autocorrelation(train_ds, 'log_returns_close', 40, 'BTC Daily Closing Log Returns', 'btc_log_returns')
plot_time_series_decomposition(train_ds, 'close', 'multiplicative', 365, 'BTC Daily Closing Price')
plot_stationarity_analysis(train_ds, 'close')

Plot saved to 'outputs/plots/btc_price_distribution.png'.
Plot saved to 'outputs/plots/btc_log_price_distribution.png'.
Plot saved to 'outputs/plots/btc_volume_distribution.png'.
Plot saved to 'outputs/plots/btc_log_volume_distribution.png'.
Plot saved to 'outputs/plots/btc_price_boxplot.png'.
Plot saved to 'outputs/plots/btc_volume_boxplot.png'.
Plot saved to 'outputs/plots/btc_price_trend.png'.
Plot saved to 'outputs/plots/btc_price_rolling_volatility.png'.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_ds.loc[:, 'log_returns_close'] = np.log(train_ds['close']).diff()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_ds.loc[:, 'rolling_volatility'] = train_ds['log_returns_close'].rolling(window=30).std() * np.sqrt(365)


Plot saved to 'outputs/plots/btc_volume_trend.png'.


  ax.legend(loc='upper left')


Plot saved to 'outputs/plots/btc_price_and_volume_corr.png'.
Plot saved to 'outputs/plots/btc_price_and_volume.png'.
Plot saved to 'outputs/plots/btc_price_autocorrelation.png'.
Plot saved to 'outputs/plots/btc_log_returns_autocorrelation.png'.
Plot saved to 'outputs/plots/btc_price_decomposition.png'.
Plot saved to 'outputs/plots/btc_price_stationarity_analysis.png'.


In [None]:
run_adf_test(train_ds, 'close', 'BTC Daily Closing Price')
run_adf_test(train_ds, 'log_returns_close', 'BTC Daily Closing Log Returns')

= ADF Test: BTC Daily Closing Price =
ADF Statistics: -1.7053
p-value: 0.4284
Critical Values:
	1%:-3.4320
	5%:-2.8623
	10%:-2.5672
Conclusion: The p-value is greater than 0.05. The data is likely non-stationary and non-seasonal.

= ADF Test: BTC Daily Closing Log Returns =
ADF Statistics: -10.9944
p-value: 0.0000
Critical Values:
	1%:-3.4320
	5%:-2.8623
	10%:-2.5672
Conclusion: The p-value is less than or equal to 0.05. The data is likely stationary and seasonal.



### Save the cleaned dataset to data/cleaned

In [None]:
cleaned_ds['date'] = pd.to_datetime(cleaned_ds['timestamp'], unit='s')

In [None]:
save_ds(BTC_CLEANED_FILE_PATH, cleaned_ds )

Successfully saved the dataset to '/content/drive/MyDrive/projects/btc/data/cleaned/ohlcv_399_cleaned.csv'.
