### Import, mount, and configure

In [None]:
# Standard library imports.
import os
import sys
import shutil
import hashlib
from pathlib import Path

# Third-party library imports.
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import statsmodels
from google.colab import drive
from google.colab import files
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.stattools import adfuller
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

In [None]:
# Mount Google Drive first: every path constant below lives under the mount point.
drive.mount('/content/drive')

# --- Root Directories ---
# Top of the user's Drive as exposed by the mount above.
DRIVE_ROOT = Path('/content/drive/MyDrive')

# --- Dataset Paths ---
# Dataset store on Drive, kept outside the project tree; the dataset file itself
# lives under `versions/` (see BTC_DATASET_FILEPATH below).
BTC_DATASET_ROOT = DRIVE_ROOT / 'datasets/btc'
BTC_DATASET_VERSIONS_DIR = BTC_DATASET_ROOT / 'versions'
BTC_DATASET_CHECKSUMS_DIR = BTC_DATASET_ROOT / 'checksums'

# --- Dataset File Names ---
# Plain file names (not paths); join them with the directories above.
# NOTE(review): the .txt name presumably holds the checksum of the .csv — confirm.
DATASET_CSV_FILENAME = 'ohlcv_274.csv'
DATASET_TXT_FILENAME = 'ohlcv_274.txt'

# --- Project Paths ---
# Project working tree; PROJECT_ROOT is also the directory put on sys.path so
# that the `src` package can be imported.
PROJECT_ROOT = DRIVE_ROOT / 'projects/btc'
PROJECT_DATA_DIR = PROJECT_ROOT / 'data'
PROJECT_RAW_DIR = PROJECT_DATA_DIR / 'raw'
PROJECT_CLEANED_DIR = PROJECT_DATA_DIR / 'cleaned'
PROJECT_METADATA_DIR = PROJECT_DATA_DIR / 'metadata'

# --- Specific Bitcoin File Names (within Project) ---
BTC_RAW_CSV_FILENAME = 'btc_274_raw.csv'
BTC_CLEANED_CSV_FILENAME = 'btc_274_cleaned.csv'

# --- Constructed Filepaths ---
# Source dataset on Drive (copied into the project's raw directory later).
BTC_DATASET_FILEPATH = BTC_DATASET_VERSIONS_DIR / DATASET_CSV_FILENAME

# Project-local copies: the raw copy of the dataset and the cleaned output.
BTC_RAW_FILEPATH = PROJECT_RAW_DIR / BTC_RAW_CSV_FILENAME
BTC_CLEANED_FILEPATH = PROJECT_CLEANED_DIR / BTC_CLEANED_CSV_FILENAME

# Project bookkeeping files: checksum log, dataset metadata, and .gitignore.
BTC_CHECKSUM_FILEPATH = PROJECT_METADATA_DIR / 'checksums.txt'
BTC_METADATA_FILEPATH = PROJECT_METADATA_DIR / 'dataset_versions.md'
GITIGNORE_FILEPATH = PROJECT_ROOT / '.gitignore'

Mounted at /content/drive


In [None]:
# Put the project root on sys.path so the `src` package can be imported.
# The path is appended at most once, so re-running this cell is harmless.
project_root = os.path.abspath(PROJECT_ROOT)

if project_root in sys.path:
  # Report the no-op truthfully instead of claiming the path was just added.
  print(f'Project root "{project_root}" is already on sys.path.')
else:
  sys.path.append(project_root)
  print(f'Project root "{project_root}" added to sys.path.')

Project root "/content/drive/MyDrive/projects/btc" added to sys.path.


In [None]:
# Custom application imports.
from src.analysis import *
from src.data_cleaner import *
from src.data_loader import *
from src.data_splitter import *
from src.preprocessing import *
from src.utils import *
from src.visualization import *

In [None]:
# Environment report: interpreter and library versions used for this run, kept
# in the notebook output for reproducibility. The lines are collected first and
# emitted with a single print; entries starting with '\n' produce the blank
# separator line between sections.
version_report = [
    '--- Python Interpreter and Standard Library Versions ---',
    f'Python Version: {sys.version}',
    f'Python Version Info: {sys.version_info.major}.{sys.version_info.minor}.{sys.version_info.micro}',
    # Standard-library modules carry no version of their own.
    'os, shutil, hashlib, pathlib module: (version tied to Python interpreter)',
    '\n--- Third-Party Library Versions ---',
    f'Pandas Version: {pd.__version__}',
    f'NumPy Version: {np.__version__}',
    f'Matplotlib Version: {matplotlib.__version__}',
    f'Statsmodels Version: {statsmodels.__version__}',
    '\n--- Google Colab Specific Module Versions ---',
    'google.colab.drive, google.colab.files: (version tied to Colab environment)',
]

print('\n'.join(version_report))

--- Python Interpreter and Standard Library Versions ---
Python Version: 3.11.13 (main, Jun  4 2025, 08:57:29) [GCC 11.4.0]
Python Version Info: 3.11.13
os, shutil, hashlib, pathlib module: (version tied to Python interpreter)

--- Third-Party Library Versions ---
Pandas Version: 2.2.2
NumPy Version: 2.0.2
Matplotlib Version: 3.10.0
Statsmodels Version: 0.14.5

--- Google Colab Specific Module Versions ---
google.colab.drive, google.colab.files: (version tied to Colab environment)


### Save the dataset to Google Drive for persistent storage

In [None]:
# Make sure the source dataset CSV is available at BTC_DATASET_FILEPATH on Drive.
# NOTE(review): setup_dataset comes from the star-imported `src` modules; the
# recorded output shows it only reports when the file already exists — confirm
# how it obtains the file when it is missing.
setup_dataset(BTC_DATASET_FILEPATH)

Dataset already exists at '/content/drive/MyDrive/datasets/btc/versions/ohlcv_274.csv'.


### Generate checksums for data integrity, version tracking, and debugging

In [None]:
# Generate, verify, and log the SHA-256 checksum of the source dataset.
#
# Pure-Python replacement for the former `!sha256sum ...` shell lines, whose
# last command had an unterminated quote and therefore never appended to the
# checksum log. Paths now come from the config constants rather than being
# hard-coded a second time.
def _sha256_of_file(filepath, chunk_size=1024 * 1024):
  """Return the hex SHA-256 digest of `filepath`, read in `chunk_size`-byte chunks."""
  digest = hashlib.sha256()
  with open(filepath, 'rb') as stream:
    # Chunked reads keep memory flat regardless of the dataset's size.
    for chunk in iter(lambda: stream.read(chunk_size), b''):
      digest.update(chunk)
  return digest.hexdigest()


# The checksums directory may not exist yet on a fresh Drive.
BTC_DATASET_CHECKSUMS_DIR.mkdir(parents=True, exist_ok=True)
dataset_checksum_filepath = BTC_DATASET_CHECKSUMS_DIR / DATASET_TXT_FILENAME
dataset_checksum_log_filepath = BTC_DATASET_CHECKSUMS_DIR / 'checksums.txt'

# 1) Store the checksum using the same "<digest>  <path>" layout that
#    `sha256sum` wrote, so existing tooling reading this file keeps working.
dataset_digest = _sha256_of_file(BTC_DATASET_FILEPATH)
dataset_checksum_filepath.write_text(f'{dataset_digest}  {BTC_DATASET_FILEPATH}\n')

# 2) Re-hash the dataset and compare with the stored value (this replaces the
#    `sha256sum ... | diff - ...` line). Like `diff`, a mismatch is reported
#    rather than raised.
stored_digest = dataset_checksum_filepath.read_text().split()[0]
current_digest = _sha256_of_file(BTC_DATASET_FILEPATH)
if current_digest == stored_digest:
  print(f"Checksum verified for '{BTC_DATASET_FILEPATH}'.")
else:
  print(f"Checksum MISMATCH for '{BTC_DATASET_FILEPATH}': "
        f'stored {stored_digest}, current {current_digest}.')

# 3) Append "<file name>: <digest>" to the running checksum log. As with the
#    original `>>`, each run of this cell adds one line.
with open(dataset_checksum_log_filepath, 'a') as checksum_log:
  checksum_log.write(f'{DATASET_CSV_FILENAME}: {dataset_digest}\n')

/bin/bash: -c: line 1: unexpected EOF while looking for matching `"'
/bin/bash: -c: line 2: syntax error: unexpected end of file


### Create a .gitignore file to exclude large datasets and local configurations

In [None]:
# Write the project's .gitignore at GITIGNORE_FILEPATH (PROJECT_ROOT / '.gitignore').
# NOTE(review): create_gitignore comes from the star-imported `src` modules; the
# ignore rules it writes are not visible from this notebook.
create_gitignore(GITIGNORE_FILEPATH)

Success: .gitignore file created at '/content/drive/MyDrive/projects/btc/.gitignore'


### Create the required directories and copy the dataset to data/raw

In [None]:
# Create the project data directories. `parents=True` also creates the shared
# `data/` parent, so this cell works on a fresh Drive; `exist_ok=True` keeps it
# safe to re-run. The loop variable is not named `dir`, which would shadow the
# builtin.
for data_dir in (PROJECT_RAW_DIR, PROJECT_CLEANED_DIR, PROJECT_METADATA_DIR):
  data_dir.mkdir(parents=True, exist_ok=True)

# Copy the source dataset into data/raw. Kept as the last expression so the cell
# displays the destination path returned by shutil.copy.
shutil.copy(BTC_DATASET_FILEPATH, BTC_RAW_FILEPATH)

PosixPath('/content/drive/MyDrive/projects/btc/data/raw/btc_274_raw.csv')

### Verify the dataset's integrity by comparing checksums, then log the new checksum

In [None]:
# Check that the copy in data/raw is identical to the source dataset and record
# its checksum. Per the recorded output, the helper hashes both files, compares
# the hashes, and appends the checksum to BTC_CHECKSUM_FILEPATH.
# NOTE(review): behavior on a hash mismatch is not visible here — confirm in src.
verify_and_log_checksum(BTC_DATASET_FILEPATH, BTC_RAW_FILEPATH, BTC_CHECKSUM_FILEPATH)

Computing hash for source file: ohlcv_274.csv...

Calculating checksum for: ohlcv_274.csv...
Checksum computed successfully.
Computing hash for destination file: btc_274_raw.csv...

Calculating checksum for: btc_274_raw.csv...
Checksum computed successfully.
Comparing hashes...
Hashes match. Data integrity verified.
Successfully appended checksum to /content/drive/MyDrive/projects/btc/data/metadata/checksums.txt


### Create a metadata file for the dataset

In [None]:
# Load the raw project copy of the dataset; the resulting frame is passed to
# create_metadata_file in the next cell.
# NOTE(review): parsing details (index, dtypes) are decided inside load_btc_dataset.
df_btc_raw = load_btc_dataset(BTC_RAW_FILEPATH)

Loading dataset from: btc_274_raw.csv...
Dataset loaded successfully.


In [None]:
# TODO (carried over from the original cell): "Wrong Indentation." — it is not
# clear from this notebook what the note refers to; confirm and resolve.
#
# Hash the raw file first so the metadata call below reads as a plain list of
# inputs: the loaded frame, the raw copy, the source dataset, the metadata file
# to write to, and the raw file's checksum.
raw_file_checksum = compute_sha256(BTC_RAW_FILEPATH)

create_metadata_file(
    df_btc_raw,
    BTC_RAW_FILEPATH,
    BTC_DATASET_FILEPATH,
    BTC_METADATA_FILEPATH,
    raw_file_checksum,
)


Calculating checksum for: btc_274_raw.csv...
Checksum computed successfully.
Successfully created and appended metadata to '/content/drive/MyDrive/projects/btc/data/metadata'


### EDA
1.   Data Cleaning and Inspection.
     *   Handle missing data.
     *   Verify data integrity.

2.   Summary Statistics and Distribution Analysis.
     *   Display descriptive statistics.
     *   Perform distribution analysis.

3.   Time-Series Analysis and Visualization.
     *   Analyze price trends.
     *   Analyze volume trends.
     *   Analyze the correlation between price and volume.

4.   Trend, Seasonality, and Cyclicality.
     *   Perform time-series decomposition.
     *   Conduct autocorrelation analysis (ACF & PACF).
     *   Perform stationarity analysis (ADF Test).

In [None]:
# Load the raw dataset at the start of the EDA section.
# NOTE(review): this repeats the identical load from the metadata section above
# (same file, same variable); on a top-to-bottom run it re-reads the CSV for no
# new result — consider reusing the existing `df_btc_raw`.
df_btc_raw = load_btc_dataset(BTC_RAW_FILEPATH)


Loading dataset from: btc_274_raw.csv...
Dataset loaded successfully.


In [None]:
# Run the cleaning pipeline on the raw frame. Per the recorded log it resamples
# to 1-minute frequency, checks integrity, engineers is_zero_volume / pct_change /
# intra_minute_spread, and forward-fills rows flagged as suspicious.
# NOTE(review): whether `df_btc_raw` is modified in place is not visible here.
df_btc_cleaned = clean_btc_data(df_btc_raw)


= Cleaning and Preprocessing BTC Data =
Step 1: Resampling to 1-minute frequency to identify gaps...
- Identified 0.016% missing minutes.
- Removed known data gap on 2025-03-15.

Step 2: Verifying data integrity...
- Negative prices found: 0
- High < Low instances: 0
- Minutes with zero volume: 1309625

Step 3: Engineering new features...
- Features engineered: is_zero_volume, pct_change, intra_minute_spread.

Step 4: Identifying and imputing outliers...
- Found 1122 potential outliers based on extreme price moves.
- Found 334 suspicious rows to forward-fill.
- Imputed 334 rows using forward-fill.
= BTC Data Cleaning and Preprocessing Complete =


In [None]:
# Show the cleaned frame as the cell's output; pandas renders a truncated
# head/tail view for a frame of this size.
df_btc_cleaned

Unnamed: 0_level_0,timestamp,open,high,low,close,volume,is_zero_volume,pct_change,intra_minute_spread
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2012-01-01 10:01:00,1.325412e+09,4.58,4.58,4.58,4.58,0.000000,1,,0.000000
2012-01-01 10:02:00,1.325412e+09,4.58,4.58,4.58,4.58,0.000000,1,0.000000,0.000000
2012-01-01 10:03:00,1.325412e+09,4.58,4.58,4.58,4.58,0.000000,1,0.000000,0.000000
2012-01-01 10:04:00,1.325412e+09,4.58,4.58,4.58,4.58,0.000000,1,0.000000,0.000000
2012-01-01 10:05:00,1.325412e+09,4.58,4.58,4.58,4.58,0.000000,1,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...
2025-06-22 00:49:00,1.750553e+09,102886.00,102904.00,102886.00,102889.00,0.016406,0,-0.000165,0.000175
2025-06-22 00:50:00,1.750553e+09,102892.00,102924.00,102869.00,102924.00,0.004449,0,0.000340,0.000535
2025-06-22 00:51:00,1.750553e+09,102923.00,103023.00,102914.00,103023.00,0.018053,0,0.000962,0.001059
2025-06-22 00:52:00,1.750554e+09,103022.00,103047.00,102997.00,103046.00,0.279288,0,0.000223,0.000485


In [None]:
# Resample the cleaned minute-level data; the result is indexed by timeframe
# name (the log lists hourly, daily, weekly, and monthly).
btc_resampled = resample_btc_data(df_btc_cleaned)

# Split the daily series into training, validation, and test sets. The EDA
# below uses the training split.
btc_daily_splits = split_btc_dataset(btc_resampled['daily'], 'Day')
df_btc_daily_train, df_btc_daily_valid, df_btc_daily_test = btc_daily_splits


--- Resampling BTC Data to Multiple Timeframes ---
Resampling to hourly frequency...
Resampling to daily frequency...
Resampling to weekly frequency...
Resampling to monthly frequency...
--- BTC Data Resampling Complete ---

--- Splitting BTC Dataset to Training, Validation, and Test Set (Day) ---
--- BTC Dataset Splitting Complete ---


In [None]:
# Summarize the daily training split; per the recorded output the helper prints
# the frame's info followed by a statistical summary. The second argument
# appears as the label in the printed header.
display_descriptive_statistics(df_btc_daily_train, 'BTC features')

= Descriptive Statistics: BTC features =
--- DataFrame Info ---
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 3936 entries, 2012-01-01 to 2022-10-10
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   open    3936 non-null   float64
 1   high    3936 non-null   float64
 2   low     3936 non-null   float64
 3   close   3936 non-null   float64
 4   volume  3936 non-null   float64
dtypes: float64(5)
memory usage: 184.5 KB

--- Statistical Summary ---
               open          high           low         close         volume
count   3936.000000   3936.000000   3936.000000   3936.000000    3936.000000
mean    9555.451616   9832.381710   9246.435010   9560.352889    8960.158107
std    15174.214710  15593.204466  14677.868028  15174.404496    9626.203170
min        4.380000      4.380000      3.800000      4.380000       0.000000
25%      313.997500    321.517500    300.045000    315.035000    3079.229495
50%     1949.985000 

In [None]:
# EDA plots on the daily training split. Call order is unchanged; per the
# recorded output, each helper reports the path of the plot it saved.

# --- Distributions ---
plot_btc_price_distribution(df_btc_daily_train, 'price')
plot_btc_volume_distribution(df_btc_daily_train, 'volume')

plot_btc_price_boxplot(df_btc_daily_train)
plot_btc_volume_boxplot(df_btc_daily_train)

# --- Trends and volatility ---
plot_btc_price_trend(df_btc_daily_train)

# Derived columns are built with .assign(), which returns a NEW frame. The
# training split is a slice of the resampled data, so adding columns to it in
# place raised SettingWithCopyWarning; rebinding to the assigned frame avoids
# that and keeps the cell idempotent on re-run.
ROLLING_VOLATILITY_WINDOW_DAYS = 30
# Annualize with 365 because the daily index covers every calendar day.
ANNUALIZATION_DAYS = 365
df_btc_daily_train = df_btc_daily_train.assign(
    log_returns_close=lambda frame: np.log(frame['close']).diff(),
    # Later keyword arguments may use columns created by earlier ones.
    rolling_volatility=lambda frame: (
        frame['log_returns_close'].rolling(window=ROLLING_VOLATILITY_WINDOW_DAYS).std()
        * np.sqrt(ANNUALIZATION_DAYS)
    ),
)
# NOTE(review): "rollig" is how the helper is spelled in the imported module;
# rename it there (and here) in one change.
plot_btc_price_rollig_volatility(df_btc_daily_train)
plot_btc_volume_trend(df_btc_daily_train)

# --- Price/volume relationship ---
plot_btc_price_and_volume_corr(df_btc_daily_train)
plot_btc_price_and_volume(df_btc_daily_train)

# --- Autocorrelation, decomposition, stationarity ---
plot_autocorrelation(df_btc_daily_train, 'close', 40, 'BTC Daily Closing Price', 'btc_price')
plot_autocorrelation(df_btc_daily_train, 'log_returns_close', 40, 'BTC Daily Closing Log Returns', 'btc_log_returns')
plot_time_series_decomposition(df_btc_daily_train, 'close', 'multiplicative', 365, 'BTC Daily Closing Price')
plot_stationarity_analysis(df_btc_daily_train, 'close')

Plot saved to "outputs/plots/btc_price_distribution.png".
Plot saved to "outputs/plots/btc_log_price_distribution.png".
Plot saved to "outputs/plots/btc_volume_distribution.png".
Plot saved to "outputs/plots/btc_log_volume_distribution.png".
Plot saved to "outputs/plots/btc_price_boxplot.png".
Plot saved to "outputs/plots/btc_volume_boxplot.png".
Plot saved to "outputs/plots/btc_price_trend.png".


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_btc_daily_train.loc[:, 'log_returns_close'] = np.log(df_btc_daily_train['close']).diff()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_btc_daily_train.loc[:, 'rolling_volatility'] = df_btc_daily_train['log_returns_close'].rolling(window=30).std() * np.sqrt(365)


Plot saved to "outputs/plots/btc_price_rolling_volatility.png".
Plot saved to "outputs/plots/btc_volume_trend.png".


  ax.legend(loc='upper left')


Plot saved to "outputs/plots/btc_price_and_volume_corr.png".
Plot saved to "outputs/plots/btc_price_and_volume.png".
Plot saved to "outputs/plots/btc_price_autocorrelation.png".
Plot saved to "outputs/plots/btc_log_returns_autocorrelation.png".
Plot saved to "outputs/plots/btc_price_decomposition.png".
Plot saved to "outputs/plots/btc_price_stationarity_analysis.png".


In [None]:
# Augmented Dickey-Fuller tests on the closing price and on its log returns
# (the 'log_returns_close' column is added in the plotting cell above).
# NOTE(review): the helper's printed conclusion also calls the series
# "seasonal" / "non-seasonal"; ADF is a unit-root test and does not assess
# seasonality — consider fixing the message in run_adf_test.
run_adf_test(df_btc_daily_train, 'close', 'BTC Daily Closing Price')
run_adf_test(df_btc_daily_train, 'log_returns_close', 'BTC Daily Closing Log Returns')

= ADF Test: BTC Daily Closing Price =
ADF Statistics: -1.7223
p-value: 0.4197
Critical Values:
	1%:-3.4320
	5%:-2.8623
	10%:-2.5672
Conclusion: The p-value is greater than 0.05. The data is likely non-stationary and non-seasonal.

= ADF Test: BTC Daily Closing Log Returns =
ADF Statistics: -10.8444
p-value: 0.0000
Critical Values:
	1%:-3.4320
	5%:-2.8623
	10%:-2.5672
Conclusion: The p-value is less than or equal to 0.05. The data is likely stationary and seasonal.



### Save the cleaned dataset to data/cleaned

In [None]:
# Persist the cleaned minute-level frame to BTC_CLEANED_FILEPATH (data/cleaned).
# NOTE(review): the "processing" applied before saving happens inside
# process_and_save_dataset and is not visible in this notebook.
process_and_save_dataset(df_btc_cleaned, BTC_CLEANED_FILEPATH)

Successfully processed data and saved to '/content/drive/MyDrive/projects/btc/data/cleaned/btc_274_cleaned.csv'
