# Data exploration

This notebook demonstrates how to download and prepare cryptocurrency data for backtesting using the Investing Algorithm Framework.

This will showcase the following steps:
- Define constants and parameters
- Set up the folder structure for data storage and results
- Generate rolling backtest windows
- Download historical OHLCV data for specified assets and time frames
- Check data completeness and fill missing timestamps

## Constants


In [None]:
from pathlib import Path
from datetime import datetime, timezone
from investing_algorithm_framework import BacktestDateRange, \
    generate_rolling_backtest_windows

# Storage locations. Everything is resolved relative to the parent of the
# current working directory, so the notebook's working directory determines
# where data, results and reports end up.
data_storage_path = Path.cwd().parent.joinpath("data")
backtest_results_dir = Path.cwd().parent.joinpath("backtest_results")
reports_dir = Path.cwd().parent.joinpath("reports")
figures_dir = reports_dir.joinpath("figures")

# Full (timezone-aware, UTC) period used both for generating the backtest
# windows and as the requested range when downloading data.
backtest_window_date_range = BacktestDateRange(
    start_date=datetime(year=2022, month=1, day=1, tzinfo=timezone.utc),
    end_date=datetime(year=2025, month=12, day=30, tzinfo=timezone.utc),
)

# Market identifier passed to the download calls.
MARKET = "BITVAVO"

# Base symbols; they are combined with a quote currency when downloading.
in_sample_assets = [
    "BTC",
    "ETH",
    "ADA",
    "SOL",
    "DOT",
]
out_sample_assets = [
    "XRP",
    "LTC",
    "BCH",
]

# Candle sizes downloaded for every asset.
time_frames = [
    "2h",
    "4h",
    "1d",
]

## Setup folder structure

In [None]:
import os

# Create all required directories.
# `exist_ok=True` makes this cell safe to re-run and avoids the
# check-then-create race of testing `os.path.exists` first. Unlike the
# existence check, it raises FileExistsError when one of the paths already
# exists as a regular file, which surfaces a broken layout immediately.
for directory in (
    data_storage_path,
    backtest_results_dir,
    reports_dir,
    figures_dir,
):
    os.makedirs(directory, exist_ok=True)

## Backtest windows

In [None]:
# Window lengths (in days) forwarded unchanged to
# `generate_rolling_backtest_windows`.
# NOTE(review): presumably each window trains on `train_days`, skips
# `gap_days`, tests on `test_days`, and consecutive windows are shifted by
# `step_days` -- confirm against the framework documentation.
rolling_window_config = {
    "train_days": 365,
    "test_days": 180,
    "gap_days": 30,
    "step_days": 90,
}

# Generate the windows over the full configured backtest date range.
rolling_backtest_windows = generate_rolling_backtest_windows(
    start_date=backtest_window_date_range.start_date,
    end_date=backtest_window_date_range.end_date,
    **rolling_window_config,
)

## Data downloading


In [None]:
from investing_algorithm_framework import download_v2, TimeFrame, tqdm
def download_ohlcv_data(
    assets,
    time_frames,
    date_range,
    market,
    storage_path,
    quote_currency="EUR",
):
    """
    Download and save OHLCV data for every asset / time frame combination.

    Args:
        assets: Iterable of base symbols (e.g. "BTC"); each one is combined
            with `quote_currency` into a "<symbol>/<quote_currency>" pair.
        time_frames: Iterable of time frame identifiers passed to
            `download_v2` one at a time.
        date_range: Object exposing `start_date` and `end_date`; both are
            forwarded to `download_v2` as the requested range.
        market: Market identifier forwarded to `download_v2`.
        storage_path: Directory in which `download_v2` saves the data.
        quote_currency: Quote side of the trading pair. Defaults to "EUR".

    Returns:
        dict: Nested mapping `{symbol: {time_frame: {"data": ..., "path": ...}}}`
        holding the `data` and `path` attributes of each download result.

    Raises:
        ValueError: If a download returns no rows, since the first timestamp
            cannot be checked against the requested start date.
    """
    downloaded = {}

    for symbol in assets:
        symbol_pair = f"{symbol}/{quote_currency}"
        downloaded[symbol] = {}

        for time_frame in tqdm(
            time_frames,
            desc=f"Downloading data for {symbol_pair} {time_frames}"
        ):
            result = download_v2(
                symbol=symbol_pair,
                market=market,
                time_frame=time_frame,
                data_type="ohlcv",
                start_date=date_range.start_date,
                end_date=date_range.end_date,
                save=True,
                storage_path=str(storage_path)
            )
            downloaded[symbol][time_frame] = {
                "data": result.data,
                "path": result.path
            }

            # Fail with context instead of an opaque IndexError from
            # `index[0]` when nothing was returned for this combination.
            if len(result.data) == 0:
                raise ValueError(
                    f"No data returned for {symbol_pair} {time_frame} "
                    f"on market {market}."
                )

            # Warn when the exchange history starts later than requested, so
            # a shortened series is noticed before backtesting.
            first_date = result.data.index[0]

            if first_date > date_range.start_date:
                print(
                    f"Warning: Data for {symbol_pair} starts on {first_date} "
                    f"which is after the requested start date of "
                    f"{date_range.start_date}."
                )

    return downloaded


in_sample_data = download_ohlcv_data(
    assets=in_sample_assets,
    time_frames=time_frames,
    date_range=backtest_window_date_range,
    market=MARKET,
    storage_path=data_storage_path,
)
out_sample_data = download_ohlcv_data(
    assets=out_sample_assets,
    time_frames=time_frames,
    date_range=backtest_window_date_range,
    market=MARKET,
    storage_path=data_storage_path,
)


## Check data completeness and fill missing timestamps

In [None]:
from investing_algorithm_framework import fill_missing_timeseries_data, tqdm, get_missing_timeseries_data_entries

def check_and_fill_missing_data(sample_data, description):
    """
    Detect missing timestamps in downloaded data and fill them on disk.

    Args:
        sample_data: Nested mapping
            `{symbol: {time_frame: {"data": ..., "path": ...}}}` as built by
            the download step.
        description: Label shown on the progress bar.

    Returns:
        None. Series with missing entries are passed to
        `fill_missing_timeseries_data` with `save_to_file=True` and the
        entry's own path as `file_path`.
    """
    for symbol, time_frames_dict in tqdm(sample_data.items(), desc=description):
        for time_frame, entry in time_frames_dict.items():
            data = entry["data"]
            missing_dates = get_missing_timeseries_data_entries(data)

            # Nothing to repair for this symbol / time frame.
            if len(missing_dates) == 0:
                continue

            print(f"Filling {len(missing_dates)} missing dates for {symbol} {time_frame}")
            # NOTE(review): the return value is discarded, so `entry["data"]`
            # only reflects the fill if the call mutates `data` in place --
            # confirm; reload from `entry["path"]` if the filled series is
            # needed in memory.
            fill_missing_timeseries_data(
                data,
                missing_dates=missing_dates,
                save_to_file=True,
                file_path=str(entry["path"])
            )


check_and_fill_missing_data(in_sample_data, "Checking in-sample data")
check_and_fill_missing_data(out_sample_data, "Checking out-sample data")
