# 01 - Download Data

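Download historical OHLCV (open, high, low, close, volume) candle data from Binance for backtesting. By default the download call is commented out and the notebook generates synthetic data for the demo instead.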

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
import yaml
from datetime import datetime, timedelta

In [None]:
# Load the project configuration from the default YAML file
config_path = Path('../config/default.yaml')
# Read explicitly as UTF-8 so parsing does not depend on the platform's
# default text encoding
with open(config_path, encoding='utf-8') as f:
    config = yaml.safe_load(f)

# Echo the configured market settings
print(f"Symbol: {config['market']['symbol']}")
print(f"Timeframes: {config['market']['base_timeframes']}")

In [None]:
# Download settings: the candle size and history length are fixed here,
# the symbol is taken from the loaded configuration
TIMEFRAME = '1m'          # one-minute candles
DAYS_TO_DOWNLOAD = 180    # roughly six months of history
SYMBOL = config['market']['symbol']

# Directory for raw data files; created together with any missing parents
RAW_DATA_PATH = Path('../data/raw')
RAW_DATA_PATH.mkdir(exist_ok=True, parents=True)

## Download from Binance

In [None]:
# Set up the ccxt Binance client used for the data download
try:
    # Only the import is guarded, so unrelated errors are not mistaken for a
    # missing package
    import ccxt
except ImportError:
    # Keep the name defined so later cells can check for a missing client
    # instead of failing with a NameError
    exchange = None
    print("ccxt not installed. Install with: pip install ccxt")
else:
    exchange = ccxt.binance({
        'enableRateLimit': True,
    })

    print("Connected to Binance")

In [None]:
def download_ohlcv(
    symbol: str,
    timeframe: str,
    days: int,
    client=None,
) -> pd.DataFrame:
    """
    Download OHLCV data from Binance.

    Candles are fetched in pages of up to 1000, starting ``days`` days before
    the current time and advancing until the current time is reached or the
    exchange returns no more data. A fetch error is printed and stops the
    download; whatever was collected up to that point is still returned.

    Args:
        symbol: Trading pair (e.g., 'BTC/USDT')
        timeframe: Candle timeframe (e.g., '1m', '5m', '1h')
        days: Number of days to download
        client: Object exposing
            ``fetch_ohlcv(symbol, timeframe, since=..., limit=...)``.
            Defaults to the module-level ``exchange``.

    Returns:
        DataFrame indexed by ``timestamp`` (converted from epoch
        milliseconds) with open, high, low, close and volume columns.
        Empty, with the same columns, if nothing was downloaded.
    """
    columns = ['timestamp', 'open', 'high', 'low', 'close', 'volume']
    all_data = []

    # Calculate the requested time window
    end_time = datetime.now()
    start_time = end_time - timedelta(days=days)

    # Exchange timestamps are in milliseconds
    since = int(start_time.timestamp() * 1000)
    end_ms = end_time.timestamp() * 1000

    print(f"Downloading {symbol} {timeframe} from {start_time} to {end_time}")

    next_report = 10000
    while since < end_ms:
        try:
            # Resolved inside the try so a missing global client is reported
            # like any other fetch error instead of escaping the loop
            source = exchange if client is None else client
            ohlcv = source.fetch_ohlcv(symbol, timeframe, since=since, limit=1000)
        except Exception as e:
            print(f"Error: {e}")
            break

        if not ohlcv:
            break

        all_data.extend(ohlcv)

        # Progress update each time another 10000 candles have accumulated,
        # even when pages are not completely full
        if len(all_data) >= next_report:
            print(f"  Downloaded {len(all_data)} candles...")
            next_report = (len(all_data) // 10000 + 1) * 10000

        next_since = ohlcv[-1][0] + 1  # Move to next timestamp
        if next_since <= since:
            # The cursor did not advance; stop rather than loop forever
            break
        since = next_since

    if not all_data:
        # Nothing fetched: return an empty frame with the usual layout
        # instead of failing on df.index[0] below
        print("No candles downloaded")
        return pd.DataFrame(
            columns=columns[1:],
            index=pd.DatetimeIndex([], name='timestamp'),
        )

    # Convert to DataFrame
    df = pd.DataFrame(all_data, columns=columns)
    df['timestamp'] = pd.to_datetime(df['timestamp'], unit='ms')
    df.set_index('timestamp', inplace=True)

    # Remove duplicates
    df = df[~df.index.duplicated(keep='first')]

    print(f"Downloaded {len(df)} candles from {df.index[0]} to {df.index[-1]}")

    return df

In [None]:
# Download data
# Note: ccxt expects 'BASE/QUOTE' (e.g. 'BTC/USDT') while the configured symbol
# has no separator (e.g. 'BTCUSDT'). Split on a known quote asset so that base
# assets that are not three letters long (e.g. 'DOGEUSDT') convert correctly.
QUOTE_ASSETS = ('USDT', 'USDC', 'BUSD', 'FDUSD', 'TUSD', 'BTC', 'ETH', 'BNB')
if '/' in SYMBOL:
    # Already in ccxt format
    ccxt_symbol = SYMBOL
else:
    quote = next(
        (q for q in QUOTE_ASSETS if SYMBOL.endswith(q) and len(SYMBOL) > len(q)),
        None,
    )
    if quote is None:
        # Unknown quote asset: fall back to assuming a three-letter base
        ccxt_symbol = f"{SYMBOL[:3]}/{SYMBOL[3:]}"
    else:
        ccxt_symbol = f"{SYMBOL[:-len(quote)]}/{quote}"

# This may take a while for minute data
# data = download_ohlcv(ccxt_symbol, TIMEFRAME, DAYS_TO_DOWNLOAD)

# For demo, create synthetic data
print("Creating synthetic data for demo...")
n = 1440 * 30  # 30 days of minute data
np.random.seed(42)
returns = np.random.randn(n) * 0.0001
prices = 50000 * np.exp(np.cumsum(returns))

dates = pd.date_range(start='2024-01-01', periods=n, freq='1min')

# Draw the noise in a fixed order (open, high, low, then volume) so the seed
# above keeps the output reproducible
open_prices = prices * (1 + np.random.randn(n) * 0.0001)
high_noise = np.abs(np.random.randn(n) * 0.0003)
low_noise = np.abs(np.random.randn(n) * 0.0003)

data = pd.DataFrame({
    'open': open_prices,
    # Anchor high/low on the candle body so every bar satisfies
    # low <= min(open, close) and high >= max(open, close)
    'high': np.maximum(open_prices, prices) * (1 + high_noise),
    'low': np.minimum(open_prices, prices) * (1 - low_noise),
    'close': prices,
    'volume': np.random.randint(10, 100, n) * 0.1,
}, index=dates)

print(f"Data shape: {data.shape}")
data.head()

In [None]:
# Write the dataset to a Parquet file in the raw-data directory, named
# after the symbol and timeframe
output_file = RAW_DATA_PATH.joinpath(f"{SYMBOL}_{TIMEFRAME}.parquet")
data.to_parquet(output_file)
print(f"Saved to {output_file}")

In [None]:
# Summarize the dataset: covered period, number of rows, span of close
# prices and average volume
print("Data Statistics:")
first_ts, last_ts = data.index[0], data.index[-1]
print(f"  Date range: {first_ts} to {last_ts}")
print(f"  Total candles: {data.shape[0]}")
close_prices = data['close']
print(f"  Price range: ${close_prices.min():.2f} - ${close_prices.max():.2f}")
print(f"  Mean volume: {data['volume'].mean():.4f} BTC")