In [None]:
'''
Step1a. Download Binance Hist Data 
Snapshot Fn: 
* Takes OHLCV (1min candles) & order book data
* Saves 3 files for each snapshot:
1. OHLCV data (open, high, low, close, volume)
2. Asks
3. Bids 

Scheduling:
* Runs every minute (configurable)
* 1st run happens immediately
* Runs continuously until stopped with Ctrl+C

ex. File Org - Snapshot taken of Binance ETH/USDT pair OHLCV, bids & asks at 14:33 UTC on 6-May-25. 
Binance/ETHUSDT_20250506_1433_ohlcv.csv
Binance/ETHUSDT_20250506_1433_asks.csv
Binance/ETHUSDT_20250506_1433_bids.csv
'''
import ccxt
import pandas as pd
import time
from datetime import datetime, timezone  # Added timezone to imports
import os
import schedule

# Configuration
# Market symbols to snapshot; '/' is stripped when building file names.
BINANCE_SYMBOLS = ['ETH/USDT']
# Directory (relative to the working directory) that receives the CSV files.
OUTPUT_DIR = "Binance"
# Interval between scheduled snapshots, in minutes.
SNAPSHOT_INTERVAL_MINUTES = 1

def initialize_binance_client():
    """Build a ccxt Binance client for the spot market.

    The client is created with ccxt's built-in rate limiter enabled and with
    the ``adjustForTimeDifference`` option switched on.
    """
    client_options = {
        'defaultType': 'spot',
        'adjustForTimeDifference': True,
    }
    client_config = {
        'enableRateLimit': True,
        'options': client_options,
    }
    return ccxt.binance(client_config)

def ensure_output_directory():
    """Create the snapshot output directory; a no-op if it is already present."""
    target_dir = OUTPUT_DIR
    os.makedirs(target_dir, exist_ok=True)

def take_orderbook_snapshot(binance, symbol):
    """Fetch the latest 1m candle and the order book for ``symbol`` and save them as CSV.

    Three files are written under ``OUTPUT_DIR``, named
    ``<SYMBOL>_<YYYYMMDD_HHMM>_{ohlcv,asks,bids}.csv`` after the UTC minute in
    which the snapshot was taken. If the OHLCV file for that minute already
    exists, the snapshot is skipped.

    Args:
        binance: Exchange client exposing ``fetch_ohlcv`` and
            ``fetch_order_book``.
        symbol: Market symbol such as ``'ETH/USDT'``.

    Returns:
        None. Any exception is reported on stdout and swallowed so that one
        failed snapshot does not stop the caller's loop.
    """
    try:
        base_symbol = symbol.replace('/', '')
        current_time = datetime.now(timezone.utc)
        timestamp = current_time.strftime("%Y%m%d_%H%M")

        # Prepare filenames
        prefix = f"{OUTPUT_DIR}/{base_symbol}_{timestamp}"
        ohlcv_file = f"{prefix}_ohlcv.csv"
        asks_file = f"{prefix}_asks.csv"
        bids_file = f"{prefix}_bids.csv"

        # File names have minute resolution, so a second call within the same
        # UTC minute would overwrite the earlier snapshot; skip it instead.
        if os.path.exists(ohlcv_file):
            print(f"Files for {timestamp}Z already exist, skipping...")
            return

        # Fetch everything before writing anything, so a failed request does
        # not leave a partial snapshot on disk.
        ohlcv = binance.fetch_ohlcv(symbol, '1m', limit=1)
        orderbook = binance.fetch_order_book(symbol)

        # Verify the candle's open time is close to our own clock. The label
        # has a fallback so the final message works when no candle came back.
        ohlcv_label = "n/a"
        if ohlcv:
            ohlcv_timestamp = ohlcv[0][0]
            ohlcv_time = datetime.fromtimestamp(ohlcv_timestamp / 1000, timezone.utc)
            ohlcv_label = ohlcv_time.strftime('%Y%m%d_%H%M%S')
            time_diff = (current_time - ohlcv_time).total_seconds()

            if abs(time_diff) > 60:
                print(f"Warning: OHLCV timestamp differs by {time_diff} seconds from current time")
        else:
            print(f"Warning: no OHLCV candle returned for {symbol}")

        # Save order book data. Only the first two fields of each level are
        # kept, and the column names are set on the frame itself so that an
        # empty side still produces a valid header-only CSV.
        book_columns = ['price', 'amount']
        for side, path in (('asks', asks_file), ('bids', bids_file)):
            levels = [level[:2] for level in orderbook[side]]
            pd.DataFrame(levels, columns=book_columns).to_csv(path, index=False)

        # Save OHLCV data last: this file is the "snapshot already taken"
        # marker checked above, so it should only exist once the order book
        # files have been written.
        ohlcv_df = pd.DataFrame(ohlcv, columns=['timestamp', 'open', 'high', 'low', 'close', 'volume'])
        ohlcv_df.to_csv(ohlcv_file, index=False)

        print(f"Saved UTC snapshot for {symbol} at {timestamp}Z (OHLCV time: {ohlcv_label}Z)")

    except Exception as e:
        print(f"Error taking snapshot for {symbol}: {str(e)}")

# Client shared across scheduled runs; created lazily on the first call to job().
_BINANCE_CLIENT = None


def job():
    """Take one snapshot of every configured symbol.

    The exchange client is created once and reused on later runs. Building a
    new client each minute throws away the market metadata cached on the
    instance, forcing it to be downloaded again before every snapshot.
    """
    global _BINANCE_CLIENT
    if _BINANCE_CLIENT is None:
        _BINANCE_CLIENT = initialize_binance_client()
    for symbol in BINANCE_SYMBOLS:
        take_orderbook_snapshot(_BINANCE_CLIENT, symbol)

def run_scheduler():
    """Run the snapshot job immediately, then on every interval until interrupted.

    Scheduled runs are pinned to second ``:00`` of the minute. Without the
    pin, each run is scheduled relative to when the previous one finished, so
    the start time drifts by the job's duration and a minute-stamped snapshot
    is eventually skipped.

    This function does not return; stop it with Ctrl+C (KeyboardInterrupt
    propagates to the caller).
    """
    print(f"Starting orderbook snapshot service. Snapshots every {SNAPSHOT_INTERVAL_MINUTES} minute(s) in UTC...")
    print(f"Press Ctrl+C to stop")

    # Schedule the job on minute boundaries so run times do not drift.
    schedule.every(SNAPSHOT_INTERVAL_MINUTES).minutes.at(":00").do(job)

    # Run immediately first time
    job()

    # Run continuously
    while True:
        schedule.run_pending()
        time.sleep(1)

if __name__ == '__main__':
    # Create the output directory up front so the first snapshot can be written.
    ensure_output_directory()
    try:
        run_scheduler()
    except KeyboardInterrupt:
        # Ctrl+C is the intended way to stop the service; exit with a message.
        print("\nStopping orderbook snapshot service...")

In [6]:
'''
Step1b. Gap check: the Binance 1-minute csv snapshots saved above (for hist. backtesting purposes) ran into robustness issues, so this cell reports which minutes are missing.
'''
import pandas as pd
import glob
import os
from datetime import datetime, timedelta

# Config
# Directory searched for the ETHUSDT_*_{ohlcv,bids,asks}.csv snapshot files.
DATA_DIR = 'Binance/'
# First and last minute expected to have a snapshot (both ends inclusive).
# These are naive datetimes compared against the minute parsed from each file
# name, so they must be in the same timezone as the file-name timestamps.
START_DATE = datetime(2025, 5, 5, 8, 0)
END_DATE = datetime(2025, 5, 7, 10, 0)

def _parse_snapshot_times(paths):
    """Return the set of minute timestamps encoded in snapshot file names.

    Names are expected to look like ``<SYMBOL>_<YYYYMMDD>_<HHMM>_<kind>.csv``;
    files whose name does not follow that pattern are ignored.
    """
    times = set()
    for path in paths:
        parts = os.path.basename(path).split('_')
        try:
            stamp = datetime.strptime(f"{parts[1]}_{parts[2]}", '%Y%m%d_%H%M')
        except (IndexError, ValueError):
            # Too few '_' separated fields, or the fields are not a date/time.
            continue
        times.add(pd.Timestamp(stamp))
    return times


def _group_missing_periods(missing_minutes):
    """Collapse a sorted list of missing minutes into runs of consecutive minutes."""
    periods = []
    run_start = None
    previous = None

    def close_run():
        periods.append({
            'start': run_start,
            'end': previous,
            # +1 because both the first and the last minute of the run are missing.
            'duration_minutes': (previous - run_start).total_seconds() / 60 + 1})

    for minute in missing_minutes:
        if run_start is None:
            run_start = minute
        elif (minute - previous) > timedelta(minutes=1):
            close_run()
            run_start = minute
        previous = minute

    if run_start is not None:
        close_run()
    return periods


def analyze_missing_data(data_dir=None, start=None, end=None, symbol='ETHUSDT', report_dir='analysis'):
    """Report which minutes in a time range have no saved snapshot files.

    Looks for ``<symbol>_<YYYYMMDD>_<HHMM>_{ohlcv,bids,asks}.csv`` files,
    compares the minutes found against every minute from ``start`` to ``end``
    (inclusive), prints a summary and writes CSV reports.

    Args:
        data_dir: Directory holding the snapshot files. Defaults to ``DATA_DIR``.
        start: First expected minute (naive datetime). Defaults to ``START_DATE``.
        end: Last expected minute (naive datetime). Defaults to ``END_DATE``.
        symbol: File-name prefix of the market to check.
        report_dir: Directory that receives ``missing_files_detailed.csv`` and,
            when there are gaps, ``missing_periods.csv``.

    Returns:
        A list of dicts with keys ``start``, ``end`` and ``duration_minutes``,
        one per run of consecutive missing minutes, in chronological order.
    """
    # Module-level settings are only read when the caller gives no override.
    data_dir = DATA_DIR if data_dir is None else data_dir
    start = START_DATE if start is None else start
    end = END_DATE if end is None else end

    # Generate expected minute intervals
    expected_minutes = pd.date_range(start, end, freq='1min')
    expected_set = set(expected_minutes)

    # Get all existing files, per file kind
    kinds = ('ohlcv', 'bids', 'asks')
    files = {
        kind: set(glob.glob(os.path.join(data_dir, f'{symbol}_*_{kind}.csv')))
        for kind in kinds}
    total_files = sum(len(paths) for paths in files.values())

    # Missing minutes per kind, kept as sets for O(1) membership tests below
    missing = {
        kind: expected_set - _parse_snapshot_times(paths)
        for kind, paths in files.items()}
    all_missing = sorted(set().union(*missing.values()))

    # Find continuous missing periods (for longest gap analysis)
    missing_periods = _group_missing_periods(all_missing)

    # Generate report
    print(f"\n{'='*50}")
    print("COMPREHENSIVE MISSING DATA ANALYSIS")
    print(f"{'='*50}\n")

    # Basic stats
    print(f"Expected time range: {start} to {end}")
    print(f"Total expected minutes: {len(expected_minutes)}")
    print(f"Total expected files: {len(expected_minutes)*3}\n")

    print(f"Files found:")
    print(f"- OHLCV: {len(files['ohlcv'])}")
    print(f"- Bids: {len(files['bids'])}")
    print(f"- Asks: {len(files['asks'])}")
    print(f"TOTAL: {total_files}\n")

    print(f"Missing files:")
    print(f"- OHLCV: {len(missing['ohlcv'])}")
    print(f"- Bids: {len(missing['bids'])}")
    print(f"- Asks: {len(missing['asks'])}")
    print(f"TOTAL: {sum(len(minutes) for minutes in missing.values())}\n")

    # Longest missing period
    if missing_periods:
        longest = max(missing_periods, key=lambda x: x['duration_minutes'])
        print(f"LONGEST MISSING PERIOD:")
        print(f"- Start: {longest['start']}")
        print(f"- End: {longest['end']}")
        print(f"- Duration: {longest['duration_minutes']:.1f} minutes ({longest['duration_minutes']/60:.1f} hours)\n")

        print(f"ALL MISSING PERIODS (>{timedelta(minutes=1)}):")
        by_duration = sorted(missing_periods, key=lambda x: x['duration_minutes'], reverse=True)
        for i, period in enumerate(by_duration, 1):
            if period['duration_minutes'] > 1:  # Only show gaps >1 minute
                print(f"{i}. {period['start']} to {period['end']} ({period['duration_minutes']:.1f} minutes)")
    else:
        print("No missing periods found!")

    # Save detailed reports
    os.makedirs(report_dir, exist_ok=True)

    # Save missing timestamps, flagging which file kinds are absent per minute
    missing_df = pd.DataFrame({
        'timestamp': all_missing,
        'missing_ohlcv': [int(t in missing['ohlcv']) for t in all_missing],
        'missing_bids': [int(t in missing['bids']) for t in all_missing],
        'missing_asks': [int(t in missing['asks']) for t in all_missing]})
    missing_df.to_csv(os.path.join(report_dir, 'missing_files_detailed.csv'), index=False)

    # Save missing periods
    if missing_periods:
        periods_df = pd.DataFrame(missing_periods)
        periods_df = periods_df.sort_values('duration_minutes', ascending=False)
        periods_df.to_csv(os.path.join(report_dir, 'missing_periods.csv'), index=False)

    print(f"\nDetailed reports saved to '{report_dir}/' directory:")
    print("- missing_files_detailed.csv")
    # Only list the periods report when it was actually written.
    if missing_periods:
        print("- missing_periods.csv")

    return missing_periods

if __name__ == "__main__":
    # Run the gap analysis with the module-level configuration above.
    analyze_missing_data()


COMPREHENSIVE MISSING DATA ANALYSIS

Expected time range: 2025-05-05 08:00:00 to 2025-05-07 10:00:00
Total expected minutes: 3001
Total expected files: 9003

Files found:
- OHLCV: 2200
- Bids: 2200
- Asks: 2200
TOTAL: 6600

Missing files:
- OHLCV: 801
- Bids: 801
- Asks: 801
TOTAL: 2403

LONGEST MISSING PERIOD:
- Start: 2025-05-06 16:27:00
- End: 2025-05-06 16:59:00
- Duration: 33.0 minutes (0.6 hours)

ALL MISSING PERIODS (>0:01:00):
1. 2025-05-06 16:27:00 to 2025-05-06 16:59:00 (33.0 minutes)
2. 2025-05-05 17:46:00 to 2025-05-05 18:02:00 (17.0 minutes)
3. 2025-05-06 00:41:00 to 2025-05-06 00:57:00 (17.0 minutes)
4. 2025-05-06 07:56:00 to 2025-05-06 08:12:00 (17.0 minutes)
5. 2025-05-06 12:06:00 to 2025-05-06 12:22:00 (17.0 minutes)
6. 2025-05-06 18:27:00 to 2025-05-06 18:43:00 (17.0 minutes)
7. 2025-05-07 03:44:00 to 2025-05-07 04:00:00 (17.0 minutes)
8. 2025-05-05 18:04:00 to 2025-05-05 18:19:00 (16.0 minutes)
9. 2025-05-05 18:37:00 to 2025-05-05 18:52:00 (16.0 minutes)
10. 2025-05

In [1]:
'''
Step1c.  1inch Fusion WETH/USDT trade data csv download
* via Dune Analytics API query.
* 1inch Fusion Dune query itself is run first via Dune GUI. 
* For those wanting to interact w/Dune GUI - query itself is saved for reference as 1inchFusionDuneQuery.pdf
Copy the query text off the .pdf into Dune GUI & run.
.ipynb code below is then run to save as: 

Output generated: 1inch/oneinch_weth_usdt_trades.csv 
The above .csv will be saved in a zip file along with the Step 1 Binance csv files, for those wishing 
to start directly from the Step 2-WETH-USDT.ipynb file. 

nb. AFAIK, unlike Binance OHLCV data, bids & asks order book data is not accessible historically (it needs to be saved in real time, as done above).
'''
import pandas as pd
import requests
import os
from datetime import datetime
# API key is read from the DUNE_API_KEY environment variable; the second
# argument is the fallback used when the variable is not set.
DUNE_API_KEY = os.getenv('DUNE_API_KEY', '')                                # Enter your DUNE_API_KEY between ''

# Enter DUNE_QUERY_ID below after creating & running the query in https://dune.com
# DUNE_QUERY_ID will be generated & visible in the (ex. https://dune.com/queries/1234567) 
DUNE_QUERY_ID = ''                                                          # Enter your DUNE_QUERY_ID between ''    
# The results URL is built once, here, so DUNE_QUERY_ID must be set above this line.
DUNE_API_URL = f"https://api.dune.com/api/v1/query/{DUNE_QUERY_ID}/results"
#Set as desired
# Trades whose timezone-stripped block_time is earlier than this are dropped.
START_DATE = datetime(2025, 5, 5, 8, 00)  

def load_1inch_data_from_dune():
    """Load 1inch WETH/USDT trades from the Dune query results, from START_DATE on.

    Fetches the stored results of the query at ``DUNE_API_URL`` page by page,
    converts ``block_time`` to timestamps, adds a timezone-naive ``date``
    column and keeps only rows with ``date >= START_DATE``.

    Returns:
        A DataFrame of trades. On any failure (missing credentials, HTTP
        error, empty result, missing ``price_per_eth`` column) the error is
        printed and an empty DataFrame is returned.
    """
    headers = {'X-Dune-API-Key': DUNE_API_KEY}
    params = {'limit': 10000}  # Page size; further pages are requested below

    try:
        # Fail early with a clear message instead of an opaque HTTP error.
        if not DUNE_API_KEY or not DUNE_QUERY_ID:
            raise ValueError("DUNE_API_KEY and DUNE_QUERY_ID must both be set")

        print(f"Fetching WETH/USDT trades from Dune since {START_DATE}...")

        rows = []
        while True:
            # A timeout keeps a stalled connection from hanging the notebook.
            response = requests.get(DUNE_API_URL, headers=headers, params=params, timeout=60)
            response.raise_for_status()

            data = response.json()
            page = (data.get('result') or {}).get('rows') or []
            rows.extend(page)

            # Assumes the response carries 'next_offset' while more rows
            # remain; stop when it is absent or a page comes back empty.
            next_offset = data.get('next_offset')
            if next_offset is None or not page:
                break
            params['offset'] = next_offset

        if not rows:
            raise ValueError("No data returned from Dune API")

        df = pd.DataFrame(rows)

        # Convert & clean data. Normalise to UTC before dropping the timezone
        # so 'date' is UTC wall-clock time whatever offset the input carries.
        df['block_time'] = pd.to_datetime(df['block_time'], utc=True)
        df['date'] = df['block_time'].dt.tz_localize(None)

        if 'price_per_eth' not in df.columns:
            raise ValueError("price_per_eth column missing from Dune query output")

        # Filter for trades after our start date
        df = df[df['date'] >= START_DATE]

        print(f"Successfully loaded {len(df)} WETH/USDT trades from Dune since {START_DATE}")
        return df

    except Exception as e:
        print(f"Error loading Dune data: {str(e)}")
        return pd.DataFrame()

def save_data_to_csv():
    """Download the 1inch trades from Dune and save them as a CSV file.

    Creates the ``1inch`` directory if needed, loads the trades with
    ``load_1inch_data_from_dune`` and, when any were returned, writes them to
    ``1inch/oneinch_weth_usdt_trades.csv`` and prints a short summary.
    """
    print("Starting 1inch data collection...")

    # Create dir if it doesn't exist
    os.makedirs("1inch", exist_ok=True)

    # Load data from Dune
    oneinch_data = load_1inch_data_from_dune()

    if not oneinch_data.empty:
        # Fixed file name: each run overwrites the previous download.
        oneinch_file = "1inch/oneinch_weth_usdt_trades.csv"
        oneinch_data.to_csv(oneinch_file, index=False)
        print(f"Saved 1inch WETH/USDT data to {oneinch_file}")
        print(f"Time range: {oneinch_data['date'].min()} to {oneinch_data['date'].max()}")
        print(f"Total trades: {len(oneinch_data)}")
    else:
        print("No 1inch trade data was retrieved")

    print("Data collection complete!")

if __name__ == "__main__":
    # Download the trades and write the CSV using the configuration above.
    save_data_to_csv()

Starting 1inch data collection...
Fetching WETH/USDT trades from Dune since...
Successfully loaded 1795 WETH/USDT trades from Dune since 2025-05-05 08:00:00
Saved 1inch WETH/USDT data to 1inch/oneinch_weth_usdt_trades.csv
Time range: 2025-05-05 08:04:23 to 2025-05-07 09:08:47
Total trades: 1795
Data collection complete!
