In [None]:
import requests
import time
import datetime
import pandas as pd
from dotenv import load_dotenv
import os

load_dotenv()
COINDESKAPIKEY = os.getenv("COINDESKAPIKEY") 


In [11]:
def get_unix_timestamp(dt):
    return int(dt.timestamp())

def fetch_hourly_data_batch(to_ts, limit=1500, market="binance", instrument="BTC-USDT"):
    """
    Fetch a batch of hourly data from the Coindesk API using the working headers.
    """
    url = "https://data-api.coindesk.com/spot/v1/historical/hours"
    params = {
        "market": market,
        "instrument": instrument,
        "limit": limit,
        "aggregate": 1,  # hourly intervals
        "fill": "true",
        "apply_mapping": "true",
        "response_format": "JSON",
        "to_ts": to_ts,
        "api_key": COINDESKAPIKEY
    }
    headers = {"Content-type": "application/json; charset=UTF-8"}
    
    response = requests.get(url, params=params, headers=headers)
    response.raise_for_status()  # Raise an exception for HTTP errors
    data = response.json()
    return data.get("data", [])

def fetch_hourly_data_npages():
    """
    Paginate backwards using `to_ts` for 3 pages (3 calls) of data.
    """
    # Set end time to now (UTC)
    end_dt = datetime.datetime.utcnow()
    end_ts = get_unix_timestamp(end_dt)
    
    all_records = []
    current_to_ts = end_ts
    max_pages = 5  # Limit to 3 API calls
    page_count = 0

    while page_count < max_pages:
        batch_data = fetch_hourly_data_batch(to_ts=current_to_ts)
        
        if not batch_data:
            print("No more data returned. Stopping pagination.")
            break
        
        all_records.extend(batch_data)
        page_count += 1
        
        # Assuming the batch is sorted descending, the last record is the earliest
        earliest_record = batch_data[-1]
        earliest_timestamp = int(earliest_record["time"])
        
        print(f"Page {page_count} retrieved, earliest timestamp in batch: {earliest_timestamp}")
        
        # Prepare for next iteration: subtract one hour to avoid duplicate data
        current_to_ts = earliest_timestamp - 3600
        
        time.sleep(1)  # To avoid rate limiting
    
    df = pd.DataFrame(all_records)
    
    if df.empty:
        print("No data retrieved.")
    else:
        # Verify that the 'time' column exists
        if "time" not in df.columns:
            print("ERROR: The DataFrame does not have a 'time' column. Columns are:", df.columns)
        else:
            # Optional: Convert the 'time' column to datetime for easier interpretation.
            df["datetime"] = pd.to_datetime(df["time"], unit='s', utc=True)
            df.sort_values(by="datetime", inplace=True)
    
    return df


In [12]:
# Run the function and display a preview
df_prices = fetch_hourly_data_npages()
if not df_prices.empty:
    print("Fetched data shape:", df_prices.shape)
    display(df_prices.head())
else:
    print("DataFrame is empty; no data was retrieved.")

No more data returned. Stopping pagination.
No data retrieved.
DataFrame is empty; no data was retrieved.


In [8]:
# Save to CSV if desired
df_prices.to_csv("../data/raw/btc_usdt_6months_hourly.csv", index=False)