In [5]:
#Backfill
import pandas as pd
import time
import requests
from datetime import datetime, timedelta
from google.cloud import bigquery

# === CONFIGURATION ===
# Destination table, addressed as <PROJECT_ID>.<TABLE_ID>.
PROJECT_ID = "oplabs-tools-data"
TABLE_ID = "rpc_table_uploads.market_data"
FULL_TABLE_ID = ".".join([PROJECT_ID, TABLE_ID])
# Inclusive date window examined by the backfill; END_DATE is the current
# date as reported by the local clock.
START_DATE = datetime(2025, 3, 1).date()
END_DATE = datetime.now().date()

# === Step 1: Query existing timestamps from BigQuery ===
client = bigquery.Client(project=PROJECT_ID)
# One result row per calendar day in the window that already has a record.
query = f"""
    SELECT DISTINCT DATE(timestamp) AS ts_date
    FROM `{FULL_TABLE_ID}`
    WHERE DATE(timestamp) BETWEEN '{START_DATE}' AND '{END_DATE}'
"""
existing_dates_df = client.query(query).to_dataframe()
existing_dates = set(existing_dates_df["ts_date"])

# === Step 2: Identify missing dates ===
# Enumerate every day from START_DATE through END_DATE (both inclusive),
# then drop the days the table already covers.
all_dates = {
    START_DATE + timedelta(days=offset)
    for offset in range((END_DATE - START_DATE).days + 1)
}
missing_dates = sorted(all_dates.difference(existing_dates))
print(f"Missing dates: {missing_dates}")

# === Step 3: Fetch ETH prices for missing dates ===
def fetch_price_for_date(date_obj, max_retries=5, timeout=30):
    """Fetch the average ETH/USD price for a single calendar day from CoinGecko.

    Queries the ``market_chart/range`` endpoint for the 24-hour window that
    starts at UTC midnight of ``date_obj`` and averages every price point
    returned. HTTP 429 responses and transport-level errors are retried with
    a linearly growing back-off (10s, 20s, ...); any other non-200 status
    aborts immediately.

    Args:
        date_obj: The ``datetime.date`` to fetch a price for.
        max_retries: Maximum number of HTTP attempts before giving up.
        timeout: Seconds to wait for each HTTP request before treating it as
            a failed attempt.

    Returns:
        A dict ``{"timestamp": <ISO-8601 midnight of date_obj>,
        "eth_usd": <mean price rounded to 2 decimals>}``, or ``None`` when no
        price could be obtained (empty data, HTTP error, retries exhausted).
    """
    # Imported locally because the file-level import only brings in
    # ``datetime`` and ``timedelta``.
    from datetime import timezone

    # Anchor the window at UTC midnight. Calling .timestamp() on a naive
    # datetime interprets it in the machine's local timezone, which would
    # shift the window away from the day it is labelled with.
    day_start = datetime.combine(date_obj, datetime.min.time(), tzinfo=timezone.utc)
    unix_ts_start = int(day_start.timestamp())
    unix_ts_end = int((day_start + timedelta(days=1)).timestamp())

    url = "https://api.coingecko.com/api/v3/coins/ethereum/market_chart/range"
    params = {
        "vs_currency": "usd",
        "from": unix_ts_start,
        "to": unix_ts_end,
    }

    for attempt in range(max_retries):
        is_last_attempt = attempt + 1 >= max_retries
        wait_time = 10 * (attempt + 1)

        try:
            # A timeout keeps one stalled connection from hanging the backfill.
            response = requests.get(url, params=params, timeout=timeout)
        except requests.RequestException as exc:
            # Transport failures (DNS, connection reset, timeout) are treated
            # as retryable instead of aborting the whole run.
            print(f"❌ Request error for {date_obj} (attempt {attempt+1}/{max_retries}): {exc}")
            if not is_last_attempt:
                time.sleep(wait_time)
            continue

        if response.status_code == 200:
            data = response.json()
            prices = data.get("prices", [])
            if not prices:
                print(f"⚠️ No price data for {date_obj}")
                return None

            # Each entry is a [timestamp, price] pair; average the prices.
            df = pd.DataFrame(prices, columns=["timestamp", "price"])
            avg_price = round(float(df["price"].mean()), 2)
            print(f"✅ {date_obj} → {avg_price} USD")
            return {
                # Kept as a naive ISO string to preserve the original row format.
                "timestamp": datetime.combine(date_obj, datetime.min.time()).isoformat(),
                "eth_usd": avg_price,
            }

        if response.status_code == 429:
            if is_last_attempt:
                # No further attempt follows, so sleeping would only delay
                # reporting the failure.
                break
            print(f"⏳ Rate limited on {date_obj} (attempt {attempt+1}/{max_retries}) — sleeping {wait_time}s...")
            time.sleep(wait_time)
            continue

        print(f"❌ Failed to fetch for {date_obj}: HTTP {response.status_code}")
        return None

    print(f"🚫 Max retries exceeded for {date_obj}")
    return None

# === Step 4: Prepare and insert data ===
# Collect one row per missing date; dates whose fetch failed (None) are skipped.
rows_to_insert = []
for missing_date in missing_dates:
    result = fetch_price_for_date(missing_date)
    if result:
        rows_to_insert.append(result)

if not rows_to_insert:
    # Nothing was missing or every fetch failed. Skip the API call:
    # NOTE(review): BigQuery's insertAll is understood to reject an empty
    # payload with a 400 error — confirm against the API reference.
    errors = []
    print("Nothing to insert — no rows were fetched.")
else:
    print(f"Inserting {len(rows_to_insert)} rows into BigQuery...")

    # insert_rows_json returns a list of per-row errors; empty means success.
    errors = client.insert_rows_json(FULL_TABLE_ID, rows_to_insert)
    if not errors:
        print("✅ Data inserted successfully!")
    else:
        print("❌ Errors occurred during insertion:", errors)


Missing dates: [datetime.date(2025, 4, 6), datetime.date(2025, 4, 7), datetime.date(2025, 4, 8), datetime.date(2025, 4, 9), datetime.date(2025, 4, 10), datetime.date(2025, 4, 11), datetime.date(2025, 4, 12), datetime.date(2025, 4, 13), datetime.date(2025, 4, 14), datetime.date(2025, 4, 15), datetime.date(2025, 4, 16), datetime.date(2025, 4, 17), datetime.date(2025, 4, 18), datetime.date(2025, 4, 19), datetime.date(2025, 4, 20), datetime.date(2025, 4, 21)]
⏳ Rate limited on 2025-04-06 (attempt 1/5) — sleeping 10s...
⏳ Rate limited on 2025-04-06 (attempt 2/5) — sleeping 20s...
✅ 2025-04-06 → 1705.3 USD
✅ 2025-04-07 → 1536.06 USD
✅ 2025-04-08 → 1521.32 USD
⏳ Rate limited on 2025-04-09 (attempt 1/5) — sleeping 10s...
⏳ Rate limited on 2025-04-09 (attempt 2/5) — sleeping 20s...
⏳ Rate limited on 2025-04-09 (attempt 3/5) — sleeping 30s...
✅ 2025-04-09 → 1543.41 USD
✅ 2025-04-10 → 1557.26 USD
✅ 2025-04-11 → 1560.19 USD
✅ 2025-04-12 → 1617.61 USD
⏳ Rate limited on 2025-04-13 (attempt 1/5) — sl