In [1]:
from binance.client import Client as bnb_client
from datetime import datetime
import pandas as pd 
import numpy as np 
import requests
import time

client = bnb_client()

COINGECKO_API = "https://api.coingecko.com/api/v3"
BINANCE_API = "https://api.binance.com"
START_DATE = "2021-01-01"
INTERVAL = "1h"

In [2]:
def get_top_N_coins(n=300):
    url = f"{COINGECKO_API}/coins/markets"
    coins = []

    pages = (n // 100) + (1 if n % 100 != 0 else 0)

    for page in range(1, pages + 1):
        params = {
            "vs_currency": "usd",
            "order": "market_cap_desc",
            "per_page": 100,
            "page": page
        }
        r = requests.get(url, params=params)
        r.raise_for_status()
        coins.extend([
            (coin["id"], coin["symbol"].upper()) for coin in r.json()
        ])
        time.sleep(2.5)  # Respect rate limits

    return coins[:n]  # Trim to exact number

In [3]:
def get_binance_px(symbol,freq,start_ts = START_DATE):
    data = client.get_historical_klines(symbol,freq,start_ts)
    columns = ['open_time','open','high','low','close','volume','close_time','quote_volume',
    'num_trades','taker_base_volume','taker_quote_volume','ignore']

    data = pd.DataFrame(data,columns = columns)
    
    # Convert from POSIX timestamp (number of millisecond since jan 1, 1970)
    data['open_time'] = data['open_time'].map(lambda x: datetime.utcfromtimestamp(x/1000))
    data['close_time'] = data['close_time'].map(lambda x: datetime.utcfromtimestamp(x/1000))
    return data 

In [4]:
def get_binance_usdt_pairs():
    url = f"{BINANCE_API}/api/v3/exchangeInfo"
    r = requests.get(url)
    r.raise_for_status()
    symbols = r.json()["symbols"]
    return set(s["symbol"] for s in symbols if s["quoteAsset"] == "USDT" and s["status"] == "TRADING")

In [5]:
def match_symbols(coingecko_list, binance_symbols):
    matched = []
    for _, symbol in coingecko_list:
        pair = f"{symbol}USDT"
        if pair in binance_symbols:
            matched.append(pair)
    return matched

In [6]:
# Getting top 100 coins by market cap from CoinGecko
top_coins = get_top_N_coins()

# Getting Binance tradable USDT pairs
binance_symbols = get_binance_usdt_pairs()

# Matching to Binance USDT pairs
matched_symbols = match_symbols(top_coins, binance_symbols)
print(f"✅ Matched {len(matched_symbols)} symbols: {matched_symbols[:5]}...")

✅ Matched 133 symbols: ['BTCUSDT', 'ETHUSDT', 'XRPUSDT', 'BNBUSDT', 'SOLUSDT']...


In [7]:
df_o = pd.read_pickle('binance_1h_crypto_data.pkl')
for col in df_o.symbol.unique():
    if col not in matched_symbols:
        if len(df_o[df_o.symbol == col]) > 24*365*4:
            matched_symbols.append(col)
            
matched_symbols = list(set(matched_symbols))

print(f"✅ Matched {len(matched_symbols)} symbols: {matched_symbols[:5]}...")

✅ Matched 133 symbols: ['APEUSDT', 'BTCUSDT', 'SFPUSDT', 'TFUELUSDT', 'STRKUSDT']...


In [8]:
start = time.time()
results = []

for symbol in matched_symbols:
    data = get_binance_px(symbol, INTERVAL)
    data = data[['open_time','open','high','low','close','volume']]
    data['symbol'] = symbol
    if not data.empty:
        results.append(data)
        
df = pd.concat(results, ignore_index=True)

end = time.time()
time_taken = end - start
print("Run complete after {} hours, {} mins, and {} seconds".format(
    int(time_taken/3600),
    int((time_taken%3600)/60), 
    (20000%3600)%60))

ReadTimeout: HTTPSConnectionPool(host='api.binance.com', port=443): Read timed out. (read timeout=10)

In [None]:
for col in matched_symbols:
    if col not in df.symbol.unique():
        print(col)

In [None]:
if (df[df.symbol == 'BTCUSDT'].shape[0] > df_o[df_o.symbol == 'BTCUSDT'].shape[0]):
#     df_o.to_pickle("binance_1h_crypto_data_OLD.pkl")
    df.to_pickle("binance_1h_crypto_data.pkl")
    print("price file rewritten successfully!")
else:
    print(df[df.symbol == 'BTCUSDT'].shape[0], df_o[df_o.symbol == 'BTCUSDT'].shape[0])