In [39]:
from tqdm import tqdm
from loguru import logger
import pandas as pd
import numpy as np
import glob
import re

In [36]:
"""
Show trades data available
"""

# Collect every BTC-USD trades file, then order the paths in descending
# lexicographic order (newest first, given the YYYYMMDD stamp in the name).
fps_trades_btc_usd = glob.glob("/mnt/vol1/coinbase_trades_*_btcusd.parquet")
fps_trades_btc_usd.sort(reverse=True)

# Preview both ends of the listing: first 10 and last 10 paths.
head_paths = fps_trades_btc_usd[:10]
tail_paths = fps_trades_btc_usd[-10:]
print("TRADES\n------\n")
print('\n'.join(head_paths))
print("...")
print('\n'.join(tail_paths))
print()

# Key each path by the date parsed from the first 8-digit run in the path.
# Going through a dict means a repeated date keeps only the last path seen.
trades_file_by_date = {}
for trades_path in fps_trades_btc_usd:
    date_stamp = re.findall(r'\d{8}', trades_path)[0]
    trades_file_by_date[pd.to_datetime(date_stamp)] = trades_path

# Two-column lookup table: one row per date, with its parquet path.
df_trades_files = pd.DataFrame(trades_file_by_date.items(), columns=['date','file'])

TRADES
------

/mnt/vol1/coinbase_trades_20240805_btcusd.parquet
/mnt/vol1/coinbase_trades_20240804_btcusd.parquet
/mnt/vol1/coinbase_trades_20240803_btcusd.parquet
/mnt/vol1/coinbase_trades_20240802_btcusd.parquet
/mnt/vol1/coinbase_trades_20240801_btcusd.parquet
/mnt/vol1/coinbase_trades_20240731_btcusd.parquet
/mnt/vol1/coinbase_trades_20240730_btcusd.parquet
/mnt/vol1/coinbase_trades_20240729_btcusd.parquet
/mnt/vol1/coinbase_trades_20240728_btcusd.parquet
/mnt/vol1/coinbase_trades_20240727_btcusd.parquet
...
/mnt/vol1/coinbase_trades_20230918_btcusd.parquet
/mnt/vol1/coinbase_trades_20230917_btcusd.parquet
/mnt/vol1/coinbase_trades_20230916_btcusd.parquet
/mnt/vol1/coinbase_trades_20230915_btcusd.parquet
/mnt/vol1/coinbase_trades_20230914_btcusd.parquet
/mnt/vol1/coinbase_trades_20230913_btcusd.parquet
/mnt/vol1/coinbase_trades_20230912_btcusd.parquet
/mnt/vol1/coinbase_trades_20230911_btcusd.parquet
/mnt/vol1/coinbase_trades_20230910_btcusd.parquet
/mnt/vol1/coinbase_trades_20230

In [None]:
"""
Read in example week of data
"""

# Inclusive window [start_date, end_date], given as YYYYMMDD strings
start_date = "20240728"
end_date = "20240803"
# Not used further in this cell; kept because other cells may rely on it.
date_range = pd.date_range(start_date, end_date) # inclusive on both

# Pick the per-day files whose date falls inside the window
in_window = (
    (df_trades_files['date'] >= pd.to_datetime(start_date))&
    (df_trades_files['date'] <= pd.to_datetime(end_date))
)
matched_files = df_trades_files.loc[in_window, 'file'].tolist()
if not matched_files:
    # pd.concat([]) would fail with an opaque "No objects to concatenate";
    # raise the same exception type with a message that names the window.
    raise ValueError(
        f"No trade files found between {start_date} and {end_date}"
    )

# Read every matched file and stack them into one frame, ordered by trade time
df_data = pd.concat([ # may be slow
    pd.read_parquet(f)
    for f in tqdm(sorted(matched_files), total=len(matched_files))
]).sort_values(by=['timestamp','local_timestamp']).reset_index(drop=True)
# note: sorting by timestamp does not guarantee local_timestamp is sorted (delay of information)
# for historical analysis: use timestamp (real time of trade)
# for trading: use local_timestamp (real time we got it)
logger.info(f"Read in shape: {df_data.shape}")

# Derived columns
logger.info("Adding addition info")
# Floor down to nearest hour. Vectorised .dt.floor replaces a per-row Python
# lambda; it also clears any sub-microsecond part, which
# Timestamp.replace(minute=0, second=0, microsecond=0) would leave in place.
df_data['timestamp_1h'] = df_data['timestamp'].dt.floor('h')

print(df_data.dtypes)
df_data.head()

100%|██████████| 7/7 [00:00<00:00, 14.32it/s]
[32m2024-08-08 11:23:42.006[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m22[0m - [1mRead in shape: (3899033, 9)[0m


In [47]:
"""
Nb. (buys, sells) at each hour
"""

# Count trades per (hour bucket, side) and pivot the sides into columns, so
# each row is one hour with one count column per side value.
# Hours in which a side has no trades get 0 rather than NaN.
# Relies on the 'timestamp_1h' column added when df_data was loaded.
df_side_counts_1h = (
    df_data
    .groupby(['timestamp_1h', 'side'])
    .size()
    .unstack('side', fill_value=0)
)
df_side_counts_1h

Unnamed: 0,exchange,symbol,timestamp,local_timestamp,id,side,price,amount,date
0,coinbase,BTC-USD,2024-07-28 00:00:00.351319+00:00,2024-07-28 00:00:00.390004+00:00,669926906,buy,67901.81,2.1e-05,2024-07-28
1,coinbase,BTC-USD,2024-07-28 00:00:01.332798+00:00,2024-07-28 00:00:01.371314+00:00,669926907,buy,67901.8,0.0001,2024-07-28
2,coinbase,BTC-USD,2024-07-28 00:00:01.709024+00:00,2024-07-28 00:00:01.747253+00:00,669926908,buy,67901.79,8.7e-05,2024-07-28
3,coinbase,BTC-USD,2024-07-28 00:00:01.900599+00:00,2024-07-28 00:00:01.938845+00:00,669926909,buy,67901.79,0.000204,2024-07-28
4,coinbase,BTC-USD,2024-07-28 00:00:02.083781+00:00,2024-07-28 00:00:02.122448+00:00,669926910,sell,67897.22,0.012,2024-07-28
