## Market trades, summary statistics

This is a rough summary of market trades data.

In [1]:
from aspidoceleon.bigquery import BigQuery
from aspidoceleon.dataframe import convert_timestamp_columns
from charadrius.const import Const
from loguru import logger
import pandas as pd
import datetime
import re
import os
import glob
constants = Const()
bigquery = BigQuery()

# Working directory for this notebook's files (created if missing)
workdir = f"{constants.PROJECT_WORKDIR}/market_trades"
os.makedirs(workdir, exist_ok=True)
logger.info(f"workdir: {workdir}")

# This week, Sunday to today inclusive.
# Read the clock once so TODAY and WEEK_START are derived from the same
# instant and cannot disagree if the cell happens to run across midnight.
_now = datetime.datetime.now()
TODAY = _now.strftime(constants.DATE_FMT_NODASH)
# A "W-SAT" period ends on Saturday, so its start_time is the preceding Sunday.
WEEK_START = pd.to_datetime(_now).to_period("W-SAT").start_time.strftime(constants.DATE_FMT_NODASH)
logger.info(f"Date range: {WEEK_START}, {TODAY}")

### Read in real-time data

In [None]:
"""
Pull the real-time (RT) market trades for the current date range
"""
# Parquet path handed to query_cache; per the original note, removing this
# file is the way to skip the cache (NOTE(review): confirm query_cache semantics).
fp_rt = f"{workdir}/market_trades_{WEEK_START}_{TODAY}.parquet"
# Wildcard table query restricted to the date-suffixed tables in range
sql_rt = f"""
select
    *
from `fastitocalon.coinbase.rt_coinbase_market_trades_*`
where _table_suffix between '{WEEK_START}' and '{TODAY}'
"""
df_rt = convert_timestamp_columns(
    bigquery.query_cache(sql=sql_rt, fp=fp_rt)
)
logger.info(f"fetched shape {df_rt.shape}")

In [None]:
"""
Real-time data at a glance: shape, local_timestamp range, column dtypes, first rows
"""
print(
    df_rt.shape,
    df_rt['local_timestamp'].agg(['min', 'max']),
    df_rt.dtypes,
    sep="\n",
)
# Last expression of the cell, so the notebook renders the preview
df_rt.head()

### Read in batch data

In [None]:
"""
Read in batch data for the given time period (all currencies).

Convert to the same format as the live data:
> remove exchange (redundant)
> convert symbols: lower, remove dash

Note that the batch files contain no snapshot rows, whereas the realtime data
does (stream is too fast).
"""


def _batch_file_date(path):
    """Return the date embedded in a batch file's name, or None if it has none."""
    # Look at the file name only, so digits in a directory name can never be
    # mistaken for the date.
    found = re.search(r"2[0-9]+", os.path.basename(path))
    return pd.to_datetime(found.group(0)) if found else None


_range_start = pd.to_datetime(WEEK_START)
_range_end = pd.to_datetime(TODAY)

# Sort the glob result so the row order of df_batch is reproducible between runs.
_batch_files = []
for f in sorted(glob.glob("/mnt/vol1/coinbase_trades*")):
    _file_date = _batch_file_date(f)
    # Files without a recognisable date are skipped instead of raising IndexError.
    if _file_date is not None and _range_start <= _file_date <= _range_end:
        _batch_files.append(f)

if not _batch_files:
    # pd.concat on an empty list raises an opaque "No objects to concatenate";
    # fail with the same exception type but say what was actually missing.
    raise ValueError(
        f"No batch files matching /mnt/vol1/coinbase_trades* between {WEEK_START} and {TODAY}"
    )

df_batch = pd.concat(
    [pd.read_parquet(f) for f in _batch_files]
).drop(columns=['exchange', 'date'])  # redundant
df_batch['is_snapshot'] = False  # no snapshots, actually all real data
# Vectorised string ops: same result as the per-row lambda, but null-safe.
df_batch['symbol'] = df_batch['symbol'].str.replace("-", "", regex=False).str.lower()
# Align column selection and order with the real-time frame
df_batch = df_batch[df_rt.columns]

In [None]:
"""
Batch data at a glance: shape, local_timestamp range, column dtypes, first rows
"""
print(
    df_batch.shape,
    df_batch['local_timestamp'].agg(['min', 'max']),
    df_batch.dtypes,
    sep="\n",
)
# Last expression of the cell, so the notebook renders the preview
df_batch.head()