See readme.md for ideal fields and descriptions

In [1]:
# ! pip install pandas pyarrow
# ! pip install polars

In [2]:
import subprocess
import os
from datetime import datetime, timedelta, timezone
import time
import pyarrow.parquet as pq
import pandas as pd


In [3]:
import polars as pl

In [4]:
# Init timestamps
trailing_days = 1

current_date_utc = datetime.utcnow().date()
# Convert the current date to a Unix timestamp (remaining in UTC)
current_timestamp_utc = datetime(current_date_utc.year, current_date_utc.month, current_date_utc.day, tzinfo=timezone.utc).timestamp()
previous_date_utc = current_date_utc - timedelta(days=trailing_days)
previous_timestamp_utc = datetime(previous_date_utc.year, previous_date_utc.month, previous_date_utc.day, tzinfo=timezone.utc).timestamp()

print('end: ' + str(current_date_utc))
print('start: ' + str(previous_date_utc))

end: 2023-12-17
start: 2023-12-16


In [5]:
# Test doing t24h for Lyra

fields = 'blocks txs'# traces'
rpc_url = 'https://rpc.lyra.finance/'
chain_name = 'lyra'
start_timestamp = int(previous_timestamp_utc)
end_timestamp = int(current_timestamp_utc)

dry_run = 0

In [6]:
# Generate Command
if dry_run == 1:
    dry_txt = '--dry'
else:
    dry_txt = ''

command = f"cryo {fields} --rpc {rpc_url} --timestamps {start_timestamp}:{end_timestamp} --subdirs datatype --label {chain_name} {dry_txt}"
print(command)

cryo blocks txs --rpc https://rpc.lyra.finance/ --timestamps 1702684800:1702771200 --subdirs datatype --label lyra 


In [7]:
start_time = time.time()
# Run the command using subprocess.run and capture the output
result = subprocess.run(
    command, 
    shell=True, 
    stdout=subprocess.PIPE,  # Capture standard output
    stderr=subprocess.PIPE,  # Capture standard error
    text=True  # Capture output as text (Python 3.7+)
)

# Display the captured output
if result.returncode == 0:
    print("Command succeeded. Output:")
    print(result.stdout)
# else:
#     print("Command failed. Error output:")
#     print(result.stderr)

end_time = time.time()

In [8]:
# Calculate the elapsed time
elapsed_time = end_time - start_time
# Print the elapsed time in seconds
print(f"Elapsed time: {elapsed_time:.4f} seconds")

Elapsed time: 93.5411 seconds


In [9]:
# # Read parquet files
txs = pl.scan_parquet('transactions__' + chain_name + '/*.parquet')
blocks = pl.scan_parquet('blocks__' + chain_name + '/*.parquet')

# Rename the 'gas_used' column to 'block_gas_used' in the 'blocks' DataFrame
blocks = blocks.rename({"gas_used": "block_gas_used"})

# Perform the join on 'block_number' and 'chain_id'
joined_df = blocks.join(
    txs,
    on=["block_number", "chain_id"],
    how="inner"  # You can specify the type of join you want (inner, outer, left, right)
)

# Convert Unix timestamp to datetime and create a new column 'timestamp_dt'
joined_df = joined_df.with_columns(
    pl.from_epoch("timestamp", time_unit="s").alias("timestamp_dt")
)

# Truncate the 'timestamp_dt' column to the day and create a new column 'timestamp_date'
joined_df = joined_df.with_columns(
    pl.col("timestamp_dt").dt.truncate("1d").alias("timestamp_date")
)

In [10]:
# print(blocks.schema)
# print(txs.schema)
print(joined_df.schema)

#test output
joined_pd = joined_df.collect().to_pandas()
joined_pd.tail(5)

OrderedDict([('block_hash', Binary), ('author', Binary), ('block_number', UInt32), ('block_gas_used', UInt64), ('extra_data', Binary), ('timestamp', UInt32), ('base_fee_per_gas', UInt64), ('chain_id', UInt64), ('transaction_index', UInt64), ('transaction_hash', Binary), ('nonce', UInt64), ('from_address', Binary), ('to_address', Binary), ('value_binary', Binary), ('value_string', Utf8), ('value_f64', Float64), ('input', Binary), ('gas_limit', UInt64), ('gas_used', UInt64), ('gas_price', UInt64), ('transaction_type', UInt32), ('max_priority_fee_per_gas', UInt64), ('max_fee_per_gas', UInt64), ('success', Boolean), ('timestamp_dt', Datetime(time_unit='us', time_zone=None)), ('timestamp_date', Datetime(time_unit='us', time_zone=None))])


Unnamed: 0,block_hash,author,block_number,block_gas_used,extra_data,timestamp,base_fee_per_gas,chain_id,transaction_index,transaction_hash,...,input,gas_limit,gas_used,gas_price,transaction_type,max_priority_fee_per_gas,max_fee_per_gas,success,timestamp_dt,timestamp_date
32299,b'\xd4R\x93w@H\xd3|\x04j\xd6\xd9\xb5j\x101\xee...,b'B\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x0...,1374788,46913,b'',1702771191,50,957,0,b'N\xf2X\xdf\x03Bj?c\xc06\x087\x19\xa069 \x8f\...,...,"b""\x01]\x8e\xb9\x00\x00\x00\x00\x00\x00\x00\x0...",1000000,46913,0,126,,,True,2023-12-16 23:59:51,2023-12-16
32300,b'\xf1\x1f\n\xa8\xb7\xe4q\x0e=\x82{6|\xf8\x94\...,b'B\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x0...,1374789,46913,b'',1702771193,50,957,0,b'UU\xf0\x82wX\x80\x85\x946\x0f\xe2\x9f=V\x1e&...,...,"b""\x01]\x8e\xb9\x00\x00\x00\x00\x00\x00\x00\x0...",1000000,46913,0,126,,,True,2023-12-16 23:59:53,2023-12-16
32301,b'1AU\x88\x18\xc8\xd7Q9\n\xce\x17$\x07\xfdx3\x...,b'B\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x0...,1374790,46913,b'',1702771195,50,957,0,b'S\xdd6\x03\xb7Jq\xb0\x12\xc0u\x8c\x1c\x82\x9...,...,"b""\x01]\x8e\xb9\x00\x00\x00\x00\x00\x00\x00\x0...",1000000,46913,0,126,,,True,2023-12-16 23:59:55,2023-12-16
32302,"b""\xa6['u\x9d\xefW%}V\x10\xd0\x12\xbd\xe0\x1a\...",b'B\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x0...,1374791,50501,b'',1702771197,50,957,0,b'\xe3\xad\x7f\x9d\xb3|J7\x81\x8f\xd7g\xc3\x06...,...,"b""\x01]\x8e\xb9\x00\x00\x00\x00\x00\x00\x00\x0...",1000000,50501,0,126,,,True,2023-12-16 23:59:57,2023-12-16
32303,b'\xf5\xd7L\xf2f\xfb\xdb\xc8\xe3\x1f\xda\xca\x...,b'B\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x0...,1374792,64013,b'',1702771199,50,957,0,b'\xdd\xb31\xe9\xa4z\x8e\t\x9b\xcbO\x0f\x82\x8...,...,"b""\x01]\x8e\xb9\x00\x00\x00\x00\x00\x00\x00\x0...",1000000,64013,0,126,,,True,2023-12-16 23:59:59,2023-12-16


In [16]:
# Assuming you have a DataFrame named 'joined_df' with the required columns

result_df = joined_df.group_by([pl.col("timestamp_date"), pl.col("chain_id")]).agg(
    num_blocks=pl.col("block_number").n_unique(),
    min_block_number=pl.col("block_number").min(),
    max_block_number=pl.col("block_number").max(),
    min_block_time=pl.col("timestamp").min(),
    max_block_time=pl.col("timestamp").max(),

    num_user_transactions=
        pl.when(pl.col("gas_price") > 0).then(pl.col("transaction_hash")).n_unique(),
    num_success_user_transactions=
        pl.when((pl.col("gas_price") > 0) & pl.col("success")).then(pl.col("transaction_hash")).n_unique(),
    num_senders=pl.col("from_address").filter(pl.col("gas_price") > 0).n_unique(),

    total_gas_used=pl.col("gas_used").sum(),
    user_gas_used=pl.col("gas_used").filter(pl.col("gas_price") > 0).sum(),
    
    l2_fees_base_fees_eth=(pl.col("base_fee_per_gas") * pl.col("gas_used")).sum() / 1e18,
    l2_fees_priority_fees_eth=pl.when(pl.col("gas_price") > 0).then((pl.col("gas_price") - pl.col("base_fee_per_gas")) * pl.col("gas_used")).sum() / 1e18,
    l2_fees_total_fees_eth=(pl.col("gas_price") * pl.col("gas_used")).sum() / 1e18,
)
result_df

In [17]:
#Execute and turn to pandas
result_df = result_df.collect().to_pandas()


In [18]:
# Filter
result_df = result_df[result_df['min_block_time']>= start_timestamp] #seems like 1 block before gets pulled. yolo.

In [19]:
result_df['min_block_time_dt'] = pd.to_datetime(result_df['min_block_time'], unit='s')
result_df['max_block_time_dt'] = pd.to_datetime(result_df['min_block_time'], unit='s')
display(result_df.sort_values(by='timestamp_date',ascending=False))

Unnamed: 0,timestamp_date,chain_id,num_blocks,min_block_number,max_block_number,min_block_time,max_block_time,num_user_transactions,num_success_user_transactions,num_senders,total_gas_used,user_gas_used,l2_fees_base_fees_eth,l2_fees_priority_fees_eth,l2_fees_total_fees_eth,min_block_time_dt,max_block_time_dt
1,2023-12-16,957,32200,1331593,1374792,1702684801,1702771199,104,104,12.0,1756986046,142090722,8.78493e-08,0.001439,0.001439,2023-12-16 00:00:01,2023-12-16 00:00:01
