See readme.md for ideal fields and descriptions

In [1]:
# ! pip install pandas pyarrow
# ! pip install polars

In [2]:
import subprocess
import os
from datetime import datetime, timedelta
import time
import pyarrow.parquet as pq
import pandas as pd


In [3]:
import polars as pl

In [4]:
# Init timestamps
trailing_days = 2

current_date_txt = datetime.utcnow().date()
current_date = datetime(current_date_txt.year, current_date_txt.month, current_date_txt.day)
previous_date = current_date - timedelta(days=trailing_days)
print('end: ' + str(current_date))
print('start: ' + str(previous_date))

end: 2023-12-17 00:00:00
start: 2023-12-15 00:00:00


In [5]:
# Test doing t24h for Lyra

fields = 'blocks txs'# traces'
rpc_url = 'https://rpc.lyra.finance/'
chain_name = 'lyra'
start_timestamp = int(previous_date.timestamp())
end_timestamp = int(current_date.timestamp())

dry_run = 0

In [6]:
# Generate Command
if dry_run == 1:
    dry_txt = '--dry'
else:
    dry_txt = ''

command = f"cryo {fields} --rpc {rpc_url} --timestamps {start_timestamp}:{end_timestamp} --subdirs datatype --label {chain_name} {dry_txt}"
print(command)

cryo blocks txs --rpc https://rpc.lyra.finance/ --timestamps 1702616400:1702789200 --subdirs datatype --label lyra 


In [7]:
start_time = time.time()
# Run the command using subprocess.run and capture the output
result = subprocess.run(
    command, 
    shell=True, 
    stdout=subprocess.PIPE,  # Capture standard output
    stderr=subprocess.PIPE,  # Capture standard error
    text=True  # Capture output as text (Python 3.7+)
)

# Display the captured output
if result.returncode == 0:
    print("Command succeeded. Output:")
    print(result.stdout)
# else:
#     print("Command failed. Error output:")
#     print(result.stderr)

end_time = time.time()

In [8]:
# Calculate the elapsed time
elapsed_time = end_time - start_time
# Print the elapsed time in seconds
print(f"Elapsed time: {elapsed_time:.4f} seconds")

Elapsed time: 178.5445 seconds


In [15]:
# # Read parquet files
txs = pl.scan_parquet('transactions__' + chain_name + '/*.parquet')
blocks = pl.scan_parquet('blocks__' + chain_name + '/*.parquet')

# Rename the 'gas_used' column to 'block_gas_used' in the 'blocks' DataFrame
blocks = blocks.rename({"gas_used": "block_gas_used"})

# Perform the join on 'block_number' and 'chain_id'
joined_df = blocks.join(
    txs,
    on=["block_number", "chain_id"],
    how="inner"  # You can specify the type of join you want (inner, outer, left, right)
)

# Convert Unix timestamp to datetime and create a new column 'timestamp_dt'
joined_df = joined_df.with_columns(
    pl.from_epoch("timestamp", time_unit="s").alias("timestamp_dt")
)

# Truncate the 'timestamp_dt' column to the day and create a new column 'timestamp_date'
joined_df = joined_df.with_columns(
    pl.col("timestamp_dt").dt.truncate("1d").alias("timestamp_date")
)

In [16]:
# print(blocks.schema)
# print(txs.schema)
print(joined_df.schema)

#test output
joined_pd = joined_df.collect().to_pandas()
joined_pd.head(5)

OrderedDict([('block_hash', Binary), ('author', Binary), ('block_number', UInt32), ('block_gas_used', UInt64), ('extra_data', Binary), ('timestamp', UInt32), ('base_fee_per_gas', UInt64), ('chain_id', UInt64), ('transaction_index', UInt64), ('transaction_hash', Binary), ('nonce', UInt64), ('from_address', Binary), ('to_address', Binary), ('value_binary', Binary), ('value_string', Utf8), ('value_f64', Float64), ('input', Binary), ('gas_limit', UInt64), ('gas_used', UInt64), ('gas_price', UInt64), ('transaction_type', UInt32), ('max_priority_fee_per_gas', UInt64), ('max_fee_per_gas', UInt64), ('success', Boolean), ('timestamp_dt', Datetime(time_unit='us', time_zone=None)), ('timestamp_date', Datetime(time_unit='us', time_zone=None))])


Unnamed: 0,block_hash,author,block_number,block_gas_used,extra_data,timestamp,base_fee_per_gas,chain_id,transaction_index,transaction_hash,...,input,gas_limit,gas_used,gas_price,transaction_type,max_priority_fee_per_gas,max_fee_per_gas,success,timestamp_dt,timestamp_date
0,b'\x1c\xde\xe7M\xc7\xa3&\xb0.\xc6\x12/\xc5<\xa...,b'B\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x0...,1297392,46913,b'',1702616399,50,957,0,"b'\x1dj\xdb0\xdc""\xce\xc1\xe8\x89\x9b\x18\xf2\...",...,"b""\x01]\x8e\xb9\x00\x00\x00\x00\x00\x00\x00\x0...",1000000,46913,0,126,,,True,2023-12-15 04:59:59,2023-12-15
1,b'\xbe\x86\xfe\xda\x13/\xf5i\xbcpT\xbfP\x95\xd...,b'B\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x0...,1297393,46913,b'',1702616401,50,957,0,b'\xc1\x13\xb2\xb8\xf0\x81P\x1e\xd7\x8e\xa5\x0...,...,"b""\x01]\x8e\xb9\x00\x00\x00\x00\x00\x00\x00\x0...",1000000,46913,0,126,,,True,2023-12-15 05:00:01,2023-12-15
2,b'\x94w\xbdctZ\xb1\xe3O\xa6@e\x8bOtuE\xd2\xbeP...,b'B\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x0...,1297394,50501,b'',1702616403,50,957,0,"b""\xcb\xfb\x9fs\xccL\xb8\xf7\xae-\x06\xcf\r\xa...",...,"b""\x01]\x8e\xb9\x00\x00\x00\x00\x00\x00\x00\x0...",1000000,50501,0,126,,,True,2023-12-15 05:00:03,2023-12-15
3,b'\xa84\xa4;\x94<t\xda\xfa?\xb5\xa6\xf1\xea\x9...,b'B\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x0...,1297395,64013,b'',1702616405,50,957,0,b'\x0cD\x9fq\x95+_\xf7\xc7\xd1\xd1\xc9\xa9e\x1...,...,"b""\x01]\x8e\xb9\x00\x00\x00\x00\x00\x00\x00\x0...",1000000,64013,0,126,,,True,2023-12-15 05:00:05,2023-12-15
4,"b'\x07\x8d\xfe\xfc,\x92C\xdc\xf5\x01\xc8\x0e<+...",b'B\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x0...,1297396,46913,b'',1702616407,50,957,0,b')X^fa1\xf4M3I\x87|\xbd\x9bVUc\x07Y\x96\xcbV\...,...,"b""\x01]\x8e\xb9\x00\x00\x00\x00\x00\x00\x00\x0...",1000000,46913,0,126,,,True,2023-12-15 05:00:07,2023-12-15


In [17]:
# Assuming you have a DataFrame named 'joined_df' with the required columns

result_df = joined_df.group_by(pl.col("timestamp_date")).agg(
    num_blocks=pl.col("block_number").n_unique(),
    num_user_transactions=
        pl.when(pl.col("gas_price") > 0).then(pl.col("transaction_hash")).n_unique(),
    num_success_user_transactions=
        pl.when((pl.col("gas_price") > 0) & pl.col("success")).then(pl.col("transaction_hash")).n_unique(),
    total_gas_used=pl.col("gas_used").sum(),
    user_gas_used=pl.col("gas_used").filter(pl.col("gas_price") > 0).sum(),
    num_senders=pl.col("from_address").filter(pl.col("gas_price") > 0).n_unique(),
    l2_fees_base_fees=(pl.col("base_fee_per_gas") * pl.col("gas_used")).sum(),
    l2_fees_priority_fees=pl.when(pl.col("gas_price") > 0).then((pl.col("gas_price") - pl.col("base_fee_per_gas")) * pl.col("gas_used")).sum(),
    l2_fees_total_fees=(pl.col("gas_price") * pl.col("gas_used")).sum(),
)
result_df

In [None]:
#Execute and turn to pandas
result_df = result_df.collect().to_pandas()

In [19]:

display(result_df.sort_values(by='timestamp_date',ascending=False))

Unnamed: 0,timestamp_date,num_blocks,num_user_transactions,num_success_user_transactions,total_gas_used,user_gas_used,num_senders,l2_fees_base_fees,l2_fees_priority_fees,l2_fees_total_fees
0,2023-12-17,8015,19,19,436133735,33780608,7,21806686750,5718083300000,5719772330400
2,2023-12-16,40200,117,117,2179889899,163502039,12,108994494950,1442751656500000,1442759831601950
1,2023-12-15,30201,168,168,1715418008,199521739,15,85770900400,259502570600000,259512546686950
