See readme.md for ideal fields and descriptions

In [None]:
# Installs can be run here once the conda env is activated
# ! pip install pandas pyarrow
# ! pip install polars
# ! pip install maturin
# ! pip install cryo
# ! pip install web3

In [None]:
import os
import glob
from datetime import datetime, timedelta, timezone
import time
import pyarrow.parquet as pq
import pandas as pd
import cryo
import sys
sys.path.append("../../helper_functions")
import web3py_utils as w3py
import os_utils as osu
sys.path.pop()
import polars as pl
# test adding cryo_cli python

In [None]:
# Folder that is handed to cryo as its output directory and later searched
# for parquet files (relative to the notebook's working directory).
output_directory = 'cryo_outputs/'
# clear out
# NOTE(review): presumably osu.clear_folder removes the folder's existing
# contents so stale files from an earlier run are not re-read — confirm in os_utils.
osu.clear_folder(output_directory)

In [None]:
# Chain presets. Each entry holds the RPC endpoint, the label used in output
# file names, the nominal block time in seconds and a +/- buffer (seconds)
# used to bracket the blocks-per-day estimate.
chain_presets = {
    'redstone': {
        'rpc_url': 'https://rpc.redstonechain.com',
        'chain_name': 'redstone',
        'block_time_sec': 2,
        'block_time_buffer': 0,
    },
    'lyra': {
        'rpc_url': 'https://rpc.lyra.finance/',
        'chain_name': 'lyra',
        'block_time_sec': 2,
        'block_time_buffer': 0,
    },
    'degen': {
        'rpc_url': 'https://rpc.degen.tips',
        'chain_name': 'degen',
        'block_time_sec': 0.383,  # Note: Arb Stack not deterministic
        'block_time_buffer': 0.25,
    },
}

# Chain pulled by this run (change the key to switch chains)
config_chain = chain_presets['redstone']

###
# Run settings
whole_day_only = False  # True: end the window at 00:00 UTC of the current day
trailing_days = 0.25  # length of the window, in days (fractions allowed)
dry_run = False
fields = ['blocks', 'txs']
requests_per_second_max = 500  # -1 means ignore


In [None]:
# Intermediate calculations: unpack the chain config and estimate how many
# blocks are produced per day (nominal, low and high estimates).
rpc_url, chain_name = config_chain['rpc_url'], config_chain['chain_name']
block_time_sec = config_chain['block_time_sec']
block_time_buffer = config_chain['block_time_buffer']

seconds_per_day = 60 * 60 * 24
slowest_block_time = block_time_sec + block_time_buffer
fastest_block_time = block_time_sec - block_time_buffer

blocks_per_day = seconds_per_day / block_time_sec
# Longer block time -> fewer blocks per day (low estimate)
blocks_per_day_lo = seconds_per_day / slowest_block_time
# Shorter block time -> more blocks per day (high estimate); when the buffer
# is at least as large as the block time, fall back to 0.01 s per block
blocks_per_day_hi = seconds_per_day / (fastest_block_time if fastest_block_time > 0 else 0.01)

print(blocks_per_day)

In [None]:
# Init timestamps

# Current time as a timezone-aware UTC datetime. A naive value (as returned by
# datetime.utcnow()) is treated as *local* time by .timestamp(), which would
# shift any epoch bounds derived from it by the machine's UTC offset.
current_time_utc = datetime.now(timezone.utc)
print(current_time_utc)

# If only whole days are wanted, shift the ending time to 00:00 UTC of today.
# Midnight is derived from the same instant as current_time_utc so the two
# values cannot straddle a day boundary.
if whole_day_only:
    current_date_utc = current_time_utc.replace(hour=0, minute=0, second=0, microsecond=0)
    time_difference = current_time_utc - current_date_utc
    difference_days_fraction = time_difference.total_seconds() / (24 * 3600)  # There are 86400 seconds in a day
else:
    current_date_utc = current_time_utc
    difference_days_fraction = 0
print('day fraction :' + str(difference_days_fraction))

print(current_date_utc)
starting_date_utc = current_date_utc - timedelta(days=trailing_days)

current_block = w3py.getLatestBlockNumber(rpc_url)

# Step back from the latest block to the window end using the low
# blocks-per-day estimate, then back to the window start using the high
# estimate (the high estimate reaches further back in block numbers).
ending_block = int(current_block - (difference_days_fraction * blocks_per_day_lo))
starting_block = int(ending_block - (trailing_days * blocks_per_day_hi))

print('current: ' + str(int(current_block)))
print('end: ' + str(int(ending_block)))
print('start: ' + str(int(starting_block)))

In [None]:
def _epoch_seconds_utc(moment):
    """Return *moment* as whole Unix seconds, treating a naive datetime as UTC."""
    # .timestamp() on a naive datetime assumes local time; pin it to UTC first.
    if moment.tzinfo is None:
        moment = moment.replace(tzinfo=timezone.utc)
    return int(moment.timestamp())


# Epoch-second bounds of the requested window (used later to filter results)
start_timestamp = _epoch_seconds_utc(starting_date_utc)
end_timestamp = _epoch_seconds_utc(current_date_utc)

# Wall-clock reference for timing the data pull
start_time = time.time()

In [8]:
# Pull the configured datatypes with cryo and save them as parquet files.

# Optional arguments are only passed when enabled, so cryo's own defaults
# apply otherwise. requests_per_second_max == -1 means "ignore the limit".
freeze_options = {}
if requests_per_second_max > -1:
    freeze_options['requests_per_second'] = requests_per_second_max

data = cryo.freeze(
    fields,  # datatypes come from the run settings instead of being hard-coded
    blocks=[f"{starting_block}:{ending_block}"],
    rpc=rpc_url,
    output_dir=output_directory,
    file_format='parquet',
    label=chain_name,
    hex=True,
    dry=dry_run,
    **freeze_options,
)

In [None]:
# Time spent since start_time was recorded, in seconds
end_time = time.time()
elapsed_time = end_time - start_time
print("Elapsed time: {:.4f} seconds".format(elapsed_time))

In [None]:
# Move this to a cryo_utils eventually

def load_parquet_files(chain_name, data_type_name, data_directory):
    """Load all matching parquet files from a directory into one DataFrame.

    Files are selected with the glob pattern
    ``*{data_type_name}__{chain_name}*.parquet`` inside ``data_directory``.

    Returns the collected polars DataFrame, or ``None`` (after printing a
    message) when no file matches.
    """
    glob_pattern = os.path.join(
        data_directory, f"*{data_type_name}__{chain_name}*.parquet"
    )
    matches = glob.glob(glob_pattern)

    # Nothing on disk: report it and let the caller decide what to do
    if not matches:
        print("No files found matching the pattern.")
        return None

    # Scan lazily so further transformations could be added before collecting,
    # for example: lazy_frame = lazy_frame.filter(pl.col("some_column") > 0)
    lazy_frame = pl.scan_parquet(matches)
    return lazy_frame.collect()

In [None]:
# Read the parquet files written by cryo
txs = load_parquet_files(chain_name, 'transactions', output_directory)
blocks = load_parquet_files(chain_name, 'blocks', output_directory)

# load_parquet_files returns None when nothing matched (for example after a
# dry run); fail with a clear message instead of an AttributeError below.
missing_inputs = [
    label for label, frame in (('transactions', txs), ('blocks', blocks))
    if frame is None
]
if missing_inputs:
    raise FileNotFoundError(
        f"No parquet files for {', '.join(missing_inputs)} "
        f"(chain '{chain_name}') found in '{output_directory}'"
    )

# Rename the blocks' 'gas_used' column to 'block_gas_used' so it does not
# clash with the per-transaction 'gas_used' column after the join
blocks = blocks.rename({"gas_used": "block_gas_used"})

# One row per transaction, with its block's columns attached. The inner join
# keeps only blocks that have at least one matching transaction.
joined_df = blocks.join(
    txs,
    on=["block_number", "chain_id"],
    how="inner",
)

# Convert the Unix timestamp (seconds) to a datetime column 'timestamp_dt',
# then truncate it to the day as 'timestamp_date'. Two steps are required
# because the second column is derived from the first.
joined_df = joined_df.with_columns(
    pl.from_epoch("timestamp", time_unit="s").alias("timestamp_dt")
).with_columns(
    pl.col("timestamp_dt").dt.truncate("1d").alias("timestamp_date")
)

In [None]:
# Inspect the joined frame: column names/dtypes and the object type.
# (Uncomment to inspect the inputs as well.)
# print(blocks.schema)
# print(txs.schema)
print(joined_df.schema)
print(type(joined_df))

# Test output: pandas copy of the joined data
joined_pd = joined_df.to_pandas()
# print(joined_pd.tail(5))

print(f"num blocks: {joined_pd['block_number'].nunique()}")

In [None]:
# Daily aggregates per chain, computed from 'joined_df' (one row per transaction).

# Rows with a positive gas price are counted as user transactions
is_user_tx = pl.col("gas_price") > 0
n_blocks = pl.col("block_number").n_unique()

# Fee maths is done in Float64. Assuming the source columns are 64-bit
# integers (TODO confirm against the schema printed earlier), wei-scale
# products and their daily sums could wrap, and gas_price - base_fee could
# underflow on unsigned dtypes. Float64 avoids both; the results are divided
# by 1e18 (wei -> ETH) and are floats anyway.
gas_used_f = pl.col("gas_used").cast(pl.Float64)
gas_price_f = pl.col("gas_price").cast(pl.Float64)
base_fee_f = pl.col("base_fee_per_gas").cast(pl.Float64)

result_df = joined_df.group_by([pl.col("timestamp_date"), pl.col("chain_id")]).agg(
    num_blocks=n_blocks,
    min_block_number=pl.col("block_number").min(),
    max_block_number=pl.col("block_number").max(),
    min_block_time=pl.col("timestamp").min(),
    max_block_time=pl.col("timestamp").max(),

    # count() skips the nulls produced where the condition is false
    num_user_transactions=
        pl.when(is_user_tx).then(pl.col("transaction_hash")).count(),
    num_success_user_transactions=
        pl.when(is_user_tx & pl.col("success")).then(pl.col("transaction_hash")).count(),
    num_senders=pl.col("from_address").filter(is_user_tx).n_unique(),

    total_gas_used=pl.col("gas_used").sum(),
    user_gas_used=pl.col("gas_used").filter(is_user_tx).sum(),
    total_gas_used_per_block=pl.col("gas_used").sum() / n_blocks,
    user_gas_used_per_block=pl.col("gas_used").filter(is_user_tx).sum() / n_blocks,

    l2_fees_base_fees_eth=(base_fee_f * gas_used_f).sum() / 1e18,
    l2_fees_priority_fees_eth=
        pl.when(is_user_tx).then((gas_price_f - base_fee_f) * gas_used_f).sum() / 1e18,
    l2_fees_total_fees_eth=(gas_price_f * gas_used_f).sum() / 1e18,
)
result_df = result_df.to_pandas()

In [None]:
# Filter: keep only the groups whose first block time lies inside the
# requested window (both bounds inclusive)
in_window = result_df['min_block_time'].between(start_timestamp, end_timestamp)
result_df = result_df[in_window]
#seems like 1 block before gets pulled. yolo.

In [None]:
# Add datetime versions of the epoch-second block time bounds
for time_col in ('min_block_time', 'max_block_time'):
    result_df[time_col + '_dt'] = pd.to_datetime(result_df[time_col], unit='s')

# Show the result, most recent day first
display(result_df.sort_values(by='timestamp_date', ascending=False))