In [1]:
from op_datasets.processing.execute import reader, BatchInput

# Load only the first batch that the reader yields for this block range.
# NOTE(review): the original comment said this range covers multiple days, but the
# recorded run in this notebook wrote a single dt partition — confirm the range.
_inputbatch = next(
    iter(reader(chain="op", block_spec="105376883:+500", source_spec="goldsky")),
    None,
)
if _inputbatch is None:
    # Fail here with a clear message instead of an AttributeError on None below.
    raise RuntimeError("reader() yielded no batches for block_spec '105376883:+500'")

inputbatch: BatchInput = _inputbatch

# Frames for this batch, keyed by table name.
dataframes = inputbatch.dataframes

[2m2024-10-15 14:01:48[0m [[32m[1minfo     [0m] [1mLoaded env var: OP_ANALYTICS_VAULT[0m
[2m2024-10-15 14:01:49[0m [[32m[1minfo     [0m] [1mLoaded 5 items into vault.    [0m
[2m2024-10-15 14:01:50[0m [[32m[1minfo     [0m] [1mInitialized Clickhouse client.[0m
[2m2024-10-15 14:01:50[0m [[32m[1minfo     [0m] [1mQuery success: blocks in 1.39s 2000 rows[0m
[2m2024-10-15 14:01:52[0m [[32m[1minfo     [0m] [1mQuery success: logs in 3.23s 59381 rows[0m
[2m2024-10-15 14:01:53[0m [[32m[1minfo     [0m] [1mQuery success: transactions in 3.88s 14166 rows[0m


In [2]:
# Inspect the loaded frames: a mapping keyed by table name (e.g. 'blocks', 'transactions').
dataframes

{'blocks': shape: (2_000, 24)
 ┌───────┬─────────┬──────────┬────────────┬───┬──────────┬───────────┬────────────┬────────────────┐
 │ chain ┆ network ┆ chain_id ┆ dt         ┆ … ┆ gas_used ┆ gas_limit ┆ extra_data ┆ transaction_co │
 │ ---   ┆ ---     ┆ ---      ┆ ---        ┆   ┆ ---      ┆ ---       ┆ ---        ┆ unt            │
 │ str   ┆ str     ┆ i32      ┆ str        ┆   ┆ i64      ┆ i64       ┆ str        ┆ ---            │
 │       ┆         ┆          ┆            ┆   ┆          ┆           ┆            ┆ i64            │
 ╞═══════╪═════════╪══════════╪════════════╪═══╪══════════╪═══════════╪════════════╪════════════════╡
 │ op    ┆ mainnet ┆ 10       ┆ 2023-06-09 ┆ … ┆ 676868   ┆ 30000000  ┆ 0x         ┆ 5              │
 │ op    ┆ mainnet ┆ 10       ┆ 2023-06-09 ┆ … ┆ 1149120  ┆ 30000000  ┆ 0x         ┆ 7              │
 │ op    ┆ mainnet ┆ 10       ┆ 2023-06-09 ┆ … ┆ 3368746  ┆ 30000000  ┆ 0x         ┆ 9              │
 │ op    ┆ mainnet ┆ 10       ┆ 2023-06-09 ┆ … ┆ 417

In [3]:
import polars as pl

In [4]:
def daily_address_summary(
    df: pl.DataFrame, conditions: dict[str, callable], **groupby_args: str
) -> pl.DataFrame:
    """Filter a transactions frame and aggregate it into one row per group.

    Args:
        df: Frame to summarize. The query reads the columns ``hash``,
            ``receipt_status``, ``block_number``, ``nonce``, ``receipt_gas_used``,
            ``block_timestamp`` and ``to_address``, plus every group-by column.
        conditions: Maps a column name to a function that receives
            ``pl.col(<name>)`` and returns a boolean expression. All conditions
            are AND-ed together; an empty dict keeps every row.
        **groupby_args: Columns to group by. Only the *values* are used (as column
            names in ``SELECT`` / ``GROUP BY``); the keyword names are ignored and
            do not rename the output columns.

    Returns:
        A DataFrame with the group-by columns followed by transaction, block,
        nonce, gas, timestamp and to-address aggregates. "success" columns count
        rows where ``receipt_status = 1`` and "fail" columns rows where
        ``receipt_status != 1``.

    Raises:
        ValueError: If no group-by column is supplied (the query would otherwise
            start with a dangling comma and fail to parse).
    """
    if not groupby_args:
        raise ValueError(
            "daily_address_summary requires at least one group-by column, "
            "e.g. address='from_address'"
        )

    # Build a single predicate; pl.lit(True) is the neutral element so that an
    # empty `conditions` dict filters nothing out.
    filter_expr = pl.lit(True)
    for col, condition in conditions.items():
        filter_expr &= condition(pl.col(col))

    _filter_df = df.filter(filter_expr)

    # Only the values of the keyword arguments are used as column names.
    groupby_cols = list(groupby_args.values())
    groupby_clause = ", ".join(groupby_cols)

    query = f"""
    SELECT 
        {groupby_clause}

        -- transactions
        ,COUNT(hash) AS total_txs
        ,COUNT(CASE WHEN receipt_status = 1 THEN hash ELSE NULL END) AS total_txs_success
        ,COUNT(CASE WHEN receipt_status != 1 THEN hash ELSE NULL END) AS total_txs_fail

        -- blocks
        ,COUNT(DISTINCT block_number) AS total_blocks
        ,COUNT(DISTINCT CASE WHEN receipt_status = 1 THEN block_number ELSE NULL END) AS total_blocks_success
        ,COUNT(DISTINCT CASE WHEN receipt_status != 1 THEN block_number ELSE NULL END) AS total_blocks_fail
        ,MIN(block_number) AS min_block_number
        ,MAX(block_number) AS max_block_number
        ,MAX(block_number) - MIN(block_number) + 1 AS block_interval_active

        -- nonce
        ,MIN(nonce) AS min_nonce
        ,MAX(nonce) AS max_nonce
        ,MAX(nonce) - MIN(nonce) + 1 AS nonce_interval_active

        -- gas usage
        ,SUM(receipt_gas_used) AS total_gas_used
        ,SUM(CASE WHEN receipt_status = 1 THEN receipt_gas_used ELSE 0 END) AS total_gas_used_success
        ,SUM(CASE WHEN receipt_status != 1 THEN receipt_gas_used ELSE 0 END) AS total_gas_used_fail

        -- block timestamp
        ,MIN(block_timestamp) AS min_block_timestamp
        ,MAX(block_timestamp) AS max_block_timestamp
        ,MAX(block_timestamp) - MIN(block_timestamp) AS time_interval_active

        -- to addresses, to identify contracts in the future
        ,COUNT(DISTINCT to_address) AS num_to_addresses
        ,COUNT(DISTINCT CASE WHEN receipt_status = 1 THEN to_address ELSE NULL END) AS num_to_addresses_success
        ,COUNT(DISTINCT CASE WHEN receipt_status != 1 THEN to_address ELSE NULL END) AS num_to_addresses_fail

        -- get number of hours active
    
    FROM
        _filter_df
    GROUP BY
        {groupby_clause}
    """

    # Register the filtered frame explicitly under the table name used in the
    # query rather than relying on pl.sql() discovering a local variable by name.
    sql_context = pl.SQLContext(frames={"_filter_df": _filter_df})
    return sql_context.execute(query, eager=True)

In [5]:
# Row filters applied before aggregation: column name -> predicate on that column.
# Keeps only rows whose gas price is strictly positive.
CONDITIONS = {"gas_price": lambda gas_price: gas_price > 0}

In [6]:
# Columns to group the transactions by. Only the values (column names) drive the
# grouping; the keys are passed as keyword names and act as labels.
groupby_columns = {
    "address": "from_address",
    "chain_id": "chain_id",
    "chain": "chain",
    "dt": "dt",
}

# Summarize the filtered transactions frame per group.
result = daily_address_summary(
    dataframes["transactions"], CONDITIONS, **groupby_columns
)

In [7]:
# Preview the first rows of the aggregated summary.
result.head()

from_address,chain_id,chain,dt,total_txs,total_txs_success,total_txs_fail,total_blocks,total_blocks_success,total_blocks_fail,min_block_number,max_block_number,block_interval_active,min_nonce,max_nonce,nonce_interval_active,total_gas_used,total_gas_used_success,total_gas_used_fail,min_block_timestamp,max_block_timestamp,time_interval_active,num_to_addresses,num_to_addresses_success,num_to_addresses_fail
str,i32,str,str,u32,u32,u32,u32,u32,u32,i64,i64,i64,i64,i64,i64,i64,i64,i64,u32,u32,u32,u32,u32,u32
"""0x848f0b533ea9cfcb3f59ec4ea841…",10,"""op""","""2023-06-09""",2,2,0,2,2,0,105377653,105377662,10,1,2,2,302928,302928,0,1686354083,1686354101,18,2,2,0
"""0x368b9d607072165fb207dfe4af46…",10,"""op""","""2023-06-09""",1,1,0,1,1,0,105376287,105376287,1,26,26,1,270700,270700,0,1686351351,1686351351,0,1,1,0
"""0x9477081461ae9a118cdb2bab35fb…",10,"""op""","""2023-06-09""",1,1,0,1,1,0,105377564,105377564,1,1,1,1,21000,21000,0,1686353905,1686353905,0,1,1,0
"""0x3b0350e3c9d5a04450c29e7c3a56…",10,"""op""","""2023-06-09""",2,2,0,2,2,0,105376796,105376833,38,43,44,2,56087,56087,0,1686352369,1686352443,74,2,2,0
"""0x1d3286a3348fa99852d147c57a79…",10,"""op""","""2023-06-09""",17,15,2,16,14,2,105376080,105377943,1864,234467,234483,17,3383196,3297576,85620,1686350937,1686354663,3726,2,2,1


In [8]:
from op_coreutils.bigquery.write import overwrite_partitions
import os

# Set the OPLABS_ENV environment variable to "prod" for this kernel session.
# NOTE(review): this runs after the op_coreutils import, and the recorded write in
# this notebook logged "DRYRUN" — confirm when and how the library reads this flag.
os.environ["OPLABS_ENV"] = "prod"

In [9]:
# Hand the summary frame to overwrite_partitions with "temp" / "daily_address_summary"
# as the destination (the recorded log reports the target as BQ temp.daily_address_summary).
# NOTE(review): the meaning of expiration_days=999 is defined by overwrite_partitions — confirm.
overwrite_partitions(result, "temp", "daily_address_summary", expiration_days=999)

[2m2024-10-15 14:01:53[0m [[32m[1minfo     [0m] [1mWriting 1 partitions to BQ [2023-06-09 00:00:00 ... 2023-06-09 00:00:00][0m
[2m2024-10-15 14:01:53[0m [[32m[1minfo     [0m] [1mDRYRUN OVERWRITE PARTITION: Wrote 2.9Krows 162.8KB to BQ temp.daily_address_summary[0m
