## Prepare data reader and model execution context

In [5]:
from pprint import pprint

from op_analytics.coreutils.partitioned.location import DataLocation
from op_analytics.coreutils.partitioned.reader import DataReader
from op_analytics.datapipeline.etl.ingestion.reader.byblock import construct_readers_byblock
from op_analytics.datapipeline.etl.ingestion.reader.request import BlockBatchRequest
from op_analytics.datapipeline.models.compute.markers import ModelsDataSpec
from op_analytics.datapipeline.models.compute.testutils import setup_execution_context

# Model whose inputs and templates are loaded in this notebook.
model_name = "account_abstraction_prefilter"

# The data spec for the selected model provides the input root paths to read.
data_spec = ModelsDataSpec(models=[model_name], root_path_prefix="blockbatch")

# Request the block batch to work on (single chain, single batch).
blockbatch_request = BlockBatchRequest.build(
    chains=["base"],
    range_spec="19894001:+1",
    # range_spec="19910194:+1",
    root_paths_to_read=data_spec.input_root_paths,
)

# Build the readers for the request, reading the input data from GCS.
readers: list[DataReader] = construct_readers_byblock(
    read_from=DataLocation.GCS,
    blockbatch_request=blockbatch_request,
)

# Only the first reader is used from here on.
first_reader = readers[0]

# Print the details of the batch being processed.
pprint(first_reader)

# Stop here if the data needed by the reader is not available.
assert first_reader.inputs_ready

# Create the execution context and the handles to the model input args.
# Later cells are free to use ctx, input_datasets and auxiliary_templates.
ctx, input_datasets, auxiliary_templates = setup_execution_context(
    data_reader=first_reader,
    model_name=model_name,
)


[2m2025-01-29 15:15:47[0m [[32m[1minfo     [0m] [1mprepared 1 input batches.     [0m [36mfilename[0m=[35mbyblock.py[0m [36mlineno[0m=[35m88[0m [36mprocess[0m=[35m77753[0m
DataReader(partitions=Partition(cols=[PartitionColumn(name='chain',
                                                      value='base'),
                                      PartitionColumn(name='dt',
                                                      value='2024-09-17')]),
           read_from=DataLocation.GCS,
           dataset_paths={'ingestion/logs_v1': ['gs://oplabs-tools-data-sink/ingestion/logs_v1/chain=base/dt=2024-09-17/000019894000.parquet'],
                          'ingestion/traces_v1': ['gs://oplabs-tools-data-sink/ingestion/traces_v1/chain=base/dt=2024-09-17/000019894000.parquet']},
           inputs_ready=True,
           extra_marker_data={'max_block': 19896000,
                              'min_block': 19894000,
                              'num_blocks': 2000})
[2m2025-01-

In [12]:
# Create the EntryPoint logs table from the raw logs input.
logs_template = auxiliary_templates["account_abstraction_prefilter/entrypoint_logs"]
entrypoint_logs = logs_template.create_table(
    duckdb_context=ctx,
    template_parameters={"raw_logs": input_datasets["ingestion/logs_v1"].as_subquery()},
)

# Distinct EntryPoint transaction hashes; the traces template below receives
# this table name so that it can filter the raw traces.
ctx.client.sql(f"""
CREATE OR REPLACE TABLE txhashes AS
SELECT DISTINCT transaction_hash FROM {entrypoint_logs}
ORDER BY transaction_hash
""")

from op_analytics.datapipeline.models.code.account_abstraction.abis import (
    INNER_HANDLE_OP_FUNCTION_METHOD_ID_v0_6_0,
    INNER_HANDLE_OP_FUNCTION_METHOD_ID_v0_7_0,
)

# Render the method ids as a comma-separated list of single-quoted strings.
inner_handle_op_method_ids = (
    INNER_HANDLE_OP_FUNCTION_METHOD_ID_v0_6_0,
    INNER_HANDLE_OP_FUNCTION_METHOD_ID_v0_7_0,
)
quoted_method_ids = ", ".join(f"'{method_id}'" for method_id in inner_handle_op_method_ids)

# Create the prefiltered traces table from the raw traces input.
traces_template = auxiliary_templates[
    "account_abstraction_prefilter/entrypoint_prefiltered_traces"
]
entrypoint_traces = traces_template.create_table(
    duckdb_context=ctx,
    template_parameters={
        "raw_traces": input_datasets["ingestion/traces_v1"].as_subquery(),
        "entrypoint_txhashes": "txhashes",
        "inner_handle_op_method_ids": quoted_method_ids,
    },
)

[2m2025-01-29 16:17:42[0m [[32m[1minfo     [0m] [1mconstructed read_parquet() string with 1 paths[0m [36mfilename[0m=[35mclient.py[0m [36mlineno[0m=[35m263[0m [36mprocess[0m=[35m77753[0m
[2m2025-01-29 16:17:42[0m [[32m[1minfo     [0m] [1mRendering query               [0m [36mfilename[0m=[35mquerybuilder.py[0m [36mlineno[0m=[35m40[0m [36mprocess[0m=[35m77753[0m [36mtemplate[0m=[35maccount_abstraction_prefilter/entrypoint_logs[0m
[2m2025-01-29 16:17:44[0m [[32m[1minfo     [0m] [1mduck db size: 74.2MB          [0m [36mfilename[0m=[35mclient.py[0m [36mlineno[0m=[35m36[0m [36mprocess[0m=[35m77753[0m
[2m2025-01-29 16:17:44[0m [[32m[1minfo     [0m] [1mconstructed read_parquet() string with 1 paths[0m [36mfilename[0m=[35mclient.py[0m [36mlineno[0m=[35m263[0m [36mprocess[0m=[35m77753[0m
[2m2025-01-29 16:17:44[0m [[32m[1minfo     [0m] [1mRendering query               [0m [36mfilename[0m=[35mquerybuilder.py

In [13]:
# List the tables that currently exist in the DuckDB context.
ctx.client.sql("SHOW TABLES")

┌──────────────────────────────────────────────────────────────┐
│                             name                             │
│                           varchar                            │
├──────────────────────────────────────────────────────────────┤
│ account_abstraction_prefilter__entrypoint_logs               │
│ account_abstraction_prefilter__entrypoint_prefiltered_traces │
│ txhashes                                                     │
└──────────────────────────────────────────────────────────────┘

In [14]:
# NOTES:
#
# Block batch filtering
# (presumably: raw row count -> prefiltered row count, with the percentage kept)
#
# Batch=19910000:
#  logs   :  731998  ->  19725  (2.7%)
#  traces : 3997893  -> 199594  (4.9%)
#
# Batch=19894000:
#  logs   :  680683  ->  30251  (4.4%)
#  traces : 4036203  -> 348751  (8.6%)  245413 if we filter traces with !=delegatecall

# Count the rows in the two prefiltered tables created for the current batch.
ctx.client.sql("""
SELECT 'logs' AS table, count(*) as num_rows FROM account_abstraction_prefilter__entrypoint_logs
UNION ALL
SELECT 'traces' AS table, count(*) as num_rows FROM account_abstraction_prefilter__entrypoint_prefiltered_traces
""")

┌─────────┬──────────┐
│  table  │ num_rows │
│ varchar │  int64   │
├─────────┼──────────┤
│ logs    │    30251 │
│ traces  │   245413 │
└─────────┴──────────┘