## Prepare data reader for a given chain and date

In [1]:
from op_analytics.coreutils.duckdb_inmem import init_client
from op_analytics.coreutils.partitioned.reader import DataReader
from op_analytics.coreutils.partitioned.location import DataLocation
from op_analytics.datapipeline.etl.intermediate.construct import construct_data_readers

from op_analytics.datapipeline.models.compute.udfs import create_duckdb_macros


# Define the input data range.
read_batches: list[DataReader] = construct_data_readers(
    chains=["op"],
    models=["event_emitting_transactions_list"],
    range_spec="@20241118:+1",
    read_from=DataLocation.GCS
)


# Select input for one date and build the intermediate model inputs.
batch = read_batches[0]


duckdb_client = init_client()
create_duckdb_macros(duckdb_client)


[2m2024-12-13 00:14:50[0m [[32m[1mdebug    [0m] [1mconnecting to OPLABS Clickhouse client...[0m [36mfilename[0m=[35mclient.py[0m [36mlineno[0m=[35m25[0m [36mprocess[0m=[35m8769[0m
[2m2024-12-13 00:14:50[0m [[32m[1minfo     [0m] [1mloaded vault from .env file   [0m [36mfilename[0m=[35mvault.py[0m [36mlineno[0m=[35m32[0m [36mprocess[0m=[35m8769[0m
[2m2024-12-13 00:14:50[0m [[32m[1mdebug    [0m] [1mloaded vault: 17 items        [0m [36mfilename[0m=[35mvault.py[0m [36mlineno[0m=[35m76[0m [36mprocess[0m=[35m8769[0m
[2m2024-12-13 00:14:50[0m [[32m[1mdebug    [0m] [1minitialized OPLABS Clickhouse client.[0m [36mfilename[0m=[35mclient.py[0m [36mlineno[0m=[35m37[0m [36mprocess[0m=[35m8769[0m
[2m2024-12-13 00:14:50[0m [[32m[1minfo     [0m] [1mprepared 1 input batches.     [0m [36mfilename[0m=[35mreader_bydate.py[0m [36mlineno[0m=[35m97[0m [36mprocess[0m=[35m8769[0m


## Run the model

This automatically registers the model outputs as duckdb tables.

In [2]:
from op_analytics.datapipeline.models.compute.testutils import execute_model_in_memory

execute_model_in_memory(
    duckdb_client=duckdb_client,
    model="event_emitting_transactions_list",
    data_reader=batch,
    limit_input_parquet_files=1
)

# The duckdb database will have the following:
#   - input tables
#   - views used by the model
#   - model outputs
# 
# You can use duckdb to inspect any of the above results.
duckdb_client.sql("SHOW TABLES")

[2m2024-12-13 00:14:50[0m [[32m[1minfo     [0m] [1mExecuting model...            [0m [36mfilename[0m=[35mtestutils.py[0m [36mlineno[0m=[35m220[0m [36mprocess[0m=[35m8769[0m
[2m2024-12-13 00:14:50[0m [[32m[1minfo     [0m] [1mduckdb dataset='ingestion/logs_v1' using 1/22 parquet paths, first path is gs://oplabs-tools-data-sink/ingestion/logs_v1/chain=op/dt=2024-11-18/000128144000.parquet[0m [36mfilename[0m=[35mreader.py[0m [36mlineno[0m=[35m68[0m [36mprocess[0m=[35m8769[0m
[2m2024-12-13 00:14:53[0m [[32m[1minfo     [0m] [1mregistered view: 'ingestion_logs_v1' using 1 parquet paths[0m [36mfilename[0m=[35mclient.py[0m [36mlineno[0m=[35m53[0m [36mprocess[0m=[35m8769[0m
[2m2024-12-13 00:14:53[0m [[32m[1minfo     [0m] [1mduckdb dataset='ingestion/transactions_v1' using 1/22 parquet paths, first path is gs://oplabs-tools-data-sink/ingestion/transactions_v1/chain=op/dt=2024-11-18/000128144000.parquet[0m [36mfilename[0m=[35mreade

┌─────────────────────────────────────┐
│                name                 │
│               varchar               │
├─────────────────────────────────────┤
│ event_emitting_transactions_list    │
│ event_emitting_transactions_list_v1 │
│ ingestion_logs_v1                   │
│ ingestion_transactions_v1           │
└─────────────────────────────────────┘

## Verify model results

In [3]:
duckdb_client.sql("SELECT * FROM event_emitting_transactions_list_v1 LIMIT 10")

┌────────────┬─────────────────────┬─────────────────┬─────────┬─────────┬──────────┬──────────────┬────────────────────────────────────────────────────────────────────┬────────────┬────────────────────┬───────────────────────┬───────────────────────┬───────────────────────┐
│     dt     │     block_hour      │ block_timestamp │ network │  chain  │ chain_id │ block_number │                          transaction_hash                          │ gas_price  │ count_total_events │ count_approval_events │ count_wrapping_events │ count_transfer_events │
│    date    │      timestamp      │     uint32      │ varchar │ varchar │  int32   │    int64     │                              varchar                               │   int64    │       int64        │     decimal(38,0)     │     decimal(38,0)     │     decimal(38,0)     │
├────────────┼─────────────────────┼─────────────────┼─────────┼─────────┼──────────┼──────────────┼────────────────────────────────────────────────────────────────────┼───

### Check the data output size

In [4]:
duckdb_client.sql("SELECT COUNT(*) FROM event_emitting_transactions_list_v1")

┌──────────────┐
│ count_star() │
│    int64     │
├──────────────┤
│        16679 │
└──────────────┘

In [6]:
duckdb_client.sql("SELECT input FROM event_emitting_transactions_list_v1 LIMIT 10")


BinderException: Binder Error: Referenced column "input" not found in FROM clause!
Candidate bindings: "event_emitting_transactions_list_v1.network", "event_emitting_transactions_list_v1.chain_id", "event_emitting_transactions_list_v1.block_number"

### You can also convert the results to dataframes to inspect them in more familiar ways

In [None]:
duckdb_client.sql("SELECT * FROM refined_transactions_fees_v1 LIMIT 10").pl().head()

### Get table schema

In [None]:
duckdb_client.sql("DESCRIBE event_emitting_transactions_v1")
