In [6]:
from pprint import pprint

from op_analytics.coreutils.clickhouse.inferschema import infer_schema_from_parquet
from op_analytics.coreutils.partitioned.location import DataLocation
from op_analytics.coreutils.partitioned.reader import DataReader
from op_analytics.datapipeline.etl.ingestion.reader.byblock import construct_readers_byblock
from op_analytics.datapipeline.etl.ingestion.reader.request import BlockBatchRequest
from op_analytics.datapipeline.etl.ingestion.reader.rootpaths import RootPath


# Root paths of the two datasets requested in this cell. Each literal is
# defined once so the paths passed to the request and the key used to look up
# the parquet file below cannot drift apart.
TRACES_FEES_ROOT_PATH = "blockbatch/refined_traces/refined_traces_fees_v1"
TRANSACTIONS_FEES_ROOT_PATH = "blockbatch/refined_traces/refined_transactions_fees_v1"


# Prepare data readers
blockbatch_request = BlockBatchRequest.build(
    chains=["base"],
    range_spec="@20250107:+1",
    root_paths_to_read=[
        RootPath.of(TRACES_FEES_ROOT_PATH),
        RootPath.of(TRANSACTIONS_FEES_ROOT_PATH),
    ],
)
readers: list[DataReader] = construct_readers_byblock(
    blockbatch_request=blockbatch_request,
    read_from=DataLocation.GCS,
)

# Fail with a readable message instead of a bare IndexError on readers[0]
# when construct_readers_byblock returns an empty list.
assert readers, "no data readers were constructed for the requested chains and range"


# Show reader details.
pprint(readers[0])


# Infer schema
# Use the first parquet path listed for the traces-fees dataset on the first
# reader. The table name handed to the inference helper is the last component
# of the dataset root path ("refined_traces_fees_v1").
parquet_path = readers[0].dataset_paths[TRACES_FEES_ROOT_PATH][0]
print(parquet_path)
infer_schema_from_parquet(parquet_path, TRACES_FEES_ROOT_PATH.rsplit("/", 1)[-1])


[2m2025-01-10 04:21:06[0m [[32m[1minfo     [0m] [1mprepared 109 input batches.   [0m [36mfilename[0m=[35mbyblock.py[0m [36mlineno[0m=[35m78[0m [36mprocess[0m=[35m3378[0m
DataReader(partitions=Partition(cols=[PartitionColumn(name='chain',
                                                      value='base'),
                                      PartitionColumn(name='dt',
                                                      value='2025-01-07')]),
           read_from=DataLocation.GCS,
           dataset_paths={'blockbatch/refined_traces/refined_traces_fees_v1': ['gs://oplabs-tools-data-sink/blockbatch/refined_traces/refined_traces_fees_v1/chain=base/dt=2025-01-07/000024709200.parquet'],
                          'blockbatch/refined_traces/refined_transactions_fees_v1': ['gs://oplabs-tools-data-sink/blockbatch/refined_traces/refined_transactions_fees_v1/chain=base/dt=2025-01-07/000024709200.parquet']},
           inputs_ready=True,
           extra_marker_data={'max_blo

In [7]:
# Infer the schema of the refined transactions fees dataset, using the first
# parquet path listed for it on the first reader built in the previous cell.
first_reader = readers[0]
transactions_fees_paths = first_reader.dataset_paths[
    "blockbatch/refined_traces/refined_transactions_fees_v1"
]
parquet_path = transactions_fees_paths[0]
print(parquet_path)
infer_schema_from_parquet(parquet_path, "refined_transactions_fees_v1")

gs://oplabs-tools-data-sink/blockbatch/refined_traces/refined_transactions_fees_v1/chain=base/dt=2025-01-07/000024709200.parquet
[2m2025-01-10 04:21:07[0m [[32m[1minfo     [0m] [1musing gcs path: https://storage.googleapis.com/oplabs-tools-data-sink/blockbatch/refined_traces/refined_transactions_fees_v1/chain=base/dt=2025-01-07/000024709200.parquet[0m [36mfilename[0m=[35minferschema.py[0m [36mlineno[0m=[35m19[0m [36mprocess[0m=[35m3378[0m
CREATE TABLE IF NOT EXISTS refined_transactions_fees_v1
(
    `chain_id` Nullable(Int32),
    `network` Nullable(String),
    `nonce` Nullable(Int64),
    `transaction_index` Nullable(Int64),
    `from_address` Nullable(String),
    `to_address` Nullable(String),
    `block_number` Nullable(Int64),
    `block_timestamp` Nullable(UInt32),
    `hash` Nullable(String),
    `transaction_type` Nullable(Int32),
    `gas_price` Nullable(Int64),
    `gas_limit` Nullable(Int64),
    `l2_gas_used` Nullable(Int64),
    `receipt_l1_gas_used` Nu