In [1]:
import numpy as np
import pandas as pd
import pyarrow.parquet as pq
import duckdb

In [2]:
# Session-level settings applied right after connecting. The SQL comments
# inside the string describe each setting.
session_settings_sql = """-- Set memory limits before running query
                SET memory_limit='16GB';
                -- Enable progress tracking
                SET enable_progress_bar=true;
                -- Enable detailed profiling
                SET profiling_mode='detailed';
            """

# Open a DuckDB connection with default arguments, then apply the settings.
con = duckdb.connect()
con.execute(session_settings_sql)

<duckdb.duckdb.DuckDBPyConnection at 0x109a7e5b0>

In [3]:
# Define file path and view name
# NOTE(review): this is an absolute, machine-specific path; consider deriving it
# from a configurable data directory so the notebook runs elsewhere.
parquet_path = "/Users/couch/intel_research/data/0000_part_00 (1).parquet"
view_name = "parquet_view"

# The path is interpolated into a SQL string literal below, so double any
# single quotes to keep the literal well-formed.
parquet_path_sql = parquet_path.replace("'", "''")

# Create (or replace) a view over the Parquet file. OR REPLACE keeps this cell
# re-runnable on a live connection; a plain CREATE VIEW raises an error when
# the view already exists.
con.execute(f"""
    CREATE OR REPLACE VIEW {view_name} AS
    SELECT * FROM read_parquet('{parquet_path_sql}');
""")

<duckdb.duckdb.DuckDBPyConnection at 0x109a7e5b0>

In [4]:
# Fetch five rows from the view into a DataFrame and print them as a preview.
preview_sql = f"SELECT * FROM {view_name} LIMIT 5"
df = con.execute(preview_sql).fetchdf()
print(df)

              load_ts         batch_id  \
0 2019-10-24 04:09:21  20191023-210226   
1 2019-10-24 04:09:21  20191023-210226   
2 2019-10-24 04:09:21  20191023-210226   
3 2019-10-24 04:09:21  20191023-210226   
4 2019-10-24 04:09:21  20191023-210226   

                                           audit_zip  \
0  2019102316-i-0728cf8ab79db9e58-BfZXhQdKBpG8dxo...   
1  2019102316-i-0728cf8ab79db9e58-BfZXhQdKBpG8dxo...   
2  2019102316-i-0728cf8ab79db9e58-BfZXhQdKBpG8dxo...   
3  2019102316-i-0728cf8ab79db9e58-BfZXhQdKBpG8dxo...   
4  2019102316-i-0728cf8ab79db9e58-BfZXhQdKBpG8dxo...   

                           audit_internal_path  \
0  V8_1_SYSTEMUSAGEBYFGNDAPP_20191023231918.V8   
1  V8_1_SYSTEMUSAGEBYFGNDAPP_20191023231918.V8   
2  V8_1_SYSTEMUSAGEBYFGNDAPP_20191023231918.V8   
3  V8_1_SYSTEMUSAGEBYFGNDAPP_20191023231918.V8   
4  V8_1_SYSTEMUSAGEBYFGNDAPP_20191023231918.V8   

                               guid  interval_start_utc    interval_end_utc  \
0  0028155e73c04ed2a420eb00af9

┌─────────────────────────────────────┐
│┌───────────────────────────────────┐│
││    Query Profiling Information    ││
│└───────────────────────────────────┘│
└─────────────────────────────────────┘
SELECT * FROM parquet_view LIMIT 5
┌────────────────────────────────────────────────┐
│┌──────────────────────────────────────────────┐│
││              Total Time: 0.0555s             ││
│└──────────────────────────────────────────────┘│
└────────────────────────────────────────────────┘
┌────────────────────────────────────────────────┐
│               Optimizer: 0.0061s               │
│┌──────────────────────────────────────────────┐│
││        Build Side Probe Side: 0.0000s        ││
││           Column Lifetime: 0.0000s           ││
││           Common Aggregate: 0.0000s          ││
││        Common Subexpressions: 0.0000s        ││
││      Compressed Materialization: 0.0000s     ││
││          Cte Filter Pusher: 0.0000s          ││
││             Deliminator: 0.0000s             ││


In [5]:
# Show the column labels of the DataFrame currently bound to `df`.
df.columns

Index(['load_ts', 'batch_id', 'audit_zip', 'audit_internal_path', 'guid',
       'interval_start_utc', 'interval_end_utc', 'interval_local_start',
       'interval_local_end', 'ts', 'dt', 'pid', 'proc_name', 'proc_package',
       'captioned', 'duration', 'metric_name', 'aggregation_type',
       'attribute_level1', 'nrs', 'avg_val', 'min_val', 'max_val',
       'percentile_50th', 'percentile_75th', 'percentile_90th'],
      dtype='object')

In [6]:
# Collect the distinct proc_name values from the view and print the result.
# Note that this rebinds `df`, replacing whatever frame it held before.
distinct_proc_sql = f"SELECT DISTINCT proc_name FROM {view_name}"
df = con.execute(distinct_proc_sql).fetchdf()
print(df)

                                               proc_name
0                                          searchapp.exe
1                                    scansnapupdater.exe
2                                              mstsc.exe
3                                              atmgr.exe
4       intel-driver-and-support-assistant-installer.exe
...                                                  ...
29177                                   open-sankore.exe
29178                                   retailiq.wpf.exe
29179  microsoft.surface.diagnostics.app.wpf.desktopb...
29180                                   dreamwatcher.exe
29181             dpfilelist generator v1.5 by baris.exe

[29182 rows x 1 columns]


┌─────────────────────────────────────┐
│┌───────────────────────────────────┐│
││    Query Profiling Information    ││
│└───────────────────────────────────┘│
└─────────────────────────────────────┘
SELECT DISTINCT proc_name FROM parquet_view
┌────────────────────────────────────────────────┐
│┌──────────────────────────────────────────────┐│
││              Total Time: 0.107s              ││
│└──────────────────────────────────────────────┘│
└────────────────────────────────────────────────┘
┌────────────────────────────────────────────────┐
│               Optimizer: 0.0009s               │
│┌──────────────────────────────────────────────┐│
││        Build Side Probe Side: 0.0000s        ││
││           Column Lifetime: 0.0000s           ││
││           Common Aggregate: 0.0000s          ││
││        Common Subexpressions: 0.0000s        ││
││      Compressed Materialization: 0.0000s     ││
││          Cte Filter Pusher: 0.0000s          ││
││             Deliminator: 0.0000s       

In [7]:
# Collect the distinct aggregation_type values from the view and print them.
# Note that this rebinds `df`, replacing whatever frame it held before.
distinct_agg_sql = f"SELECT DISTINCT aggregation_type FROM {view_name}"
df = con.execute(distinct_agg_sql).fetchdf()
print(df)

   aggregation_type
0                 2
1                 1
2                 0


┌─────────────────────────────────────┐
│┌───────────────────────────────────┐│
││    Query Profiling Information    ││
│└───────────────────────────────────┘│
└─────────────────────────────────────┘
SELECT DISTINCT aggregation_type FROM parquet_view
┌────────────────────────────────────────────────┐
│┌──────────────────────────────────────────────┐│
││              Total Time: 0.0728s             ││
│└──────────────────────────────────────────────┘│
└────────────────────────────────────────────────┘
┌────────────────────────────────────────────────┐
│               Optimizer: 0.0012s               │
│┌──────────────────────────────────────────────┐│
││        Build Side Probe Side: 0.0000s        ││
││           Column Lifetime: 0.0000s           ││
││           Common Aggregate: 0.0000s          ││
││        Common Subexpressions: 0.0000s        ││
││      Compressed Materialization: 0.0000s     ││
││          Cte Filter Pusher: 0.0000s          ││
││             Deliminator: 0.0000