In [3]:
import json
import collections
import os
from types import SimpleNamespace

explain_plan = {'Plan': {'Node Type': 'Aggregate', 'Strategy': 'Plain', 'Partial Mode': 'Simple', 'Parallel Aware': False, 'Async Capable': False, 'Startup Cost': 2.74, 'Total Cost': 2.75, 'Plan Rows': 1, 'Plan Width': 32, 'Output': ['avg((nation.n_regionkey + nation.n_nationkey))'], 'Plans': [{'Node Type': 'Hash Join', 'Parent Relationship': 'Outer', 'Parallel Aware': False, 'Async Capable': False, 'Join Type': 'Inner', 'Startup Cost': 1.11, 'Total Cost': 2.66, 'Plan Rows': 16, 'Plan Width': 8, 'Output': ['nation.n_regionkey', 'nation.n_nationkey'], 'Inner Unique': False, 'Hash Cond': '(nation.n_regionkey = region.r_regionkey)', 'Plans': [{'Node Type': 'Seq Scan', 'Parent Relationship': 'Outer', 'Parallel Aware': False, 'Async Capable': False, 'Relation Name': 'nation', 'Schema': 'public', 'Alias': 'nation', 'Startup Cost': 0.0, 'Total Cost': 1.31, 'Plan Rows': 20, 'Plan Width': 8, 'Output': ['nation.n_nationkey', 'nation.n_name', 'nation.n_regionkey', 'nation.n_comment'], 'Filter': '(nation.n_regionkey <> 3)'}, {'Node Type': 'Hash', 'Parent Relationship': 'Inner', 'Parallel Aware': False, 'Async Capable': False, 'Startup Cost': 1.06, 'Total Cost': 1.06, 'Plan Rows': 4, 'Plan Width': 4, 'Output': ['region.r_regionkey'], 'Plans': [{'Node Type': 'Seq Scan', 'Parent Relationship': 'Outer', 'Parallel Aware': False, 'Async Capable': False, 'Relation Name': 'region', 'Schema': 'public', 'Alias': 'region', 'Startup Cost': 0.0, 'Total Cost': 1.06, 'Plan Rows': 4, 'Plan Width': 4, 'Output': ['region.r_regionkey'], 'Filter': '(region.r_regionkey <> 3)'}]}]}]}, 'peakmem': 17220}

database_stats_file = '/home/wuy/DB/pg_mem_data/tpch/database_stats.json'

with open(database_stats_file, 'r') as f:
    database_stats = json.load(f, object_hook=lambda d: SimpleNamespace(**d))

In [4]:
column_id_mapping = dict() # map (table, column) to a number
table_id_mapping = dict()
partial_column_name_mapping = collections.defaultdict(set)

# enrich column stats with table sizes
table_sizes = dict()
for table_stat in database_stats.table_stats:
    table_sizes[table_stat.relname] = table_stat.reltuples

for i, column_stat in enumerate(database_stats.column_stats):
    table = column_stat.tablename
    column = column_stat.attname
    column_stat.table_size = table_sizes[table]
    column_id_mapping[(table, column)] = i
    partial_column_name_mapping[column].add(table)

# similar for table statistics
for i, table_stat in enumerate(database_stats.table_stats):
    table = table_stat.relname
    table_id_mapping[table] = i

# parse individual queries
parsed_plans = []
avg_runtimes = []
no_tables = []
no_filters = []
op_perc = collections.defaultdict(int)

In [7]:
def map_node_type(node_type):
    # Map the Node Type to op_name as per your target structure
    mapping = {
        "Aggregate": "Finalize Aggregate",
        "Hash Join": "Hash Join",
        "Seq Scan": "Sequential Scan",
        "Hash": "Hash",
        # Add other mappings as needed
    }
    return mapping.get(node_type, node_type)

def map_output_columns(output):
    # Example mapping function for output columns
    # This needs to be customized based on how you want to map output columns
    # For demonstration, we'll just return the list as is
    return output

def transform_plan(node):
    transformed = {
        "plain_content": [],
        "plan_parameters": {},
        "children": []
    }

    # Map Node Type to op_name
    node_type = node.get("Node Type", "")
    transformed["plan_parameters"]["op_name"] = map_node_type(node_type)

    # Common mappings
    transformed["plan_parameters"]["est_startup_cost"] = node.get("Startup Cost", 0.0)
    transformed["plan_parameters"]["est_cost"] = node.get("Total Cost", 0.0)
    transformed["plan_parameters"]["est_card"] = node.get("Plan Rows", 0.0)
    transformed["plan_parameters"]["est_width"] = node.get("Plan Width", 0.0)

    # Example: Handling specific node types
    if node_type == "Aggregate":
        transformed["plan_parameters"]["act_startup_cost"] = node.get("Actual Startup", 0.0)
        transformed["plan_parameters"]["act_time"] = node.get("Actual Total Time", 0.0)
        # Add more mappings as needed
    elif node_type == "Hash Join":
        transformed["plan_parameters"]["join_type"] = node.get("Join Type", "")
        transformed["plan_parameters"]["hash_cond"] = node.get("Hash Cond", "")
    elif node_type == "Seq Scan":
        transformed["plan_parameters"]["relation_name"] = node.get("Relation Name", "")
        transformed["plan_parameters"]["filter"] = node.get("Filter", "")

    # Handle Output Columns
    output = node.get("Output", [])
    transformed["plan_parameters"]["output_columns"] = map_output_columns(output)

    # Handle other parameters as needed

    # Recursively handle child plans
    child_plans = node.get("Plans", [])
    for child in child_plans:
        transformed_child = transform_plan(child)
        transformed["children"].append(transformed_child)

    return transformed


In [8]:
transform_plan(explain_plan)

{'plain_content': [],
 'plan_parameters': {'op_name': '',
  'est_startup_cost': 0.0,
  'est_cost': 0.0,
  'est_card': 0.0,
  'est_width': 0.0,
  'output_columns': []},
 'children': []}