In [10]:
from models.training.utils import batch_to as custom_batch_to
from models.training.utils import flatten_dict
import torch
import numpy as np
import json
import os
import pandas as pd
import logging
import models.dataset.dataset_creation as dataset_creation
from cross_db_benchmark.benchmark_tools.database import DatabaseSystem
import importlib
from models.zero_shot_models.specific_models.model import zero_shot_models

from cross_db_benchmark.benchmark_tools.utils import load_json
from tqdm import tqdm
# Tuned hyperparameters come from a JSON file. The statements further down pop
# every key out of this mapping, so it has to be reloaded before they re-run.
hyperparameter_path = 'setup/tuned_hyperparameters/tune_est_best_config.json'
hyperparams = load_json(hyperparameter_path, namespace=False)

# Loss used by the final MLP (the original cell kept 'MSELoss' as a commented-out alternative).
loss_class_name = 'QLoss'

# Run-level settings that are forwarded into train_kwargs.
seed = 0
device = 'cpu'
num_workers = 1
max_epoch_tuples = 100_000
max_no_epochs = None  # None -> the epoch default chosen where train_kwargs is built
skip_train = False

# No cap on the number of queries read from the workloads.
limit_queries = None
limit_queries_affected_wl = None

p_dropout = hyperparams.pop('p_dropout')

# Options shared by every fully connected sub-network; merged into each of the
# three kwargs dicts below.
fc_out_kwargs = {
    'p_dropout': p_dropout,
    'activation_class_name': 'LeakyReLU',
    'activation_class_kwargs': {},
    'norm_class_name': 'Identity',
    'norm_class_kwargs': {},
    'residual': hyperparams.pop('residual'),
    'dropout': hyperparams.pop('dropout'),
    'activation': True,
    'inplace': True,
}

# Final MLP (also carries the loss configuration).
final_mlp_kwargs = {
    'width_factor': hyperparams.pop('final_width_factor'),
    'n_layers': hyperparams.pop('final_layers'),
    'loss_class_name': loss_class_name,
    'loss_class_kwargs': {},
    **fc_out_kwargs,
}

# Message-passing (tree) layers.
tree_layer_kwargs = {
    'width_factor': hyperparams.pop('tree_layer_width_factor'),
    'n_layers': hyperparams.pop('message_passing_layers'),
    **fc_out_kwargs,
}

# Per-node-type encoders.
node_type_kwargs = {
    'width_factor': hyperparams.pop('node_type_width_factor'),
    'n_layers': hyperparams.pop('node_layers'),
    'one_hot_embeddings': True,
    'max_emb_dim': hyperparams.pop('max_emb_dim'),
    'drop_whole_embeddings': False,
    **fc_out_kwargs,
}



# Everything needed to build and train the model, gathered in one mapping.
# plan_featurization_name: 'PostgresEstSystemCardDetail' in tune_est_best_config.json,
# while 'PostgresTrueCardDetail' is the default as defined in train_default();
# the third one is 'PostgresDeepDBEstSystemCardDetail'.
train_kwargs = {
    'optimizer_class_name': 'AdamW',
    'optimizer_kwargs': {'lr': hyperparams.pop('lr')},
    'final_mlp_kwargs': final_mlp_kwargs,
    'node_type_kwargs': node_type_kwargs,
    'tree_layer_kwargs': tree_layer_kwargs,
    'tree_layer_name': hyperparams.pop('tree_layer_name'),
    'plan_featurization_name': hyperparams.pop('plan_featurization_name'),
    'hidden_dim': hyperparams.pop('hidden_dim'),
    'output_dim': 1,
    # Fall back to 200 epochs when no explicit limit was configured.
    'epochs': max_no_epochs if max_no_epochs is not None else 200,
    'early_stopping_patience': 20,
    'max_epoch_tuples': max_epoch_tuples,
    'batch_size': hyperparams.pop('batch_size'),
    'device': device,
    'num_workers': num_workers,
    'seed': seed,
    'limit_queries': limit_queries,
    'limit_queries_affected_wl': limit_queries_affected_wl,
    'skip_train': skip_train,
}

# Every key of the hyperparameter file must have been consumed by now; a
# leftover key means the file and this cell have drifted apart.
assert len(hyperparams) == 0, (
    f"Not all hyperparams were used (not used: {hyperparams.keys()}). Hence generation "
    f"and reading does not seem to fit"
)

# Experiment setup: which dataset the model was trained on and which one it is
# evaluated against.
train_dataset = 'tpcds'
test_dataset = 'tpch'

# Output directory name for this train/test combination.
target_dir = f'evaluation_train_{train_dataset}_test_{test_dataset}'

# Input files, all located in the per-dataset data directories.
statistics_file = f'{train_dataset}_data/statistics_workload_combined.json'
workload_runs = f'{train_dataset}_data/train_plans.json'
test_workload_runs = f'{test_dataset}_data/val_plans.json'

# Flattened view of the nested training configuration.
param_dict = flatten_dict(train_kwargs)


def get_logger():
    """Return the root logger, configured to print DEBUG records to the console.

    The function is safe to call repeatedly (for example when the notebook
    cell is re-run): the console handler is attached only once, so log lines
    are not duplicated.

    Returns:
        logging.Logger: the root logger with level DEBUG.
    """
    log = logging.getLogger()
    log.setLevel(logging.DEBUG)

    # The handler installed by this function carries a marker attribute; if it
    # is already present there is nothing left to do.
    if any(getattr(handler, "_notebook_console", False) for handler in log.handlers):
        return log

    fmt = "[%(asctime)s][%(levelname)s][%(filename)s:%(lineno)d]:%(message)s"
    formatter = logging.Formatter(fmt, datefmt="%Y-%m-%d %H:%M:%S")

    ch = logging.StreamHandler()
    ch.setLevel(logging.DEBUG)
    ch.setFormatter(formatter)
    ch._notebook_console = True
    log.addHandler(ch)
    return log

logger = get_logger()

In [11]:
# Seed the torch and numpy random number generators.
seed = 0
torch.manual_seed(seed)
np.random.seed(seed)

# Read the loader-related settings back out of the training configuration.
database = DatabaseSystem.POSTGRES
plan_featurization_name = train_kwargs['plan_featurization_name']
batch_size = train_kwargs['batch_size']
num_workers = train_kwargs['num_workers']
limit_queries = train_kwargs['limit_queries']
limit_queries_affected_wl = train_kwargs['limit_queries_affected_wl']
final_mlp_kwargs = train_kwargs['final_mlp_kwargs']
loss_class_name = final_mlp_kwargs['loss_class_name']

# Reload the module so edits made to dataset_creation are picked up without
# restarting the kernel.
importlib.reload(dataset_creation)
(label_norm, feature_statistics,
 train_loader, val_loader, test_loader) = dataset_creation.create_dataloader(
    logger,
    workload_runs,
    test_workload_runs,
    statistics_file,
    plan_featurization_name,
    database,
    val_ratio=0.15,
    batch_size=batch_size,
    shuffle=True,
    num_workers=num_workers,
    pin_memory=False,
    limit_queries=limit_queries,
    limit_queries_affected_wl=limit_queries_affected_wl,
    loss_class_name=loss_class_name,
)

# The numbers printed are dataset sizes (len(loader.dataset)), not batch counts.
print(f"train_loader length: {len(train_loader.dataset)}")
print(f"val_loader length: {len(val_loader.dataset)}")
print(f"test_loader length: {len(test_loader.dataset)}")

source tpcds_data/train_plans.json
source tpch_data/val_plans.json
train_loader length: 45565
val_loader length: 8042
test_loader length: 12424


In [12]:
# No extra model-specific keyword arguments are passed in this run.
model_kwargs = {}

hidden_dim = train_kwargs['hidden_dim']
output_dim = train_kwargs['output_dim']
tree_layer_name = train_kwargs['tree_layer_name']

# Instantiate the model class registered for the selected database system.
model = zero_shot_models[database](
    device=device,
    hidden_dim=hidden_dim,
    final_mlp_kwargs=final_mlp_kwargs,
    node_type_kwargs=node_type_kwargs,
    output_dim=output_dim,
    feature_statistics=feature_statistics,
    tree_layer_name=tree_layer_name,
    tree_layer_kwargs=tree_layer_kwargs,
    plan_featurization_name=plan_featurization_name,
    label_norm=label_norm,
    **model_kwargs,
)

# Move the model onto the device it reports for itself.
model = model.to(model.device)

In [13]:
# Restore the trained weights for the training dataset.
# map_location remaps the stored tensors onto the model's device, so a
# checkpoint saved on a GPU can still be loaded when running on CPU.
# NOTE: torch.load unpickles the file - only load checkpoints from a trusted source.
checkpoint = torch.load(f'evaluation/{train_dataset}.pt', map_location=model.device)
model.load_state_dict(checkpoint['model'])

<All keys matched successfully>

In [15]:
# Run the model over the test loader and collect predictions and labels.
model.eval()
label_chunks, pred_chunks = [], []
with torch.no_grad():
    for batch in tqdm(test_loader):
        input_model, label, sample_idxs_batch = custom_batch_to(batch, model.device, model.label_norm)
        batch_pred = model(input_model).cpu().numpy()
        batch_label = label.cpu().numpy()
        if model.label_norm is not None:
            # Map both arrays back through label_norm.inverse_transform
            # (presumably undoing the label normalisation - confirm in custom_batch_to).
            batch_pred = model.label_norm.inverse_transform(batch_pred)
            batch_label = model.label_norm.inverse_transform(batch_label.reshape(-1, 1))
        # Flatten so the per-batch arrays can be concatenated into 1-D vectors.
        pred_chunks.append(batch_pred.reshape(-1))
        label_chunks.append(batch_label.reshape(-1))

labels = np.concatenate(label_chunks)
preds = np.concatenate(pred_chunks)

from metrics import compute_metrics
metrics = compute_metrics(labels, preds)
print(metrics)

100%|██████████| 7/7 [02:19<00:00, 19.98s/it]

{'qerror_50 (Median)': 2.3576115369796753, 'qerror_95': 5.90754699707031, 'qerror_max': 2245.23583984375, 'mean_qerror': 3.3295248, 'mre': 0.82093817, 'rmse': 527259.1}





In [3]:
import psycopg2
import json

import os
import sys
import psycopg2
from tqdm import tqdm

# Connection settings for the TPC-H database.
# User, password and host can be overridden through the standard libpq
# environment variables so that no machine-specific credentials have to be
# edited into the notebook; the defaults reproduce the original local setup.
conn_params = {
    "dbname": "tpc_h",
    "user": os.environ.get("PGUSER", "wuy"),
    "password": os.environ.get("PGPASSWORD", ""),
    "host": os.environ.get("PGHOST", "localhost"),
}
 

# Connect to PostgreSQL
def connect_to_postgres():
    """Open a new psycopg2 connection using the module-level ``conn_params``.

    Returns:
        The connection object returned by ``psycopg2.connect``. The caller is
        responsible for closing it.
    """
    connection = psycopg2.connect(**conn_params)
    return connection

# Fetch the PostgreSQL plan of a query via EXPLAIN (FORMAT JSON)
def execute_explain_analyze_query(query, analyze=False):
    """Return the PostgreSQL execution plan of ``query`` as a Python object.

    Despite the historical name, the statement is only *planned* by default
    (``EXPLAIN (FORMAT JSON)``), which is what the function has always done.
    Pass ``analyze=True`` to run ``EXPLAIN (ANALYZE, FORMAT JSON)`` instead;
    note that ANALYZE really executes the query.

    Args:
        query: SQL text to explain. It is interpolated into the EXPLAIN
            statement as-is, so it must come from a trusted source.
        analyze: when True, add the ANALYZE option to collect runtime figures.

    Returns:
        The decoded EXPLAIN output (a list with one dict holding the "Plan"),
        or None if connecting or executing failed; in that case the error is
        printed rather than raised.
    """
    options = "ANALYZE, FORMAT JSON" if analyze else "FORMAT JSON"
    explain_query = f"EXPLAIN ({options}) {query}"

    # Initialised up front so the cleanup below is safe even when connecting
    # or creating the cursor fails.
    conn = None
    cur = None
    try:
        conn = connect_to_postgres()
        cur = conn.cursor()
        cur.execute(explain_query)

        # EXPLAIN returns a single row with a single column holding the plan.
        result = cur.fetchone()[0]

        # psycopg2 normally hands the json column back already decoded; decode
        # it here only if a plain string was returned.
        if isinstance(result, str):
            result = json.loads(result)
        return result

    except Exception as e:
        # Best-effort helper for interactive use: report and return None.
        print(f"Error: {e}")
        return None
    finally:
        if cur is not None:
            cur.close()
        if conn is not None:
            conn.close()

# Example multi-join aggregate query over the TPC-H tables
complex_query = """
SELECT SUM("lineitem"."l_linenumber") as agg_0 FROM "nation" JOIN "region" ON "nation"."n_regionkey" = "region"."r_regionkey" JOIN "supplier" ON "nation"."n_nationkey" = "supplier"."s_nationkey" JOIN "partsupp" ON "supplier"."s_suppkey" = "partsupp"."ps_suppkey" JOIN "part" ON "partsupp"."ps_partkey" = "part"."p_partkey" JOIN "lineitem" ON "partsupp"."ps_partkey" = "lineitem"."l_partkey" AND "partsupp"."ps_suppkey" = "lineitem"."l_suppkey"  WHERE "region"."r_comment" = 'uickly special accounts cajole carefully blithely close requests. carefully final asymptotes haggle furiousl' AND "lineitem"."l_shipinstruct" = 'COLLECT COD' AND "lineitem"."l_returnflag" = 'N' AND "region"."r_regionkey" != 2;
"""

# Ask PostgreSQL for the plan of the example query (None is returned on error)
explain_result = execute_explain_analyze_query(complex_query)

# Show the plan as indented JSON
plan_text = json.dumps(explain_result, indent=2)
print(plan_text)


[
  {
    "Plan": {
      "Node Type": "Aggregate",
      "Strategy": "Plain",
      "Partial Mode": "Finalize",
      "Parallel Aware": false,
      "Async Capable": false,
      "Startup Cost": 160309.37,
      "Total Cost": 160309.38,
      "Plan Rows": 1,
      "Plan Width": 8,
      "Plans": [
        {
          "Node Type": "Gather",
          "Parent Relationship": "Outer",
          "Parallel Aware": false,
          "Async Capable": false,
          "Startup Cost": 160309.15,
          "Total Cost": 160309.36,
          "Plan Rows": 2,
          "Plan Width": 8,
          "Workers Planned": 2,
          "Single Copy": false,
          "Plans": [
            {
              "Node Type": "Aggregate",
              "Strategy": "Plain",
              "Partial Mode": "Partial",
              "Parent Relationship": "Outer",
              "Parallel Aware": false,
              "Async Capable": false,
              "Startup Cost": 159309.15,
              "Total Cost": 159309.16,
   

In [1]:
import json
# Read the small raw-plan file into memory.
with open('tpcds_data/tiny_raw_plan.json') as plan_file:
    raw_plan = json.load(plan_file)

In [2]:
from collect_db_stats import collect_db_statistics
import os

# Connection settings for the TPC-DS database.
# User, password and host can be overridden through the standard libpq
# environment variables so that no machine-specific credentials have to be
# edited into the notebook; the defaults reproduce the original local setup.
conn_params = {
    "dbname": "tpc_ds",
    "user": os.environ.get("PGUSER", "wuy"),
    "password": os.environ.get("PGPASSWORD", ""),
    "host": os.environ.get("PGHOST", "localhost"),
}

# Gather the database statistics (the result is indexed by 'table_stats' and
# 'column_stats' in the cells that follow).
stats = collect_db_statistics(conn_params=conn_params)


In [3]:
# Dump the per-table statistics (relname / reltuples / relpages records) to check what was collected.
print(stats['table_stats'])

[{'relname': 'dbgen_version', 'reltuples': 1.0, 'relpages': 1}, {'relname': 'customer_address', 'reltuples': 50000.0, 'relpages': 1087}, {'relname': 'customer_demographics', 'reltuples': 1920800.0, 'relpages': 21828}, {'relname': 'date_dim', 'reltuples': 73049.0, 'relpages': 1405}, {'relname': 'warehouse', 'reltuples': 5.0, 'relpages': 1}, {'relname': 'ship_mode', 'reltuples': 20.0, 'relpages': 1}, {'relname': 'time_dim', 'reltuples': 86400.0, 'relpages': 1397}, {'relname': 'reason', 'reltuples': 35.0, 'relpages': 1}, {'relname': 'income_band', 'reltuples': 20.0, 'relpages': 1}, {'relname': 'item', 'reltuples': 18000.0, 'relpages': 1230}, {'relname': 'store', 'reltuples': 12.0, 'relpages': 1}, {'relname': 'call_center', 'reltuples': 6.0, 'relpages': 1}, {'relname': 'customer', 'reltuples': 100000.0, 'relpages': 2768}, {'relname': 'web_site', 'reltuples': 30.0, 'relpages': 2}, {'relname': 'store_returns', 'reltuples': 287514.0, 'relpages': 4791}, {'relname': 'household_demographics', 'r

In [4]:
# Attach the collected statistics to the small raw-plan file.
raw_plan['database_stats'] = stats

# Serialise before opening the file: mode 'w' truncates it immediately, so a
# JSON error (e.g. a non-serialisable value in stats) must surface first or
# the existing file content would be lost.
serialized_plan = json.dumps(raw_plan)
with open('tpcds_data/tiny_raw_plan.json', 'w') as f:
    f.write(serialized_plan)

In [5]:
# Attach the collected statistics to the full raw-plan file.
with open('tpcds_data/raw_plan.json') as f:
    plan = json.load(f)
plan['database_stats'] = stats

# Serialise before opening the file: mode 'w' truncates it immediately, so a
# JSON error (e.g. a non-serialisable value in stats) must surface first or
# the existing file content would be lost.
serialized_plan = json.dumps(plan)
with open('tpcds_data/raw_plan.json', 'w') as f:
    f.write(serialized_plan)

In [6]:
# Attach the collected statistics to the parsed-plan file.
with open('tpcds_data/parsed_plan.json') as f:
    plan = json.load(f)
plan['database_stats'] = stats

# Serialise before opening the file: mode 'w' truncates it immediately, so a
# JSON error (e.g. a non-serialisable value in stats) must surface first or
# the existing file content would be lost.
serialized_plan = json.dumps(plan)
with open('tpcds_data/parsed_plan.json', 'w') as f:
    f.write(serialized_plan)

In [4]:
# Read the full raw-plan file into `plan` for inspection.
with open('tpcds_data/raw_plan.json') as plan_file:
    plan = json.load(plan_file)

In [7]:
# Display the statistics stored under 'database_stats' in the loaded plan file.
plan['database_stats']

{'column_stats': [{'tablename': 'dbgen_version',
   'attname': 'dv_version',
   'null_frac': 0.0,
   'avg_width': 7,
   'n_distinct': -1.0,
   'correlation': None,
   'data_type': 'character varying'},
  {'tablename': 'dbgen_version',
   'attname': 'dv_create_date',
   'null_frac': 0.0,
   'avg_width': 4,
   'n_distinct': -1.0,
   'correlation': None,
   'data_type': 'date'},
  {'tablename': 'dbgen_version',
   'attname': 'dv_create_time',
   'null_frac': 0.0,
   'avg_width': 8,
   'n_distinct': -1.0,
   'correlation': None,
   'data_type': 'time without time zone'},
  {'tablename': 'dbgen_version',
   'attname': 'dv_cmdline_args',
   'null_frac': 0.0,
   'avg_width': 6,
   'n_distinct': -1.0,
   'correlation': None,
   'data_type': 'character varying'},
  {'tablename': 'customer_address',
   'attname': 'ca_address_sk',
   'null_frac': 0.0,
   'avg_width': 4,
   'n_distinct': -1.0,
   'correlation': 0.9991675,
   'data_type': 'integer'},
  {'tablename': 'customer_address',
   'attname'

In [None]:
# Hand-written example of a single-node plan, using the same field names that
# PostgreSQL's EXPLAIN (FORMAT JSON) output uses.
raw_plan = {
    "Plan": {
        # Node identity
        "Node Type": "Seq Scan",
        "Relation Name": "employees",
        "Alias": "employees",
        # Planner estimates
        "Startup Cost": 0.0,
        "Total Cost": 183.0,
        "Plan Rows": 3700,
        "Plan Width": 2048,
        # Predicate applied by the scan
        "Filter": "salary > 50000",
    },
}


In [1]:
import pandas as pd

# Toy data set: two plan features per row plus the value to predict.
data = {
    'Plan Rows': [3700, 1000, 5000, 2000],
    'Plan Width': [2048, 1024, 512, 256],
    # Target variable
    'Peak Memory Usage': [120, 60, 150, 75],
}
df = pd.DataFrame(data=data)


In [2]:
import xgboost as xgb
from sklearn.model_selection import train_test_split

# Features and target taken from the sample frame.
feature_columns = ['Plan Rows', 'Plan Width']
X = df[feature_columns]
y = df['Peak Memory Usage']

# Hold out 20% of the rows for testing; with the four sample rows that is a
# single test row, so one prediction is printed below.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Regressor configuration, passed through to XGBRegressor unchanged.
xgb_params = {
    'objective': 'reg:squarederror',
    'colsample_bytree': 0.3,
    'learning_rate': 0.1,
    'max_depth': 5,
    'alpha': 10,
    'n_estimators': 10,
}
model = xgb.XGBRegressor(**xgb_params)

# Train on the training split, then predict the held-out rows.
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(y_pred)


Note: You have installed the 'manylinux2014' variant of XGBoost. Certain features such as GPU algorithms or federated learning are not available. To use these features, please upgrade to a recent Linux distro with glibc 2.28+, and install the 'manylinux_2_28' variant.


[109.48981]


In [3]:
from sklearn.metrics import mean_squared_error

# Mean squared error of the predictions on the held-out rows.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print(f"Mean Squared Error: {mse}")


Mean Squared Error: 2449.24100965634
