In [427]:
import collections

import torch
import tqdm
from tensorflow.python.keras.preprocessing.sequence import pad_sequences
from src.query_plan import QueryPlanNode, QueryPlan
import typing
import pickle
import os.path
import numpy as np
from src.cost_learner import CostLearnerNet
from src.config import DATA_PATH_PKL, DATA_PATH_MODEL
from src.embedding import Embedding, EmbeddingNet4Nodes


class CostLearner:
    """
        完成训练之后, 现在应当能对给定的查询计划进行代价预测了!
        对于任意的查询计划, 该类的一个实例可以给出查询代价的预测值
    """

    def __init__(self, vocab_encoding_dict: typing.Union[str, dict[str, np.ndarray]],
                 node_embedding_net_load_from: str, cost_learner_net_load_from: str):
        self.vocab_encode_mapping = pickle.load(open(vocab_encoding_dict, "rb"))
        self.embedding_net = EmbeddingNet4Nodes(load_from=node_embedding_net_load_from)
        # self.embedding_net.model.summary()
        self.cost_learner_net = torch.load(cost_learner_net_load_from)

    def __call__(self, _query_plan: typing.Union[np.ndarray, QueryPlan, torch.Tensor] = None) -> float:
        """
            对于任意的查询计划, 该类的一个实例可以给出查询代价的预测值
        """
        embedded_vec = self.embed(_query_plan) if isinstance(_query_plan, QueryPlan) else _query_plan
        _out = self.cost_learner_net(torch.tensor(embedded_vec, dtype=torch.float32).unsqueeze(0))
        return _out.detach().numpy()[0][0]

    @staticmethod
    def get_node_list_of_query_plan(qp: QueryPlan):
        def __callback(cur_node: QueryPlanNode, _list: list):
            _list.append(cur_node.to_vector())

        # # 1. 获取查询计划内所有被执行节点的描述
        node_list = list()
        qp.post_order(_callback=__callback, _list=node_list)
        # # 在pycharm外运行 设置draw = False
        qp.make_digraph(draw=False)
        return node_list

    def embed(self, qp: typing.Union[typing.List[typing.List[str]], QueryPlan], need_cost_label: bool = False):
        node_list = self.get_node_list_of_query_plan(qp) if isinstance(qp, QueryPlan) else qp
        # # 2. 做查询计划嵌入, 节点中的每一个词
        plan_seq = Embedding.vectorize_sentences(list(map(lambda x: x[:-1], node_list)),
                                                 self.vocab_encode_mapping)
        # # 扩充
        plan_seq = pad_sequences(plan_seq, padding="post", dtype=np.float32,
                                 maxlen=self.embedding_net.model.input_shape[1])
        embedded_vector = self.embedding_net(plan_seq)
        return embedded_vector if not need_cost_label else (embedded_vector, node_list[-1][-1])


c = CostLearner(os.path.join(DATA_PATH_PKL, "vocab_encode.pkl"),
                os.path.join(DATA_PATH_MODEL, "embedding_all_nodes.h5"),
                os.path.join(DATA_PATH_MODEL, "cost_model_9.model"))

In [431]:
query_plan_path = r"C:\Users\QQ863\Documents\Projects\PycharmProjects\DeepO\data\plan\5c"
query_plan = Embedding.load_from(_query_plan_path=query_plan_path)
query_plan

<src.query_plan.QueryPlan at 0x1ec8796a280>

In [455]:
def predict_cost_and_get_confidence_intervals(query_plan: QueryPlan, n_sample=10, ci_multiplier=6):
    """
        mean, std, diff, lower bound, upper bound, in_or_not
    """
    embedded_vec, cost = c.embed(query_plan, True)
    cost_norm = (5346925.973 - float(cost)) / 5346925.973
    pred_test = np.array(list(map(lambda x: c(embedded_vec), range(n_sample))))
    pred_test_mean = pred_test.mean()
    pred_std = pred_test.std(ddof=2)
    return dict(
        mean=pred_test_mean,
        std=pred_std,
        lower_bound=pred_test_mean - (pred_std * ci_multiplier),
        upper_bound=pred_test_mean + (pred_std * ci_multiplier),
        diff=pred_test_mean - cost_norm,
        ans=(pred_test_mean - (pred_std * ci_multiplier)) <= cost_norm <= (pred_test_mean + (pred_std * ci_multiplier))
    )


predict_cost_and_get_confidence_intervals(query_plan)

{'mean': 0.9890974,
 'std': 0.00039862338,
 'lower_bound': 0.9867056761286221,
 'upper_bound': 0.9914891566731967,
 'diff': -0.010854090689087936,
 'ans': False}

In [433]:
from src.config import DATA_PATH_PLANS_FOR_TRAIN

ans = []
qp_list = []
for query_plan_path in os.listdir(DATA_PATH_PLANS_FOR_TRAIN):
    full_name = os.path.join(DATA_PATH_PLANS_FOR_TRAIN, query_plan_path)
    with open(full_name, "r", encoding="utf-8") as f:
        qp = Embedding.load_from(_query_plan_raw=f.read())
        qp_list.append(qp)

In [434]:
import tqdm

for p in range(6, 30):
    ans = list(map(lambda qp: predict_cost_and_get_confidence_intervals(qp, ci_multiplier=p)['ans'], qp_list))
    print(p, len(list(filter(bool, ans))) / len(ans))

6 0.1504424778761062
7 0.19469026548672566
8 0.18584070796460178
9 0.25663716814159293
10 0.26548672566371684
11 0.2831858407079646
12 0.3805309734513274
13 0.4424778761061947
14 0.4336283185840708
15 0.6017699115044248
16 0.672566371681416
17 0.6283185840707964
18 0.8230088495575221
19 0.7522123893805309
20 0.8407079646017699
21 0.8938053097345132
22 0.8407079646017699
23 0.8938053097345132
24 0.9469026548672567
25 0.9203539823008849
26 0.9292035398230089
27 0.9557522123893806
28 0.9469026548672567
29 0.9734513274336283
