Skip to content
Permalink
Browse files

Optimizer refactor and cost model additions (#1484)

* Separate cost model from optimizer and add postgres cost model

* Fix bug overflow in Analyze

* Changes to cost model construction

* Fix bug in stats hashing

* Fix to commutativity of equality comparison expressions

* Revert group equality, this works

* Move Postgres cost model to header file and add some starting plan tests, along with more optimizer test utility functions

* Remove printf

* Remove old optimizer constructor

* Fix unused variable

* Testing if changing llvm path fixes travis

* Did not work : (

* Ok trying changing to 3.9.1_2

* Revert "Fix bug overflow in Analyze"

This reverts commit fcbf161.

* Update LLVM dir in travis config

* Add trivial cost model

* Move files into stats folder

* Add test cases for trivial cost model

* Delete cost.h and cost.cpp that were commented out

* Cost model name and directory refactoring

* Fix three join test
  • Loading branch information...
GustavoAngulo authored and apavlo committed Feb 13, 2019
1 parent ca7410f commit a96b3769c2aa3148d96f17f433a3fa2b16ed8c78
Showing with 1,143 additions and 1,214 deletions.
  1. +7 −0 src/expression/abstract_expression.cpp
  2. +0 −63 src/include/optimizer/cost_calculator.h
  3. +43 −0 src/include/optimizer/cost_model/abstract_cost_model.h
  4. +161 −0 src/include/optimizer/cost_model/default_cost_model.h
  5. +282 −0 src/include/optimizer/cost_model/postgres_cost_model.h
  6. +119 −0 src/include/optimizer/cost_model/trivial_cost_model.h
  7. +1 −1 src/include/optimizer/group_expression.h
  8. +5 −1 src/include/optimizer/optimizer.h
  9. +8 −5 src/include/optimizer/optimizer_metadata.h
  10. 0 src/include/optimizer/{ → stats}/child_stats_deriver.h
  11. +108 −106 src/include/optimizer/stats/column_stats.h
  12. +0 −192 src/include/optimizer/stats/cost.h
  13. +1 −1 src/include/optimizer/{ → stats}/stats.h
  14. 0 src/include/optimizer/{ → stats}/stats_calculator.h
  15. +2 −2 src/include/optimizer/stats/stats_util.h
  16. +1 −1 src/include/optimizer/stats/table_stats.h
  17. 0 src/include/optimizer/{ → stats}/tuple_sample.h
  18. +0 −151 src/optimizer/cost_calculator.cpp
  19. +1 −13 src/optimizer/group_expression.cpp
  20. +1 −1 src/optimizer/memo.cpp
  21. +25 −2 src/optimizer/optimizer.cpp
  22. +6 −10 src/optimizer/optimizer_task.cpp
  23. +1 −1 src/optimizer/{ → stats}/child_stats_deriver.cpp
  24. +0 −461 src/optimizer/stats/cost.cpp
  25. +1 −1 src/optimizer/{ → stats}/stats.cpp
  26. +1 −1 src/optimizer/{ → stats}/stats_calculator.cpp
  27. +1 −1 src/optimizer/{ → stats}/tuple_sample.cpp
  28. +3 −3 src/traffic_cop/traffic_cop.cpp
  29. +0 −195 test/optimizer/cost_test.cpp
  30. +79 −2 test/optimizer/optimizer_test_util.cpp
  31. +286 −0 test/optimizer/plan_test.cpp
@@ -113,6 +113,13 @@ bool AbstractExpression::operator==(const AbstractExpression &rhs) const {
if (exp_type_ != rhs.exp_type_ || children_.size() != rhs.children_.size())
return false;

// TODO: Try sorting the children
// TODO: Extend this to other comparison predicates
if (exp_type_ == ExpressionType::COMPARE_EQUAL && children_.size() == 2 && rhs.children_.size() == 2) {
return (*children_[0] == *rhs.children_[0] && *children_[1] == *rhs.children_[1]) ||
(*children_[0] == *rhs.children_[1] && *children_[1] == *rhs.children_[0]);
}

for (unsigned i = 0; i < children_.size(); i++) {
if (*children_[i].get() != *rhs.children_[i].get()) return false;
}

This file was deleted.

Oops, something went wrong.
@@ -0,0 +1,43 @@
//===----------------------------------------------------------------------===//
//
// Peloton
//
// abstract_cost_model.h
//
// Identification: src/include/optimizer/cost_model/abstract_cost_model.h
//
// Copyright (c) 2015-2018, Carnegie Mellon University Database Group
//
//===----------------------------------------------------------------------===//

#pragma once

#include "optimizer/operator_visitor.h"

namespace peloton {
namespace optimizer {

class Memo;

// Fallback cost reported when a cost model cannot compute a meaningful cost
// for an operator.
static constexpr double DEFAULT_COST = 1;

// Estimated cost of processing a single row (tuple) during a query.
// NOTE(review): 0.01 matches PostgreSQL's default cpu_tuple_cost; presumably
// borrowed from there -- confirm.
static constexpr double DEFAULT_TUPLE_COST = 0.01;

// Estimated cost of processing a single index entry during an index scan.
// NOTE(review): 0.005 matches PostgreSQL's default cpu_index_tuple_cost;
// presumably borrowed from there -- confirm.
static constexpr double DEFAULT_INDEX_TUPLE_COST = 0.005;

// Estimated cost of evaluating a single operator or function during a query.
// NOTE(review): 0.0025 matches PostgreSQL's default cpu_operator_cost;
// presumably borrowed from there -- confirm.
static constexpr double DEFAULT_OPERATOR_COST = 0.0025;

// Interface implemented by every optimizer cost model.
//
// A cost model is an OperatorVisitor: a concrete model supplies the Visit()
// overloads for the operators it knows how to cost and exposes the result
// through CalculateCost().
class AbstractCostModel : public OperatorVisitor {
 public:
  // Virtual destructor so that a concrete cost model can be destroyed safely
  // through an AbstractCostModel pointer, independent of how the base visitor
  // declares its own destructor.
  virtual ~AbstractCostModel() = default;

  // Computes the estimated cost of a group expression.
  //
  // @param gexpr  group expression to cost
  // @param memo   memo passed through to the model (presumably the one that
  //               owns gexpr's groups -- confirm against callers)
  // @param txn    transaction context passed through to the model
  // @return       the estimated cost as a double; units are model-defined
  virtual double CalculateCost(GroupExpression *gexpr, Memo *memo,
                               concurrency::TransactionContext *txn) = 0;
};

} // namespace optimizer
} // namespace peloton

@@ -0,0 +1,161 @@
//===----------------------------------------------------------------------===//
//
// Peloton
//
// default_cost_model.h
//
// Identification: src/include/optimizer/cost_model/default_cost_model.h
//
// Copyright (c) 2015-2018, Carnegie Mellon University Database Group
//
//===----------------------------------------------------------------------===//

#pragma once

#include "optimizer/cost_model/abstract_cost_model.h"
#include "expression/tuple_value_expression.h"
#include "catalog/table_catalog.h"
#include "optimizer/memo.h"
#include "optimizer/operators.h"
#include "optimizer/stats/stats_storage.h"
#include "optimizer/stats/table_stats.h"

namespace peloton {
namespace optimizer {

class Memo;
// Derive cost for a physical group expression
class DefaultCostModel : public AbstractCostModel {
public:
DefaultCostModel(){};

double CalculateCost(GroupExpression *gexpr, Memo *memo,
concurrency::TransactionContext *txn) {
gexpr_ = gexpr;
memo_ = memo;
txn_ = txn;
gexpr_->Op().Accept(this);
return output_cost_;
}

void Visit(UNUSED_ATTRIBUTE const DummyScan *op) {
output_cost_ = 0.f;
}
void Visit(const PhysicalSeqScan *op) {
auto table_stats = std::dynamic_pointer_cast<TableStats>(
StatsStorage::GetInstance()->GetTableStats(
op->table_->GetDatabaseOid(), op->table_->GetTableOid(), txn_));
if (table_stats->GetColumnCount() == 0) {
output_cost_ = 1.f;
return;
}
output_cost_ = table_stats->num_rows * DEFAULT_TUPLE_COST;
}
void Visit(UNUSED_ATTRIBUTE const PhysicalIndexScan *op) {
auto table_stats = std::dynamic_pointer_cast<TableStats>(
StatsStorage::GetInstance()->GetTableStats(
op->table_->GetDatabaseOid(), op->table_->GetTableOid(), txn_));
if (table_stats->GetColumnCount() == 0 || table_stats->num_rows == 0) {
output_cost_ = 0.f;
return;
}
// Index search cost + scan cost
output_cost_ = std::log2(table_stats->num_rows) * DEFAULT_INDEX_TUPLE_COST +
memo_->GetGroupByID(gexpr_->GetGroupID())->GetNumRows() *
DEFAULT_TUPLE_COST;
}

void Visit(UNUSED_ATTRIBUTE const QueryDerivedScan *op) {
output_cost_ = 0.f;
}

void Visit(const PhysicalOrderBy *) { SortCost(); }

void Visit(const PhysicalLimit *op) {
auto child_num_rows =
memo_->GetGroupByID(gexpr_->GetChildGroupId(0))->GetNumRows();

output_cost_ =
std::min((size_t)child_num_rows, (size_t)op->limit) * DEFAULT_TUPLE_COST;
}
void Visit(UNUSED_ATTRIBUTE const PhysicalInnerNLJoin *op) {
auto left_child_rows =
memo_->GetGroupByID(gexpr_->GetChildGroupId(0))->GetNumRows();
auto right_child_rows =
memo_->GetGroupByID(gexpr_->GetChildGroupId(1))->GetNumRows();

output_cost_ = left_child_rows * right_child_rows * DEFAULT_TUPLE_COST;
}
void Visit(UNUSED_ATTRIBUTE const PhysicalLeftNLJoin *op) {}
void Visit(UNUSED_ATTRIBUTE const PhysicalRightNLJoin *op) {}
void Visit(UNUSED_ATTRIBUTE const PhysicalOuterNLJoin *op) {}
void Visit(UNUSED_ATTRIBUTE const PhysicalInnerHashJoin *op) {
auto left_child_rows =
memo_->GetGroupByID(gexpr_->GetChildGroupId(0))->GetNumRows();
auto right_child_rows =
memo_->GetGroupByID(gexpr_->GetChildGroupId(1))->GetNumRows();
// TODO(boweic): Build (left) table should have different cost to probe table
output_cost_ = (left_child_rows + right_child_rows) * DEFAULT_TUPLE_COST;
}
void Visit(UNUSED_ATTRIBUTE const PhysicalLeftHashJoin *op) {}
void Visit(UNUSED_ATTRIBUTE const PhysicalRightHashJoin *op) {}
void Visit(UNUSED_ATTRIBUTE const PhysicalOuterHashJoin *op) {}
void Visit(UNUSED_ATTRIBUTE const PhysicalInsert *op) {}
void Visit(UNUSED_ATTRIBUTE const PhysicalInsertSelect *op) {}
void Visit(UNUSED_ATTRIBUTE const PhysicalDelete *op) {}
void Visit(UNUSED_ATTRIBUTE const PhysicalUpdate *op) {}
void Visit(UNUSED_ATTRIBUTE const PhysicalHashGroupBy *op) {
// TODO(boweic): Integrate hash in groupby may cause us to miss the
// opportunity to further optimize some query where the child output is
// already hashed by the GroupBy key, we'll do a hash anyway
output_cost_ = HashCost() + GroupByCost();
}
void Visit(UNUSED_ATTRIBUTE const PhysicalSortGroupBy *op) {
// Sort group by does not sort the tuples, it requires input columns to be
// sorted
output_cost_ = GroupByCost();
}
void Visit(UNUSED_ATTRIBUTE const PhysicalDistinct *op) {
output_cost_ = HashCost();
}
void Visit(UNUSED_ATTRIBUTE const PhysicalAggregate *op) {
// TODO(boweic): Ditto, separate groupby operator and implementation(e.g.
// hash, sort) may enable opportunity for further optimization
output_cost_ = HashCost() + GroupByCost();
}

private:

double HashCost() {
auto child_num_rows =
memo_->GetGroupByID(gexpr_->GetChildGroupId(0))->GetNumRows();
// O(tuple)
return child_num_rows * DEFAULT_TUPLE_COST;
}

double SortCost() {
auto child_num_rows =
memo_->GetGroupByID(gexpr_->GetChildGroupId(0))->GetNumRows();

if (child_num_rows == 0) {
return 1.0f;
}
// O(tuple * log(tuple))
return child_num_rows * std::log2(child_num_rows) * DEFAULT_TUPLE_COST;
}

double GroupByCost() {
auto child_num_rows =
memo_->GetGroupByID(gexpr_->GetChildGroupId(0))->GetNumRows();
// O(tuple)
return child_num_rows * DEFAULT_TUPLE_COST;
}

GroupExpression *gexpr_;
Memo *memo_;
concurrency::TransactionContext *txn_;
double output_cost_ = 0;
};

} // namespace optimizer
} // namespace peloton
Oops, something went wrong.

0 comments on commit a96b376

Please sign in to comment.
You can’t perform that action at this time.