Skip to content
This repository has been archived by the owner on Sep 27, 2019. It is now read-only.

Commit

Permalink
Browse files Browse the repository at this point in the history
Optimizer refactor and cost model additions (#1484)
* Separate cost model from optimizer and add postgres cost model

* Fix bug overflow in Analyze

* Changes to cost model construction

* Fix bug in stats hashing

* Fix to commutativity of equality comparison expressions

* Revert group equality, this works

* Move Postgres cost model to header file and add some starting plan tests, along with more optimizer test utility functions

* Remove printf

* Remove old optimizer constructor

* Fix unused variable

* Testing if changing llvm path fixes travis

* Did not work : (

* Ok trying changing to 3.9.1_2

* Revert "Fix bug overflow in Analyze"

This reverts commit fcbf161.

* Update LLVM dir in travis config

* Add trivial cost model

* Move files into stats folder

* Add test cases for trivial cost model

* Delete cost.h and cost.cpp that were commented out

* Cost model name and directory refactoring

* Fix three join test
  • Loading branch information
GustavoAngulo authored and apavlo committed Feb 13, 2019
1 parent ca7410f commit a96b376
Show file tree
Hide file tree
Showing 31 changed files with 1,143 additions and 1,214 deletions.
7 changes: 7 additions & 0 deletions src/expression/abstract_expression.cpp
Expand Up @@ -113,6 +113,13 @@ bool AbstractExpression::operator==(const AbstractExpression &rhs) const {
if (exp_type_ != rhs.exp_type_ || children_.size() != rhs.children_.size())
return false;

// TODO: Try sorting the children
// TODO: Extend this to other comparison predicates
if (exp_type_ == ExpressionType::COMPARE_EQUAL && children_.size() == 2 && rhs.children_.size() == 2) {
return (*children_[0] == *rhs.children_[0] && *children_[1] == *rhs.children_[1]) ||
(*children_[0] == *rhs.children_[1] && *children_[1] == *rhs.children_[0]);
}

for (unsigned i = 0; i < children_.size(); i++) {
if (*children_[i].get() != *rhs.children_[i].get()) return false;
}
Expand Down
63 changes: 0 additions & 63 deletions src/include/optimizer/cost_calculator.h

This file was deleted.

43 changes: 43 additions & 0 deletions src/include/optimizer/cost_model/abstract_cost_model.h
@@ -0,0 +1,43 @@
//===----------------------------------------------------------------------===//
//
// Peloton
//
// abstract_cost_model.h
//
// Identification: src/include/optimizer/cost_model/abstract_cost_model.h
//
// Copyright (c) 2015-2018, Carnegie Mellon University Database Group
//
//===----------------------------------------------------------------------===//

#pragma once

#include "optimizer/operator_visitor.h"

namespace peloton {
namespace optimizer {

class Memo;

// Fallback cost used when a cost model is unable to compute a real estimate.
static constexpr double DEFAULT_COST = 1.0;

// Per-row processing cost charged while executing a query.
static constexpr double DEFAULT_TUPLE_COST = 0.01;

// Per-index-entry processing cost charged during an index scan.
static constexpr double DEFAULT_INDEX_TUPLE_COST = 0.005;

// Cost charged for every operator or function evaluated while executing a
// query.
static constexpr double DEFAULT_OPERATOR_COST = 0.0025;

// Interface for optimizer cost models. A cost model is an OperatorVisitor
// that additionally exposes CalculateCost() as its single entry point.
class AbstractCostModel : public OperatorVisitor {
 public:
  // Virtual so that destroying a concrete cost model through an
  // AbstractCostModel pointer is always well-defined.
  virtual ~AbstractCostModel() = default;

  // Computes the cost of one group expression.
  //
  // gexpr: the group expression to cost.
  // memo:  the memo made available to the model while costing.
  // txn:   the transaction context made available to the model while costing.
  //
  // Returns the cost as a double; how it is derived is up to the concrete
  // model.
  virtual double CalculateCost(GroupExpression *gexpr, Memo *memo,
                               concurrency::TransactionContext *txn) = 0;
};

} // namespace optimizer
} // namespace peloton

161 changes: 161 additions & 0 deletions src/include/optimizer/cost_model/default_cost_model.h
@@ -0,0 +1,161 @@
//===----------------------------------------------------------------------===//
//
// Peloton
//
// default_cost_model.h
//
// Identification: src/include/optimizer/cost_model/default_cost_model.h
//
// Copyright (c) 2015-2018, Carnegie Mellon University Database Group
//
//===----------------------------------------------------------------------===//

#pragma once

#include "optimizer/cost_model/abstract_cost_model.h"

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <memory>

#include "expression/tuple_value_expression.h"
#include "catalog/table_catalog.h"
#include "optimizer/memo.h"
#include "optimizer/operators.h"
#include "optimizer/stats/stats_storage.h"
#include "optimizer/stats/table_stats.h"

namespace peloton {
namespace optimizer {

class Memo;
// Derive cost for a physical group expression
class DefaultCostModel : public AbstractCostModel {
public:
DefaultCostModel(){};

double CalculateCost(GroupExpression *gexpr, Memo *memo,
concurrency::TransactionContext *txn) {
gexpr_ = gexpr;
memo_ = memo;
txn_ = txn;
gexpr_->Op().Accept(this);
return output_cost_;
}

void Visit(UNUSED_ATTRIBUTE const DummyScan *op) {
output_cost_ = 0.f;
}
void Visit(const PhysicalSeqScan *op) {
auto table_stats = std::dynamic_pointer_cast<TableStats>(
StatsStorage::GetInstance()->GetTableStats(
op->table_->GetDatabaseOid(), op->table_->GetTableOid(), txn_));
if (table_stats->GetColumnCount() == 0) {
output_cost_ = 1.f;
return;
}
output_cost_ = table_stats->num_rows * DEFAULT_TUPLE_COST;
}
void Visit(UNUSED_ATTRIBUTE const PhysicalIndexScan *op) {
auto table_stats = std::dynamic_pointer_cast<TableStats>(
StatsStorage::GetInstance()->GetTableStats(
op->table_->GetDatabaseOid(), op->table_->GetTableOid(), txn_));
if (table_stats->GetColumnCount() == 0 || table_stats->num_rows == 0) {
output_cost_ = 0.f;
return;
}
// Index search cost + scan cost
output_cost_ = std::log2(table_stats->num_rows) * DEFAULT_INDEX_TUPLE_COST +
memo_->GetGroupByID(gexpr_->GetGroupID())->GetNumRows() *
DEFAULT_TUPLE_COST;
}

void Visit(UNUSED_ATTRIBUTE const QueryDerivedScan *op) {
output_cost_ = 0.f;
}

void Visit(const PhysicalOrderBy *) { SortCost(); }

void Visit(const PhysicalLimit *op) {
auto child_num_rows =
memo_->GetGroupByID(gexpr_->GetChildGroupId(0))->GetNumRows();

output_cost_ =
std::min((size_t)child_num_rows, (size_t)op->limit) * DEFAULT_TUPLE_COST;
}
void Visit(UNUSED_ATTRIBUTE const PhysicalInnerNLJoin *op) {
auto left_child_rows =
memo_->GetGroupByID(gexpr_->GetChildGroupId(0))->GetNumRows();
auto right_child_rows =
memo_->GetGroupByID(gexpr_->GetChildGroupId(1))->GetNumRows();

output_cost_ = left_child_rows * right_child_rows * DEFAULT_TUPLE_COST;
}
void Visit(UNUSED_ATTRIBUTE const PhysicalLeftNLJoin *op) {}
void Visit(UNUSED_ATTRIBUTE const PhysicalRightNLJoin *op) {}
void Visit(UNUSED_ATTRIBUTE const PhysicalOuterNLJoin *op) {}
void Visit(UNUSED_ATTRIBUTE const PhysicalInnerHashJoin *op) {
auto left_child_rows =
memo_->GetGroupByID(gexpr_->GetChildGroupId(0))->GetNumRows();
auto right_child_rows =
memo_->GetGroupByID(gexpr_->GetChildGroupId(1))->GetNumRows();
// TODO(boweic): Build (left) table should have different cost to probe table
output_cost_ = (left_child_rows + right_child_rows) * DEFAULT_TUPLE_COST;
}
void Visit(UNUSED_ATTRIBUTE const PhysicalLeftHashJoin *op) {}
void Visit(UNUSED_ATTRIBUTE const PhysicalRightHashJoin *op) {}
void Visit(UNUSED_ATTRIBUTE const PhysicalOuterHashJoin *op) {}
void Visit(UNUSED_ATTRIBUTE const PhysicalInsert *op) {}
void Visit(UNUSED_ATTRIBUTE const PhysicalInsertSelect *op) {}
void Visit(UNUSED_ATTRIBUTE const PhysicalDelete *op) {}
void Visit(UNUSED_ATTRIBUTE const PhysicalUpdate *op) {}
void Visit(UNUSED_ATTRIBUTE const PhysicalHashGroupBy *op) {
// TODO(boweic): Integrate hash in groupby may cause us to miss the
// opportunity to further optimize some query where the child output is
// already hashed by the GroupBy key, we'll do a hash anyway
output_cost_ = HashCost() + GroupByCost();
}
void Visit(UNUSED_ATTRIBUTE const PhysicalSortGroupBy *op) {
// Sort group by does not sort the tuples, it requires input columns to be
// sorted
output_cost_ = GroupByCost();
}
void Visit(UNUSED_ATTRIBUTE const PhysicalDistinct *op) {
output_cost_ = HashCost();
}
void Visit(UNUSED_ATTRIBUTE const PhysicalAggregate *op) {
// TODO(boweic): Ditto, separate groupby operator and implementation(e.g.
// hash, sort) may enable opportunity for further optimization
output_cost_ = HashCost() + GroupByCost();
}

private:

double HashCost() {
auto child_num_rows =
memo_->GetGroupByID(gexpr_->GetChildGroupId(0))->GetNumRows();
// O(tuple)
return child_num_rows * DEFAULT_TUPLE_COST;
}

double SortCost() {
auto child_num_rows =
memo_->GetGroupByID(gexpr_->GetChildGroupId(0))->GetNumRows();

if (child_num_rows == 0) {
return 1.0f;
}
// O(tuple * log(tuple))
return child_num_rows * std::log2(child_num_rows) * DEFAULT_TUPLE_COST;
}

double GroupByCost() {
auto child_num_rows =
memo_->GetGroupByID(gexpr_->GetChildGroupId(0))->GetNumRows();
// O(tuple)
return child_num_rows * DEFAULT_TUPLE_COST;
}

GroupExpression *gexpr_;
Memo *memo_;
concurrency::TransactionContext *txn_;
double output_cost_ = 0;
};

} // namespace optimizer
} // namespace peloton

0 comments on commit a96b376

Please sign in to comment.