Skip to content
This repository was archived by the owner on Sep 27, 2019. It is now read-only.

Commit a96b376

Browse files
GustavoAngulo authored and apavlo committed
Optimizer refactor and cost model additions (#1484)
* Separate cost model from optimizer and add postgres cost model * Fix bug overflow in Analyze * Changes to cost model construction * Fix bug in stats hashing * Fix to commutativity of equality comparison expressions * Revert group equality, this works * Move Postgres cost model to header file and add some starting plan tests, along with more optimizer test utility functions * Remove printf * Remove old optimizer constructor * Fix unused variable * Testing if changing llvm path fixes travis * Did not work : ( * Ok trying changing to 3.9.1_2 * Revert "Fix bug overflow in Analyze" This reverts commit fcbf161. * Update LLVM dir in travis config * Add trivial cost model * Move files into stats folder * Add test cases for trivial cost model * Delete cost.h and cost.cpp that were commented out * Cost model name and directory refactoring * Fix three join test
1 parent ca7410f commit a96b376

31 files changed

+1143
-1214
lines changed

src/expression/abstract_expression.cpp

+7
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,13 @@ bool AbstractExpression::operator==(const AbstractExpression &rhs) const {
113113
if (exp_type_ != rhs.exp_type_ || children_.size() != rhs.children_.size())
114114
return false;
115115

116+
// TODO: Try sorting the children
117+
// TODO: Extend this to other comparison predicates
118+
if (exp_type_ == ExpressionType::COMPARE_EQUAL && children_.size() == 2 && rhs.children_.size() == 2) {
119+
return (*children_[0] == *rhs.children_[0] && *children_[1] == *rhs.children_[1]) ||
120+
(*children_[0] == *rhs.children_[1] && *children_[1] == *rhs.children_[0]);
121+
}
122+
116123
for (unsigned i = 0; i < children_.size(); i++) {
117124
if (*children_[i].get() != *rhs.children_[i].get()) return false;
118125
}

src/include/optimizer/cost_calculator.h

-63
This file was deleted.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
//===----------------------------------------------------------------------===//
2+
//
3+
// Peloton
4+
//
5+
// abstract_cost_model.h
6+
//
7+
// Identification: src/include/optimizer/cost_model/abstract_cost_model.h
8+
//
9+
// Copyright (c) 2015-2018, Carnegie Mellon University Database Group
10+
//
11+
//===----------------------------------------------------------------------===//
12+
13+
#pragma once
14+
15+
#include "optimizer/operator_visitor.h"
16+
17+
namespace peloton {
18+
namespace optimizer {
19+
20+
class Memo;
21+
22+
// Default cost when cost model cannot compute correct cost.
23+
static constexpr double DEFAULT_COST = 1;
24+
25+
// Estimate the cost of processing each row during a query.
// NOTE(review): the value equals PostgreSQL's default cpu_tuple_cost --
// presumably borrowed from there; confirm before retuning.
26+
static constexpr double DEFAULT_TUPLE_COST = 0.01;
27+
28+
// Estimate the cost of processing each index entry during an index scan.
// NOTE(review): the value equals PostgreSQL's default cpu_index_tuple_cost --
// presumably borrowed from there; confirm before retuning.
29+
static constexpr double DEFAULT_INDEX_TUPLE_COST = 0.005;
30+
31+
// Estimate the cost of processing each operator or function executed during a
32+
// query.
// NOTE(review): the value equals PostgreSQL's default cpu_operator_cost --
// presumably borrowed from there; confirm before retuning.
33+
static constexpr double DEFAULT_OPERATOR_COST = 0.0025;
34+
35+
class AbstractCostModel : public OperatorVisitor {
36+
public:
37+
virtual double CalculateCost(GroupExpression *gexpr, Memo *memo,
38+
concurrency::TransactionContext *txn) = 0;
39+
};
40+
41+
} // namespace optimizer
42+
} // namespace peloton
43+
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,161 @@
1+
//===----------------------------------------------------------------------===//
2+
//
3+
// Peloton
4+
//
5+
// cost_calculator.h
6+
//
7+
// Identification: src/include/optimizer/cost_calculator.h
8+
//
9+
// Copyright (c) 2015-2018, Carnegie Mellon University Database Group
10+
//
11+
//===----------------------------------------------------------------------===//
12+
13+
#pragma once
14+
15+
#include "optimizer/cost_model/abstract_cost_model.h"
16+
#include "expression/tuple_value_expression.h"
17+
#include "catalog/table_catalog.h"
18+
#include "optimizer/memo.h"
19+
#include "optimizer/operators.h"
20+
#include "optimizer/stats/stats_storage.h"
21+
#include "optimizer/stats/table_stats.h"
22+
23+
namespace peloton {
24+
namespace optimizer {
25+
26+
class Memo;
27+
// Derive cost for a physical group expression
28+
class DefaultCostModel : public AbstractCostModel {
29+
public:
30+
DefaultCostModel(){};
31+
32+
double CalculateCost(GroupExpression *gexpr, Memo *memo,
33+
concurrency::TransactionContext *txn) {
34+
gexpr_ = gexpr;
35+
memo_ = memo;
36+
txn_ = txn;
37+
gexpr_->Op().Accept(this);
38+
return output_cost_;
39+
}
40+
41+
void Visit(UNUSED_ATTRIBUTE const DummyScan *op) {
42+
output_cost_ = 0.f;
43+
}
44+
void Visit(const PhysicalSeqScan *op) {
45+
auto table_stats = std::dynamic_pointer_cast<TableStats>(
46+
StatsStorage::GetInstance()->GetTableStats(
47+
op->table_->GetDatabaseOid(), op->table_->GetTableOid(), txn_));
48+
if (table_stats->GetColumnCount() == 0) {
49+
output_cost_ = 1.f;
50+
return;
51+
}
52+
output_cost_ = table_stats->num_rows * DEFAULT_TUPLE_COST;
53+
}
54+
void Visit(UNUSED_ATTRIBUTE const PhysicalIndexScan *op) {
55+
auto table_stats = std::dynamic_pointer_cast<TableStats>(
56+
StatsStorage::GetInstance()->GetTableStats(
57+
op->table_->GetDatabaseOid(), op->table_->GetTableOid(), txn_));
58+
if (table_stats->GetColumnCount() == 0 || table_stats->num_rows == 0) {
59+
output_cost_ = 0.f;
60+
return;
61+
}
62+
// Index search cost + scan cost
63+
output_cost_ = std::log2(table_stats->num_rows) * DEFAULT_INDEX_TUPLE_COST +
64+
memo_->GetGroupByID(gexpr_->GetGroupID())->GetNumRows() *
65+
DEFAULT_TUPLE_COST;
66+
}
67+
68+
void Visit(UNUSED_ATTRIBUTE const QueryDerivedScan *op) {
69+
output_cost_ = 0.f;
70+
}
71+
72+
void Visit(const PhysicalOrderBy *) { SortCost(); }
73+
74+
void Visit(const PhysicalLimit *op) {
75+
auto child_num_rows =
76+
memo_->GetGroupByID(gexpr_->GetChildGroupId(0))->GetNumRows();
77+
78+
output_cost_ =
79+
std::min((size_t)child_num_rows, (size_t)op->limit) * DEFAULT_TUPLE_COST;
80+
}
81+
void Visit(UNUSED_ATTRIBUTE const PhysicalInnerNLJoin *op) {
82+
auto left_child_rows =
83+
memo_->GetGroupByID(gexpr_->GetChildGroupId(0))->GetNumRows();
84+
auto right_child_rows =
85+
memo_->GetGroupByID(gexpr_->GetChildGroupId(1))->GetNumRows();
86+
87+
output_cost_ = left_child_rows * right_child_rows * DEFAULT_TUPLE_COST;
88+
}
89+
void Visit(UNUSED_ATTRIBUTE const PhysicalLeftNLJoin *op) {}
90+
void Visit(UNUSED_ATTRIBUTE const PhysicalRightNLJoin *op) {}
91+
void Visit(UNUSED_ATTRIBUTE const PhysicalOuterNLJoin *op) {}
92+
void Visit(UNUSED_ATTRIBUTE const PhysicalInnerHashJoin *op) {
93+
auto left_child_rows =
94+
memo_->GetGroupByID(gexpr_->GetChildGroupId(0))->GetNumRows();
95+
auto right_child_rows =
96+
memo_->GetGroupByID(gexpr_->GetChildGroupId(1))->GetNumRows();
97+
// TODO(boweic): Build (left) table should have different cost to probe table
98+
output_cost_ = (left_child_rows + right_child_rows) * DEFAULT_TUPLE_COST;
99+
}
100+
void Visit(UNUSED_ATTRIBUTE const PhysicalLeftHashJoin *op) {}
101+
void Visit(UNUSED_ATTRIBUTE const PhysicalRightHashJoin *op) {}
102+
void Visit(UNUSED_ATTRIBUTE const PhysicalOuterHashJoin *op) {}
103+
void Visit(UNUSED_ATTRIBUTE const PhysicalInsert *op) {}
104+
void Visit(UNUSED_ATTRIBUTE const PhysicalInsertSelect *op) {}
105+
void Visit(UNUSED_ATTRIBUTE const PhysicalDelete *op) {}
106+
void Visit(UNUSED_ATTRIBUTE const PhysicalUpdate *op) {}
107+
void Visit(UNUSED_ATTRIBUTE const PhysicalHashGroupBy *op) {
108+
// TODO(boweic): Integrate hash in groupby may cause us to miss the
109+
// opportunity to further optimize some query where the child output is
110+
// already hashed by the GroupBy key, we'll do a hash anyway
111+
output_cost_ = HashCost() + GroupByCost();
112+
}
113+
void Visit(UNUSED_ATTRIBUTE const PhysicalSortGroupBy *op) {
114+
// Sort group by does not sort the tuples, it requires input columns to be
115+
// sorted
116+
output_cost_ = GroupByCost();
117+
}
118+
void Visit(UNUSED_ATTRIBUTE const PhysicalDistinct *op) {
119+
output_cost_ = HashCost();
120+
}
121+
void Visit(UNUSED_ATTRIBUTE const PhysicalAggregate *op) {
122+
// TODO(boweic): Ditto, separate groupby operator and implementation(e.g.
123+
// hash, sort) may enable opportunity for further optimization
124+
output_cost_ = HashCost() + GroupByCost();
125+
}
126+
127+
private:
128+
129+
double HashCost() {
130+
auto child_num_rows =
131+
memo_->GetGroupByID(gexpr_->GetChildGroupId(0))->GetNumRows();
132+
// O(tuple)
133+
return child_num_rows * DEFAULT_TUPLE_COST;
134+
}
135+
136+
double SortCost() {
137+
auto child_num_rows =
138+
memo_->GetGroupByID(gexpr_->GetChildGroupId(0))->GetNumRows();
139+
140+
if (child_num_rows == 0) {
141+
return 1.0f;
142+
}
143+
// O(tuple * log(tuple))
144+
return child_num_rows * std::log2(child_num_rows) * DEFAULT_TUPLE_COST;
145+
}
146+
147+
double GroupByCost() {
148+
auto child_num_rows =
149+
memo_->GetGroupByID(gexpr_->GetChildGroupId(0))->GetNumRows();
150+
// O(tuple)
151+
return child_num_rows * DEFAULT_TUPLE_COST;
152+
}
153+
154+
GroupExpression *gexpr_;
155+
Memo *memo_;
156+
concurrency::TransactionContext *txn_;
157+
double output_cost_ = 0;
158+
};
159+
160+
} // namespace optimizer
161+
} // namespace peloton

0 commit comments

Comments
 (0)