Skip to content

Commit

Permalink
Test NestedLoopJoin without filter in join fuzzer (#9923)
Browse files Browse the repository at this point in the history
Summary:

A correctness bug was found recently in NestedLoopJoin without filter (#9892). 
So this diff extends join fuzzer to cover this case with 10% chance. This diff 
also makes the build side input to contain rows that do not match the probe side.

Differential Revision: D57761514
  • Loading branch information
kagamiori authored and facebook-github-bot committed May 24, 2024
1 parent c5d10f5 commit e22fcb3
Showing 1 changed file with 83 additions and 8 deletions.
91 changes: 83 additions & 8 deletions velox/exec/fuzzer/JoinFuzzer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -221,6 +221,13 @@ class JoinFuzzer {
const std::vector<RowVectorPtr>& buildInput,
const core::PlanNodePtr& plan);

// Generates and executes plans using NestedLoopJoin without filters. The
// result is compared to DuckDB.
void testCrossProduct(
core::JoinType joinType,
const std::vector<RowVectorPtr>& probeInput,
const std::vector<RowVectorPtr>& buildInput);

int32_t randInt(int32_t min, int32_t max) {
return boost::random::uniform_int_distribution<int32_t>(min, max)(rng_);
}
Expand Down Expand Up @@ -361,20 +368,23 @@ std::vector<RowVectorPtr> JoinFuzzer::generateBuildInput(
// probeInput.

// To ensure there are some matches, sample with replacement 10% of probe join
// keys and use these as build keys.
// TODO Add a few random rows as well.
// keys and use these as 80% of build keys. The rest build keys are randomly
// generated.
std::vector<RowVectorPtr> input;
for (const auto& probe : probeInput) {
auto numRows = 1 + probe->size() / 10;
auto build = BaseVector::create<RowVector>(rowType, numRows, probe->pool());
auto numRows = 1 + probe->size() / 8;
auto build = vectorFuzzer_.fuzzRow(rowType, numRows, false);

// Pick probe side rows to copy.
std::vector<vector_size_t> rowNumbers(numRows);
SelectivityVector rows(numRows, false);
for (auto i = 0; i < numRows; ++i) {
rowNumbers[i] = randInt(0, probe->size() - 1);
if (vectorFuzzer_.coinToss(0.8) && probe->size() > 0) {
rowNumbers[i] = randInt(0, probe->size() - 1);
rows.setValid(i, true);
}
}

SelectivityVector rows(numRows);
for (auto i = 0; i < probeKeys.size(); ++i) {
build->childAt(i)->resize(numRows);
build->childAt(i)->copy(probe->childAt(i).get(), rows, rowNumbers.data());
Expand Down Expand Up @@ -968,6 +978,51 @@ bool isTableScanSupported(const TypePtr& type) {
return true;
}

void JoinFuzzer::testCrossProduct(
core::JoinType joinType,
const std::vector<RowVectorPtr>& probeInput,
const std::vector<RowVectorPtr>& buildInput) {
auto outputColumns =
concat(asRowType(probeInput[0]->type()), asRowType(buildInput[0]->type()))
->names();

auto planNodeIdGenerator = std::make_shared<core::PlanNodeIdGenerator>();
auto plan = JoinFuzzer::PlanWithSplits{
PlanBuilder(planNodeIdGenerator)
.values(probeInput)
.nestedLoopJoin(
PlanBuilder(planNodeIdGenerator).values(buildInput).planNode(),
"",
outputColumns,
joinType)
.planNode()};

const auto expected = execute(plan, /*injectSpill=*/false);

// If OOM injection is not enabled verify the results against DuckDB.
if (!FLAGS_enable_oom_injection) {
if (auto duckDbResult =
computeDuckDbResult(probeInput, buildInput, plan.plan)) {
VELOX_CHECK(
assertEqualResults(
duckDbResult.value(), plan.plan->outputType(), {expected}),
"Velox and DuckDB results don't match");
}
}

std::vector<PlanWithSplits> altPlans;
addFlippedJoinPlan<core::NestedLoopJoinNode>(plan.plan, altPlans);

for (const auto& plan : altPlans) {
auto actual = execute(plan, /*injectSpill=*/false);
if (actual != nullptr && expected != nullptr) {
VELOX_CHECK(
assertEqualResults({expected}, {actual}),
"Logically equivalent plans produced different results");
}
}
}

void JoinFuzzer::verify(core::JoinType joinType) {
const bool nullAware =
isNullAwareSupported(joinType) && vectorFuzzer_.coinToss(0.5);
Expand Down Expand Up @@ -998,6 +1053,18 @@ void JoinFuzzer::verify(core::JoinType joinType) {
}
}

// Test cross product without filter with 10% chance. Avoid testing cross
// product if input size is too large.
if ((core::isInnerJoin(joinType) || core::isLeftJoin(joinType) ||
core::isFullJoin(joinType)) &&
FLAGS_batch_size * FLAGS_num_batches <= 10000) {
if (vectorFuzzer_.coinToss(0.1)) {
testCrossProduct(joinType, probeInput, buildInput);
testCrossProduct(joinType, flatProbeInput, flatBuildInput);
return;
}
}

auto outputColumns =
(core::isLeftSemiProjectJoin(joinType) ||
core::isLeftSemiFilterJoin(joinType) || core::isAntiJoin(joinType))
Expand Down Expand Up @@ -1173,10 +1240,18 @@ void JoinFuzzer::addPlansWithTableScan(
const int32_t numGroups = randInt(1, probeScanSplits.size());
const std::vector<exec::Split> groupedProbeScanSplits =
generateSplitsWithGroup(
tableDir, numGroups, /*isProbe=*/true, probeKeys.size(), probeInput);
tableDir,
numGroups,
/*isProbe=*/true,
probeKeys.size(),
probeInput);
const std::vector<exec::Split> groupedBuildScanSplits =
generateSplitsWithGroup(
tableDir, numGroups, /*isProbe=*/false, buildKeys.size(), buildInput);
tableDir,
numGroups,
/*isProbe=*/false,
buildKeys.size(),
buildInput);

for (const auto& planWithTableScan : plansWithTableScan) {
altPlans.push_back(planWithTableScan);
Expand Down

0 comments on commit e22fcb3

Please sign in to comment.