From 97b89312511045a9e1d41290de42ad18ec25d338 Mon Sep 17 00:00:00 2001 From: zhang2014 Date: Wed, 1 Oct 2025 11:11:34 +0800 Subject: [PATCH 01/24] refactor(query): refactor code struct --- .../src/physical_plans/physical_hash_join.rs | 7 +- .../processors/transforms/hash_join/desc.rs | 10 +- .../transforms/hash_join/hash_join_state.rs | 605 +----------------- .../new_hash_join/hashtable/basic.rs | 107 ++++ .../new_hash_join/hashtable/fixed_keys.rs | 204 ++++++ .../transforms/new_hash_join/hashtable/mod.rs | 53 ++ .../new_hash_join/hashtable/serialize_keys.rs | 241 +++++++ .../hashtable/single_binary_key.rs | 147 +++++ .../transforms/new_hash_join/join.rs | 6 +- .../new_hash_join/memory/memory_inner_join.rs | 120 ++-- .../transforms/new_hash_join/mod.rs | 2 + .../transforms/new_hash_join/performance.rs | 31 + .../new_hash_join/transform_hash_join.rs | 29 +- 13 files changed, 881 insertions(+), 681 deletions(-) create mode 100644 src/query/service/src/pipelines/processors/transforms/new_hash_join/hashtable/basic.rs create mode 100644 src/query/service/src/pipelines/processors/transforms/new_hash_join/hashtable/fixed_keys.rs create mode 100644 src/query/service/src/pipelines/processors/transforms/new_hash_join/hashtable/mod.rs create mode 100644 src/query/service/src/pipelines/processors/transforms/new_hash_join/hashtable/serialize_keys.rs create mode 100644 src/query/service/src/pipelines/processors/transforms/new_hash_join/hashtable/single_binary_key.rs create mode 100644 src/query/service/src/pipelines/processors/transforms/new_hash_join/performance.rs diff --git a/src/query/service/src/physical_plans/physical_hash_join.rs b/src/query/service/src/physical_plans/physical_hash_join.rs index a24133e2c81c8..2ad39f89dfede 100644 --- a/src/query/service/src/physical_plans/physical_hash_join.rs +++ b/src/query/service/src/physical_plans/physical_hash_join.rs @@ -395,6 +395,8 @@ impl HashJoin { desc: Arc, ) -> Result<()> { let state = Arc::new(HashJoinMemoryState::create()); + // We must build the runtime filter before constructing the child nodes, + // as we will inject some runtime filter information into the context for the child nodes to use. let rf_desc = PlanRuntimeFilterDesc::create(&builder.ctx, self); if let Some((build_cache_index, _)) = self.build_side_cache_info { @@ -467,7 +469,7 @@ impl HashJoin { builder: &mut PipelineBuilder, desc: Arc, state: Arc, - ) -> Result> { + ) -> Result> { let hash_key_types = self .build_keys .iter() @@ -489,9 +491,6 @@ impl HashJoin { builder.func_ctx.clone(), method, desc, - self.build_projections.clone(), - self.probe_projections.clone(), - self.probe_to_build.clone(), state, )?)) } diff --git a/src/query/service/src/pipelines/processors/transforms/hash_join/desc.rs b/src/query/service/src/pipelines/processors/transforms/hash_join/desc.rs index 403ff553e342f..547ed16e80df7 100644 --- a/src/query/service/src/pipelines/processors/transforms/hash_join/desc.rs +++ b/src/query/service/src/pipelines/processors/transforms/hash_join/desc.rs @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +use std::sync::Arc; use databend_common_column::bitmap::Bitmap; use databend_common_exception::Result; use databend_common_expression::arrow::and_validities; @@ -26,7 +27,7 @@ use databend_common_expression::RemoteExpr; use databend_common_functions::BUILTIN_FUNCTIONS; use databend_common_sql::executor::cast_expr_to_non_null_boolean; use parking_lot::RwLock; - +use databend_common_sql::ColumnSet; use crate::physical_plans::HashJoin; use crate::physical_plans::PhysicalRuntimeFilter; use crate::physical_plans::PhysicalRuntimeFilters; @@ -55,6 +56,10 @@ pub struct HashJoinDesc { /// Whether the Join are derived from correlated subquery. pub(crate) from_correlated_subquery: bool, pub(crate) runtime_filter: RuntimeFiltersDesc, + + pub(crate) build_projection: ColumnSet, + pub(crate) probe_projections: ColumnSet, + pub(crate) probe_to_build: Vec<(usize, (bool, bool))>, } #[derive(Debug, Clone)] @@ -122,6 +127,9 @@ impl HashJoinDesc { from_correlated_subquery: join.from_correlated_subquery, single_to_inner: join.single_to_inner.clone(), runtime_filter: (&join.runtime_filter).into(), + probe_to_build: join.probe_to_build.clone(), + build_projection: join.build_projections.clone(), + probe_projections: join.probe_projections.clone(), }) } diff --git a/src/query/service/src/pipelines/processors/transforms/hash_join/hash_join_state.rs b/src/query/service/src/pipelines/processors/transforms/hash_join/hash_join_state.rs index ef8d05eceb223..a7bdd05010431 100644 --- a/src/query/service/src/pipelines/processors/transforms/hash_join/hash_join_state.rs +++ b/src/query/service/src/pipelines/processors/transforms/hash_join/hash_join_state.rs @@ -23,28 +23,23 @@ use std::sync::Arc; use databend_common_base::base::tokio::sync::watch; use databend_common_base::base::tokio::sync::watch::Receiver; use databend_common_base::base::tokio::sync::watch::Sender; +use databend_common_base::hints::assume; use databend_common_catalog::table_context::TableContext; -use databend_common_column::bitmap::Bitmap; use databend_common_exception::ErrorCode; use databend_common_exception::Result; use databend_common_expression::types::DataType; use databend_common_expression::BlockEntry; -use databend_common_expression::Column; use databend_common_expression::ColumnVec; use databend_common_expression::DataBlock; use databend_common_expression::DataSchemaRef; use databend_common_expression::DataSchemaRefExt; use databend_common_expression::FixedKey; -use databend_common_expression::HashMethod; use databend_common_expression::HashMethodFixedKeys; use databend_common_expression::HashMethodSerializer; use databend_common_expression::HashMethodSingleBinary; use databend_common_expression::KeyAccessor; -use databend_common_expression::KeysState; -use databend_common_expression::ProjectedBlock; use databend_common_hashtable::BinaryHashJoinHashMap; use databend_common_hashtable::HashJoinHashMap; -use databend_common_hashtable::HashJoinHashtableLike; use databend_common_hashtable::HashtableKeyable; use databend_common_hashtable::RawEntry; use databend_common_hashtable::RowPtr; @@ -64,22 +59,22 @@ use crate::sessions::QueryContext; use crate::sql::IndexType; pub struct SerializerHashJoinHashTable { - probed_rows: AtomicUsize, - matched_probe_rows: AtomicUsize, + pub(crate) probed_rows: AtomicUsize, + pub(crate) matched_probe_rows: AtomicUsize, pub(crate) hash_table: BinaryHashJoinHashMap, pub(crate) hash_method: HashMethodSerializer, } pub struct SingleBinaryHashJoinHashTable { - probed_rows: AtomicUsize, - matched_probe_rows: AtomicUsize, + pub(crate) probed_rows: AtomicUsize, + pub(crate) matched_probe_rows: AtomicUsize, pub(crate) hash_table: BinaryHashJoinHashMap, pub(crate) hash_method: HashMethodSingleBinary, } pub struct FixedKeyHashJoinHashTable { - probed_rows: AtomicUsize, - matched_probe_rows: AtomicUsize, + pub(crate) probed_rows: AtomicUsize, + pub(crate) matched_probe_rows: AtomicUsize, pub(crate) hash_table: HashJoinHashMap, pub(crate) hash_method: HashMethodFixedKeys, } @@ -346,590 +341,4 @@ impl HashJoinState { } } -impl FixedKeyHashJoinHashTable { - pub fn new(hash_table: HashJoinHashMap, hash_method: HashMethodFixedKeys) -> Self { - FixedKeyHashJoinHashTable:: { - hash_table, - hash_method, - probed_rows: Default::default(), - matched_probe_rows: Default::default(), - } - } - - pub fn insert(&self, keys: DataBlock, chunk: usize, arena: &mut Vec) -> Result<()> { - let num_rows = keys.num_rows(); - let keys = ProjectedBlock::from(keys.columns()); - let keys_state = self.hash_method.build_keys_state(keys, num_rows)?; - let build_keys_iter = self.hash_method.build_keys_iter(&keys_state)?; - - let entry_size = std::mem::size_of::>(); - arena.reserve(num_rows * entry_size); - - let mut raw_entry_ptr = - unsafe { std::mem::transmute::<*mut u8, *mut RawEntry>(arena.as_mut_ptr()) }; - - for (row_index, key) in build_keys_iter.enumerate() { - let row_ptr = RowPtr { - chunk_index: chunk as u32, - row_index: row_index as u32, - }; - - // # Safety - // The memory address of `raw_entry_ptr` is valid. - unsafe { - *raw_entry_ptr = RawEntry { - row_ptr, - key: *key, - next: 0, - } - } - - self.hash_table.insert(*key, raw_entry_ptr); - raw_entry_ptr = unsafe { raw_entry_ptr.add(1) }; - } - - Ok(()) - } - - pub fn probe_keys( - &self, - keys: DataBlock, - valids: Option, - ) -> Result> { - let num_rows = keys.num_rows(); - let hash_method = &self.hash_method; - let mut hashes = Vec::with_capacity(num_rows); - - let keys = ProjectedBlock::from(keys.columns()); - let keys_state = hash_method.build_keys_state(keys, num_rows)?; - hash_method.build_keys_hashes(&keys_state, &mut hashes); - let keys = hash_method.build_keys_accessor(keys_state.clone())?; - - let enable_early_filtering = match self.probed_rows.load(Ordering::Relaxed) { - 0 => false, - probed_rows => { - let matched_probe_rows = self.matched_probe_rows.load(Ordering::Relaxed) as f64; - matched_probe_rows / (probed_rows as f64) < 0.8 - } - }; - - self.probed_rows.fetch_add( - match &valids { - None => num_rows, - Some(valids) => valids.len() - valids.null_count(), - }, - Ordering::Relaxed, - ); - - match enable_early_filtering { - true => { - let mut selection = vec![0; num_rows]; - - match self.hash_table.early_filtering_matched_probe( - &mut hashes, - valids, - &mut selection, - ) { - 0 => Ok(AllUnmatchedProbeStream::create(hashes.len())), - _ => Ok(FixedKeysProbeStream::create(hashes, keys)), - } - } - false => match self.hash_table.probe(&mut hashes, valids) { - 0 => Ok(AllUnmatchedProbeStream::create(hashes.len())), - _ => Ok(FixedKeysProbeStream::create(hashes, keys)), - }, - } - } -} - -impl SerializerHashJoinHashTable { - pub fn new( - hash_table: BinaryHashJoinHashMap, - hash_method: HashMethodSerializer, - ) -> SerializerHashJoinHashTable { - SerializerHashJoinHashTable { - hash_table, - hash_method, - probed_rows: AtomicUsize::new(0), - matched_probe_rows: AtomicUsize::new(0), - } - } - - pub fn insert(&self, keys: DataBlock, chunk: usize, arena: &mut Vec) -> Result<()> { - let num_rows = keys.num_rows(); - let keys = ProjectedBlock::from(keys.columns()); - let keys_state = self.hash_method.build_keys_state(keys, num_rows)?; - let build_keys_iter = self.hash_method.build_keys_iter(&keys_state)?; - - let space_size = match &keys_state { - // safe to unwrap(): offset.len() >= 1. - KeysState::Column(Column::Bitmap(col)) => col.data().len(), - KeysState::Column(Column::Binary(col)) => col.data().len(), - KeysState::Column(Column::Variant(col)) => col.data().len(), - KeysState::Column(Column::String(col)) => col.total_bytes_len(), - _ => unreachable!(), - }; - - static ENTRY_SIZE: usize = std::mem::size_of::(); - arena.reserve(num_rows * ENTRY_SIZE + space_size); - - let (mut raw_entry_ptr, mut string_local_space_ptr) = unsafe { - ( - std::mem::transmute::<*mut u8, *mut StringRawEntry>(arena.as_mut_ptr()), - arena.as_mut_ptr().add(num_rows * ENTRY_SIZE), - ) - }; - - for (row_index, key) in build_keys_iter.enumerate() { - let row_ptr = RowPtr { - chunk_index: chunk as u32, - row_index: row_index as u32, - }; - - // # Safety - // The memory address of `raw_entry_ptr` is valid. - // string_offset + key.len() <= space_size. - unsafe { - (*raw_entry_ptr).row_ptr = row_ptr; - (*raw_entry_ptr).length = key.len() as u32; - (*raw_entry_ptr).next = 0; - (*raw_entry_ptr).key = string_local_space_ptr; - // The size of `early` is 4. - std::ptr::copy_nonoverlapping( - key.as_ptr(), - (*raw_entry_ptr).early.as_mut_ptr(), - std::cmp::min(STRING_EARLY_SIZE, key.len()), - ); - std::ptr::copy_nonoverlapping(key.as_ptr(), string_local_space_ptr, key.len()); - string_local_space_ptr = string_local_space_ptr.add(key.len()); - } - - self.hash_table.insert(key, raw_entry_ptr); - raw_entry_ptr = unsafe { raw_entry_ptr.add(1) }; - } - - Ok(()) - } - - pub fn probe_keys( - &self, - keys: DataBlock, - valids: Option, - ) -> Result> { - let num_rows = keys.num_rows(); - let hash_method = &self.hash_method; - let mut hashes = Vec::with_capacity(num_rows); - - let keys = ProjectedBlock::from(keys.columns()); - let keys_state = hash_method.build_keys_state(keys, num_rows)?; - hash_method.build_keys_hashes(&keys_state, &mut hashes); - let keys = hash_method.build_keys_accessor(keys_state.clone())?; - - let enable_early_filtering = match self.probed_rows.load(Ordering::Relaxed) { - 0 => false, - probed_rows => { - let matched_probe_rows = self.matched_probe_rows.load(Ordering::Relaxed) as f64; - matched_probe_rows / (probed_rows as f64) < 0.8 - } - }; - - self.probed_rows.fetch_add( - match &valids { - None => keys.len(), - Some(valids) => valids.len() - valids.null_count(), - }, - Ordering::Relaxed, - ); - - match enable_early_filtering { - true => { - let mut selection = vec![0; keys.len()]; - - match self.hash_table.early_filtering_matched_probe( - &mut hashes, - valids, - &mut selection, - ) { - 0 => Ok(AllUnmatchedProbeStream::create(hashes.len())), - _ => Ok(BinaryKeyProbeStream::create(hashes, keys)), - } - } - false => match self.hash_table.probe(&mut hashes, valids) { - 0 => Ok(AllUnmatchedProbeStream::create(hashes.len())), - _ => Ok(BinaryKeyProbeStream::create(hashes, keys)), - }, - } - } -} - -impl SingleBinaryHashJoinHashTable { - pub fn new( - hash_table: BinaryHashJoinHashMap, - hash_method: HashMethodSingleBinary, - ) -> SingleBinaryHashJoinHashTable { - SingleBinaryHashJoinHashTable { - hash_table, - hash_method, - probed_rows: AtomicUsize::new(0), - matched_probe_rows: AtomicUsize::new(0), - } - } - - pub fn insert(&self, keys: DataBlock, chunk: usize, arena: &mut Vec) -> Result<()> { - let num_rows = keys.num_rows(); - let keys = ProjectedBlock::from(keys.columns()); - let keys_state = self.hash_method.build_keys_state(keys, num_rows)?; - let build_keys_iter = self.hash_method.build_keys_iter(&keys_state)?; - - let space_size = match &keys_state { - // safe to unwrap(): offset.len() >= 1. - KeysState::Column(Column::Bitmap(col)) => col.data().len(), - KeysState::Column(Column::Binary(col)) => col.data().len(), - KeysState::Column(Column::Variant(col)) => col.data().len(), - KeysState::Column(Column::String(col)) => col.total_bytes_len(), - _ => unreachable!(), - }; - - static ENTRY_SIZE: usize = std::mem::size_of::(); - arena.reserve(num_rows * ENTRY_SIZE + space_size); - - let (mut raw_entry_ptr, mut string_local_space_ptr) = unsafe { - ( - std::mem::transmute::<*mut u8, *mut StringRawEntry>(arena.as_mut_ptr()), - arena.as_mut_ptr().add(num_rows * ENTRY_SIZE), - ) - }; - - for (row_index, key) in build_keys_iter.enumerate() { - let row_ptr = RowPtr { - chunk_index: chunk as u32, - row_index: row_index as u32, - }; - - // # Safety - // The memory address of `raw_entry_ptr` is valid. - // string_offset + key.len() <= space_size. - unsafe { - (*raw_entry_ptr).row_ptr = row_ptr; - (*raw_entry_ptr).length = key.len() as u32; - (*raw_entry_ptr).next = 0; - (*raw_entry_ptr).key = string_local_space_ptr; - // The size of `early` is 4. - std::ptr::copy_nonoverlapping( - key.as_ptr(), - (*raw_entry_ptr).early.as_mut_ptr(), - std::cmp::min(STRING_EARLY_SIZE, key.len()), - ); - std::ptr::copy_nonoverlapping(key.as_ptr(), string_local_space_ptr, key.len()); - string_local_space_ptr = string_local_space_ptr.add(key.len()); - } - - self.hash_table.insert(key, raw_entry_ptr); - raw_entry_ptr = unsafe { raw_entry_ptr.add(1) }; - } - - Ok(()) - } - - pub fn probe_keys( - &self, - keys: DataBlock, - valids: Option, - ) -> Result> { - let num_rows = keys.num_rows(); - let hash_method = &self.hash_method; - let mut hashes = Vec::with_capacity(num_rows); - - let keys = ProjectedBlock::from(keys.columns()); - let keys_state = hash_method.build_keys_state(keys, num_rows)?; - hash_method.build_keys_hashes(&keys_state, &mut hashes); - let keys = hash_method.build_keys_accessor(keys_state.clone())?; - - let enable_early_filtering = match self.probed_rows.load(Ordering::Relaxed) { - 0 => false, - probed_rows => { - let matched_probe_rows = self.matched_probe_rows.load(Ordering::Relaxed) as f64; - matched_probe_rows / (probed_rows as f64) < 0.8 - } - }; - - self.probed_rows.fetch_add( - match &valids { - None => keys.len(), - Some(valids) => valids.len() - valids.null_count(), - }, - Ordering::Relaxed, - ); - - match enable_early_filtering { - true => { - let mut selection = vec![0; keys.len()]; - - match self.hash_table.early_filtering_matched_probe( - &mut hashes, - valids, - &mut selection, - ) { - 0 => Ok(AllUnmatchedProbeStream::create(hashes.len())), - _ => Ok(BinaryKeyProbeStream::create(hashes, keys)), - } - } - false => match self.hash_table.probe(&mut hashes, valids) { - 0 => Ok(AllUnmatchedProbeStream::create(hashes.len())), - _ => Ok(BinaryKeyProbeStream::create(hashes, keys)), - }, - } - } -} - -pub struct ProbeKeysResult { - pub unmatched: Vec, - pub matched_probe: Vec, - pub matched_build: Vec, -} - -impl ProbeKeysResult { - pub fn empty() -> ProbeKeysResult { - ProbeKeysResult::new(vec![], vec![], vec![]) - } - - pub fn new( - unmatched: Vec, - matched_probe: Vec, - matched_build: Vec, - ) -> ProbeKeysResult { - assert_eq!(matched_build.len(), matched_probe.len()); - - ProbeKeysResult { - unmatched, - matched_probe, - matched_build, - } - } - - pub fn is_empty(&self) -> bool { - self.matched_build.is_empty() && self.unmatched.is_empty() - } - - pub fn is_all_unmatched(&self) -> bool { - self.matched_build.is_empty() && !self.unmatched.is_empty() - } - - pub fn all_unmatched(unmatched: Vec) -> ProbeKeysResult { - ProbeKeysResult::new(unmatched, vec![], vec![]) - } -} - -pub trait ProbeStream { - fn next(&mut self, max_rows: usize) -> Result; -} - -pub struct AllUnmatchedProbeStream { - idx: usize, - size: usize, -} - -impl AllUnmatchedProbeStream { - pub fn create(size: usize) -> Box { - Box::new(AllUnmatchedProbeStream { idx: 0, size }) - } -} - -impl ProbeStream for AllUnmatchedProbeStream { - fn next(&mut self, max_rows: usize) -> Result { - if self.idx >= self.size { - return Ok(ProbeKeysResult::empty()); - } - - let res = std::cmp::min(self.size - self.idx, max_rows); - let res = (self.idx..self.idx + res).collect::>(); - self.idx += res.len(); - Ok(ProbeKeysResult::all_unmatched(res)) - } -} - -pub struct FixedKeysProbeStream { - key_idx: usize, - pointers: Vec, - keys: Box<(dyn KeyAccessor)>, - probe_entry_ptr: u64, -} - -impl FixedKeysProbeStream { - pub fn create( - pointers: Vec, - keys: Box>, - ) -> Box { - Box::new(FixedKeysProbeStream { - keys, - pointers, - key_idx: 0, - probe_entry_ptr: 0, - }) - } -} - -impl ProbeStream for FixedKeysProbeStream { - fn next(&mut self, max_rows: usize) -> Result { - unsafe { - let mut matched_build = Vec::with_capacity(max_rows); - let mut matched_probe = Vec::with_capacity(max_rows); - let mut unmatched = Vec::with_capacity(max_rows); - - while self.key_idx < self.keys.len() { - std::hint::assert_unchecked(unmatched.len() <= unmatched.capacity()); - std::hint::assert_unchecked(matched_probe.len() == matched_build.len()); - std::hint::assert_unchecked(matched_build.len() <= matched_build.capacity()); - std::hint::assert_unchecked(matched_probe.len() <= matched_probe.capacity()); - - if matched_probe.len() == max_rows { - break; - } - - if self.probe_entry_ptr == 0 { - self.probe_entry_ptr = *self.pointers.get_unchecked(self.key_idx); - - if self.probe_entry_ptr == 0 { - unmatched.push(self.key_idx); - self.key_idx += 1; - continue; - } - } - - let key = self.keys.key_unchecked(self.key_idx); - - while self.probe_entry_ptr != 0 { - let raw_entry = &*(self.probe_entry_ptr as *mut RawEntry); - - if key == &raw_entry.key { - let row_ptr = raw_entry.row_ptr; - matched_probe.push(self.key_idx as u64); - matched_build.push(row_ptr); - - if matched_probe.len() == max_rows { - self.probe_entry_ptr = raw_entry.next; - - if self.probe_entry_ptr == 0 { - self.key_idx += 1; - } - - return Ok(ProbeKeysResult::new( - unmatched, - matched_probe, - matched_build, - )); - } - } - - self.probe_entry_ptr = raw_entry.next; - } - - self.key_idx += 1; - } - - Ok(ProbeKeysResult::new( - unmatched, - matched_probe, - matched_build, - )) - } - } -} - -struct BinaryKeyProbeStream { - key_idx: usize, - pointers: Vec, - keys: Box<(dyn KeyAccessor)>, - probe_entry_ptr: u64, -} -impl BinaryKeyProbeStream { - pub fn create( - pointers: Vec, - keys: Box>, - ) -> Box { - Box::new(BinaryKeyProbeStream { - keys, - pointers, - key_idx: 0, - probe_entry_ptr: 0, - }) - } -} - -impl ProbeStream for BinaryKeyProbeStream { - fn next(&mut self, max_rows: usize) -> Result { - unsafe { - let mut matched_build = Vec::with_capacity(max_rows); - let mut matched_probe = Vec::with_capacity(max_rows); - let mut unmatched = Vec::with_capacity(max_rows); - - while self.key_idx < self.keys.len() { - std::hint::assert_unchecked(unmatched.len() <= unmatched.capacity()); - std::hint::assert_unchecked(matched_probe.len() == matched_build.len()); - std::hint::assert_unchecked(matched_build.len() <= matched_build.capacity()); - std::hint::assert_unchecked(matched_probe.len() <= matched_probe.capacity()); - - if matched_probe.len() == max_rows { - break; - } - - if self.probe_entry_ptr == 0 { - self.probe_entry_ptr = *self.pointers.get_unchecked(self.key_idx); - - if self.probe_entry_ptr == 0 { - unmatched.push(self.key_idx); - self.key_idx += 1; - continue; - } - } - - let key = self.keys.key_unchecked(self.key_idx); - - while self.probe_entry_ptr != 0 { - let raw_entry = &*(self.probe_entry_ptr as *mut StringRawEntry); - // Compare `early` and the length of the string, the size of `early` is 4. - let min_len = std::cmp::min(STRING_EARLY_SIZE, key.len()); - - if raw_entry.length as usize == key.len() - && key[0..min_len] == raw_entry.early[0..min_len] - { - let key_ref = std::slice::from_raw_parts( - raw_entry.key as *const u8, - raw_entry.length as usize, - ); - if key == key_ref { - let row_ptr = raw_entry.row_ptr; - matched_probe.push(self.key_idx as u64); - matched_build.push(row_ptr); - - if matched_probe.len() == max_rows { - self.probe_entry_ptr = raw_entry.next; - - if self.probe_entry_ptr == 0 { - self.key_idx += 1; - } - - return Ok(ProbeKeysResult::new( - unmatched, - matched_probe, - matched_build, - )); - } - } - } - - self.probe_entry_ptr = raw_entry.next; - } - - self.key_idx += 1; - } - - Ok(ProbeKeysResult::new( - unmatched, - matched_probe, - matched_build, - )) - } - } -} diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/hashtable/basic.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/hashtable/basic.rs new file mode 100644 index 0000000000000..44b97bf86a254 --- /dev/null +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/hashtable/basic.rs @@ -0,0 +1,107 @@ +// Copyright 2021 Datafuse Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use databend_common_exception::Result; +use databend_common_hashtable::RowPtr; + +pub struct PerformanceStatistics { + matched_hash: usize, +} + +pub struct ProbedRows { + pub unmatched: Vec, + pub matched_probe: Vec, + pub matched_build: Vec, + + // pub performance_statistics: PerformanceStatistics, +} + +impl ProbedRows { + pub fn empty() -> ProbedRows { + ProbedRows::new(vec![], vec![], vec![]) + } + + pub fn new( + unmatched: Vec, + matched_probe: Vec, + matched_build: Vec, + ) -> ProbedRows { + assert_eq!(matched_build.len(), matched_probe.len()); + + ProbedRows { + unmatched, + matched_probe, + matched_build, + } + } + + pub fn is_empty(&self) -> bool { + self.matched_build.is_empty() && self.unmatched.is_empty() + } + + pub fn is_all_unmatched(&self) -> bool { + self.matched_build.is_empty() && !self.unmatched.is_empty() + } + + pub fn all_unmatched(unmatched: Vec) -> ProbedRows { + ProbedRows::new(unmatched, vec![], vec![]) + } +} + +pub trait ProbeStream { + fn next(&mut self, max_rows: usize) -> Result; + + fn both(&mut self, rows: &mut ProbedRows, max_rows: usize) -> Result<()> { + unimplemented!() + } + + fn matched(&mut self, res: &mut ProbedRows, max_rows: usize) -> Result<()> { + unimplemented!() + } +} + +pub struct AllUnmatchedProbeStream { + idx: usize, + size: usize, +} + +impl AllUnmatchedProbeStream { + pub fn create(size: usize) -> Box { + Box::new(AllUnmatchedProbeStream { idx: 0, size }) + } +} + +impl ProbeStream for AllUnmatchedProbeStream { + fn next(&mut self, max_rows: usize) -> Result { + if self.idx >= self.size { + return Ok(ProbedRows::empty()); + } + + let res = std::cmp::min(self.size - self.idx, max_rows); + let res = (self.idx..self.idx + res).collect::>(); + self.idx += res.len(); + Ok(ProbedRows::all_unmatched(res)) + } + + fn matched(&mut self, res: &mut ProbedRows, max_rows: usize) -> Result<()> { + if self.idx >= self.size { + return Ok(()); + } + + let unmatched_rows = std::cmp::min(self.size - self.idx, max_rows); + res.unmatched.extend(self.idx..self.idx + unmatched_rows); + self.idx += unmatched_rows; + Ok(()) + } +} diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/hashtable/fixed_keys.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/hashtable/fixed_keys.rs new file mode 100644 index 0000000000000..a0418ad19cba0 --- /dev/null +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/hashtable/fixed_keys.rs @@ -0,0 +1,204 @@ +// Copyright 2021 Datafuse Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::sync::atomic::Ordering; + +use databend_common_base::hints::assume; +use databend_common_exception::Result; +use databend_common_expression::DataBlock; +use databend_common_expression::FixedKey; +use databend_common_expression::HashMethod; +use databend_common_expression::HashMethodFixedKeys; +use databend_common_expression::KeyAccessor; +use databend_common_expression::ProjectedBlock; +use databend_common_hashtable::HashJoinHashMap; +use databend_common_hashtable::HashJoinHashtableLike; +use databend_common_hashtable::HashtableKeyable; +use databend_common_hashtable::RawEntry; +use databend_common_hashtable::RowPtr; + +use crate::pipelines::processors::transforms::new_hash_join::hashtable::basic::{AllUnmatchedProbeStream, ProbeStream}; +use crate::pipelines::processors::transforms::new_hash_join::hashtable::basic::ProbedRows; +use crate::pipelines::processors::transforms::new_hash_join::hashtable::ProbeData; +use crate::pipelines::processors::transforms::FixedKeyHashJoinHashTable; + +impl FixedKeyHashJoinHashTable { + pub fn new(hash_table: HashJoinHashMap, hash_method: HashMethodFixedKeys) -> Self { + FixedKeyHashJoinHashTable:: { + hash_table, + hash_method, + probed_rows: Default::default(), + matched_probe_rows: Default::default(), + } + } + + pub fn insert(&self, keys: DataBlock, chunk: usize, arena: &mut Vec) -> Result<()> { + let num_rows = keys.num_rows(); + let keys = ProjectedBlock::from(keys.columns()); + let keys_state = self.hash_method.build_keys_state(keys, num_rows)?; + let build_keys_iter = self.hash_method.build_keys_iter(&keys_state)?; + + let entry_size = std::mem::size_of::>(); + arena.reserve(num_rows * entry_size); + + let mut raw_entry_ptr = + unsafe { std::mem::transmute::<*mut u8, *mut RawEntry>(arena.as_mut_ptr()) }; + + for (row_index, key) in build_keys_iter.enumerate() { + let row_ptr = RowPtr { + chunk_index: chunk as u32, + row_index: row_index as u32, + }; + + // # Safety + // The memory address of `raw_entry_ptr` is valid. + unsafe { + *raw_entry_ptr = RawEntry { + row_ptr, + key: *key, + next: 0, + } + } + + self.hash_table.insert(*key, raw_entry_ptr); + raw_entry_ptr = unsafe { raw_entry_ptr.add(1) }; + } + + Ok(()) + } + + pub fn probe(&self, probe_data: ProbeData) -> Result> { + let num_rows = probe_data.num_rows(); + let hash_method = &self.hash_method; + let mut hashes = Vec::with_capacity(num_rows); + + let keys = ProjectedBlock::from(probe_data.columns()); + let keys_state = hash_method.build_keys_state(keys, num_rows)?; + hash_method.build_keys_hashes(&keys_state, &mut hashes); + let keys = hash_method.build_keys_accessor(keys_state.clone())?; + + let enable_early_filtering = match self.probed_rows.load(Ordering::Relaxed) { + 0 => false, + probed_rows => { + let matched_probe_rows = self.matched_probe_rows.load(Ordering::Relaxed) as f64; + matched_probe_rows / (probed_rows as f64) < 0.8 + } + }; + + let probed_rows = probe_data.non_null_rows(); + self.probed_rows.fetch_add(probed_rows, Ordering::Relaxed); + + let (_, valids) = probe_data.into_raw(); + + match enable_early_filtering { + true => { + let mut selection = vec![0; num_rows]; + + match self.hash_table.early_filtering_matched_probe( + &mut hashes, + valids, + &mut selection, + ) { + 0 => Ok(AllUnmatchedProbeStream::create(hashes.len())), + _ => Ok(FixedKeysProbeStream::create(hashes, keys)), + } + } + false => match self.hash_table.probe(&mut hashes, valids) { + 0 => Ok(AllUnmatchedProbeStream::create(hashes.len())), + _ => Ok(FixedKeysProbeStream::create(hashes, keys)), + }, + } + } +} + +pub struct FixedKeysProbeStream { + key_idx: usize, + pointers: Vec, + keys: Box<(dyn KeyAccessor)>, + probe_entry_ptr: u64, +} + +impl FixedKeysProbeStream { + pub fn create( + pointers: Vec, + keys: Box>, + ) -> Box { + Box::new(FixedKeysProbeStream { + keys, + pointers, + key_idx: 0, + probe_entry_ptr: 0, + }) + } +} + +impl ProbeStream for FixedKeysProbeStream { + fn next(&mut self, max_rows: usize) -> Result { + unsafe { + let mut matched_build = Vec::with_capacity(max_rows); + let mut matched_probe = Vec::with_capacity(max_rows); + let mut unmatched = Vec::with_capacity(max_rows); + + while self.key_idx < self.keys.len() { + assume(unmatched.len() <= unmatched.capacity()); + assume(matched_probe.len() == matched_build.len()); + assume(matched_build.len() <= matched_build.capacity()); + assume(matched_probe.len() <= matched_probe.capacity()); + assume(self.key_idx < self.pointers.len()); + + if matched_probe.len() == max_rows { + break; + } + + if self.probe_entry_ptr == 0 { + self.probe_entry_ptr = self.pointers[self.key_idx]; + + if self.probe_entry_ptr == 0 { + unmatched.push(self.key_idx); + self.key_idx += 1; + continue; + } + } + + let key = self.keys.key_unchecked(self.key_idx); + + while self.probe_entry_ptr != 0 { + let raw_entry = &*(self.probe_entry_ptr as *mut RawEntry); + + if key == &raw_entry.key { + let row_ptr = raw_entry.row_ptr; + matched_probe.push(self.key_idx as u64); + matched_build.push(row_ptr); + + if matched_probe.len() == max_rows { + self.probe_entry_ptr = raw_entry.next; + + if self.probe_entry_ptr == 0 { + self.key_idx += 1; + } + + return Ok(ProbedRows::new(unmatched, matched_probe, matched_build)); + } + } + + self.probe_entry_ptr = raw_entry.next; + } + + self.key_idx += 1; + } + + Ok(ProbedRows::new(unmatched, matched_probe, matched_build)) + } + } +} diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/hashtable/mod.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/hashtable/mod.rs new file mode 100644 index 0000000000000..7d1adaee30c22 --- /dev/null +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/hashtable/mod.rs @@ -0,0 +1,53 @@ +// Copyright 2021 Datafuse Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +mod fixed_keys; + +mod serialize_keys; +mod single_binary_key; +pub mod basic; + +use databend_common_column::bitmap::Bitmap; +use databend_common_expression::BlockEntry; +use databend_common_expression::DataBlock; + +pub struct ProbeData { + keys: DataBlock, + valids: Option, +} + +impl ProbeData { + pub fn new(keys: DataBlock, valids: Option) -> Self { + Self { keys, valids } + } + + pub fn num_rows(&self) -> usize { + self.keys.num_rows() + } + + pub fn columns(&self) -> &[BlockEntry] { + self.keys.columns() + } + + pub fn non_null_rows(&self) -> usize { + match &self.valids { + None => self.keys.num_rows(), + Some(valids) => valids.len() - valids.null_count(), + } + } + + pub fn into_raw(self) -> (DataBlock, Option) { + (self.keys, self.valids) + } +} diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/hashtable/serialize_keys.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/hashtable/serialize_keys.rs new file mode 100644 index 0000000000000..a99fc53c279a6 --- /dev/null +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/hashtable/serialize_keys.rs @@ -0,0 +1,241 @@ +// Copyright 2021 Datafuse Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::sync::atomic::AtomicUsize; +use std::sync::atomic::Ordering; + +use databend_common_exception::Result; +use databend_common_expression::Column; +use databend_common_expression::DataBlock; +use databend_common_expression::HashMethod; +use databend_common_expression::HashMethodSerializer; +use databend_common_expression::KeyAccessor; +use databend_common_expression::KeysState; +use databend_common_expression::ProjectedBlock; +use databend_common_hashtable::BinaryHashJoinHashMap; +use databend_common_hashtable::HashJoinHashtableLike; +use databend_common_hashtable::RowPtr; +use databend_common_hashtable::StringRawEntry; +use databend_common_hashtable::STRING_EARLY_SIZE; + +use crate::pipelines::processors::transforms::new_hash_join::hashtable::basic::{AllUnmatchedProbeStream, ProbeStream}; +use crate::pipelines::processors::transforms::new_hash_join::hashtable::basic::ProbedRows; +use crate::pipelines::processors::transforms::new_hash_join::hashtable::ProbeData; +use crate::pipelines::processors::transforms::SerializerHashJoinHashTable; + +impl SerializerHashJoinHashTable { + pub fn new( + hash_table: BinaryHashJoinHashMap, + hash_method: HashMethodSerializer, + ) -> SerializerHashJoinHashTable { + SerializerHashJoinHashTable { + hash_table, + hash_method, + probed_rows: AtomicUsize::new(0), + matched_probe_rows: AtomicUsize::new(0), + } + } + + pub fn insert(&self, keys: DataBlock, chunk: usize, arena: &mut Vec) -> Result<()> { + let num_rows = keys.num_rows(); + let keys = ProjectedBlock::from(keys.columns()); + let keys_state = self.hash_method.build_keys_state(keys, num_rows)?; + let build_keys_iter = self.hash_method.build_keys_iter(&keys_state)?; + + let space_size = match &keys_state { + // safe to unwrap(): offset.len() >= 1. + KeysState::Column(Column::Bitmap(col)) => col.data().len(), + KeysState::Column(Column::Binary(col)) => col.data().len(), + KeysState::Column(Column::Variant(col)) => col.data().len(), + KeysState::Column(Column::String(col)) => col.total_bytes_len(), + _ => unreachable!(), + }; + + static ENTRY_SIZE: usize = std::mem::size_of::(); + arena.reserve(num_rows * ENTRY_SIZE + space_size); + + let (mut raw_entry_ptr, mut string_local_space_ptr) = unsafe { + ( + std::mem::transmute::<*mut u8, *mut StringRawEntry>(arena.as_mut_ptr()), + arena.as_mut_ptr().add(num_rows * ENTRY_SIZE), + ) + }; + + for (row_index, key) in build_keys_iter.enumerate() { + let row_ptr = RowPtr { + chunk_index: chunk as u32, + row_index: row_index as u32, + }; + + // # Safety + // The memory address of `raw_entry_ptr` is valid. + // string_offset + key.len() <= space_size. + unsafe { + (*raw_entry_ptr).row_ptr = row_ptr; + (*raw_entry_ptr).length = key.len() as u32; + (*raw_entry_ptr).next = 0; + (*raw_entry_ptr).key = string_local_space_ptr; + // The size of `early` is 4. + std::ptr::copy_nonoverlapping( + key.as_ptr(), + (*raw_entry_ptr).early.as_mut_ptr(), + std::cmp::min(STRING_EARLY_SIZE, key.len()), + ); + std::ptr::copy_nonoverlapping(key.as_ptr(), string_local_space_ptr, key.len()); + string_local_space_ptr = string_local_space_ptr.add(key.len()); + } + + self.hash_table.insert(key, raw_entry_ptr); + raw_entry_ptr = unsafe { raw_entry_ptr.add(1) }; + } + + Ok(()) + } + + pub fn probe(&self, probe_data: ProbeData) -> Result> { + let num_rows = probe_data.num_rows(); + let hash_method = &self.hash_method; + let mut hashes = Vec::with_capacity(num_rows); + + let keys = ProjectedBlock::from(probe_data.columns()); + let keys_state = hash_method.build_keys_state(keys, num_rows)?; + hash_method.build_keys_hashes(&keys_state, &mut hashes); + let keys = hash_method.build_keys_accessor(keys_state.clone())?; + + let enable_early_filtering = match self.probed_rows.load(Ordering::Relaxed) { + 0 => false, + probed_rows => { + let matched_probe_rows = self.matched_probe_rows.load(Ordering::Relaxed) as f64; + matched_probe_rows / (probed_rows as f64) < 0.8 + } + }; + + let probed_rows = probe_data.non_null_rows(); + self.probed_rows.fetch_add(probed_rows, Ordering::Relaxed); + + let (_, valids) = probe_data.into_raw(); + match enable_early_filtering { + true => { + let mut selection = vec![0; num_rows]; + + match self.hash_table.early_filtering_matched_probe( + &mut hashes, + valids, + &mut selection, + ) { + 0 => Ok(AllUnmatchedProbeStream::create(hashes.len())), + _ => Ok(BinaryKeyProbeStream::create(hashes, keys)), + } + } + false => match self.hash_table.probe(&mut hashes, valids) { + 0 => Ok(AllUnmatchedProbeStream::create(hashes.len())), + _ => Ok(BinaryKeyProbeStream::create(hashes, keys)), + }, + } + } +} + +pub struct BinaryKeyProbeStream { + key_idx: usize, + pointers: Vec, + keys: Box<(dyn KeyAccessor)>, + probe_entry_ptr: u64, +} + +impl BinaryKeyProbeStream { + pub fn create( + pointers: Vec, + keys: Box>, + ) -> Box { + Box::new(BinaryKeyProbeStream { + keys, + pointers, + key_idx: 0, + probe_entry_ptr: 0, + }) + } +} + +impl ProbeStream for BinaryKeyProbeStream { + fn next(&mut self, max_rows: usize) -> Result { + unsafe { + let mut matched_build = Vec::with_capacity(max_rows); + let mut matched_probe = Vec::with_capacity(max_rows); + let mut unmatched = Vec::with_capacity(max_rows); + + while self.key_idx < self.keys.len() { + std::hint::assert_unchecked(unmatched.len() <= unmatched.capacity()); + std::hint::assert_unchecked(matched_probe.len() == matched_build.len()); + std::hint::assert_unchecked(matched_build.len() <= matched_build.capacity()); + std::hint::assert_unchecked(matched_probe.len() <= matched_probe.capacity()); + + if matched_probe.len() == max_rows { + break; + } + + if self.probe_entry_ptr == 0 { + self.probe_entry_ptr = *self.pointers.get_unchecked(self.key_idx); + + if self.probe_entry_ptr == 0 { + unmatched.push(self.key_idx); + self.key_idx += 1; + continue; + } + } + + let key = self.keys.key_unchecked(self.key_idx); + + while self.probe_entry_ptr != 0 { + let raw_entry = &*(self.probe_entry_ptr as *mut StringRawEntry); + // Compare `early` and the length of the string, the size of `early` is 4. + let min_len = std::cmp::min(STRING_EARLY_SIZE, key.len()); + + if raw_entry.length as usize == key.len() + && key[0..min_len] == raw_entry.early[0..min_len] + { + let key_ref = std::slice::from_raw_parts( + raw_entry.key as *const u8, + raw_entry.length as usize, + ); + if key == key_ref { + let row_ptr = raw_entry.row_ptr; + matched_probe.push(self.key_idx as u64); + matched_build.push(row_ptr); + + if matched_probe.len() == max_rows { + self.probe_entry_ptr = raw_entry.next; + + if self.probe_entry_ptr == 0 { + self.key_idx += 1; + } + + return Ok(ProbedRows::new( + unmatched, + matched_probe, + matched_build, + )); + } + } + } + + self.probe_entry_ptr = raw_entry.next; + } + + self.key_idx += 1; + } + + Ok(ProbedRows::new(unmatched, matched_probe, matched_build)) + } + } +} diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/hashtable/single_binary_key.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/hashtable/single_binary_key.rs new file mode 100644 index 0000000000000..6c5a6e5bb279f --- /dev/null +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/hashtable/single_binary_key.rs @@ -0,0 +1,147 @@ +// Copyright 2021 Datafuse Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::sync::atomic::AtomicUsize; +use std::sync::atomic::Ordering; + +use databend_common_column::bitmap::Bitmap; +use databend_common_exception::Result; +use databend_common_expression::Column; +use databend_common_expression::DataBlock; +use databend_common_expression::HashMethod; +use databend_common_expression::HashMethodSingleBinary; +use databend_common_expression::KeysState; +use databend_common_expression::ProjectedBlock; +use databend_common_hashtable::BinaryHashJoinHashMap; +use databend_common_hashtable::HashJoinHashtableLike; +use databend_common_hashtable::RowPtr; +use databend_common_hashtable::StringRawEntry; +use databend_common_hashtable::STRING_EARLY_SIZE; + +use crate::pipelines::processors::transforms::new_hash_join::hashtable::serialize_keys::BinaryKeyProbeStream; +use crate::pipelines::processors::transforms::new_hash_join::hashtable::ProbeData; +use crate::pipelines::processors::transforms::new_hash_join::hashtable::basic::{AllUnmatchedProbeStream, ProbeStream}; +use crate::pipelines::processors::transforms::SingleBinaryHashJoinHashTable; + +impl SingleBinaryHashJoinHashTable { + pub fn new( + hash_table: BinaryHashJoinHashMap, + hash_method: HashMethodSingleBinary, + ) -> SingleBinaryHashJoinHashTable { + SingleBinaryHashJoinHashTable { + hash_table, + hash_method, + probed_rows: AtomicUsize::new(0), + matched_probe_rows: AtomicUsize::new(0), + } + } + + pub fn insert(&self, keys: DataBlock, chunk: usize, arena: &mut Vec) -> Result<()> { + let num_rows = keys.num_rows(); + let keys = ProjectedBlock::from(keys.columns()); + let keys_state = self.hash_method.build_keys_state(keys, num_rows)?; + let build_keys_iter = self.hash_method.build_keys_iter(&keys_state)?; + + let space_size = match &keys_state { + // safe to unwrap(): offset.len() >= 1. + KeysState::Column(Column::Bitmap(col)) => col.data().len(), + KeysState::Column(Column::Binary(col)) => col.data().len(), + KeysState::Column(Column::Variant(col)) => col.data().len(), + KeysState::Column(Column::String(col)) => col.total_bytes_len(), + _ => unreachable!(), + }; + + static ENTRY_SIZE: usize = std::mem::size_of::(); + arena.reserve(num_rows * ENTRY_SIZE + space_size); + + let (mut raw_entry_ptr, mut string_local_space_ptr) = unsafe { + ( + std::mem::transmute::<*mut u8, *mut StringRawEntry>(arena.as_mut_ptr()), + arena.as_mut_ptr().add(num_rows * ENTRY_SIZE), + ) + }; + + for (row_index, key) in build_keys_iter.enumerate() { + let row_ptr = RowPtr { + chunk_index: chunk as u32, + row_index: row_index as u32, + }; + + // # Safety + // The memory address of `raw_entry_ptr` is valid. + // string_offset + key.len() <= space_size. + unsafe { + (*raw_entry_ptr).row_ptr = row_ptr; + (*raw_entry_ptr).length = key.len() as u32; + (*raw_entry_ptr).next = 0; + (*raw_entry_ptr).key = string_local_space_ptr; + // The size of `early` is 4. + std::ptr::copy_nonoverlapping( + key.as_ptr(), + (*raw_entry_ptr).early.as_mut_ptr(), + std::cmp::min(STRING_EARLY_SIZE, key.len()), + ); + std::ptr::copy_nonoverlapping(key.as_ptr(), string_local_space_ptr, key.len()); + string_local_space_ptr = string_local_space_ptr.add(key.len()); + } + + self.hash_table.insert(key, raw_entry_ptr); + raw_entry_ptr = unsafe { raw_entry_ptr.add(1) }; + } + + Ok(()) + } + + pub fn probe(&self, probe_data: ProbeData) -> Result> { + let num_rows = probe_data.num_rows(); + let hash_method = &self.hash_method; + let mut hashes = Vec::with_capacity(num_rows); + + let keys = ProjectedBlock::from(probe_data.columns()); + let keys_state = hash_method.build_keys_state(keys, num_rows)?; + hash_method.build_keys_hashes(&keys_state, &mut hashes); + let keys = hash_method.build_keys_accessor(keys_state.clone())?; + + let enable_early_filtering = match self.probed_rows.load(Ordering::Relaxed) { + 0 => false, + probed_rows => { + let matched_probe_rows = self.matched_probe_rows.load(Ordering::Relaxed) as f64; + matched_probe_rows / (probed_rows as f64) < 0.8 + } + }; + + let probed_rows = probe_data.non_null_rows(); + self.probed_rows.fetch_add(probed_rows, Ordering::Relaxed); + + let (_, valids) = probe_data.into_raw(); + match enable_early_filtering { + true => { + let mut selection = vec![0; num_rows]; + + match self.hash_table.early_filtering_matched_probe( + &mut hashes, + valids, + &mut selection, + ) { + 0 => Ok(AllUnmatchedProbeStream::create(hashes.len())), + _ => Ok(BinaryKeyProbeStream::create(hashes, keys)), + } + } + false => match self.hash_table.probe(&mut hashes, valids) { + 0 => Ok(AllUnmatchedProbeStream::create(hashes.len())), + _ => Ok(BinaryKeyProbeStream::create(hashes, keys)), + }, + } + } +} diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/join.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/join.rs index f837a00ca624d..56fad77007275 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/join.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/join.rs @@ -18,7 +18,7 @@ use databend_common_expression::DataBlock; use crate::pipelines::processors::transforms::JoinRuntimeFilterPacket; -pub trait JoinStream: Send + Sync + 'static { +pub trait JoinStream: Send + Sync { fn next(&mut self) -> Result>; } @@ -31,9 +31,9 @@ pub trait Join: Send + Sync + 'static { Ok(JoinRuntimeFilterPacket::default()) } - fn probe_block(&mut self, data: DataBlock) -> Result>; + fn probe_block(&mut self, data: DataBlock) -> Result>; - fn final_probe(&mut self) -> Result>; + fn final_probe(&mut self) -> Result>; } pub struct EmptyJoinStream; diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/memory_inner_join.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/memory_inner_join.rs index 7aa5041e16223..2bc72496a8a18 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/memory_inner_join.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/memory_inner_join.rs @@ -28,6 +28,7 @@ use databend_common_expression::BlockEntry; use databend_common_expression::Column; use databend_common_expression::DataBlock; use databend_common_expression::Evaluator; +use databend_common_expression::FilterExecutor; use databend_common_expression::FunctionContext; use databend_common_expression::HashMethodKind; use databend_common_expression::HashMethodSerializer; @@ -35,20 +36,21 @@ use databend_common_expression::HashMethodSingleBinary; use databend_common_functions::BUILTIN_FUNCTIONS; use databend_common_hashtable::BinaryHashJoinHashMap; use databend_common_hashtable::HashJoinHashMap; -use databend_common_sql::ColumnSet; use ethnum::U256; use crate::pipelines::processors::transforms::new_hash_join::common::SquashBlocks; +use crate::pipelines::processors::transforms::new_hash_join::hashtable::ProbeData; use crate::pipelines::processors::transforms::new_hash_join::join::EmptyJoinStream; use crate::pipelines::processors::transforms::new_hash_join::join::Join; use crate::pipelines::processors::transforms::new_hash_join::join::JoinStream; use crate::pipelines::processors::transforms::new_hash_join::memory::memory_state::HashJoinMemoryState; +use crate::pipelines::processors::transforms::new_hash_join::performance::PerformanceContext; use crate::pipelines::processors::transforms::FixedKeyHashJoinHashTable; use crate::pipelines::processors::transforms::HashJoinHashTable; -use crate::pipelines::processors::transforms::ProbeStream; use crate::pipelines::processors::transforms::SerializerHashJoinHashTable; use crate::pipelines::processors::transforms::SingleBinaryHashJoinHashTable; use crate::pipelines::processors::HashJoinDesc; +use crate::pipelines::processors::transforms::new_hash_join::hashtable::basic::{ProbeStream, ProbedRows}; use crate::sessions::QueryContext; pub struct MemoryInnerJoin { @@ -56,11 +58,10 @@ pub struct MemoryInnerJoin { squash_block: SquashBlocks, method: HashMethodKind, - state: Arc, - build_projection: ColumnSet, - probe_projections: ColumnSet, - probe_to_build: Arc>, function_ctx: FunctionContext, + state: Arc, + + performance_context: PerformanceContext, } impl MemoryInnerJoin { @@ -69,9 +70,6 @@ impl MemoryInnerJoin { function_ctx: FunctionContext, method: HashMethodKind, desc: Arc, - build_projection: ColumnSet, - probe_projections: ColumnSet, - probe_to_build: Vec<(usize, (bool, bool))>, state: Arc, ) -> Result { let settings = ctx.get_settings(); @@ -83,15 +81,13 @@ impl MemoryInnerJoin { state, method, function_ctx, - build_projection, - probe_projections, - probe_to_build: Arc::new(probe_to_build), squash_block: SquashBlocks::new(block_size, block_bytes), + performance_context: PerformanceContext::new(), }) } fn init_columns_vec(&mut self) { - if self.build_projection.is_empty() || !self.state.columns.is_empty() { + if self.desc.build_projection.is_empty() || !self.state.columns.is_empty() { return; } @@ -103,13 +99,13 @@ impl MemoryInnerJoin { } if let Some(block) = self.state.chunks.first() { - for offset in 0..self.build_projection.len() { + for offset in 0..self.desc.build_projection.len() { let column_type = self.state.column_types.as_mut(); column_type.push(block.get_by_offset(offset).data_type()); } } - for offset in 0..self.build_projection.len() { + for offset in 0..self.desc.build_projection.len() { let full_columns = self .state .chunks @@ -254,7 +250,7 @@ impl Join for MemoryInnerJoin { let keys_entries = self.desc.build_key(&chunk_block, &self.function_ctx)?; let mut keys_block = DataBlock::new(keys_entries, chunk_block.num_rows()); - chunk_block = chunk_block.project(&self.build_projection); + chunk_block = chunk_block.project(&self.desc.build_projection); if let Some(bitmap) = self.desc.build_valids_by_keys(&keys_block)? { keys_block = keys_block.filter_with_bitmap(&bitmap)?; @@ -282,7 +278,7 @@ impl Join for MemoryInnerJoin { })) } - fn probe_block(&mut self, data: DataBlock) -> Result> { + fn probe_block(&mut self, data: DataBlock) -> Result> { if data.is_empty() { return Ok(Box::new(EmptyJoinStream)); } @@ -297,32 +293,38 @@ impl Join for MemoryInnerJoin { }; self.desc.remove_keys_nullable(&mut keys); - let probe_block = data.project(&self.probe_projections); - - let joined_stream = with_join_hash_method!(|T| match self.state.hash_table.deref() { - HashJoinHashTable::T(table) => { - let probe_keys_stream = table.probe_keys(keys, valids)?; - - Ok(MemoryInnerJoinStream::create( - probe_block, - self.state.clone(), - probe_keys_stream, - self.probe_to_build.clone(), - )) - } - HashJoinHashTable::Null => Err(ErrorCode::AbortedQuery( - "Aborted query, because the hash table is uninitialized.", - )), - })?; + let probe_block = data.project(&self.desc.probe_projections); + + let joined_stream: Box = + with_join_hash_method!(|T| match self.state.hash_table.deref() { + HashJoinHashTable::T(table) => { + let probe_data = ProbeData::new(keys, valids); + let probe_keys_stream = table.probe(probe_data)?; + + Ok(MemoryInnerJoinStream::create( + probe_block, + self.state.clone(), + probe_keys_stream, + self.desc.clone(), + &mut self.performance_context.probe_result, + )) + } + HashJoinHashTable::Null => Err(ErrorCode::AbortedQuery( + "Aborted query, because the hash table is uninitialized.", + )), + })?; - match self.desc.other_predicate.as_ref() { + // if let Some(filter_executor) = &mut self.performance_context.filter_executor { + match &mut self.performance_context.filter_executor { None => Ok(joined_stream), - Some(_) => Ok(FilterJoinStream::create( + Some(filter_executor) => Ok(FilterJoinStream::create( self.desc.clone(), self.function_ctx.clone(), joined_stream, + filter_executor, )), } + // } } fn final_probe(&mut self) -> Result> { @@ -330,33 +332,36 @@ impl Join for MemoryInnerJoin { } } -struct MemoryInnerJoinStream { +struct MemoryInnerJoinStream<'a> { + desc: Arc, probe_data_block: DataBlock, join_state: Arc, - probe_keys_stream: Box, - probe_to_build: Arc>, + probe_keys_stream: Box, + test: &'a mut ProbedRows, } -unsafe impl Send for MemoryInnerJoinStream {} -unsafe impl Sync for MemoryInnerJoinStream {} +unsafe impl<'a> Send for MemoryInnerJoinStream<'a> {} +unsafe impl<'a> Sync for MemoryInnerJoinStream<'a> {} -impl MemoryInnerJoinStream { +impl<'a> MemoryInnerJoinStream<'a> { pub fn create( block: DataBlock, state: Arc, - probe_keys_stream: Box, - probe_to_build: Arc>, - ) -> Box { + probe_keys_stream: Box, + desc: Arc, + test: &'a mut ProbedRows, + ) -> Box { Box::new(MemoryInnerJoinStream { probe_data_block: block, join_state: state, probe_keys_stream, - probe_to_build, + desc, + test, }) } } -impl JoinStream for MemoryInnerJoinStream { +impl<'a> JoinStream for MemoryInnerJoinStream<'a> { fn next(&mut self) -> Result> { loop { let probe_result = self.probe_keys_stream.next(65535)?; @@ -400,8 +405,10 @@ impl JoinStream for MemoryInnerJoinStream { (None, None) => DataBlock::new(vec![], probe_result.matched_build.len()), }; - if !self.probe_to_build.is_empty() { - for (index, (is_probe_nullable, is_build_nullable)) in self.probe_to_build.iter() { + if !self.desc.probe_to_build.is_empty() { + for (index, (is_probe_nullable, is_build_nullable)) in + self.desc.probe_to_build.iter() + { let entry = match (is_probe_nullable, is_build_nullable) { (true, true) | (false, false) => result_block.get_by_offset(*index).clone(), (true, false) => { @@ -430,27 +437,30 @@ impl JoinStream for MemoryInnerJoinStream { } } -pub struct FilterJoinStream { +pub struct FilterJoinStream<'a> { desc: Arc, - inner: Box, function_ctx: FunctionContext, + inner: Box, + filter_executor: &'a mut FilterExecutor, } -impl FilterJoinStream { +impl<'a> FilterJoinStream<'a> { pub fn create( desc: Arc, function_ctx: FunctionContext, - inner: Box, - ) -> Box { + inner: Box, + filter_executor: &'a mut FilterExecutor, + ) -> Box { Box::new(FilterJoinStream { desc, inner, function_ctx, + filter_executor, }) } } -impl JoinStream for FilterJoinStream { +impl<'a> JoinStream for FilterJoinStream<'a> { fn next(&mut self) -> Result> { loop { let Some(data_block) = self.inner.next()? else { diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/mod.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/mod.rs index ef1856d9604fd..1edcb9ea410c5 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/mod.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/mod.rs @@ -17,6 +17,8 @@ mod join; mod memory; mod runtime_filter; mod transform_hash_join; +mod hashtable; +mod performance; pub use join::Join; pub use memory::HashJoinMemoryState; diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/performance.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/performance.rs new file mode 100644 index 0000000000000..77ee1db2cf9e3 --- /dev/null +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/performance.rs @@ -0,0 +1,31 @@ +// Copyright 2021 Datafuse Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use databend_common_expression::FilterExecutor; + +use crate::pipelines::processors::transforms::new_hash_join::hashtable::basic::ProbedRows; + +pub struct PerformanceContext { + pub probe_result: ProbedRows, + pub filter_executor: Option, +} + +impl PerformanceContext { + pub fn new() -> Self { + PerformanceContext { + probe_result: ProbedRows::new(vec![], vec![], vec![]), + filter_executor: None, + } + } +} diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/transform_hash_join.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/transform_hash_join.rs index b905d83fc4d82..714b980fb82c6 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/transform_hash_join.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/transform_hash_join.rs @@ -29,12 +29,12 @@ use tokio::sync::Barrier; use crate::pipelines::processors::transforms::new_hash_join::join::Join; use crate::pipelines::processors::transforms::new_hash_join::join::JoinStream; +use crate::pipelines::processors::transforms::new_hash_join::performance::PerformanceContext; use crate::pipelines::processors::transforms::new_hash_join::runtime_filter::PlanRuntimeFilterDesc; pub struct TransformHashJoin { build_port: Arc, probe_port: Arc, - joined_port: Arc, stage: Stage, @@ -42,7 +42,6 @@ pub struct TransformHashJoin { joined_data: Option, stage_sync_barrier: Arc, projection: ColumnSet, - initialize: bool, rf_desc: Arc, } @@ -61,15 +60,14 @@ impl TransformHashJoin { probe_port, joined_port, join, - joined_data: None, - stage_sync_barrier, + rf_desc, projection, - initialize: false, + stage_sync_barrier, + joined_data: None, stage: Stage::Build(BuildState { finished: false, build_data: None, }), - rf_desc, })) } } @@ -112,21 +110,11 @@ impl Processor for TransformHashJoin { } match &mut self.stage { - Stage::Build(state) => match state.event(&self.build_port)? { - Event::NeedData if !self.initialize => { - self.initialize = true; - // self.probe_port.set_need_data(); - Ok(Event::NeedData) - } - other => Ok(other), - }, + Stage::Build(state) => state.event(&self.build_port), Stage::BuildFinal(state) => state.event(), Stage::Probe(state) => state.event(&self.probe_port), Stage::ProbeFinal(state) => state.event(&self.joined_port), - Stage::Finished => { - self.joined_port.finish(); - Ok(Event::Finished) - } + Stage::Finished => Ok(Event::Finished), } } @@ -155,7 +143,7 @@ impl Processor for TransformHashJoin { Stage::Probe(state) => { if let Some(probe_data) = state.input_data.take() { let stream = self.join.probe_block(probe_data)?; - state.stream = Some(stream); + state.stream = Some(unsafe { std::mem::transmute(stream) }); } if let Some(mut stream) = state.stream.take() { @@ -170,7 +158,8 @@ impl Processor for TransformHashJoin { Stage::ProbeFinal(state) => { if !state.initialized { state.initialized = true; - state.stream = Some(self.join.final_probe()?); + let final_stream = self.join.final_probe()?; + state.stream = Some(unsafe { std::mem::transmute(final_stream) }); } if let Some(mut stream) = state.stream.take() { From 48875d44931a59c33d078de657538a5422192fb6 Mon Sep 17 00:00:00 2001 From: zhang2014 Date: Wed, 1 Oct 2025 15:01:20 +0800 Subject: [PATCH 02/24] refactor(query): refactor code struct --- .../hashtable/src/hashjoin_hashtable.rs | 100 +++--- .../src/hashjoin_string_hashtable.rs | 68 ++--- src/common/hashtable/src/traits.rs | 6 +- .../processors/transforms/hash_join/desc.rs | 4 +- .../hash_join/hash_join_probe_state.rs | 3 + .../transforms/hash_join/hash_join_state.rs | 13 - .../new_hash_join/hashtable/basic.rs | 38 +-- .../new_hash_join/hashtable/fixed_keys.rs | 278 ++++++++++++----- .../transforms/new_hash_join/hashtable/mod.rs | 47 ++- .../new_hash_join/hashtable/serialize_keys.rs | 287 +++++++++++++----- .../hashtable/single_binary_key.rs | 107 +++++-- .../new_hash_join/memory/memory_inner_join.rs | 70 ++--- .../transforms/new_hash_join/mod.rs | 4 +- .../transforms/new_hash_join/performance.rs | 33 +- .../new_hash_join/transform_hash_join.rs | 4 +- 15 files changed, 685 insertions(+), 377 deletions(-) diff --git a/src/common/hashtable/src/hashjoin_hashtable.rs b/src/common/hashtable/src/hashjoin_hashtable.rs index 0f0d8ff4e112b..a26d840c4f7c0 100644 --- a/src/common/hashtable/src/hashjoin_hashtable.rs +++ b/src/common/hashtable/src/hashjoin_hashtable.rs @@ -17,6 +17,7 @@ use std::marker::PhantomData; use std::sync::atomic::AtomicU64; use std::sync::atomic::Ordering; +use databend_common_base::hints::assume; use databend_common_base::mem_allocator::DefaultAllocator; use databend_common_column::bitmap::Bitmap; @@ -215,25 +216,19 @@ where &self, hashes: &mut [u64], bitmap: Option, - matched_selection: &mut [u32], - unmatched_selection: &mut [u32], + matched_selection: &mut Vec, + unmatched_selection: &mut Vec, ) -> (usize, usize) { let mut valids = None; if let Some(bitmap) = bitmap { if bitmap.null_count() == bitmap.len() { - unmatched_selection - .iter_mut() - .enumerate() - .for_each(|(idx, val)| { - *val = idx as u32; - }); + unmatched_selection.extend(0..bitmap.null_count() as u32); return (0, hashes.len()); } else if bitmap.null_count() > 0 { valids = Some(bitmap); } } - let mut matched_idx = 0; - let mut unmatched_idx = 0; + match valids { Some(valids) => { valids.iter().zip(hashes.iter_mut().enumerate()).for_each( @@ -242,22 +237,15 @@ where let header = self.pointers[(*hash >> self.hash_shift) as usize]; if header != 0 && early_filtering(header, *hash) { *hash = remove_header_tag(header); - unsafe { - *matched_selection.get_unchecked_mut(matched_idx) = idx as u32 - }; - matched_idx += 1; + assume(matched_selection.len() <= matched_selection.capacity()); + matched_selection.push(idx as u32); } else { - unsafe { - *unmatched_selection.get_unchecked_mut(unmatched_idx) = - idx as u32 - }; - unmatched_idx += 1; + assume(unmatched_selection.len() <= unmatched_selection.capacity()); + unmatched_selection.push(idx as u32); } } else { - unsafe { - *unmatched_selection.get_unchecked_mut(unmatched_idx) = idx as u32 - }; - unmatched_idx += 1; + assume(unmatched_selection.len() <= unmatched_selection.capacity()); + unmatched_selection.push(idx as u32); } }, ); @@ -267,18 +255,16 @@ where let header = self.pointers[(*hash >> self.hash_shift) as usize]; if header != 0 && early_filtering(header, *hash) { *hash = remove_header_tag(header); - unsafe { *matched_selection.get_unchecked_mut(matched_idx) = idx as u32 }; - matched_idx += 1; + assume(matched_selection.len() <= matched_selection.capacity()); + matched_selection.push(idx as u32); } else { - unsafe { - *unmatched_selection.get_unchecked_mut(unmatched_idx) = idx as u32 - }; - unmatched_idx += 1; + assume(unmatched_selection.len() <= unmatched_selection.capacity()); + unmatched_selection.push(idx as u32); } }); } } - (matched_idx, unmatched_idx) + (matched_selection.len(), unmatched_selection.len()) } // Perform early filtering probe and store matched indexes in `selection`, return the number of matched indexes. @@ -286,53 +272,43 @@ where &self, hashes: &mut [u64], bitmap: Option, - selection: &mut [u32], + selection: &mut Vec, ) -> usize { let mut valids = None; + if let Some(bitmap) = bitmap { if bitmap.null_count() == bitmap.len() { - hashes.iter_mut().for_each(|hash| { - *hash = 0; - }); return 0; } else if bitmap.null_count() > 0 { valids = Some(bitmap); } } - let mut count = 0; - match valids { - Some(valids) => { - valids.iter().zip(hashes.iter_mut().enumerate()).for_each( - |(valid, (idx, hash))| { - if valid { - let header = self.pointers[(*hash >> self.hash_shift) as usize]; - if header != 0 && early_filtering(header, *hash) { - *hash = remove_header_tag(header); - unsafe { *selection.get_unchecked_mut(count) = idx as u32 }; - count += 1; - } else { - *hash = 0; - } - } else { - *hash = 0; - } - }, - ); - } - None => { - hashes.iter_mut().enumerate().for_each(|(idx, hash)| { + + if let Some(valids) = valids { + for (valid, (idx, hash)) in valids.iter().zip(hashes.iter_mut().enumerate()) { + if valid { let header = self.pointers[(*hash >> self.hash_shift) as usize]; if header != 0 && early_filtering(header, *hash) { *hash = remove_header_tag(header); - unsafe { *selection.get_unchecked_mut(count) = idx as u32 }; - count += 1; - } else { - *hash = 0; + assume(selection.len() <= selection.capacity()); + selection.push(idx as u32); } - }); + } } + + return selection.len(); } - count + + for (idx, hash) in hashes.iter_mut().enumerate() { + let header = self.pointers[(*hash >> self.hash_shift) as usize]; + if header != 0 && early_filtering(header, *hash) { + *hash = remove_header_tag(header); + assume(selection.len() <= selection.capacity()); + selection.push(idx as u32); + } + } + + selection.len() } fn next_contains(&self, key: &Self::Key, mut ptr: u64) -> bool { diff --git a/src/common/hashtable/src/hashjoin_string_hashtable.rs b/src/common/hashtable/src/hashjoin_string_hashtable.rs index e74e88e9b9c35..c92372f25fbd7 100644 --- a/src/common/hashtable/src/hashjoin_string_hashtable.rs +++ b/src/common/hashtable/src/hashjoin_string_hashtable.rs @@ -16,6 +16,7 @@ use std::alloc::Allocator; use std::sync::atomic::AtomicU64; use std::sync::atomic::Ordering; +use databend_common_base::hints::assume; use databend_common_base::mem_allocator::DefaultAllocator; use databend_common_column::bitmap::Bitmap; @@ -144,25 +145,19 @@ where A: Allocator + Clone + 'static &self, hashes: &mut [u64], bitmap: Option, - matched_selection: &mut [u32], - unmatched_selection: &mut [u32], + matched_selection: &mut Vec, + unmatched_selection: &mut Vec, ) -> (usize, usize) { let mut valids = None; if let Some(bitmap) = bitmap { if bitmap.null_count() == bitmap.len() { - unmatched_selection - .iter_mut() - .enumerate() - .for_each(|(idx, val)| { - *val = idx as u32; - }); + unmatched_selection.extend(0..bitmap.null_count() as u32); return (0, hashes.len()); } else if bitmap.null_count() > 0 { valids = Some(bitmap); } } - let mut matched_idx = 0; - let mut unmatched_idx = 0; + match valids { Some(valids) => { hashes.iter_mut().enumerate().for_each(|(idx, hash)| { @@ -170,21 +165,15 @@ where A: Allocator + Clone + 'static let header = self.pointers[(*hash >> self.hash_shift) as usize]; if header != 0 && early_filtering(header, *hash) { *hash = remove_header_tag(header); - unsafe { - *matched_selection.get_unchecked_mut(matched_idx) = idx as u32 - }; - matched_idx += 1; + assume(matched_selection.len() <= matched_selection.capacity()); + matched_selection.push(idx as u32); } else { - unsafe { - *unmatched_selection.get_unchecked_mut(unmatched_idx) = idx as u32 - }; - unmatched_idx += 1; + assume(unmatched_selection.len() <= unmatched_selection.capacity()); + unmatched_selection.push(idx as u32); } } else { - unsafe { - *unmatched_selection.get_unchecked_mut(unmatched_idx) = idx as u32 - }; - unmatched_idx += 1; + assume(unmatched_selection.len() <= unmatched_selection.capacity()); + unmatched_selection.push(idx as u32); } }); } @@ -193,18 +182,16 @@ where A: Allocator + Clone + 'static let header = self.pointers[(*hash >> self.hash_shift) as usize]; if header != 0 && early_filtering(header, *hash) { *hash = remove_header_tag(header); - unsafe { *matched_selection.get_unchecked_mut(matched_idx) = idx as u32 }; - matched_idx += 1; + assume(matched_selection.len() <= matched_selection.capacity()); + matched_selection.push(idx as u32); } else { - unsafe { - *unmatched_selection.get_unchecked_mut(unmatched_idx) = idx as u32 - }; - unmatched_idx += 1; + assume(unmatched_selection.len() <= unmatched_selection.capacity()); + unmatched_selection.push(idx as u32); } }); } } - (matched_idx, unmatched_idx) + (matched_selection.len(), unmatched_selection.len()) } // Perform early filtering probe and store matched indexes in `selection`, return the number of matched indexes. @@ -212,20 +199,17 @@ where A: Allocator + Clone + 'static &self, hashes: &mut [u64], bitmap: Option, - selection: &mut [u32], + selection: &mut Vec, ) -> usize { let mut valids = None; if let Some(bitmap) = bitmap { if bitmap.null_count() == bitmap.len() { - hashes.iter_mut().for_each(|hash| { - *hash = 0; - }); return 0; } else if bitmap.null_count() > 0 { valids = Some(bitmap); } } - let mut count = 0; + match valids { Some(valids) => { hashes.iter_mut().enumerate().for_each(|(idx, hash)| { @@ -233,13 +217,9 @@ where A: Allocator + Clone + 'static let header = self.pointers[(*hash >> self.hash_shift) as usize]; if header != 0 && early_filtering(header, *hash) { *hash = remove_header_tag(header); - unsafe { *selection.get_unchecked_mut(count) = idx as u32 }; - count += 1; - } else { - *hash = 0; + assume(selection.len() <= selection.capacity()); + selection.push(idx as u32); } - } else { - *hash = 0; } }); } @@ -248,15 +228,13 @@ where A: Allocator + Clone + 'static let header = self.pointers[(*hash >> self.hash_shift) as usize]; if header != 0 && early_filtering(header, *hash) { *hash = remove_header_tag(header); - unsafe { *selection.get_unchecked_mut(count) = idx as u32 }; - count += 1; - } else { - *hash = 0; + assume(selection.len() <= selection.capacity()); + selection.push(idx as u32); } }); } } - count + selection.len() } fn next_contains(&self, key: &Self::Key, mut ptr: u64) -> bool { diff --git a/src/common/hashtable/src/traits.rs b/src/common/hashtable/src/traits.rs index 298713f8c3993..54cdbc9a47463 100644 --- a/src/common/hashtable/src/traits.rs +++ b/src/common/hashtable/src/traits.rs @@ -533,8 +533,8 @@ pub trait HashJoinHashtableLike { &self, hashes: &mut [u64], valids: Option, - matched_selection: &mut [u32], - unmatched_selection: &mut [u32], + matched_selection: &mut Vec, + unmatched_selection: &mut Vec, ) -> (usize, usize); // Perform early filtering probe and store matched indexes in `selection`, return the number of matched indexes. @@ -542,7 +542,7 @@ pub trait HashJoinHashtableLike { &self, hashes: &mut [u64], valids: Option, - selection: &mut [u32], + selection: &mut Vec, ) -> usize; // we use `next_contains` to see whether we can find a matched row in the link. diff --git a/src/query/service/src/pipelines/processors/transforms/hash_join/desc.rs b/src/query/service/src/pipelines/processors/transforms/hash_join/desc.rs index 547ed16e80df7..c443b35806e18 100644 --- a/src/query/service/src/pipelines/processors/transforms/hash_join/desc.rs +++ b/src/query/service/src/pipelines/processors/transforms/hash_join/desc.rs @@ -12,7 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. -use std::sync::Arc; use databend_common_column::bitmap::Bitmap; use databend_common_exception::Result; use databend_common_expression::arrow::and_validities; @@ -26,8 +25,9 @@ use databend_common_expression::FunctionContext; use databend_common_expression::RemoteExpr; use databend_common_functions::BUILTIN_FUNCTIONS; use databend_common_sql::executor::cast_expr_to_non_null_boolean; -use parking_lot::RwLock; use databend_common_sql::ColumnSet; +use parking_lot::RwLock; + use crate::physical_plans::HashJoin; use crate::physical_plans::PhysicalRuntimeFilter; use crate::physical_plans::PhysicalRuntimeFilters; diff --git a/src/query/service/src/pipelines/processors/transforms/hash_join/hash_join_probe_state.rs b/src/query/service/src/pipelines/processors/transforms/hash_join/hash_join_probe_state.rs index eb036932aa6f8..ff1e3ab3c5eef 100644 --- a/src/query/service/src/pipelines/processors/transforms/hash_join/hash_join_probe_state.rs +++ b/src/query/service/src/pipelines/processors/transforms/hash_join/hash_join_probe_state.rs @@ -335,6 +335,7 @@ impl HashJoinProbeState { probe_state.with_conjunction, ) { if prefer_early_filtering { + probe_state.selection.clear(); // Early filtering, use selection to get better performance. table.hash_table.early_filtering_matched_probe( &mut probe_state.hashes, @@ -350,6 +351,8 @@ impl HashJoinProbeState { // Early filtering, use matched selection and unmatched selection to get better performance. let unmatched_selection = probe_state.probe_unmatched_indexes.as_mut().unwrap(); + probe_state.selection.clear(); + unmatched_selection.clear(); let (matched_count, unmatched_count) = table.hash_table.early_filtering_probe( &mut probe_state.hashes, diff --git a/src/query/service/src/pipelines/processors/transforms/hash_join/hash_join_state.rs b/src/query/service/src/pipelines/processors/transforms/hash_join/hash_join_state.rs index a7bdd05010431..c493eedc77a16 100644 --- a/src/query/service/src/pipelines/processors/transforms/hash_join/hash_join_state.rs +++ b/src/query/service/src/pipelines/processors/transforms/hash_join/hash_join_state.rs @@ -23,7 +23,6 @@ use std::sync::Arc; use databend_common_base::base::tokio::sync::watch; use databend_common_base::base::tokio::sync::watch::Receiver; use databend_common_base::base::tokio::sync::watch::Sender; -use databend_common_base::hints::assume; use databend_common_catalog::table_context::TableContext; use databend_common_exception::ErrorCode; use databend_common_exception::Result; @@ -37,14 +36,10 @@ use databend_common_expression::FixedKey; use databend_common_expression::HashMethodFixedKeys; use databend_common_expression::HashMethodSerializer; use databend_common_expression::HashMethodSingleBinary; -use databend_common_expression::KeyAccessor; use databend_common_hashtable::BinaryHashJoinHashMap; use databend_common_hashtable::HashJoinHashMap; use databend_common_hashtable::HashtableKeyable; -use databend_common_hashtable::RawEntry; use databend_common_hashtable::RowPtr; -use databend_common_hashtable::StringRawEntry; -use databend_common_hashtable::STRING_EARLY_SIZE; use databend_common_sql::plans::JoinType; use databend_common_sql::ColumnSet; use ethnum::U256; @@ -59,22 +54,16 @@ use crate::sessions::QueryContext; use crate::sql::IndexType; pub struct SerializerHashJoinHashTable { - pub(crate) probed_rows: AtomicUsize, - pub(crate) matched_probe_rows: AtomicUsize, pub(crate) hash_table: BinaryHashJoinHashMap, pub(crate) hash_method: HashMethodSerializer, } pub struct SingleBinaryHashJoinHashTable { - pub(crate) probed_rows: AtomicUsize, - pub(crate) matched_probe_rows: AtomicUsize, pub(crate) hash_table: BinaryHashJoinHashMap, pub(crate) hash_method: HashMethodSingleBinary, } pub struct FixedKeyHashJoinHashTable { - pub(crate) probed_rows: AtomicUsize, - pub(crate) matched_probe_rows: AtomicUsize, pub(crate) hash_table: HashJoinHashMap, pub(crate) hash_method: HashMethodFixedKeys, } @@ -340,5 +329,3 @@ impl HashJoinState { } } } - - diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/hashtable/basic.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/hashtable/basic.rs index 44b97bf86a254..69676a93af6ab 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/hashtable/basic.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/hashtable/basic.rs @@ -15,16 +15,10 @@ use databend_common_exception::Result; use databend_common_hashtable::RowPtr; -pub struct PerformanceStatistics { - matched_hash: usize, -} - pub struct ProbedRows { pub unmatched: Vec, pub matched_probe: Vec, pub matched_build: Vec, - - // pub performance_statistics: PerformanceStatistics, } impl ProbedRows { @@ -57,17 +51,23 @@ impl ProbedRows { pub fn all_unmatched(unmatched: Vec) -> ProbedRows { ProbedRows::new(unmatched, vec![], vec![]) } + + pub fn clear(&mut self) { + self.unmatched.clear(); + self.matched_probe.clear(); + self.matched_build.clear(); + } } pub trait ProbeStream { - fn next(&mut self, max_rows: usize) -> Result; + fn advance(&mut self, res: &mut ProbedRows, max_rows: usize) -> Result<()>; +} - fn both(&mut self, rows: &mut ProbedRows, max_rows: usize) -> Result<()> { - unimplemented!() - } +pub struct EmptyProbeStream; - fn matched(&mut self, res: &mut ProbedRows, max_rows: usize) -> Result<()> { - unimplemented!() +impl ProbeStream for EmptyProbeStream { + fn advance(&mut self, _res: &mut ProbedRows, _max_rows: usize) -> Result<()> { + Ok(()) } } @@ -83,24 +83,12 @@ impl AllUnmatchedProbeStream { } impl ProbeStream for AllUnmatchedProbeStream { - fn next(&mut self, max_rows: usize) -> Result { - if self.idx >= self.size { - return Ok(ProbedRows::empty()); - } - - let res = std::cmp::min(self.size - self.idx, max_rows); - let res = (self.idx..self.idx + res).collect::>(); - self.idx += res.len(); - Ok(ProbedRows::all_unmatched(res)) - } - - fn matched(&mut self, res: &mut ProbedRows, max_rows: usize) -> Result<()> { + fn advance(&mut self, _rows: &mut ProbedRows, max_rows: usize) -> Result<()> { if self.idx >= self.size { return Ok(()); } let unmatched_rows = std::cmp::min(self.size - self.idx, max_rows); - res.unmatched.extend(self.idx..self.idx + unmatched_rows); self.idx += unmatched_rows; Ok(()) } diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/hashtable/fixed_keys.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/hashtable/fixed_keys.rs index a0418ad19cba0..3d756a0ba98da 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/hashtable/fixed_keys.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/hashtable/fixed_keys.rs @@ -12,8 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. -use std::sync::atomic::Ordering; - use databend_common_base::hints::assume; use databend_common_exception::Result; use databend_common_expression::DataBlock; @@ -28,7 +26,9 @@ use databend_common_hashtable::HashtableKeyable; use databend_common_hashtable::RawEntry; use databend_common_hashtable::RowPtr; -use crate::pipelines::processors::transforms::new_hash_join::hashtable::basic::{AllUnmatchedProbeStream, ProbeStream}; +use crate::pipelines::processors::transforms::new_hash_join::hashtable::basic::AllUnmatchedProbeStream; +use crate::pipelines::processors::transforms::new_hash_join::hashtable::basic::EmptyProbeStream; +use crate::pipelines::processors::transforms::new_hash_join::hashtable::basic::ProbeStream; use crate::pipelines::processors::transforms::new_hash_join::hashtable::basic::ProbedRows; use crate::pipelines::processors::transforms::new_hash_join::hashtable::ProbeData; use crate::pipelines::processors::transforms::FixedKeyHashJoinHashTable; @@ -38,8 +38,6 @@ impl FixedKeyHashJoinHashTable { FixedKeyHashJoinHashTable:: { hash_table, hash_method, - probed_rows: Default::default(), - matched_probe_rows: Default::default(), } } @@ -78,63 +76,112 @@ impl FixedKeyHashJoinHashTable { Ok(()) } - pub fn probe(&self, probe_data: ProbeData) -> Result> { - let num_rows = probe_data.num_rows(); + pub fn probe_matched<'a>(&'a self, data: ProbeData<'a>) -> Result> { + let num_rows = data.num_rows(); let hash_method = &self.hash_method; let mut hashes = Vec::with_capacity(num_rows); - let keys = ProjectedBlock::from(probe_data.columns()); + let keys = ProjectedBlock::from(data.columns()); let keys_state = hash_method.build_keys_state(keys, num_rows)?; hash_method.build_keys_hashes(&keys_state, &mut hashes); let keys = hash_method.build_keys_accessor(keys_state.clone())?; - let enable_early_filtering = match self.probed_rows.load(Ordering::Relaxed) { + let probed_rows = data.non_null_rows(); + let (_, valids, ctx) = data.into_raw(); + + let enable_early_filtering = match ctx.probed_rows { 0 => false, - probed_rows => { - let matched_probe_rows = self.matched_probe_rows.load(Ordering::Relaxed) as f64; - matched_probe_rows / (probed_rows as f64) < 0.8 - } + probed_rows => (ctx.matched_rows as f64) / (probed_rows as f64) < 0.8, }; - let probed_rows = probe_data.non_null_rows(); - self.probed_rows.fetch_add(probed_rows, Ordering::Relaxed); + let matched_rows = match enable_early_filtering { + true => self.hash_table.early_filtering_matched_probe( + &mut hashes, + valids, + &mut ctx.selection, + ), + false => self.hash_table.probe(&mut hashes, valids), + }; - let (_, valids) = probe_data.into_raw(); + ctx.probed_rows += probed_rows; + ctx.matched_rows += matched_rows; + + match matched_rows { + 0 => Ok(Box::new(EmptyProbeStream)), + _ => match enable_early_filtering { + true => Ok(EarlyFilteringProbeStream::<_, true>::create( + hashes, + keys, + &ctx.selection, + &[], + )), + false => Ok(FixedKeyProbeStream::<_, true>::create(hashes, keys)), + }, + } + } - match enable_early_filtering { - true => { - let mut selection = vec![0; num_rows]; + pub fn probe<'a>(&'a self, data: ProbeData<'a>) -> Result> { + let num_rows = data.num_rows(); + let hash_method = &self.hash_method; + let mut hashes = Vec::with_capacity(num_rows); - match self.hash_table.early_filtering_matched_probe( + let keys = ProjectedBlock::from(data.columns()); + let keys_state = hash_method.build_keys_state(keys, num_rows)?; + hash_method.build_keys_hashes(&keys_state, &mut hashes); + let keys = hash_method.build_keys_accessor(keys_state.clone())?; + + let probed_rows = data.non_null_rows(); + let (_, valids, ctx) = data.into_raw(); + + let enable_early_filtering = match ctx.probed_rows { + 0 => false, + probed_rows => (ctx.matched_rows as f64) / (probed_rows as f64) < 0.8, + }; + + let matched_rows = match enable_early_filtering { + true => { + let (matched_rows, _) = self.hash_table.early_filtering_probe( &mut hashes, valids, - &mut selection, - ) { - 0 => Ok(AllUnmatchedProbeStream::create(hashes.len())), - _ => Ok(FixedKeysProbeStream::create(hashes, keys)), - } + &mut ctx.selection, + &mut ctx.unmatched_selection, + ); + matched_rows } - false => match self.hash_table.probe(&mut hashes, valids) { - 0 => Ok(AllUnmatchedProbeStream::create(hashes.len())), - _ => Ok(FixedKeysProbeStream::create(hashes, keys)), + false => self.hash_table.probe(&mut hashes, valids), + }; + + ctx.probed_rows += probed_rows; + ctx.matched_rows += matched_rows; + + match matched_rows { + 0 => Ok(AllUnmatchedProbeStream::create(hashes.len())), + _ => match enable_early_filtering { + true => Ok(EarlyFilteringProbeStream::<_, false>::create( + hashes, + keys, + &ctx.selection, + &ctx.unmatched_selection, + )), + false => Ok(FixedKeyProbeStream::<_, false>::create(hashes, keys)), }, } } } -pub struct FixedKeysProbeStream { +struct FixedKeyProbeStream { key_idx: usize, pointers: Vec, - keys: Box<(dyn KeyAccessor)>, probe_entry_ptr: u64, + keys: Box<(dyn KeyAccessor)>, } -impl FixedKeysProbeStream { +impl FixedKeyProbeStream { pub fn create( pointers: Vec, keys: Box>, ) -> Box { - Box::new(FixedKeysProbeStream { + Box::new(FixedKeyProbeStream:: { keys, pointers, key_idx: 0, @@ -143,62 +190,151 @@ impl FixedKeysProbeStream { } } -impl ProbeStream for FixedKeysProbeStream { - fn next(&mut self, max_rows: usize) -> Result { - unsafe { - let mut matched_build = Vec::with_capacity(max_rows); - let mut matched_probe = Vec::with_capacity(max_rows); - let mut unmatched = Vec::with_capacity(max_rows); - - while self.key_idx < self.keys.len() { - assume(unmatched.len() <= unmatched.capacity()); - assume(matched_probe.len() == matched_build.len()); - assume(matched_build.len() <= matched_build.capacity()); - assume(matched_probe.len() <= matched_probe.capacity()); - assume(self.key_idx < self.pointers.len()); - - if matched_probe.len() == max_rows { - break; - } +impl ProbeStream + for FixedKeyProbeStream +{ + fn advance(&mut self, res: &mut ProbedRows, max_rows: usize) -> Result<()> { + while self.key_idx < self.keys.len() { + assume(res.matched_probe.len() == res.matched_build.len()); + assume(res.matched_build.len() <= res.matched_build.capacity()); + assume(res.matched_probe.len() <= res.matched_probe.capacity()); + assume(self.key_idx < self.pointers.len()); + + if res.matched_probe.len() == max_rows { + break; + } + + if self.probe_entry_ptr == 0 { + self.probe_entry_ptr = self.pointers[self.key_idx]; if self.probe_entry_ptr == 0 { - self.probe_entry_ptr = self.pointers[self.key_idx]; + if MATCHED { + res.unmatched.push(self.key_idx); + } + + self.key_idx += 1; + continue; + } + } + + let key = unsafe { self.keys.key_unchecked(self.key_idx) }; + + while self.probe_entry_ptr != 0 { + let raw_entry = unsafe { &*(self.probe_entry_ptr as *mut RawEntry) }; + + if key == &raw_entry.key { + let row_ptr = raw_entry.row_ptr; + res.matched_probe.push(self.key_idx as u64); + res.matched_build.push(row_ptr); + + if res.matched_probe.len() == max_rows { + self.probe_entry_ptr = raw_entry.next; + + if self.probe_entry_ptr == 0 { + self.key_idx += 1; + } - if self.probe_entry_ptr == 0 { - unmatched.push(self.key_idx); - self.key_idx += 1; - continue; + return Ok(()); } } - let key = self.keys.key_unchecked(self.key_idx); + self.probe_entry_ptr = raw_entry.next; + } - while self.probe_entry_ptr != 0 { - let raw_entry = &*(self.probe_entry_ptr as *mut RawEntry); + self.key_idx += 1; + } - if key == &raw_entry.key { - let row_ptr = raw_entry.row_ptr; - matched_probe.push(self.key_idx as u64); - matched_build.push(row_ptr); + Ok(()) + } +} - if matched_probe.len() == max_rows { - self.probe_entry_ptr = raw_entry.next; +struct EarlyFilteringProbeStream<'a, Key: FixedKey + HashtableKeyable, const MATCHED: bool> { + idx: usize, + pointers: Vec, + probe_entry_ptr: u64, + keys: Box<(dyn KeyAccessor)>, + selections: &'a [u32], + unmatched_selection: &'a [u32], +} - if self.probe_entry_ptr == 0 { - self.key_idx += 1; - } +impl<'a, Key: FixedKey + HashtableKeyable, const MATCHED: bool> + EarlyFilteringProbeStream<'a, Key, MATCHED> +{ + pub fn create( + pointers: Vec, + keys: Box>, + selections: &'a [u32], + unmatched_selection: &'a [u32], + ) -> Box { + Box::new(EarlyFilteringProbeStream:: { + keys, + pointers, + selections, + unmatched_selection, + idx: 0, + probe_entry_ptr: 0, + }) + } +} - return Ok(ProbedRows::new(unmatched, matched_probe, matched_build)); +impl<'a, Key: FixedKey + HashtableKeyable, const MATCHED: bool> ProbeStream + for EarlyFilteringProbeStream<'a, Key, MATCHED> +{ + fn advance(&mut self, res: &mut ProbedRows, max_rows: usize) -> Result<()> { + if !MATCHED { + res.unmatched + .extend(self.unmatched_selection.iter().map(|x| *x as usize)); + } + + while self.idx < self.selections.len() { + let key_idx = self.selections[self.idx] as usize; + + assume(res.unmatched.len() <= res.unmatched.capacity()); + assume(res.matched_probe.len() == res.matched_build.len()); + assume(res.matched_build.len() <= res.matched_build.capacity()); + assume(res.matched_probe.len() <= res.matched_probe.capacity()); + assume(key_idx < self.pointers.len()); + + if res.matched_probe.len() == max_rows { + break; + } + + if self.probe_entry_ptr == 0 { + self.probe_entry_ptr = self.pointers[key_idx]; + + if self.probe_entry_ptr == 0 { + self.idx += 1; + continue; + } + } + + let key = unsafe { self.keys.key_unchecked(key_idx) }; + + while self.probe_entry_ptr != 0 { + let raw_entry = unsafe { &*(self.probe_entry_ptr as *mut RawEntry) }; + + if key == &raw_entry.key { + let row_ptr = raw_entry.row_ptr; + res.matched_probe.push(key_idx as u64); + res.matched_build.push(row_ptr); + + if res.matched_probe.len() == max_rows { + self.probe_entry_ptr = raw_entry.next; + + if self.probe_entry_ptr == 0 { + self.idx += 1; } - } - self.probe_entry_ptr = raw_entry.next; + return Ok(()); + } } - self.key_idx += 1; + self.probe_entry_ptr = raw_entry.next; } - Ok(ProbedRows::new(unmatched, matched_probe, matched_build)) + self.idx += 1; } + + Ok(()) } } diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/hashtable/mod.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/hashtable/mod.rs index 7d1adaee30c22..44b0ba9238a9e 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/hashtable/mod.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/hashtable/mod.rs @@ -14,22 +14,55 @@ mod fixed_keys; +pub mod basic; mod serialize_keys; mod single_binary_key; -pub mod basic; use databend_common_column::bitmap::Bitmap; use databend_common_expression::BlockEntry; use databend_common_expression::DataBlock; -pub struct ProbeData { +pub struct ProbeHashStatistics { + probed_rows: usize, + matched_rows: usize, + + selection: Vec, + unmatched_selection: Vec, +} + +impl ProbeHashStatistics { + pub fn new(max_rows: usize) -> Self { + ProbeHashStatistics { + probed_rows: 0, + matched_rows: 0, + selection: Vec::with_capacity(max_rows), + unmatched_selection: Vec::with_capacity(max_rows), + } + } + + pub fn clear(&mut self) { + self.selection.clear(); + self.unmatched_selection.clear(); + } +} + +pub struct ProbeData<'a> { keys: DataBlock, valids: Option, + probe_hash_statistics: &'a mut ProbeHashStatistics, } -impl ProbeData { - pub fn new(keys: DataBlock, valids: Option) -> Self { - Self { keys, valids } +impl<'a> ProbeData<'a> { + pub fn new( + keys: DataBlock, + valids: Option, + probe_hash_statistics: &'a mut ProbeHashStatistics, + ) -> Self { + ProbeData { + keys, + valids, + probe_hash_statistics, + } } pub fn num_rows(&self) -> usize { @@ -47,7 +80,7 @@ impl ProbeData { } } - pub fn into_raw(self) -> (DataBlock, Option) { - (self.keys, self.valids) + pub fn into_raw(self) -> (DataBlock, Option, &'a mut ProbeHashStatistics) { + (self.keys, self.valids, self.probe_hash_statistics) } } diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/hashtable/serialize_keys.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/hashtable/serialize_keys.rs index a99fc53c279a6..f8532f152a578 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/hashtable/serialize_keys.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/hashtable/serialize_keys.rs @@ -12,9 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -use std::sync::atomic::AtomicUsize; -use std::sync::atomic::Ordering; - +use databend_common_base::hints::assume; use databend_common_exception::Result; use databend_common_expression::Column; use databend_common_expression::DataBlock; @@ -29,7 +27,9 @@ use databend_common_hashtable::RowPtr; use databend_common_hashtable::StringRawEntry; use databend_common_hashtable::STRING_EARLY_SIZE; -use crate::pipelines::processors::transforms::new_hash_join::hashtable::basic::{AllUnmatchedProbeStream, ProbeStream}; +use crate::pipelines::processors::transforms::new_hash_join::hashtable::basic::AllUnmatchedProbeStream; +use crate::pipelines::processors::transforms::new_hash_join::hashtable::basic::EmptyProbeStream; +use crate::pipelines::processors::transforms::new_hash_join::hashtable::basic::ProbeStream; use crate::pipelines::processors::transforms::new_hash_join::hashtable::basic::ProbedRows; use crate::pipelines::processors::transforms::new_hash_join::hashtable::ProbeData; use crate::pipelines::processors::transforms::SerializerHashJoinHashTable; @@ -42,8 +42,6 @@ impl SerializerHashJoinHashTable { SerializerHashJoinHashTable { hash_table, hash_method, - probed_rows: AtomicUsize::new(0), - matched_probe_rows: AtomicUsize::new(0), } } @@ -103,62 +101,112 @@ impl SerializerHashJoinHashTable { Ok(()) } - pub fn probe(&self, probe_data: ProbeData) -> Result> { - let num_rows = probe_data.num_rows(); + pub fn probe_matched<'a>(&'a self, data: ProbeData<'a>) -> Result> { + let num_rows = data.num_rows(); let hash_method = &self.hash_method; let mut hashes = Vec::with_capacity(num_rows); - let keys = ProjectedBlock::from(probe_data.columns()); + let keys = ProjectedBlock::from(data.columns()); let keys_state = hash_method.build_keys_state(keys, num_rows)?; hash_method.build_keys_hashes(&keys_state, &mut hashes); let keys = hash_method.build_keys_accessor(keys_state.clone())?; - let enable_early_filtering = match self.probed_rows.load(Ordering::Relaxed) { + let probed_rows = data.non_null_rows(); + let (_, valids, ctx) = data.into_raw(); + + let enable_early_filtering = match ctx.probed_rows { 0 => false, - probed_rows => { - let matched_probe_rows = self.matched_probe_rows.load(Ordering::Relaxed) as f64; - matched_probe_rows / (probed_rows as f64) < 0.8 - } + probed_rows => (ctx.matched_rows as f64) / (probed_rows as f64) < 0.8, }; - let probed_rows = probe_data.non_null_rows(); - self.probed_rows.fetch_add(probed_rows, Ordering::Relaxed); + let matched_rows = match enable_early_filtering { + true => self.hash_table.early_filtering_matched_probe( + &mut hashes, + valids, + &mut ctx.selection, + ), + false => self.hash_table.probe(&mut hashes, valids), + }; - let (_, valids) = probe_data.into_raw(); - match enable_early_filtering { - true => { - let mut selection = vec![0; num_rows]; + ctx.probed_rows += probed_rows; + ctx.matched_rows += matched_rows; + + match matched_rows { + 0 => Ok(Box::new(EmptyProbeStream)), + _ => match enable_early_filtering { + true => Ok(EarlyFilteringProbeStream::::create( + hashes, + keys, + &ctx.selection, + &[], + )), + false => Ok(BinaryKeyProbeStream::::create(hashes, keys)), + }, + } + } + + pub fn probe<'a>(&'a self, data: ProbeData<'a>) -> Result> { + let num_rows = data.num_rows(); + let hash_method = &self.hash_method; + let mut hashes = Vec::with_capacity(num_rows); - match self.hash_table.early_filtering_matched_probe( + let keys = ProjectedBlock::from(data.columns()); + let keys_state = hash_method.build_keys_state(keys, num_rows)?; + hash_method.build_keys_hashes(&keys_state, &mut hashes); + let keys = hash_method.build_keys_accessor(keys_state.clone())?; + + let probed_rows = data.non_null_rows(); + let (_, valids, ctx) = data.into_raw(); + + let enable_early_filtering = match ctx.probed_rows { + 0 => false, + probed_rows => (ctx.matched_rows as f64) / (probed_rows as f64) < 0.8, + }; + + let matched_rows = match enable_early_filtering { + true => { + let (matched_rows, _) = self.hash_table.early_filtering_probe( &mut hashes, valids, - &mut selection, - ) { - 0 => Ok(AllUnmatchedProbeStream::create(hashes.len())), - _ => Ok(BinaryKeyProbeStream::create(hashes, keys)), - } + &mut ctx.selection, + &mut ctx.unmatched_selection, + ); + matched_rows } - false => match self.hash_table.probe(&mut hashes, valids) { - 0 => Ok(AllUnmatchedProbeStream::create(hashes.len())), - _ => Ok(BinaryKeyProbeStream::create(hashes, keys)), + false => self.hash_table.probe(&mut hashes, valids), + }; + + ctx.probed_rows += probed_rows; + ctx.matched_rows += matched_rows; + + match matched_rows { + 0 => Ok(AllUnmatchedProbeStream::create(hashes.len())), + _ => match enable_early_filtering { + true => Ok(EarlyFilteringProbeStream::::create( + hashes, + keys, + &ctx.selection, + &ctx.unmatched_selection, + )), + false => Ok(BinaryKeyProbeStream::::create(hashes, keys)), }, } } } -pub struct BinaryKeyProbeStream { +pub struct BinaryKeyProbeStream { key_idx: usize, pointers: Vec, keys: Box<(dyn KeyAccessor)>, probe_entry_ptr: u64, } -impl BinaryKeyProbeStream { +impl BinaryKeyProbeStream { pub fn create( pointers: Vec, keys: Box>, ) -> Box { - Box::new(BinaryKeyProbeStream { + Box::new(BinaryKeyProbeStream:: { keys, pointers, key_idx: 0, @@ -167,40 +215,40 @@ impl BinaryKeyProbeStream { } } -impl ProbeStream for BinaryKeyProbeStream { - fn next(&mut self, max_rows: usize) -> Result { - unsafe { - let mut matched_build = Vec::with_capacity(max_rows); - let mut matched_probe = Vec::with_capacity(max_rows); - let mut unmatched = Vec::with_capacity(max_rows); - - while self.key_idx < self.keys.len() { - std::hint::assert_unchecked(unmatched.len() <= unmatched.capacity()); - std::hint::assert_unchecked(matched_probe.len() == matched_build.len()); - std::hint::assert_unchecked(matched_build.len() <= matched_build.capacity()); - std::hint::assert_unchecked(matched_probe.len() <= matched_probe.capacity()); - - if matched_probe.len() == max_rows { - break; - } +impl ProbeStream for BinaryKeyProbeStream { + fn advance(&mut self, res: &mut ProbedRows, max_rows: usize) -> Result<()> { + while self.key_idx < self.keys.len() { + assume(res.unmatched.len() <= res.unmatched.capacity()); + assume(res.matched_probe.len() == res.matched_build.len()); + assume(res.matched_build.len() <= res.matched_build.capacity()); + assume(res.matched_probe.len() <= res.matched_probe.capacity()); + assume(self.key_idx <= self.pointers.len()); + + if res.matched_probe.len() == max_rows { + break; + } - if self.probe_entry_ptr == 0 { - self.probe_entry_ptr = *self.pointers.get_unchecked(self.key_idx); + if self.probe_entry_ptr == 0 { + self.probe_entry_ptr = self.pointers[self.key_idx]; - if self.probe_entry_ptr == 0 { - unmatched.push(self.key_idx); - self.key_idx += 1; - continue; + if self.probe_entry_ptr == 0 { + if !MATCHED { + res.unmatched.push(self.key_idx); } + + self.key_idx += 1; + continue; } + } - let key = self.keys.key_unchecked(self.key_idx); + let key = unsafe { self.keys.key_unchecked(self.key_idx) }; - while self.probe_entry_ptr != 0 { - let raw_entry = &*(self.probe_entry_ptr as *mut StringRawEntry); - // Compare `early` and the length of the string, the size of `early` is 4. - let min_len = std::cmp::min(STRING_EARLY_SIZE, key.len()); + while self.probe_entry_ptr != 0 { + let raw_entry = unsafe { &*(self.probe_entry_ptr as *mut StringRawEntry) }; + // Compare `early` and the length of the string, the size of `early` is 4. + let min_len = std::cmp::min(STRING_EARLY_SIZE, key.len()); + unsafe { if raw_entry.length as usize == key.len() && key[0..min_len] == raw_entry.early[0..min_len] { @@ -210,32 +258,131 @@ impl ProbeStream for BinaryKeyProbeStream { ); if key == key_ref { let row_ptr = raw_entry.row_ptr; - matched_probe.push(self.key_idx as u64); - matched_build.push(row_ptr); + res.matched_probe.push(self.key_idx as u64); + res.matched_build.push(row_ptr); - if matched_probe.len() == max_rows { + if res.matched_probe.len() == max_rows { self.probe_entry_ptr = raw_entry.next; if self.probe_entry_ptr == 0 { self.key_idx += 1; } - return Ok(ProbedRows::new( - unmatched, - matched_probe, - matched_build, - )); + return Ok(()); } } } + } + + self.probe_entry_ptr = raw_entry.next; + } + + self.key_idx += 1; + } + + Ok(()) + } +} - self.probe_entry_ptr = raw_entry.next; +pub struct EarlyFilteringProbeStream<'a, const MATCHED: bool> { + idx: usize, + pointers: Vec, + keys: Box<(dyn KeyAccessor)>, + probe_entry_ptr: u64, + selections: &'a [u32], + unmatched_selection: &'a [u32], +} + +impl<'a, const MATCHED: bool> EarlyFilteringProbeStream<'a, MATCHED> { + pub fn create( + pointers: Vec, + keys: Box>, + selections: &'a [u32], + unmatched_selection: &'a [u32], + ) -> Box { + Box::new(EarlyFilteringProbeStream::<'a, MATCHED> { + keys, + pointers, + selections, + unmatched_selection, + idx: 0, + probe_entry_ptr: 0, + }) + } +} + +impl<'a, const MATCHED: bool> ProbeStream for EarlyFilteringProbeStream<'a, MATCHED> { + fn advance(&mut self, res: &mut ProbedRows, max_rows: usize) -> Result<()> { + if !MATCHED { + res.unmatched + .extend(self.unmatched_selection.iter().map(|x| *x as usize)); + } + + while self.idx < self.selections.len() { + let key_idx = self.selections[self.idx] as usize; + + assume(res.unmatched.len() <= res.unmatched.capacity()); + assume(res.matched_probe.len() == res.matched_build.len()); + assume(res.matched_build.len() <= res.matched_build.capacity()); + assume(res.matched_probe.len() <= res.matched_probe.capacity()); + assume(key_idx <= self.pointers.len()); + + if res.matched_probe.len() == max_rows { + break; + } + + if self.probe_entry_ptr == 0 { + self.probe_entry_ptr = self.pointers[key_idx]; + + if self.probe_entry_ptr == 0 { + if !MATCHED { + res.unmatched.push(key_idx); + } + + self.idx += 1; + continue; + } + } + + let key = unsafe { self.keys.key_unchecked(key_idx) }; + + while self.probe_entry_ptr != 0 { + let raw_entry = unsafe { &*(self.probe_entry_ptr as *mut StringRawEntry) }; + // Compare `early` and the length of the string, the size of `early` is 4. + let min_len = std::cmp::min(STRING_EARLY_SIZE, key.len()); + + unsafe { + if raw_entry.length as usize == key.len() + && key[0..min_len] == raw_entry.early[0..min_len] + { + let key_ref = std::slice::from_raw_parts( + raw_entry.key as *const u8, + raw_entry.length as usize, + ); + if key == key_ref { + let row_ptr = raw_entry.row_ptr; + res.matched_probe.push(key_idx as u64); + res.matched_build.push(row_ptr); + + if res.matched_probe.len() == max_rows { + self.probe_entry_ptr = raw_entry.next; + + if self.probe_entry_ptr == 0 { + self.idx += 1; + } + + return Ok(()); + } + } + } } - self.key_idx += 1; + self.probe_entry_ptr = raw_entry.next; } - Ok(ProbedRows::new(unmatched, matched_probe, matched_build)) + self.idx += 1; } + + Ok(()) } } diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/hashtable/single_binary_key.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/hashtable/single_binary_key.rs index 6c5a6e5bb279f..38a0ce38b9ff7 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/hashtable/single_binary_key.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/hashtable/single_binary_key.rs @@ -12,10 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. -use std::sync::atomic::AtomicUsize; -use std::sync::atomic::Ordering; - -use databend_common_column::bitmap::Bitmap; use databend_common_exception::Result; use databend_common_expression::Column; use databend_common_expression::DataBlock; @@ -29,9 +25,12 @@ use databend_common_hashtable::RowPtr; use databend_common_hashtable::StringRawEntry; use databend_common_hashtable::STRING_EARLY_SIZE; +use crate::pipelines::processors::transforms::new_hash_join::hashtable::basic::AllUnmatchedProbeStream; +use crate::pipelines::processors::transforms::new_hash_join::hashtable::basic::EmptyProbeStream; +use crate::pipelines::processors::transforms::new_hash_join::hashtable::basic::ProbeStream; use crate::pipelines::processors::transforms::new_hash_join::hashtable::serialize_keys::BinaryKeyProbeStream; +use crate::pipelines::processors::transforms::new_hash_join::hashtable::serialize_keys::EarlyFilteringProbeStream; use crate::pipelines::processors::transforms::new_hash_join::hashtable::ProbeData; -use crate::pipelines::processors::transforms::new_hash_join::hashtable::basic::{AllUnmatchedProbeStream, ProbeStream}; use crate::pipelines::processors::transforms::SingleBinaryHashJoinHashTable; impl SingleBinaryHashJoinHashTable { @@ -42,8 +41,6 @@ impl SingleBinaryHashJoinHashTable { SingleBinaryHashJoinHashTable { hash_table, hash_method, - probed_rows: AtomicUsize::new(0), - matched_probe_rows: AtomicUsize::new(0), } } @@ -103,44 +100,94 @@ impl SingleBinaryHashJoinHashTable { Ok(()) } - pub fn probe(&self, probe_data: ProbeData) -> Result> { - let num_rows = probe_data.num_rows(); + pub fn probe_matched<'a>(&'a self, data: ProbeData<'a>) -> Result> { + let num_rows = data.num_rows(); let hash_method = &self.hash_method; let mut hashes = Vec::with_capacity(num_rows); - let keys = ProjectedBlock::from(probe_data.columns()); + let keys = ProjectedBlock::from(data.columns()); let keys_state = hash_method.build_keys_state(keys, num_rows)?; hash_method.build_keys_hashes(&keys_state, &mut hashes); let keys = hash_method.build_keys_accessor(keys_state.clone())?; - let enable_early_filtering = match self.probed_rows.load(Ordering::Relaxed) { + let probed_rows = data.non_null_rows(); + let (_, valids, ctx) = data.into_raw(); + + let enable_early_filtering = match ctx.probed_rows { 0 => false, - probed_rows => { - let matched_probe_rows = self.matched_probe_rows.load(Ordering::Relaxed) as f64; - matched_probe_rows / (probed_rows as f64) < 0.8 - } + probed_rows => (ctx.matched_rows as f64) / (probed_rows as f64) < 0.8, }; - let probed_rows = probe_data.non_null_rows(); - self.probed_rows.fetch_add(probed_rows, Ordering::Relaxed); + let matched_rows = match enable_early_filtering { + true => self.hash_table.early_filtering_matched_probe( + &mut hashes, + valids, + &mut ctx.selection, + ), + false => self.hash_table.probe(&mut hashes, valids), + }; - let (_, valids) = probe_data.into_raw(); - match enable_early_filtering { - true => { - let mut selection = vec![0; num_rows]; + ctx.probed_rows += probed_rows; + ctx.matched_rows += matched_rows; + + match matched_rows { + 0 => Ok(Box::new(EmptyProbeStream)), + _ => match enable_early_filtering { + true => Ok(EarlyFilteringProbeStream::::create( + hashes, + keys, + &ctx.selection, + &[], + )), + false => Ok(BinaryKeyProbeStream::::create(hashes, keys)), + }, + } + } + + pub fn probe<'a>(&'a self, data: ProbeData<'a>) -> Result> { + let num_rows = data.num_rows(); + let hash_method = &self.hash_method; + let mut hashes = Vec::with_capacity(num_rows); - match self.hash_table.early_filtering_matched_probe( + let keys = ProjectedBlock::from(data.columns()); + let keys_state = hash_method.build_keys_state(keys, num_rows)?; + hash_method.build_keys_hashes(&keys_state, &mut hashes); + let keys = hash_method.build_keys_accessor(keys_state.clone())?; + + let probed_rows = data.non_null_rows(); + let (_, valids, ctx) = data.into_raw(); + + let enable_early_filtering = match ctx.probed_rows { + 0 => false, + probed_rows => (ctx.matched_rows as f64) / (probed_rows as f64) < 0.8, + }; + + let matched_rows = match enable_early_filtering { + true => { + let (matched_rows, _) = self.hash_table.early_filtering_probe( &mut hashes, valids, - &mut selection, - ) { - 0 => Ok(AllUnmatchedProbeStream::create(hashes.len())), - _ => Ok(BinaryKeyProbeStream::create(hashes, keys)), - } + &mut ctx.selection, + &mut ctx.unmatched_selection, + ); + matched_rows } - false => match self.hash_table.probe(&mut hashes, valids) { - 0 => Ok(AllUnmatchedProbeStream::create(hashes.len())), - _ => Ok(BinaryKeyProbeStream::create(hashes, keys)), + false => self.hash_table.probe(&mut hashes, valids), + }; + + ctx.probed_rows += probed_rows; + ctx.matched_rows += matched_rows; + + match matched_rows { + 0 => Ok(AllUnmatchedProbeStream::create(hashes.len())), + _ => match enable_early_filtering { + true => Ok(EarlyFilteringProbeStream::::create( + hashes, + keys, + &ctx.selection, + &ctx.unmatched_selection, + )), + false => Ok(BinaryKeyProbeStream::::create(hashes, keys)), }, } } diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/memory_inner_join.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/memory_inner_join.rs index 2bc72496a8a18..4a10e90b6b2d2 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/memory_inner_join.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/memory_inner_join.rs @@ -21,24 +21,23 @@ use databend_common_catalog::table_context::TableContext; use databend_common_column::bitmap::Bitmap; use databend_common_exception::ErrorCode; use databend_common_exception::Result; -use databend_common_expression::types::BooleanType; use databend_common_expression::types::NullableColumn; use databend_common_expression::with_join_hash_method; use databend_common_expression::BlockEntry; use databend_common_expression::Column; use databend_common_expression::DataBlock; -use databend_common_expression::Evaluator; use databend_common_expression::FilterExecutor; use databend_common_expression::FunctionContext; use databend_common_expression::HashMethodKind; use databend_common_expression::HashMethodSerializer; use databend_common_expression::HashMethodSingleBinary; -use databend_common_functions::BUILTIN_FUNCTIONS; use databend_common_hashtable::BinaryHashJoinHashMap; use databend_common_hashtable::HashJoinHashMap; use ethnum::U256; use crate::pipelines::processors::transforms::new_hash_join::common::SquashBlocks; +use crate::pipelines::processors::transforms::new_hash_join::hashtable::basic::ProbeStream; +use crate::pipelines::processors::transforms::new_hash_join::hashtable::basic::ProbedRows; use crate::pipelines::processors::transforms::new_hash_join::hashtable::ProbeData; use crate::pipelines::processors::transforms::new_hash_join::join::EmptyJoinStream; use crate::pipelines::processors::transforms::new_hash_join::join::Join; @@ -50,7 +49,6 @@ use crate::pipelines::processors::transforms::HashJoinHashTable; use crate::pipelines::processors::transforms::SerializerHashJoinHashTable; use crate::pipelines::processors::transforms::SingleBinaryHashJoinHashTable; use crate::pipelines::processors::HashJoinDesc; -use crate::pipelines::processors::transforms::new_hash_join::hashtable::basic::{ProbeStream, ProbedRows}; use crate::sessions::QueryContext; pub struct MemoryInnerJoin { @@ -76,13 +74,14 @@ impl MemoryInnerJoin { let block_size = settings.get_max_block_size()? as usize; let block_bytes = settings.get_max_block_size()? as usize; + let context = PerformanceContext::create(block_size, desc.clone(), function_ctx.clone()); Ok(MemoryInnerJoin { desc, state, method, function_ctx, squash_block: SquashBlocks::new(block_size, block_bytes), - performance_context: PerformanceContext::new(), + performance_context: context, }) } @@ -298,8 +297,11 @@ impl Join for MemoryInnerJoin { let joined_stream: Box = with_join_hash_method!(|T| match self.state.hash_table.deref() { HashJoinHashTable::T(table) => { - let probe_data = ProbeData::new(keys, valids); - let probe_keys_stream = table.probe(probe_data)?; + let probe_hash_statistics = &mut self.performance_context.probe_hash_statistics; + probe_hash_statistics.clear(); + + let probe_data = ProbeData::new(keys, valids, probe_hash_statistics); + let probe_keys_stream = table.probe_matched(probe_data)?; Ok(MemoryInnerJoinStream::create( probe_block, @@ -314,17 +316,10 @@ impl Join for MemoryInnerJoin { )), })?; - // if let Some(filter_executor) = &mut self.performance_context.filter_executor { match &mut self.performance_context.filter_executor { None => Ok(joined_stream), - Some(filter_executor) => Ok(FilterJoinStream::create( - self.desc.clone(), - self.function_ctx.clone(), - joined_stream, - filter_executor, - )), + Some(filter_executor) => Ok(FilterJoinStream::create(joined_stream, filter_executor)), } - // } } fn final_probe(&mut self) -> Result> { @@ -337,7 +332,7 @@ struct MemoryInnerJoinStream<'a> { probe_data_block: DataBlock, join_state: Arc, probe_keys_stream: Box, - test: &'a mut ProbedRows, + probed_rows: &'a mut ProbedRows, } unsafe impl<'a> Send for MemoryInnerJoinStream<'a> {} @@ -345,18 +340,18 @@ unsafe impl<'a> Sync for MemoryInnerJoinStream<'a> {} impl<'a> MemoryInnerJoinStream<'a> { pub fn create( - block: DataBlock, - state: Arc, + probe_data_block: DataBlock, + join_state: Arc, probe_keys_stream: Box, desc: Arc, - test: &'a mut ProbedRows, + probed_rows: &'a mut ProbedRows, ) -> Box { Box::new(MemoryInnerJoinStream { - probe_data_block: block, - join_state: state, - probe_keys_stream, desc, - test, + join_state, + probed_rows, + probe_data_block, + probe_keys_stream, }) } } @@ -364,13 +359,15 @@ impl<'a> MemoryInnerJoinStream<'a> { impl<'a> JoinStream for MemoryInnerJoinStream<'a> { fn next(&mut self) -> Result> { loop { - let probe_result = self.probe_keys_stream.next(65535)?; + self.probed_rows.clear(); + let max_rows = self.probed_rows.matched_probe.capacity(); + self.probe_keys_stream.advance(self.probed_rows, max_rows)?; - if probe_result.is_empty() { + if self.probed_rows.is_empty() { return Ok(None); } - if probe_result.is_all_unmatched() { + if self.probed_rows.is_all_unmatched() { continue; } @@ -378,14 +375,14 @@ impl<'a> JoinStream for MemoryInnerJoinStream<'a> { 0 => None, _ => Some(DataBlock::take( &self.probe_data_block, - &probe_result.matched_probe, + &self.probed_rows.matched_probe, )?), }; let build_block = match self.join_state.columns.is_empty() { true => None, false => { - let row_ptrs = probe_result.matched_build.as_slice(); + let row_ptrs = self.probed_rows.matched_build.as_slice(); Some(DataBlock::take_column_vec( self.join_state.columns.as_slice(), self.join_state.column_types.as_slice(), @@ -402,7 +399,7 @@ impl<'a> JoinStream for MemoryInnerJoinStream<'a> { } (Some(probe_block), None) => probe_block, (None, Some(build_block)) => build_block, - (None, None) => DataBlock::new(vec![], probe_result.matched_build.len()), + (None, None) => DataBlock::new(vec![], self.probed_rows.matched_build.len()), }; if !self.desc.probe_to_build.is_empty() { @@ -438,23 +435,17 @@ impl<'a> JoinStream for MemoryInnerJoinStream<'a> { } pub struct FilterJoinStream<'a> { - desc: Arc, - function_ctx: FunctionContext, inner: Box, filter_executor: &'a mut FilterExecutor, } impl<'a> FilterJoinStream<'a> { pub fn create( - desc: Arc, - function_ctx: FunctionContext, inner: Box, filter_executor: &'a mut FilterExecutor, ) -> Box { Box::new(FilterJoinStream { - desc, inner, - function_ctx, filter_executor, }) } @@ -471,14 +462,7 @@ impl<'a> JoinStream for FilterJoinStream<'a> { continue; } - let filter = self.desc.other_predicate.as_ref().unwrap(); - let evaluator = Evaluator::new(&data_block, &self.function_ctx, &BUILTIN_FUNCTIONS); - let filter = evaluator - .run(filter)? - .try_downcast::() - .unwrap(); - - let data_block = data_block.filter_boolean_value(&filter)?; + let data_block = self.filter_executor.filter(data_block)?; if data_block.is_empty() { continue; diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/mod.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/mod.rs index 1edcb9ea410c5..04f8329eb5c12 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/mod.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/mod.rs @@ -13,12 +13,12 @@ // limitations under the License. mod common; +mod hashtable; mod join; mod memory; +mod performance; mod runtime_filter; mod transform_hash_join; -mod hashtable; -mod performance; pub use join::Join; pub use memory::HashJoinMemoryState; diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/performance.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/performance.rs index 77ee1db2cf9e3..4a75913e3d1de 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/performance.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/performance.rs @@ -12,20 +12,47 @@ // See the License for the specific language governing permissions and // limitations under the License. +use std::sync::Arc; + use databend_common_expression::FilterExecutor; +use databend_common_expression::FunctionContext; +use databend_common_functions::BUILTIN_FUNCTIONS; use crate::pipelines::processors::transforms::new_hash_join::hashtable::basic::ProbedRows; +use crate::pipelines::processors::transforms::new_hash_join::hashtable::ProbeHashStatistics; +use crate::pipelines::processors::HashJoinDesc; pub struct PerformanceContext { pub probe_result: ProbedRows, pub filter_executor: Option, + pub probe_hash_statistics: ProbeHashStatistics, } impl PerformanceContext { - pub fn new() -> Self { + pub fn create( + max_block_size: usize, + desc: Arc, + function_context: FunctionContext, + ) -> Self { + let filter_executor = desc.other_predicate.as_ref().map(|predicate| { + FilterExecutor::new( + predicate.clone(), + function_context, + max_block_size, + None, + &BUILTIN_FUNCTIONS, + false, + ) + }); + PerformanceContext { - probe_result: ProbedRows::new(vec![], vec![], vec![]), - filter_executor: None, + filter_executor, + probe_result: ProbedRows::new( + Vec::with_capacity(max_block_size), + Vec::with_capacity(max_block_size), + Vec::with_capacity(max_block_size), + ), + probe_hash_statistics: ProbeHashStatistics::new(max_block_size), } } } diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/transform_hash_join.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/transform_hash_join.rs index 714b980fb82c6..601df7e8262b9 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/transform_hash_join.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/transform_hash_join.rs @@ -29,7 +29,6 @@ use tokio::sync::Barrier; use crate::pipelines::processors::transforms::new_hash_join::join::Join; use crate::pipelines::processors::transforms::new_hash_join::join::JoinStream; -use crate::pipelines::processors::transforms::new_hash_join::performance::PerformanceContext; use crate::pipelines::processors::transforms::new_hash_join::runtime_filter::PlanRuntimeFilterDesc; pub struct TransformHashJoin { @@ -118,6 +117,7 @@ impl Processor for TransformHashJoin { } } + #[allow(clippy::missing_transmute_annotations)] fn process(&mut self) -> Result<()> { match &mut self.stage { Stage::Finished => Ok(()), @@ -143,6 +143,7 @@ impl Processor for TransformHashJoin { Stage::Probe(state) => { if let Some(probe_data) = state.input_data.take() { let stream = self.join.probe_block(probe_data)?; + // This is safe because both join and stream are properties of the struct. state.stream = Some(unsafe { std::mem::transmute(stream) }); } @@ -159,6 +160,7 @@ impl Processor for TransformHashJoin { if !state.initialized { state.initialized = true; let final_stream = self.join.final_probe()?; + // This is safe because both join and stream are properties of the struct. state.stream = Some(unsafe { std::mem::transmute(final_stream) }); } From 11a2f726d7048ef4f71c19edabe24e011402da0e Mon Sep 17 00:00:00 2001 From: zhang2014 Date: Thu, 2 Oct 2025 11:04:30 +0800 Subject: [PATCH 03/24] refactor(query): refactor left outer join to new join --- .../src/physical_plans/physical_hash_join.rs | 40 +- .../service/src/pipelines/pipeline_builder.rs | 4 +- .../new_hash_join/hashtable/basic.rs | 6 +- .../new_hash_join/hashtable/fixed_keys.rs | 6 +- .../new_hash_join/hashtable/serialize_keys.rs | 6 +- .../transforms/new_hash_join/join.rs | 4 +- .../memory/{memory_inner_join.rs => basic.rs} | 371 ++++-------------- .../{memory_state.rs => basic_state.rs} | 6 +- .../new_hash_join/memory/inner_join.rs | 287 ++++++++++++++ .../transforms/new_hash_join/memory/mod.rs | 10 +- .../new_hash_join/memory/outer_left_join.rs | 299 ++++++++++++++ .../transforms/new_hash_join/mod.rs | 6 +- .../transforms/transform_cache_scan.rs | 6 +- 13 files changed, 720 insertions(+), 331 deletions(-) rename src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/{memory_inner_join.rs => basic.rs} (54%) rename src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/{memory_state.rs => basic_state.rs} (95%) create mode 100644 src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/inner_join.rs create mode 100644 src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/outer_left_join.rs diff --git a/src/query/service/src/physical_plans/physical_hash_join.rs b/src/query/service/src/physical_plans/physical_hash_join.rs index 2ad39f89dfede..6fafedcebec87 100644 --- a/src/query/service/src/physical_plans/physical_hash_join.rs +++ b/src/query/service/src/physical_plans/physical_hash_join.rs @@ -55,9 +55,10 @@ use crate::physical_plans::physical_plan::PhysicalPlan; use crate::physical_plans::physical_plan::PhysicalPlanMeta; use crate::physical_plans::Exchange; use crate::physical_plans::PhysicalPlanBuilder; -use crate::pipelines::processors::transforms::HashJoinMemoryState; +use crate::pipelines::processors::transforms::memory::outer_left_join::OuterLeftHashJoin; +use crate::pipelines::processors::transforms::BasicHashJoinState; use crate::pipelines::processors::transforms::HashJoinProbeState; -use crate::pipelines::processors::transforms::MemoryInnerJoin; +use crate::pipelines::processors::transforms::InnerHashJoin; use crate::pipelines::processors::transforms::PlanRuntimeFilterDesc; use crate::pipelines::processors::transforms::TransformHashJoin; use crate::pipelines::processors::transforms::TransformHashJoinBuild; @@ -261,7 +262,7 @@ impl IPhysicalPlan for HashJoin { let (enable_optimization, _) = builder.merge_into_get_optimization_flag(self); if desc.single_to_inner.is_none() - && self.join_type == JoinType::Inner + && (self.join_type == JoinType::Inner || self.join_type == JoinType::Left) && experimental_new_join && !enable_optimization { @@ -394,7 +395,7 @@ impl HashJoin { builder: &mut PipelineBuilder, desc: Arc, ) -> Result<()> { - let state = Arc::new(HashJoinMemoryState::create()); + let state = Arc::new(BasicHashJoinState::create()); // We must build the runtime filter before constructing the child nodes, // as we will inject some runtime filter information into the context for the child nodes to use. let rf_desc = PlanRuntimeFilterDesc::create(&builder.ctx, self); @@ -440,7 +441,7 @@ impl HashJoin { build_input.clone(), probe_input.clone(), joined_output.clone(), - self.create_join(builder, desc.clone(), state.clone())?, + self.create_join(&self.join_type, builder, desc.clone(), state.clone())?, stage_sync_barrier.clone(), self.projections.clone(), rf_desc.clone(), @@ -466,10 +467,11 @@ impl HashJoin { fn create_join( &self, + join_type: &JoinType, builder: &mut PipelineBuilder, desc: Arc, - state: Arc, - ) -> Result> { + state: Arc, + ) -> Result> { let hash_key_types = self .build_keys .iter() @@ -486,13 +488,23 @@ impl HashJoin { let method = DataBlock::choose_hash_method_with_types(&hash_key_types)?; - Ok(Box::new(MemoryInnerJoin::create( - &builder.ctx, - builder.func_ctx.clone(), - method, - desc, - state, - )?)) + Ok(match join_type { + JoinType::Inner => Box::new(InnerHashJoin::create( + &builder.ctx, + builder.func_ctx.clone(), + method, + desc, + state, + )?), + JoinType::Left => Box::new(OuterLeftHashJoin::create( + &builder.ctx, + builder.func_ctx.clone(), + method, + desc, + state, + )?), + _ => unreachable!(), + }) } } diff --git a/src/query/service/src/pipelines/pipeline_builder.rs b/src/query/service/src/pipelines/pipeline_builder.rs index 97a6f302ea1fe..c4e9c61af6739 100644 --- a/src/query/service/src/pipelines/pipeline_builder.rs +++ b/src/query/service/src/pipelines/pipeline_builder.rs @@ -27,7 +27,7 @@ use databend_common_settings::Settings; use super::PipelineBuilderData; use crate::interpreters::CreateTableInterpreter; use crate::physical_plans::PhysicalPlan; -use crate::pipelines::processors::transforms::HashJoinMemoryState; +use crate::pipelines::processors::transforms::BasicHashJoinState; use crate::pipelines::processors::HashJoinBuildState; use crate::pipelines::processors::HashJoinState; use crate::pipelines::PipelineBuildResult; @@ -38,7 +38,7 @@ use crate::sessions::QueryContext; #[derive(Clone)] pub enum HashJoinStateRef { OldHashJoinState(Arc), - NewHashJoinState(Arc), + NewHashJoinState(Arc), } pub struct PipelineBuilder { diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/hashtable/basic.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/hashtable/basic.rs index 69676a93af6ab..95befffdd3bba 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/hashtable/basic.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/hashtable/basic.rs @@ -16,7 +16,7 @@ use databend_common_exception::Result; use databend_common_hashtable::RowPtr; pub struct ProbedRows { - pub unmatched: Vec, + pub unmatched: Vec, pub matched_probe: Vec, pub matched_build: Vec, } @@ -27,7 +27,7 @@ impl ProbedRows { } pub fn new( - unmatched: Vec, + unmatched: Vec, matched_probe: Vec, matched_build: Vec, ) -> ProbedRows { @@ -48,7 +48,7 @@ impl ProbedRows { self.matched_build.is_empty() && !self.unmatched.is_empty() } - pub fn all_unmatched(unmatched: Vec) -> ProbedRows { + pub fn all_unmatched(unmatched: Vec) -> ProbedRows { ProbedRows::new(unmatched, vec![], vec![]) } diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/hashtable/fixed_keys.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/hashtable/fixed_keys.rs index 3d756a0ba98da..56593e993a03f 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/hashtable/fixed_keys.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/hashtable/fixed_keys.rs @@ -208,8 +208,8 @@ impl ProbeStream self.probe_entry_ptr = self.pointers[self.key_idx]; if self.probe_entry_ptr == 0 { - if MATCHED { - res.unmatched.push(self.key_idx); + if !MATCHED { + res.unmatched.push(self.key_idx as u64); } self.key_idx += 1; @@ -283,7 +283,7 @@ impl<'a, Key: FixedKey + HashtableKeyable, const MATCHED: bool> ProbeStream fn advance(&mut self, res: &mut ProbedRows, max_rows: usize) -> Result<()> { if !MATCHED { res.unmatched - .extend(self.unmatched_selection.iter().map(|x| *x as usize)); + .extend(self.unmatched_selection.iter().map(|x| *x as u64)); } while self.idx < self.selections.len() { diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/hashtable/serialize_keys.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/hashtable/serialize_keys.rs index f8532f152a578..bed09ff5c90c9 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/hashtable/serialize_keys.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/hashtable/serialize_keys.rs @@ -233,7 +233,7 @@ impl ProbeStream for BinaryKeyProbeStream { if self.probe_entry_ptr == 0 { if !MATCHED { - res.unmatched.push(self.key_idx); + res.unmatched.push(self.key_idx as u64); } self.key_idx += 1; @@ -315,7 +315,7 @@ impl<'a, const MATCHED: bool> ProbeStream for EarlyFilteringProbeStream<'a, MATC fn advance(&mut self, res: &mut ProbedRows, max_rows: usize) -> Result<()> { if !MATCHED { res.unmatched - .extend(self.unmatched_selection.iter().map(|x| *x as usize)); + .extend(self.unmatched_selection.iter().map(|x| *x as u64)); } while self.idx < self.selections.len() { @@ -336,7 +336,7 @@ impl<'a, const MATCHED: bool> ProbeStream for EarlyFilteringProbeStream<'a, MATC if self.probe_entry_ptr == 0 { if !MATCHED { - res.unmatched.push(key_idx); + res.unmatched.push(key_idx as u64); } self.idx += 1; diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/join.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/join.rs index 56fad77007275..c5fece947030b 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/join.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/join.rs @@ -33,7 +33,9 @@ pub trait Join: Send + Sync + 'static { fn probe_block(&mut self, data: DataBlock) -> Result>; - fn final_probe(&mut self) -> Result>; + fn final_probe(&mut self) -> Result> { + Ok(Box::new(EmptyJoinStream)) + } } pub struct EmptyJoinStream; diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/memory_inner_join.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/basic.rs similarity index 54% rename from src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/memory_inner_join.rs rename to src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/basic.rs index 4a10e90b6b2d2..941b4327d43a7 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/memory_inner_join.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/basic.rs @@ -18,15 +18,9 @@ use std::sync::PoisonError; use databend_common_base::base::ProgressValues; use databend_common_catalog::table_context::TableContext; -use databend_common_column::bitmap::Bitmap; -use databend_common_exception::ErrorCode; use databend_common_exception::Result; -use databend_common_expression::types::NullableColumn; -use databend_common_expression::with_join_hash_method; -use databend_common_expression::BlockEntry; use databend_common_expression::Column; use databend_common_expression::DataBlock; -use databend_common_expression::FilterExecutor; use databend_common_expression::FunctionContext; use databend_common_expression::HashMethodKind; use databend_common_expression::HashMethodSerializer; @@ -36,14 +30,7 @@ use databend_common_hashtable::HashJoinHashMap; use ethnum::U256; use crate::pipelines::processors::transforms::new_hash_join::common::SquashBlocks; -use crate::pipelines::processors::transforms::new_hash_join::hashtable::basic::ProbeStream; -use crate::pipelines::processors::transforms::new_hash_join::hashtable::basic::ProbedRows; -use crate::pipelines::processors::transforms::new_hash_join::hashtable::ProbeData; -use crate::pipelines::processors::transforms::new_hash_join::join::EmptyJoinStream; -use crate::pipelines::processors::transforms::new_hash_join::join::Join; -use crate::pipelines::processors::transforms::new_hash_join::join::JoinStream; -use crate::pipelines::processors::transforms::new_hash_join::memory::memory_state::HashJoinMemoryState; -use crate::pipelines::processors::transforms::new_hash_join::performance::PerformanceContext; +use crate::pipelines::processors::transforms::BasicHashJoinState; use crate::pipelines::processors::transforms::FixedKeyHashJoinHashTable; use crate::pipelines::processors::transforms::HashJoinHashTable; use crate::pipelines::processors::transforms::SerializerHashJoinHashTable; @@ -51,41 +38,109 @@ use crate::pipelines::processors::transforms::SingleBinaryHashJoinHashTable; use crate::pipelines::processors::HashJoinDesc; use crate::sessions::QueryContext; -pub struct MemoryInnerJoin { +pub struct BasicHashJoin { desc: Arc, squash_block: SquashBlocks, method: HashMethodKind, function_ctx: FunctionContext, - state: Arc, - - performance_context: PerformanceContext, + state: Arc, } -impl MemoryInnerJoin { +impl BasicHashJoin { pub fn create( ctx: &QueryContext, function_ctx: FunctionContext, method: HashMethodKind, desc: Arc, - state: Arc, + state: Arc, ) -> Result { let settings = ctx.get_settings(); let block_size = settings.get_max_block_size()? as usize; let block_bytes = settings.get_max_block_size()? as usize; - let context = PerformanceContext::create(block_size, desc.clone(), function_ctx.clone()); - Ok(MemoryInnerJoin { + Ok(BasicHashJoin { desc, state, method, function_ctx, squash_block: SquashBlocks::new(block_size, block_bytes), - performance_context: context, }) } + pub(crate) fn add_block(&mut self, mut data: Option) -> Result<()> { + let mut squashed_block = match data.take() { + None => self.squash_block.finalize()?, + Some(data_block) => self.squash_block.add_block(data_block)?, + }; + + if let Some(squashed_block) = squashed_block.take() { + let locked = self.state.mutex.lock(); + let _locked = locked.unwrap_or_else(PoisonError::into_inner); + + *self.state.build_rows.as_mut() += squashed_block.num_rows(); + let chunk_index = self.state.chunks.len(); + self.state.chunks.as_mut().push(squashed_block); + self.state.build_queue.as_mut().push_back(chunk_index); + } + + Ok(()) + } + + pub(crate) fn final_build(&mut self) -> Result> { + self.init_memory_hash_table(); + + let Some(chunk_index) = self.steal_chunk_index() else { + return Ok(None); + }; + + let mut chunk_block = DataBlock::empty(); + + // take storage block + { + let chunks = self.state.chunks.as_mut(); + std::mem::swap(&mut chunks[chunk_index], &mut chunk_block); + } + + let keys_entries = self.desc.build_key(&chunk_block, &self.function_ctx)?; + let mut keys_block = DataBlock::new(keys_entries, chunk_block.num_rows()); + + chunk_block = chunk_block.project(&self.desc.build_projection); + if let Some(bitmap) = self.desc.build_valids_by_keys(&keys_block)? { + keys_block = keys_block.filter_with_bitmap(&bitmap)?; + + if bitmap.null_count() != bitmap.len() { + chunk_block = chunk_block.filter_with_bitmap(&bitmap)?; + } + } + + self.desc.remove_keys_nullable(&mut keys_block); + + let num_rows = chunk_block.num_rows(); + let num_bytes = chunk_block.memory_size(); + + // restore storage block + { + let chunks = self.state.chunks.as_mut(); + std::mem::swap(&mut chunks[chunk_index], &mut chunk_block); + } + + self.build_hash_table(keys_block, chunk_index)?; + + Ok(Some(ProgressValues { + rows: num_rows, + bytes: num_bytes, + })) + } +} + +impl BasicHashJoin { + fn steal_chunk_index(&self) -> Option { + let locked = self.state.mutex.lock(); + let _locked = locked.unwrap_or_else(PoisonError::into_inner); + self.state.build_queue.as_mut().pop_front() + } - fn init_columns_vec(&mut self) { + pub(crate) fn finalize_chunks(&mut self) { if self.desc.build_projection.is_empty() || !self.state.columns.is_empty() { return; } @@ -203,272 +258,4 @@ impl MemoryInnerJoin { Ok(()) } - - fn steal_chunk_index(&self) -> Option { - let locked = self.state.mutex.lock(); - let _locked = locked.unwrap_or_else(PoisonError::into_inner); - self.state.build_queue.as_mut().pop_front() - } -} - -impl Join for MemoryInnerJoin { - fn add_block(&mut self, mut data: Option) -> Result<()> { - let mut squashed_block = match data.take() { - None => self.squash_block.finalize()?, - Some(data_block) => self.squash_block.add_block(data_block)?, - }; - - if let Some(squashed_block) = squashed_block.take() { - let locked = self.state.mutex.lock(); - let _locked = locked.unwrap_or_else(PoisonError::into_inner); - - *self.state.build_rows.as_mut() += squashed_block.num_rows(); - let chunk_index = self.state.chunks.len(); - self.state.chunks.as_mut().push(squashed_block); - self.state.build_queue.as_mut().push_back(chunk_index); - } - - Ok(()) - } - - fn final_build(&mut self) -> Result> { - self.init_memory_hash_table(); - - let Some(chunk_index) = self.steal_chunk_index() else { - return Ok(None); - }; - - let mut chunk_block = DataBlock::empty(); - - // take storage block - { - let chunks = self.state.chunks.as_mut(); - std::mem::swap(&mut chunks[chunk_index], &mut chunk_block); - } - - let keys_entries = self.desc.build_key(&chunk_block, &self.function_ctx)?; - let mut keys_block = DataBlock::new(keys_entries, chunk_block.num_rows()); - - chunk_block = chunk_block.project(&self.desc.build_projection); - if let Some(bitmap) = self.desc.build_valids_by_keys(&keys_block)? { - keys_block = keys_block.filter_with_bitmap(&bitmap)?; - - if bitmap.null_count() != bitmap.len() { - chunk_block = chunk_block.filter_with_bitmap(&bitmap)?; - } - } - - self.desc.remove_keys_nullable(&mut keys_block); - - let num_rows = chunk_block.num_rows(); - let num_bytes = chunk_block.memory_size(); - - // restore storage block - { - let chunks = self.state.chunks.as_mut(); - std::mem::swap(&mut chunks[chunk_index], &mut chunk_block); - } - - self.build_hash_table(keys_block, chunk_index)?; - - Ok(Some(ProgressValues { - rows: num_rows, - bytes: num_bytes, - })) - } - - fn probe_block(&mut self, data: DataBlock) -> Result> { - if data.is_empty() { - return Ok(Box::new(EmptyJoinStream)); - } - - self.init_columns_vec(); - let probe_keys = self.desc.probe_key(&data, &self.function_ctx)?; - - let mut keys = DataBlock::new(probe_keys, data.num_rows()); - let valids = match self.desc.from_correlated_subquery { - true => None, - false => self.desc.build_valids_by_keys(&keys)?, - }; - - self.desc.remove_keys_nullable(&mut keys); - let probe_block = data.project(&self.desc.probe_projections); - - let joined_stream: Box = - with_join_hash_method!(|T| match self.state.hash_table.deref() { - HashJoinHashTable::T(table) => { - let probe_hash_statistics = &mut self.performance_context.probe_hash_statistics; - probe_hash_statistics.clear(); - - let probe_data = ProbeData::new(keys, valids, probe_hash_statistics); - let probe_keys_stream = table.probe_matched(probe_data)?; - - Ok(MemoryInnerJoinStream::create( - probe_block, - self.state.clone(), - probe_keys_stream, - self.desc.clone(), - &mut self.performance_context.probe_result, - )) - } - HashJoinHashTable::Null => Err(ErrorCode::AbortedQuery( - "Aborted query, because the hash table is uninitialized.", - )), - })?; - - match &mut self.performance_context.filter_executor { - None => Ok(joined_stream), - Some(filter_executor) => Ok(FilterJoinStream::create(joined_stream, filter_executor)), - } - } - - fn final_probe(&mut self) -> Result> { - Ok(Box::new(EmptyJoinStream)) - } -} - -struct MemoryInnerJoinStream<'a> { - desc: Arc, - probe_data_block: DataBlock, - join_state: Arc, - probe_keys_stream: Box, - probed_rows: &'a mut ProbedRows, -} - -unsafe impl<'a> Send for MemoryInnerJoinStream<'a> {} -unsafe impl<'a> Sync for MemoryInnerJoinStream<'a> {} - -impl<'a> MemoryInnerJoinStream<'a> { - pub fn create( - probe_data_block: DataBlock, - join_state: Arc, - probe_keys_stream: Box, - desc: Arc, - probed_rows: &'a mut ProbedRows, - ) -> Box { - Box::new(MemoryInnerJoinStream { - desc, - join_state, - probed_rows, - probe_data_block, - probe_keys_stream, - }) - } -} - -impl<'a> JoinStream for MemoryInnerJoinStream<'a> { - fn next(&mut self) -> Result> { - loop { - self.probed_rows.clear(); - let max_rows = self.probed_rows.matched_probe.capacity(); - self.probe_keys_stream.advance(self.probed_rows, max_rows)?; - - if self.probed_rows.is_empty() { - return Ok(None); - } - - if self.probed_rows.is_all_unmatched() { - continue; - } - - let probe_block = match self.probe_data_block.num_columns() { - 0 => None, - _ => Some(DataBlock::take( - &self.probe_data_block, - &self.probed_rows.matched_probe, - )?), - }; - - let build_block = match self.join_state.columns.is_empty() { - true => None, - false => { - let row_ptrs = self.probed_rows.matched_build.as_slice(); - Some(DataBlock::take_column_vec( - self.join_state.columns.as_slice(), - self.join_state.column_types.as_slice(), - row_ptrs, - row_ptrs.len(), - )) - } - }; - - let mut result_block = match (probe_block, build_block) { - (Some(mut probe_block), Some(build_block)) => { - probe_block.merge_block(build_block); - probe_block - } - (Some(probe_block), None) => probe_block, - (None, Some(build_block)) => build_block, - (None, None) => DataBlock::new(vec![], self.probed_rows.matched_build.len()), - }; - - if !self.desc.probe_to_build.is_empty() { - for (index, (is_probe_nullable, is_build_nullable)) in - self.desc.probe_to_build.iter() - { - let entry = match (is_probe_nullable, is_build_nullable) { - (true, true) | (false, false) => result_block.get_by_offset(*index).clone(), - (true, false) => { - result_block.get_by_offset(*index).clone().remove_nullable() - } - (false, true) => { - let entry = result_block.get_by_offset(*index); - let col = entry.to_column(); - - match col.is_null() || col.is_nullable() { - true => entry.clone(), - false => BlockEntry::from(NullableColumn::new_column( - col, - Bitmap::new_constant(true, result_block.num_rows()), - )), - } - } - }; - - result_block.add_entry(entry); - } - } - - return Ok(Some(result_block)); - } - } -} - -pub struct FilterJoinStream<'a> { - inner: Box, - filter_executor: &'a mut FilterExecutor, -} - -impl<'a> FilterJoinStream<'a> { - pub fn create( - inner: Box, - filter_executor: &'a mut FilterExecutor, - ) -> Box { - Box::new(FilterJoinStream { - inner, - filter_executor, - }) - } -} - -impl<'a> JoinStream for FilterJoinStream<'a> { - fn next(&mut self) -> Result> { - loop { - let Some(data_block) = self.inner.next()? else { - return Ok(None); - }; - - if data_block.is_empty() { - continue; - } - - let data_block = self.filter_executor.filter(data_block)?; - - if data_block.is_empty() { - continue; - } - - return Ok(Some(data_block)); - } - } } diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/memory_state.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/basic_state.rs similarity index 95% rename from src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/memory_state.rs rename to src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/basic_state.rs index b741119941d10..e575e6f508dd1 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/memory_state.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/basic_state.rs @@ -22,7 +22,7 @@ use databend_common_expression::DataBlock; use crate::pipelines::processors::transforms::new_hash_join::common::CStyleCell; use crate::pipelines::processors::transforms::HashJoinHashTable; -pub struct HashJoinMemoryState { +pub struct BasicHashJoinState { pub mutex: Mutex<()>, pub build_rows: CStyleCell, pub chunks: CStyleCell>, @@ -34,9 +34,9 @@ pub struct HashJoinMemoryState { pub hash_table: CStyleCell, } -impl HashJoinMemoryState { +impl BasicHashJoinState { pub fn create() -> Self { - HashJoinMemoryState { + BasicHashJoinState { mutex: Mutex::new(()), build_rows: CStyleCell::new(0), chunks: CStyleCell::new(Vec::new()), diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/inner_join.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/inner_join.rs new file mode 100644 index 0000000000000..b876c103a459e --- /dev/null +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/inner_join.rs @@ -0,0 +1,287 @@ +// Copyright 2021 Datafuse Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::ops::Deref; +use std::sync::Arc; + +use databend_common_base::base::ProgressValues; +use databend_common_catalog::table_context::TableContext; +use databend_common_column::bitmap::Bitmap; +use databend_common_exception::ErrorCode; +use databend_common_exception::Result; +use databend_common_expression::types::NullableColumn; +use databend_common_expression::with_join_hash_method; +use databend_common_expression::BlockEntry; +use databend_common_expression::DataBlock; +use databend_common_expression::FilterExecutor; +use databend_common_expression::FunctionContext; +use databend_common_expression::HashMethodKind; + +use crate::pipelines::processors::transforms::new_hash_join::hashtable::basic::ProbeStream; +use crate::pipelines::processors::transforms::new_hash_join::hashtable::basic::ProbedRows; +use crate::pipelines::processors::transforms::new_hash_join::hashtable::ProbeData; +use crate::pipelines::processors::transforms::new_hash_join::join::EmptyJoinStream; +use crate::pipelines::processors::transforms::new_hash_join::join::Join; +use crate::pipelines::processors::transforms::new_hash_join::join::JoinStream; +use crate::pipelines::processors::transforms::new_hash_join::memory::basic::BasicHashJoin; +use crate::pipelines::processors::transforms::new_hash_join::memory::basic_state::BasicHashJoinState; +use crate::pipelines::processors::transforms::new_hash_join::performance::PerformanceContext; +use crate::pipelines::processors::transforms::HashJoinHashTable; +use crate::pipelines::processors::HashJoinDesc; +use crate::sessions::QueryContext; + +pub struct InnerHashJoin { + basic_hash_join: BasicHashJoin, + + desc: Arc, + function_ctx: FunctionContext, + basic_state: Arc, + performance_context: PerformanceContext, +} + +impl InnerHashJoin { + pub fn create( + ctx: &QueryContext, + function_ctx: FunctionContext, + method: HashMethodKind, + desc: Arc, + state: Arc, + ) -> Result { + let settings = ctx.get_settings(); + let block_size = settings.get_max_block_size()? as usize; + + let context = PerformanceContext::create(block_size, desc.clone(), function_ctx.clone()); + + let basic_hash_join = BasicHashJoin::create( + ctx, + function_ctx.clone(), + method, + desc.clone(), + state.clone(), + )?; + + Ok(InnerHashJoin { + desc, + basic_hash_join, + function_ctx, + basic_state: state, + performance_context: context, + }) + } +} + +impl Join for InnerHashJoin { + fn add_block(&mut self, data: Option) -> Result<()> { + self.basic_hash_join.add_block(data) + } + + fn final_build(&mut self) -> Result> { + self.basic_hash_join.final_build() + } + + fn probe_block(&mut self, data: DataBlock) -> Result> { + if data.is_empty() || *self.basic_state.build_rows == 0 { + return Ok(Box::new(EmptyJoinStream)); + } + + self.basic_hash_join.finalize_chunks(); + + let probe_keys = self.desc.probe_key(&data, &self.function_ctx)?; + + let mut keys = DataBlock::new(probe_keys, data.num_rows()); + let valids = match self.desc.from_correlated_subquery { + true => None, + false => self.desc.build_valids_by_keys(&keys)?, + }; + + self.desc.remove_keys_nullable(&mut keys); + let probe_block = data.project(&self.desc.probe_projections); + + let joined_stream = + with_join_hash_method!(|T| match self.basic_state.hash_table.deref() { + HashJoinHashTable::T(table) => { + let probe_hash_statistics = &mut self.performance_context.probe_hash_statistics; + probe_hash_statistics.clear(); + + let probe_data = ProbeData::new(keys, valids, probe_hash_statistics); + let probe_keys_stream = table.probe_matched(probe_data)?; + + Ok(InnerHashJoinStream::create( + probe_block, + self.basic_state.clone(), + probe_keys_stream, + self.desc.clone(), + &mut self.performance_context.probe_result, + )) + } + HashJoinHashTable::Null => Err(ErrorCode::AbortedQuery( + "Aborted query, because the hash table is uninitialized.", + )), + })?; + + match &mut self.performance_context.filter_executor { + None => Ok(joined_stream), + Some(filter_executor) => Ok(InnerHashJoinFilterStream::create( + joined_stream, + filter_executor, + )), + } + } +} + +struct InnerHashJoinStream<'a> { + desc: Arc, + probe_data_block: DataBlock, + join_state: Arc, + probe_keys_stream: Box, + probed_rows: &'a mut ProbedRows, +} + +unsafe impl<'a> Send for InnerHashJoinStream<'a> {} +unsafe impl<'a> Sync for InnerHashJoinStream<'a> {} + +impl<'a> InnerHashJoinStream<'a> { + pub fn create( + probe_data_block: DataBlock, + join_state: Arc, + probe_keys_stream: Box, + desc: Arc, + probed_rows: &'a mut ProbedRows, + ) -> Box { + Box::new(InnerHashJoinStream { + desc, + join_state, + probed_rows, + probe_data_block, + probe_keys_stream, + }) + } +} + +impl<'a> JoinStream for InnerHashJoinStream<'a> { + fn next(&mut self) -> Result> { + loop { + self.probed_rows.clear(); + let max_rows = self.probed_rows.matched_probe.capacity(); + self.probe_keys_stream.advance(self.probed_rows, max_rows)?; + + if self.probed_rows.is_empty() { + return Ok(None); + } + + if self.probed_rows.is_all_unmatched() { + continue; + } + + let probe_block = match self.probe_data_block.num_columns() { + 0 => None, + _ => Some(DataBlock::take( + &self.probe_data_block, + &self.probed_rows.matched_probe, + )?), + }; + + let build_block = match self.join_state.columns.is_empty() { + true => None, + false => { + let row_ptrs = self.probed_rows.matched_build.as_slice(); + Some(DataBlock::take_column_vec( + self.join_state.columns.as_slice(), + self.join_state.column_types.as_slice(), + row_ptrs, + row_ptrs.len(), + )) + } + }; + + let mut result_block = match (probe_block, build_block) { + (Some(mut probe_block), Some(build_block)) => { + probe_block.merge_block(build_block); + probe_block + } + (Some(probe_block), None) => probe_block, + (None, Some(build_block)) => build_block, + (None, None) => DataBlock::new(vec![], self.probed_rows.matched_build.len()), + }; + + if !self.desc.probe_to_build.is_empty() { + for (index, (is_probe_nullable, is_build_nullable)) in + self.desc.probe_to_build.iter() + { + let entry = match (is_probe_nullable, is_build_nullable) { + (true, true) | (false, false) => result_block.get_by_offset(*index).clone(), + (true, false) => { + result_block.get_by_offset(*index).clone().remove_nullable() + } + (false, true) => { + let entry = result_block.get_by_offset(*index); + let col = entry.to_column(); + + match col.is_null() || col.is_nullable() { + true => entry.clone(), + false => BlockEntry::from(NullableColumn::new_column( + col, + Bitmap::new_constant(true, result_block.num_rows()), + )), + } + } + }; + + result_block.add_entry(entry); + } + } + + return Ok(Some(result_block)); + } + } +} + +struct InnerHashJoinFilterStream<'a> { + inner: Box, + filter_executor: &'a mut FilterExecutor, +} + +impl<'a> InnerHashJoinFilterStream<'a> { + pub fn create( + inner: Box, + filter_executor: &'a mut FilterExecutor, + ) -> Box { + Box::new(InnerHashJoinFilterStream { + inner, + filter_executor, + }) + } +} + +impl<'a> JoinStream for InnerHashJoinFilterStream<'a> { + fn next(&mut self) -> Result> { + loop { + let Some(data_block) = self.inner.next()? else { + return Ok(None); + }; + + if data_block.is_empty() { + continue; + } + + let data_block = self.filter_executor.filter(data_block)?; + + if data_block.is_empty() { + continue; + } + + return Ok(Some(data_block)); + } + } +} diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/mod.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/mod.rs index f17afe74b1a9c..4979c37245fca 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/mod.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/mod.rs @@ -12,8 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. -mod memory_inner_join; -mod memory_state; +mod basic; +mod basic_state; +mod inner_join; +pub mod outer_left_join; -pub use memory_inner_join::MemoryInnerJoin; -pub use memory_state::HashJoinMemoryState; +pub use basic_state::BasicHashJoinState; +pub use inner_join::InnerHashJoin; diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/outer_left_join.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/outer_left_join.rs new file mode 100644 index 0000000000000..8439941ec1605 --- /dev/null +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/outer_left_join.rs @@ -0,0 +1,299 @@ +// Copyright 2021 Datafuse Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::ops::Deref; +use std::sync::Arc; + +use databend_common_base::base::ProgressValues; +use databend_common_catalog::table_context::TableContext; +use databend_common_column::bitmap::Bitmap; +use databend_common_exception::ErrorCode; +use databend_common_exception::Result; +use databend_common_expression::types::NullableColumn; +use databend_common_expression::with_join_hash_method; +use databend_common_expression::BlockEntry; +use databend_common_expression::DataBlock; +use databend_common_expression::FilterExecutor; +use databend_common_expression::FunctionContext; +use databend_common_expression::HashMethodKind; +use databend_common_expression::Scalar; + +use crate::pipelines::processors::transforms::new_hash_join::hashtable::basic::ProbeStream; +use crate::pipelines::processors::transforms::new_hash_join::hashtable::basic::ProbedRows; +use crate::pipelines::processors::transforms::new_hash_join::hashtable::ProbeData; +use crate::pipelines::processors::transforms::new_hash_join::join::EmptyJoinStream; +use crate::pipelines::processors::transforms::new_hash_join::join::JoinStream; +use crate::pipelines::processors::transforms::new_hash_join::memory::basic::BasicHashJoin; +use crate::pipelines::processors::transforms::new_hash_join::performance::PerformanceContext; +use crate::pipelines::processors::transforms::wrap_true_validity; +use crate::pipelines::processors::transforms::BasicHashJoinState; +use crate::pipelines::processors::transforms::HashJoinHashTable; +use crate::pipelines::processors::transforms::Join; +use crate::pipelines::processors::HashJoinDesc; +use crate::sessions::QueryContext; + +pub struct OuterLeftHashJoin { + basic_hash_join: BasicHashJoin, + + desc: Arc, + function_ctx: FunctionContext, + basic_state: Arc, + performance_context: PerformanceContext, +} + +impl OuterLeftHashJoin { + pub fn create( + ctx: &QueryContext, + function_ctx: FunctionContext, + method: HashMethodKind, + desc: Arc, + state: Arc, + ) -> Result { + let settings = ctx.get_settings(); + let block_size = settings.get_max_block_size()? as usize; + + let context = PerformanceContext::create(block_size, desc.clone(), function_ctx.clone()); + + let basic_hash_join = BasicHashJoin::create( + ctx, + function_ctx.clone(), + method, + desc.clone(), + state.clone(), + )?; + + Ok(OuterLeftHashJoin { + desc, + basic_hash_join, + function_ctx, + basic_state: state, + performance_context: context, + }) + } +} + +impl Join for OuterLeftHashJoin { + fn add_block(&mut self, data: Option) -> Result<()> { + self.basic_hash_join.add_block(data) + } + + fn final_build(&mut self) -> Result> { + self.basic_hash_join.final_build() + } + + fn probe_block(&mut self, data: DataBlock) -> Result> { + if data.is_empty() { + return Ok(Box::new(EmptyJoinStream)); + } + + self.basic_hash_join.finalize_chunks(); + + let probe_keys = self.desc.probe_key(&data, &self.function_ctx)?; + + let mut keys = DataBlock::new(probe_keys, data.num_rows()); + let valids = match self.desc.from_correlated_subquery { + true => None, + false => self.desc.build_valids_by_keys(&keys)?, + }; + + self.desc.remove_keys_nullable(&mut keys); + let probe_block = data.project(&self.desc.probe_projections); + + let probe_stream = with_join_hash_method!(|T| match self.basic_state.hash_table.deref() { + HashJoinHashTable::T(table) => { + let probe_hash_statistics = &mut self.performance_context.probe_hash_statistics; + probe_hash_statistics.clear(); + + let probe_data = ProbeData::new(keys, valids, probe_hash_statistics); + table.probe(probe_data) + } + HashJoinHashTable::Null => Err(ErrorCode::AbortedQuery( + "Aborted query, because the hash table is uninitialized.", + )), + })?; + + match self.performance_context.filter_executor.as_ref() { + None => {} + Some(_) => {} + }; + + Ok(OuterLeftHashJoinStream::create( + probe_block, + self.basic_state.clone(), + probe_stream, + self.desc.clone(), + &mut self.performance_context.probe_result, + )) + } +} + +struct OuterLeftHashJoinStream<'a> { + desc: Arc, + probe_data_block: DataBlock, + join_state: Arc, + probe_keys_stream: Box, + probed_rows: &'a mut ProbedRows, + unmatched_rows: Vec, +} + +unsafe impl<'a> Send for OuterLeftHashJoinStream<'a> {} +unsafe impl<'a> Sync for OuterLeftHashJoinStream<'a> {} + +impl<'a> OuterLeftHashJoinStream<'a> { + pub fn create( + probe_data_block: DataBlock, + join_state: Arc, + probe_keys_stream: Box, + desc: Arc, + probed_rows: &'a mut ProbedRows, + ) -> Box { + Box::new(OuterLeftHashJoinStream { + desc, + join_state, + probed_rows, + probe_data_block, + probe_keys_stream, + unmatched_rows: vec![], + }) + } +} + +impl<'a> JoinStream for OuterLeftHashJoinStream<'a> { + fn next(&mut self) -> Result> { + loop { + self.probed_rows.clear(); + let max_rows = self.probed_rows.matched_probe.capacity(); + self.probe_keys_stream.advance(self.probed_rows, max_rows)?; + + if !self.probed_rows.unmatched.is_empty() { + eprintln!("unmatched rows: {:?}", self.probed_rows.unmatched); + self.unmatched_rows + .extend_from_slice(&self.probed_rows.unmatched); + } + + if self.probed_rows.is_empty() { + if self.unmatched_rows.is_empty() { + return Ok(None); + } + + let unmatched = std::mem::take(&mut self.unmatched_rows); + let probe_block = match self.probe_data_block.num_columns() { + 0 => None, + _ => Some(DataBlock::take(&self.probe_data_block, &unmatched)?), + }; + + let build_block = match self.join_state.columns.is_empty() { + true => None, + false => { + let columns = self + .join_state + .column_types + .iter() + .map(|column_type| { + BlockEntry::new_const_column( + column_type.wrap_nullable(), + Scalar::Null, + unmatched.len(), + ) + }) + .collect::>(); + Some(DataBlock::new(columns, unmatched.len())) + } + }; + + return Ok(Some(self.final_result_block(probe_block, build_block))); + } + + if self.probed_rows.matched_probe.is_empty() { + continue; + } + + let probe_block = match self.probe_data_block.num_columns() { + 0 => None, + _ => Some(DataBlock::take( + &self.probe_data_block, + &self.probed_rows.matched_probe, + )?), + }; + + let build_block = match self.join_state.columns.is_empty() { + true => None, + false => { + let row_ptrs = self.probed_rows.matched_build.as_slice(); + let build_block1 = DataBlock::take_column_vec( + self.join_state.columns.as_slice(), + self.join_state.column_types.as_slice(), + row_ptrs, + row_ptrs.len(), + ); + + let true_validity = Bitmap::new_constant(true, row_ptrs.len()); + let entries = build_block1 + .columns() + .iter() + .map(|c| wrap_true_validity(c, row_ptrs.len(), &true_validity)); + Some(DataBlock::from_iter(entries, row_ptrs.len())) + } + }; + + return Ok(Some(self.final_result_block(probe_block, build_block))); + } + } +} + +impl<'a> OuterLeftHashJoinStream<'a> { + fn final_result_block( + &mut self, + probe_block: Option, + build_block: Option, + ) -> DataBlock { + let mut result_block = match (probe_block, build_block) { + (Some(mut probe_block), Some(build_block)) => { + probe_block.merge_block(build_block); + probe_block + } + (Some(probe_block), None) => probe_block, + (None, Some(build_block)) => build_block, + (None, None) => DataBlock::new(vec![], self.probed_rows.matched_build.len()), + }; + + if !self.desc.probe_to_build.is_empty() { + for (index, (is_probe_nullable, is_build_nullable)) in self.desc.probe_to_build.iter() { + let entry = match (is_probe_nullable, is_build_nullable) { + (true, true) | (false, false) => result_block.get_by_offset(*index).clone(), + (true, false) => result_block.get_by_offset(*index).clone().remove_nullable(), + (false, true) => { + let entry = result_block.get_by_offset(*index); + let col = entry.to_column(); + + match col.is_null() || col.is_nullable() { + true => entry.clone(), + false => BlockEntry::from(NullableColumn::new_column( + col, + Bitmap::new_constant(true, result_block.num_rows()), + )), + } + } + }; + + result_block.add_entry(entry); + } + } + result_block + } +} + +impl<'a> OuterLeftHashJoinStream<'a> {} + +impl<'a> OuterLeftHashJoinStream<'a> {} diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/mod.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/mod.rs index 04f8329eb5c12..bab153f286d80 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/mod.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/mod.rs @@ -15,13 +15,13 @@ mod common; mod hashtable; mod join; -mod memory; +pub mod memory; mod performance; mod runtime_filter; mod transform_hash_join; pub use join::Join; -pub use memory::HashJoinMemoryState; -pub use memory::MemoryInnerJoin; +pub use memory::BasicHashJoinState; +pub use memory::InnerHashJoin; pub use runtime_filter::PlanRuntimeFilterDesc; pub use transform_hash_join::TransformHashJoin; diff --git a/src/query/service/src/pipelines/processors/transforms/transform_cache_scan.rs b/src/query/service/src/pipelines/processors/transforms/transform_cache_scan.rs index 03a6cc782bcd4..f1cd0333ccf9f 100644 --- a/src/query/service/src/pipelines/processors/transforms/transform_cache_scan.rs +++ b/src/query/service/src/pipelines/processors/transforms/transform_cache_scan.rs @@ -23,7 +23,7 @@ use databend_common_pipeline_core::processors::ProcessorPtr; use databend_common_pipeline_sources::AsyncSource; use databend_common_pipeline_sources::AsyncSourcer; -use crate::pipelines::processors::transforms::HashJoinMemoryState; +use crate::pipelines::processors::transforms::BasicHashJoinState; use crate::pipelines::processors::HashJoinState; use crate::sessions::QueryContext; @@ -110,14 +110,14 @@ impl HashJoinCacheState { #[derive(Clone)] pub struct NewHashJoinCacheState { idx: usize, - memory_state: Arc, + memory_state: Arc, column_indexes: Vec, } impl NewHashJoinCacheState { pub fn new( column_indexes: Vec, - memory_state: Arc, + memory_state: Arc, ) -> NewHashJoinCacheState { NewHashJoinCacheState { idx: 0, From 788f8b9d28a01fc3e0e39a96d6effe32c0b01d19 Mon Sep 17 00:00:00 2001 From: zhang2014 Date: Thu, 2 Oct 2025 11:55:11 +0800 Subject: [PATCH 04/24] refactor(query): refactor left outer join to new join --- .../processors/transforms/hash_join/desc.rs | 3 + .../transforms/new_hash_join/join.rs | 8 + .../new_hash_join/memory/outer_left_join.rs | 153 ++++++++++-------- 3 files changed, 98 insertions(+), 66 deletions(-) diff --git a/src/query/service/src/pipelines/processors/transforms/hash_join/desc.rs b/src/query/service/src/pipelines/processors/transforms/hash_join/desc.rs index c443b35806e18..a9f0226a07c7e 100644 --- a/src/query/service/src/pipelines/processors/transforms/hash_join/desc.rs +++ b/src/query/service/src/pipelines/processors/transforms/hash_join/desc.rs @@ -19,6 +19,7 @@ use databend_common_expression::type_check::check_function; use databend_common_expression::BlockEntry; use databend_common_expression::Constant; use databend_common_expression::DataBlock; +use databend_common_expression::DataSchemaRef; use databend_common_expression::Evaluator; use databend_common_expression::Expr; use databend_common_expression::FunctionContext; @@ -60,6 +61,7 @@ pub struct HashJoinDesc { pub(crate) build_projection: ColumnSet, pub(crate) probe_projections: ColumnSet, pub(crate) probe_to_build: Vec<(usize, (bool, bool))>, + pub(crate) build_schema: DataSchemaRef, } #[derive(Debug, Clone)] @@ -130,6 +132,7 @@ impl HashJoinDesc { probe_to_build: join.probe_to_build.clone(), build_projection: join.build_projections.clone(), probe_projections: join.probe_projections.clone(), + build_schema: join.build.output_schema()?, }) } diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/join.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/join.rs index c5fece947030b..24d53b0b25c58 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/join.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/join.rs @@ -45,3 +45,11 @@ impl JoinStream for EmptyJoinStream { Ok(None) } } + +pub struct OneBlockJoinStream(pub Option); + +impl JoinStream for OneBlockJoinStream { + fn next(&mut self) -> Result> { + Ok(self.0.take()) + } +} diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/outer_left_join.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/outer_left_join.rs index 8439941ec1605..e4dadedd0ac12 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/outer_left_join.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/outer_left_join.rs @@ -20,11 +20,11 @@ use databend_common_catalog::table_context::TableContext; use databend_common_column::bitmap::Bitmap; use databend_common_exception::ErrorCode; use databend_common_exception::Result; +use databend_common_expression::types::DataType; use databend_common_expression::types::NullableColumn; use databend_common_expression::with_join_hash_method; use databend_common_expression::BlockEntry; use databend_common_expression::DataBlock; -use databend_common_expression::FilterExecutor; use databend_common_expression::FunctionContext; use databend_common_expression::HashMethodKind; use databend_common_expression::Scalar; @@ -34,6 +34,7 @@ use crate::pipelines::processors::transforms::new_hash_join::hashtable::basic::P use crate::pipelines::processors::transforms::new_hash_join::hashtable::ProbeData; use crate::pipelines::processors::transforms::new_hash_join::join::EmptyJoinStream; use crate::pipelines::processors::transforms::new_hash_join::join::JoinStream; +use crate::pipelines::processors::transforms::new_hash_join::join::OneBlockJoinStream; use crate::pipelines::processors::transforms::new_hash_join::memory::basic::BasicHashJoin; use crate::pipelines::processors::transforms::new_hash_join::performance::PerformanceContext; use crate::pipelines::processors::transforms::wrap_true_validity; @@ -97,6 +98,23 @@ impl Join for OuterLeftHashJoin { return Ok(Box::new(EmptyJoinStream)); } + if *self.basic_state.build_rows == 0 { + let num_rows = data.num_rows(); + + let types = self + .desc + .build_schema + .fields + .iter() + .map(|x| x.data_type().clone()) + .collect::>(); + + let build_block = null_build_block(&types, data.num_rows()); + let probe_block = Some(data.project(&self.desc.probe_projections)); + let result_block = final_result_block(&self.desc, probe_block, build_block, num_rows); + return Ok(Box::new(OneBlockJoinStream(Some(result_block)))); + } + self.basic_hash_join.finalize_chunks(); let probe_keys = self.desc.probe_key(&data, &self.function_ctx)?; @@ -123,11 +141,6 @@ impl Join for OuterLeftHashJoin { )), })?; - match self.performance_context.filter_executor.as_ref() { - None => {} - Some(_) => {} - }; - Ok(OuterLeftHashJoinStream::create( probe_block, self.basic_state.clone(), @@ -177,7 +190,6 @@ impl<'a> JoinStream for OuterLeftHashJoinStream<'a> { self.probe_keys_stream.advance(self.probed_rows, max_rows)?; if !self.probed_rows.unmatched.is_empty() { - eprintln!("unmatched rows: {:?}", self.probed_rows.unmatched); self.unmatched_rows .extend_from_slice(&self.probed_rows.unmatched); } @@ -193,26 +205,15 @@ impl<'a> JoinStream for OuterLeftHashJoinStream<'a> { _ => Some(DataBlock::take(&self.probe_data_block, &unmatched)?), }; - let build_block = match self.join_state.columns.is_empty() { - true => None, - false => { - let columns = self - .join_state - .column_types - .iter() - .map(|column_type| { - BlockEntry::new_const_column( - column_type.wrap_nullable(), - Scalar::Null, - unmatched.len(), - ) - }) - .collect::>(); - Some(DataBlock::new(columns, unmatched.len())) - } - }; + let types = &self.join_state.column_types; + let build_block = null_build_block(types, unmatched.len()); - return Ok(Some(self.final_result_block(probe_block, build_block))); + return Ok(Some(final_result_block( + &self.desc, + probe_block, + build_block, + unmatched.len(), + ))); } if self.probed_rows.matched_probe.is_empty() { @@ -247,53 +248,73 @@ impl<'a> JoinStream for OuterLeftHashJoinStream<'a> { } }; - return Ok(Some(self.final_result_block(probe_block, build_block))); + return Ok(Some(final_result_block( + &self.desc, + probe_block, + build_block, + self.probed_rows.matched_build.len(), + ))); } } } -impl<'a> OuterLeftHashJoinStream<'a> { - fn final_result_block( - &mut self, - probe_block: Option, - build_block: Option, - ) -> DataBlock { - let mut result_block = match (probe_block, build_block) { - (Some(mut probe_block), Some(build_block)) => { - probe_block.merge_block(build_block); - probe_block - } - (Some(probe_block), None) => probe_block, - (None, Some(build_block)) => build_block, - (None, None) => DataBlock::new(vec![], self.probed_rows.matched_build.len()), - }; - - if !self.desc.probe_to_build.is_empty() { - for (index, (is_probe_nullable, is_build_nullable)) in self.desc.probe_to_build.iter() { - let entry = match (is_probe_nullable, is_build_nullable) { - (true, true) | (false, false) => result_block.get_by_offset(*index).clone(), - (true, false) => result_block.get_by_offset(*index).clone().remove_nullable(), - (false, true) => { - let entry = result_block.get_by_offset(*index); - let col = entry.to_column(); - - match col.is_null() || col.is_nullable() { - true => entry.clone(), - false => BlockEntry::from(NullableColumn::new_column( - col, - Bitmap::new_constant(true, result_block.num_rows()), - )), - } +fn final_result_block( + desc: &HashJoinDesc, + probe_block: Option, + build_block: Option, + num_rows: usize, +) -> DataBlock { + let mut result_block = match (probe_block, build_block) { + (Some(mut probe_block), Some(build_block)) => { + probe_block.merge_block(build_block); + probe_block + } + (Some(probe_block), None) => probe_block, + (None, Some(build_block)) => build_block, + (None, None) => DataBlock::new(vec![], num_rows), + }; + + if !desc.probe_to_build.is_empty() { + for (index, (is_probe_nullable, is_build_nullable)) in desc.probe_to_build.iter() { + let entry = match (is_probe_nullable, is_build_nullable) { + (true, true) | (false, false) => result_block.get_by_offset(*index).clone(), + (true, false) => result_block.get_by_offset(*index).clone().remove_nullable(), + (false, true) => { + let entry = result_block.get_by_offset(*index); + let col = entry.to_column(); + + match col.is_null() || col.is_nullable() { + true => entry.clone(), + false => BlockEntry::from(NullableColumn::new_column( + col, + Bitmap::new_constant(true, result_block.num_rows()), + )), } - }; + } + }; - result_block.add_entry(entry); - } + result_block.add_entry(entry); } - result_block } + result_block } -impl<'a> OuterLeftHashJoinStream<'a> {} - -impl<'a> OuterLeftHashJoinStream<'a> {} +fn null_build_block(types: &[DataType], num_rows: usize) -> Option { + match types.is_empty() { + true => None, + false => { + let columns = types + .iter() + .map(|column_type| { + BlockEntry::new_const_column( + column_type.wrap_nullable(), + Scalar::Null, + num_rows, + ) + }) + .collect::>(); + + Some(DataBlock::new(columns, num_rows)) + } + } +} From 3743194fa1463029d3bafdbba9bad4c5a8446da5 Mon Sep 17 00:00:00 2001 From: zhang2014 Date: Thu, 2 Oct 2025 11:56:45 +0800 Subject: [PATCH 05/24] refactor(query): enable experimental new hash join setting --- src/query/settings/src/settings_default.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/query/settings/src/settings_default.rs b/src/query/settings/src/settings_default.rs index 6e321fa62aee8..d07eed2d0db19 100644 --- a/src/query/settings/src/settings_default.rs +++ b/src/query/settings/src/settings_default.rs @@ -1481,7 +1481,7 @@ impl DefaultSettings { range: Some(SettingRange::Numeric(0..=1)), }), ("enable_experimental_new_join", DefaultSettingValue { - value: UserSettingValue::UInt64(0), + value: UserSettingValue::UInt64(1), desc: "Enables the experimental new join implement", mode: SettingMode::Both, scope: SettingScope::Both, From ecea08f2bc09fb7b7206b5b5d1c0c0747cd94843 Mon Sep 17 00:00:00 2001 From: zhang2014 Date: Thu, 2 Oct 2025 18:52:16 +0800 Subject: [PATCH 06/24] refactor(query): refactor left outer join to new join --- src/query/service/src/physical_plans/physical_hash_join.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/query/service/src/physical_plans/physical_hash_join.rs b/src/query/service/src/physical_plans/physical_hash_join.rs index 6fafedcebec87..54b3e6cb17ea8 100644 --- a/src/query/service/src/physical_plans/physical_hash_join.rs +++ b/src/query/service/src/physical_plans/physical_hash_join.rs @@ -262,7 +262,10 @@ impl IPhysicalPlan for HashJoin { let (enable_optimization, _) = builder.merge_into_get_optimization_flag(self); if desc.single_to_inner.is_none() - && (self.join_type == JoinType::Inner || self.join_type == JoinType::Left) + && ( + self.join_type == JoinType::Inner + // || self.join_type == JoinType::Left + ) && experimental_new_join && !enable_optimization { From 7f53d75b69152be56618043a7533b7be4cf147aa Mon Sep 17 00:00:00 2001 From: zhang2014 Date: Thu, 2 Oct 2025 20:25:25 +0800 Subject: [PATCH 07/24] refactor(query): refactor left outer join to new join --- src/common/hashtable/src/hashjoin_hashtable.rs | 14 +++++++------- .../hashtable/src/hashjoin_string_hashtable.rs | 14 +++++++------- .../hash_join/probe_join/left_anti_join.rs | 9 +++++---- .../hash_join/probe_join/left_join.rs | 18 ++++++++---------- .../new_hash_join/hashtable/fixed_keys.rs | 10 +++++----- .../new_hash_join/hashtable/serialize_keys.rs | 16 ++++++++-------- 6 files changed, 40 insertions(+), 41 deletions(-) diff --git a/src/common/hashtable/src/hashjoin_hashtable.rs b/src/common/hashtable/src/hashjoin_hashtable.rs index a26d840c4f7c0..d18f75c85f8d5 100644 --- a/src/common/hashtable/src/hashjoin_hashtable.rs +++ b/src/common/hashtable/src/hashjoin_hashtable.rs @@ -237,14 +237,14 @@ where let header = self.pointers[(*hash >> self.hash_shift) as usize]; if header != 0 && early_filtering(header, *hash) { *hash = remove_header_tag(header); - assume(matched_selection.len() <= matched_selection.capacity()); + assume(matched_selection.len() < matched_selection.capacity()); matched_selection.push(idx as u32); } else { - assume(unmatched_selection.len() <= unmatched_selection.capacity()); + assume(unmatched_selection.len() < unmatched_selection.capacity()); unmatched_selection.push(idx as u32); } } else { - assume(unmatched_selection.len() <= unmatched_selection.capacity()); + assume(unmatched_selection.len() < unmatched_selection.capacity()); unmatched_selection.push(idx as u32); } }, @@ -255,10 +255,10 @@ where let header = self.pointers[(*hash >> self.hash_shift) as usize]; if header != 0 && early_filtering(header, *hash) { *hash = remove_header_tag(header); - assume(matched_selection.len() <= matched_selection.capacity()); + assume(matched_selection.len() < matched_selection.capacity()); matched_selection.push(idx as u32); } else { - assume(unmatched_selection.len() <= unmatched_selection.capacity()); + assume(unmatched_selection.len() < unmatched_selection.capacity()); unmatched_selection.push(idx as u32); } }); @@ -290,7 +290,7 @@ where let header = self.pointers[(*hash >> self.hash_shift) as usize]; if header != 0 && early_filtering(header, *hash) { *hash = remove_header_tag(header); - assume(selection.len() <= selection.capacity()); + assume(selection.len() < selection.capacity()); selection.push(idx as u32); } } @@ -303,7 +303,7 @@ where let header = self.pointers[(*hash >> self.hash_shift) as usize]; if header != 0 && early_filtering(header, *hash) { *hash = remove_header_tag(header); - assume(selection.len() <= selection.capacity()); + assume(selection.len() < selection.capacity()); selection.push(idx as u32); } } diff --git a/src/common/hashtable/src/hashjoin_string_hashtable.rs b/src/common/hashtable/src/hashjoin_string_hashtable.rs index c92372f25fbd7..52a74d173dc67 100644 --- a/src/common/hashtable/src/hashjoin_string_hashtable.rs +++ b/src/common/hashtable/src/hashjoin_string_hashtable.rs @@ -165,14 +165,14 @@ where A: Allocator + Clone + 'static let header = self.pointers[(*hash >> self.hash_shift) as usize]; if header != 0 && early_filtering(header, *hash) { *hash = remove_header_tag(header); - assume(matched_selection.len() <= matched_selection.capacity()); + assume(matched_selection.len() < matched_selection.capacity()); matched_selection.push(idx as u32); } else { - assume(unmatched_selection.len() <= unmatched_selection.capacity()); + assume(unmatched_selection.len() < unmatched_selection.capacity()); unmatched_selection.push(idx as u32); } } else { - assume(unmatched_selection.len() <= unmatched_selection.capacity()); + assume(unmatched_selection.len() < unmatched_selection.capacity()); unmatched_selection.push(idx as u32); } }); @@ -182,10 +182,10 @@ where A: Allocator + Clone + 'static let header = self.pointers[(*hash >> self.hash_shift) as usize]; if header != 0 && early_filtering(header, *hash) { *hash = remove_header_tag(header); - assume(matched_selection.len() <= matched_selection.capacity()); + assume(matched_selection.len() < matched_selection.capacity()); matched_selection.push(idx as u32); } else { - assume(unmatched_selection.len() <= unmatched_selection.capacity()); + assume(unmatched_selection.len() < unmatched_selection.capacity()); unmatched_selection.push(idx as u32); } }); @@ -217,7 +217,7 @@ where A: Allocator + Clone + 'static let header = self.pointers[(*hash >> self.hash_shift) as usize]; if header != 0 && early_filtering(header, *hash) { *hash = remove_header_tag(header); - assume(selection.len() <= selection.capacity()); + assume(selection.len() < selection.capacity()); selection.push(idx as u32); } } @@ -228,7 +228,7 @@ where A: Allocator + Clone + 'static let header = self.pointers[(*hash >> self.hash_shift) as usize]; if header != 0 && early_filtering(header, *hash) { *hash = remove_header_tag(header); - assume(selection.len() <= selection.capacity()); + assume(selection.len() < selection.capacity()); selection.push(idx as u32); } }); diff --git a/src/query/service/src/pipelines/processors/transforms/hash_join/probe_join/left_anti_join.rs b/src/query/service/src/pipelines/processors/transforms/hash_join/probe_join/left_anti_join.rs index 9492fb70cdd42..bbef9410b5bae 100644 --- a/src/query/service/src/pipelines/processors/transforms/hash_join/probe_join/left_anti_join.rs +++ b/src/query/service/src/pipelines/processors/transforms/hash_join/probe_join/left_anti_join.rs @@ -14,6 +14,7 @@ use std::sync::atomic::Ordering; +use databend_common_base::hints::assume; use databend_common_exception::ErrorCode; use databend_common_exception::Result; use databend_common_expression::DataBlock; @@ -47,17 +48,17 @@ impl HashJoinProbeState { let (probe_indexes, count) = if probe_state.probe_with_selection { // Safe to unwrap. let probe_unmatched_indexes = probe_state.probe_unmatched_indexes.as_mut().unwrap(); - let mut unmatched_idx = probe_state.probe_unmatched_indexes_count; let selection = &probe_state.selection.as_slice()[0..probe_state.selection_count]; for idx in selection.iter() { let key = unsafe { keys.key_unchecked(*idx as usize) }; let ptr = unsafe { *pointers.get_unchecked(*idx as usize) }; if !hash_table.next_contains(key, ptr) { - unsafe { *probe_unmatched_indexes.get_unchecked_mut(unmatched_idx) = *idx }; - unmatched_idx += 1; + assume(probe_unmatched_indexes.len() < probe_unmatched_indexes.capacity()); + probe_unmatched_indexes.push(*idx); } } - (probe_unmatched_indexes, unmatched_idx) + let unmatched_count = probe_unmatched_indexes.len(); + (probe_unmatched_indexes, unmatched_count) } else { let mutable_indexes = &mut probe_state.mutable_indexes; let probe_indexes = &mut mutable_indexes.probe_indexes; diff --git a/src/query/service/src/pipelines/processors/transforms/hash_join/probe_join/left_join.rs b/src/query/service/src/pipelines/processors/transforms/hash_join/probe_join/left_join.rs index b545728f60c6e..a587fc24e7835 100644 --- a/src/query/service/src/pipelines/processors/transforms/hash_join/probe_join/left_join.rs +++ b/src/query/service/src/pipelines/processors/transforms/hash_join/probe_join/left_join.rs @@ -14,6 +14,7 @@ use std::sync::atomic::Ordering; +use databend_common_base::hints::assume; use databend_common_exception::ErrorCode; use databend_common_exception::Result; use databend_common_expression::BlockEntry; @@ -61,11 +62,9 @@ impl HashJoinProbeState { // Results. let mut matched_idx = 0; - let mut unmatched_idx = 0; let mut result_blocks = vec![]; if probe_state.probe_with_selection { - unmatched_idx = probe_state.probe_unmatched_indexes_count; let selection = probe_state.selection.as_slice(); for selection_idx in process_state.next_idx..probe_state.selection_count { let key_idx = unsafe { *selection.get_unchecked(selection_idx) }; @@ -88,8 +87,8 @@ impl HashJoinProbeState { matched_idx += 1; } } else { - unsafe { *probe_unmatched_indexes.get_unchecked_mut(unmatched_idx) = key_idx }; - unmatched_idx += 1; + assume(probe_unmatched_indexes.len() < probe_unmatched_indexes.capacity()); + probe_unmatched_indexes.push(key_idx); } if matched_idx == max_block_size { @@ -107,6 +106,7 @@ impl HashJoinProbeState { } } } else { + probe_unmatched_indexes.clear(); // Probe hash table and generate data blocks. for key_idx in process_state.next_idx..process_state.input.num_rows() { let key = unsafe { keys.key_unchecked(key_idx) }; @@ -128,10 +128,8 @@ impl HashJoinProbeState { matched_idx += 1; } } else { - unsafe { - *probe_unmatched_indexes.get_unchecked_mut(unmatched_idx) = key_idx as u32 - }; - unmatched_idx += 1; + assume(probe_unmatched_indexes.len() < probe_unmatched_indexes.capacity()); + probe_unmatched_indexes.push(key_idx as u32); } if matched_idx == max_block_size { @@ -166,9 +164,9 @@ impl HashJoinProbeState { )?; } - if unmatched_idx > 0 { + if !probe_unmatched_indexes.is_empty() { result_blocks.push(self.process_left_or_full_join_null_block( - unmatched_idx, + probe_unmatched_indexes.len(), &process_state.input, probe_unmatched_indexes, &mut probe_state.generation_state, diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/hashtable/fixed_keys.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/hashtable/fixed_keys.rs index 56593e993a03f..0cce33014a333 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/hashtable/fixed_keys.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/hashtable/fixed_keys.rs @@ -196,8 +196,8 @@ impl ProbeStream fn advance(&mut self, res: &mut ProbedRows, max_rows: usize) -> Result<()> { while self.key_idx < self.keys.len() { assume(res.matched_probe.len() == res.matched_build.len()); - assume(res.matched_build.len() <= res.matched_build.capacity()); - assume(res.matched_probe.len() <= res.matched_probe.capacity()); + assume(res.matched_build.len() < res.matched_build.capacity()); + assume(res.matched_probe.len() < res.matched_probe.capacity()); assume(self.key_idx < self.pointers.len()); if res.matched_probe.len() == max_rows { @@ -289,10 +289,10 @@ impl<'a, Key: FixedKey + HashtableKeyable, const MATCHED: bool> ProbeStream while self.idx < self.selections.len() { let key_idx = self.selections[self.idx] as usize; - assume(res.unmatched.len() <= res.unmatched.capacity()); + assume(res.unmatched.len() < res.unmatched.capacity()); assume(res.matched_probe.len() == res.matched_build.len()); - assume(res.matched_build.len() <= res.matched_build.capacity()); - assume(res.matched_probe.len() <= res.matched_probe.capacity()); + assume(res.matched_build.len() < res.matched_build.capacity()); + assume(res.matched_probe.len() < res.matched_probe.capacity()); assume(key_idx < self.pointers.len()); if res.matched_probe.len() == max_rows { diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/hashtable/serialize_keys.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/hashtable/serialize_keys.rs index bed09ff5c90c9..883daa09449cd 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/hashtable/serialize_keys.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/hashtable/serialize_keys.rs @@ -218,11 +218,11 @@ impl BinaryKeyProbeStream { impl ProbeStream for BinaryKeyProbeStream { fn advance(&mut self, res: &mut ProbedRows, max_rows: usize) -> Result<()> { while self.key_idx < self.keys.len() { - assume(res.unmatched.len() <= res.unmatched.capacity()); + assume(res.unmatched.len() < res.unmatched.capacity()); assume(res.matched_probe.len() == res.matched_build.len()); - assume(res.matched_build.len() <= res.matched_build.capacity()); - assume(res.matched_probe.len() <= res.matched_probe.capacity()); - assume(self.key_idx <= self.pointers.len()); + assume(res.matched_build.len() < res.matched_build.capacity()); + assume(res.matched_probe.len() < res.matched_probe.capacity()); + assume(self.key_idx < self.pointers.len()); if res.matched_probe.len() == max_rows { break; @@ -321,11 +321,11 @@ impl<'a, const MATCHED: bool> ProbeStream for EarlyFilteringProbeStream<'a, MATC while self.idx < self.selections.len() { let key_idx = self.selections[self.idx] as usize; - assume(res.unmatched.len() <= res.unmatched.capacity()); + assume(res.unmatched.len() < res.unmatched.capacity()); assume(res.matched_probe.len() == res.matched_build.len()); - assume(res.matched_build.len() <= res.matched_build.capacity()); - assume(res.matched_probe.len() <= res.matched_probe.capacity()); - assume(key_idx <= self.pointers.len()); + assume(res.matched_build.len() < res.matched_build.capacity()); + assume(res.matched_probe.len() < res.matched_probe.capacity()); + assume(key_idx < self.pointers.len()); if res.matched_probe.len() == max_rows { break; From 8c7c396a928fbce76db0074ef739a28de6a0d8a0 Mon Sep 17 00:00:00 2001 From: zhang2014 Date: Thu, 2 Oct 2025 21:14:54 +0800 Subject: [PATCH 08/24] refactor(query): refactor left outer join to new join --- .../transforms/hash_join/probe_join/left_anti_join.rs | 4 ++++ .../processors/transforms/hash_join/probe_join/left_join.rs | 2 ++ 2 files changed, 6 insertions(+) diff --git a/src/query/service/src/pipelines/processors/transforms/hash_join/probe_join/left_anti_join.rs b/src/query/service/src/pipelines/processors/transforms/hash_join/probe_join/left_anti_join.rs index bbef9410b5bae..dc93f4709506f 100644 --- a/src/query/service/src/pipelines/processors/transforms/hash_join/probe_join/left_anti_join.rs +++ b/src/query/service/src/pipelines/processors/transforms/hash_join/probe_join/left_anti_join.rs @@ -80,6 +80,10 @@ impl HashJoinProbeState { let result_block = DataBlock::take(&process_state.input, &probe_indexes[0..count])?; + if probe_state.probe_with_selection { + probe_indexes.clear(); + } + probe_state.process_state = None; if result_block.is_empty() { diff --git a/src/query/service/src/pipelines/processors/transforms/hash_join/probe_join/left_join.rs b/src/query/service/src/pipelines/processors/transforms/hash_join/probe_join/left_join.rs index a587fc24e7835..d8478cb0fcb19 100644 --- a/src/query/service/src/pipelines/processors/transforms/hash_join/probe_join/left_join.rs +++ b/src/query/service/src/pipelines/processors/transforms/hash_join/probe_join/left_join.rs @@ -172,6 +172,8 @@ impl HashJoinProbeState { &mut probe_state.generation_state, &build_state.generation_state, )?); + + probe_unmatched_indexes.clear(); } if !next_process_state { From fa5c7b1e28cefb165a2252d00fca75c3654c378a Mon Sep 17 00:00:00 2001 From: zhang2014 Date: Thu, 2 Oct 2025 22:15:42 +0800 Subject: [PATCH 09/24] refactor(query): refactor left outer join to new join --- .../processors/transforms/new_hash_join/hashtable/mod.rs | 4 +++- .../processors/transforms/new_hash_join/memory/inner_join.rs | 2 +- .../transforms/new_hash_join/memory/outer_left_join.rs | 2 +- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/hashtable/mod.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/hashtable/mod.rs index 44b0ba9238a9e..5cbfac637c898 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/hashtable/mod.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/hashtable/mod.rs @@ -40,9 +40,11 @@ impl ProbeHashStatistics { } } - pub fn clear(&mut self) { + pub fn clear(&mut self, max_rows: usize) { self.selection.clear(); self.unmatched_selection.clear(); + self.selection.reserve(max_rows); + self.unmatched_selection.reserve(max_rows); } } diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/inner_join.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/inner_join.rs index b876c103a459e..dc75ec5c937c3 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/inner_join.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/inner_join.rs @@ -112,7 +112,7 @@ impl Join for InnerHashJoin { with_join_hash_method!(|T| match self.basic_state.hash_table.deref() { HashJoinHashTable::T(table) => { let probe_hash_statistics = &mut self.performance_context.probe_hash_statistics; - probe_hash_statistics.clear(); + probe_hash_statistics.clear(probe_block.num_rows()); let probe_data = ProbeData::new(keys, valids, probe_hash_statistics); let probe_keys_stream = table.probe_matched(probe_data)?; diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/outer_left_join.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/outer_left_join.rs index e4dadedd0ac12..b11ef63f99ec4 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/outer_left_join.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/outer_left_join.rs @@ -131,7 +131,7 @@ impl Join for OuterLeftHashJoin { let probe_stream = with_join_hash_method!(|T| match self.basic_state.hash_table.deref() { HashJoinHashTable::T(table) => { let probe_hash_statistics = &mut self.performance_context.probe_hash_statistics; - probe_hash_statistics.clear(); + probe_hash_statistics.clear(probe_block.num_rows()); let probe_data = ProbeData::new(keys, valids, probe_hash_statistics); table.probe(probe_data) From 0ebfe9ef62931555ef42155c4bc6d6f8c5d37c9b Mon Sep 17 00:00:00 2001 From: zhang2014 Date: Fri, 3 Oct 2025 16:13:19 +0800 Subject: [PATCH 10/24] refactor(query): refactor left outer join to new join --- .../src/physical_plans/physical_hash_join.rs | 4 ++-- .../hash_join/runtime_filter/interface.rs | 2 +- .../hash_join/runtime_filter/mod.rs | 1 + .../transforms/new_hash_join/join.rs | 3 ++- .../transforms/new_hash_join/memory/basic.rs | 4 +++- .../new_hash_join/memory/inner_join.rs | 15 ++++++++++++ .../transforms/new_hash_join/mod.rs | 2 +- .../new_hash_join/runtime_filter.rs | 24 ++++++++++++++----- .../new_hash_join/transform_hash_join.rs | 14 +++++------ 9 files changed, 50 insertions(+), 19 deletions(-) diff --git a/src/query/service/src/physical_plans/physical_hash_join.rs b/src/query/service/src/physical_plans/physical_hash_join.rs index 54b3e6cb17ea8..170e8b23a9537 100644 --- a/src/query/service/src/physical_plans/physical_hash_join.rs +++ b/src/query/service/src/physical_plans/physical_hash_join.rs @@ -59,7 +59,7 @@ use crate::pipelines::processors::transforms::memory::outer_left_join::OuterLeft use crate::pipelines::processors::transforms::BasicHashJoinState; use crate::pipelines::processors::transforms::HashJoinProbeState; use crate::pipelines::processors::transforms::InnerHashJoin; -use crate::pipelines::processors::transforms::PlanRuntimeFilterDesc; +use crate::pipelines::processors::transforms::RuntimeFiltersDesc; use crate::pipelines::processors::transforms::TransformHashJoin; use crate::pipelines::processors::transforms::TransformHashJoinBuild; use crate::pipelines::processors::transforms::TransformHashJoinProbe; @@ -401,7 +401,7 @@ impl HashJoin { let state = Arc::new(BasicHashJoinState::create()); // We must build the runtime filter before constructing the child nodes, // as we will inject some runtime filter information into the context for the child nodes to use. - let rf_desc = PlanRuntimeFilterDesc::create(&builder.ctx, self); + let rf_desc = RuntimeFiltersDesc::create(&builder.ctx, self)?; if let Some((build_cache_index, _)) = self.build_side_cache_info { builder.hash_join_states.insert( diff --git a/src/query/service/src/pipelines/processors/transforms/hash_join/runtime_filter/interface.rs b/src/query/service/src/pipelines/processors/transforms/hash_join/runtime_filter/interface.rs index 64e1e871ff4a4..413032de20f2d 100644 --- a/src/query/service/src/pipelines/processors/transforms/hash_join/runtime_filter/interface.rs +++ b/src/query/service/src/pipelines/processors/transforms/hash_join/runtime_filter/interface.rs @@ -18,9 +18,9 @@ use databend_common_exception::Result; use databend_common_expression::DataBlock; use databend_common_storages_fuse::TableContext; -use super::builder::build_runtime_filter_packet; use super::convert::build_runtime_filter_infos; use super::global::get_global_runtime_filter_packet; +use crate::pipelines::processors::transforms::build_runtime_filter_packet; use crate::pipelines::processors::HashJoinBuildState; pub async fn build_and_push_down_runtime_filter( diff --git a/src/query/service/src/pipelines/processors/transforms/hash_join/runtime_filter/mod.rs b/src/query/service/src/pipelines/processors/transforms/hash_join/runtime_filter/mod.rs index 416587dbf3efc..40a89a4ba9268 100644 --- a/src/query/service/src/pipelines/processors/transforms/hash_join/runtime_filter/mod.rs +++ b/src/query/service/src/pipelines/processors/transforms/hash_join/runtime_filter/mod.rs @@ -19,6 +19,7 @@ mod interface; mod merge; mod packet; +pub use builder::build_runtime_filter_packet; pub use convert::build_runtime_filter_infos; pub use global::get_global_runtime_filter_packet; pub use interface::build_and_push_down_runtime_filter; diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/join.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/join.rs index 24d53b0b25c58..f76eb09878bc7 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/join.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/join.rs @@ -17,6 +17,7 @@ use databend_common_exception::Result; use databend_common_expression::DataBlock; use crate::pipelines::processors::transforms::JoinRuntimeFilterPacket; +use crate::pipelines::processors::transforms::RuntimeFiltersDesc; pub trait JoinStream: Send + Sync { fn next(&mut self) -> Result>; @@ -27,7 +28,7 @@ pub trait Join: Send + Sync + 'static { fn final_build(&mut self) -> Result>; - fn build_runtime_filter(&self) -> Result { + fn build_runtime_filter(&self, _: &RuntimeFiltersDesc) -> Result { Ok(JoinRuntimeFilterPacket::default()) } diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/basic.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/basic.rs index 941b4327d43a7..27b8590812ed8 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/basic.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/basic.rs @@ -159,6 +159,7 @@ impl BasicHashJoin { } } + let mut columns = Vec::with_capacity(self.desc.build_projection.len()); for offset in 0..self.desc.build_projection.len() { let full_columns = self .state @@ -167,9 +168,10 @@ impl BasicHashJoin { .map(|block| block.get_by_offset(offset).to_column()) .collect::>(); - let columns = self.state.columns.as_mut(); columns.push(Column::take_downcast_column_vec(&full_columns)); } + + std::mem::swap(&mut columns, self.state.columns.as_mut()); } fn init_memory_hash_table(&mut self) { diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/inner_join.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/inner_join.rs index dc75ec5c937c3..0cb4f1a3a4e4a 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/inner_join.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/inner_join.rs @@ -28,6 +28,7 @@ use databend_common_expression::FilterExecutor; use databend_common_expression::FunctionContext; use databend_common_expression::HashMethodKind; +use crate::pipelines::processors::transforms::build_runtime_filter_packet; use crate::pipelines::processors::transforms::new_hash_join::hashtable::basic::ProbeStream; use crate::pipelines::processors::transforms::new_hash_join::hashtable::basic::ProbedRows; use crate::pipelines::processors::transforms::new_hash_join::hashtable::ProbeData; @@ -38,6 +39,8 @@ use crate::pipelines::processors::transforms::new_hash_join::memory::basic::Basi use crate::pipelines::processors::transforms::new_hash_join::memory::basic_state::BasicHashJoinState; use crate::pipelines::processors::transforms::new_hash_join::performance::PerformanceContext; use crate::pipelines::processors::transforms::HashJoinHashTable; +use crate::pipelines::processors::transforms::JoinRuntimeFilterPacket; +use crate::pipelines::processors::transforms::RuntimeFiltersDesc; use crate::pipelines::processors::HashJoinDesc; use crate::sessions::QueryContext; @@ -90,6 +93,18 @@ impl Join for InnerHashJoin { self.basic_hash_join.final_build() } + fn build_runtime_filter(&self, desc: &RuntimeFiltersDesc) -> Result { + build_runtime_filter_packet( + self.basic_state.chunks.deref(), + *self.basic_state.build_rows, + &desc.filters_desc, + &self.function_ctx, + desc.inlist_threshold, + desc.bloom_threshold, + desc.min_max_threshold, + ) + } + fn probe_block(&mut self, data: DataBlock) -> Result> { if data.is_empty() || *self.basic_state.build_rows == 0 { return Ok(Box::new(EmptyJoinStream)); diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/mod.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/mod.rs index bab153f286d80..b949c36f331a4 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/mod.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/mod.rs @@ -23,5 +23,5 @@ mod transform_hash_join; pub use join::Join; pub use memory::BasicHashJoinState; pub use memory::InnerHashJoin; -pub use runtime_filter::PlanRuntimeFilterDesc; +pub use runtime_filter::RuntimeFiltersDesc; pub use transform_hash_join::TransformHashJoin; diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/runtime_filter.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/runtime_filter.rs index c77ddf28520f9..17e0a64af4311 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/runtime_filter.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/runtime_filter.rs @@ -26,16 +26,25 @@ use crate::pipelines::processors::transforms::JoinRuntimeFilterPacket; use crate::pipelines::processors::transforms::RuntimeFilterDesc; use crate::sessions::QueryContext; -pub struct PlanRuntimeFilterDesc { +pub struct RuntimeFiltersDesc { ctx: Arc, + pub bloom_threshold: usize, + pub inlist_threshold: usize, + pub min_max_threshold: usize, + broadcast_id: Option, - filters_desc: Vec, + pub filters_desc: Vec, runtime_filters_ready: Vec>, } -impl PlanRuntimeFilterDesc { - pub fn create(ctx: &Arc, join: &HashJoin) -> Arc { +impl RuntimeFiltersDesc { + pub fn create(ctx: &Arc, join: &HashJoin) -> Result> { + let settings = ctx.get_settings(); + let bloom_threshold = settings.get_bloom_runtime_filter_threshold()? as usize; + let inlist_threshold = settings.get_inlist_runtime_filter_threshold()? as usize; + let min_max_threshold = settings.get_min_max_runtime_filter_threshold()? as usize; + let mut filters_desc = Vec::with_capacity(join.runtime_filter.filters.len()); let mut runtime_filters_ready = Vec::with_capacity(join.runtime_filter.filters.len()); @@ -51,12 +60,15 @@ impl PlanRuntimeFilterDesc { filters_desc.push(filter_desc); } - Arc::new(PlanRuntimeFilterDesc { + Ok(Arc::new(RuntimeFiltersDesc { filters_desc, + bloom_threshold, + inlist_threshold, + min_max_threshold, runtime_filters_ready, ctx: ctx.clone(), broadcast_id: join.broadcast_id, - }) + })) } pub async fn globalization(&self, mut packet: JoinRuntimeFilterPacket) -> Result<()> { diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/transform_hash_join.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/transform_hash_join.rs index 601df7e8262b9..e2dce50cb30ca 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/transform_hash_join.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/transform_hash_join.rs @@ -29,7 +29,7 @@ use tokio::sync::Barrier; use crate::pipelines::processors::transforms::new_hash_join::join::Join; use crate::pipelines::processors::transforms::new_hash_join::join::JoinStream; -use crate::pipelines::processors::transforms::new_hash_join::runtime_filter::PlanRuntimeFilterDesc; +use crate::pipelines::processors::transforms::new_hash_join::runtime_filter::RuntimeFiltersDesc; pub struct TransformHashJoin { build_port: Arc, @@ -41,7 +41,7 @@ pub struct TransformHashJoin { joined_data: Option, stage_sync_barrier: Arc, projection: ColumnSet, - rf_desc: Arc, + rf_desc: Arc, } impl TransformHashJoin { @@ -52,7 +52,7 @@ impl TransformHashJoin { join: Box, stage_sync_barrier: Arc, projection: ColumnSet, - rf_desc: Arc, + rf_desc: Arc, ) -> ProcessorPtr { ProcessorPtr::create(Box::new(TransformHashJoin { build_port, @@ -180,18 +180,18 @@ impl Processor for TransformHashJoin { let wait_res = self.stage_sync_barrier.wait().await; self.stage = match self.stage { - Stage::Build(_) => Stage::BuildFinal(BuildFinalState::new()), - Stage::BuildFinal(_) => { + Stage::Build(_) => { if wait_res.is_leader() { - let packet = self.join.build_runtime_filter()?; + let packet = self.join.build_runtime_filter(&self.rf_desc)?; self.rf_desc.globalization(packet).await?; } let _wait_res = self.stage_sync_barrier.wait().await; - Stage::Probe(ProbeState::new()) + Stage::BuildFinal(BuildFinalState::new()) } + Stage::BuildFinal(_) => Stage::Probe(ProbeState::new()), Stage::Probe(_) => Stage::ProbeFinal(ProbeFinalState::new()), Stage::ProbeFinal(_) => Stage::Finished, Stage::Finished => Stage::Finished, From 8363548d1e8ef42599b2721db7917adf27b5b1aa Mon Sep 17 00:00:00 2001 From: zhang2014 Date: Sun, 5 Oct 2025 14:52:44 +0800 Subject: [PATCH 11/24] refactor(query): refactor left outer join to new join --- .../src/physical_plans/physical_hash_join.rs | 5 +- .../new_hash_join/memory/outer_left_join.rs | 135 ++++++++++++------ 2 files changed, 93 insertions(+), 47 deletions(-) diff --git a/src/query/service/src/physical_plans/physical_hash_join.rs b/src/query/service/src/physical_plans/physical_hash_join.rs index 170e8b23a9537..0dca58be67945 100644 --- a/src/query/service/src/physical_plans/physical_hash_join.rs +++ b/src/query/service/src/physical_plans/physical_hash_join.rs @@ -262,10 +262,7 @@ impl IPhysicalPlan for HashJoin { let (enable_optimization, _) = builder.merge_into_get_optimization_flag(self); if desc.single_to_inner.is_none() - && ( - self.join_type == JoinType::Inner - // || self.join_type == JoinType::Left - ) + && (self.join_type == JoinType::Inner || self.join_type == JoinType::Left) && experimental_new_join && !enable_optimization { diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/outer_left_join.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/outer_left_join.rs index b11ef63f99ec4..b75b6d82f0940 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/outer_left_join.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/outer_left_join.rs @@ -25,6 +25,7 @@ use databend_common_expression::types::NullableColumn; use databend_common_expression::with_join_hash_method; use databend_common_expression::BlockEntry; use databend_common_expression::DataBlock; +use databend_common_expression::FilterExecutor; use databend_common_expression::FunctionContext; use databend_common_expression::HashMethodKind; use databend_common_expression::Scalar; @@ -141,78 +142,75 @@ impl Join for OuterLeftHashJoin { )), })?; - Ok(OuterLeftHashJoinStream::create( - probe_block, - self.basic_state.clone(), - probe_stream, - self.desc.clone(), - &mut self.performance_context.probe_result, - )) + match self.performance_context.filter_executor.as_mut() { + None => Ok(OuterLeftHashJoinStream::::create( + probe_block, + self.basic_state.clone(), + probe_stream, + self.desc.clone(), + &mut self.performance_context.probe_result, + None, + )), + Some(filter_executor) => Ok(OuterLeftHashJoinStream::::create( + probe_block, + self.basic_state.clone(), + probe_stream, + self.desc.clone(), + &mut self.performance_context.probe_result, + Some(filter_executor), + )), + } } } -struct OuterLeftHashJoinStream<'a> { +struct OuterLeftHashJoinStream<'a, const CONJUNCT: bool> { desc: Arc, probe_data_block: DataBlock, join_state: Arc, probe_keys_stream: Box, probed_rows: &'a mut ProbedRows, - unmatched_rows: Vec, + pending_unmatched: Vec, + pending_unmatched_num_rows: usize, + filter_executor: Option<&'a mut FilterExecutor>, } -unsafe impl<'a> Send for OuterLeftHashJoinStream<'a> {} -unsafe impl<'a> Sync for OuterLeftHashJoinStream<'a> {} - -impl<'a> OuterLeftHashJoinStream<'a> { - pub fn create( - probe_data_block: DataBlock, - join_state: Arc, - probe_keys_stream: Box, - desc: Arc, - probed_rows: &'a mut ProbedRows, - ) -> Box { - Box::new(OuterLeftHashJoinStream { - desc, - join_state, - probed_rows, - probe_data_block, - probe_keys_stream, - unmatched_rows: vec![], - }) - } -} +unsafe impl<'a, const CONJUNCT: bool> Send for OuterLeftHashJoinStream<'a, CONJUNCT> {} +unsafe impl<'a, const CONJUNCT: bool> Sync for OuterLeftHashJoinStream<'a, CONJUNCT> {} -impl<'a> JoinStream for OuterLeftHashJoinStream<'a> { +impl<'a, const CONJUNCT: bool> JoinStream for OuterLeftHashJoinStream<'a, CONJUNCT> { fn next(&mut self) -> Result> { loop { self.probed_rows.clear(); let max_rows = self.probed_rows.matched_probe.capacity(); self.probe_keys_stream.advance(self.probed_rows, max_rows)?; - if !self.probed_rows.unmatched.is_empty() { - self.unmatched_rows - .extend_from_slice(&self.probed_rows.unmatched); - } - if self.probed_rows.is_empty() { - if self.unmatched_rows.is_empty() { + if self.pending_unmatched.is_empty() { return Ok(None); } - let unmatched = std::mem::take(&mut self.unmatched_rows); + let unmatched = std::mem::take(&mut self.pending_unmatched); + + let unmatched_row_id = unmatched + .into_iter() + .enumerate() + .filter(|(_, matched)| *matched == 0) + .map(|(row_id, _)| row_id as u64) + .collect::>(); + let probe_block = match self.probe_data_block.num_columns() { 0 => None, - _ => Some(DataBlock::take(&self.probe_data_block, &unmatched)?), + _ => Some(DataBlock::take(&self.probe_data_block, &unmatched_row_id)?), }; let types = &self.join_state.column_types; - let build_block = null_build_block(types, unmatched.len()); + let build_block = null_build_block(types, unmatched_row_id.len()); return Ok(Some(final_result_block( &self.desc, probe_block, build_block, - unmatched.len(), + unmatched_row_id.len(), ))); } @@ -248,16 +246,67 @@ impl<'a> JoinStream for OuterLeftHashJoinStream<'a> { } }; - return Ok(Some(final_result_block( + let mut result_block = final_result_block( &self.desc, probe_block, build_block, self.probed_rows.matched_build.len(), - ))); + ); + + if CONJUNCT && let Some(filter_executor) = self.filter_executor.as_mut() { + let result_count = filter_executor.select(&result_block)?; + let origin_rows = result_block.num_rows(); + + if result_count == origin_rows { + return Ok(Some(result_block)); + } + + let true_sel = filter_executor.true_selection(); + + for idx in 0..result_count { + let idx = true_sel[idx] as usize; + let row_id = self.probed_rows.matched_probe[idx] as usize; + self.pending_unmatched[row_id] = 1; + self.pending_unmatched_num_rows -= 1; + } + + let origin_rows = result_block.num_rows(); + result_block = filter_executor.take(result_block, origin_rows, result_count)?; + } + + return Ok(Some(result_block)); } } } +impl<'a, const CONJUNCT: bool> OuterLeftHashJoinStream<'a, CONJUNCT> { + pub fn create( + probe_data_block: DataBlock, + join_state: Arc, + probe_keys_stream: Box, + desc: Arc, + probed_rows: &'a mut ProbedRows, + filter_executor: Option<&'a mut FilterExecutor>, + ) -> Box { + let num_rows = probe_data_block.num_rows(); + let pending_unmatched = match CONJUNCT { + true => vec![0; num_rows], + false => Vec::new(), + }; + + Box::new(OuterLeftHashJoinStream::<'a, CONJUNCT> { + desc, + join_state, + probed_rows, + probe_data_block, + probe_keys_stream, + filter_executor, + pending_unmatched, + pending_unmatched_num_rows: num_rows, + }) + } +} + fn final_result_block( desc: &HashJoinDesc, probe_block: Option, From 3e896d750ba3687d8f1d1f530b6676b52c3c89bd Mon Sep 17 00:00:00 2001 From: zhang2014 Date: Sun, 5 Oct 2025 14:54:44 +0800 Subject: [PATCH 12/24] refactor(query): refactor left outer join to new join --- .../transforms/new_hash_join/memory/outer_left_join.rs | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/outer_left_join.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/outer_left_join.rs index b75b6d82f0940..8a931d319f114 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/outer_left_join.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/outer_left_join.rs @@ -263,9 +263,8 @@ impl<'a, const CONJUNCT: bool> JoinStream for OuterLeftHashJoinStream<'a, CONJUN let true_sel = filter_executor.true_selection(); - for idx in 0..result_count { - let idx = true_sel[idx] as usize; - let row_id = self.probed_rows.matched_probe[idx] as usize; + for idx in true_sel.iter().take(result_count) { + let row_id = self.probed_rows.matched_probe[*idx as usize] as usize; self.pending_unmatched[row_id] = 1; self.pending_unmatched_num_rows -= 1; } From 9fcadf44e8908c40b5fb631ff4ce496d238e9f0c Mon Sep 17 00:00:00 2001 From: zhang2014 Date: Sun, 5 Oct 2025 15:56:23 +0800 Subject: [PATCH 13/24] refactor(query): refactor left outer join to new join --- .../new_hash_join/memory/outer_left_join.rs | 43 ++++++++++++------- 1 file changed, 28 insertions(+), 15 deletions(-) diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/outer_left_join.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/outer_left_join.rs index 8a931d319f114..a816b3d9995be 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/outer_left_join.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/outer_left_join.rs @@ -169,8 +169,9 @@ struct OuterLeftHashJoinStream<'a, const CONJUNCT: bool> { join_state: Arc, probe_keys_stream: Box, probed_rows: &'a mut ProbedRows, - pending_unmatched: Vec, - pending_unmatched_num_rows: usize, + conjunct_unmatched: Vec, + conjunct_unmatched_num_rows: usize, + unmatched_rows: Vec, filter_executor: Option<&'a mut FilterExecutor>, } @@ -184,19 +185,25 @@ impl<'a, const CONJUNCT: bool> JoinStream for OuterLeftHashJoinStream<'a, CONJUN let max_rows = self.probed_rows.matched_probe.capacity(); self.probe_keys_stream.advance(self.probed_rows, max_rows)?; + if !CONJUNCT && !self.probed_rows.unmatched.is_empty() { + self.unmatched_rows + .extend_from_slice(&self.probed_rows.unmatched); + } + if self.probed_rows.is_empty() { - if self.pending_unmatched.is_empty() { + if self.conjunct_unmatched.is_empty() && self.unmatched_rows.is_empty() { return Ok(None); } - let unmatched = std::mem::take(&mut self.pending_unmatched); - - let unmatched_row_id = unmatched - .into_iter() - .enumerate() - .filter(|(_, matched)| *matched == 0) - .map(|(row_id, _)| row_id as u64) - .collect::>(); + let unmatched_row_id = match CONJUNCT { + true => std::mem::take(&mut self.conjunct_unmatched) + .into_iter() + .enumerate() + .filter(|(_, matched)| *matched == 0) + .map(|(row_id, _)| row_id as u64) + .collect::>(), + false => std::mem::take(&mut self.unmatched_rows), + }; let probe_block = match self.probe_data_block.num_columns() { 0 => None, @@ -265,8 +272,8 @@ impl<'a, const CONJUNCT: bool> JoinStream for OuterLeftHashJoinStream<'a, CONJUN for idx in true_sel.iter().take(result_count) { let row_id = self.probed_rows.matched_probe[*idx as usize] as usize; - self.pending_unmatched[row_id] = 1; - self.pending_unmatched_num_rows -= 1; + self.conjunct_unmatched[row_id] = 1; + self.conjunct_unmatched_num_rows -= 1; } let origin_rows = result_block.num_rows(); @@ -293,6 +300,11 @@ impl<'a, const CONJUNCT: bool> OuterLeftHashJoinStream<'a, CONJUNCT> { false => Vec::new(), }; + let unmatched_rows = match CONJUNCT { + true => Vec::new(), + false => Vec::with_capacity(num_rows), + }; + Box::new(OuterLeftHashJoinStream::<'a, CONJUNCT> { desc, join_state, @@ -300,8 +312,9 @@ impl<'a, const CONJUNCT: bool> OuterLeftHashJoinStream<'a, CONJUNCT> { probe_data_block, probe_keys_stream, filter_executor, - pending_unmatched, - pending_unmatched_num_rows: num_rows, + conjunct_unmatched: pending_unmatched, + conjunct_unmatched_num_rows: num_rows, + unmatched_rows: unmatched_rows, }) } } From ad6a21c07b6041fbdb461482beec0e06621c917f Mon Sep 17 00:00:00 2001 From: zhang2014 Date: Sun, 5 Oct 2025 16:35:58 +0800 Subject: [PATCH 14/24] refactor(query): refactor left outer join to new join --- .../transforms/new_hash_join/memory/outer_left_join.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/outer_left_join.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/outer_left_join.rs index a816b3d9995be..94fef8c3ecda3 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/outer_left_join.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/outer_left_join.rs @@ -312,9 +312,9 @@ impl<'a, const CONJUNCT: bool> OuterLeftHashJoinStream<'a, CONJUNCT> { probe_data_block, probe_keys_stream, filter_executor, + unmatched_rows, conjunct_unmatched: pending_unmatched, conjunct_unmatched_num_rows: num_rows, - unmatched_rows: unmatched_rows, }) } } From 56a40c2d7d90fd462db7dece958c10f82bd21256 Mon Sep 17 00:00:00 2001 From: zhang2014 Date: Sun, 5 Oct 2025 17:19:39 +0800 Subject: [PATCH 15/24] refactor(query): refactor left outer join to new join --- .../transforms/new_hash_join/hashtable/basic.rs | 14 +++++++++----- .../new_hash_join/memory/outer_left_join.rs | 12 ++++-------- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/hashtable/basic.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/hashtable/basic.rs index 95befffdd3bba..87c04267935ce 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/hashtable/basic.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/hashtable/basic.rs @@ -72,23 +72,27 @@ impl ProbeStream for EmptyProbeStream { } pub struct AllUnmatchedProbeStream { - idx: usize, - size: usize, + idx: u64, + size: u64, } impl AllUnmatchedProbeStream { pub fn create(size: usize) -> Box { - Box::new(AllUnmatchedProbeStream { idx: 0, size }) + Box::new(AllUnmatchedProbeStream { + idx: 0, + size: size as u64, + }) } } impl ProbeStream for AllUnmatchedProbeStream { - fn advance(&mut self, _rows: &mut ProbedRows, max_rows: usize) -> Result<()> { + fn advance(&mut self, rows: &mut ProbedRows, max_rows: usize) -> Result<()> { if self.idx >= self.size { return Ok(()); } - let unmatched_rows = std::cmp::min(self.size - self.idx, max_rows); + let unmatched_rows = std::cmp::min(self.size - self.idx, max_rows as u64); + rows.unmatched.extend(self.idx..self.idx + unmatched_rows); self.idx += unmatched_rows; Ok(()) } diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/outer_left_join.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/outer_left_join.rs index 94fef8c3ecda3..ad966525ce5a9 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/outer_left_join.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/outer_left_join.rs @@ -170,7 +170,6 @@ struct OuterLeftHashJoinStream<'a, const CONJUNCT: bool> { probe_keys_stream: Box, probed_rows: &'a mut ProbedRows, conjunct_unmatched: Vec, - conjunct_unmatched_num_rows: usize, unmatched_rows: Vec, filter_executor: Option<&'a mut FilterExecutor>, } @@ -237,7 +236,7 @@ impl<'a, const CONJUNCT: bool> JoinStream for OuterLeftHashJoinStream<'a, CONJUN true => None, false => { let row_ptrs = self.probed_rows.matched_build.as_slice(); - let build_block1 = DataBlock::take_column_vec( + let build_block = DataBlock::take_column_vec( self.join_state.columns.as_slice(), self.join_state.column_types.as_slice(), row_ptrs, @@ -245,7 +244,7 @@ impl<'a, const CONJUNCT: bool> JoinStream for OuterLeftHashJoinStream<'a, CONJUN ); let true_validity = Bitmap::new_constant(true, row_ptrs.len()); - let entries = build_block1 + let entries = build_block .columns() .iter() .map(|c| wrap_true_validity(c, row_ptrs.len(), &true_validity)); @@ -262,10 +261,9 @@ impl<'a, const CONJUNCT: bool> JoinStream for OuterLeftHashJoinStream<'a, CONJUN if CONJUNCT && let Some(filter_executor) = self.filter_executor.as_mut() { let result_count = filter_executor.select(&result_block)?; - let origin_rows = result_block.num_rows(); - if result_count == origin_rows { - return Ok(Some(result_block)); + if result_count == 0 { + continue; } let true_sel = filter_executor.true_selection(); @@ -273,7 +271,6 @@ impl<'a, const CONJUNCT: bool> JoinStream for OuterLeftHashJoinStream<'a, CONJUN for idx in true_sel.iter().take(result_count) { let row_id = self.probed_rows.matched_probe[*idx as usize] as usize; self.conjunct_unmatched[row_id] = 1; - self.conjunct_unmatched_num_rows -= 1; } let origin_rows = result_block.num_rows(); @@ -314,7 +311,6 @@ impl<'a, const CONJUNCT: bool> OuterLeftHashJoinStream<'a, CONJUNCT> { filter_executor, unmatched_rows, conjunct_unmatched: pending_unmatched, - conjunct_unmatched_num_rows: num_rows, }) } } From 21a6e1a8522ff8514b8cfc7575a9955dcc3a2f7c Mon Sep 17 00:00:00 2001 From: zhang2014 Date: Sun, 5 Oct 2025 18:53:15 +0800 Subject: [PATCH 16/24] refactor(query): refactor left outer join to new join --- .../transforms/new_hash_join/hashtable/fixed_keys.rs | 10 ++++++++++ .../new_hash_join/hashtable/serialize_keys.rs | 12 ++++++++++++ .../new_hash_join/memory/outer_left_join.rs | 1 + 3 files changed, 23 insertions(+) diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/hashtable/fixed_keys.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/hashtable/fixed_keys.rs index 0cce33014a333..86946c7d992b3 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/hashtable/fixed_keys.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/hashtable/fixed_keys.rs @@ -219,6 +219,7 @@ impl ProbeStream let key = unsafe { self.keys.key_unchecked(self.key_idx) }; + let origin_len = res.matched_probe.len(); while self.probe_entry_ptr != 0 { let raw_entry = unsafe { &*(self.probe_entry_ptr as *mut RawEntry) }; @@ -241,6 +242,10 @@ impl ProbeStream self.probe_entry_ptr = raw_entry.next; } + if origin_len == res.matched_probe.len() { + res.unmatched.push(self.key_idx as u64); + } + self.key_idx += 1; } @@ -310,6 +315,7 @@ impl<'a, Key: FixedKey + HashtableKeyable, const MATCHED: bool> ProbeStream let key = unsafe { self.keys.key_unchecked(key_idx) }; + let origin_len = res.matched_probe.len(); while self.probe_entry_ptr != 0 { let raw_entry = unsafe { &*(self.probe_entry_ptr as *mut RawEntry) }; @@ -332,6 +338,10 @@ impl<'a, Key: FixedKey + HashtableKeyable, const MATCHED: bool> ProbeStream self.probe_entry_ptr = raw_entry.next; } + if origin_len == res.matched_probe.len() { + res.unmatched.push(key_idx as u64); + } + self.idx += 1; } diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/hashtable/serialize_keys.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/hashtable/serialize_keys.rs index 883daa09449cd..a01b782f6b1b5 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/hashtable/serialize_keys.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/hashtable/serialize_keys.rs @@ -243,6 +243,8 @@ impl ProbeStream for BinaryKeyProbeStream { let key = unsafe { self.keys.key_unchecked(self.key_idx) }; + let origin = res.matched_probe.len(); + while self.probe_entry_ptr != 0 { let raw_entry = unsafe { &*(self.probe_entry_ptr as *mut StringRawEntry) }; // Compare `early` and the length of the string, the size of `early` is 4. @@ -277,6 +279,10 @@ impl ProbeStream for BinaryKeyProbeStream { self.probe_entry_ptr = raw_entry.next; } + if origin == res.matched_probe.len() { + res.unmatched.push(self.key_idx as u64); + } + self.key_idx += 1; } @@ -346,6 +352,8 @@ impl<'a, const MATCHED: bool> ProbeStream for EarlyFilteringProbeStream<'a, MATC let key = unsafe { self.keys.key_unchecked(key_idx) }; + let origin = res.matched_probe.len(); + while self.probe_entry_ptr != 0 { let raw_entry = unsafe { &*(self.probe_entry_ptr as *mut StringRawEntry) }; // Compare `early` and the length of the string, the size of `early` is 4. @@ -380,6 +388,10 @@ impl<'a, const MATCHED: bool> ProbeStream for EarlyFilteringProbeStream<'a, MATC self.probe_entry_ptr = raw_entry.next; } + if origin == res.matched_probe.len() { + res.unmatched.push(key_idx as u64); + } + self.idx += 1; } diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/outer_left_join.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/outer_left_join.rs index ad966525ce5a9..d4a59d2e7fd0f 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/outer_left_join.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/outer_left_join.rs @@ -302,6 +302,7 @@ impl<'a, const CONJUNCT: bool> OuterLeftHashJoinStream<'a, CONJUNCT> { false => Vec::with_capacity(num_rows), }; + probed_rows.unmatched.reserve(num_rows); Box::new(OuterLeftHashJoinStream::<'a, CONJUNCT> { desc, join_state, From 197f5a7a28f520f0d35582815652e58bfc49cc96 Mon Sep 17 00:00:00 2001 From: zhang2014 Date: Mon, 6 Oct 2025 07:53:12 +0800 Subject: [PATCH 17/24] refactor(query): refactor left outer join to new join --- .../new_hash_join/hashtable/fixed_keys.rs | 18 ++++++++++++++---- .../new_hash_join/hashtable/serialize_keys.rs | 18 ++++++++++++------ 2 files changed, 26 insertions(+), 10 deletions(-) diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/hashtable/fixed_keys.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/hashtable/fixed_keys.rs index 86946c7d992b3..9f1349251dcef 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/hashtable/fixed_keys.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/hashtable/fixed_keys.rs @@ -174,6 +174,7 @@ struct FixedKeyProbeStream, probe_entry_ptr: u64, keys: Box<(dyn KeyAccessor)>, + matched_num_rows: usize, } impl FixedKeyProbeStream { @@ -186,6 +187,7 @@ impl FixedKeyProbeStream< pointers, key_idx: 0, probe_entry_ptr: 0, + matched_num_rows: 0, }) } } @@ -213,13 +215,13 @@ impl ProbeStream } self.key_idx += 1; + self.matched_num_rows = 0; continue; } } let key = unsafe { self.keys.key_unchecked(self.key_idx) }; - let origin_len = res.matched_probe.len(); while self.probe_entry_ptr != 0 { let raw_entry = unsafe { &*(self.probe_entry_ptr as *mut RawEntry) }; @@ -227,12 +229,14 @@ impl ProbeStream let row_ptr = raw_entry.row_ptr; res.matched_probe.push(self.key_idx as u64); res.matched_build.push(row_ptr); + self.matched_num_rows += 1; if res.matched_probe.len() == max_rows { self.probe_entry_ptr = raw_entry.next; if self.probe_entry_ptr == 0 { self.key_idx += 1; + self.matched_num_rows = 0; } return Ok(()); @@ -242,11 +246,12 @@ impl ProbeStream self.probe_entry_ptr = raw_entry.next; } - if origin_len == res.matched_probe.len() { + if !MATCHED && self.matched_num_rows == 0 { res.unmatched.push(self.key_idx as u64); } self.key_idx += 1; + self.matched_num_rows = 0; } Ok(()) @@ -260,6 +265,7 @@ struct EarlyFilteringProbeStream<'a, Key: FixedKey + HashtableKeyable, const MAT keys: Box<(dyn KeyAccessor)>, selections: &'a [u32], unmatched_selection: &'a [u32], + matched_num_rows: usize, } impl<'a, Key: FixedKey + HashtableKeyable, const MATCHED: bool> @@ -278,6 +284,7 @@ impl<'a, Key: FixedKey + HashtableKeyable, const MATCHED: bool> unmatched_selection, idx: 0, probe_entry_ptr: 0, + matched_num_rows: 0, }) } } @@ -309,13 +316,13 @@ impl<'a, Key: FixedKey + HashtableKeyable, const MATCHED: bool> ProbeStream if self.probe_entry_ptr == 0 { self.idx += 1; + self.matched_num_rows = 0; continue; } } let key = unsafe { self.keys.key_unchecked(key_idx) }; - let origin_len = res.matched_probe.len(); while self.probe_entry_ptr != 0 { let raw_entry = unsafe { &*(self.probe_entry_ptr as *mut RawEntry) }; @@ -323,12 +330,14 @@ impl<'a, Key: FixedKey + HashtableKeyable, const MATCHED: bool> ProbeStream let row_ptr = raw_entry.row_ptr; res.matched_probe.push(key_idx as u64); res.matched_build.push(row_ptr); + self.matched_num_rows += 1; if res.matched_probe.len() == max_rows { self.probe_entry_ptr = raw_entry.next; if self.probe_entry_ptr == 0 { self.idx += 1; + self.matched_num_rows = 0; } return Ok(()); @@ -338,11 +347,12 @@ impl<'a, Key: FixedKey + HashtableKeyable, const MATCHED: bool> ProbeStream self.probe_entry_ptr = raw_entry.next; } - if origin_len == res.matched_probe.len() { + if !MATCHED && self.matched_num_rows == 0 { res.unmatched.push(key_idx as u64); } self.idx += 1; + self.matched_num_rows = 0; } Ok(()) diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/hashtable/serialize_keys.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/hashtable/serialize_keys.rs index a01b782f6b1b5..defad6b2a2151 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/hashtable/serialize_keys.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/hashtable/serialize_keys.rs @@ -199,6 +199,7 @@ pub struct BinaryKeyProbeStream { pointers: Vec, keys: Box<(dyn KeyAccessor)>, probe_entry_ptr: u64, + matched_num_rows: usize, } impl BinaryKeyProbeStream { @@ -211,6 +212,7 @@ impl BinaryKeyProbeStream { pointers, key_idx: 0, probe_entry_ptr: 0, + matched_num_rows: 0, }) } } @@ -237,14 +239,13 @@ impl ProbeStream for BinaryKeyProbeStream { } self.key_idx += 1; + self.matched_num_rows = 0; continue; } } let key = unsafe { self.keys.key_unchecked(self.key_idx) }; - let origin = res.matched_probe.len(); - while self.probe_entry_ptr != 0 { let raw_entry = unsafe { &*(self.probe_entry_ptr as *mut StringRawEntry) }; // Compare `early` and the length of the string, the size of `early` is 4. @@ -268,6 +269,7 @@ impl ProbeStream for BinaryKeyProbeStream { if self.probe_entry_ptr == 0 { self.key_idx += 1; + self.matched_num_rows = 0; } return Ok(()); @@ -279,11 +281,12 @@ impl ProbeStream for BinaryKeyProbeStream { self.probe_entry_ptr = raw_entry.next; } - if origin == res.matched_probe.len() { + if !MATCHED && self.matched_num_rows == 0 { res.unmatched.push(self.key_idx as u64); } self.key_idx += 1; + self.matched_num_rows = 0; } Ok(()) @@ -297,6 +300,7 @@ pub struct EarlyFilteringProbeStream<'a, const MATCHED: bool> { probe_entry_ptr: u64, selections: &'a [u32], unmatched_selection: &'a [u32], + matched_num_rows: usize, } impl<'a, const MATCHED: bool> EarlyFilteringProbeStream<'a, MATCHED> { @@ -313,6 +317,7 @@ impl<'a, const MATCHED: bool> EarlyFilteringProbeStream<'a, MATCHED> { unmatched_selection, idx: 0, probe_entry_ptr: 0, + matched_num_rows: 0, }) } } @@ -346,14 +351,13 @@ impl<'a, const MATCHED: bool> ProbeStream for EarlyFilteringProbeStream<'a, MATC } self.idx += 1; + self.matched_num_rows = 0; continue; } } let key = unsafe { self.keys.key_unchecked(key_idx) }; - let origin = res.matched_probe.len(); - while self.probe_entry_ptr != 0 { let raw_entry = unsafe { &*(self.probe_entry_ptr as *mut StringRawEntry) }; // Compare `early` and the length of the string, the size of `early` is 4. @@ -377,6 +381,7 @@ impl<'a, const MATCHED: bool> ProbeStream for EarlyFilteringProbeStream<'a, MATC if self.probe_entry_ptr == 0 { self.idx += 1; + self.matched_num_rows = 0; } return Ok(()); @@ -388,11 +393,12 @@ impl<'a, const MATCHED: bool> ProbeStream for EarlyFilteringProbeStream<'a, MATC self.probe_entry_ptr = raw_entry.next; } - if origin == res.matched_probe.len() { + if !MATCHED && self.matched_num_rows == 0 { res.unmatched.push(key_idx as u64); } self.idx += 1; + self.matched_num_rows = 0; } Ok(()) From 221c2dd7f54c8abc86678872369f8b6f761605f4 Mon Sep 17 00:00:00 2001 From: zhang2014 Date: Mon, 6 Oct 2025 12:38:41 +0800 Subject: [PATCH 18/24] refactor(query): refactor left outer join to new join --- .../transforms/new_hash_join/hashtable/serialize_keys.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/hashtable/serialize_keys.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/hashtable/serialize_keys.rs index defad6b2a2151..6d12973b1eeee 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/hashtable/serialize_keys.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/hashtable/serialize_keys.rs @@ -263,6 +263,7 @@ impl ProbeStream for BinaryKeyProbeStream { let row_ptr = raw_entry.row_ptr; res.matched_probe.push(self.key_idx as u64); res.matched_build.push(row_ptr); + self.matched_num_rows += 1; if res.matched_probe.len() == max_rows { self.probe_entry_ptr = raw_entry.next; @@ -375,6 +376,7 @@ impl<'a, const MATCHED: bool> ProbeStream for EarlyFilteringProbeStream<'a, MATC let row_ptr = raw_entry.row_ptr; res.matched_probe.push(key_idx as u64); res.matched_build.push(row_ptr); + self.matched_num_rows += 1; if res.matched_probe.len() == max_rows { self.probe_entry_ptr = raw_entry.next; From 4813367fa6d20d9846ab794a858b3e261478d3eb Mon Sep 17 00:00:00 2001 From: zhang2014 Date: Mon, 6 Oct 2025 17:19:51 +0800 Subject: [PATCH 19/24] refactor(query): refactor left outer join to new join --- .../processors/transforms/hash_join/desc.rs | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/src/query/service/src/pipelines/processors/transforms/hash_join/desc.rs b/src/query/service/src/pipelines/processors/transforms/hash_join/desc.rs index a9f0226a07c7e..ee191e992b35e 100644 --- a/src/query/service/src/pipelines/processors/transforms/hash_join/desc.rs +++ b/src/query/service/src/pipelines/processors/transforms/hash_join/desc.rs @@ -32,6 +32,7 @@ use parking_lot::RwLock; use crate::physical_plans::HashJoin; use crate::physical_plans::PhysicalRuntimeFilter; use crate::physical_plans::PhysicalRuntimeFilters; +use crate::pipelines::processors::transforms::wrap_true_validity; use crate::sql::plans::JoinType; pub const MARKER_KIND_TRUE: u8 = 0; @@ -174,7 +175,20 @@ impl HashJoinDesc { pub fn build_key(&self, block: &DataBlock, ctx: &FunctionContext) -> Result> { let build_keys = &self.build_keys; - let evaluator = Evaluator::new(block, ctx, &BUILTIN_FUNCTIONS); + let mut _nullable_chunk = None; + let evaluator = match self.join_type { + JoinType::Left => { + let validity = Bitmap::new_constant(true, block.num_rows()); + let nullable_columns = block + .columns() + .iter() + .map(|c| wrap_true_validity(c, block.num_rows(), &validity)) + .collect::>(); + _nullable_chunk = Some(DataBlock::new(nullable_columns, block.num_rows())); + Evaluator::new(_nullable_chunk.as_ref().unwrap(), ctx, &BUILTIN_FUNCTIONS) + } + _ => Evaluator::new(block, ctx, &BUILTIN_FUNCTIONS), + }; build_keys .iter() .map(|expr| { From e2a98cdd607207e99e1299741a162b1a863b7531 Mon Sep 17 00:00:00 2001 From: zhang2014 Date: Mon, 6 Oct 2025 17:23:06 +0800 Subject: [PATCH 20/24] refactor(query): refactor left outer join to new join --- src/common/base/src/hints/mod.rs | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/common/base/src/hints/mod.rs b/src/common/base/src/hints/mod.rs index ff81722eba941..8e23909645f63 100644 --- a/src/common/base/src/hints/mod.rs +++ b/src/common/base/src/hints/mod.rs @@ -14,6 +14,13 @@ #[inline] pub fn assume(condition: bool) { + #[cfg(debug_assertions)] + { + if !condition { + panic!("assume condition must be true"); + } + } + if !condition { unsafe { std::hint::unreachable_unchecked() } } From 32dd07aeda7bc9b9187e9ad21e21252882bac924 Mon Sep 17 00:00:00 2001 From: zhang2014 Date: Mon, 6 Oct 2025 18:04:19 +0800 Subject: [PATCH 21/24] refactor(query): refactor left outer join to new join --- src/common/base/src/hints/mod.rs | 7 +++++-- tests/sqllogictests/suites/query/join/left_outer.test | 2 +- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/src/common/base/src/hints/mod.rs b/src/common/base/src/hints/mod.rs index 8e23909645f63..1999b0ae6e763 100644 --- a/src/common/base/src/hints/mod.rs +++ b/src/common/base/src/hints/mod.rs @@ -21,7 +21,10 @@ pub fn assume(condition: bool) { } } - if !condition { - unsafe { std::hint::unreachable_unchecked() } + #[cfg(not(debug_assertions))] + { + if !condition { + unsafe { std::hint::unreachable_unchecked() } + } } } diff --git a/tests/sqllogictests/suites/query/join/left_outer.test b/tests/sqllogictests/suites/query/join/left_outer.test index 9bc1c406a224e..3d95af9dead81 100644 --- a/tests/sqllogictests/suites/query/join/left_outer.test +++ b/tests/sqllogictests/suites/query/join/left_outer.test @@ -290,10 +290,10 @@ SELECT * FROM t1 LEFT JOIN t2 ON t1.a = t2.a; ---- 2 3 2 10 2 3 2 8 -1 2 NULL NULL 2 3 2 6 2 3 2 4 2 3 2 2 +1 2 NULL NULL 3 4 NULL NULL 4 5 NULL NULL 5 6 NULL NULL From 9f8e9be2f054f68959b0d0812331930304a9a8b8 Mon Sep 17 00:00:00 2001 From: zhang2014 Date: Mon, 6 Oct 2025 19:49:19 +0800 Subject: [PATCH 22/24] refactor(query): refactor left outer join to new join --- .../transforms/new_hash_join/hashtable/fixed_keys.rs | 5 ++++- .../transforms/new_hash_join/hashtable/serialize_keys.rs | 5 ++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/hashtable/fixed_keys.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/hashtable/fixed_keys.rs index 9f1349251dcef..e65644bb85e1a 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/hashtable/fixed_keys.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/hashtable/fixed_keys.rs @@ -266,6 +266,7 @@ struct EarlyFilteringProbeStream<'a, Key: FixedKey + HashtableKeyable, const MAT selections: &'a [u32], unmatched_selection: &'a [u32], matched_num_rows: usize, + returned_unmatched: bool, } impl<'a, Key: FixedKey + HashtableKeyable, const MATCHED: bool> @@ -285,6 +286,7 @@ impl<'a, Key: FixedKey + HashtableKeyable, const MATCHED: bool> idx: 0, probe_entry_ptr: 0, matched_num_rows: 0, + returned_unmatched: false, }) } } @@ -293,7 +295,8 @@ impl<'a, Key: FixedKey + HashtableKeyable, const MATCHED: bool> ProbeStream for EarlyFilteringProbeStream<'a, Key, MATCHED> { fn advance(&mut self, res: &mut ProbedRows, max_rows: usize) -> Result<()> { - if !MATCHED { + if !MATCHED && !self.returned_unmatched { + self.returned_unmatched = true; res.unmatched .extend(self.unmatched_selection.iter().map(|x| *x as u64)); } diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/hashtable/serialize_keys.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/hashtable/serialize_keys.rs index 6d12973b1eeee..5744c8cf8fa1e 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/hashtable/serialize_keys.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/hashtable/serialize_keys.rs @@ -302,6 +302,7 @@ pub struct EarlyFilteringProbeStream<'a, const MATCHED: bool> { selections: &'a [u32], unmatched_selection: &'a [u32], matched_num_rows: usize, + returned_unmatched: bool, } impl<'a, const MATCHED: bool> EarlyFilteringProbeStream<'a, MATCHED> { @@ -319,13 +320,15 @@ impl<'a, const MATCHED: bool> EarlyFilteringProbeStream<'a, MATCHED> { idx: 0, probe_entry_ptr: 0, matched_num_rows: 0, + returned_unmatched: false, }) } } impl<'a, const MATCHED: bool> ProbeStream for EarlyFilteringProbeStream<'a, MATCHED> { fn advance(&mut self, res: &mut ProbedRows, max_rows: usize) -> Result<()> { - if !MATCHED { + if !MATCHED && !self.returned_unmatched { + self.returned_unmatched = true; res.unmatched .extend(self.unmatched_selection.iter().map(|x| *x as u64)); } From 4c6fa0ce01d91d21d37d2d0f4ca8341e7ddcbe2b Mon Sep 17 00:00:00 2001 From: zhang2014 Date: Mon, 6 Oct 2025 20:56:39 +0800 Subject: [PATCH 23/24] refactor(query): refactor left outer join to new join --- src/query/settings/src/settings_default.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/query/settings/src/settings_default.rs b/src/query/settings/src/settings_default.rs index d07eed2d0db19..6e321fa62aee8 100644 --- a/src/query/settings/src/settings_default.rs +++ b/src/query/settings/src/settings_default.rs @@ -1481,7 +1481,7 @@ impl DefaultSettings { range: Some(SettingRange::Numeric(0..=1)), }), ("enable_experimental_new_join", DefaultSettingValue { - value: UserSettingValue::UInt64(1), + value: UserSettingValue::UInt64(0), desc: "Enables the experimental new join implement", mode: SettingMode::Both, scope: SettingScope::Both, From 554a792c96790d5151827bc96e762988eb1be08c Mon Sep 17 00:00:00 2001 From: zhang2014 Date: Mon, 6 Oct 2025 23:28:54 +0800 Subject: [PATCH 24/24] refactor(query): refactor left outer join to new join --- .../sqllogictests/suites/query/join/left_outer.test | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/sqllogictests/suites/query/join/left_outer.test b/tests/sqllogictests/suites/query/join/left_outer.test index 3d95af9dead81..bbbfd8a53a12d 100644 --- a/tests/sqllogictests/suites/query/join/left_outer.test +++ b/tests/sqllogictests/suites/query/join/left_outer.test @@ -286,14 +286,14 @@ statement ok set max_block_size = 2; query I -SELECT * FROM t1 LEFT JOIN t2 ON t1.a = t2.a; +SELECT * FROM t1 LEFT JOIN t2 ON t1.a = t2.a ORDER BY t1.a, t2.b; ---- -2 3 2 10 -2 3 2 8 -2 3 2 6 -2 3 2 4 -2 3 2 2 1 2 NULL NULL +2 3 2 2 +2 3 2 4 +2 3 2 6 +2 3 2 8 +2 3 2 10 3 4 NULL NULL 4 5 NULL NULL 5 6 NULL NULL