Skip to content

Commit

Permalink
PERF-modin-project#6876: Skip the masking stage on 'iloc' where beneficial
Browse files Browse the repository at this point in the history

Signed-off-by: Dmitry Chigarev <dmitry.chigarev@intel.com>
  • Loading branch information
dchigarev committed Jan 24, 2024
1 parent 43134ef commit 8061516
Showing 1 changed file with 44 additions and 9 deletions.
53 changes: 44 additions & 9 deletions modin/core/dataframe/pandas/dataframe/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -1161,12 +1161,22 @@ def _take_2d_positional(
extra_log="Mask takes only list-like numeric indexers, "
+ f"received: {type(indexer)}",
)
if isinstance(indexer, list):
indexer = np.array(indexer, dtype=np.int64)
indexers.append(indexer)
row_positions, col_positions = indexers

if col_positions is None and row_positions is None:
return self.copy()

# a fairly cheap check: positions that are already in non-decreasing order do not need to be sorted
must_sort_row_pos = row_positions is not None and not np.all(
row_positions[1:] >= row_positions[:-1]
)
must_sort_col_pos = col_positions is not None and not np.all(
col_positions[1:] >= col_positions[:-1]
)

if col_positions is None and row_positions is not None:
# Check if the optimization that first takes part of the data using the mask
# operation so that later less data is concatenated into a whole column is useful.
Expand All @@ -1175,18 +1185,40 @@ def _take_2d_positional(
all_rows = None
if self.has_materialized_index:
all_rows = len(self.index)
elif self._row_lengths_cache:
elif self._row_lengths_cache or must_sort_row_pos:
all_rows = sum(self._row_lengths_cache)
if all_rows:
if len(row_positions) > 0.9 * all_rows:
return self._reorder_labels(
row_positions=row_positions, col_positions=col_positions
)

# 'base_num_cols' is the number of columns the dataframe should have in order to jump
# to '_reorder_labels' when len(row_positions) / len(self) >= 'base_ratio';
# both values are subject to change in order to tune performance more accurately
base_num_cols = 10
base_ratio = 0.2
# Example:
# len(self.columns): 10 == base_num_cols -> min ratio to jump to reorder_labels: 0.2 == base_ratio
# len(self.columns): 15 -> min ratio to jump to reorder_labels: 0.3
# len(self.columns): 20 -> min ratio to jump to reorder_labels: 0.4
# ...
# len(self.columns): 49 -> min ratio to jump to reorder_labels: 0.98
# len(self.columns): 50 -> min ratio to jump to reorder_labels: 1.0
# len(self.columns): 55 -> min ratio to jump to reorder_labels: 1.0
# ...
if (all_rows and len(row_positions) > 0.9 * all_rows) or (
must_sort_row_pos
and len(row_positions) * base_num_cols
>= min(
all_rows * len(self.columns) * base_ratio,
len(row_positions) * base_num_cols,
)
):
return self._reorder_labels(
row_positions=row_positions, col_positions=col_positions
)
sorted_row_positions = sorted_col_positions = None

if row_positions is not None:
sorted_row_positions = self._get_sorted_positions(row_positions)
if must_sort_row_pos:
sorted_row_positions = self._get_sorted_positions(row_positions)
else:
sorted_row_positions = row_positions
# Get dict of row_parts as {row_index: row_internal_indices}
row_partitions_dict = self._get_dict_of_block_index(
0, sorted_row_positions, are_indices_sorted=True
Expand All @@ -1201,7 +1233,10 @@ def _take_2d_positional(
new_index = self.copy_index_cache(copy_lengths=True)

if col_positions is not None:
sorted_col_positions = self._get_sorted_positions(col_positions)
if must_sort_col_pos:
sorted_col_positions = self._get_sorted_positions(col_positions)
else:
sorted_col_positions = col_positions
# Get dict of col_parts as {col_index: col_internal_indices}
col_partitions_dict = self._get_dict_of_block_index(
1, sorted_col_positions, are_indices_sorted=True
Expand Down

0 comments on commit 8061516

Please sign in to comment.