Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 10 additions & 6 deletions bigframes/core/compile/sqlglot/compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,13 +153,17 @@ def compile_sql_select(node: sql_nodes.SqlSelectNode, child: ir.SQLGlotIR):
for ordering in node.sorting
)

projected_cols: tuple[tuple[str, sge.Expression], ...] = tuple(
(
cdef.id.sql,
expression_compiler.expression_compiler.compile_expression(cdef.expression),
projected_cols: tuple[tuple[str, sge.Expression], ...] = tuple()
if not node.is_star_selection:
projected_cols = tuple(
(
cdef.id.sql,
expression_compiler.expression_compiler.compile_expression(
cdef.expression
),
)
for cdef in node.selections
)
for cdef in node.selections
)

sge_predicates = tuple(
expression_compiler.expression_compiler.compile_expression(expression)
Expand Down
25 changes: 14 additions & 11 deletions bigframes/core/compile/sqlglot/sqlglot_ir.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,7 @@ def from_table(
if sql_predicate:
select_expr = sge.Select().select(sge.Star()).from_(table_expr)
select_expr = select_expr.where(
sg.parse_one(sql_predicate, dialect="bigquery"), append=False
sg.parse_one(sql_predicate, dialect=cls.dialect), append=False
)
return cls(expr=select_expr, uid_gen=uid_gen)

Expand All @@ -172,16 +172,19 @@ def select(
if len(sorting) > 0:
new_expr = new_expr.order_by(*sorting)

to_select = [
sge.Alias(
this=expr,
alias=sge.to_identifier(id, quoted=self.quoted),
)
if expr.alias_or_name != id
else expr
for id, expr in selections
]
new_expr = new_expr.select(*to_select, append=False)
if len(selections) > 0:
to_select = [
sge.Alias(
this=expr,
alias=sge.to_identifier(id, quoted=self.quoted),
)
if expr.alias_or_name != id
else expr
for id, expr in selections
]
new_expr = new_expr.select(*to_select, append=False)
else:
new_expr = new_expr.select(sge.Star(), append=False)

if len(predicates) > 0:
condition = _and(predicates)
Expand Down
9 changes: 4 additions & 5 deletions bigframes/core/rewrite/select_pullup.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,13 +54,12 @@ def pull_up_source_ids(node: nodes.ReadTableNode) -> nodes.BigFrameNode:
if all(id.sql == source_id for id, source_id in node.scan_list.items):
return node
else:
source_ids = sorted(
set(scan_item.source_id for scan_item in node.scan_list.items)
)
new_scan_list = nodes.ScanList.from_items(
[
nodes.ScanItem(identifiers.ColumnId(source_id), source_id)
for source_id in source_ids
nodes.ScanItem(
identifiers.ColumnId(scan_item.source_id), scan_item.source_id
)
for scan_item in node.scan_list.items
Comment on lines +59 to +62
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think what we really want is to order by the underlying physical schema?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

When generating SQL, we should respect the original order of the selected columns and avoid reordering them to match the physical schema. While reordering can reveal more SELECT * optimizations when the query involves intermediate CTEs or subqueries, the current logic sorts the scan_list by the alphabetical ordering of column names. This approach hides potential SELECT * optimizations, particularly in use cases like bpd.read_table("table_name").

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I still think this should ideally be ordered by physical schema id, but I think it mostly doesn't matter when combined with other rewriters.

]
)
new_source = dataclasses.replace(node, scan_list=new_scan_list)
Expand Down
4 changes: 4 additions & 0 deletions bigframes/core/sql_nodes.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,10 @@ def consumed_ids(self):
def _node_expressions(self):
raise NotImplementedError()

# Whether this select node outputs exactly its child's columns: True when
# the node's ids, taken as a tuple, equal the child's ids in the same order.
# The SQL compiler consults this flag to skip building explicit projections.
# NOTE(review): only ids are compared, not the expressions bound to them;
# presumably matching ids imply plain pass-through columns -- confirm.
@property
def is_star_selection(self) -> bool:
return tuple(self.ids) == tuple(self.child.ids)

# Build a dict mapping each selection's column id to the expression that
# defines it, one entry per item in self.selections.
# The result is memoized with functools.cache, keyed on the call arguments
# (here just `self`), so `self` must be hashable.
# NOTE(review): functools.cache on a method holds a reference to every
# instance it has been called on for the cache's lifetime (ruff B019) --
# confirm that retention is acceptable for these nodes.
@functools.cache
def get_id_mapping(self) -> dict[identifiers.ColumnId, ex.Expression]:
return {cdef.id: cdef.expression for cdef in self.selections}
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
WITH `bfcte_0` AS (
SELECT
`float64_col`,
`int64_col`
`int64_col`,
`float64_col`
FROM `bigframes-dev`.`sqlglot_test`.`scalar_types`
), `bfcte_1` AS (
SELECT
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
WITH `bfcte_0` AS (
SELECT
`float64_col`,
`int64_col`
`int64_col`,
`float64_col`
FROM `bigframes-dev`.`sqlglot_test`.`scalar_types`
), `bfcte_1` AS (
SELECT
Expand Down
Original file line number Diff line number Diff line change
@@ -1,20 +1,6 @@
WITH `bfcte_0` AS (
SELECT
`bool_col`,
`bytes_col`,
`date_col`,
`datetime_col`,
`duration_col`,
`float64_col`,
`geography_col`,
`int64_col`,
`int64_too`,
`numeric_col`,
`rowindex`,
`rowindex_2`,
`string_col`,
`time_col`,
`timestamp_col`
*
FROM `bigframes-dev`.`sqlglot_test`.`scalar_types`
), `bfcte_1` AS (
SELECT
Expand Down
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
WITH `bfcte_0` AS (
SELECT
`bool_col`,
`duration_col`,
`int64_col`,
`duration_col`,
`int64_col` AS `bfcol_6`,
`bool_col` AS `bfcol_7`,
`duration_col` AS `bfcol_8`
Expand Down
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
WITH `bfcte_0` AS (
SELECT
`bool_col`,
`duration_col`,
`int64_col`,
`duration_col`,
`int64_col` AS `bfcol_6`,
`bool_col` AS `bfcol_7`,
`duration_col` AS `bfcol_8`
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
WITH `bfcte_0` AS (
SELECT
`int_list_col`,
`rowindex`,
`int_list_col`,
`string_list_col`
FROM `bigframes-dev`.`sqlglot_test`.`repeated_types`
), `bfcte_1` AS (
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
WITH `bfcte_0` AS (
SELECT
`int_list_col`,
`rowindex`
`rowindex`,
`int_list_col`
FROM `bigframes-dev`.`sqlglot_test`.`repeated_types`
), `bfcte_1` AS (
SELECT
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,5 @@ WITH `bfcte_0` AS (
`rowindex` > 0 AND `string_col` IN ('Hello, World!')
)
SELECT
`rowindex`,
`int64_col`,
`string_col`
*
FROM `bfcte_0`
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
SELECT
`rowindex`,
`json_col`
*
FROM `bigframes-dev`.`sqlglot_test`.`json_types`
Original file line number Diff line number Diff line change
@@ -1,17 +1,3 @@
SELECT
`bool_col`,
`bytes_col`,
`date_col`,
`datetime_col`,
`geography_col`,
`int64_col`,
`int64_too`,
`numeric_col`,
`float64_col`,
`rowindex`,
`rowindex_2`,
`string_col`,
`time_col`,
`timestamp_col`,
`duration_col`
*
FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` FOR SYSTEM_TIME AS OF '2025-11-09T03:04:05.678901+00:00'
Loading