enso-org · mergify · Jan 11, 2023 · Dec 30, 2022 · Jan 2, 2023 · Jan 2, 2023
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -270,6 +270,7 @@
 - [Overhauled the JSON support (now based of JavaScript), `Data.fetch` and other
   minor tweaks][3987]
 - [Enable Date, Time and DateTime to be read and written to Excel.][3997]
+- [Implemented `Table.distinct` for Database backends.][4027]
 
 [debug-shortcuts]:
   https://github.com/enso-org/enso/blob/develop/app/gui/docs/product/shortcuts.md#debug
@@ -423,6 +424,7 @@
 [3987]: https://github.com/enso-org/enso/pull/3987
 [3997]: https://github.com/enso-org/enso/pull/3997
 [4013]: https://github.com/enso-org/enso/pull/4013
+[4027]: https://github.com/enso-org/enso/pull/4027
 
 #### Enso Compiler
 

@@ -126,7 +126,7 @@ type Connection
             Database_Table_Module.make_table self name columns ctx
         SQL_Query.Table_Name name ->
             ctx = Context.for_table name (if alias == "" then name else alias)
-            columns = self.jdbc_connection.fetch_columns (self.dialect.generate_sql (Query.Select_All ctx))
+            columns = self.jdbc_connection.fetch_columns (self.dialect.generate_sql (Query.Select Nothing ctx))
             Database_Table_Module.make_table self name columns ctx
 
     ## Execute the query and load the results into memory as a Table.

@@ -2,6 +2,7 @@ from Standard.Base import all
 import Standard.Base.Error.Unimplemented.Unimplemented
 
 from Standard.Table import Aggregate_Column, Join_Kind
+import Standard.Table.Internal.Problem_Builder.Problem_Builder
 
 import project.Connection.Connection.Connection
 import project.Data.SQL_Statement.SQL_Statement
@@ -56,6 +57,12 @@ type Dialect
     prepare_join self =
         Unimplemented.throw "This is an interface only."
 
+    ## PRIVATE
+       Prepares a distinct operation (interface stub; it always throws).
+    prepare_distinct : Table -> Vector -> Boolean -> Problem_Builder -> Table
+    prepare_distinct self =
+        Unimplemented.throw "This is an interface only."
+
 ## PRIVATE
 
    The dialect of SQLite databases.

@@ -23,7 +23,7 @@ import Standard.Table.Internal.Aggregate_Column_Helper
 from Standard.Table.Data.Column import get_item_string
 from Standard.Table.Data.Table import print_table
 from Standard.Table.Internal.Filter_Condition_Helpers import make_filter_column
-from Standard.Table.Errors import Column_Count_Mismatch, No_Index_Set_Error, No_Such_Column
+from Standard.Table.Errors import Column_Count_Mismatch, No_Index_Set_Error, No_Such_Column, No_Input_Columns_Selected, No_Output_Columns
 
 import project.Data.Column.Column
 import project.Data.SQL_Statement.SQL_Statement
@@ -624,7 +624,9 @@ type Table
        input table.
 
        When multiple rows have the same values within the specified columns, the
-       first row of each such set is returned.
+       first row of each such set is returned if possible, but in database
+       backends any row from each set may be returned (for example if the row
+       ordering is unspecified).
 
        For the in-memory table, the unique rows will be in the order they
        occurred in the input (this is not guaranteed for database operations).
@@ -649,8 +651,19 @@ type Table
            `Floating_Point_Grouping` warning.
     distinct : Vector Text | Column_Selector -> Case_Sensitivity -> Problem_Behavior -> Table
     distinct self (columns = Column_Selector.By_Name (self.columns.map .name)) case_sensitivity=Case_Sensitivity.Sensitive on_problems=Report_Warning =
-        _ = [columns, case_sensitivity, on_problems]
-        Error.throw (Unsupported_Database_Operation.Error "`Table.distinct` is not yet implemented for the database backend.")
+        problems = Problem_Builder.new
+        # Re-report `No_Output_Columns` from the selection as `No_Input_Columns_Selected`.
+        as_input_problem error = case error of
+            No_Output_Columns -> Maybe.Some No_Input_Columns_Selected
+            _ -> Nothing
+        keys = Warning.map_warnings_and_errors as_input_problem <|
+            self.columns_helper.select_columns selector=columns reorder=True on_problems=on_problems
+        ignore_case = case case_sensitivity of
+            Case_Sensitivity.Sensitive -> False
+            Case_Sensitivity.Insensitive locale ->
+                Helpers.assume_default_locale locale True
+        deduplicated = self.connection.dialect.prepare_distinct self keys ignore_case problems
+        problems.attach_problems_before on_problems deduplicated
 
     ## Joins two tables according to the specified join conditions.
 
@@ -800,7 +813,7 @@ type Table
             new_columns = partitioned.first
             problems = partitioned.second
             on_problems.attach_problems_before problems <|
-                self.updated_context_and_columns new_ctx new_columns
+                self.updated_context_and_columns new_ctx new_columns subquery=True
 
     ## Returns a new table with a chosen subset of columns left unchanged and
        the other columns pivoted to rows with a single name field and a single
@@ -908,7 +921,7 @@ type Table
            computing too much we do not pass all the columns but only the first
            one.
         setup = self.context.as_subquery self.name [[self.internal_columns.first]]
-        new_ctx = Context.for_subquery setup.first
+        new_ctx = Context.for_subquery setup.subquery
         query = Query.Select [[column_name, expr]] new_ctx
         sql = self.connection.dialect.generate_sql query
         table = self.connection.read_statement sql
@@ -957,8 +970,8 @@ type Table
                Naively wrapping each column in a `COUNT(...)` will not
                always work as aggregates cannot be nested.
             setup = self.context.as_subquery self.name [self.internal_columns]
-            new_ctx = Context.for_subquery setup.first
-            new_columns = setup.second.first.map column->
+            new_ctx = Context.for_subquery setup.subquery
+            new_columns = setup.new_columns.first.map column->
                 [column.name, SQL_Expression.Operation "COUNT" [column.expression]]
             query = Query.Select new_columns new_ctx
             self.connection.dialect.generate_sql query
@@ -1007,8 +1020,24 @@ type Table
        Arguments:
        - ctx: The new context for this table.
        - internal_columns: The new columns to include in the table.
+       - subquery: A boolean indicating whether the operation should be wrapped
+         in a subquery. This is a simple workaround for operations which may be
+         affected by further operations if not wrapped. For example, a group-by
+         may need to be wrapped in this way if a filter is to be performed on it
+         later on. Ideally, this should be done only on demand, if the
+         subsequent operation needs it and operations like join should try to
+         avoid nesting subqueries without necessity. However, for now, for
+         simplicity, we are always wrapping brittle operations. This may be
+         revised in the future, to generate better and more concise SQL code.
     updated_context_and_columns : Context -> Vector Internal_Column -> Table
-    updated_context_and_columns self ctx internal_columns = Table.Value self.name self.connection internal_columns ctx
+    updated_context_and_columns self ctx internal_columns subquery=False = case subquery of
+        True ->
+            setup = ctx.as_subquery self.name [internal_columns]
+            new_ctx = Context.for_subquery setup.subquery
+            new_columns = setup.new_columns.first
+            Table.Value self.name self.connection new_columns new_ctx
+        False ->
+            Table.Value self.name self.connection internal_columns ctx
 
     ## PRIVATE
 

@@ -178,7 +178,7 @@ base_dialect =
     functions = [["COALESCE", make_function "COALESCE"], ["ROW_MIN", make_function "MIN"], ["ROW_MAX", make_function "MAX"]]
     agg = [fun "MAX", fun "MIN", fun "AVG", fun "SUM"]
     counts = [fun "COUNT", ["COUNT_ROWS", make_constant "COUNT(*)"]]
-    text = [is_empty, bin "LIKE", simple_equals_ignore_case]
+    text = [is_empty, bin "LIKE", simple_equals_ignore_case, fold_case]
     nulls = [["IS_NULL", make_right_unary_op "IS NULL"], ["FILL_NULL", make_function "COALESCE"]]
     contains = [["IS_IN", make_is_in], ["IS_IN_COLUMN", make_is_in_column]]
     base_map = Map.from_vector (arith + logic + compare + functions + agg + counts + text + nulls + contains)
@@ -293,6 +293,11 @@ generate_from_part dialect from_spec = case from_spec of
         sub = generate_query dialect (Query.Select columns context)
         sub.paren ++ alias dialect as_name
 
+## PRIVATE
+   Case-folds a text expression by applying `UPPER` and then `LOWER` to it.
+fold_case = lift_unary_op "FOLD_CASE" operand->
+    code "LOWER(UPPER(" ++ operand ++ "))"
+
 ## PRIVATE
 simple_equals_ignore_case = Base_Generator.lift_binary_op "equals_ignore_case" a-> b->
     code "LOWER(UPPER(" ++ a ++ ")) = LOWER(UPPER(" ++ b ++ "))"
@@ -377,10 +382,16 @@ generate_query : Internal_Dialect -> Query -> Builder
 generate_query dialect query = case query of
     Query.Select columns ctx ->
         gen_column pair = (generate_expression dialect pair.second) ++ alias dialect pair.first
-        cols = SQL.join ", " (columns.map gen_column)
-        code "SELECT " ++ cols ++ generate_select_context dialect ctx
-    Query.Select_All ctx ->
-        code "SELECT * " ++ generate_select_context dialect ctx
+        cols = case columns of
+            Nothing -> code "*"
+            _ -> SQL.join ", " (columns.map gen_column)
+        prefix = case ctx.distinct_on of
+            Nothing -> code ""
+            expressions : Vector ->
+                # TODO `DISTINCT ON` is Postgres-specific: other backends should fail here, or this should become a generic modifier or transform.
+                generated = SQL.join ", " (expressions.map (generate_expression dialect))
+                code "DISTINCT ON (" ++ generated ++ ") "
+        code "SELECT " ++ prefix ++ cols ++ generate_select_context dialect ctx
     Query.Insert table_name pairs ->
         generate_insert_query dialect table_name pairs
     _ -> Error.throw <| Unsupported_Database_Operation.Error "Unsupported query type."

@@ -0,0 +1,17 @@
+from Standard.Base import all
+
+from Standard.Table.Errors import Floating_Point_Grouping
+
+import project.Internal.IR.SQL_Expression.SQL_Expression
+
+## PRIVATE
+   Returns the expression of `key_column`, wrapped in `FOLD_CASE` when
+   `text_case_insensitive` is set and the column is definitely text. Reports
+   `Floating_Point_Grouping` to `problem_builder` for definitely-double columns.
+make_distinct_expression text_case_insensitive problem_builder key_column =
+    column_type = key_column.sql_type
+    if column_type.is_definitely_double then
+        problem_builder.report_other_warning (Floating_Point_Grouping.Error key_column.name)
+
+    fold_text = text_case_insensitive && column_type.is_definitely_text
+    if fold_text then SQL_Expression.Operation "FOLD_CASE" [key_column.expression] else key_column.expression
@@ -105,16 +105,14 @@ prepare_subqueries left right needs_left_indicator needs_right_indicator =
     # TODO [RW] Not all of these included columns are actually usable from the external context, so
     # in the future we may consider pruning some of them as additional optimization and simplification of the query
     # (the only columns that are needed are ones that the later performed join resolution needs).
-    left_config = left.context.as_subquery left_alias [left.internal_columns, left_indicators]
-    right_config = right.context.as_subquery right_alias [right.internal_columns, right_indicators]
-
-    left_subquery = left_config.first
-    new_left_columns = left_config.second.at 0
-    new_left_indicators = left_config.second.at 1
-    right_subquery = right_config.first
-    new_right_columns = right_config.second.at 0
-    new_right_indicators = right_config.second.at 1
-
-    left_setup = Join_Subquery_Setup.Value left_subquery new_left_columns left.internal_columns (new_left_indicators.get 0)
-    right_setup = Join_Subquery_Setup.Value right_subquery new_right_columns right.internal_columns (new_right_indicators.get 0)
+    left_sub = left.context.as_subquery left_alias [left.internal_columns, left_indicators]
+    right_sub = right.context.as_subquery right_alias [right.internal_columns, right_indicators]
+
+    new_left_columns = left_sub.new_columns.first
+    new_left_indicators = left_sub.new_columns.second
+    new_right_columns = right_sub.new_columns.first
+    new_right_indicators = right_sub.new_columns.second
+
+    left_setup = Join_Subquery_Setup.Value left_sub.subquery new_left_columns left.internal_columns (new_left_indicators.get 0)
+    right_setup = Join_Subquery_Setup.Value right_sub.subquery new_right_columns right.internal_columns (new_right_indicators.get 0)
     Pair.new left_setup right_setup
@@ -19,7 +19,7 @@ type Context
        - alias: An alias name to use for table within the query.
     for_table : Text -> Text -> Context
     for_table table_name alias=table_name =
-        Context.Value (From_Spec.Table table_name alias) [] [] [] Nothing
+        Context.Value (From_Spec.Table table_name alias) [] [] [] Nothing Nothing
 
     ## PRIVATE
 
@@ -30,7 +30,7 @@ type Context
        - alias: An alias name to use for table within the query.
     for_query : Text -> Text -> Context
     for_query raw_sql alias =
-        Context.Value (From_Spec.Query raw_sql alias) [] [] [] Nothing
+        Context.Value (From_Spec.Query raw_sql alias) [] [] [] Nothing Nothing
 
     ## PRIVATE
 
@@ -40,7 +40,7 @@ type Context
        - subquery: The subquery to lift into a context.
     for_subquery : From_Spec -> Context
     for_subquery subquery =
-        Context.Value subquery [] [] [] Nothing
+        Context.Value subquery [] [] [] Nothing Nothing
 
     ## PRIVATE
 
@@ -63,7 +63,7 @@ type Context
          grouped-by columns or aggregate expressions.
        - limit: an optional maximum number of elements that the equery should
          return.
-    Value (from_spec : From_Spec) (where_filters : Vector SQL_Expression) (orders : Vector Order_Descriptor) (groups : Vector SQL_Expression) (limit : Nothing | Integer)
+    Value (from_spec : From_Spec) (where_filters : Vector SQL_Expression) (orders : Vector Order_Descriptor) (groups : Vector SQL_Expression) (limit : Nothing | Integer) (distinct_on : Nothing | Vector SQL_Expression)
 
     ## PRIVATE
 
@@ -73,7 +73,7 @@ type Context
        - new_filters: The new filters to set in the query.
     set_where_filters : Vector SQL_Expression -> Context
     set_where_filters self new_filters =
-        Context.Value self.from_spec new_filters self.orders self.groups self.limit
+        Context.Value self.from_spec new_filters self.orders self.groups self.limit self.distinct_on
 
     ## PRIVATE
 
@@ -83,7 +83,7 @@ type Context
        - new_orders: The new ordering clauses to set in the query.
     set_orders : Vector Order_Descriptor -> Context
     set_orders self new_orders =
-        Context.Value self.from_spec self.where_filters new_orders self.groups self.limit
+        Context.Value self.from_spec self.where_filters new_orders self.groups self.limit self.distinct_on
 
     ## PRIVATE
 
@@ -100,7 +100,7 @@ type Context
        - new_orders: The new ordering clauses to add to the query.
     add_orders : Vector Order_Descriptor -> Context
     add_orders self new_orders =
-        Context.Value self.from_spec self.where_filters new_orders+self.orders self.groups self.limit
+        Context.Value self.from_spec self.where_filters new_orders+self.orders self.groups self.limit self.distinct_on
 
     ## PRIVATE
 
@@ -110,7 +110,7 @@ type Context
        - new_groups: The new grouping clauses to set in the query.
     set_groups : Vector SQL_Expression -> Context
     set_groups self new_groups =
-        Context.Value self.from_spec self.where_filters self.orders new_groups self.limit
+        Context.Value self.from_spec self.where_filters self.orders new_groups self.limit self.distinct_on
 
     ## PRIVATE
 
@@ -120,7 +120,14 @@ type Context
        - new_limit: The new limit clauses to set in the query.
     set_limit : (Nothing | Integer) -> Context
     set_limit self new_limit =
-       Context.Value self.from_spec self.where_filters self.orders self.groups new_limit
+       Context.Value self.from_spec self.where_filters self.orders self.groups new_limit self.distinct_on
+
+    ## PRIVATE
+
+       Returns a copy of the context with the `distinct_on` expressions replaced.
+    set_distinct_on : (Nothing | Vector SQL_Expression) -> Context
+    set_distinct_on self new_distinct_on =
+       Context.Value self.from_spec self.where_filters self.orders self.groups self.limit new_distinct_on
 
     ## PRIVATE
 
@@ -136,8 +143,7 @@ type Context
        to one from the original list but it is valid in the new context.
 
        This is useful as a preprocessing step between combining queries, for example in a join.
-    # as_subquery : Text -> Vector (Vector Internal_Column) -> [From_Spec.Sub_Query, Vector (Vector Internal_Column)]
-    as_subquery : Text -> Vector Any -> Vector
+    as_subquery : Text -> Vector (Vector Internal_Column) -> Subquery_Setup
     as_subquery self alias column_lists =
         rewrite_internal_column : Internal_Column -> Internal_Column
         rewrite_internal_column column =
@@ -150,4 +156,7 @@ type Context
             columns.map column-> [column.name, column.expression]
         new_from = From_Spec.Sub_Query encapsulated_columns self alias
 
-        [new_from, new_columns]
+        Subquery_Setup.Value new_from new_columns
+
+type Subquery_Setup
+    Value (subquery : From_Spec) (new_columns : Vector (Vector Internal_Column))
@@ -15,17 +15,10 @@ type Query
        Arguments:
        - expressions: List of pairs specifying the columns to materialize; each
          is a pair whose first element is the name of the materialized column
-         and the second element is the expression to compute.
+         and the second element is the expression to compute. If `Nothing` is
+         provided, all available columns will be selected.
        - context: The query context, see `Context` for more detail.
-    Select (expressions : Vector (Pair Text SQL_Expression)) (context : Context)
-
-    ## PRIVATE
-
-       A Select SQL query that gets all columns in a table.
-
-       Arguments:
-       - context: The query context, see `Context` for more detail.
-    Select_All context
+    Select (expressions : Nothing | Vector (Pair Text SQL_Expression)) (context : Context)
 
     ## PRIVATE