ClickHouse · vdimir · Apr 5, 2023 · Feb 14, 2023 · Feb 15, 2023 · Feb 15, 2023
diff --git a/docs/en/sql-reference/aggregate-functions/reference/quantileApprox.md b/docs/en/sql-reference/aggregate-functions/reference/quantileApprox.md
@@ -0,0 +1,76 @@
+---
+slug: /en/sql-reference/aggregate-functions/reference/quantileApprox
+sidebar_position: 204
+---
+
+# quantileApprox
+
+Computes the [quantile](https://en.wikipedia.org/wiki/Quantile) of a numeric data sequence using the [Greenwald-Khanna](http://infolab.stanford.edu/~datar/courses/cs361a/papers/quantiles.pdf) algorithm. The Greenwald-Khanna algorithm computes quantiles on a stream of data in a highly efficient manner. It was introduced by Michael Greenwald and Sanjeev Khanna in 2001. It is widely used in databases and big data systems where computing accurate quantiles on a large stream of data in real time is necessary. The algorithm is highly efficient, taking only O(log n) space and O(log log n) time per item (where n is the size of the input). It is also highly accurate, providing an approximate quantile value with high probability.
+
+`quantileApprox` differs from the other quantile functions in ClickHouse in that it lets the user control the accuracy of the approximate quantile result.
+
+**Syntax**
+
+``` sql
+quantileApprox(accuracy, level)(expr)
+```
+
+Alias: `medianApprox`.
+
+**Arguments**
+
+-   `accuracy` — Accuracy of quantile. Constant positive integer. A larger accuracy value means less error. For example, if the accuracy argument is set to 100, the computed quantile will have an error no greater than 1% with high probability. There is a trade-off between the accuracy of the computed quantiles and the computational complexity of the algorithm. A larger accuracy requires more memory and computational resources to compute the quantile accurately, while a smaller accuracy argument allows for a faster and more memory-efficient computation but with slightly lower accuracy.
+
+-   `level` — Level of quantile. Optional parameter. Constant floating-point number from 0 to 1. Default value: 0.5. At `level=0.5` the function calculates [median](https://en.wikipedia.org/wiki/Median).
+
+-   `expr` — Expression over the column values resulting in numeric [data types](../../../sql-reference/data-types/index.md#data_types), [Date](../../../sql-reference/data-types/date.md) or [DateTime](../../../sql-reference/data-types/datetime.md).
+
+
+**Returned value**
+
+-   Quantile of the specified level and accuracy.
+
+
+Type:
+
+-   [Float64](../../../sql-reference/data-types/float.md) for numeric data type input.
+-   [Date](../../../sql-reference/data-types/date.md) if input values have the `Date` type.
+-   [DateTime](../../../sql-reference/data-types/datetime.md) if input values have the `DateTime` type.
+
+**Example**
+
+``` sql
+SELECT quantileApprox(1, 0.25)(number + 1)
+FROM numbers(1000)
+
+┌─quantileApprox(1, 0.25)(plus(number, 1))─┐
+│                                        1 │
+└──────────────────────────────────────────┘
+
+SELECT quantileApprox(10, 0.25)(number + 1)
+FROM numbers(1000)
+
+┌─quantileApprox(10, 0.25)(plus(number, 1))─┐
+│                                       156 │
+└───────────────────────────────────────────┘
+
+SELECT quantileApprox(100, 0.25)(number + 1)
+FROM numbers(1000)
+
+┌─quantileApprox(100, 0.25)(plus(number, 1))─┐
+│                                        251 │
+└────────────────────────────────────────────┘
+
+SELECT quantileApprox(1000, 0.25)(number + 1)
+FROM numbers(1000)
+
+┌─quantileApprox(1000, 0.25)(plus(number, 1))─┐
+│                                         249 │
+└─────────────────────────────────────────────┘
+```
+
+
+**See Also**
+
+-   [median](../../../sql-reference/aggregate-functions/reference/median.md#median)
+-   [quantiles](../../../sql-reference/aggregate-functions/reference/quantiles.md#quantiles)
diff --git a/docs/en/sql-reference/aggregate-functions/reference/quantiles.md b/docs/en/sql-reference/aggregate-functions/reference/quantiles.md
@@ -114,3 +114,59 @@ Result:
 │ [249.75,499.5,749.25,899.1,949.05,989.01,998.001]                   │
 └─────────────────────────────────────────────────────────────────────┘
 ```
+
+## quantilesApprox
+
+`quantilesApprox` works like `quantileApprox`, but calculates quantiles at several levels simultaneously and returns them as an array.
+
+**Syntax**
+
+``` sql
+quantilesApprox(accuracy, level1, level2, ...)(expr)
+```
+
+**Returned value**
+
+-   [Array](../../../sql-reference/data-types/array.md) of quantiles of the specified levels.
+
+Type of array values:
+
+-   [Float64](../../../sql-reference/data-types/float.md) for numeric data type input.
+-   [Date](../../../sql-reference/data-types/date.md) if input values have the `Date` type.
+-   [DateTime](../../../sql-reference/data-types/datetime.md) if input values have the `DateTime` type.
+
+**Example**
+
+Query:
+
+
+``` sql
+SELECT quantilesApprox(1, 0.25, 0.5, 0.75)(number + 1)
+FROM numbers(1000)
+
+┌─quantilesApprox(1, 0.25, 0.5, 0.75)(plus(number, 1))─┐
+│ [1,1,1]                                              │
+└──────────────────────────────────────────────────────┘
+
+SELECT quantilesApprox(10, 0.25, 0.5, 0.75)(number + 1)
+FROM numbers(1000)
+
+┌─quantilesApprox(10, 0.25, 0.5, 0.75)(plus(number, 1))─┐
+│ [156,413,659]                                         │
+└───────────────────────────────────────────────────────┘
+
+
+SELECT quantilesApprox(100, 0.25, 0.5, 0.75)(number + 1)
+FROM numbers(1000)
+
+┌─quantilesApprox(100, 0.25, 0.5, 0.75)(plus(number, 1))─┐
+│ [251,498,741]                                          │
+└────────────────────────────────────────────────────────┘
+
+SELECT quantilesApprox(1000, 0.25, 0.5, 0.75)(number + 1)
+FROM numbers(1000)
+
+┌─quantilesApprox(1000, 0.25, 0.5, 0.75)(plus(number, 1))─┐
+│ [249,499,749]                                           │
+└─────────────────────────────────────────────────────────┘
+```
diff --git a/src/AggregateFunctions/AggregateFunctionQuantile.h b/src/AggregateFunctions/AggregateFunctionQuantile.h
@@ -26,9 +26,11 @@ namespace ErrorCodes
 {
     extern const int ILLEGAL_TYPE_OF_ARGUMENT;
     extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
+    extern const int BAD_ARGUMENTS;
 }
 
 template <typename> class QuantileTiming;
+template <typename> class QuantileApprox;
 
 
 /** Generic aggregate function for calculation of quantiles.
@@ -60,29 +62,65 @@ class AggregateFunctionQuantile final
     using ColVecType = ColumnVectorOrDecimal<Value>;
 
     static constexpr bool returns_float = !(std::is_same_v<FloatReturnType, void>);
+    static constexpr bool is_quantile_approx = std::is_same_v<Data, QuantileApprox<Value>>;
     static_assert(!is_decimal<Value> || !returns_float);
 
     QuantileLevels<Float64> levels;
 
     /// Used when there are single level to get.
     Float64 level = 0.5;
 
+    /// Accuracy for the approximate (Greenwald-Khanna) version only: set from params[0] in the constructor and passed to Data in create().
+    ssize_t accuracy = 10000;
+
     DataTypePtr & argument_type;
 
 public:
     AggregateFunctionQuantile(const DataTypes & argument_types_, const Array & params)
         : IAggregateFunctionDataHelper<Data, AggregateFunctionQuantile<Value, Data, Name, has_second_arg, FloatReturnType, returns_many>>(
             argument_types_, params, createResultType(argument_types_))
-        , levels(params, returns_many)
+        , levels(is_quantile_approx && !params.empty() ? Array(params.begin() + 1, params.end()) : params, returns_many) /// For the approximate variant params[0] is the accuracy, so only the remaining params are levels.
         , level(levels.levels[0])
         , argument_type(this->argument_types[0])
     {
         if (!returns_many && levels.size() > 1)
-            throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Aggregate function {} require one parameter or less", getName());
+            throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Aggregate function {} requires one level parameter or less", getName());
+
+        if constexpr (is_quantile_approx)
+        {
+            if (params.empty()) /// The accuracy parameter is mandatory for the approximate variant.
+                throw Exception(
+                    ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Aggregate function {} requires at least one param", getName());
+
+            const auto & accuracy_field = params[0]; /// First parameter is the accuracy; it was excluded from `levels` above.
+            if (!isInt64OrUInt64FieldType(accuracy_field.getType()))
+                throw Exception(
+                    ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Aggregate function {} requires accuracy parameter with integer type", getName());
+
+            if (accuracy_field.getType() == Field::Types::Int64) /// The type check above passed, so anything that is not Int64 is read as UInt64.
+                accuracy = accuracy_field.get<Int64>();
+            else
+                accuracy = accuracy_field.get<UInt64>(); /// NOTE(review): UInt64 -> ssize_t conversion; values above the ssize_t range presumably wrap negative and are rejected below - confirm.
+
+            if (accuracy <= 0) /// Zero and negative accuracies are rejected.
+                throw Exception(
+                    ErrorCodes::BAD_ARGUMENTS,
+                    "Aggregate function {} requires accuracy parameter with positive value but is {}",
+                    getName(),
+                    accuracy);
+        }
     }
 
     String getName() const override { return Name::name; }
 
+    void create(AggregateDataPtr __restrict place) const override /// NOLINT
+    {
+        if constexpr (is_quantile_approx) /// Chosen at compile time from the Data template argument.
+            new (place) Data(accuracy); /// The approximate state is constructed in place with the accuracy parsed by the constructor.
+        else
+            new (place) Data; /// Every other quantile state is constructed in place without arguments.
+    }
+
     static DataTypePtr createResultType(const DataTypes & argument_types_)
     {
         DataTypePtr res;
@@ -250,4 +288,7 @@ struct NameQuantilesBFloat16 { static constexpr auto name = "quantilesBFloat16";
 struct NameQuantileBFloat16Weighted { static constexpr auto name = "quantileBFloat16Weighted"; };
 struct NameQuantilesBFloat16Weighted { static constexpr auto name = "quantilesBFloat16Weighted"; };
 
+struct NameQuantileApprox { static constexpr auto name = "quantileApprox"; }; /// Name tag carrying the SQL name of the single-level function.
+struct NameQuantilesApprox { static constexpr auto name = "quantilesApprox"; }; /// Name tag carrying the SQL name of the multi-level function.
+
 }
diff --git a/src/AggregateFunctions/AggregateFunctionQuantileApprox.cpp b/src/AggregateFunctions/AggregateFunctionQuantileApprox.cpp
@@ -0,0 +1,71 @@
+#include <AggregateFunctions/AggregateFunctionQuantile.h>
+#include <AggregateFunctions/QuantileApprox.h>
+#include <AggregateFunctions/AggregateFunctionFactory.h>
+#include <AggregateFunctions/Helpers.h>
+#include <DataTypes/DataTypeDate.h>
+#include <DataTypes/DataTypeDateTime.h>
+#include <Core/Field.h>
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+    extern const int ILLEGAL_TYPE_OF_ARGUMENT;
+}
+
+namespace
+{
+
+template <typename Value, bool _> using FuncQuantileApprox = AggregateFunctionQuantile<Value, QuantileApprox<Value>, NameQuantileApprox, false, void, false>; /// Single level (last argument false); the unused bool only matches the `template <typename, bool> class Function` shape expected by the factory helper below.
+template <typename Value, bool _> using FuncQuantilesApprox = AggregateFunctionQuantile<Value, QuantileApprox<Value>, NameQuantilesApprox, false, void, true>; /// Same as above, but the last argument is true; it is the one passed as returns_many.
+
+template <template <typename, bool> class Function> /// Function is one of the alias templates above; they ignore the bool argument, so true/false below make no difference for them.
+AggregateFunctionPtr createAggregateFunctionQuantile(
+    const std::string & name, const DataTypes & argument_types, const Array & params, const Settings *)
+{
+    /// Second argument type check doesn't depend on the type of the first one, so it is called on the `void` instantiation.
+    Function<void, true>::assertSecondArg(argument_types);
+
+    const DataTypePtr & argument_type = argument_types[0]; /// Dispatch below is driven by the type of the first argument only.
+    WhichDataType which(argument_type);
+
+#define DISPATCH(TYPE) \
+    if (which.idx == TypeIndex::TYPE) \
+        return std::make_shared<Function<TYPE, true>>(argument_types, params);
+    FOR_BASIC_NUMERIC_TYPES(DISPATCH) /// Presumably expands DISPATCH once per basic numeric type - see the macro definition.
+#undef DISPATCH
+
+    if (which.idx == TypeIndex::Date) return std::make_shared<Function<DataTypeDate::FieldType, false>>(argument_types, params); /// Date and DateTime are instantiated on their underlying field type.
+    if (which.idx == TypeIndex::DateTime) return std::make_shared<Function<DataTypeDateTime::FieldType, false>>(argument_types, params);
+
+    if (which.idx == TypeIndex::Decimal32) return std::make_shared<Function<Decimal32, false>>(argument_types, params);
+    if (which.idx == TypeIndex::Decimal64) return std::make_shared<Function<Decimal64, false>>(argument_types, params);
+    if (which.idx == TypeIndex::Decimal128) return std::make_shared<Function<Decimal128, false>>(argument_types, params);
+    if (which.idx == TypeIndex::Decimal256) return std::make_shared<Function<Decimal256, false>>(argument_types, params);
+    if (which.idx == TypeIndex::DateTime64) return std::make_shared<Function<DateTime64, false>>(argument_types, params);
+
+    if (which.idx == TypeIndex::Int128) return std::make_shared<Function<Int128, true>>(argument_types, params);
+    if (which.idx == TypeIndex::UInt128) return std::make_shared<Function<UInt128, true>>(argument_types, params);
+    if (which.idx == TypeIndex::Int256) return std::make_shared<Function<Int256, true>>(argument_types, params);
+    if (which.idx == TypeIndex::UInt256) return std::make_shared<Function<UInt256, true>>(argument_types, params);
+
+    throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Illegal type {} of argument for aggregate function {}", /// Reached only when no type above matched.
+                    argument_type->getName(), name);
+}
+
+}
+
+void registerAggregateFunctionsQuantileApprox(AggregateFunctionFactory & factory)
+{
+    /// For aggregate functions returning array we cannot return NULL on empty set.
+    AggregateFunctionProperties properties = { .returns_default_when_only_null = true };
+
+    factory.registerFunction(NameQuantileApprox::name, createAggregateFunctionQuantile<FuncQuantileApprox>);
+    factory.registerFunction(NameQuantilesApprox::name, {createAggregateFunctionQuantile<FuncQuantilesApprox>, properties}); /// Only the multi-level variant is registered with the properties above.
+
+    /// 'medianApprox' is an alias for 'quantileApprox'
+    factory.registerAlias("medianApprox", NameQuantileApprox::name);
+}
+
+}