Skip to content

Commit

Permalink
added support for the stddev, variance and geometric_mean aggregation…
Browse files Browse the repository at this point in the history
… functions
  • Loading branch information
msbt committed Jan 28, 2015
1 parent d76b41d commit 11e393d
Show file tree
Hide file tree
Showing 28 changed files with 1,245 additions and 71 deletions.
3 changes: 3 additions & 0 deletions CHANGES.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,9 @@ Changes for Crate
Unreleased
==========

- Added support for the ``std_dev``, ``variance`` and ``geometric_mean``
aggregation functions

- Added support for the CURRENT_TIMESTAMP expression

- Throw an error while using unknown column references in ``SELECT``
Expand Down
2 changes: 1 addition & 1 deletion core/src/main/java/io/crate/types/BooleanType.java
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@
import java.util.Locale;
import java.util.Map;

public class BooleanType extends DataType<Boolean> implements DataTypeFactory, Streamer<Boolean>, FixedWithType {
public class BooleanType extends DataType<Boolean> implements DataTypeFactory, Streamer<Boolean>, FixedWidthType {

public static final int ID = 3;
public static final BooleanType INSTANCE = new BooleanType();
Expand Down
2 changes: 1 addition & 1 deletion core/src/main/java/io/crate/types/ByteType.java
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@

import java.io.IOException;

public class ByteType extends DataType<Byte> implements DataTypeFactory, Streamer<Byte>, FixedWithType {
public class ByteType extends DataType<Byte> implements DataTypeFactory, Streamer<Byte>, FixedWidthType {

public final static ByteType INSTANCE = new ByteType();
public final static int ID = 2;
Expand Down
2 changes: 1 addition & 1 deletion core/src/main/java/io/crate/types/DoubleType.java
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@

import java.io.IOException;

public class DoubleType extends DataType<Double> implements FixedWithType, Streamer<Double>, DataTypeFactory {
public class DoubleType extends DataType<Double> implements FixedWidthType, Streamer<Double>, DataTypeFactory {

public static final DoubleType INSTANCE = new DoubleType();
public static final int ID = 6;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,11 +24,11 @@
/**
* A type that has a fixed size for every value
*/
public interface FixedWithType {
public interface FixedWidthType {

/**
* The fixed amount of memory a value object instance of type t requires.
* (t is the type described by our DataType interface or something that implements FixedWithType)
* (t is the type described by our DataType interface or something that implements FixedWidthType)
*
*
* Implementations here may not be 100% accurate because sizes may vary between JVM implementations
Expand Down
2 changes: 1 addition & 1 deletion core/src/main/java/io/crate/types/FloatType.java
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@

import java.io.IOException;

public class FloatType extends DataType<Float> implements Streamer<Float>, DataTypeFactory, FixedWithType {
public class FloatType extends DataType<Float> implements Streamer<Float>, DataTypeFactory, FixedWidthType {

public static final FloatType INSTANCE = new FloatType();
public static final int ID = 7;
Expand Down
2 changes: 1 addition & 1 deletion core/src/main/java/io/crate/types/GeoPointType.java
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@
import java.util.Arrays;
import java.util.List;

public class GeoPointType extends DataType<Double[]> implements Streamer<Double[]>, DataTypeFactory, FixedWithType {
public class GeoPointType extends DataType<Double[]> implements Streamer<Double[]>, DataTypeFactory, FixedWidthType {

public static final int ID = 13;
public static final GeoPointType INSTANCE = new GeoPointType();
Expand Down
2 changes: 1 addition & 1 deletion core/src/main/java/io/crate/types/IntegerType.java
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@

import java.io.IOException;

public class IntegerType extends DataType<Integer> implements Streamer<Integer>, DataTypeFactory, FixedWithType {
public class IntegerType extends DataType<Integer> implements Streamer<Integer>, DataTypeFactory, FixedWidthType {

public static final IntegerType INSTANCE = new IntegerType();
public static final int ID = 9;
Expand Down
3 changes: 2 additions & 1 deletion core/src/main/java/io/crate/types/LongType.java
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,8 @@

import java.io.IOException;

public class LongType extends DataType<Long> implements FixedWithType, Streamer<Long>, DataTypeFactory {
public class
LongType extends DataType<Long> implements FixedWidthType, Streamer<Long>, DataTypeFactory {

public static final LongType INSTANCE = new LongType();
public static final int ID = 10;
Expand Down
2 changes: 1 addition & 1 deletion core/src/main/java/io/crate/types/ShortType.java
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@

import java.io.IOException;

public class ShortType extends DataType<Short> implements DataTypeFactory, Streamer<Short>, FixedWithType {
public class ShortType extends DataType<Short> implements DataTypeFactory, Streamer<Short>, FixedWidthType {

public static final ShortType INSTANCE = new ShortType();
public static final int ID = 8;
Expand Down
98 changes: 95 additions & 3 deletions docs/sql/aggregation.txt
Original file line number Diff line number Diff line change
Expand Up @@ -221,10 +221,10 @@ as a double value. Its single argument is the column name of a numeric column or
cr> select sum(name), kind from locations group by kind order by sum(name) desc;
SQLActionException[unknown function: sum(string)]

avg
===
avg / mean
==========

The ``avg`` aggregation function returns the arithmetic mean, the *average*,
The ``avg`` or ``mean`` aggregation function returns the arithmetic mean, the *average*,
of all values in a column that are not ``NULL`` as a double value. It accepts all numeric columns
and timestamp columns as single argument. Using ``avg`` on other column types is not allowed.

Expand All @@ -242,6 +242,94 @@ Example::
SELECT 3 rows in set (... sec)


geometric_mean
==============

The ``geometric_mean`` aggregation function computes the geometric mean,
a mean for positive numbers. For details see: `Geometric Mean`_.

``geometric mean`` is defined on all numeric types and on timestamp. It always
returns double values. If a value is negative, all values were null or we got no
value at all ``NULL`` is returned. If any of the aggregated values is ``0`` the result will be ``0.0``
as well.

.. note::

Due to java double precision arithmetic it is possible that any two executions
of the aggregation function on the same data produce slightly differing results.

Example::

cr> select geometric_mean(position), kind from locations
... group by kind order by kind;
+--------------------------+-------------+
| geometric_mean(position) | kind |
+--------------------------+-------------+
| 2.6321480259 | Galaxy |
| 2.6051710847 | Planet |
| 2.2133638394 | Star System |
+--------------------------+-------------+
SELECT 3 rows in set (... sec)


variance
========

The ``variance`` aggregation function computes the `Variance`_ of the set of non-null
values in a column. It is a measure about how far a set of numbers is spread.
A variance of ``0.0`` indicates that all values are the same.

``variance`` is defined on all numeric types and on timestamp. It returns a
double value. If all values were null or we got no value at all ``NULL`` is
returned.

Example::

cr> select variance(position), kind from locations
... group by kind order by kind desc;
+--------------------+-------------+
| variance(position) | kind |
+--------------------+-------------+
| 1.25 | Star System |
| 2.0 | Planet |
| 3.6875 | Galaxy |
+--------------------+-------------+
SELECT 3 rows in set (... sec)

.. note::

Due to java double precision arithmetic it is possible that any two executions
of the aggregation function on the same data produce slightly differing results.

stddev
======

The ``stddev`` aggregation function computes the `Standard Deviation`_ of the set
of non-null values in a column. It is a measure of the variation of data values.
A low standard deviation indicates that the values tend to be near the mean.

``stddev`` is defined on all numeric types and on timestamp. It always
returns double values. If all values were null or we got no
value at all ``NULL`` is returned.

Example::

cr> select stddev(position), kind from locations
... group by kind order by kind;
+------------------+-------------+
| stddev(position) | kind |
+------------------+-------------+
| 1.92028643697 | Galaxy |
| 1.41421356237 | Planet |
| 1.11803398875 | Star System |
+------------------+-------------+
SELECT 3 rows in set (... sec)

.. note::

Due to java double precision arithmetic it is possible that any two executions
of the aggregation function on the same data produce slightly differing results.

arbitrary
=========

Expand Down Expand Up @@ -284,3 +372,7 @@ user. This works as rows with same ``user_id`` have the same
strings. The advantage is that the ``arbitrary`` function does very little
to no computation as for example ``max`` aggregation function would
do.

.. _Geometric Mean: https://en.wikipedia.org/wiki/Mean#Geometric_mean_.28GM.29
.. _Variance: https://en.wikipedia.org/wiki/Variance
.. _Standard Deviation: https://en.wikipedia.org/wiki/Standard_deviation
2 changes: 0 additions & 2 deletions docs/sql/system.txt
Original file line number Diff line number Diff line change
Expand Up @@ -653,8 +653,6 @@ Note, that querying a time setting will always return a ``string`` value::
+---------------------------------------------------...-------------------+
SELECT 1 row in set (... sec)

::

The default configuration in ``crate.yml`` looks like:

.. code-block:: yaml
Expand Down
2 changes: 1 addition & 1 deletion sql/build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ dependencies {
compile project(':blob')
compile project(':sql-parser')
compile 'com.amazonaws:aws-java-sdk:1.8.7'

compile 'org.apache.commons:commons-math3:3.4.1'
testCompile project(':testing')
testCompile 'org.skyscreamer:jsonassert:1.2.0'
testCompile 'com.h2database:h2:1.3.173'
Expand Down
4 changes: 2 additions & 2 deletions sql/src/main/java/io/crate/breaker/SizeEstimatorFactory.java
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,8 @@ public static <T> SizeEstimator<T> create(DataType type) {
case IpType.ID:
return (SizeEstimator<T>)new BytesRefSizeEstimator();
default:
if (type instanceof FixedWithType) {
return (SizeEstimator<T>) new ConstSizeEstimator(((FixedWithType) type).fixedSize());
if (type instanceof FixedWidthType) {
return (SizeEstimator<T>) new ConstSizeEstimator(((FixedWidthType) type).fixedSize());
}
throw new UnsupportedOperationException(String.format("Cannot get SizeEstimator for type %s", type));
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -53,5 +53,9 @@ protected void configure() {
SumAggregation.register(this);
CountAggregation.register(this);
CollectSetAggregation.register(this);

VarianceAggregation.register(this);
GeometricMeanAggregation.register(this);
StandardDeviationAggregation.register(this);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -31,34 +31,68 @@
import io.crate.types.DataType;
import io.crate.types.DataTypeFactory;
import io.crate.types.DataTypes;
import io.crate.types.FixedWithType;
import io.crate.types.FixedWidthType;
import org.elasticsearch.common.io.stream.StreamInput;
import org.elasticsearch.common.io.stream.StreamOutput;

import java.io.IOException;

public class AverageAggregation extends AggregationFunction<AverageAggregation.AverageState, Double> {

public static final String NAME = "avg";
public static final String[] NAMES = new String[] {"avg", "mean"};
public static final String NAME = NAMES[0];
private final FunctionInfo info;

/**
* register as "avg" and "mean"
*/
public static void register(AggregationImplModule mod) {
for (DataType t :DataTypes.NUMERIC_PRIMITIVE_TYPES) {
for (String name :NAMES) {
for (DataType<?> t : DataTypes.NUMERIC_PRIMITIVE_TYPES) {
mod.register(new AverageAggregation(new FunctionInfo(
new FunctionIdent(name, ImmutableList.<DataType>of(t)), DataTypes.DOUBLE,
FunctionInfo.Type.AGGREGATE)));
}
mod.register(new AverageAggregation(new FunctionInfo(
new FunctionIdent(NAME, ImmutableList.of(t)), DataTypes.DOUBLE,
FunctionInfo.Type.AGGREGATE)));
new FunctionIdent(name, ImmutableList.<DataType>of(DataTypes.TIMESTAMP)), DataTypes.DOUBLE,
FunctionInfo.Type.AGGREGATE)));
}
mod.register(new AverageAggregation(new FunctionInfo(
new FunctionIdent(NAME, ImmutableList.<DataType>of(DataTypes.TIMESTAMP)), DataTypes.DOUBLE,
FunctionInfo.Type.AGGREGATE)));
}

AverageAggregation(FunctionInfo info) {
this.info = info;
public static class AverageState implements Comparable<AverageState> {

private double sum = 0;
private long count = 0;

public Double value() {
if (count > 0) {
return sum / count;
} else {
return null;
}
}

@Override
public int compareTo(AverageState o) {
if (o == null) {
return 1;
} else {
int compare = Double.compare(sum, o.sum);
if (compare == 0) {
return Long.compare(count, o.count);
}
return compare;
}
}

@Override
public String toString() {
return "sum: " + sum + " count: " + count;
}
}

public static class AverageStateType extends DataType<AverageState>
implements FixedWithType, Streamer<AverageState>, DataTypeFactory {
implements FixedWidthType, Streamer<AverageState>, DataTypeFactory {

public static final int ID = 1024;
private static final AverageStateType INSTANCE = new AverageStateType();
Expand Down Expand Up @@ -119,39 +153,10 @@ public int fixedSize() {
}
}

public static class AverageState implements Comparable<AverageState> {

private double sum = 0;
private long count = 0;

public Double value() {
if (count > 0) {
return sum / count;
} else {
return null;
}
}

@Override
public int compareTo(AverageState o) {
if (o == null) {
return 1;
} else {
int compare = Double.compare(sum, o.sum);
if (compare == 0) {
return Long.compare(count, o.count);
}
return compare;
}
}

@Override
public String toString() {
return "sum: " + sum + " count: " + count;
}
AverageAggregation(FunctionInfo info) {
this.info = info;
}


@Override
public AverageState iterate(RamAccountingContext ramAccountingContext, AverageState state, Input... args) {
if (state != null) {
Expand Down

0 comments on commit 11e393d

Please sign in to comment.