Initial implementation of the Aggregator (#1156)

Initial implementation of `datatable.extras.aggregate` for 1D, 2D and ND table aggregations. Includes: - 1D continuous binning; - 2D continuous binning; - 1D categorical aggregation; - 2D categorical aggregation; - 2D mixed (continuous/categorical) aggregation; - ND aggregation that also includes a projection method when ncols > max_dimensions; - implementations of the `first()` and `count()` `datatable` reducers; - other minor changes to `datatable`.
h2oai · Jul 19, 2018 · ab59524 · ab59524
1 parent 7fd6054
commit ab59524
Show file tree

Hide file tree

Showing 15 changed files with 788 additions and 8 deletions.
diff --git a/Makefile b/Makefile
@@ -420,6 +420,7 @@ fast_objects = $(addprefix $(BUILDDIR)/, \
 	expr/py_expr.o            \
 	expr/reduceop.o           \
 	expr/unaryop.o            \
+	extras/aggregator.o       \
 	groupby.o                 \
 	jay/open_jay.o            \
 	jay/save_jay.o            \
@@ -659,6 +660,9 @@ $(BUILDDIR)/expr/py_expr.h: c/expr/py_expr.h $(BUILDDIR)/column.h $(BUILDDIR)/gr
 	@echo • Refreshing c/expr/py_expr.h
 	@cp c/expr/py_expr.h $@
 
+$(BUILDDIR)/extras/aggregator.h: c/extras/aggregator.h $(BUILDDIR)/datatable.h $(BUILDDIR)/py_datatable.h $(BUILDDIR)/rowindex.h $(BUILDDIR)/types.h
+	@echo • Refreshing c/extras/aggregator.h
+	@cp c/extras/aggregator.h $@
 
 $(BUILDDIR)/python/float.h: c/python/float.h $(BUILDDIR)/utils/pyobj.h
 	@echo • Refreshing c/python/float.h
@@ -830,7 +834,7 @@ $(BUILDDIR)/datatable_rbind.o : c/datatable_rbind.cc $(BUILDDIR)/column.h $(BUIL
 	@echo • Compiling $<
 	@$(CC) -c $< $(CCFLAGS) -o $@
 
-$(BUILDDIR)/datatablemodule.o : c/datatablemodule.cc $(BUILDDIR)/capi.h $(BUILDDIR)/csv/py_csv.h $(BUILDDIR)/csv/writer.h $(BUILDDIR)/expr/py_expr.h $(BUILDDIR)/options.h $(BUILDDIR)/py_column.h $(BUILDDIR)/py_columnset.h $(BUILDDIR)/py_datatable.h $(BUILDDIR)/py_datawindow.h $(BUILDDIR)/py_encodings.h $(BUILDDIR)/py_groupby.h $(BUILDDIR)/py_rowindex.h $(BUILDDIR)/py_types.h $(BUILDDIR)/py_utils.h $(BUILDDIR)/utils/assert.h
+$(BUILDDIR)/datatablemodule.o : c/datatablemodule.cc $(BUILDDIR)/capi.h $(BUILDDIR)/csv/py_csv.h $(BUILDDIR)/csv/writer.h $(BUILDDIR)/expr/py_expr.h $(BUILDDIR)/extras/aggregator.h $(BUILDDIR)/options.h $(BUILDDIR)/py_column.h $(BUILDDIR)/py_columnset.h $(BUILDDIR)/py_datatable.h $(BUILDDIR)/py_datawindow.h $(BUILDDIR)/py_encodings.h $(BUILDDIR)/py_groupby.h $(BUILDDIR)/py_rowindex.h $(BUILDDIR)/py_types.h $(BUILDDIR)/py_utils.h $(BUILDDIR)/utils/assert.h
 	@echo • Compiling $<
 	@$(CC) -c $< $(CCFLAGS) -o $@
 
@@ -853,6 +857,10 @@ $(BUILDDIR)/expr/reduceop.o : c/expr/reduceop.cc $(BUILDDIR)/expr/py_expr.h $(BU
 $(BUILDDIR)/expr/unaryop.o : c/expr/unaryop.cc $(BUILDDIR)/expr/py_expr.h $(BUILDDIR)/types.h
 	@echo • Compiling $<
 	@$(CC) -c $< $(CCFLAGS) -o $@
+
+$(BUILDDIR)/extras/aggregator.o : c/extras/aggregator.cc $(BUILDDIR)/extras/aggregator.h $(BUILDDIR)/py_utils.h $(BUILDDIR)/rowindex.h $(BUILDDIR)/types.h $(BUILDDIR)/utils/omp.h $(BUILDDIR)/utils/pyobj.h
+	@echo • Compiling $<
+	@$(CC) -c $< $(CCFLAGS) -o $@
 
 $(BUILDDIR)/groupby.o : c/groupby.cc $(BUILDDIR)/utils/exceptions.h $(BUILDDIR)/groupby.h
 	@echo • Compiling $<

diff --git a/c/datatablemodule.cc b/c/datatablemodule.cc
@@ -22,6 +22,7 @@
 #include "py_types.h"
 #include "py_utils.h"
 #include "utils/assert.h"
+#include "extras/aggregator.h"
 
 extern void init_jay();
 
@@ -191,6 +192,7 @@ static PyMethodDef DatatableModuleMethods[] = {
     METHODv(expr_unaryop),
     METHOD0(is_debug_mode),
     METHOD0(has_omp_support),
+    METHODv(aggregate),
 
     {nullptr, nullptr, 0, nullptr}  /* Sentinel */
 };

diff --git a/c/expr/reduceop.cc b/c/expr/reduceop.cc
@@ -21,6 +21,7 @@ enum OpCode {
   Stdev = 4,
   First = 5,
   Sum   = 6,
+  Count = 7,
 };
 
 template<typename T>
@@ -77,6 +78,29 @@ static void sum_skipna(const int32_t* groups, int32_t grp, void** params) {
 
 
 
+//------------------------------------------------------------------------------
+// Count calculation
+//------------------------------------------------------------------------------
+
+template<typename IT, typename OT>
+static void count_skipna(const int32_t* groups, int32_t grp, void** params) {
+  Column* col0 = static_cast<Column*>(params[0]);
+  Column* col1 = static_cast<Column*>(params[1]);
+  const IT* inputs = static_cast<const IT*>(col0->data());
+  OT* outputs = static_cast<OT*>(col1->data_w());
+  OT count = 0;
+  int32_t row0 = groups[grp];
+  int32_t row1 = groups[grp + 1];
+  col0->rowindex().strided_loop(row0, row1, 1,
+    [&](int64_t i) {
+      IT x = inputs[i];
+      count += !ISNA<IT>(x);
+    });
+  outputs[grp] = count;
+}
+
+
+
 //------------------------------------------------------------------------------
 // Mean calculation
 //------------------------------------------------------------------------------
@@ -200,6 +224,7 @@ static gmapperfn resolve1(int opcode) {
     case OpCode::Max:   return max_skipna<T1>;
     case OpCode::Stdev: return stdev_skipna<T1, T2>;
     case OpCode::Sum:   return sum_skipna<T1, T2>;
+    case OpCode::Count: return count_skipna<T1, T2>;
     default:            return nullptr;
   }
 }
@@ -218,6 +243,22 @@ static gmapperfn resolve0(int opcode, SType stype) {
       default:             return nullptr;
     }
   }
+
+  if (opcode == OpCode::Count) {
+    switch (stype) {
+      case SType::BOOL:
+      case SType::INT8:    return count_skipna<int8_t, uint64_t>;
+      case SType::INT16:   return count_skipna<int16_t, uint64_t>;
+      case SType::INT32:   return count_skipna<int32_t, uint64_t>;
+      case SType::INT64:   return count_skipna<int64_t, uint64_t>;
+      case SType::FLOAT32: return count_skipna<float, uint64_t>;
+      case SType::FLOAT64: return count_skipna<double, uint64_t>;
+      case SType::STR32:   return count_skipna<int32_t, uint64_t>;
+      case SType::STR64:   return count_skipna<int64_t, uint64_t>;
+      default:             return nullptr;
+    }
+  }
+
   switch (stype) {
     case SType::BOOL:
     case SType::INT8:    return resolve1<int8_t, double>(opcode);
@@ -251,6 +292,11 @@ Column* reduceop(int opcode, Column* arg, const Groupby& groupby)
       res_type = SType::INT64;
     }
   }
+
+  if (opcode == OpCode::Count) {
+    res_type = SType::INT64;
+  }
+
   int32_t ngrps = static_cast<int32_t>(groupby.ngroups());
   if (ngrps == 0) ngrps = 1;