Skip to content

Commit

Permalink
Initial implementation of the Aggregator (#1156)
Browse files Browse the repository at this point in the history
Initial implementation of `datatable.extras.aggregate` for 1D, 2D and ND table aggregations.

Includes:
- 1D continuous binning;
- 2D continuous binning;
- 1D categorical aggregation;
- 2D categorical aggregation;
- 2D mixed (continuous/categorical) aggregation;
- ND aggregation that also includes a projection method when ncols > max_dimensions;
- implementations of the `first()` and `count()` `datatable` reducers;
- other minor changes to `datatable`.
  • Loading branch information
oleksiyskononenko authored and st-pasha committed Jul 19, 2018
1 parent 7fd6054 commit ab59524
Show file tree
Hide file tree
Showing 15 changed files with 788 additions and 8 deletions.
10 changes: 9 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -420,6 +420,7 @@ fast_objects = $(addprefix $(BUILDDIR)/, \
expr/py_expr.o \
expr/reduceop.o \
expr/unaryop.o \
extras/aggregator.o \
groupby.o \
jay/open_jay.o \
jay/save_jay.o \
Expand Down Expand Up @@ -659,6 +660,9 @@ $(BUILDDIR)/expr/py_expr.h: c/expr/py_expr.h $(BUILDDIR)/column.h $(BUILDDIR)/gr
@echo • Refreshing c/expr/py_expr.h
@cp c/expr/py_expr.h $@

$(BUILDDIR)/extras/aggregator.h: c/extras/aggregator.h $(BUILDDIR)/datatable.h $(BUILDDIR)/py_datatable.h $(BUILDDIR)/rowindex.h $(BUILDDIR)/types.h
@echo • Refreshing c/extras/aggregator.h
@cp c/extras/aggregator.h $@

$(BUILDDIR)/python/float.h: c/python/float.h $(BUILDDIR)/utils/pyobj.h
@echo • Refreshing c/python/float.h
Expand Down Expand Up @@ -830,7 +834,7 @@ $(BUILDDIR)/datatable_rbind.o : c/datatable_rbind.cc $(BUILDDIR)/column.h $(BUIL
@echo • Compiling $<
@$(CC) -c $< $(CCFLAGS) -o $@

$(BUILDDIR)/datatablemodule.o : c/datatablemodule.cc $(BUILDDIR)/capi.h $(BUILDDIR)/csv/py_csv.h $(BUILDDIR)/csv/writer.h $(BUILDDIR)/expr/py_expr.h $(BUILDDIR)/options.h $(BUILDDIR)/py_column.h $(BUILDDIR)/py_columnset.h $(BUILDDIR)/py_datatable.h $(BUILDDIR)/py_datawindow.h $(BUILDDIR)/py_encodings.h $(BUILDDIR)/py_groupby.h $(BUILDDIR)/py_rowindex.h $(BUILDDIR)/py_types.h $(BUILDDIR)/py_utils.h $(BUILDDIR)/utils/assert.h
$(BUILDDIR)/datatablemodule.o : c/datatablemodule.cc $(BUILDDIR)/capi.h $(BUILDDIR)/csv/py_csv.h $(BUILDDIR)/csv/writer.h $(BUILDDIR)/expr/py_expr.h $(BUILDDIR)/extras/aggregator.h $(BUILDDIR)/options.h $(BUILDDIR)/py_column.h $(BUILDDIR)/py_columnset.h $(BUILDDIR)/py_datatable.h $(BUILDDIR)/py_datawindow.h $(BUILDDIR)/py_encodings.h $(BUILDDIR)/py_groupby.h $(BUILDDIR)/py_rowindex.h $(BUILDDIR)/py_types.h $(BUILDDIR)/py_utils.h $(BUILDDIR)/utils/assert.h
@echo • Compiling $<
@$(CC) -c $< $(CCFLAGS) -o $@

Expand All @@ -853,6 +857,10 @@ $(BUILDDIR)/expr/reduceop.o : c/expr/reduceop.cc $(BUILDDIR)/expr/py_expr.h $(BU
$(BUILDDIR)/expr/unaryop.o : c/expr/unaryop.cc $(BUILDDIR)/expr/py_expr.h $(BUILDDIR)/types.h
@echo • Compiling $<
@$(CC) -c $< $(CCFLAGS) -o $@

$(BUILDDIR)/extras/aggregator.o : c/extras/aggregator.cc $(BUILDDIR)/extras/aggregator.h $(BUILDDIR)/py_utils.h $(BUILDDIR)/rowindex.h $(BUILDDIR)/types.h $(BUILDDIR)/utils/omp.h $(BUILDDIR)/utils/pyobj.h
@echo • Compiling $<
@$(CC) -c $< $(CCFLAGS) -o $@

$(BUILDDIR)/groupby.o : c/groupby.cc $(BUILDDIR)/utils/exceptions.h $(BUILDDIR)/groupby.h
@echo • Compiling $<
Expand Down
2 changes: 2 additions & 0 deletions c/datatablemodule.cc
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
#include "py_types.h"
#include "py_utils.h"
#include "utils/assert.h"
#include "extras/aggregator.h"

extern void init_jay();

Expand Down Expand Up @@ -191,6 +192,7 @@ static PyMethodDef DatatableModuleMethods[] = {
METHODv(expr_unaryop),
METHOD0(is_debug_mode),
METHOD0(has_omp_support),
METHODv(aggregate),

{nullptr, nullptr, 0, nullptr} /* Sentinel */
};
Expand Down
46 changes: 46 additions & 0 deletions c/expr/reduceop.cc
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ enum OpCode {
Stdev = 4,
First = 5,
Sum = 6,
Count = 7,
};

template<typename T>
Expand Down Expand Up @@ -77,6 +78,29 @@ static void sum_skipna(const int32_t* groups, int32_t grp, void** params) {



//------------------------------------------------------------------------------
// Count calculation
//------------------------------------------------------------------------------

template<typename IT, typename OT>
static void count_skipna(const int32_t* groups, int32_t grp, void** params) {
Column* col0 = static_cast<Column*>(params[0]);
Column* col1 = static_cast<Column*>(params[1]);
const IT* inputs = static_cast<const IT*>(col0->data());
OT* outputs = static_cast<OT*>(col1->data_w());
OT count = 0;
int32_t row0 = groups[grp];
int32_t row1 = groups[grp + 1];
col0->rowindex().strided_loop(row0, row1, 1,
[&](int64_t i) {
IT x = inputs[i];
count += !ISNA<IT>(x);
});
outputs[grp] = count;
}



//------------------------------------------------------------------------------
// Mean calculation
//------------------------------------------------------------------------------
Expand Down Expand Up @@ -200,6 +224,7 @@ static gmapperfn resolve1(int opcode) {
case OpCode::Max: return max_skipna<T1>;
case OpCode::Stdev: return stdev_skipna<T1, T2>;
case OpCode::Sum: return sum_skipna<T1, T2>;
case OpCode::Count: return count_skipna<T1, T2>;
default: return nullptr;
}
}
Expand All @@ -218,6 +243,22 @@ static gmapperfn resolve0(int opcode, SType stype) {
default: return nullptr;
}
}

if (opcode == OpCode::Count) {
switch (stype) {
case SType::BOOL:
case SType::INT8: return count_skipna<int8_t, uint64_t>;
case SType::INT16: return count_skipna<int16_t, uint64_t>;
case SType::INT32: return count_skipna<int32_t, uint64_t>;
case SType::INT64: return count_skipna<int64_t, uint64_t>;
case SType::FLOAT32: return count_skipna<float, uint64_t>;
case SType::FLOAT64: return count_skipna<double, uint64_t>;
case SType::STR32: return count_skipna<int32_t, uint64_t>;
case SType::STR64: return count_skipna<int64_t, uint64_t>;
default: return nullptr;
}
}

switch (stype) {
case SType::BOOL:
case SType::INT8: return resolve1<int8_t, double>(opcode);
Expand Down Expand Up @@ -251,6 +292,11 @@ Column* reduceop(int opcode, Column* arg, const Groupby& groupby)
res_type = SType::INT64;
}
}

if (opcode == OpCode::Count) {
res_type = SType::INT64;
}

int32_t ngrps = static_cast<int32_t>(groupby.ngroups());
if (ngrps == 0) ngrps = 1;

Expand Down

0 comments on commit ab59524

Please sign in to comment.