Skip to content

Commit

Permalink
GroupBy.rowNumber() #16
Browse files Browse the repository at this point in the history
* preliminary refactoring - extracting index sort algorithms in a standalone IndexSorter
  • Loading branch information
andrus committed Mar 31, 2019
1 parent d395290 commit 7c668e9
Show file tree
Hide file tree
Showing 5 changed files with 146 additions and 55 deletions.
48 changes: 21 additions & 27 deletions dflib/src/main/java/com/nhl/dflib/ColumnDataFrame.java
Original file line number Diff line number Diff line change
Expand Up @@ -19,14 +19,14 @@
import com.nhl.dflib.map.ValueMapper;
import com.nhl.dflib.row.CrossColumnRowProxy;
import com.nhl.dflib.row.RowProxy;
import com.nhl.dflib.series.ArraySeries;
import com.nhl.dflib.series.ColumnMappedSeries;
import com.nhl.dflib.series.HeadSeries;
import com.nhl.dflib.series.IndexedSeries;
import com.nhl.dflib.series.RowMappedSeries;
import com.nhl.dflib.sort.SortIndexer;
import com.nhl.dflib.sort.IndexSorter;
import com.nhl.dflib.sort.Sorters;

import java.util.Comparator;
import java.util.Iterator;
import java.util.Map;
import java.util.Objects;
Expand All @@ -41,6 +41,15 @@ public ColumnDataFrame(Index columnsIndex, Series[] dataColumns) {
this.dataColumns = Objects.requireNonNull(dataColumns);
}

/**
 * Builds an array of consecutive row numbers starting at zero.
 *
 * @param h the number of entries to generate
 * @return a new array of length {@code h} where the element at position {@code i} equals {@code i}
 */
protected static Integer[] rowNumberSequence(int h) {
    Integer[] sequence = new Integer[h];

    int position = 0;
    while (position < h) {
        sequence[position] = position;
        position++;
    }

    return sequence;
}

@Override
public int height() {
return dataColumns.length > 0 ? dataColumns[0].size() : 0;
Expand All @@ -61,6 +70,11 @@ public <T> Series<T> getColumn(String name) {
return dataColumns[columnsIndex.position(name)];
}

/**
 * Appends a column holding the zero-based position of each row.
 *
 * @param columnName the name of the row number column
 * @return the result of adding the row number column to this DataFrame
 */
@Override
public DataFrame addRowNumber(String columnName) {
    // one entry per row of this DataFrame: 0, 1, ..., height() - 1
    Integer[] rowNumbers = rowNumberSequence(height());
    return addColumn(columnName, new ArraySeries<>(rowNumbers));
}

@Override
public DataFrame head(int len) {

Expand Down Expand Up @@ -139,47 +153,27 @@ private DataFrame filterWithIndex(Series<Integer> filteredIndex) {

@Override
public <V extends Comparable<? super V>> DataFrame sort(RowToValueMapper<V> sortKeyExtractor) {
return sort(Sorters.sorter(sortKeyExtractor));
return new IndexSorter(this).sort(Sorters.sorter(sortKeyExtractor));
}

@Override
public DataFrame sort(String[] columns, boolean[] ascending) {
return sort(Sorters.sorter(columnsIndex, columns, ascending));
return new IndexSorter(this).sort(Sorters.sorter(columnsIndex, columns, ascending));
}

@Override
public DataFrame sort(int[] columns, boolean[] ascending) {
return sort(Sorters.sorter(columnsIndex, columns, ascending));
return new IndexSorter(this).sort(Sorters.sorter(columnsIndex, columns, ascending));
}

@Override
public DataFrame sort(int column, boolean ascending) {
return sort(Sorters.sorter(columnsIndex, column, ascending));
return new IndexSorter(this).sort(Sorters.sorter(columnsIndex, column, ascending));
}

@Override
public DataFrame sort(String column, boolean ascending) {
return sort(Sorters.sorter(columnsIndex, column, ascending));
}

private DataFrame sort(Comparator<RowProxy> comparator) {
Comparator<Integer> rowComparator = toIntComparator(comparator);
Series<Integer> sortedIndex = SortIndexer.sortedIndex(this, rowComparator);

int width = width();
Series<?>[] newColumnsData = new Series[width];
for (int i = 0; i < width; i++) {
newColumnsData[i] = new IndexedSeries<>(dataColumns[i], sortedIndex);
}

return new ColumnDataFrame(columnsIndex, newColumnsData);
}

private Comparator<Integer> toIntComparator(Comparator<RowProxy> rowComparator) {
int h = height();
CrossColumnRowProxy p1 = new CrossColumnRowProxy(columnsIndex, dataColumns, h);
CrossColumnRowProxy p2 = new CrossColumnRowProxy(columnsIndex, dataColumns, h);
return (i1, i2) -> rowComparator.compare(p1.rewind(i1), p2.rewind(i2));
return new IndexSorter(this).sort(Sorters.sorter(columnsIndex, column, ascending));
}

@Override
Expand Down
5 changes: 1 addition & 4 deletions dflib/src/main/java/com/nhl/dflib/DataFrame.java
Original file line number Diff line number Diff line change
Expand Up @@ -140,10 +140,7 @@ default <V, VR> DataFrame convertColumn(String columnName, ValueMapper<V, VR> co
* @param columnName the name of the row number column
* @return a new DataFrame with an extra row number column
*/
default DataFrame addRowNumber(String columnName) {
int[] counter = new int[1];
return addColumn(columnName, r -> counter[0]++);
}
DataFrame addRowNumber(String columnName);

default <V> DataFrame addColumn(String columnName, RowToValueMapper<V> columnValueProducer) {
return addColumns(new String[]{columnName}, columnValueProducer);
Expand Down
57 changes: 57 additions & 0 deletions dflib/src/main/java/com/nhl/dflib/row/DataFrameRowProxy.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
package com.nhl.dflib.row;

import com.nhl.dflib.DataFrame;
import com.nhl.dflib.Index;

/**
 * A reusable {@link RowProxy} that reads the values of a single row straight from the columns of a
 * {@link DataFrame}. The proxy is a movable cursor: callers position it with {@link #rewind()} or
 * {@link #rewind(int)} and then read values of the current row.
 */
public class DataFrameRowProxy implements RowProxy {

    private final DataFrame dataFrame;
    private final int height;
    private int rowIndex;

    /**
     * Creates a proxy positioned before the first row of the given DataFrame.
     *
     * @param dataFrame the DataFrame whose rows will be exposed through this proxy
     */
    public DataFrameRowProxy(DataFrame dataFrame) {
        this.dataFrame = dataFrame;
        // capture the row count once; it is the upper bound checked by hasNext()
        this.height = dataFrame.height();
        // -1 means "no current row yet"
        this.rowIndex = -1;
    }

    /**
     * @return the columns index of the underlying DataFrame
     */
    @Override
    public Index getIndex() {
        return dataFrame.getColumnsIndex();
    }

    /**
     * @param columnPos position of the column to read
     * @return the value of the current row in the column at the given position
     */
    @Override
    public Object get(int columnPos) {
        return dataFrame.getColumn(columnPos).get(rowIndex);
    }

    /**
     * @param columnName name of the column to read
     * @return the value of the current row in the named column
     */
    @Override
    public Object get(String columnName) {
        return dataFrame.getColumn(columnName).get(rowIndex);
    }

    /**
     * Copies the values of the current row into the builder, writing column {@code i} of the DataFrame to
     * builder position {@code i + toOffset}. Does nothing when the proxy has no current row.
     * <p>
     * NOTE(review): {@code fromOffset} and {@code len} are not consulted here; every column of the DataFrame is
     * copied. Confirm against the RowProxy contract whether a partial range is expected.
     */
    @Override
    public void copyRange(RowBuilder to, int fromOffset, int toOffset, int len) {
        // row can be missing in joins...
        if (rowIndex >= 0) {
            int w = dataFrame.width();
            for (int i = 0; i < w; i++) {
                to.set(i + toOffset, dataFrame.getColumn(i).get(rowIndex));
            }
        }
    }

    /**
     * @return true if the row following the current one exists in the DataFrame
     */
    public boolean hasNext() {
        return rowIndex + 1 < height;
    }

    /**
     * Moves the proxy to the next row.
     *
     * @return this proxy
     */
    public DataFrameRowProxy rewind() {
        this.rowIndex++;
        return this;
    }

    /**
     * Moves the proxy to the row at the given position.
     *
     * @param index position of the row to become current
     * @return this proxy
     */
    public DataFrameRowProxy rewind(int index) {
        this.rowIndex = index;
        return this;
    }
}
67 changes: 67 additions & 0 deletions dflib/src/main/java/com/nhl/dflib/sort/IndexSorter.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
package com.nhl.dflib.sort;

import com.nhl.dflib.ColumnDataFrame;
import com.nhl.dflib.DataFrame;
import com.nhl.dflib.Series;
import com.nhl.dflib.row.DataFrameRowProxy;
import com.nhl.dflib.row.RowProxy;
import com.nhl.dflib.series.ArraySeries;
import com.nhl.dflib.series.IndexedSeries;

import java.util.Arrays;
import java.util.Comparator;
import java.util.List;
import java.util.function.Supplier;

/**
 * Sorts the rows of a {@link DataFrame} indirectly: an array of row positions is sorted with a row comparator,
 * and the result is a DataFrame whose columns are the source columns viewed through the sorted positions.
 */
public class IndexSorter {

    private final DataFrame dataFrame;
    private final Supplier<Integer[]> indexBuilder;

    /**
     * Creates a sorter that sorts all rows of the DataFrame.
     *
     * @param dataFrame the DataFrame to sort
     */
    public IndexSorter(DataFrame dataFrame) {
        this.dataFrame = dataFrame;
        this.indexBuilder = () -> rowNumberSequence(dataFrame.height());
    }

    /**
     * Creates a sorter that sorts only the rows at the given positions.
     *
     * @param dataFrame   the DataFrame to sort
     * @param rangeToSort positions of the rows to include in the sorted result
     */
    public IndexSorter(DataFrame dataFrame, List<Integer> rangeToSort) {
        this.dataFrame = dataFrame;
        // the list is copied into a fresh array on each call, so sorting never touches the source list
        this.indexBuilder = () -> rangeToSort.toArray(new Integer[rangeToSort.size()]);
    }

    /**
     * @param h the number of entries to generate
     * @return a new array of length {@code h} where the element at position {@code i} equals {@code i}
     */
    protected static Integer[] rowNumberSequence(int h) {
        Integer[] sequence = new Integer[h];

        int position = 0;
        while (position < h) {
            sequence[position] = position;
            position++;
        }

        return sequence;
    }

    /**
     * Produces a new DataFrame with rows ordered by the given comparator.
     *
     * @param comparator compares two rows of the DataFrame
     * @return a ColumnDataFrame with the same columns index and rows in sorted order
     */
    public DataFrame sort(Comparator<RowProxy> comparator) {

        // the array is obtained from the supplier right here and sorted in place, so the mutable
        // array is never shared with anything outside this method
        Integer[] positions = indexBuilder.get();
        Arrays.sort(positions, positionComparator(comparator));

        return reindex(new ArraySeries<>(positions));
    }

    // Builds a DataFrame whose every column is the matching source column read through 'sortedIndex'.
    private DataFrame reindex(Series<Integer> sortedIndex) {
        int width = dataFrame.width();
        Series<?>[] columns = new Series[width];

        int i = 0;
        while (i < width) {
            columns[i] = new IndexedSeries<>(dataFrame.getColumn(i), sortedIndex);
            i++;
        }

        return new ColumnDataFrame(dataFrame.getColumnsIndex(), columns);
    }

    // Adapts a row comparator to compare row positions, reusing two proxies that are repositioned on each call.
    private Comparator<Integer> positionComparator(Comparator<RowProxy> rowComparator) {
        DataFrameRowProxy left = new DataFrameRowProxy(dataFrame);
        DataFrameRowProxy right = new DataFrameRowProxy(dataFrame);
        return (leftPos, rightPos) -> rowComparator.compare(left.rewind(leftPos), right.rewind(rightPos));
    }
}
24 changes: 0 additions & 24 deletions dflib/src/main/java/com/nhl/dflib/sort/SortIndexer.java

This file was deleted.

0 comments on commit 7c668e9

Please sign in to comment.