Commit e31b846

The MapRealMatrix instances don't initialize their own storage engines any more. Instead they use a single engine stored in MatrixDataframe.
datumbox committed Dec 25, 2016
1 parent 40fe13b commit e31b846
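
For readers skimming the diff below, the heart of the change is: instead of every MapRealMatrix creating and closing its own StorageEngine, a single engine now lives in MatrixDataframe, is initialized lazily in a thread-safe way, and hands each matrix a uniquely named big map keyed by an atomic counter. What follows is a minimal, self-contained sketch of that pattern, not the actual framework code; SimpleStorageEngine, SharedEngineHolder and SparseMatrixSketch are hypothetical stand-ins for StorageEngine, MatrixDataframe and MapRealMatrix.

import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.atomic.AtomicInteger;

// Hypothetical stand-in for the framework's StorageEngine: hands out named maps and can drop them.
class SimpleStorageEngine {
    private final Map<String, Map<Long, Double>> bigMaps = new ConcurrentHashMap<>();

    Map<Long, Double> getBigMap(String name) {
        return bigMaps.computeIfAbsent(name, k -> new ConcurrentHashMap<>());
    }

    void dropBigMap(String name) {
        bigMaps.remove(name);
    }
}

// Plays the role MatrixDataframe takes on in this commit: it owns the single shared engine
// and the counter that keeps the per-matrix map names unique.
class SharedEngineHolder {
    static volatile SimpleStorageEngine storageEngine; // volatile makes the double-checked init safe
    static final AtomicInteger storageId = new AtomicInteger();

    static void setStorageEngine() {
        if (storageEngine == null) {                      // fast path: engine already exists
            synchronized (SharedEngineHolder.class) {
                if (storageEngine == null) {              // re-check inside the lock
                    storageEngine = new SimpleStorageEngine();
                }
            }
        }
    }
}

// Plays the role of MapRealMatrix: it no longer creates an engine of its own,
// it only claims a uniquely named map from the shared engine.
class SparseMatrixSketch {
    private final int id;
    private final Map<Long, Double> entries;

    SparseMatrixSketch() {
        if (SharedEngineHolder.storageEngine == null) {
            throw new NullPointerException("The shared storage engine is not initialized.");
        }
        id = SharedEngineHolder.storageId.getAndIncrement();
        entries = SharedEngineHolder.storageEngine.getBigMap("tmp_mrmentries" + id);
    }

    // In the real class this clean-up happens in finalize(); dropping the map removes only
    // this matrix's entries while the shared engine stays open for the other matrices.
    void release() {
        SharedEngineHolder.storageEngine.dropBigMap("tmp_mrmentries" + id);
    }
}

The double-checked locking keeps the common path lock-free once the engine exists, which matters because matrix operations can construct many temporary MapRealMatrix instances.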
Showing 4 changed files with 57 additions and 23 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -40,6 +40,7 @@ Version 0.8.0-SNAPSHOT - Build 20161224
- Change the Validation mechanism to support Splitters. The AbstractValidator is now a single Validator class.
- K-fold cross validation is now performed by combining the KFoldSplitter and the Validator classes.
- Added support of a ShuffleSplitter.
- The MapRealMatrix instances don't initialize their own storage engines any more. Instead they use a single engine stored in MatrixDataframe.

Version 0.7.1-SNAPSHOT - Build 20161217
---------------------------------------
10 changes: 7 additions & 3 deletions TODO.txt
@@ -1,11 +1,15 @@
CODE IMPROVEMENTS
=================

- Consider dropping all the common.dataobjects and use their internalData directly instead.
- Support of better Transformers: Add the notion of Standardizers such as ZeroOne and Zscore, decouple OneHotEncoding from Standardization.
- New preprocessing package:
    - Remove the old datatransformation package but maintain the AbstractTransformer.
    - Create the following numerical Scalers: StandardScaler, MinMaxScaler, MaxAbsScaler, Binarization and PercentileScaler. All classes should indicate if Y is scaled, except for the binarizer.
    - Create the following categorical methods: OneHotEncoder (no reference levels) and CornerConstraintsEncoder (or set-to-zero, which uses reference levels).
    - Create a DataTransformer class that receives the TP for the numerical and categorical transformers and applies them in one step.
- Improve the API of Feature Selection and how we handle different data types.
- Write generic optimizers instead of having optimization methods in the algorithms. Add the optimizers and regularization packages under mathematics.
- Consider dropping all the common.dataobjects and use their internalData directly instead.
- Refactor the statistics package and replace all the static methods with proper inheritance.
- Write generic optimizers instead of having optimization methods in the algorithms. Add the optimizers and regularization packages under mathematics.

- Consider moving storages in a separate module that is inherited by common.
- Consider moving all tests in a separate module.
MapRealMatrix.java

@@ -35,6 +35,11 @@
*/
public class MapRealMatrix extends AbstractRealMatrix implements SparseRealMatrix {

/**
* The id of this Matrix.
*/
private final int id;

/**
* The number of rows of the matrix.
*/
@@ -50,11 +55,6 @@ public class MapRealMatrix extends AbstractRealMatrix implements SparseRealMatrix
*/
private final Map<Long, Double> entries;

/**
* The storage storage engine.
*/
private final StorageEngine storageEngine;

/**
* Protected constructor with the provided dimension arguments.
*
@@ -68,21 +68,24 @@ protected MapRealMatrix(int rowDimension, int columnDimension) throws NotStrictlyPositiveException {
this.rowDimension = rowDimension;
this.columnDimension = columnDimension;

String storageName = "mrm" + RandomGenerator.getThreadLocalRandomUnseeded().nextLong();
storageEngine = MatrixDataframe.configuration.getStorageConfiguration().createStorageEngine(storageName);
entries = storageEngine.getBigMap("tmp_entries", Long.class, Double.class, MapType.HASHMAP, StorageHint.IN_DISK, false, true);
if(MatrixDataframe.storageEngine == null) {
throw new NullPointerException("The MatrixDataframe storage engine is not initialized.");
}

id = MatrixDataframe.storageId.getAndIncrement();
entries = MatrixDataframe.storageEngine.getBigMap("tmp_mrmentries"+id, Long.class, Double.class, MapType.HASHMAP, StorageHint.IN_DISK, false, true);
}

/**
* When we perform matrix operations, we often lose the reference to the original matrix and we are unable to
* close its storage. Even though the JVM will close the storage before shutdown, by adding a close method in the finalize
* we ensure that if the object is gc, we will close the storage engine sooner.
* clear its storage. Even though the JVM will close the storage before shutdown, by adding clean-up logic in finalize
* we ensure that if the object is garbage collected, we will clear the unnecessary entries of the storage engine sooner.
* @throws java.lang.Throwable
*/
@Override
protected void finalize() throws Throwable {
try {
storageEngine.close();
MatrixDataframe.storageEngine.dropBigMap("tmp_mrmentries"+id, entries);
}
finally {
super.finalize();
MatrixDataframe.java

@@ -15,12 +15,14 @@
*/
package com.datumbox.framework.common.dataobjects;

import com.datumbox.framework.common.Configuration;
import com.datumbox.framework.common.storageengines.interfaces.StorageEngine;
import com.datumbox.framework.common.utilities.RandomGenerator;
import org.apache.commons.math3.linear.OpenMapRealVector;
import org.apache.commons.math3.linear.RealMatrix;
import org.apache.commons.math3.linear.RealVector;

import java.util.Map;
import java.util.concurrent.atomic.AtomicInteger;

/**
* The MatrixDataframe class is responsible for converting a Dataframe object to a
@@ -32,12 +34,17 @@ matrices and this class provides the tools to achieve the necessary conversions.
public class MatrixDataframe {

/**
* A reference to the most recently used Configuration object. It is necessary to define it static because
* some methods of the RealMatrix require generating new object without passing the configuration file.
* To have access on the configuration and build the data map, we require setting this static field with the latest Configuration
* object. It is package protected inorder to be accessible from the MapRealMatrix class.
* We create a single storage engine for all MatrixDataframe and MapRealMatrix objects. It is necessary to define it static
* and package protected to make it accessible to other classes such as MapRealMatrix. This is because
* some methods of RealMatrix require generating a new object without passing the configuration. To avoid hurting
* performance, the engine is created only once; the storageEngine field is initialized lazily in a thread-safe manner.
*/
static Configuration configuration;
static StorageEngine storageEngine;

/**
* Keeps a record of how many usages of the storageEngine were made, in order to avoid conflicting names.
*/
static final AtomicInteger storageId = new AtomicInteger();

private final RealMatrix X;
private final RealVector Y;
@@ -75,6 +82,23 @@ private MatrixDataframe(RealMatrix X, RealVector Y) {
this.Y = Y;
this.X = X;
}

/**
* Initializes the static storage engine if it's not already set.
*
* @param dataset the Dataframe whose configuration is used to create the storage engine
*/
private static void setStorageEngine(Dataframe dataset) {
//create a single storage engine for all the MapRealMatrixes
if (storageEngine == null) {
synchronized(MatrixDataframe.class) {
if (storageEngine == null) {
String storageName = "mdf" + RandomGenerator.getThreadLocalRandomUnseeded().nextLong();
storageEngine = dataset.configuration.getStorageConfiguration().createStorageEngine(storageName);
}
}
}
}

/**
* Method used to convert a training Dataframe to a MatrixDataframe and extract its contents
@@ -92,6 +116,8 @@ public static MatrixDataframe newInstance(Dataframe dataset, boolean addConstant
if(!featureIdsReference.isEmpty()) {
throw new IllegalArgumentException("The featureIdsReference map should be empty.");
}

setStorageEngine(dataset);


int n = dataset.size();
@@ -101,7 +127,6 @@ public static MatrixDataframe newInstance(Dataframe dataset, boolean addConstant
++d;
}

configuration = dataset.configuration;
MatrixDataframe m = new MatrixDataframe(new MapRealMatrix(n, d), new MapRealVector(n));

if(dataset.isEmpty()) {
@@ -167,11 +192,12 @@ public static MatrixDataframe parseDataset(Dataframe newData, Map<Integer, Integ
if(featureIdsReference.isEmpty()) {
throw new IllegalArgumentException("The featureIdsReference map should not be empty.");
}

setStorageEngine(newData);

int n = newData.size();
int d = featureIdsReference.size();

configuration = newData.configuration;
MatrixDataframe m = new MatrixDataframe(new MapRealMatrix(n, d), new MapRealVector(n));

if(newData.isEmpty()) {
@@ -229,7 +255,7 @@ public static RealVector parseRecord(Record r, Map<Object, Integer> featureIdsRe
int d = featureIdsReference.size();

//create a Map-backed vector only if the storage engine is available.
RealVector v = (configuration != null)?new MapRealVector(d):new OpenMapRealVector(d);
RealVector v = (storageEngine != null)?new MapRealVector(d):new OpenMapRealVector(d);

boolean addConstantColumn = featureIdsReference.containsKey(Dataframe.COLUMN_NAME_CONSTANT);

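A practical consequence visible in the diff above: a MapRealMatrix can now only be constructed after the shared engine has been set, which MatrixDataframe does at the top of newInstance() and parseDataset() via setStorageEngine(); otherwise the constructor throws a NullPointerException. In terms of the hypothetical sketch classes from the top of this page, the required ordering looks like this:

public class OrderingDemo {
    public static void main(String[] args) {
        // new SparseMatrixSketch();                     // would throw: the shared engine is not initialized yet
        SharedEngineHolder.setStorageEngine();           // in the framework this happens inside newInstance()/parseDataset()
        SparseMatrixSketch m = new SparseMatrixSketch(); // safe: the shared engine now exists
        m.release();                                     // drops only this matrix's map; the shared engine stays open
    }
}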
