Commit e31b846

The MapRealMatrix instances don't initialize their own storage engines any more. Instead they use a single engine stored in MatrixDataframe.
datumbox committed Dec 25, 2016
1 parent 40fe13b commit e31b846
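
For readers skimming the diff below, the heart of the change is: instead of every MapRealMatrix creating and closing its own StorageEngine, a single engine now lives in MatrixDataframe, is initialized lazily in a thread-safe way, and hands each matrix a uniquely named big map keyed by an atomic counter. What follows is a minimal, self-contained sketch of that pattern, not the actual framework code; SimpleStorageEngine, SharedEngineHolder and SparseMatrixSketch are hypothetical stand-ins for StorageEngine, MatrixDataframe and MapRealMatrix.

import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.atomic.AtomicInteger;

// Hypothetical stand-in for the framework's StorageEngine: hands out named maps and can drop them.
class SimpleStorageEngine {
    private final Map<String, Map<Long, Double>> bigMaps = new ConcurrentHashMap<>();

    Map<Long, Double> getBigMap(String name) {
        return bigMaps.computeIfAbsent(name, k -> new ConcurrentHashMap<>());
    }

    void dropBigMap(String name) {
        bigMaps.remove(name);
    }
}

// Plays the role MatrixDataframe takes on in this commit: it owns the single shared engine
// and the counter that keeps the per-matrix map names unique.
class SharedEngineHolder {
    static volatile SimpleStorageEngine storageEngine; // volatile makes the double-checked init safe
    static final AtomicInteger storageId = new AtomicInteger();

    static void setStorageEngine() {
        if (storageEngine == null) {                      // fast path: engine already exists
            synchronized (SharedEngineHolder.class) {
                if (storageEngine == null) {              // re-check inside the lock
                    storageEngine = new SimpleStorageEngine();
                }
            }
        }
    }
}

// Plays the role of MapRealMatrix: it no longer creates an engine of its own,
// it only claims a uniquely named map from the shared engine.
class SparseMatrixSketch {
    private final int id;
    private final Map<Long, Double> entries;

    SparseMatrixSketch() {
        if (SharedEngineHolder.storageEngine == null) {
            throw new NullPointerException("The shared storage engine is not initialized.");
        }
        id = SharedEngineHolder.storageId.getAndIncrement();
        entries = SharedEngineHolder.storageEngine.getBigMap("tmp_mrmentries" + id);
    }

    // In the real class this clean-up happens in finalize(); dropping the map removes only
    // this matrix's entries while the shared engine stays open for the other matrices.
    void release() {
        SharedEngineHolder.storageEngine.dropBigMap("tmp_mrmentries" + id);
    }
}

The double-checked locking keeps the common path lock-free once the engine exists, which matters because matrix operations can construct many temporary MapRealMatrix instances.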
Showing 4 changed files with 57 additions and 23 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -40,6 +40,7 @@ Version 0.8.0-SNAPSHOT - Build 20161224
- Change the Validation mechanism to support Splitters. The AbstractValidator is now a single Validator class.
- K-fold cross validation is now performed by combining the KFoldSplitter and the Validator classes.
- Added support of a ShuffleSplitter.
- The MapRealMatrix instances don't initialize their own storage engines any more. Instead they use a single engine stored in MatrixDataframe.

Version 0.7.1-SNAPSHOT - Build 20161217
---------------------------------------
10 changes: 7 additions & 3 deletions TODO.txt
@@ -1,11 +1,15 @@
CODE IMPROVEMENTS
=================

- Consider dropping all the common.dataobjects and use their internalData directly instead.
- Support of better Transformers: Add the notion of Standardizers such as ZeroOne and Zscore, decouple OneHotEncoding from Standardization.
- New preprocessing package:
    - Remove the old datatransformation package but maintain the AbstractTransformer.
    - Create the following numerical Scalers: StandardScaler, MinMaxScaler, MaxAbsScaler, Binarization and PercentileScaler. All classes should indicate if Y is scaled, except for the binarizer.
    - Create the following categorical methods: OneHotEncoder (no reference levels) and CornerConstraintsEncoder (or set-to-zero, which uses reference levels).
    - Create a DataTransformer class that receives the TP for the numerical and categorical transformers and applies them in one step.
- Improve the API of Feature Selection and how we handle different data types.
- Write generic optimizers instead of having optimization methods in the algorithms. Add the optimizers and regularization packages under mathematics.
- Consider dropping all the common.dataobjects and use their internalData directly instead.
- Refactor the statistics package and replace all the static methods with proper inheritance.
- Write generic optimizers instead of having optimization methods in the algorithms. Add the optimizers and regularization packages under mathematics.

- Consider moving storages in a separate module that is inherited by common.
- Consider moving all tests in a separate module.
MapRealMatrix.java

@@ -35,6 +35,11 @@
*/
public class MapRealMatrix extends AbstractRealMatrix implements SparseRealMatrix {

/**
* The id of this Matrix.
*/
private final int id;

/**
* The number of rows of the matrix.
*/
@@ -50,11 +55,6 @@ public class MapRealMatrix extends AbstractRealMatrix implements SparseRealMatrix
*/
private final Map<Long, Double> entries;

/**
* The storage storage engine.
*/
private final StorageEngine storageEngine;

/**
* Protected constructor with the provided dimension arguments.
*
@@ -68,21 +68,24 @@ protected MapRealMatrix(int rowDimension, int columnDimension) throws NotStrictlyPositiveException {
this.rowDimension = rowDimension;
this.columnDimension = columnDimension;

String storageName = "mrm" + RandomGenerator.getThreadLocalRandomUnseeded().nextLong();
storageEngine = MatrixDataframe.configuration.getStorageConfiguration().createStorageEngine(storageName);
entries = storageEngine.getBigMap("tmp_entries", Long.class, Double.class, MapType.HASHMAP, StorageHint.IN_DISK, false, true);
if(MatrixDataframe.storageEngine == null) {
throw new NullPointerException("The MatrixDataframe storage engine is not initialized.");
}

id = MatrixDataframe.storageId.getAndIncrement();
entries = MatrixDataframe.storageEngine.getBigMap("tmp_mrmentries"+id, Long.class, Double.class, MapType.HASHMAP, StorageHint.IN_DISK, false, true);
}

/**
* When we perform matrix operations, we often lose the reference to the original matrix and we are unable to
* close its storage. Even though the JVM will close the storage before shutdown, by adding a close method in the finalize
* we ensure that if the object is gc, we will close the storage engine sooner.
* clear its storage. Even though the JVM will close the storage before shutdown, by adding clean-up logic in finalize
* we ensure that if the object is garbage collected, we will clear the unnecessary entries of the storage engine sooner.
* @throws java.lang.Throwable
*/
@Override
protected void finalize() throws Throwable {
try {
storageEngine.close();
MatrixDataframe.storageEngine.dropBigMap("tmp_mrmentries"+id, entries);
}
finally {
super.finalize();
MatrixDataframe.java

@@ -15,12 +15,14 @@
*/
package com.datumbox.framework.common.dataobjects;

import com.datumbox.framework.common.Configuration;
import com.datumbox.framework.common.storageengines.interfaces.StorageEngine;
import com.datumbox.framework.common.utilities.RandomGenerator;
import org.apache.commons.math3.linear.OpenMapRealVector;
import org.apache.commons.math3.linear.RealMatrix;
import org.apache.commons.math3.linear.RealVector;

import java.util.Map;
import java.util.concurrent.atomic.AtomicInteger;

/**
* The MatrixDataframe class is responsible for converting a Dataframe object to a
@@ -32,12 +34,17 @@ matrices and this class provides the tools to achieve the necessary conversions.
public class MatrixDataframe {

/**
* A reference to the most recently used Configuration object. It is necessary to define it static because
* some methods of the RealMatrix require generating new object without passing the configuration file.
* To have access on the configuration and build the data map, we require setting this static field with the latest Configuration
* object. It is package protected inorder to be accessible from the MapRealMatrix class.
* We create a single storage engine for all MatrixDataframe and MapRealMatrix objects. It is necessary to define it static
* and package protected to make it accessible to other classes such as MapRealMatrix. This is because
* some methods of RealMatrix require generating a new object without passing the configuration. To avoid hurting
* performance, the engine is created only once; the storageEngine field is initialized lazily in a thread-safe manner.
*/
static Configuration configuration;
static StorageEngine storageEngine;

/**
* Keeps a record of how many usages of the storageEngine were made, in order to avoid conflicting names.
*/
static final AtomicInteger storageId = new AtomicInteger();

private final RealMatrix X;
private final RealVector Y;
@@ -75,6 +82,23 @@ private MatrixDataframe(RealMatrix X, RealVector Y) {
this.Y = Y;
this.X = X;
}

/**
* Initializes the static storage engine if it's not already set.
*
* @param dataset the Dataframe whose configuration is used to create the storage engine
*/
private static void setStorageEngine(Dataframe dataset) {
//create a single storage engine for all the MapRealMatrixes
if (storageEngine == null) {
synchronized(MatrixDataframe.class) {
if (storageEngine == null) {
String storageName = "mdf" + RandomGenerator.getThreadLocalRandomUnseeded().nextLong();
storageEngine = dataset.configuration.getStorageConfiguration().createStorageEngine(storageName);
}
}
}
}

/**
* Method used to convert a training Dataframe to a MatrixDataframe and extract its contents
@@ -92,6 +116,8 @@ public static MatrixDataframe newInstance(Dataframe dataset, boolean addConstant
if(!featureIdsReference.isEmpty()) {
throw new IllegalArgumentException("The featureIdsReference map should be empty.");
}

setStorageEngine(dataset);


int n = dataset.size();
@@ -101,7 +127,6 @@ public static MatrixDataframe newInstance(Dataframe dataset, boolean addConstant
++d;
}

configuration = dataset.configuration;
MatrixDataframe m = new MatrixDataframe(new MapRealMatrix(n, d), new MapRealVector(n));

if(dataset.isEmpty()) {
@@ -167,11 +192,12 @@ public static MatrixDataframe parseDataset(Dataframe newData, Map<Integer, Integ
if(featureIdsReference.isEmpty()) {
throw new IllegalArgumentException("The featureIdsReference map should not be empty.");
}

setStorageEngine(newData);

int n = newData.size();
int d = featureIdsReference.size();

configuration = newData.configuration;
MatrixDataframe m = new MatrixDataframe(new MapRealMatrix(n, d), new MapRealVector(n));

if(newData.isEmpty()) {
@@ -229,7 +255,7 @@ public static RealVector parseRecord(Record r, Map<Object, Integer> featureIdsRe
int d = featureIdsReference.size();

//create a Map-backed vector only if the storage engine is available.
RealVector v = (configuration != null)?new MapRealVector(d):new OpenMapRealVector(d);
RealVector v = (storageEngine != null)?new MapRealVector(d):new OpenMapRealVector(d);

boolean addConstantColumn = featureIdsReference.containsKey(Dataframe.COLUMN_NAME_CONSTANT);

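A practical consequence visible in the diff above: a MapRealMatrix can now only be constructed after the shared engine has been set, which MatrixDataframe does at the top of newInstance() and parseDataset() via setStorageEngine(); otherwise the constructor throws a NullPointerException. In terms of the hypothetical sketch classes from the top of this page, the required ordering looks like this:

public class OrderingDemo {
    public static void main(String[] args) {
        // new SparseMatrixSketch();                     // would throw: the shared engine is not initialized yet
        SharedEngineHolder.setStorageEngine();           // in the framework this happens inside newInstance()/parseDataset()
        SparseMatrixSketch m = new SparseMatrixSketch(); // safe: the shared engine now exists
        m.release();                                     // drops only this matrix's map; the shared engine stays open
    }
}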
