From 6ed0170148a9b6ad223c3e72a3d3cd847fd0768c Mon Sep 17 00:00:00 2001 From: Vasilis Vryniotis Date: Thu, 22 Dec 2016 21:39:09 +0000 Subject: [PATCH] - Restored the DatabaseConfiguration.getDBnameSeparator() method. - Changed the RandomGenerator.getThreadLocalRandomUnseeded() to ensure we get different random numbers across threads. - CETR no longer receives any dbName parameter as we don't store anything on disk. - Removed underscores from all temporary names in the framework. - When we close() a Trainer that has not been loaded or saved, the knowledgeBase will be deleted to remove any temporary files. - The models of a specific dbName are added in a directory structure. - Created another level of abstraction for File-based Database Connectors and Configurations. - Renamed Folder to Directory in comments, methods, vars and config files. - Empty parent directories of the algorithm output are automatically cleaned up. --- CHANGELOG.md | 16 +- LICENSE | 0 TODO.txt | 3 +- .../applications/datamodeling/Modeler.java | 15 +- .../framework/applications/nlp/CETR.java | 9 +- .../framework/applications/nlp/CETRTest.java | 4 +- .../common/dataobjects/Dataframe.java | 374 +++++++++--------- .../common/dataobjects/MapRealMatrix.java | 2 +- .../common/interfaces/Trainable.java | 3 - .../abstracts/AbstractDatabaseConnector.java | 58 ++- .../inmemory/InMemoryConfiguration.java | 41 +- .../inmemory/InMemoryConnector.java | 44 +-- .../interfaces/DatabaseConfiguration.java | 13 +- .../mapdb/MapDBConfiguration.java | 42 +- .../mapdb/MapDBConnector.java | 72 ++-- .../common/utilities/RandomGenerator.java | 11 +- .../datumbox.config.default.properties | 8 +- .../common/dataobjects/DataframeTest.java | 3 +- .../test/resources/datumbox.config.properties | 8 +- .../clustering/HierarchicalAgglomerative.java | 2 +- .../machinelearning/clustering/Kmeans.java | 2 +- .../common/abstracts/AbstractTrainer.java | 33 +- .../algorithms/AbstractBoostingBagging.java | 13 +- 
.../AbstractCategoricalFeatureSelector.java | 8 +- .../common/dataobjects/TrainableBundle.java | 15 +- .../metrics/ClassificationMetrics.java | 8 +- .../CollaborativeFiltering.java | 2 +- .../regression/StepwiseRegression.java | 11 +- .../mathematics/discrete/Combinatorics.java | 1 - .../relatedsamples/Friedman.java | 1 - .../independentsamples/Logrank.java | 1 - .../classification/OrdinalRegressionTest.java | 3 +- .../classification/SoftMaxRegressionTest.java | 3 +- .../HierarchicalAgglomerativeTest.java | 3 +- .../clustering/KmeansTest.java | 3 +- .../MatrixLinearRegressionTest.java | 3 +- .../machinelearning/regression/NLMSTest.java | 3 +- 37 files changed, 384 insertions(+), 457 deletions(-) mode change 100755 => 100644 LICENSE diff --git a/CHANGELOG.md b/CHANGELOG.md index a690ce5a..1206dd8c 100755 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,7 +1,7 @@ CHANGELOG ========= -Version 0.8.0-SNAPSHOT - Build 20161221 +Version 0.8.0-SNAPSHOT - Build 20161222 --------------------------------------- - Improved Validation: @@ -9,9 +9,9 @@ Version 0.8.0-SNAPSHOT - Build 20161221 - Removed the kFold validation from Algorithms. Now we offer a new validator mechanism. - A single KnowledgeBase implementation is now used. - Removed the unnecessary n & d model parameters from all models. -- Random unseeded filenames are now produced using RandomGenerator.getThreadLocalRandomUnseeded(). +- Random unseeded filenames are now produced using RandomGenerator.getRandomUnseeded(). - Removing the need to call KnowledgeBase.init() in any predict/transform method. -- Improved DatabaseConnector: existsObject method, InMemory now stores objects independently, MapDB stores all files in folder. +- Improved DatabaseConnector: existsObject method, InMemory now stores objects independently, MapDB stores all files in directory. - The training parameters are now provided on the constructor of the algorithms not with a setter. - TextClassifier inherits from Modeler. 
- Removed all unnecessary passing of class objects from Stepwise Regression, Wrappers and Ensumble learning classes. @@ -21,8 +21,16 @@ Version 0.8.0-SNAPSHOT - Build 20161221 - Created a TrainableBundle to keep track of the Trainables of Modeler, AbstractBoostingBagging and StepwiseRegression. - Removed automatic save after fit, now save() must be called. - AbstractTrainer no longer stores a local copy of dbName. The save method accepts a dbName. -- The DatabaseConfiguration.getDBnameSeparator() method was removed. - The rename() is created in DatabaseConnectors and it's used by KnowledgeBase to saveAs the models. +- Restored the DatabaseConfiguration.getDBnameSeparator() method. +- Changed the RandomGenerator.getThreadLocalRandomUnseeded() to ensure we get different random numbers across threads. +- CETR no longer receives any dbName parameter as we don't store anything on disk. +- Removed underscores from all temporary names in the framework. +- When we close() a Trainer that has not been loaded or saved, the knowledgeBase will be deleted to remove any temporary files. +- The models of a specific dbName are added in a directory structure. +- Created another level of abstraction for File-based Database Connectors and Configurations. +- Renamed Folder to Directory in comments, methods, vars and config files. +- Empty parent directories of the algorithm output are automatically cleaned up. Version 0.7.1-SNAPSHOT - Build 20161217 --------------------------------------- diff --git a/LICENSE b/LICENSE old mode 100755 new mode 100644 diff --git a/TODO.txt b/TODO.txt index 7b9d2870..3d674e53 100755 --- a/TODO.txt +++ b/TODO.txt @@ -1,8 +1,7 @@ CODE IMPROVEMENTS ================= -- Can we add all the files of a model in a single folder. -- Can we make the two constructors of the Trainers to call a common constructor to eliminate duplicate code? +- Save and load method for Dataset. - Support of better Transformers (Zscore, decouple boolean transforming from numeric etc). 
- Write a ShuffleSplitValidator class similar to KFold. Perhaps we need a single Validator class and separate Splitters. diff --git a/datumbox-framework-applications/src/main/java/com/datumbox/framework/applications/datamodeling/Modeler.java b/datumbox-framework-applications/src/main/java/com/datumbox/framework/applications/datamodeling/Modeler.java index 2599cf59..524716fa 100755 --- a/datumbox-framework-applications/src/main/java/com/datumbox/framework/applications/datamodeling/Modeler.java +++ b/datumbox-framework-applications/src/main/java/com/datumbox/framework/applications/datamodeling/Modeler.java @@ -39,7 +39,7 @@ public class Modeler extends AbstractTrainer lines = Files.readAllLines(Paths.get(this.getClass().getClassLoader().getResource("datasets/example.com.html").toURI()), StandardCharsets.UTF_8); @@ -64,7 +62,7 @@ public void testExtract() { parameters.setNumberOfClusters(2); parameters.setAlphaWindowSizeFor2DModel(3); parameters.setSmoothingAverageRadius(2); - CETR instance = new CETR(dbName, conf); + CETR instance = new CETR(conf); String expResult = "This domain is established to be used for illustrative examples in documents. You may use this domain in examples without prior coordination or asking for permission."; String result = instance.extract(text, parameters); assertEquals(expResult, result); diff --git a/datumbox-framework-common/src/main/java/com/datumbox/framework/common/dataobjects/Dataframe.java b/datumbox-framework-common/src/main/java/com/datumbox/framework/common/dataobjects/Dataframe.java index 2d5c92fc..1954b424 100644 --- a/datumbox-framework-common/src/main/java/com/datumbox/framework/common/dataobjects/Dataframe.java +++ b/datumbox-framework-common/src/main/java/com/datumbox/framework/common/dataobjects/Dataframe.java @@ -43,60 +43,60 @@ * Machine Learning algorithms get as argument Dataframe objects. The class has an * internal static Builder class which can be used to generate Dataframe objects * from Text or CSV files. 
- * + * * @author Vasilis Vryniotis */ public class Dataframe implements Collection, Copyable { - + /** * Internal name of the response variable. */ public static final String COLUMN_NAME_Y = "~Y"; - + /** * Internal name of the constant. */ public static final String COLUMN_NAME_CONSTANT = "~CONSTANT"; - + /** * The Builder is a utility class which can help you build Dataframe from - Text files and CSV files. + Text files and CSV files. */ public static class Builder { - + /** * It builds a Dataframe object from a provided list of text files. The data - map should have as index the names of each class and as values the URIs - of the training files. The files should contain one training example - per row. If we want to parse a Text File of unknown category then - pass a single URI with null as key. - - The method requires as arguments a file with the category names and locations - of the training files, an instance of a TextExtractor which is used - to extract the keywords from the documents and the Database Configuration - Object. - * + map should have as index the names of each class and as values the URIs + of the training files. The files should contain one training example + per row. If we want to parse a Text File of unknown category then + pass a single URI with null as key. + + The method requires as arguments a file with the category names and locations + of the training files, an instance of a TextExtractor which is used + to extract the keywords from the documents and the Database Configuration + Object. 
+ * * @param textFilesMap * @param textExtractor * @param conf - * @return + * @return */ public static Dataframe parseTextFiles(Map textFilesMap, Extractable textExtractor, Configuration conf) { Dataframe dataset = new Dataframe(conf); Logger logger = LoggerFactory.getLogger(Dataframe.Builder.class); - + for (Map.Entry entry : textFilesMap.entrySet()) { Object theClass = entry.getKey(); URI datasetURI = entry.getValue(); - + logger.info("Dataset Parsing {} class", theClass); - + try (BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(new File(datasetURI)), "UTF8"))) { final int baseCounter = dataset.size(); //because we read multiple files we need to keep track of all records added earlier - ThreadMethods.throttledExecution(StreamMethods.enumerate(br.lines()), e -> { + ThreadMethods.throttledExecution(StreamMethods.enumerate(br.lines()), e -> { Integer rId = baseCounter + e.getKey(); String line = e.getValue(); - + AssociativeArray xData = new AssociativeArray( textExtractor.extract(StringCleaner.clear(line)) ); @@ -105,26 +105,26 @@ public static Dataframe parseTextFiles(Map textFilesMap, Extractabl //we call below the recalculateMeta() dataset.set(rId, r); }, conf.getConcurrencyConfig()); - } + } catch (IOException ex) { throw new RuntimeException(ex); } } - + return dataset; } - + /** * It builds a Dataframe object from a CSV file; the first line of the provided * CSV file must have a header with the column names. - * + * * The method accepts the following arguments: A Reader object from where * we will read the contents of the csv file. The name column of the * response variable y. A map with the column names and their respective * DataTypes. The char delimiter for the columns, the char for quotes and * the string of the record/row separator. The Database Configuration * object. 
- * + * * @param reader * @param yVariable * @param headerDataTypes @@ -134,44 +134,44 @@ public static Dataframe parseTextFiles(Map textFilesMap, Extractabl * @param skip * @param limit * @param conf - * @return + * @return */ - public static Dataframe parseCSVFile(Reader reader, String yVariable, LinkedHashMap headerDataTypes, - char delimiter, char quote, String recordSeparator, Long skip, Long limit, Configuration conf) { + public static Dataframe parseCSVFile(Reader reader, String yVariable, LinkedHashMap headerDataTypes, + char delimiter, char quote, String recordSeparator, Long skip, Long limit, Configuration conf) { Logger logger = LoggerFactory.getLogger(Dataframe.Builder.class); - + if(skip == null) { skip = 0L; } - + if(limit == null) { limit = Long.MAX_VALUE; } - + logger.info("Parsing CSV file"); - + if (!headerDataTypes.containsKey(yVariable)) { logger.warn("WARNING: The file is missing the response variable column {}.", yVariable); } - + TypeInference.DataType yDataType = headerDataTypes.get(yVariable); Map xDataTypes = new HashMap<>(headerDataTypes); //copy header types xDataTypes.remove(yVariable); //remove the response variable from xDataTypes Dataframe dataset = new Dataframe(conf, yDataType, xDataTypes); //use the private constructor to pass DataTypes directly and avoid updating them on the fly - - + + CSVFormat format = CSVFormat - .RFC4180 - .withHeader() - .withDelimiter(delimiter) - .withQuote(quote) - .withRecordSeparator(recordSeparator); - - try (final CSVParser parser = new CSVParser(reader, format)) { - ThreadMethods.throttledExecution(StreamMethods.enumerate(StreamMethods.stream(parser.spliterator(), false)).skip(skip).limit(limit), e -> { + .RFC4180 + .withHeader() + .withDelimiter(delimiter) + .withQuote(quote) + .withRecordSeparator(recordSeparator); + + try (final CSVParser parser = new CSVParser(reader, format)) { + ThreadMethods.throttledExecution(StreamMethods.enumerate(StreamMethods.stream(parser.spliterator(), 
false)).skip(skip).limit(limit), e -> { Integer rId = e.getKey(); CSVRecord row = e.getValue(); - + if (!row.isConsistent()) { logger.warn("WARNING: Skipping row {} because its size does not match the header size.", row.getRecordNumber()); } @@ -185,33 +185,33 @@ public static Dataframe parseCSVFile(Reader reader, String yVariable, LinkedHash Object value = TypeInference.DataType.parse(row.get(column), dataType); //parse the string value according to the DataType if (yVariable != null && yVariable.equals(column)) { y = value; - } + } else { xData.put(column, value); } } - + Record r = new Record(xData, y); - + //use the internal unsafe methods to avoid the update of the Metas. //The Metas are already set in the construction of the Dataframe. dataset._unsafe_set(rId, r); } }, conf.getConcurrencyConfig()); - } + } catch (IOException ex) { throw new RuntimeException(ex); } return dataset; } - } - - private TypeInference.DataType yDataType; + } + + private TypeInference.DataType yDataType; private Map xDataTypes; private Map records; private final AtomicInteger atomicNextAvailableRecordId = new AtomicInteger(); - + private final DatabaseConnector dbc; /** @@ -219,68 +219,68 @@ public static Dataframe parseCSVFile(Reader reader, String yVariable, LinkedHash * that extend the Dataframe or the MatrixDataframe class which is on the same package. */ protected final Configuration conf; - + /** * This executor is used for the parallel processing of streams with custom * Thread pool. */ protected final ForkJoinStream streamExecutor; - + /** * Public constructor of Dataframe. 
- * - * @param conf + * + * @param conf */ public Dataframe(Configuration conf) { this.conf = conf; - String dbName = "dts_" + RandomGenerator.getThreadLocalRandomUnseeded().nextLong(); + String dbName = "dts" + RandomGenerator.getThreadLocalRandomUnseeded().nextLong(); dbc = this.conf.getDbConfig().getConnector(dbName); - + records = dbc.getBigMap("tmp_records", Integer.class, Record.class, MapType.TREEMAP, StorageHint.IN_DISK, true, true); - + yDataType = null; xDataTypes = dbc.getBigMap("tmp_xDataTypes", Object.class, TypeInference.DataType.class, MapType.HASHMAP, StorageHint.IN_MEMORY, true, true); - + streamExecutor = new ForkJoinStream(this.conf.getConcurrencyConfig()); } - + /** * Private constructor used by the Builder inner static class. - * + * * @param conf * @param yDataType - * @param xDataTypes + * @param xDataTypes */ private Dataframe(Configuration conf, TypeInference.DataType yDataType, Map xDataTypes) { this(conf); this.yDataType = yDataType; this.xDataTypes.putAll(xDataTypes); } - - + + //Mandatory Collection Methods - + /** * Returns the total number of Records of the Dataframe. - * - * @return + * + * @return */ @Override public int size() { return records.size(); } - + /** * Checks if the Dataframe is empty. - * - * @return + * + * @return */ @Override public boolean isEmpty() { return records.isEmpty(); } - + /** * Clears all the internal Records of the Dataframe. The Dataframe can be used * after you clear it. @@ -288,35 +288,35 @@ public boolean isEmpty() { @Override public void clear() { yDataType = null; - + xDataTypes.clear(); records.clear(); } /** * Adds a record in the Dataframe and updates the Meta data. - * + * * @param r - * @return + * @return */ @Override public boolean add(Record r) { addRecord(r); return true; } - + /** * Checks if the Record exists in the Dataframe. Note that the Record is checked only * for its x and y components. 
- * + * * @param o - * @return + * @return */ @Override public boolean contains(Object o) { return records.containsValue((Record)o); } - + /** {@inheritDoc} */ @Override public boolean addAll(Collection c) { @@ -325,13 +325,13 @@ public boolean addAll(Collection c) { }); return true; } - + /** {@inheritDoc} */ @Override public boolean containsAll(Collection c) { return records.values().containsAll(c); } - + /** {@inheritDoc} */ @Override public Object[] toArray() { @@ -342,8 +342,8 @@ public Object[] toArray() { } return array; } - - /** {@inheritDoc} */ + + /** {@inheritDoc} */ @Override @SuppressWarnings("unchecked") public T[] toArray(T[] a) { @@ -357,31 +357,31 @@ public T[] toArray(T[] a) { } return a; } - + /** * Returns a read-only iterator on the values of the Dataframe. - * - * @return + * + * @return */ @Override public Iterator iterator() { return values().iterator(); } - + /** {@inheritDoc} */ @Override public Stream stream() { return StreamMethods.stream(values(), false); } - + //Optional Collection Methods - + /** * Removes the first occurrence of the specified element from this Dataframe, * if it is present and it does not update the metadata. - * + * * @param o - * @return + * @return */ @Override public boolean remove(Object o) { @@ -392,13 +392,13 @@ public boolean remove(Object o) { remove(id); return true; } - + /** * Removes all of this collection's elements that are also contained in the * specified collection and updates the metadata. - * + * * @param c - * @return + * @return */ @Override public boolean removeAll(Collection c) { @@ -415,9 +415,9 @@ public boolean removeAll(Collection c) { /** * Retains only the elements in this collection that are contained in the * specified collection and updates the meta data. 
- * + * * @param c - * @return + * @return */ @Override public boolean retainAll(Collection c) { @@ -435,29 +435,29 @@ public boolean retainAll(Collection c) { } return modified; } - - + + //Other methods /** * Removes a record with a particular id from the Dataframe but does not update * the metadata. - * + * * @param id - * @return + * @return */ public Record remove(Integer id) { return records.remove(id); } - + /** * Returns the index of the first occurrence of the specified element in this * Dataframe, or null if this Dataframe does not contain the element. * WARNING: The Records are checked only for their X and Y values, not for * the yPredicted and yPredictedProbabilities values. - * + * * @param o - * @return + * @return */ public Integer indexOf(Record o) { if(o!=null) { @@ -471,161 +471,161 @@ public Integer indexOf(Record o) { } return null; } - + /** * Returns a particular Record using its id. - * + * * @param id - * @return + * @return */ public Record get(Integer id) { return records.get(id); } - + /** * Adds a Record in the Dataframe and returns its id. - * + * * @param r - * @return + * @return */ public Integer addRecord(Record r) { Integer rId = _unsafe_add(r); updateMeta(r); return rId; } - + /** * Sets the record of a particular id in the dataset. If the record does not * exist it will be added with the specific id and the next added record will * have as id the next integer. - * + * * Note that the meta-data are partially updated. This means that if the replaced * Record contained a column which is now no longer available in the dataset, * then the meta-data will not refect this update (the column will continue to exist * in the meta data). If this is a problem, you should call the recalculateMeta() * method to force them being recalculated. 
- * + * * @param rId * @param r - * @return + * @return */ public Integer set(Integer rId, Record r) { _unsafe_set(rId, r); updateMeta(r); return rId; } - + /** * Returns the total number of X columns in the Dataframe. - * - * @return + * + * @return */ public int xColumnSize() { return xDataTypes.size(); } - + /** * Returns the type of the response variable y. - * - * @return + * + * @return */ public TypeInference.DataType getYDataType() { return yDataType; } - + /** * Returns an Map with column names as index and DataTypes as values. - * - * @return + * + * @return */ public Map getXDataTypes() { return Collections.unmodifiableMap(xDataTypes); } - + /** * It extracts the values of a particular column from all records and * stores them into an FlatDataList. - * + * * @param column - * @return + * @return */ public FlatDataList getXColumn(Object column) { FlatDataList flatDataList = new FlatDataList(); - + for(Record r : values()) { flatDataList.add(r.getX().get(column)); } - + return flatDataList; } - + /** * It extracts the values of the response variables from all observations and * stores them into an FlatDataList. - * - * @return + * + * @return */ public FlatDataList getYColumn() { FlatDataList flatDataList = new FlatDataList(); - + for(Record r : values()) { flatDataList.add(r.getY()); } - + return flatDataList; } - + /** * Removes completely a list of columns from the dataset. The meta-data of * the Dataframe are updated. The method internally uses threads. 
- * + * * @param columnSet */ - public void dropXColumns(Set columnSet) { + public void dropXColumns(Set columnSet) { columnSet.retainAll(xDataTypes.keySet()); //keep only those columns that are already known to the Meta data of the Dataframe - + if(columnSet.isEmpty()) { return; } - + //remove all the columns from the Meta data xDataTypes.keySet().removeAll(columnSet); - - streamExecutor.forEach(StreamMethods.stream(entries(), true), e -> { + + streamExecutor.forEach(StreamMethods.stream(entries(), true), e -> { Integer rId = e.getKey(); Record r = e.getValue(); - + AssociativeArray xData = r.getX().copy(); boolean modified = xData.keySet().removeAll(columnSet); - + if(modified) { Record newR = new Record(xData, r.getY(), r.getYPredicted(), r.getYPredictedProbabilities()); - + //safe to call in this context. we already updated the meta when we modified the xDataTypes _unsafe_set(rId, newR); } }); - + } - + /** * It generates and returns a new Dataframe which contains a subset of this Dataframe. * All the Records of the returned Dataframe are copies of the original Records. * The method is used for k-fold cross validation and sampling. Note that the * Records in the new Dataframe have DIFFERENT ids from the original ones. - * + * * @param idsCollection - * @return + * @return */ public Dataframe getSubset(FlatDataList idsCollection) { Dataframe d = new Dataframe(conf); - + for(Object id : idsCollection) { - d.add(get((Integer)id)); - } + d.add(get((Integer)id)); + } return d; } - + /** * It forces the recalculation of Meta data using the Records of the dataset. */ @@ -636,20 +636,20 @@ public void recalculateMeta() { updateMeta(r); } } - + /** {@inheritDoc} */ @Override public Dataframe copy() { Dataframe d = new Dataframe(conf); - + for(Map.Entry e : entries()) { Integer rId = e.getKey(); Record r = e.getValue(); - d.set(rId, r); - } + d.set(rId, r); + } return d; } - + /** * Deletes the Dataframe and removes all internal variables. 
Once you delete a * dataset, the instance can no longer be used. @@ -660,21 +660,21 @@ public void delete() { dbc.clear(); try { dbc.close(); - } + } catch (Exception ex) { throw new RuntimeException(ex); } - + //Ensures that the Dataframe can't be used after delete() is called. yDataType = null; xDataTypes = null; records = null; } - + /** * Returns a read-only Iterable on the keys and Records of the Dataframe. - * - * @return + * + * @return */ public Iterable> entries() { return () -> new Iterator>() { @@ -699,28 +699,28 @@ public void remove() { } }; } - + /** * Returns a read-only Iterable on the keys of the Dataframe. - * - * @return + * + * @return */ public Iterable index() { return () -> new Iterator() { private final Iterator it = records.keySet().iterator(); - + /** {@inheritDoc} */ @Override public boolean hasNext() { return it.hasNext(); } - + /** {@inheritDoc} */ @Override public Integer next() { return it.next(); } - + /** {@inheritDoc} */ @Override public void remove() { @@ -728,11 +728,11 @@ public void remove() { } }; } - + /** * Returns a read-only Iterable on the values of the Dataframe. - * - * @return + * + * @return */ public Iterable values() { return () -> new Iterator(){ @@ -757,7 +757,7 @@ public void remove() { } }; } - + /** * Sets the record in a particular position in the dataset, WITHOUT updating * the internal meta-info and returns the previous value (null if not existed). @@ -766,10 +766,10 @@ public void remove() { * unless you explicitly call the recalculateMeta() method, the meta data * will be corrupted. If you do use this method, MAKE sure you perform the * recalculation after you are done with the updates. - * + * * @param rId - * @param r - * @return + * @param r + * @return */ public Record _unsafe_set(Integer rId, Record r) { //move ahead the next id @@ -777,13 +777,13 @@ public Record _unsafe_set(Integer rId, Record r) { return records.put(rId, r); } - + /** * Adds the record in the dataset without updating the Meta. 
The add method * returns the id of the new record. - * + * * @param r - * @return + * @return */ private Integer _unsafe_add(Record r) { Integer newId = atomicNextAvailableRecordId.getAndIncrement(); @@ -791,33 +791,33 @@ private Integer _unsafe_add(Record r) { return newId; } - + /** * Protected getter for the DatabaseConnector of the Dataframe. It is used * by the DataframeMatrix. - * - * @return + * + * @return */ public DatabaseConnector getDbc() { return dbc; } - + /** * Updates the meta data of the Dataframe using the provided Record. * The Meta-data include the supported columns and their DataTypes. - * - * @param r + * + * @param r */ private void updateMeta(Record r) { for(Map.Entry entry : r.getX().entrySet()) { Object column = entry.getKey(); Object value = entry.getValue(); - + if(value!=null) { xDataTypes.putIfAbsent(column, TypeInference.getDataType(value)); } } - + if(yDataType == null) { Object value = r.getY(); if(value!=null) { @@ -825,5 +825,5 @@ private void updateMeta(Record r) { } } } - + } diff --git a/datumbox-framework-common/src/main/java/com/datumbox/framework/common/dataobjects/MapRealMatrix.java b/datumbox-framework-common/src/main/java/com/datumbox/framework/common/dataobjects/MapRealMatrix.java index 7f7d035d..8556c443 100644 --- a/datumbox-framework-common/src/main/java/com/datumbox/framework/common/dataobjects/MapRealMatrix.java +++ b/datumbox-framework-common/src/main/java/com/datumbox/framework/common/dataobjects/MapRealMatrix.java @@ -68,7 +68,7 @@ protected MapRealMatrix(int rowDimension, int columnDimension) throws NotStrictl this.rowDimension = rowDimension; this.columnDimension = columnDimension; - String dbName = "mrm_" + RandomGenerator.getThreadLocalRandomUnseeded().nextLong(); + String dbName = "mrm" + RandomGenerator.getThreadLocalRandomUnseeded().nextLong(); dbc = MatrixDataframe.conf.getDbConfig().getConnector(dbName); entries = dbc.getBigMap("tmp_entries", Long.class, Double.class, MapType.HASHMAP, 
StorageHint.IN_DISK, false, true); } diff --git a/datumbox-framework-common/src/main/java/com/datumbox/framework/common/interfaces/Trainable.java b/datumbox-framework-common/src/main/java/com/datumbox/framework/common/interfaces/Trainable.java index 39379739..c9d19b3e 100755 --- a/datumbox-framework-common/src/main/java/com/datumbox/framework/common/interfaces/Trainable.java +++ b/datumbox-framework-common/src/main/java/com/datumbox/framework/common/interfaces/Trainable.java @@ -15,11 +15,8 @@ */ package com.datumbox.framework.common.interfaces; -import com.datumbox.framework.common.Configuration; import com.datumbox.framework.common.dataobjects.Dataframe; -import java.lang.reflect.InvocationTargetException; - /** * This interface is used to mark classes that can be trained. This interface * used for classes that perform training/analysis and learn parameters. diff --git a/datumbox-framework-common/src/main/java/com/datumbox/framework/common/persistentstorage/abstracts/AbstractDatabaseConnector.java b/datumbox-framework-common/src/main/java/com/datumbox/framework/common/persistentstorage/abstracts/AbstractDatabaseConnector.java index 1f913d55..7aa6c8ce 100755 --- a/datumbox-framework-common/src/main/java/com/datumbox/framework/common/persistentstorage/abstracts/AbstractDatabaseConnector.java +++ b/datumbox-framework-common/src/main/java/com/datumbox/framework/common/persistentstorage/abstracts/AbstractDatabaseConnector.java @@ -16,18 +16,15 @@ package com.datumbox.framework.common.persistentstorage.abstracts; import com.datumbox.framework.common.persistentstorage.interfaces.BigMap; +import com.datumbox.framework.common.persistentstorage.interfaces.DatabaseConfiguration; import com.datumbox.framework.common.persistentstorage.interfaces.DatabaseConnector; import com.datumbox.framework.common.utilities.ReflectionMethods; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.io.IOException; import java.io.Serializable; -import 
java.io.UncheckedIOException; import java.lang.reflect.Field; import java.lang.reflect.Method; -import java.nio.file.*; -import java.nio.file.attribute.BasicFileAttributes; import java.util.HashMap; import java.util.LinkedList; import java.util.Map; @@ -42,7 +39,11 @@ * * @author Vasilis Vryniotis */ -public abstract class AbstractDatabaseConnector implements DatabaseConnector { +public abstract class AbstractDatabaseConnector implements DatabaseConnector { + + protected String dbName; + protected final DC dbConf; + /** * Logger for all Connectors. */ @@ -53,9 +54,15 @@ public abstract class AbstractDatabaseConnector implements DatabaseConnector { private Thread hook; /** - * Protected Constructor which is responsible for adding the Shutdown hook. + * Protected Constructor which is responsible for adding the Shutdown hook and storing the database name and configuration. + * + * @param dbName + * @param dbConf */ - protected AbstractDatabaseConnector() { + protected AbstractDatabaseConnector(String dbName, DC dbConf) { + this.dbName = dbName; + this.dbConf = dbConf; + hook = new Thread(() -> { AbstractDatabaseConnector.this.hook = null; if(AbstractDatabaseConnector.this.isClosed()) { @@ -64,6 +71,14 @@ protected AbstractDatabaseConnector() { AbstractDatabaseConnector.this.close(); }); Runtime.getRuntime().addShutdownHook(hook); + + logger.trace("Opened db {}", dbName); + } + + /** {@inheritDoc} */ + @Override + public String getDatabaseName() { + return dbName; } /** {@inheritDoc} */ @@ -187,33 +202,4 @@ protected void postDeserializer(T serializableObject) { } - /** - * Deletes the file or folder recursively if it exists. 
- * - * @param path - * @return - * @throws IOException - */ - protected boolean deleteIfExistsRecursively(Path path) throws IOException { - try { - return Files.deleteIfExists(path); - } - catch (DirectoryNotEmptyException ex) { - //do recursive delete - Files.walkFileTree(path, new SimpleFileVisitor() { - @Override - public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException { - Files.delete(file); - return FileVisitResult.CONTINUE; - } - - @Override - public FileVisitResult postVisitDirectory(Path dir, IOException exc) throws IOException { - Files.delete(dir); - return FileVisitResult.CONTINUE; - } - }); - return true; - } - } } diff --git a/datumbox-framework-common/src/main/java/com/datumbox/framework/common/persistentstorage/inmemory/InMemoryConfiguration.java b/datumbox-framework-common/src/main/java/com/datumbox/framework/common/persistentstorage/inmemory/InMemoryConfiguration.java index 588c217c..d986cd9c 100755 --- a/datumbox-framework-common/src/main/java/com/datumbox/framework/common/persistentstorage/inmemory/InMemoryConfiguration.java +++ b/datumbox-framework-common/src/main/java/com/datumbox/framework/common/persistentstorage/inmemory/InMemoryConfiguration.java @@ -15,7 +15,7 @@ */ package com.datumbox.framework.common.persistentstorage.inmemory; -import com.datumbox.framework.common.persistentstorage.interfaces.DatabaseConfiguration; +import com.datumbox.framework.common.persistentstorage.abstracts.AbstractFileDBConfiguration; import com.datumbox.framework.common.persistentstorage.interfaces.DatabaseConnector; import java.util.Properties; @@ -27,45 +27,18 @@ * * @author Vasilis Vryniotis */ -public class InMemoryConfiguration implements DatabaseConfiguration { +public class InMemoryConfiguration extends AbstractFileDBConfiguration { - //DB specific properties - private String outputFolder = null; - - /** - * Default Constructor. 
- */ - public InMemoryConfiguration() { - - } - /** {@inheritDoc} */ @Override - public DatabaseConnector getConnector(String database) { - return new InMemoryConnector(database, this); + public DatabaseConnector getConnector(String dbName) { + return new InMemoryConnector(dbName, this); } - + /** {@inheritDoc} */ @Override public void load(Properties properties) { - outputFolder = properties.getProperty("dbConfig.InMemoryConfiguration.outputFolder"); - } - - /** - * Getter for the output folder where the InMemory data files are stored. - * - * @return - */ - public String getOutputFolder() { - return outputFolder; - } - - /** - * Setter for the output folder where the InMemory data files are stored. - * - * @param outputFolder - */ - public void setOutputFolder(String outputFolder) { - this.outputFolder = outputFolder; + outputDirectory = properties.getProperty("dbConfig.InMemoryConfiguration.outputDirectory"); } + } diff --git a/datumbox-framework-common/src/main/java/com/datumbox/framework/common/persistentstorage/inmemory/InMemoryConnector.java b/datumbox-framework-common/src/main/java/com/datumbox/framework/common/persistentstorage/inmemory/InMemoryConnector.java index 3b47747a..aa991a82 100755 --- a/datumbox-framework-common/src/main/java/com/datumbox/framework/common/persistentstorage/inmemory/InMemoryConnector.java +++ b/datumbox-framework-common/src/main/java/com/datumbox/framework/common/persistentstorage/inmemory/InMemoryConnector.java @@ -16,6 +16,8 @@ package com.datumbox.framework.common.persistentstorage.inmemory; import com.datumbox.framework.common.persistentstorage.abstracts.AbstractDatabaseConnector; +import com.datumbox.framework.common.persistentstorage.abstracts.AbstractFileDBConnector; +import com.datumbox.framework.common.persistentstorage.interfaces.DatabaseConfiguration; import com.datumbox.framework.common.utilities.DeepCopy; import java.io.File; @@ -40,21 +42,15 @@ * * @author Vasilis Vryniotis */ -public class InMemoryConnector extends 
AbstractDatabaseConnector { - - private String dbName; - private final InMemoryConfiguration dbConf; +public class InMemoryConnector extends AbstractFileDBConnector { /** * @param dbName * @param dbConf - * @see AbstractDatabaseConnector#AbstractDatabaseConnector() + * @see AbstractDatabaseConnector#AbstractDatabaseConnector(String, DatabaseConfiguration) */ protected InMemoryConnector(String dbName, InMemoryConfiguration dbConf) { - super(); - this.dbName = dbName; - this.dbConf = dbConf; - logger.trace("Opened db {}", dbName); + super(dbName, dbConf); } /** {@inheritDoc} */ @@ -66,13 +62,7 @@ public boolean rename(String newDBName) { } try { - Path targetPath = getRootPath(newDBName); - deleteIfExistsRecursively(targetPath); - - Path srcPath = getRootPath(dbName); - if(Files.exists(srcPath)) { - Files.move(srcPath, targetPath); - } + moveDirectory(getRootPath(dbName), getRootPath(newDBName)); } catch (IOException ex) { throw new UncheckedIOException(ex); @@ -96,9 +86,7 @@ public void saveObject(String name, T serializableObjec assertConnectionOpen(); try { Path rootPath = getRootPath(dbName); - if(!Files.exists(rootPath)) { - Files.createDirectory(rootPath); - } + createDirectoryIfNotExists(rootPath); Path objectPath = new File(rootPath.toFile(), name).toPath(); Files.write(objectPath, DeepCopy.serialize(serializableObject)); @@ -143,7 +131,7 @@ public void close() { public void clear() { assertConnectionOpen(); try { - deleteIfExistsRecursively(getRootPath(dbName)); + deleteDirectory(getRootPath(dbName), true); } catch (IOException ex) { throw new UncheckedIOException(ex); @@ -171,22 +159,6 @@ else if(MapType.TREEMAP.equals(type)) { public void dropBigMap(String name, T map) { assertConnectionOpen(); map.clear(); - } - - /** {@inheritDoc} */ - @Override - public String getDatabaseName() { - return dbName; } - - private Path getRootPath(String dbName) { - //get the default filepath of the permanet db file - String outputFolder = dbConf.getOutputFolder(); - - 
if(outputFolder == null || outputFolder.isEmpty()) { - outputFolder = System.getProperty("java.io.tmpdir"); //write them to the tmp directory - } - return Paths.get(outputFolder + File.separator + dbName); - } } diff --git a/datumbox-framework-common/src/main/java/com/datumbox/framework/common/persistentstorage/interfaces/DatabaseConfiguration.java b/datumbox-framework-common/src/main/java/com/datumbox/framework/common/persistentstorage/interfaces/DatabaseConfiguration.java index 30d40044..c4c8b8a1 100755 --- a/datumbox-framework-common/src/main/java/com/datumbox/framework/common/persistentstorage/interfaces/DatabaseConfiguration.java +++ b/datumbox-framework-common/src/main/java/com/datumbox/framework/common/persistentstorage/interfaces/DatabaseConfiguration.java @@ -25,12 +25,21 @@ */ public interface DatabaseConfiguration extends Configurable { + /** + * Returns the separator that is used in the DB names. Usually the database + * names used by the algorithms are concatenations of various words separated + * by this character. + * + * @return + */ + public String getDBnameSeparator(); + /** * Initializes and returns a connection to the Database. 
* - * @param database + * @param dbName * @return */ - public DatabaseConnector getConnector(String database); + public DatabaseConnector getConnector(String dbName); } diff --git a/datumbox-framework-common/src/main/java/com/datumbox/framework/common/persistentstorage/mapdb/MapDBConfiguration.java b/datumbox-framework-common/src/main/java/com/datumbox/framework/common/persistentstorage/mapdb/MapDBConfiguration.java index f18c79c0..10caefb7 100755 --- a/datumbox-framework-common/src/main/java/com/datumbox/framework/common/persistentstorage/mapdb/MapDBConfiguration.java +++ b/datumbox-framework-common/src/main/java/com/datumbox/framework/common/persistentstorage/mapdb/MapDBConfiguration.java @@ -15,7 +15,7 @@ */ package com.datumbox.framework.common.persistentstorage.mapdb; -import com.datumbox.framework.common.persistentstorage.interfaces.DatabaseConfiguration; +import com.datumbox.framework.common.persistentstorage.abstracts.AbstractFileDBConfiguration; import com.datumbox.framework.common.persistentstorage.interfaces.DatabaseConnector; import java.util.Properties; @@ -28,56 +28,28 @@ * * @author Vasilis Vryniotis */ -public class MapDBConfiguration implements DatabaseConfiguration { +public class MapDBConfiguration extends AbstractFileDBConfiguration { - //DB specific properties - private String outputFolder = null; - private int cacheSize = 10000; private boolean compressed = true; private boolean hybridized = true; - - /** - * Default Constructor. 
- */ - public MapDBConfiguration() { - - } - + /** {@inheritDoc} */ @Override - public DatabaseConnector getConnector(String database) { - return new MapDBConnector(database, this); + public DatabaseConnector getConnector(String dbName) { + return new MapDBConnector(dbName, this); } - + /** {@inheritDoc} */ @Override public void load(Properties properties) { - outputFolder = properties.getProperty("dbConfig.MapDBConfiguration.outputFolder"); + outputDirectory = properties.getProperty("dbConfig.MapDBConfiguration.outputDirectory"); cacheSize = Integer.parseInt(properties.getProperty("dbConfig.MapDBConfiguration.cacheSize")); compressed = "true".equalsIgnoreCase(properties.getProperty("dbConfig.MapDBConfiguration.compressed")); hybridized = "true".equalsIgnoreCase(properties.getProperty("dbConfig.MapDBConfiguration.hybridized")); } - - /** - * Getter for the output folder where the MapDB data files are stored. - * - * @return - */ - public String getOutputFolder() { - return outputFolder; - } - - /** - * Setter for the output folder where the MapDB data files are stored. - * - * @param outputFolder - */ - public void setOutputFolder(String outputFolder) { - this.outputFolder = outputFolder; - } /** * Getter for the size of items stored in the LRU cache by MapDB. 
diff --git a/datumbox-framework-common/src/main/java/com/datumbox/framework/common/persistentstorage/mapdb/MapDBConnector.java b/datumbox-framework-common/src/main/java/com/datumbox/framework/common/persistentstorage/mapdb/MapDBConnector.java index 5b1cee4e..53dac32d 100755 --- a/datumbox-framework-common/src/main/java/com/datumbox/framework/common/persistentstorage/mapdb/MapDBConnector.java +++ b/datumbox-framework-common/src/main/java/com/datumbox/framework/common/persistentstorage/mapdb/MapDBConnector.java @@ -16,6 +16,8 @@ package com.datumbox.framework.common.persistentstorage.mapdb; import com.datumbox.framework.common.persistentstorage.abstracts.AbstractDatabaseConnector; +import com.datumbox.framework.common.persistentstorage.abstracts.AbstractFileDBConnector; +import com.datumbox.framework.common.persistentstorage.interfaces.DatabaseConfiguration; import com.datumbox.framework.common.persistentstorage.interfaces.DatabaseConnector; import org.mapdb.*; @@ -29,6 +31,7 @@ import java.util.*; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ConcurrentSkipListMap; +import java.util.concurrent.TimeUnit; /** @@ -39,10 +42,7 @@ * * @author Vasilis Vryniotis */ -public class MapDBConnector extends AbstractDatabaseConnector { - - private String dbName; - private final MapDBConfiguration dbConf; +public class MapDBConnector extends AbstractFileDBConnector { /** * Enum class which stores the Database Type used for every collection. 
@@ -79,13 +79,10 @@ private enum DBType { /** * @param dbName * @param dbConf - * @see AbstractDatabaseConnector#AbstractDatabaseConnector() + * @see AbstractDatabaseConnector#AbstractDatabaseConnector(String, DatabaseConfiguration) */ protected MapDBConnector(String dbName, MapDBConfiguration dbConf) { - super(); - this.dbName = dbName; - this.dbConf = dbConf; - logger.trace("Opened db {}", dbName); + super(dbName, dbConf); } /** {@inheritDoc} */ @@ -98,17 +95,28 @@ public boolean rename(String newDBName) { DB db = openDB(DBType.PRIMARY_DB); db.commit(); - db.close(); - try { - Path targetPath = getRootPath(newDBName); - deleteIfExistsRecursively(targetPath); + //find the underlying engine + Engine e = db.getEngine(); + while(EngineWrapper.class.isAssignableFrom(e.getClass())) { + e = ((EngineWrapper)e).getWrappedEngine(); + } - Path srcPath = getRootPath(dbName); - if(Files.exists(srcPath)) { - Files.move(srcPath, targetPath); + //close and wait until the close on the underlying engine is also finished + db.close(); + while(!e.isClosed()) { + logger.trace("Waiting for the engine to close"); + try { + TimeUnit.MILLISECONDS.sleep(100); + } + catch (InterruptedException ex) { + throw new RuntimeException(ex); } } + + try { + moveDirectory(getRootPath(dbName), getRootPath(newDBName)); + } catch (IOException ex) { throw new UncheckedIOException(ex); } @@ -181,7 +189,7 @@ public void clear() { closeDBRegistry(); try { - deleteIfExistsRecursively(getRootPath(dbName)); + deleteDirectory(getRootPath(dbName), true); } catch (IOException ex) { throw new UncheckedIOException(ex); @@ -282,13 +290,7 @@ public void dropBigMap(String name, T map) { map.clear(); } } - - /** {@inheritDoc} */ - @Override - public String getDatabaseName() { - return dbName; - } - + //private methods of connector class /** @@ -351,13 +353,11 @@ private DB openDB(DBType dbType) { if(dbType == DBType.PRIMARY_DB) { //main storage Path rootPath = getRootPath(dbName); - if(!Files.exists(rootPath)) { - try 
{ - Files.createDirectory(rootPath); - } - catch (IOException ex) { - throw new UncheckedIOException(ex); - } + try { + createDirectoryIfNotExists(rootPath); + } + catch (IOException ex) { + throw new UncheckedIOException(ex); } m = DBMaker.newFileDB(new File(rootPath.toFile(), DBType.PRIMARY_DB.toString()) ); @@ -427,15 +427,5 @@ private void closeDBRegistry() { } dbRegistry.clear(); } - - private Path getRootPath(String dbName) { - //get the default filepath of the permanet db file - String outputFolder = dbConf.getOutputFolder(); - if(outputFolder == null || outputFolder.isEmpty()) { - outputFolder = System.getProperty("java.io.tmpdir"); //write them to the tmp directory - } - - return Paths.get(outputFolder + File.separator + dbName); - } } diff --git a/datumbox-framework-common/src/main/java/com/datumbox/framework/common/utilities/RandomGenerator.java b/datumbox-framework-common/src/main/java/com/datumbox/framework/common/utilities/RandomGenerator.java index 1f7513a7..7b160f57 100755 --- a/datumbox-framework-common/src/main/java/com/datumbox/framework/common/utilities/RandomGenerator.java +++ b/datumbox-framework-common/src/main/java/com/datumbox/framework/common/utilities/RandomGenerator.java @@ -16,7 +16,6 @@ package com.datumbox.framework.common.utilities; import java.util.Random; -import java.util.concurrent.ThreadLocalRandom; /** * The RandomGenerator generates Random objects that can be read and modified @@ -31,7 +30,13 @@ public class RandomGenerator { private static ThreadLocal threadLocalRandom; - private static Random threadLocalRandomUnseeded = ThreadLocalRandom.current(); + private static final ThreadLocal threadLocalRandomUnseeded = new ThreadLocal() { + /** {@inheritDoc} */ + @Override + protected Random initialValue() { + return new Random(); + } + }; /** * Getter for the global seed. 
The global seed affects the initial seeding of @@ -92,6 +97,6 @@ protected Random initialValue() { * @return */ public static Random getThreadLocalRandomUnseeded() { - return threadLocalRandomUnseeded; + return threadLocalRandomUnseeded.get(); } } diff --git a/datumbox-framework-common/src/main/resources/datumbox.config.default.properties b/datumbox-framework-common/src/main/resources/datumbox.config.default.properties index 9b71f51b..2c5d63b2 100755 --- a/datumbox-framework-common/src/main/resources/datumbox.config.default.properties +++ b/datumbox-framework-common/src/main/resources/datumbox.config.default.properties @@ -25,15 +25,15 @@ dbConfig.className=com.datumbox.framework.common.persistentstorage.inmemory.InMe # InMemoryConfiguration # --------------------- -# The relative or absolute path for the output folder where the models are stored (if not specified the temporary folder is used): -dbConfig.InMemoryConfiguration.outputFolder= +# The relative or absolute path for the output directory where the models are stored (if not specified the temporary directory is used): +dbConfig.InMemoryConfiguration.outputDirectory= # MapDBConfiguration # ------------------ -# The relative or absolute path for the output folder where the models are stored (if not specified the temporary folder is used): -dbConfig.MapDBConfiguration.outputFolder= +# The relative or absolute path for the output directory where the models are stored (if not specified the temporary directory is used): +dbConfig.MapDBConfiguration.outputDirectory= # The number of records kept in each LRU cache. 
Setting it to 0 will disable caching (not recommended): dbConfig.MapDBConfiguration.cacheSize=10000 diff --git a/datumbox-framework-common/src/test/java/com/datumbox/framework/common/dataobjects/DataframeTest.java b/datumbox-framework-common/src/test/java/com/datumbox/framework/common/dataobjects/DataframeTest.java index 2ad0d369..9d6127ed 100755 --- a/datumbox-framework-common/src/test/java/com/datumbox/framework/common/dataobjects/DataframeTest.java +++ b/datumbox-framework-common/src/test/java/com/datumbox/framework/common/dataobjects/DataframeTest.java @@ -217,8 +217,7 @@ public void testExtractColumnValuesByY() { logger.info("extractColumnValuesByY"); Configuration conf = Configuration.getConfiguration(); - - Object column = "height"; + Dataframe dataset = new Dataframe(conf); AssociativeArray xData1 = new AssociativeArray(); diff --git a/datumbox-framework-common/src/test/resources/datumbox.config.properties b/datumbox-framework-common/src/test/resources/datumbox.config.properties index 9b71f51b..2c5d63b2 100755 --- a/datumbox-framework-common/src/test/resources/datumbox.config.properties +++ b/datumbox-framework-common/src/test/resources/datumbox.config.properties @@ -25,15 +25,15 @@ dbConfig.className=com.datumbox.framework.common.persistentstorage.inmemory.InMe # InMemoryConfiguration # --------------------- -# The relative or absolute path for the output folder where the models are stored (if not specified the temporary folder is used): -dbConfig.InMemoryConfiguration.outputFolder= +# The relative or absolute path for the output directory where the models are stored (if not specified the temporary directory is used): +dbConfig.InMemoryConfiguration.outputDirectory= # MapDBConfiguration # ------------------ -# The relative or absolute path for the output folder where the models are stored (if not specified the temporary folder is used): -dbConfig.MapDBConfiguration.outputFolder= +# The relative or absolute path for the output directory where the models are 
stored (if not specified the temporary directory is used): +dbConfig.MapDBConfiguration.outputDirectory= # The number of records kept in each LRU cache. Setting it to 0 will disable caching (not recommended): dbConfig.MapDBConfiguration.cacheSize=10000 diff --git a/datumbox-framework-core/src/main/java/com/datumbox/framework/core/machinelearning/clustering/HierarchicalAgglomerative.java b/datumbox-framework-core/src/main/java/com/datumbox/framework/core/machinelearning/clustering/HierarchicalAgglomerative.java index 215aca09..748b1358 100755 --- a/datumbox-framework-core/src/main/java/com/datumbox/framework/core/machinelearning/clustering/HierarchicalAgglomerative.java +++ b/datumbox-framework-core/src/main/java/com/datumbox/framework/core/machinelearning/clustering/HierarchicalAgglomerative.java @@ -378,7 +378,7 @@ protected void _fit(Dataframe trainingData) { private double calculateDistance(Record r1, Record r2) { TrainingParameters trainingParameters = knowledgeBase.getTrainingParameters(); - double distance = 0.0; + double distance; TrainingParameters.Distance distanceMethod = trainingParameters.getDistanceMethod(); if(distanceMethod==TrainingParameters.Distance.EUCLIDIAN) { distance = Distance.euclidean(r1.getX(), r2.getX()); diff --git a/datumbox-framework-core/src/main/java/com/datumbox/framework/core/machinelearning/clustering/Kmeans.java b/datumbox-framework-core/src/main/java/com/datumbox/framework/core/machinelearning/clustering/Kmeans.java index 7c019e03..41ef7e5b 100755 --- a/datumbox-framework-core/src/main/java/com/datumbox/framework/core/machinelearning/clustering/Kmeans.java +++ b/datumbox-framework-core/src/main/java/com/datumbox/framework/core/machinelearning/clustering/Kmeans.java @@ -588,7 +588,7 @@ private double calculateDistance(Record r1, Record r2) { TrainingParameters trainingParameters = knowledgeBase.getTrainingParameters(); Map featureWeights = modelParameters.getFeatureWeights(); - double distance = 0.0; + double distance; 
TrainingParameters.Distance distanceMethod = trainingParameters.getDistanceMethod(); if(distanceMethod==TrainingParameters.Distance.EUCLIDIAN) { distance = Distance.euclideanWeighted(r1.getX(), r2.getX(), featureWeights); diff --git a/datumbox-framework-core/src/main/java/com/datumbox/framework/core/machinelearning/common/abstracts/AbstractTrainer.java b/datumbox-framework-core/src/main/java/com/datumbox/framework/core/machinelearning/common/abstracts/AbstractTrainer.java index eb350e06..c07f68d7 100755 --- a/datumbox-framework-core/src/main/java/com/datumbox/framework/core/machinelearning/common/abstracts/AbstractTrainer.java +++ b/datumbox-framework-core/src/main/java/com/datumbox/framework/core/machinelearning/common/abstracts/AbstractTrainer.java @@ -113,6 +113,11 @@ public static abstract class AbstractTrainingParameters implements TrainingParam */ protected final KnowledgeBase knowledgeBase; + /** + * Flag that indicates whether the trainer has been saved or loaded from disk. + */ + private boolean persisted = false; + /** * Constructor which is called on model initialization before training. 
* @@ -120,7 +125,7 @@ public static abstract class AbstractTrainingParameters implements TrainingParam * @param conf */ protected AbstractTrainer(TP trainingParameters, Configuration conf) { - String knowledgeBaseName = createKnowledgeBaseName("kb_" + RandomGenerator.getThreadLocalRandomUnseeded().nextLong()); + String knowledgeBaseName = createKnowledgeBaseName("kb" + RandomGenerator.getThreadLocalRandomUnseeded().nextLong(), conf.getDbConfig().getDBnameSeparator()); knowledgeBase = new KnowledgeBase<>(knowledgeBaseName, conf, trainingParameters); } @@ -131,8 +136,9 @@ protected AbstractTrainer(TP trainingParameters, Configuration conf) { * @param conf */ protected AbstractTrainer(String dbName, Configuration conf) { - String knowledgeBaseName = createKnowledgeBaseName(dbName); + String knowledgeBaseName = createKnowledgeBaseName(dbName, conf.getDbConfig().getDBnameSeparator()); knowledgeBase = new KnowledgeBase<>(knowledgeBaseName, conf); + persisted = true; } /** {@inheritDoc} */ @@ -161,20 +167,34 @@ public void fit(Dataframe trainingData) { /** {@inheritDoc} */ @Override public void save(String dbName) { - String knowledgeBaseName = createKnowledgeBaseName(dbName); + logger.info("save()"); + + String knowledgeBaseName = createKnowledgeBaseName(dbName, knowledgeBase.getConf().getDbConfig().getDBnameSeparator()); knowledgeBase.save(knowledgeBaseName); + persisted = true; } /** {@inheritDoc} */ @Override public void delete() { + logger.info("delete()"); + knowledgeBase.delete(); } /** {@inheritDoc} */ @Override public void close() { - knowledgeBase.close(); + logger.info("close()"); + + if(persisted) { + //if the trainer is persisted in disk, just close the connection + knowledgeBase.close(); + } + else { + //if not try to delete it in case temporary files remained on disk + knowledgeBase.delete(); + } } /** @@ -188,9 +208,10 @@ public void close() { * Generates a name for the KnowledgeBase. 
* * @param dbName + * @param separator * @return */ - protected final String createKnowledgeBaseName(String dbName) { - return dbName + "_" + getClass().getSimpleName(); + protected final String createKnowledgeBaseName(String dbName, String separator) { + return dbName + separator + getClass().getSimpleName(); } } diff --git a/datumbox-framework-core/src/main/java/com/datumbox/framework/core/machinelearning/common/abstracts/algorithms/AbstractBoostingBagging.java b/datumbox-framework-core/src/main/java/com/datumbox/framework/core/machinelearning/common/abstracts/algorithms/AbstractBoostingBagging.java index 3e995588..ee62df2e 100755 --- a/datumbox-framework-core/src/main/java/com/datumbox/framework/core/machinelearning/common/abstracts/algorithms/AbstractBoostingBagging.java +++ b/datumbox-framework-core/src/main/java/com/datumbox/framework/core/machinelearning/common/abstracts/algorithms/AbstractBoostingBagging.java @@ -17,7 +17,6 @@ import com.datumbox.framework.common.Configuration; import com.datumbox.framework.common.dataobjects.*; -import com.datumbox.framework.common.interfaces.Trainable; import com.datumbox.framework.common.persistentstorage.interfaces.DatabaseConnector; import com.datumbox.framework.common.persistentstorage.interfaces.DatabaseConnector.MapType; import com.datumbox.framework.common.persistentstorage.interfaces.DatabaseConnector.StorageHint; @@ -44,7 +43,7 @@ */ public abstract class AbstractBoostingBagging extends AbstractClassifier { - private TrainableBundle bundle = new TrainableBundle(); + private final TrainableBundle bundle = new TrainableBundle(); private static final String DB_INDICATOR = "Cmp"; private static final int MAX_NUM_OF_RETRIES = 2; @@ -205,7 +204,6 @@ protected void _predict(Dataframe newData) { @Override protected void _fit(Dataframe trainingData) { Configuration conf = knowledgeBase.getConf(); - DatabaseConnector dbc = knowledgeBase.getDbc(); TP trainingParameters = knowledgeBase.getTrainingParameters(); MP 
modelParameters = knowledgeBase.getModelParameters(); @@ -318,9 +316,11 @@ protected enum Status { @Override public void save(String dbName) { initBundle(); - String knowledgeBaseName = createKnowledgeBaseName(dbName); - bundle.save(knowledgeBaseName); super.save(dbName); + + String separator = knowledgeBase.getConf().getDbConfig().getDBnameSeparator(); + String knowledgeBaseName = createKnowledgeBaseName(dbName, separator); + bundle.save(knowledgeBaseName, separator); } /** {@inheritDoc} */ @@ -348,6 +348,7 @@ private void initBundle() { DatabaseConnector dbc = knowledgeBase.getDbc(); MP modelParameters = knowledgeBase.getModelParameters(); TP trainingParameters = knowledgeBase.getTrainingParameters(); + String separator = conf.getDbConfig().getDBnameSeparator(); //the number of weak classifiers is the minimum between the classifiers that were defined in training parameters AND the number of the weak classifiers that were kept Class weakClassifierClass = trainingParameters.getWeakClassifierTrainingParameters().getTClass(); @@ -355,7 +356,7 @@ private void initBundle() { for(int i=0;i featureCount DatabaseConnector dbc = knowledgeBase.getDbc(); TP trainingParameters = knowledgeBase.getTrainingParameters(); Integer rareFeatureThreshold = trainingParameters.getRareFeatureThreshold(); - - Map columnTypes = data.getXDataTypes(); - + //find the featureCounts logger.debug("Estimating featureCounts"); @@ -242,13 +240,11 @@ private void removeRareFeatures(Dataframe data, Map featureCount private void buildFeatureStatistics(Dataframe data, Map classCounts, Map, Integer> featureClassCounts, Map featureCounts) { logger.debug("buildFeatureStatistics()"); - TP trainingParameters = knowledgeBase.getTrainingParameters(); //the method below does not only removes the rare features but also //first and formost calculates the contents of featureCounts map. 
removeRareFeatures(data, featureCounts); - - Map columnTypes = data.getXDataTypes(); + //now find the classCounts and the featureClassCounts logger.debug("Estimating classCounts and featureClassCounts"); for(Record r : data) { diff --git a/datumbox-framework-core/src/main/java/com/datumbox/framework/core/machinelearning/common/dataobjects/TrainableBundle.java b/datumbox-framework-core/src/main/java/com/datumbox/framework/core/machinelearning/common/dataobjects/TrainableBundle.java index 302c0d60..9939c5dc 100755 --- a/datumbox-framework-core/src/main/java/com/datumbox/framework/core/machinelearning/common/dataobjects/TrainableBundle.java +++ b/datumbox-framework-core/src/main/java/com/datumbox/framework/core/machinelearning/common/dataobjects/TrainableBundle.java @@ -87,17 +87,24 @@ public void setParallelized(boolean parallelized) { } } - /** {@inheritDoc} */ - public void save(String dbName) { + /** + * Saves all the trainables in the bundle. + * + * @param dbName + * @param separator + */ + public void save(String dbName, String separator) { for(Map.Entry e : bundle.entrySet()) { Trainable t = e.getValue(); if(t != null) { - t.save(dbName + "_" + e.getKey()); + t.save(dbName + separator + e.getKey()); } } } - /** {@inheritDoc} */ + /** + * Deletes all the trainables in the bundle. 
+ */ public void delete() { for(Trainable t : bundle.values()) { if(t != null) { diff --git a/datumbox-framework-core/src/main/java/com/datumbox/framework/core/machinelearning/modelselection/metrics/ClassificationMetrics.java b/datumbox-framework-core/src/main/java/com/datumbox/framework/core/machinelearning/modelselection/metrics/ClassificationMetrics.java index 38259566..2c298deb 100755 --- a/datumbox-framework-core/src/main/java/com/datumbox/framework/core/machinelearning/modelselection/metrics/ClassificationMetrics.java +++ b/datumbox-framework-core/src/main/java/com/datumbox/framework/core/machinelearning/modelselection/metrics/ClassificationMetrics.java @@ -61,13 +61,13 @@ public enum SensitivityRates { private double macroRecall = 0.0; private double macroF1 = 0.0; - private Map microPrecision = new HashMap<>(); //this is small. Size equal to 4*class numbers + private final Map microPrecision = new HashMap<>(); //this is small. Size equal to 4*class numbers - private Map microRecall = new HashMap<>(); //this is small. Size equal to 4*class numbers + private final Map microRecall = new HashMap<>(); //this is small. Size equal to 4*class numbers - private Map microF1 = new HashMap<>(); //this is small. Size equal to 4*class numbers + private final Map microF1 = new HashMap<>(); //this is small. Size equal to 4*class numbers - private Map, Double> contingencyTable = new HashMap<>(); //this is small. Size equal to 4*class numbers + private final Map, Double> contingencyTable = new HashMap<>(); //this is small. Size equal to 4*class numbers /** * Getter for Accuracy. 
diff --git a/datumbox-framework-core/src/main/java/com/datumbox/framework/core/machinelearning/recommendersystem/CollaborativeFiltering.java b/datumbox-framework-core/src/main/java/com/datumbox/framework/core/machinelearning/recommendersystem/CollaborativeFiltering.java index bab78911..5f255334 100755 --- a/datumbox-framework-core/src/main/java/com/datumbox/framework/core/machinelearning/recommendersystem/CollaborativeFiltering.java +++ b/datumbox-framework-core/src/main/java/com/datumbox/framework/core/machinelearning/recommendersystem/CollaborativeFiltering.java @@ -217,7 +217,7 @@ protected void _fit(Dataframe trainingData) { private double calculateSimilarity(Record r1, Record r2) { TrainingParameters trainingParameters = knowledgeBase.getTrainingParameters(); - double similarity = 0.0; + double similarity; TrainingParameters.SimilarityMeasure similarityMethod = trainingParameters.getSimilarityMethod(); if(similarityMethod==TrainingParameters.SimilarityMeasure.EUCLIDIAN) { similarity = Distance.euclidean(r1.getX(), r2.getX()); diff --git a/datumbox-framework-core/src/main/java/com/datumbox/framework/core/machinelearning/regression/StepwiseRegression.java b/datumbox-framework-core/src/main/java/com/datumbox/framework/core/machinelearning/regression/StepwiseRegression.java index d1c7818b..276c1e57 100755 --- a/datumbox-framework-core/src/main/java/com/datumbox/framework/core/machinelearning/regression/StepwiseRegression.java +++ b/datumbox-framework-core/src/main/java/com/datumbox/framework/core/machinelearning/regression/StepwiseRegression.java @@ -41,7 +41,7 @@ public class StepwiseRegression extends AbstractRegressor entry1 : dataTable.entrySet()) { - Object i = entry1.getKey(); AssociativeArray row = entry1.getValue(); //find the number of tied values and convert values into ranks diff --git a/datumbox-framework-core/src/main/java/com/datumbox/framework/core/statistics/survival/nonparametrics/independentsamples/Logrank.java 
b/datumbox-framework-core/src/main/java/com/datumbox/framework/core/statistics/survival/nonparametrics/independentsamples/Logrank.java index f570dba0..4f7944d7 100755 --- a/datumbox-framework-core/src/main/java/com/datumbox/framework/core/statistics/survival/nonparametrics/independentsamples/Logrank.java +++ b/datumbox-framework-core/src/main/java/com/datumbox/framework/core/statistics/survival/nonparametrics/independentsamples/Logrank.java @@ -116,7 +116,6 @@ else if(currentCensored entry : transposeDataCollection.entrySet()) { - Object j = entry.getKey(); FlatDataCollection flatDataCollection = entry.getValue(); for(Object value2 : flatDataCollection) { diff --git a/datumbox-framework-core/src/test/java/com/datumbox/framework/core/machinelearning/classification/OrdinalRegressionTest.java b/datumbox-framework-core/src/test/java/com/datumbox/framework/core/machinelearning/classification/OrdinalRegressionTest.java index ee995440..206b9b96 100755 --- a/datumbox-framework-core/src/test/java/com/datumbox/framework/core/machinelearning/classification/OrdinalRegressionTest.java +++ b/datumbox-framework-core/src/test/java/com/datumbox/framework/core/machinelearning/classification/OrdinalRegressionTest.java @@ -120,8 +120,7 @@ public void testKFoldCrossValidation() { Dataframe trainingData = data[0]; data[1].delete(); - - String dbName = this.getClass().getSimpleName(); + DummyXMinMaxNormalizer df = MLBuilder.create(new DummyXMinMaxNormalizer.TrainingParameters(), conf); df.fit_transform(trainingData); diff --git a/datumbox-framework-core/src/test/java/com/datumbox/framework/core/machinelearning/classification/SoftMaxRegressionTest.java b/datumbox-framework-core/src/test/java/com/datumbox/framework/core/machinelearning/classification/SoftMaxRegressionTest.java index d612d2e8..7deda563 100755 --- a/datumbox-framework-core/src/test/java/com/datumbox/framework/core/machinelearning/classification/SoftMaxRegressionTest.java +++ 
b/datumbox-framework-core/src/test/java/com/datumbox/framework/core/machinelearning/classification/SoftMaxRegressionTest.java @@ -122,8 +122,7 @@ public void testKFoldCrossValidation() { Dataframe trainingData = data[0]; data[1].delete(); - - String dbName = this.getClass().getSimpleName(); + XMinMaxNormalizer df = MLBuilder.create(new XMinMaxNormalizer.TrainingParameters(), conf); df.fit_transform(trainingData); diff --git a/datumbox-framework-core/src/test/java/com/datumbox/framework/core/machinelearning/clustering/HierarchicalAgglomerativeTest.java b/datumbox-framework-core/src/test/java/com/datumbox/framework/core/machinelearning/clustering/HierarchicalAgglomerativeTest.java index 3b0ca216..476f8861 100755 --- a/datumbox-framework-core/src/test/java/com/datumbox/framework/core/machinelearning/clustering/HierarchicalAgglomerativeTest.java +++ b/datumbox-framework-core/src/test/java/com/datumbox/framework/core/machinelearning/clustering/HierarchicalAgglomerativeTest.java @@ -111,8 +111,7 @@ public void testKFoldCrossValidation() { Dataframe trainingData = data[0]; data[1].delete(); - - String dbName = this.getClass().getSimpleName(); + DummyXYMinMaxNormalizer df = MLBuilder.create(new DummyXYMinMaxNormalizer.TrainingParameters(), conf); df.fit_transform(trainingData); diff --git a/datumbox-framework-core/src/test/java/com/datumbox/framework/core/machinelearning/clustering/KmeansTest.java b/datumbox-framework-core/src/test/java/com/datumbox/framework/core/machinelearning/clustering/KmeansTest.java index 69064e99..10ce6fa2 100755 --- a/datumbox-framework-core/src/test/java/com/datumbox/framework/core/machinelearning/clustering/KmeansTest.java +++ b/datumbox-framework-core/src/test/java/com/datumbox/framework/core/machinelearning/clustering/KmeansTest.java @@ -114,8 +114,7 @@ public void testKFoldCrossValidation() { Dataframe trainingData = data[0]; data[1].delete(); - - String dbName = this.getClass().getSimpleName(); + DummyXYMinMaxNormalizer df = 
MLBuilder.create(new DummyXYMinMaxNormalizer.TrainingParameters(), conf); df.fit_transform(trainingData); diff --git a/datumbox-framework-core/src/test/java/com/datumbox/framework/core/machinelearning/regression/MatrixLinearRegressionTest.java b/datumbox-framework-core/src/test/java/com/datumbox/framework/core/machinelearning/regression/MatrixLinearRegressionTest.java index 0fee244f..40a85518 100755 --- a/datumbox-framework-core/src/test/java/com/datumbox/framework/core/machinelearning/regression/MatrixLinearRegressionTest.java +++ b/datumbox-framework-core/src/test/java/com/datumbox/framework/core/machinelearning/regression/MatrixLinearRegressionTest.java @@ -107,8 +107,7 @@ public void testKFoldCrossValidation() { Dataframe[] data = Datasets.regressionMixed(conf); Dataframe trainingData = data[0]; data[1].delete(); - - String dbName = this.getClass().getSimpleName(); + DummyXYMinMaxNormalizer df = MLBuilder.create(new DummyXYMinMaxNormalizer.TrainingParameters(), conf); df.fit_transform(trainingData); diff --git a/datumbox-framework-core/src/test/java/com/datumbox/framework/core/machinelearning/regression/NLMSTest.java b/datumbox-framework-core/src/test/java/com/datumbox/framework/core/machinelearning/regression/NLMSTest.java index 64538f31..bf6faf28 100755 --- a/datumbox-framework-core/src/test/java/com/datumbox/framework/core/machinelearning/regression/NLMSTest.java +++ b/datumbox-framework-core/src/test/java/com/datumbox/framework/core/machinelearning/regression/NLMSTest.java @@ -110,8 +110,7 @@ public void testKFoldCrossValidation() { Dataframe[] data = Datasets.housingNumerical(conf); Dataframe trainingData = data[0]; data[1].delete(); - - String dbName = this.getClass().getSimpleName(); + DummyXYMinMaxNormalizer df = MLBuilder.create(new DummyXYMinMaxNormalizer.TrainingParameters(), conf); df.fit_transform(trainingData);