diff --git a/TODO.txt b/TODO.txt index cc584110..ae373306 100755 --- a/TODO.txt +++ b/TODO.txt @@ -1,8 +1,6 @@ CODE IMPROVEMENTS ================= -- Create a AnovaSelect feature selector and use it on the tests. - - Consider dropping all the common.dataobjects and use their internalData directly instead. - Refactor the statistics package and replace all the static methods with proper inheritance. - Write generic optimizers instead of having optimization methods in the algorithms. Add the optimizers and regularization packages under mathematics. @@ -35,7 +33,7 @@ NEW ALGORITHMS ============== - Create a PercentileScaler numerical scaler. -- Create the following FeatureSelectors: KruskalWallisSelect, SpearmanSelect. +- Create the following FeatureSelectors: AnovaSelect, KruskalWallisSelect, SpearmanSelect. - Speed up LDA: http://www.cs.ucsb.edu/~mingjia/cs240/doc/273811.pdf - Factorization Machines: http://www.csie.ntu.edu.tw/~b97053/paper/Rendle2010FM.pdf - Develop the FunkSVD and PLSI as probabilistic version of SVD. diff --git a/datumbox-framework-applications/src/test/java/com/datumbox/framework/applications/datamodeling/ModelerTest.java b/datumbox-framework-applications/src/test/java/com/datumbox/framework/applications/datamodeling/ModelerTest.java index af6ec0fd..4e9d8ef7 100755 --- a/datumbox-framework-applications/src/test/java/com/datumbox/framework/applications/datamodeling/ModelerTest.java +++ b/datumbox-framework-applications/src/test/java/com/datumbox/framework/applications/datamodeling/ModelerTest.java @@ -19,7 +19,9 @@ import com.datumbox.framework.common.dataobjects.Dataframe; import com.datumbox.framework.common.dataobjects.Record; import com.datumbox.framework.core.machinelearning.MLBuilder; -import com.datumbox.framework.core.machinelearning.classification.MultinomialNaiveBayes; +import com.datumbox.framework.core.machinelearning.classification.SoftMaxRegression; +import com.datumbox.framework.core.machinelearning.featureselection.ChisquareSelect; +import com.datumbox.framework.core.machinelearning.featureselection.PCA; import com.datumbox.framework.core.machinelearning.modelselection.metrics.ClassificationMetrics; import com.datumbox.framework.core.machinelearning.preprocessing.OneHotEncoder; import com.datumbox.framework.core.machinelearning.preprocessing.MinMaxScaler; @@ -53,8 +55,8 @@ public void testTrainAndValidate() { Dataframe[] data = Datasets.heartDiseaseClusters(configuration); Dataframe trainingData = data[0]; - Dataframe validationData = data[1]; - + Dataframe validationData = data[0].copy(); + Dataframe testData = data[1]; String storageName = this.getClass().getSimpleName(); @@ -70,11 +72,16 @@ public void testTrainAndValidate() { trainingParameters.setCategoricalEncoderTrainingParameters(ceParams); //feature selection configuration - trainingParameters.setFeatureSelectorTrainingParametersList(Arrays.asList()); + + PCA.TrainingParameters pcaParams = new PCA.TrainingParameters(); + pcaParams.setVariancePercentageThreshold(0.99999995); + trainingParameters.setFeatureSelectorTrainingParametersList(Arrays.asList(new ChisquareSelect.TrainingParameters(), pcaParams)); //model Configuration - MultinomialNaiveBayes.TrainingParameters modelTrainingParameters = new MultinomialNaiveBayes.TrainingParameters(); - modelTrainingParameters.setMultiProbabilityWeighted(true); + SoftMaxRegression.TrainingParameters modelTrainingParameters = new SoftMaxRegression.TrainingParameters(); + modelTrainingParameters.setL1(0.0001); + modelTrainingParameters.setL2(0.0001); + modelTrainingParameters.setTotalIterations(100); trainingParameters.setModelerTrainingParameters(modelTrainingParameters); Modeler instance = MLBuilder.create(trainingParameters, configuration); @@ -82,29 +89,30 @@ public void testTrainAndValidate() { instance.save(storageName); instance.close(); + trainingData.close(); instance = MLBuilder.load(Modeler.class, storageName, configuration); - instance.predict(trainingData); + instance.predict(validationData); - ClassificationMetrics vm = new ClassificationMetrics(trainingData); + ClassificationMetrics vm = new ClassificationMetrics(validationData); - double expResult2 = 0.7867564534231202; + double expResult2 = 0.8428731762065095; assertEquals(expResult2, vm.getMacroF1(), Constants.DOUBLE_ACCURACY_HIGH); - trainingData.close(); + validationData.close(); instance.close(); instance = MLBuilder.load(Modeler.class, storageName, configuration); - instance.predict(validationData); + instance.predict(testData); Map expResult = new HashMap<>(); Map result = new HashMap<>(); - for(Map.Entry e : validationData.entries()) { + for(Map.Entry e : testData.entries()) { Integer rId = e.getKey(); Record r = e.getValue(); expResult.put(rId, r.getY()); @@ -114,7 +122,7 @@ public void testTrainAndValidate() { instance.delete(); - validationData.close(); + testData.close(); } } diff --git a/datumbox-framework-core/src/main/java/com/datumbox/framework/core/machinelearning/common/abstracts/featureselectors/AbstractCountBasedFeatureSelector.java b/datumbox-framework-core/src/main/java/com/datumbox/framework/core/machinelearning/common/abstracts/featureselectors/AbstractCountBasedFeatureSelector.java index d92bd9dc..e920d6b4 100644 --- a/datumbox-framework-core/src/main/java/com/datumbox/framework/core/machinelearning/common/abstracts/featureselectors/AbstractCountBasedFeatureSelector.java +++ b/datumbox-framework-core/src/main/java/com/datumbox/framework/core/machinelearning/common/abstracts/featureselectors/AbstractCountBasedFeatureSelector.java @@ -134,13 +134,13 @@ protected void _fit(Dataframe trainingData) { /** {@inheritDoc} */ @Override protected Set getSupportedXDataTypes() { - return Collections.unmodifiableSet(new HashSet<>(Arrays.asList(TypeInference.DataType.BOOLEAN, TypeInference.DataType.NUMERICAL))); + return new HashSet<>(Arrays.asList(TypeInference.DataType.BOOLEAN, TypeInference.DataType.NUMERICAL)); } /** {@inheritDoc} */ @Override protected Set getSupportedYDataTypes() { - return Collections.unmodifiableSet(new HashSet<>(Arrays.asList(TypeInference.DataType.BOOLEAN, TypeInference.DataType.CATEGORICAL, TypeInference.DataType.ORDINAL))); + return new HashSet<>(Arrays.asList(TypeInference.DataType.BOOLEAN, TypeInference.DataType.CATEGORICAL, TypeInference.DataType.ORDINAL)); } /** diff --git a/datumbox-framework-core/src/main/java/com/datumbox/framework/core/machinelearning/common/abstracts/transformers/AbstractEncoder.java b/datumbox-framework-core/src/main/java/com/datumbox/framework/core/machinelearning/common/abstracts/transformers/AbstractEncoder.java index f5fc8f64..106d8818 100644 --- a/datumbox-framework-core/src/main/java/com/datumbox/framework/core/machinelearning/common/abstracts/transformers/AbstractEncoder.java +++ b/datumbox-framework-core/src/main/java/com/datumbox/framework/core/machinelearning/common/abstracts/transformers/AbstractEncoder.java @@ -20,7 +20,6 @@ import com.datumbox.framework.core.machinelearning.common.abstracts.AbstractTrainer; import java.util.Arrays; -import java.util.Collections; import java.util.HashSet; import java.util.Set; @@ -54,6 +53,6 @@ protected AbstractEncoder(String storageName, Configuration configuration) { /** {@inheritDoc} */ @Override protected Set getSupportedXDataTypes() { - return Collections.unmodifiableSet(new HashSet<>(Arrays.asList(TypeInference.DataType.CATEGORICAL, TypeInference.DataType.ORDINAL))); + return new HashSet<>(Arrays.asList(TypeInference.DataType.CATEGORICAL, TypeInference.DataType.ORDINAL)); } } diff --git a/datumbox-framework-core/src/main/java/com/datumbox/framework/core/machinelearning/common/abstracts/transformers/AbstractScaler.java b/datumbox-framework-core/src/main/java/com/datumbox/framework/core/machinelearning/common/abstracts/transformers/AbstractScaler.java index 18622d75..63db21e2 100644 --- a/datumbox-framework-core/src/main/java/com/datumbox/framework/core/machinelearning/common/abstracts/transformers/AbstractScaler.java +++ b/datumbox-framework-core/src/main/java/com/datumbox/framework/core/machinelearning/common/abstracts/transformers/AbstractScaler.java @@ -21,7 +21,6 @@ import com.datumbox.framework.core.machinelearning.common.abstracts.AbstractTrainer; import java.util.Arrays; -import java.util.Collections; import java.util.HashSet; import java.util.Set; @@ -92,7 +91,7 @@ protected AbstractScaler(String storageName, Configuration configuration) { /** {@inheritDoc} */ @Override protected Set getSupportedXDataTypes() { - return Collections.unmodifiableSet(new HashSet<>(Arrays.asList(TypeInference.DataType.NUMERICAL))); + return new HashSet<>(Arrays.asList(TypeInference.DataType.NUMERICAL)); } } diff --git a/datumbox-framework-core/src/main/java/com/datumbox/framework/core/machinelearning/featureselection/ChisquareSelect.java b/datumbox-framework-core/src/main/java/com/datumbox/framework/core/machinelearning/featureselection/ChisquareSelect.java index 08ca4a45..42fdd359 100755 --- a/datumbox-framework-core/src/main/java/com/datumbox/framework/core/machinelearning/featureselection/ChisquareSelect.java +++ b/datumbox-framework-core/src/main/java/com/datumbox/framework/core/machinelearning/featureselection/ChisquareSelect.java @@ -31,7 +31,7 @@ /** * Implementation of the Chisquare Feature Selection algorithm which can be used - * for evaluating categorical and boolean variables. + * for evaluating boolean variables. * * References: * http://nlp.stanford.edu/IR-book/html/htmledition/feature-selectionchi2-feature-selection-1.html diff --git a/datumbox-framework-core/src/main/java/com/datumbox/framework/core/machinelearning/featureselection/MutualInformation.java b/datumbox-framework-core/src/main/java/com/datumbox/framework/core/machinelearning/featureselection/MutualInformation.java index 8d6deac1..ebad1f1e 100755 --- a/datumbox-framework-core/src/main/java/com/datumbox/framework/core/machinelearning/featureselection/MutualInformation.java +++ b/datumbox-framework-core/src/main/java/com/datumbox/framework/core/machinelearning/featureselection/MutualInformation.java @@ -27,7 +27,7 @@ /** * Implementation of the Mutual Information Feature Selection algorithm which can be used - * for evaluating categorical and boolean variables. + * for evaluating boolean variables. * * References: * http://nlp.stanford.edu/IR-book/html/htmledition/mutual-information-1.html diff --git a/datumbox-framework-core/src/main/java/com/datumbox/framework/core/machinelearning/featureselection/PCA.java b/datumbox-framework-core/src/main/java/com/datumbox/framework/core/machinelearning/featureselection/PCA.java index 1234708e..d6ce47aa 100755 --- a/datumbox-framework-core/src/main/java/com/datumbox/framework/core/machinelearning/featureselection/PCA.java +++ b/datumbox-framework-core/src/main/java/com/datumbox/framework/core/machinelearning/featureselection/PCA.java @@ -365,7 +365,7 @@ protected void _transform(Dataframe newData) { /** {@inheritDoc} */ @Override protected Set getSupportedXDataTypes() { - return Collections.unmodifiableSet(new HashSet<>(Arrays.asList(TypeInference.DataType.BOOLEAN, TypeInference.DataType.NUMERICAL))); + return new HashSet<>(Arrays.asList(TypeInference.DataType.BOOLEAN, TypeInference.DataType.NUMERICAL)); } /** {@inheritDoc} */ diff --git a/datumbox-framework-core/src/main/java/com/datumbox/framework/core/machinelearning/featureselection/TFIDF.java b/datumbox-framework-core/src/main/java/com/datumbox/framework/core/machinelearning/featureselection/TFIDF.java index 4c39b48c..48848bf3 100755 --- a/datumbox-framework-core/src/main/java/com/datumbox/framework/core/machinelearning/featureselection/TFIDF.java +++ b/datumbox-framework-core/src/main/java/com/datumbox/framework/core/machinelearning/featureselection/TFIDF.java @@ -211,7 +211,7 @@ protected void _fit(Dataframe trainingData) { /** {@inheritDoc} */ @Override protected Set getSupportedXDataTypes() { - return Collections.unmodifiableSet(new HashSet<>(Arrays.asList(TypeInference.DataType.BOOLEAN, TypeInference.DataType.NUMERICAL))); + return new HashSet<>(Arrays.asList(TypeInference.DataType.BOOLEAN, TypeInference.DataType.NUMERICAL)); } /** {@inheritDoc} */