Skip to content

Commit

Permalink
Adding tests that show how to chain feature selectors and minor refac…
Browse files Browse the repository at this point in the history
…toring.
  • Loading branch information
datumbox committed Dec 31, 2016
1 parent e697f65 commit 6c7f412
Show file tree
Hide file tree
Showing 9 changed files with 30 additions and 26 deletions.
4 changes: 1 addition & 3 deletions TODO.txt
@@ -1,8 +1,6 @@
CODE IMPROVEMENTS
=================

- Create an AnovaSelect feature selector and use it in the tests.

- Consider dropping all the common.dataobjects and use their internalData directly instead.
- Refactor the statistics package and replace all the static methods with proper inheritance.
- Write generic optimizers instead of having optimization methods in the algorithms. Add the optimizers and regularization packages under mathematics.
Expand Down Expand Up @@ -35,7 +33,7 @@ NEW ALGORITHMS
==============

- Create a PercentileScaler numerical scaler.
- Create the following FeatureSelectors: KruskalWallisSelect, SpearmanSelect.
- Create the following FeatureSelectors: AnovaSelect, KruskalWallisSelect, SpearmanSelect.
- Speed up LDA: http://www.cs.ucsb.edu/~mingjia/cs240/doc/273811.pdf
- Factorization Machines: http://www.csie.ntu.edu.tw/~b97053/paper/Rendle2010FM.pdf
- Develop the FunkSVD and PLSI as probabilistic versions of SVD.
Expand Down
Expand Up @@ -19,7 +19,9 @@
import com.datumbox.framework.common.dataobjects.Dataframe;
import com.datumbox.framework.common.dataobjects.Record;
import com.datumbox.framework.core.machinelearning.MLBuilder;
import com.datumbox.framework.core.machinelearning.classification.MultinomialNaiveBayes;
import com.datumbox.framework.core.machinelearning.classification.SoftMaxRegression;
import com.datumbox.framework.core.machinelearning.featureselection.ChisquareSelect;
import com.datumbox.framework.core.machinelearning.featureselection.PCA;
import com.datumbox.framework.core.machinelearning.modelselection.metrics.ClassificationMetrics;
import com.datumbox.framework.core.machinelearning.preprocessing.OneHotEncoder;
import com.datumbox.framework.core.machinelearning.preprocessing.MinMaxScaler;
Expand Down Expand Up @@ -53,8 +55,8 @@ public void testTrainAndValidate() {
Dataframe[] data = Datasets.heartDiseaseClusters(configuration);

Dataframe trainingData = data[0];
Dataframe validationData = data[1];

Dataframe validationData = data[0].copy();
Dataframe testData = data[1];

String storageName = this.getClass().getSimpleName();

Expand All @@ -70,41 +72,47 @@ public void testTrainAndValidate() {
trainingParameters.setCategoricalEncoderTrainingParameters(ceParams);

//feature selection configuration
trainingParameters.setFeatureSelectorTrainingParametersList(Arrays.asList());

PCA.TrainingParameters pcaParams = new PCA.TrainingParameters();
pcaParams.setVariancePercentageThreshold(0.99999995);
trainingParameters.setFeatureSelectorTrainingParametersList(Arrays.asList(new ChisquareSelect.TrainingParameters(), pcaParams));

//model Configuration
MultinomialNaiveBayes.TrainingParameters modelTrainingParameters = new MultinomialNaiveBayes.TrainingParameters();
modelTrainingParameters.setMultiProbabilityWeighted(true);
SoftMaxRegression.TrainingParameters modelTrainingParameters = new SoftMaxRegression.TrainingParameters();
modelTrainingParameters.setL1(0.0001);
modelTrainingParameters.setL2(0.0001);
modelTrainingParameters.setTotalIterations(100);
trainingParameters.setModelerTrainingParameters(modelTrainingParameters);

Modeler instance = MLBuilder.create(trainingParameters, configuration);
instance.fit(trainingData);
instance.save(storageName);

instance.close();
trainingData.close();

instance = MLBuilder.load(Modeler.class, storageName, configuration);

instance.predict(trainingData);
instance.predict(validationData);

ClassificationMetrics vm = new ClassificationMetrics(trainingData);
ClassificationMetrics vm = new ClassificationMetrics(validationData);

double expResult2 = 0.7867564534231202;
double expResult2 = 0.8428731762065095;
assertEquals(expResult2, vm.getMacroF1(), Constants.DOUBLE_ACCURACY_HIGH);

trainingData.close();
validationData.close();
instance.close();


instance = MLBuilder.load(Modeler.class, storageName, configuration);

instance.predict(validationData);
instance.predict(testData);



Map<Integer, Object> expResult = new HashMap<>();
Map<Integer, Object> result = new HashMap<>();
for(Map.Entry<Integer, Record> e : validationData.entries()) {
for(Map.Entry<Integer, Record> e : testData.entries()) {
Integer rId = e.getKey();
Record r = e.getValue();
expResult.put(rId, r.getY());
Expand All @@ -114,7 +122,7 @@ public void testTrainAndValidate() {

instance.delete();

validationData.close();
testData.close();
}

}
Expand Up @@ -134,13 +134,13 @@ protected void _fit(Dataframe trainingData) {
/** {@inheritDoc} */
@Override
protected Set<TypeInference.DataType> getSupportedXDataTypes() {
return Collections.unmodifiableSet(new HashSet<>(Arrays.asList(TypeInference.DataType.BOOLEAN, TypeInference.DataType.NUMERICAL)));
return new HashSet<>(Arrays.asList(TypeInference.DataType.BOOLEAN, TypeInference.DataType.NUMERICAL));
}

/** {@inheritDoc} */
@Override
protected Set<TypeInference.DataType> getSupportedYDataTypes() {
// NOTE(review): diff view — the next two lines are the old (unmodifiable-wrapped)
// and new (plain HashSet) versions of the same return; only the second is in the
// committed file. The new form returns a mutable set — verify no caller mutates it.
return Collections.unmodifiableSet(new HashSet<>(Arrays.asList(TypeInference.DataType.BOOLEAN, TypeInference.DataType.CATEGORICAL, TypeInference.DataType.ORDINAL)));
return new HashSet<>(Arrays.asList(TypeInference.DataType.BOOLEAN, TypeInference.DataType.CATEGORICAL, TypeInference.DataType.ORDINAL));
}

/**
Expand Down
Expand Up @@ -20,7 +20,6 @@
import com.datumbox.framework.core.machinelearning.common.abstracts.AbstractTrainer;

import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.Set;

Expand Down Expand Up @@ -54,6 +53,6 @@ protected AbstractEncoder(String storageName, Configuration configuration) {
/** {@inheritDoc} */
@Override
protected Set<TypeInference.DataType> getSupportedXDataTypes() {
// NOTE(review): diff view (AbstractEncoder hunk) — old line wrapped the set in
// Collections.unmodifiableSet; the new line returns a mutable HashSet. Only the
// second line exists in the committed file; confirm the mutability change is intended.
return Collections.unmodifiableSet(new HashSet<>(Arrays.asList(TypeInference.DataType.CATEGORICAL, TypeInference.DataType.ORDINAL)));
return new HashSet<>(Arrays.asList(TypeInference.DataType.CATEGORICAL, TypeInference.DataType.ORDINAL));
}
}
Expand Up @@ -21,7 +21,6 @@
import com.datumbox.framework.core.machinelearning.common.abstracts.AbstractTrainer;

import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.Set;

Expand Down Expand Up @@ -92,7 +91,7 @@ protected AbstractScaler(String storageName, Configuration configuration) {
/** {@inheritDoc} */
@Override
protected Set<TypeInference.DataType> getSupportedXDataTypes() {
// NOTE(review): diff view (AbstractScaler hunk) — old/new versions of the same
// return statement; only the second (mutable HashSet, no unmodifiable wrapper)
// is in the committed file. Verify callers treat the result as read-only.
return Collections.unmodifiableSet(new HashSet<>(Arrays.asList(TypeInference.DataType.NUMERICAL)));
return new HashSet<>(Arrays.asList(TypeInference.DataType.NUMERICAL));
}

}
Expand Up @@ -31,7 +31,7 @@

/**
* Implementation of the Chisquare Feature Selection algorithm which can be used
* for evaluating categorical and boolean variables.
* for evaluating boolean variables.
*
* References:
* http://nlp.stanford.edu/IR-book/html/htmledition/feature-selectionchi2-feature-selection-1.html
Expand Down
Expand Up @@ -27,7 +27,7 @@

/**
* Implementation of the Mutual Information Feature Selection algorithm which can be used
* for evaluating categorical and boolean variables.
* for evaluating boolean variables.
*
* References:
* http://nlp.stanford.edu/IR-book/html/htmledition/mutual-information-1.html
Expand Down
Expand Up @@ -365,7 +365,7 @@ protected void _transform(Dataframe newData) {
/** {@inheritDoc} */
@Override
protected Set<TypeInference.DataType> getSupportedXDataTypes() {
// NOTE(review): diff view — the two return lines below are the pre-change
// (unmodifiable) and post-change (mutable HashSet) versions; only the second
// exists in the committed file. Mutability change should be confirmed.
return Collections.unmodifiableSet(new HashSet<>(Arrays.asList(TypeInference.DataType.BOOLEAN, TypeInference.DataType.NUMERICAL)));
return new HashSet<>(Arrays.asList(TypeInference.DataType.BOOLEAN, TypeInference.DataType.NUMERICAL));
}

/** {@inheritDoc} */
Expand Down
Expand Up @@ -211,7 +211,7 @@ protected void _fit(Dataframe trainingData) {
/** {@inheritDoc} */
@Override
protected Set<TypeInference.DataType> getSupportedXDataTypes() {
// NOTE(review): diff view — old (unmodifiable-wrapped) and new (plain HashSet)
// versions of the same return; only the second is in the committed file.
// As elsewhere in this commit, the returned set is now mutable — confirm intent.
return Collections.unmodifiableSet(new HashSet<>(Arrays.asList(TypeInference.DataType.BOOLEAN, TypeInference.DataType.NUMERICAL)));
return new HashSet<>(Arrays.asList(TypeInference.DataType.BOOLEAN, TypeInference.DataType.NUMERICAL));
}

/** {@inheritDoc} */
Expand Down

0 comments on commit 6c7f412

Please sign in to comment.