Skip to content

Commit

Permalink
Create a new OneHotEncoder preprocessing class.
Browse files Browse the repository at this point in the history
  • Loading branch information
datumbox committed Dec 26, 2016
1 parent 9a42a0e commit 78fb253
Show file tree
Hide file tree
Showing 15 changed files with 167 additions and 46 deletions.
3 changes: 2 additions & 1 deletion CHANGELOG.md
@@ -1,7 +1,7 @@
CHANGELOG
=========

Version 0.8.0-SNAPSHOT - Build 20161225
Version 0.8.0-SNAPSHOT - Build 20161226
---------------------------------------

- Initial Updates:
Expand Down Expand Up @@ -60,6 +60,7 @@ Version 0.8.0-SNAPSHOT - Build 20161225
- The MapRealMatrix instances don't initialize their own storage engines any more. Instead they use a single engine stored in MatrixDataframe.
- The recommendersystem package is renamed to recommendation.
- The old datatransformation package is replaced with the preprocessing package which decouples Numerical Scaling from Categorical variable encoding.
- Create a new OneHotEncoder preprocessing class.

Version 0.7.0 - Build 20160319
------------------------------
Expand Down
2 changes: 1 addition & 1 deletion README.md
Expand Up @@ -16,7 +16,7 @@ The code is licensed under the [Apache License, Version 2.0](./LICENSE).
Version
-------

The latest version is 0.8.0-SNAPSHOT (Build 20161225).
The latest version is 0.8.0-SNAPSHOT (Build 20161226).

The [devel branch](https://github.com/datumbox/datumbox-framework/tree/devel) is the development branch (default github branch). The [master branch](https://github.com/datumbox/datumbox-framework/tree/master) contains the latest stable version of the framework. All the stable releases are marked with [tags](https://github.com/datumbox/datumbox-framework/releases).

Expand Down
1 change: 0 additions & 1 deletion TODO.txt
Expand Up @@ -19,7 +19,6 @@ NEW FEATURES
============

- Create the following Numerical Scalers: StandardScaler, MaxAbsScaler, PercentileScaler and Binarization.
- Create a OneHotEncoder categorical encoder.

- Create a storage engine for MapDB 3 once caching & asynchronous writing is supported. Remove the HOTFIX for MapDB bug #664.
- Create a storage engine for BerkeleyDB.
Expand Down
Expand Up @@ -21,7 +21,7 @@
import com.datumbox.framework.core.machinelearning.MLBuilder;
import com.datumbox.framework.core.machinelearning.classification.MultinomialNaiveBayes;
import com.datumbox.framework.core.machinelearning.modelselection.metrics.ClassificationMetrics;
import com.datumbox.framework.core.machinelearning.preprocessing.CornerConstraintsEncoder;
import com.datumbox.framework.core.machinelearning.preprocessing.OneHotEncoder;
import com.datumbox.framework.core.machinelearning.preprocessing.MinMaxScaler;
import com.datumbox.framework.tests.Constants;
import com.datumbox.framework.tests.Datasets;
Expand Down Expand Up @@ -65,7 +65,7 @@ public void testTrainAndValidate() {
trainingParameters.setNumericalScalerTrainingParameters(nsParams);

//categorical encoding configuration
CornerConstraintsEncoder.TrainingParameters ceParams = new CornerConstraintsEncoder.TrainingParameters();
OneHotEncoder.TrainingParameters ceParams = new OneHotEncoder.TrainingParameters();
trainingParameters.setCategoricalEncoderTrainingParameters(ceParams);

//feature selection configuration
Expand Down
Expand Up @@ -16,6 +16,7 @@
package com.datumbox.framework.core.machinelearning.common.abstracts.transformers;

import com.datumbox.framework.common.Configuration;
import com.datumbox.framework.common.dataobjects.TypeInference;
import com.datumbox.framework.core.machinelearning.common.abstracts.AbstractTrainer;

/**
Expand Down Expand Up @@ -45,4 +46,15 @@ protected AbstractCategoricalEncoder(String storageName, Configuration configura
super(storageName, configuration);
}

/**
 * Decides whether a column has to be replaced by dummy (boolean) variables.
 * This holds only for categorical and ordinal columns.
 *
 * @param columnType the data type of the column under examination
 * @return true when the type is CATEGORICAL or ORDINAL; false for anything else
 */
protected boolean covert2dummy(TypeInference.DataType columnType) {
    if(columnType==TypeInference.DataType.CATEGORICAL) {
        return true;
    }
    return columnType==TypeInference.DataType.ORDINAL;
}

}
Expand Up @@ -198,6 +198,7 @@ public LinearRegressionMetrics(Dataframe predictedData) {
SST = SSR+SSE;
RSquare = SSR/SST;

//The d number is a proxy for the number of weights in the model, but it could be wrong, especially if categorical variables with unknown levels exist in the test set
int d = predictedData.xColumnSize()+1;//add one for the constant
int p = d - 1; //exclude constant

Expand Down
Expand Up @@ -163,15 +163,4 @@ protected void _transform(Dataframe newData) {
newData.recalculateMeta();
}

/**
 * Tells whether a variable of the given type is turned into dummy (boolean)
 * variables. Only the categorical and the ordinal types qualify.
 *
 * @param columnType the data type of the column under examination
 * @return true when the type is CATEGORICAL or ORDINAL; false for anything else
 */
private boolean covert2dummy(TypeInference.DataType columnType) {
    final boolean categorical = TypeInference.DataType.CATEGORICAL==columnType;
    final boolean ordinal = TypeInference.DataType.ORDINAL==columnType;
    return categorical || ordinal;
}

}
@@ -0,0 +1,119 @@
/**
* Copyright (C) 2013-2016 Vasilis Vryniotis <bbriniotis@datumbox.com>
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.datumbox.framework.core.machinelearning.preprocessing;

import com.datumbox.framework.common.Configuration;
import com.datumbox.framework.common.concurrency.StreamMethods;
import com.datumbox.framework.common.dataobjects.*;
import com.datumbox.framework.common.storageengines.interfaces.StorageEngine;
import com.datumbox.framework.core.machinelearning.common.abstracts.AbstractTrainer;
import com.datumbox.framework.core.machinelearning.common.abstracts.transformers.AbstractCategoricalEncoder;

import java.util.Arrays;
import java.util.List;
import java.util.Map;

/**
 * Categorical encoder which applies the One Hot Encoding method: each categorical
 * or ordinal column of a record is swapped for a boolean dummy variable that is
 * keyed by the (column, value) pair.
 *
 * @author Vasilis Vryniotis <bbriniotis@datumbox.com>
 */
public class OneHotEncoder extends AbstractCategoricalEncoder<OneHotEncoder.ModelParameters, OneHotEncoder.TrainingParameters> {

    /** {@inheritDoc} */
    public static class ModelParameters extends AbstractCategoricalEncoder.AbstractModelParameters {
        private static final long serialVersionUID = 1L;

        /**
         * @param storageEngine
         * @see AbstractTrainer.AbstractModelParameters#AbstractModelParameters(StorageEngine)
         */
        protected ModelParameters(StorageEngine storageEngine) {
            super(storageEngine);
        }

    }

    /** {@inheritDoc} */
    public static class TrainingParameters extends AbstractCategoricalEncoder.AbstractTrainingParameters {
        private static final long serialVersionUID = 1L;

    }

    /**
     * @param trainingParameters
     * @param configuration
     * @see AbstractTrainer#AbstractTrainer(AbstractTrainer.AbstractTrainingParameters, Configuration)
     */
    protected OneHotEncoder(TrainingParameters trainingParameters, Configuration configuration) {
        super(trainingParameters, configuration);
    }

    /**
     * @param storageName
     * @param configuration
     * @see AbstractTrainer#AbstractTrainer(String, Configuration)
     */
    protected OneHotEncoder(String storageName, Configuration configuration) {
        super(storageName, configuration);
    }

    /** {@inheritDoc} */
    @Override
    protected void _fit(Dataframe trainingData) {
        //nothing is estimated from the training data
    }

    /** {@inheritDoc} */
    @Override
    protected void _transform(Dataframe newData) {
        Map<Object, TypeInference.DataType> xDataTypes = newData.getXDataTypes();

        //Swap every convertible column of every record for its dummy variable
        streamExecutor.forEach(StreamMethods.stream(newData.entries(), isParallelized()), entry -> {
            Integer recordId = entry.getKey();
            Record original = entry.getValue();

            //the copy is mutated while the keys of the original record are iterated
            AssociativeArray encodedX = original.getX().copy();
            boolean changed = false;

            for(Object columnName : original.getX().keySet()) {
                if(covert2dummy(xDataTypes.get(columnName))) {
                    //drop the raw column and store a dummy keyed by the column-value pair
                    Object rawValue = encodedX.remove(columnName);
                    encodedX.put(Arrays.asList(columnName, rawValue), true);
                    changed = true;
                }
            }

            if(!changed) {
                return; //the record has no convertible columns, so it is left as is
            }

            Record encoded = new Record(encodedX, original.getY(), original.getYPredicted(), original.getYPredictedProbabilities());

            //the meta data are refreshed by the recalculateMeta() call below
            newData._unsafe_set(recordId, encoded);
        });

        //Rebuild the Meta info
        newData.recalculateMeta();
    }

}
Expand Up @@ -22,7 +22,7 @@
import com.datumbox.framework.core.machinelearning.modelselection.metrics.ClassificationMetrics;
import com.datumbox.framework.core.machinelearning.modelselection.Validator;
import com.datumbox.framework.core.machinelearning.modelselection.splitters.ShuffleSplitter;
import com.datumbox.framework.core.machinelearning.preprocessing.CornerConstraintsEncoder;
import com.datumbox.framework.core.machinelearning.preprocessing.OneHotEncoder;
import com.datumbox.framework.core.machinelearning.preprocessing.MinMaxScaler;
import com.datumbox.framework.tests.Constants;
import com.datumbox.framework.tests.Datasets;
Expand Down Expand Up @@ -65,8 +65,8 @@ public void testPredict() {
numericalScaler.fit_transform(trainingData);
numericalScaler.save(storageName);

CornerConstraintsEncoder.TrainingParameters ceParams = new CornerConstraintsEncoder.TrainingParameters();
CornerConstraintsEncoder categoricalEncoder = MLBuilder.create(ceParams, configuration);
OneHotEncoder.TrainingParameters ceParams = new OneHotEncoder.TrainingParameters();
OneHotEncoder categoricalEncoder = MLBuilder.create(ceParams, configuration);

categoricalEncoder.fit_transform(trainingData);
categoricalEncoder.save(storageName);
Expand All @@ -88,7 +88,7 @@ public void testPredict() {


numericalScaler = MLBuilder.load(MinMaxScaler.class, storageName, configuration);
categoricalEncoder = MLBuilder.load(CornerConstraintsEncoder.class, storageName, configuration);
categoricalEncoder = MLBuilder.load(OneHotEncoder.class, storageName, configuration);
instance = MLBuilder.load(MultinomialNaiveBayes.class, storageName, configuration);

numericalScaler.transform(validationData);
Expand Down
Expand Up @@ -22,7 +22,7 @@
import com.datumbox.framework.core.machinelearning.modelselection.metrics.ClassificationMetrics;
import com.datumbox.framework.core.machinelearning.modelselection.Validator;
import com.datumbox.framework.core.machinelearning.modelselection.splitters.KFoldSplitter;
import com.datumbox.framework.core.machinelearning.preprocessing.CornerConstraintsEncoder;
import com.datumbox.framework.core.machinelearning.preprocessing.OneHotEncoder;
import com.datumbox.framework.core.machinelearning.preprocessing.MinMaxScaler;
import com.datumbox.framework.tests.Constants;
import com.datumbox.framework.tests.Datasets;
Expand Down Expand Up @@ -65,8 +65,8 @@ public void testPredict() {
numericalScaler.fit_transform(trainingData);
numericalScaler.save(storageName);

CornerConstraintsEncoder.TrainingParameters ceParams = new CornerConstraintsEncoder.TrainingParameters();
CornerConstraintsEncoder categoricalEncoder = MLBuilder.create(ceParams, configuration);
OneHotEncoder.TrainingParameters ceParams = new OneHotEncoder.TrainingParameters();
OneHotEncoder categoricalEncoder = MLBuilder.create(ceParams, configuration);

categoricalEncoder.fit_transform(trainingData);
categoricalEncoder.save(storageName);
Expand Down Expand Up @@ -94,7 +94,7 @@ public void testPredict() {


numericalScaler = MLBuilder.load(MinMaxScaler.class, storageName, configuration);
categoricalEncoder = MLBuilder.load(CornerConstraintsEncoder.class, storageName, configuration);
categoricalEncoder = MLBuilder.load(OneHotEncoder.class, storageName, configuration);

instance = MLBuilder.load(OrdinalRegression.class, storageName, configuration);

Expand Down Expand Up @@ -141,8 +141,8 @@ public void testKFoldCrossValidation() {
MinMaxScaler numericalScaler = MLBuilder.create(nsParams, configuration);
numericalScaler.fit_transform(trainingData);

CornerConstraintsEncoder.TrainingParameters ceParams = new CornerConstraintsEncoder.TrainingParameters();
CornerConstraintsEncoder categoricalEncoder = MLBuilder.create(ceParams, configuration);
OneHotEncoder.TrainingParameters ceParams = new OneHotEncoder.TrainingParameters();
OneHotEncoder categoricalEncoder = MLBuilder.create(ceParams, configuration);
categoricalEncoder.fit_transform(trainingData);

OrdinalRegression.TrainingParameters param = new OrdinalRegression.TrainingParameters();
Expand Down
Expand Up @@ -22,7 +22,7 @@
import com.datumbox.framework.core.machinelearning.modelselection.metrics.ClassificationMetrics;
import com.datumbox.framework.core.machinelearning.modelselection.Validator;
import com.datumbox.framework.core.machinelearning.modelselection.splitters.KFoldSplitter;
import com.datumbox.framework.core.machinelearning.preprocessing.CornerConstraintsEncoder;
import com.datumbox.framework.core.machinelearning.preprocessing.OneHotEncoder;
import com.datumbox.framework.core.machinelearning.preprocessing.MinMaxScaler;
import com.datumbox.framework.tests.Constants;
import com.datumbox.framework.tests.Datasets;
Expand Down Expand Up @@ -65,8 +65,8 @@ public void testPredict() {
numericalScaler.fit_transform(trainingData);
numericalScaler.save(storageName);

CornerConstraintsEncoder.TrainingParameters ceParams = new CornerConstraintsEncoder.TrainingParameters();
CornerConstraintsEncoder categoricalEncoder = MLBuilder.create(ceParams, configuration);
OneHotEncoder.TrainingParameters ceParams = new OneHotEncoder.TrainingParameters();
OneHotEncoder categoricalEncoder = MLBuilder.create(ceParams, configuration);

categoricalEncoder.fit_transform(trainingData);
categoricalEncoder.save(storageName);
Expand All @@ -90,7 +90,7 @@ public void testPredict() {


numericalScaler = MLBuilder.load(MinMaxScaler.class, storageName, configuration);
categoricalEncoder = MLBuilder.load(CornerConstraintsEncoder.class, storageName, configuration);
categoricalEncoder = MLBuilder.load(OneHotEncoder.class, storageName, configuration);
instance = MLBuilder.load(SoftMaxRegression.class, storageName, configuration);

numericalScaler.transform(validationData);
Expand Down
Expand Up @@ -22,7 +22,7 @@
import com.datumbox.framework.core.machinelearning.modelselection.metrics.ClassificationMetrics;
import com.datumbox.framework.core.machinelearning.modelselection.Validator;
import com.datumbox.framework.core.machinelearning.modelselection.splitters.KFoldSplitter;
import com.datumbox.framework.core.machinelearning.preprocessing.CornerConstraintsEncoder;
import com.datumbox.framework.core.machinelearning.preprocessing.OneHotEncoder;
import com.datumbox.framework.core.machinelearning.preprocessing.MinMaxScaler;
import com.datumbox.framework.tests.Constants;
import com.datumbox.framework.tests.Datasets;
Expand Down Expand Up @@ -67,8 +67,8 @@ public void testPredict() {
numericalScaler.fit_transform(trainingData);
numericalScaler.save(storageName);

CornerConstraintsEncoder.TrainingParameters ceParams = new CornerConstraintsEncoder.TrainingParameters();
CornerConstraintsEncoder categoricalEncoder = MLBuilder.create(ceParams, configuration);
OneHotEncoder.TrainingParameters ceParams = new OneHotEncoder.TrainingParameters();
OneHotEncoder categoricalEncoder = MLBuilder.create(ceParams, configuration);

categoricalEncoder.fit_transform(trainingData);
categoricalEncoder.save(storageName);
Expand All @@ -90,7 +90,7 @@ public void testPredict() {


numericalScaler = MLBuilder.load(MinMaxScaler.class, storageName, configuration);
categoricalEncoder = MLBuilder.load(CornerConstraintsEncoder.class, storageName, configuration);
categoricalEncoder = MLBuilder.load(OneHotEncoder.class, storageName, configuration);
instance = MLBuilder.load(SupportVectorMachine.class, storageName, configuration);

numericalScaler.transform(validationData);
Expand Down
Expand Up @@ -23,7 +23,7 @@
import com.datumbox.framework.core.machinelearning.modelselection.metrics.ClassificationMetrics;
import com.datumbox.framework.core.machinelearning.modelselection.Validator;
import com.datumbox.framework.core.machinelearning.modelselection.splitters.KFoldSplitter;
import com.datumbox.framework.core.machinelearning.preprocessing.CornerConstraintsEncoder;
import com.datumbox.framework.core.machinelearning.preprocessing.OneHotEncoder;
import com.datumbox.framework.core.machinelearning.preprocessing.MinMaxScaler;
import com.datumbox.framework.tests.Constants;
import com.datumbox.framework.tests.Datasets;
Expand Down Expand Up @@ -66,8 +66,8 @@ public void testPredict() {
numericalScaler.fit_transform(trainingData);
numericalScaler.save(storageName);

CornerConstraintsEncoder.TrainingParameters ceParams = new CornerConstraintsEncoder.TrainingParameters();
CornerConstraintsEncoder categoricalEncoder = MLBuilder.create(ceParams, configuration);
OneHotEncoder.TrainingParameters ceParams = new OneHotEncoder.TrainingParameters();
OneHotEncoder categoricalEncoder = MLBuilder.create(ceParams, configuration);

categoricalEncoder.fit_transform(trainingData);
categoricalEncoder.save(storageName);
Expand Down Expand Up @@ -95,7 +95,7 @@ public void testPredict() {


numericalScaler = MLBuilder.load(MinMaxScaler.class, storageName, configuration);
categoricalEncoder = MLBuilder.load(CornerConstraintsEncoder.class, storageName, configuration);
categoricalEncoder = MLBuilder.load(OneHotEncoder.class, storageName, configuration);
instance = MLBuilder.load(Adaboost.class, storageName, configuration);

numericalScaler.transform(validationData);
Expand Down

0 comments on commit 78fb253

Please sign in to comment.