Skip to content

Commit

Permalink
Adding a BinaryScaler.
Browse files Browse the repository at this point in the history
  • Loading branch information
datumbox committed Dec 27, 2016
1 parent 4dc9911 commit a64e25a
Show file tree
Hide file tree
Showing 6 changed files with 158 additions and 5 deletions.
2 changes: 1 addition & 1 deletion CHANGELOG.md
Expand Up @@ -60,7 +60,7 @@ Version 0.8.0-SNAPSHOT - Build 20161227
- The MapRealMatrix instances don't initialize their own storage engines any more. Instead they use a single engine stored in MatrixDataframe. - The MapRealMatrix instances don't initialize their own storage engines any more. Instead they use a single engine stored in MatrixDataframe.
- The recommendersystem package is renamed to recommendation. - The recommendersystem package is renamed to recommendation.
- The old datatransformation package is replaced with the preprocessing package which decouples Numerical Scaling from Categorical variable encoding. - The old datatransformation package is replaced with the preprocessing package which decouples Numerical Scaling from Categorical variable encoding.
- Added the following preprocessing algorithms: OneHotEncoder, StandardScaler, MaxAbsScaler. - Added the following preprocessing algorithms: OneHotEncoder, StandardScaler, MaxAbsScaler, BinaryScaler.
- Added minAbsolute and maxAbsolute methods in Descriptives. - Added minAbsolute and maxAbsolute methods in Descriptives.


Version 0.7.0 - Build 20160319 Version 0.7.0 - Build 20160319
Expand Down
2 changes: 1 addition & 1 deletion TODO.txt
Expand Up @@ -18,7 +18,7 @@ CODE IMPROVEMENTS
NEW FEATURES NEW FEATURES
============ ============


- Create the following Numerical Scalers: PercentileScaler and Binarization. - Create the following Numerical Scalers: PercentileScaler.


- Create a storage engine for MapDB 3 once caching & asynchronous writing is supported. Remove the HOTFIX for MapDB bug #664. - Create a storage engine for MapDB 3 once caching & asynchronous writing is supported. Remove the HOTFIX for MapDB bug #664.
- Create a storage engine for BerkeleyDB. - Create a storage engine for BerkeleyDB.
Expand Down
@@ -0,0 +1,153 @@
/**
* Copyright (C) 2013-2016 Vasilis Vryniotis <bbriniotis@datumbox.com>
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.datumbox.framework.core.machinelearning.preprocessing;

import com.datumbox.framework.common.Configuration;
import com.datumbox.framework.common.concurrency.StreamMethods;
import com.datumbox.framework.common.dataobjects.*;
import com.datumbox.framework.common.storageengines.interfaces.StorageEngine;
import com.datumbox.framework.core.machinelearning.common.abstracts.AbstractTrainer;
import com.datumbox.framework.core.machinelearning.common.abstracts.transformers.AbstractNumericalScaler;

import java.util.Map;

/**
 * Binarizes the numerical features of the dataset using a configurable
 * threshold: values strictly greater than the threshold are turned into
 * {@code true} and all the other values are turned into {@code false}.
 *
 * @author Vasilis Vryniotis <bbriniotis@datumbox.com>
 */
public class BinaryScaler extends AbstractNumericalScaler<BinaryScaler.ModelParameters, BinaryScaler.TrainingParameters> {

    /** {@inheritDoc} */
    public static class ModelParameters extends AbstractNumericalScaler.AbstractModelParameters {
        private static final long serialVersionUID = 1L;

        /**
         * @param storageEngine
         * @see AbstractTrainer.AbstractModelParameters#AbstractModelParameters(StorageEngine)
         */
        protected ModelParameters(StorageEngine storageEngine) {
            super(storageEngine);
        }

    }

    /** {@inheritDoc} */
    public static class TrainingParameters extends AbstractNumericalScaler.AbstractTrainingParameters {
        private static final long serialVersionUID = 1L;

        /** The cut-off used for the binarization; defaults to 0.0. */
        private double threshold = 0.0;

        /**
         * Getter for the threshold.
         *
         * @return the threshold used for the binarization
         */
        public double getThreshold() {
            return threshold;
        }

        /**
         * Setter for the threshold. Values less or equal to the threshold are turned into false and those greater
         * are turned into true.
         *
         * @param threshold the threshold used for the binarization
         */
        public void setThreshold(double threshold) {
            this.threshold = threshold;
        }
    }

    /**
     * @param trainingParameters
     * @param configuration
     * @see AbstractTrainer#AbstractTrainer(AbstractTrainer.AbstractTrainingParameters, Configuration)
     */
    protected BinaryScaler(TrainingParameters trainingParameters, Configuration configuration) {
        super(trainingParameters, configuration);
    }

    /**
     * @param storageName
     * @param configuration
     * @see AbstractTrainer#AbstractTrainer(String, Configuration)
     */
    protected BinaryScaler(String storageName, Configuration configuration) {
        super(storageName, configuration);
    }

    /** {@inheritDoc} */
    @Override
    protected void _fit(Dataframe trainingData) {
        //Nothing is estimated from the data; the threshold is read from the training parameters during the transformation.
    }

    /** {@inheritDoc} */
    @Override
    protected void _transform(Dataframe newData) {
        TrainingParameters trainingParameters = knowledgeBase.getTrainingParameters();
        //the response variable is binarized only when this is requested and only when it is numerical
        boolean scaleResponse = trainingParameters.getScaleResponse() && newData.getYDataType() == TypeInference.DataType.NUMERICAL;
        double threshold = trainingParameters.getThreshold();
        Map<Object, TypeInference.DataType> columnTypes = newData.getXDataTypes();

        streamExecutor.forEach(StreamMethods.stream(newData.entries(), isParallelized()), e -> {
            Record r = e.getValue();
            //work on a copy of the x data; the updated copy is stored in a new Record below
            AssociativeArray xData = r.getX().copy();
            Object yData = r.getY();

            boolean modified = false;
            for(Map.Entry<Object, Object> entry : xData.entrySet()) {
                Object column = entry.getKey();

                //check the column type first so that the values of non-numerical columns are never converted to Double
                if(columnTypes.get(column)!=TypeInference.DataType.NUMERICAL) {
                    continue;
                }

                Double value = xData.getDouble(column);
                if(value == null) {
                    continue;
                }

                //the key already exists, so only the value stored under it is replaced
                xData.put(column, scale(value, threshold));
                modified = true;
            }

            if(scaleResponse && yData != null) {
                Double value = TypeInference.toDouble(yData);

                yData = scale(value, threshold);
                modified = true;
            }

            //records without any binarized value are left untouched
            if(modified) {
                Integer rId = e.getKey();
                Record newR = new Record(xData, yData, r.getYPredicted(), r.getYPredictedProbabilities());

                //we call below the recalculateMeta()
                newData._unsafe_set(rId, newR);
            }
        });

        //Reset Meta info
        newData.recalculateMeta();
    }

    /**
     * Performs the actual binarization of a single value.
     *
     * @param value the value to binarize; must not be null
     * @param threshold the cut-off value
     * @return true if the value is strictly greater than the threshold, false otherwise
     */
    private Boolean scale(Double value, double threshold) {
        return value>threshold;
    }
}
Expand Up @@ -167,7 +167,7 @@ protected void _transform(Dataframe newData) {
* @param maxAbsolute * @param maxAbsolute
* @return * @return
*/ */
private double scale(Double value, Double maxAbsolute) { private Double scale(Double value, Double maxAbsolute) {
if(maxAbsolute.equals(0.0)) { if(maxAbsolute.equals(0.0)) {
return Math.signum(value); return Math.signum(value);
} }
Expand Down
Expand Up @@ -198,7 +198,7 @@ protected void _transform(Dataframe newData) {
* @param max * @param max
* @return * @return
*/ */
private double scale(Double value, Double min, Double max) { private Double scale(Double value, Double min, Double max) {
if(min.equals(max)) { if(min.equals(max)) {
if(value>max) { if(value>max) {
return 1.0; return 1.0;
Expand Down
Expand Up @@ -198,7 +198,7 @@ protected void _transform(Dataframe newData) {
* @param std * @param std
* @return * @return
*/ */
private double scale(Double value, Double mean, Double std) { private Double scale(Double value, Double mean, Double std) {
if(std.equals(0.0)) { if(std.equals(0.0)) {
if(value > mean) { if(value > mean) {
return 1.0; return 1.0;
Expand Down

0 comments on commit a64e25a

Please sign in to comment.