From a64e25a883e433d20a9563a96d220234ba1f7c73 Mon Sep 17 00:00:00 2001 From: Vasilis Vryniotis Date: Tue, 27 Dec 2016 01:41:00 +0000 Subject: [PATCH] Adding a BinaryScaler. --- CHANGELOG.md | 2 +- TODO.txt | 2 +- .../preprocessing/BinaryScaler.java | 153 ++++++++++++++++++ .../preprocessing/MaxAbsScaler.java | 2 +- .../preprocessing/MinMaxScaler.java | 2 +- .../preprocessing/StandardScaler.java | 2 +- 6 files changed, 158 insertions(+), 5 deletions(-) create mode 100644 datumbox-framework-core/src/main/java/com/datumbox/framework/core/machinelearning/preprocessing/BinaryScaler.java diff --git a/CHANGELOG.md b/CHANGELOG.md index f7b6f549..e9cd1285 100755 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -60,7 +60,7 @@ Version 0.8.0-SNAPSHOT - Build 20161227 - The MapRealMatrix instances don't initialize their own storage engines any more. Instead they use a single engine stored in MatrixDataframe. - The recommendersystem package is renamed to recommendation. - The old datatransformation package is replaced with the preprocessing package which decouples Numerical Scaling from Categorical variable encoding. - - Added the following preprocessing algorithms: OneHotEncoder, StandardScaler, MaxAbsScaler. + - Added the following preprocessing algorithms: OneHotEncoder, StandardScaler, MaxAbsScaler, BinaryScaler. - Added minAbsolute and maxAbsolute methods in Descriptives. Version 0.7.0 - Build 20160319 diff --git a/TODO.txt b/TODO.txt index 9c7cecfd..41936eff 100755 --- a/TODO.txt +++ b/TODO.txt @@ -18,7 +18,7 @@ CODE IMPROVEMENTS NEW FEATURES ============ -- Create the following Numerical Scalers: PercentileScaler and Binarization. +- Create the following Numerical Scalers: PercentileScaler. - Create a storage engine for MapDB 3 once caching & asynchronous writing is supported. Remove the HOTFIX for MapDB bug #664. - Create a storage engine for BerkeleyDB. 
diff --git a/datumbox-framework-core/src/main/java/com/datumbox/framework/core/machinelearning/preprocessing/BinaryScaler.java b/datumbox-framework-core/src/main/java/com/datumbox/framework/core/machinelearning/preprocessing/BinaryScaler.java new file mode 100644 index 00000000..c548d8a2 --- /dev/null +++ b/datumbox-framework-core/src/main/java/com/datumbox/framework/core/machinelearning/preprocessing/BinaryScaler.java @@ -0,0 +1,153 @@ +/** + * Copyright (C) 2013-2016 Vasilis Vryniotis + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.datumbox.framework.core.machinelearning.preprocessing; + +import com.datumbox.framework.common.Configuration; +import com.datumbox.framework.common.concurrency.StreamMethods; +import com.datumbox.framework.common.dataobjects.*; +import com.datumbox.framework.common.storageengines.interfaces.StorageEngine; +import com.datumbox.framework.core.machinelearning.common.abstracts.AbstractTrainer; +import com.datumbox.framework.core.machinelearning.common.abstracts.transformers.AbstractNumericalScaler; + +import java.util.Map; + +/** + * Binarizes the numerical features of the dataset: values greater than the threshold become true and all other values become false. 
+ * + * @author Vasilis Vryniotis + */ +public class BinaryScaler extends AbstractNumericalScaler { + + /** {@inheritDoc} */ + public static class ModelParameters extends AbstractNumericalScaler.AbstractModelParameters { + private static final long serialVersionUID = 1L; + + /** + * @param storageEngine + * @see AbstractTrainer.AbstractModelParameters#AbstractModelParameters(StorageEngine) + */ + protected ModelParameters(StorageEngine storageEngine) { + super(storageEngine); + } + + } + + /** {@inheritDoc} */ + public static class TrainingParameters extends AbstractNumericalScaler.AbstractTrainingParameters { + private static final long serialVersionUID = 1L; + + private double threshold = 0.0; + + /** + * Getter for the threshold. + * + * @return + */ + public double getThreshold() { + return threshold; + } + + /** + * Setter for the threshold. Values less than or equal to the threshold are turned into false and those greater + * are turned into true. + * + * @param threshold + */ + public void setThreshold(double threshold) { + this.threshold = threshold; + } + } + + /** + * @param trainingParameters + * @param configuration + * @see AbstractTrainer#AbstractTrainer(AbstractTrainer.AbstractTrainingParameters, Configuration) + */ + protected BinaryScaler(TrainingParameters trainingParameters, Configuration configuration) { + super(trainingParameters, configuration); + } + + /** + * @param storageName + * @param configuration + * @see AbstractTrainer#AbstractTrainer(String, Configuration) + */ + protected BinaryScaler(String storageName, Configuration configuration) { + super(storageName, configuration); + } + + /** {@inheritDoc} */ + @Override + protected void _fit(Dataframe trainingData) { + + } + + /** {@inheritDoc} */ + @Override + protected void _transform(Dataframe newData) { + TrainingParameters trainingParameters = knowledgeBase.getTrainingParameters(); + boolean scaleResponse = trainingParameters.getScaleResponse() && newData.getYDataType() == 
TypeInference.DataType.NUMERICAL; + double threshold = trainingParameters.getThreshold(); + Map columnTypes = newData.getXDataTypes(); + + streamExecutor.forEach(StreamMethods.stream(newData.entries(), isParallelized()), e -> { + Record r = e.getValue(); + AssociativeArray xData = r.getX().copy(); + Object yData = r.getY(); + + boolean modified = false; + for(Map.Entry entry : xData.entrySet()) { + Object column = entry.getKey(); + Double value = xData.getDouble(column); + if(value == null || columnTypes.get(column)!=TypeInference.DataType.NUMERICAL) { + continue; + } + + xData.put(column, scale(value, threshold)); + modified = true; + } + + if(scaleResponse && yData != null) { + Double value = TypeInference.toDouble(yData); + + yData = scale(value, threshold); + modified = true; + } + + if(modified) { + Integer rId = e.getKey(); + Record newR = new Record(xData, yData, r.getYPredicted(), r.getYPredictedProbabilities()); + + //we call below the recalculateMeta() + newData._unsafe_set(rId, newR); + } + }); + + //Reset Meta info + newData.recalculateMeta(); + } + + /** + * Performs the actual binarization: returns true when the value is greater than the threshold and false otherwise. 
+ * + * @param value + * @param threshold + * @return + */ + private Boolean scale(Double value, double threshold) { + return value>threshold; + } +} diff --git a/datumbox-framework-core/src/main/java/com/datumbox/framework/core/machinelearning/preprocessing/MaxAbsScaler.java b/datumbox-framework-core/src/main/java/com/datumbox/framework/core/machinelearning/preprocessing/MaxAbsScaler.java index 676b6b20..9c189666 100644 --- a/datumbox-framework-core/src/main/java/com/datumbox/framework/core/machinelearning/preprocessing/MaxAbsScaler.java +++ b/datumbox-framework-core/src/main/java/com/datumbox/framework/core/machinelearning/preprocessing/MaxAbsScaler.java @@ -167,7 +167,7 @@ protected void _transform(Dataframe newData) { * @param maxAbsolute * @return */ - private double scale(Double value, Double maxAbsolute) { + private Double scale(Double value, Double maxAbsolute) { if(maxAbsolute.equals(0.0)) { return Math.signum(value); } diff --git a/datumbox-framework-core/src/main/java/com/datumbox/framework/core/machinelearning/preprocessing/MinMaxScaler.java b/datumbox-framework-core/src/main/java/com/datumbox/framework/core/machinelearning/preprocessing/MinMaxScaler.java index eaf5d25b..89014e48 100644 --- a/datumbox-framework-core/src/main/java/com/datumbox/framework/core/machinelearning/preprocessing/MinMaxScaler.java +++ b/datumbox-framework-core/src/main/java/com/datumbox/framework/core/machinelearning/preprocessing/MinMaxScaler.java @@ -198,7 +198,7 @@ protected void _transform(Dataframe newData) { * @param max * @return */ - private double scale(Double value, Double min, Double max) { + private Double scale(Double value, Double min, Double max) { if(min.equals(max)) { if(value>max) { return 1.0; diff --git a/datumbox-framework-core/src/main/java/com/datumbox/framework/core/machinelearning/preprocessing/StandardScaler.java b/datumbox-framework-core/src/main/java/com/datumbox/framework/core/machinelearning/preprocessing/StandardScaler.java index 5cd55fa7..abb7a0e2 
100644 --- a/datumbox-framework-core/src/main/java/com/datumbox/framework/core/machinelearning/preprocessing/StandardScaler.java +++ b/datumbox-framework-core/src/main/java/com/datumbox/framework/core/machinelearning/preprocessing/StandardScaler.java @@ -198,7 +198,7 @@ protected void _transform(Dataframe newData) { * @param std * @return */ - private double scale(Double value, Double mean, Double std) { + private Double scale(Double value, Double mean, Double std) { if(std.equals(0.0)) { if(value > mean) { return 1.0;