From bf0e5aa7808ab819d7122e7b0bf0e421c08a6a82 Mon Sep 17 00:00:00 2001 From: Ilya Gyrdymov Date: Sun, 21 Jun 2020 11:30:07 +0300 Subject: [PATCH 1/6] Cross validator entity: evaluate method now returns Future instead of double --- ...id_test_data_columns_number_exception.dart | 10 +++ ...d_train_data_columns_number_exception.dart | 10 +++ .../cross_validator/cross_validator.dart | 10 ++- .../cross_validator/cross_validator_impl.dart | 80 +++++++++---------- .../cross_validator_impl_test.dart | 68 ++++++++-------- 5 files changed, 101 insertions(+), 77 deletions(-) create mode 100644 lib/src/common/exception/invalid_test_data_columns_number_exception.dart create mode 100644 lib/src/common/exception/invalid_train_data_columns_number_exception.dart diff --git a/lib/src/common/exception/invalid_test_data_columns_number_exception.dart b/lib/src/common/exception/invalid_test_data_columns_number_exception.dart new file mode 100644 index 00000000..7cbc8e6b --- /dev/null +++ b/lib/src/common/exception/invalid_test_data_columns_number_exception.dart @@ -0,0 +1,10 @@ +class InvalidTestDataColumnsNumberException implements Exception { + InvalidTestDataColumnsNumberException(int expected, int received) : + message = 'Unexpected columns number in test data, ' + 'expected $expected, received ${received}'; + + final String message; + + @override + String toString() => message; +} diff --git a/lib/src/common/exception/invalid_train_data_columns_number_exception.dart b/lib/src/common/exception/invalid_train_data_columns_number_exception.dart new file mode 100644 index 00000000..764f88df --- /dev/null +++ b/lib/src/common/exception/invalid_train_data_columns_number_exception.dart @@ -0,0 +1,10 @@ +class InvalidTrainDataColumnsNumberException implements Exception { + InvalidTrainDataColumnsNumberException(int expected, int received) : + message = 'Unexpected columns number in training data, ' + 'expected $expected, received ${received}'; + + final String message; + + @override + 
String toString() => message; +} diff --git a/lib/src/model_selection/cross_validator/cross_validator.dart b/lib/src/model_selection/cross_validator/cross_validator.dart index 1364ea9d..4f1c48ef 100644 --- a/lib/src/model_selection/cross_validator/cross_validator.dart +++ b/lib/src/model_selection/cross_validator/cross_validator.dart @@ -134,7 +134,11 @@ abstract class CrossValidator { /// onDataSplit: onDataSplit, /// ); /// ```` - double evaluate(PredictorFactory predictorFactory, MetricType metricType, { - DataPreprocessFn onDataSplit, - }); + Future evaluate( + PredictorFactory predictorFactory, + MetricType metricType, + { + DataPreprocessFn onDataSplit, + } + ); } diff --git a/lib/src/model_selection/cross_validator/cross_validator_impl.dart b/lib/src/model_selection/cross_validator/cross_validator_impl.dart index 23c609f1..f3f85b7f 100644 --- a/lib/src/model_selection/cross_validator/cross_validator_impl.dart +++ b/lib/src/model_selection/cross_validator/cross_validator_impl.dart @@ -1,3 +1,5 @@ +import 'package:ml_algo/src/common/exception/invalid_test_data_columns_number_exception.dart'; +import 'package:ml_algo/src/common/exception/invalid_train_data_columns_number_exception.dart'; import 'package:ml_algo/src/metric/metric_type.dart'; import 'package:ml_algo/src/model_selection/cross_validator/cross_validator.dart'; import 'package:ml_algo/src/model_selection/data_splitter/data_splitter.dart'; @@ -21,54 +23,48 @@ class CrossValidatorImpl implements CrossValidator { final DataSplitter _splitter; @override - double evaluate(PredictorFactory predictorFactory, MetricType metricType, { - DataPreprocessFn onDataSplit, - }) { + Future evaluate( + PredictorFactory predictorFactory, + MetricType metricType, + { + DataPreprocessFn onDataSplit, + } + ) { final samplesAsMatrix = samples.toMatrix(dtype); final sourceColumnsNum = samplesAsMatrix.columnsNum; - final discreteColumns = enumerate(samples.series) .where((indexedSeries) => indexedSeries.value.isDiscrete) 
.map((indexedSeries) => indexedSeries.index); - final allIndicesGroups = _splitter.split(samplesAsMatrix.rowsNum); - var score = 0.0; - var folds = 0; - - for (final testRowsIndices in allIndicesGroups) { - final split = _makeSplit(testRowsIndices, discreteColumns); - final trainDataFrame = split[0]; - final testDataFrame = split[1]; - - final splits = onDataSplit != null - ? onDataSplit(trainDataFrame, testDataFrame) - : [trainDataFrame, testDataFrame]; - - final transformedTrainData = splits[0]; - final transformedTestData = splits[1]; - - final transformedTrainDataColumnsNum = transformedTrainData.header.length; - final transformedTestDataColumnsNum = transformedTestData.header.length; - - if (transformedTrainDataColumnsNum != sourceColumnsNum) { - throw Exception('Unexpected columns number in training data: ' - 'expected $sourceColumnsNum, received ' - '${transformedTrainDataColumnsNum}'); - } - - if (transformedTestDataColumnsNum != sourceColumnsNum) { - throw Exception('Unexpected columns number in testing data: ' - 'expected $sourceColumnsNum, received ' - '${transformedTestDataColumnsNum}'); - } - - score += predictorFactory(transformedTrainData, targetNames) - .assess(transformedTestData, targetNames, metricType); - - folds++; - } - - return score / folds; + final scores = allIndicesGroups + .map((testRowsIndices) { + final split = _makeSplit(testRowsIndices, discreteColumns); + final trainDataFrame = split[0]; + final testDataFrame = split[1]; + final splits = onDataSplit != null + ? 
onDataSplit(trainDataFrame, testDataFrame) + : [trainDataFrame, testDataFrame]; + final transformedTrainData = splits[0]; + final transformedTestData = splits[1]; + final transformedTrainDataColumnsNum = transformedTrainData.header.length; + final transformedTestDataColumnsNum = transformedTestData.header.length; + + if (transformedTrainDataColumnsNum != sourceColumnsNum) { + throw InvalidTrainDataColumnsNumberException(sourceColumnsNum, + transformedTrainDataColumnsNum); + } + + if (transformedTestDataColumnsNum != sourceColumnsNum) { + throw InvalidTestDataColumnsNumberException(sourceColumnsNum, + transformedTestDataColumnsNum); + } + + return predictorFactory(transformedTrainData, targetNames) + .assess(transformedTestData, targetNames, metricType); + }) + .toList(); + + return Future.value(Vector.fromList(scores, dtype: dtype)); } List _makeSplit(Iterable testRowsIndices, diff --git a/test/model_selection/cross_validator/cross_validator_impl_test.dart b/test/model_selection/cross_validator/cross_validator_impl_test.dart index c04d1321..573ce7f1 100644 --- a/test/model_selection/cross_validator/cross_validator_impl_test.dart +++ b/test/model_selection/cross_validator/cross_validator_impl_test.dart @@ -1,3 +1,5 @@ +import 'package:ml_algo/src/common/exception/invalid_test_data_columns_number_exception.dart'; +import 'package:ml_algo/src/common/exception/invalid_train_data_columns_number_exception.dart'; import 'package:ml_algo/src/metric/metric_type.dart'; import 'package:ml_algo/src/model_selection/cross_validator/cross_validator_impl.dart'; import 'package:ml_algo/src/model_selection/data_splitter/data_splitter.dart'; @@ -16,7 +18,8 @@ DataSplitter createSplitter(Iterable> indices) { void main() { group('CrossValidatorImpl', () { - test('should perform validation of a predictor on given test splits', () { + test('should evaluate performance of a predictor on given test ' + 'splits', () async { final allObservations = DataFrame(>[ [330, 930, 130, 100], [630, 
830, 230, 200], @@ -28,21 +31,18 @@ void main() { [430, 230, 830, 800], [530, 130, 930, 900], ], header: ['first', 'second', 'third', 'target'], headerExists: false); - final metric = MetricType.mape; final splitter = createSplitter([[0,2,4],[6, 8]]); final predictor = AssessableMock(); final validator = CrossValidatorImpl(allObservations, ['target'], splitter, DType.float32); + final score = 20.0; + when(predictor.assess(any, any, any)).thenReturn(score); - var score = 20.0; - when(predictor.assess(any, any, any)) - .thenAnswer((Invocation inv) => score = score + 10); - - final actual = validator + final actual = await validator .evaluate((observations, outcomes) => predictor, metric); - expect(actual, 35); + expect(actual, [20, 20]); final verificationResult = verify( predictor.assess( @@ -66,8 +66,10 @@ void main() { verificationResult.called(2); }); - test('should take the first element as train samples from data ' - 'preprocessing callback response while evaluating a predictor', () { + test('should treat the first element of the returning array from data ' + 'preprocessing callback response as train samples while evaluating a ' + 'predictor', () async { + // we don't care about data here cause it will be mocked farther final allObservations = DataFrame( [[1, 1, 1, 1]], @@ -110,7 +112,7 @@ void main() { ], }; - validator.evaluate( + await validator.evaluate( (observations, outcomes) { expect( observations.toMatrix(), @@ -124,8 +126,10 @@ void main() { ); }); - test('should take the second element as test samples from data ' - 'preprocessing callback response while evaluating a predictor', () { + test('should treat the second element of the returning array from data ' + 'preprocessing callback response as test samples while evaluating a ' + 'predictor', () async { + // we don't care about data here cause it will be mocked farther final allObservations = DataFrame( [[1, 1, 1, 1]], @@ -167,7 +171,7 @@ void main() { ], }; - validator.evaluate( + await 
validator.evaluate( (observations, outcomes) => predictor, metric, onDataSplit: (trainData, testData) => @@ -200,7 +204,7 @@ void main() { verificationResult.called(3); }); - test('should pass splits into data preprocessing callback', () { + test('should pass splits into data preprocessing callback', () async { final header = ['first', 'second', 'third', 'target']; // we don't care about data here cause it will be mocked farther @@ -280,7 +284,7 @@ void main() { }, }; - validator.evaluate( + await validator.evaluate( (observations, outcomes) => predictor, metric, onDataSplit: (trainData, testData) { @@ -300,9 +304,9 @@ void main() { ); }); - test('should throw an exception if one tries to return a training data ' - 'from data perprocessing callback with number of columns less than ' - 'the number of columns of original data', () { + test('should throw an exception if one tries to return the train data ' + 'from the data perprocessing callback with the number of columns less ' + 'than the number of columns of the original data', () async { final header = ['first', 'second', 'third', 'target']; // we don't care about data here cause it will be mocked farther @@ -341,12 +345,12 @@ void main() { ], ); - expect(actual, throwsException); + expect(actual, throwsA(isA())); }); - test('should throw an exception if one tries to return a training data ' - 'from data perprocessing callback with number of columns greater than ' - 'the number of columns of original data', () { + test('should throw an exception if one tries to return the train data ' + 'from the data perprocessing callback with the number of columns ' + 'greater than the number of columns of the original data', () { final header = ['first', 'second', 'third', 'target']; // we don't care about data here cause it will be mocked farther @@ -384,12 +388,12 @@ void main() { ], ); - expect(actual, throwsException); + expect(actual, throwsA(isA())); }); - test('should throw an exception if one tries to return a testing 
data ' - 'from data perprocessing callback with number of columns less than ' - 'the number of columns of original data', () { + test('should throw an exception if one tries to return the test data ' + 'from the data perprocessing callback with the number of columns less ' + 'than the number of columns of the original data', () { final header = ['first', 'second', 'third', 'target']; // we don't care about data here cause it will be mocked farther @@ -427,12 +431,12 @@ void main() { ], ); - expect(actual, throwsException); + expect(actual, throwsA(isA())); }); - test('should throw an exception if one tries to return a testing data ' - 'from data perprocessing callback with number of columns greater than ' - 'the number of columns of original data', () { + test('should throw an exception if one tries to return the test data ' + 'from the data perprocessing callback with the number of columns ' + 'greater than the number of columns of the original data', () { final header = ['first', 'second', 'third', 'target']; // we don't care about data here cause it will be mocked farther @@ -470,7 +474,7 @@ void main() { ], ); - expect(actual, throwsException); + expect(actual, throwsA(isA())); }); }); } From 5e06b60a260b9bbae8458ef4243b0527657415a1 Mon Sep 17 00:00:00 2001 From: Ilya Gyrdymov Date: Sun, 21 Jun 2020 13:05:53 +0300 Subject: [PATCH 2/6] Cross validator entity: evaluate method now returns Future instead of double --- CHANGELOG.md | 4 +++ .../cross_validator/cross_validator.dart | 26 +++++++++---------- pubspec.yaml | 2 +- 3 files changed, 18 insertions(+), 14 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3186330a..bd9d7690 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,9 @@ # Changelog +## 14.0.0 +- Breaking change: + - `CrossValidator`: `evalute` method's api changed, it returns a Future resolving with scores Vector now + ## 13.10.0 - `LinearRegressor`: - `Default constructor`: `collectLearningData` parameter added diff --git 
a/lib/src/model_selection/cross_validator/cross_validator.dart b/lib/src/model_selection/cross_validator/cross_validator.dart index 4f1c48ef..fe5503af 100644 --- a/lib/src/model_selection/cross_validator/cross_validator.dart +++ b/lib/src/model_selection/cross_validator/cross_validator.dart @@ -23,8 +23,8 @@ abstract class CrossValidator { /// /// Parameters: /// - /// [samples] The whole training dataset to be split into parts to iteratively - /// evaluate given predictor on the each particular part + /// [samples] A dataset to be split into parts to iteratively evaluate given + /// predictor's performance /// /// [targetColumnNames] Names of columns from [samples] that contain outcomes /// @@ -57,8 +57,8 @@ abstract class CrossValidator { /// /// Parameters: /// - /// [samples] The whole training dataset to be split into parts to iteratively - /// evaluate given model on the each particular part. + /// [samples] A dataset to be split into parts to iteratively + /// evaluate given predictor's performance /// /// [targetColumnNames] Names of columns from [samples] that contain outcomes. /// @@ -83,21 +83,21 @@ abstract class CrossValidator { ); } - /// Returns a score of quality of passed predictor depending on given - /// [metricType] + /// Returns a future resolving with a vector of scores of quality of passed + /// predictor depending on given [metricType] /// /// Parameters: /// - /// [predictorFactory] A factory function that returns a testing predictor + /// [predictorFactory] A factory function that returns an evaluating predictor /// - /// [metricType] Metric to assess a predictor, that is being created by + /// [metricType] Metric using to assess a predictor creating by /// [predictorFactory] /// /// [onDataSplit] A callback that is called when a new train-test split is /// ready to be passed into evaluating predictor. One may place some /// additional data-dependent logic here, e.g., data preprocessing. 
The /// callback accepts train and test data from a new split and returns - /// transformed split as list, where the first element is training data and + /// transformed split as list, where the first element is train data and /// the second one - test data, both of [DataFrame] type. This new transformed /// split will be passed into the predictor. /// @@ -115,10 +115,8 @@ abstract class CrossValidator { /// header: header, /// headerExists: false, /// ); - /// /// final predictorFactory = (trainData, _) => /// KnnRegressor(trainData, 'col_3', k: 4); - /// /// final onDataSplit = (trainData, testData) { /// final standardizer = Standardizer(trainData); /// return [ @@ -126,13 +124,15 @@ abstract class CrossValidator { /// standardizer.process(testData), /// ]; /// } - /// /// final validator = CrossValidator.kFold(data, ['col_3']); - /// final score = validator.evaluate( + /// final scores = await validator.evaluate( /// predictorFactory, /// MetricType.mape, /// onDataSplit: onDataSplit, /// ); + /// final averageScore = scores.mean(); + /// + /// print(averageScore); /// ```` Future evaluate( PredictorFactory predictorFactory, diff --git a/pubspec.yaml b/pubspec.yaml index 87a380b0..28b6eb34 100644 --- a/pubspec.yaml +++ b/pubspec.yaml @@ -1,6 +1,6 @@ name: ml_algo description: Machine learning algorithms written in native dart -version: 13.10.0 +version: 14.0.0 homepage: https://github.com/gyrdym/ml_algo environment: From e5b9dba491042c3aff0c129bb46f170c4e974382 Mon Sep 17 00:00:00 2001 From: Ilya Gyrdymov Date: Sun, 21 Jun 2020 14:51:26 +0300 Subject: [PATCH 3/6] Cross validator entity: evaluate method now returns Future instead of double --- README.md | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 34cdfe3e..27ef2bdb 100644 --- a/README.md +++ b/README.md @@ -98,7 +98,7 @@ All are set, so we can do our classification. 
Evaluate our model via accuracy metric: ````dart -final accuracy = validator.evaluate((samples, targetNames) => +final scores = await validator.evaluate((samples, targetNames) => LogisticRegressor( samples, targetNames[0], // remember, we provided a list of just a single name @@ -112,6 +112,13 @@ final accuracy = validator.evaluate((samples, targetNames) => ), MetricType.accuracy); ```` +Since the CrossValidator's instance returns a Vector of scores as a result of our predictor evaluation, we may choose +any way to reduce all the collected scores to a single number, for instance we may use Vector's `mean` method: + +```dart +final accuracy = scores.mean(); +``` + Let's print the score: ````dart print('accuracy on classification: ${accuracy.toStringAsFixed(2)}'); @@ -134,7 +141,7 @@ Future main() async { final samples = await fromCsv('datasets/pima_indians_diabetes_database.csv', headerExists: true); final targetColumnName = 'class variable (0 or 1)'; final validator = CrossValidator.KFold(samples, [targetColumnName], numberOfFolds: 5); - final accuracy = validator.evaluate((samples, targetNames) => + final scores = await validator.evaluate((samples, targetNames) => LogisticRegressor( samples, targetNames[0], // remember, we provide a list of just a single name @@ -146,6 +153,7 @@ Future main() async { interceptScale: .1, learningRateType: LearningRateType.constant ), MetricType.accuracy); + final accuracy = scores.mean(); print('accuracy on classification: ${accuracy.toStringFixed(2)}'); } @@ -202,14 +210,15 @@ Let the `k` parameter be equal to `4`. 
Assess a knn regressor with the chosen `k` value using MAPE metric ````dart -final error = validator.evaluate((samples, targetNames) => +final scores = await validator.evaluate((samples, targetNames) => KnnRegressor(samples, targetNames[0], 4), MetricType.mape); +final averageError = scores.mean(); ```` Let's print our error ````dart -print('MAPE error on k-fold validation: ${error.toStringAsFixed(2)}%'); // it yields approx. 6.18 +print('MAPE error on k-fold validation: ${averageError.toStringAsFixed(2)}%'); // it yields approx. 6.18 ```` ### Contacts From 5826a624f6c15479cfec7ca11ca39a3dbb5f0ae7 Mon Sep 17 00:00:00 2001 From: Ilya Gyrdymov Date: Sun, 21 Jun 2020 14:55:50 +0300 Subject: [PATCH 4/6] Cross validator entity: evaluate method now returns Future instead of double --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 27ef2bdb..892ad8f5 100644 --- a/README.md +++ b/README.md @@ -86,7 +86,7 @@ final targetColumnName = 'class variable (0 or 1)'; ```` Then we should create an instance of `CrossValidator` class to fit [hyperparameters](https://en.wikipedia.org/wiki/Hyperparameter_(machine_learning)) -our model. We should pass training data (our `samples` variable), a list of target column names (in our case it's +of our model. We should pass training data (our `samples` variable), a list of target column names (in our case it's just a name stored in `targetColumnName` variable) and a number of folds into CrossValidator constructor. 
````dart From 3c1e3cce883e9b232eb2c8a46edba12b59018d3c Mon Sep 17 00:00:00 2001 From: Ilya Gyrdymov Date: Sun, 21 Jun 2020 15:26:32 +0300 Subject: [PATCH 5/6] Cross validator entity: evaluate method now returns Future instead of double --- README.md | 24 +++++++++--------------- 1 file changed, 9 insertions(+), 15 deletions(-) diff --git a/README.md b/README.md index 892ad8f5..3525ad3e 100644 --- a/README.md +++ b/README.md @@ -102,13 +102,10 @@ final scores = await validator.evaluate((samples, targetNames) => LogisticRegressor( samples, targetNames[0], // remember, we provided a list of just a single name - optimizerType: LinearOptimizerType.gradient, - initialLearningRate: .8, - iterationsLimit: 500, - batchSize: samples.rows.length, - fitIntercept: true, - interceptScale: .1, - learningRateType: LearningRateType.constant + optimizerType: LinearOptimizerType.gradient, + learningRateType: LearningRateType.decreasingAdaptive, + probabilityThreshold: 0.7, + randomSeed: 3, ), MetricType.accuracy); ```` @@ -127,7 +124,7 @@ print('accuracy on classification: ${accuracy.toStringAsFixed(2)}'); We will see something like this: ```` -acuracy on classification: 0.77 +accuracy on classification: 0.65 ```` All the code above all together: @@ -145,13 +142,10 @@ Future main() async { LogisticRegressor( samples, targetNames[0], // remember, we provide a list of just a single name - optimizerType: LinearOptimizerType.gradient, - initialLearningRate: .8, - iterationsLimit: 500, - batchSize: 768, - fitIntercept: true, - interceptScale: .1, - learningRateType: LearningRateType.constant + optimizerType: LinearOptimizerType.gradient, + learningRateType: LearningRateType.decreasingAdaptive, + probabilityThreshold: 0.7, + randomSeed: 3, ), MetricType.accuracy); final accuracy = scores.mean(); From 73789288e3962cf85874dc019ee6b1f383b91e2b Mon Sep 17 00:00:00 2001 From: Ilya Gyrdymov Date: Sun, 21 Jun 2020 19:15:51 +0300 Subject: [PATCH 6/6] Cross validator entity: evaluate method now
returns Future instead of double --- CHANGELOG.md | 3 ++- .../exception/invalid_train_data_columns_number_exception.dart | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index bd9d7690..1436c70d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,7 +2,8 @@ ## 14.0.0 - Breaking change: - - `CrossValidator`: `evalute` method's api changed, it returns a Future resolving with scores Vector now + - `CrossValidator`: `evaluate` method's api changed, it returns a Future resolving with scores Vector now instead + of a double value ## 13.10.0 - `LinearRegressor`: diff --git a/lib/src/common/exception/invalid_train_data_columns_number_exception.dart b/lib/src/common/exception/invalid_train_data_columns_number_exception.dart index 764f88df..041c2c9b 100644 --- a/lib/src/common/exception/invalid_train_data_columns_number_exception.dart +++ b/lib/src/common/exception/invalid_train_data_columns_number_exception.dart @@ -1,6 +1,6 @@ class InvalidTrainDataColumnsNumberException implements Exception { InvalidTrainDataColumnsNumberException(int expected, int received) : - message = 'Unexpected columns number in training data, ' + message = 'Unexpected columns number in train data, ' 'expected $expected, received ${received}'; final String message;