From 8c96da7abb8e16f2f73c4ba2b83c402c3bd5449c Mon Sep 17 00:00:00 2001 From: Senja Filipi Date: Thu, 27 Dec 2018 22:31:23 -0800 Subject: [PATCH] making GetCoefficientStatistics public --- .../Standard/LinearModelParameters.cs | 9 +- .../MulticlassLogisticRegression.cs | 2 +- .../Standard/ModelStatistics.cs | 88 +++++++++---------- .../TrainerEstimators/LbfgsTests.cs | 19 +++- 4 files changed, 63 insertions(+), 55 deletions(-) diff --git a/src/Microsoft.ML.StandardLearners/Standard/LinearModelParameters.cs b/src/Microsoft.ML.StandardLearners/Standard/LinearModelParameters.cs index 8bf4438ad6..1fc4c16eb4 100644 --- a/src/Microsoft.ML.StandardLearners/Standard/LinearModelParameters.cs +++ b/src/Microsoft.ML.StandardLearners/Standard/LinearModelParameters.cs @@ -84,10 +84,7 @@ public IEnumerator GetEnumerator() return _pred.Weight.Items(all: true).Select(iv => iv.Value).GetEnumerator(); } - IEnumerator IEnumerable.GetEnumerator() - { - return GetEnumerator(); - } + IEnumerator IEnumerable.GetEnumerator() => GetEnumerator(); } /// The predictor's feature weight coefficients. @@ -505,7 +502,7 @@ private protected override void SaveSummary(TextWriter writer, RoleMappedSchema writer.WriteLine(LinearPredictorUtils.LinearModelAsText("Linear Binary Classification Predictor", null, null, in weights, Bias, schema)); - _stats?.SaveText(writer, this, schema, 20); + _stats?.SaveText(writer, this, schema.Feature.Value, 20); } /// @@ -516,7 +513,7 @@ IList> ICanGetSummaryInKeyValuePairs.GetSummaryInKe var weights = Weight; List> results = new List>(); LinearPredictorUtils.SaveLinearModelWeightsInKeyValuePairs(in weights, Bias, schema, results); - _stats?.SaveSummaryInKeyValuePairs(this, schema, int.MaxValue, results); + _stats?.SaveSummaryInKeyValuePairs(this, schema.Feature.Value, int.MaxValue, results); return results; } diff --git a/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/MulticlassLogisticRegression.cs b/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/MulticlassLogisticRegression.cs index 5d524f5393..b7c08b2370 100644 --- a/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/MulticlassLogisticRegression.cs +++ b/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/MulticlassLogisticRegression.cs @@ -783,7 +783,7 @@ void ICanSaveInTextFormat.SaveAsText(TextWriter writer, RoleMappedSchema schema) } if (_stats != null) - _stats.SaveText(writer, null, schema, 20); + _stats.SaveText(writer, null, schema.Feature.Value, 20); } /// diff --git a/src/Microsoft.ML.StandardLearners/Standard/ModelStatistics.cs b/src/Microsoft.ML.StandardLearners/Standard/ModelStatistics.cs index 09b6974b01..905dfd17ba 100644 --- a/src/Microsoft.ML.StandardLearners/Standard/ModelStatistics.cs +++ b/src/Microsoft.ML.StandardLearners/Standard/ModelStatistics.cs @@ -27,12 +27,12 @@ namespace Microsoft.ML.Learners public readonly struct CoefficientStatistics { public readonly string Name; - public readonly Single Estimate; - public readonly Single StandardError; - public readonly Single ZScore; - public readonly Single PValue; + public readonly float Estimate; + public readonly float StandardError; + public readonly float ZScore; + public readonly float PValue; - public CoefficientStatistics(string name, Single estimate, Single stdError, Single zScore, Single pValue) + public CoefficientStatistics(string name, float estimate, float stdError, float zScore, float pValue) { Contracts.AssertNonEmpty(name); Name = name; @@ -69,10 +69,10 @@ private static VersionInfo GetVersionInfo() private readonly long _trainingExampleCount; // The deviance of this model. - private readonly Single _deviance; + private readonly float _deviance; // The deviance of the null hypothesis. - private readonly Single _nullDeviance; + private readonly float _nullDeviance; // Total count of parameters. private readonly int _paramCount; @@ -82,17 +82,17 @@ private static VersionInfo GetVersionInfo() // It could be null when there are too many non-zero weights so that // the memory is insufficient to hold the Hessian matrix necessary for the computation // of the variance-covariance matrix. - private readonly VBuffer? _coeffStdError; + private readonly VBuffer? _coeffStdError; public long TrainingExampleCount => _trainingExampleCount; - public Single Deviance => _deviance; + public float Deviance => _deviance; - public Single NullDeviance => _nullDeviance; + public float NullDeviance => _nullDeviance; public int ParametersCount => _paramCount; - internal LinearModelStatistics(IHostEnvironment env, long trainingExampleCount, int paramCount, Single deviance, Single nullDeviance) + internal LinearModelStatistics(IHostEnvironment env, long trainingExampleCount, int paramCount, float deviance, float nullDeviance) { Contracts.AssertValue(env); env.Assert(trainingExampleCount > 0); @@ -104,7 +104,7 @@ internal LinearModelStatistics(IHostEnvironment env, long trainingExampleCount, _nullDeviance = nullDeviance; } - internal LinearModelStatistics(IHostEnvironment env, long trainingExampleCount, int paramCount, Single deviance, Single nullDeviance, in VBuffer coeffStdError) + internal LinearModelStatistics(IHostEnvironment env, long trainingExampleCount, int paramCount, float deviance, float nullDeviance, in VBuffer coeffStdError) : this(env, trainingExampleCount, paramCount, deviance, nullDeviance) { _env.Assert(coeffStdError.GetValues().Length == _paramCount); @@ -120,10 +120,10 @@ internal LinearModelStatistics(IHostEnvironment env, ModelLoadContext ctx) // *** Binary Format *** // int: count of parameters // long: count of training examples - // Single: deviance - // Single: null deviance + // float: deviance + // float: null deviance // bool: whether standard error is included - // (Conditional) Single[_paramCount]: values of std errors of coefficients + // (Conditional) float[_paramCount]: values of std errors of coefficients // (Conditional) int: length of std errors of coefficients // (Conditional) int[_paramCount]: indices of std errors of coefficients @@ -143,18 +143,18 @@ internal LinearModelStatistics(IHostEnvironment env, ModelLoadContext ctx) return; } - Single[] stdErrorValues = ctx.Reader.ReadFloatArray(_paramCount); + float[] stdErrorValues = ctx.Reader.ReadFloatArray(_paramCount); int length = ctx.Reader.ReadInt32(); _env.CheckDecode(length >= _paramCount); if (length == _paramCount) { - _coeffStdError = new VBuffer(length, stdErrorValues); + _coeffStdError = new VBuffer(length, stdErrorValues); return; } _env.Assert(length > _paramCount); int[] stdErrorIndices = ctx.Reader.ReadIntArray(_paramCount); - _coeffStdError = new VBuffer(length, _paramCount, stdErrorValues, stdErrorIndices); + _coeffStdError = new VBuffer(length, _paramCount, stdErrorValues, stdErrorIndices); } internal static LinearModelStatistics Create(IHostEnvironment env, ModelLoadContext ctx) @@ -178,10 +178,10 @@ private void SaveCore(ModelSaveContext ctx) // *** Binary Format *** // int: count of parameters // long: count of training examples - // Single: deviance - // Single: null deviance + // float: deviance + // float: null deviance // bool: whether standard error is included - // (Conditional) Single[_paramCount]: values of std errors of coefficients + // (Conditional) float[_paramCount]: values of std errors of coefficients // (Conditional) int: length of std errors of coefficients // (Conditional) int[_paramCount]: indices of std errors of coefficients @@ -212,7 +212,7 @@ private void SaveCore(ModelSaveContext ctx) /// /// Computes the standart deviation, Z-Score and p-Value. /// - public static bool TryGetBiasStatistics(LinearModelStatistics stats, Single bias, out Single stdError, out Single zScore, out Single pValue) + public static bool TryGetBiasStatistics(LinearModelStatistics stats, float bias, out float stdError, out float zScore, out float pValue) { if (!stats._coeffStdError.HasValue) { @@ -226,12 +226,12 @@ public static bool TryGetBiasStatistics(LinearModelStatistics stats, Single bias stdError = stats._coeffStdError.Value.GetValues()[0]; Contracts.Assert(stdError == stats._coeffStdError.Value.GetItemOrDefault(0)); zScore = bias / stdError; - pValue = 1.0f - (Single)ProbabilityFunctions.Erf(Math.Abs(zScore / sqrt2)); + pValue = 1.0f - (float)ProbabilityFunctions.Erf(Math.Abs(zScore / sqrt2)); return true; } - private static void GetUnorderedCoefficientStatistics(LinearModelStatistics stats, in VBuffer weights, in VBuffer> names, - ref VBuffer estimate, ref VBuffer stdErr, ref VBuffer zScore, ref VBuffer pValue, out ValueGetter>> getSlotNames) + private static void GetUnorderedCoefficientStatistics(LinearModelStatistics stats, in VBuffer weights, in VBuffer> names, + ref VBuffer estimate, ref VBuffer stdErr, ref VBuffer zScore, ref VBuffer pValue, out ValueGetter>> getSlotNames) { if (!stats._coeffStdError.HasValue) { @@ -260,7 +260,7 @@ private static void GetUnorderedCoefficientStatistics(LinearModelStatistics stat var weight = estimateEditor.Values[i - 1] = weights.GetItemOrDefault(wi); var stdError = stdErrorEditor.Values[wi] = coeffStdErrorValues[i]; zScoreEditor.Values[i - 1] = weight / stdError; - pValueEditor.Values[i - 1] = 1 - (Single)ProbabilityFunctions.Erf(Math.Abs(zScoreEditor.Values[i - 1] / sqrt2)); + pValueEditor.Values[i - 1] = 1 - (float)ProbabilityFunctions.Erf(Math.Abs(zScoreEditor.Values[i - 1] / sqrt2)); } estimate = estimateEditor.Commit(); @@ -283,7 +283,7 @@ private static void GetUnorderedCoefficientStatistics(LinearModelStatistics stat }; } - private List GetUnorderedCoefficientStatistics(LinearBinaryModelParameters parent, RoleMappedSchema schema) + private List GetUnorderedCoefficientStatistics(LinearBinaryModelParameters parent, Schema.Column featureColumn) { Contracts.AssertValue(_env); _env.CheckValue(parent, nameof(parent)); @@ -291,12 +291,14 @@ private List GetUnorderedCoefficientStatistics(LinearBina if (!_coeffStdError.HasValue) return new List(); - var weights = parent.Weights as IReadOnlyList; + var weights = parent.Weights as IReadOnlyList; _env.Assert(_paramCount == 1 || weights != null); _env.Assert(_coeffStdError.Value.Length == weights.Count + 1); var names = default(VBuffer>); - MetadataUtils.GetSlotNames(schema, RoleMappedSchema.ColumnRole.Feature, weights.Count, ref names); + + featureColumn.Metadata.GetValue(MetadataUtils.Kinds.SlotNames, ref names); + _env.Assert(names.Length > 0, "FeatureColumn has no metadata."); ReadOnlySpan stdErrorValues = _coeffStdError.Value.GetValues(); const Double sqrt2 = 1.41421356237; // Math.Sqrt(2); @@ -304,7 +306,7 @@ private List GetUnorderedCoefficientStatistics(LinearBina List result = new List(_paramCount - 1); bool denseStdError = _coeffStdError.Value.IsDense; ReadOnlySpan stdErrorIndices = _coeffStdError.Value.GetIndices(); - Single[] zScores = new Single[_paramCount - 1]; + float[] zScores = new float[_paramCount - 1]; for (int i = 1; i < _paramCount; i++) { int wi = denseStdError ? i - 1 : stdErrorIndices[i] - 1; @@ -315,7 +317,7 @@ private List GetUnorderedCoefficientStatistics(LinearBina var weight = weights[wi]; var stdError = stdErrorValues[i]; var zScore = zScores[i - 1] = weight / stdError; - var pValue = 1 - (Single)ProbabilityFunctions.Erf(Math.Abs(zScore / sqrt2)); + var pValue = 1 - (float)ProbabilityFunctions.Erf(Math.Abs(zScore / sqrt2)); result.Add(new CoefficientStatistics(name, weight, stdError, zScore, pValue)); } return result; @@ -324,33 +326,31 @@ private List GetUnorderedCoefficientStatistics(LinearBina /// /// Gets the coefficient statistics as an object. /// - internal CoefficientStatistics[] GetCoefficientStatistics(LinearBinaryModelParameters parent, RoleMappedSchema schema, int paramCountCap) + public CoefficientStatistics[] GetCoefficientStatistics(LinearBinaryModelParameters parent, Schema.Column featureColumn, int paramCountCap) { Contracts.AssertValue(_env); _env.CheckValue(parent, nameof(parent)); - _env.CheckValue(schema, nameof(schema)); _env.CheckParam(paramCountCap >= 0, nameof(paramCountCap)); if (paramCountCap > _paramCount) paramCountCap = _paramCount; - Single stdError; - Single zScore; - Single pValue; + float stdError; + float zScore; + float pValue; var bias = parent.Bias; if (!TryGetBiasStatistics(parent.Statistics, bias, out stdError, out zScore, out pValue)) return null; - var order = GetUnorderedCoefficientStatistics(parent, schema).OrderByDescending(stat => stat.ZScore).Take(paramCountCap - 1); + var order = GetUnorderedCoefficientStatistics(parent, featureColumn).OrderByDescending(stat => stat.ZScore).Take(paramCountCap - 1); return order.Prepend(new[] { new CoefficientStatistics("(Bias)", bias, stdError, zScore, pValue) }).ToArray(); } - internal void SaveText(TextWriter writer, LinearBinaryModelParameters parent, RoleMappedSchema schema, int paramCountCap) + internal void SaveText(TextWriter writer, LinearBinaryModelParameters parent, Schema.Column featureColumn, int paramCountCap) { Contracts.AssertValue(_env); _env.CheckValue(writer, nameof(writer)); _env.AssertValueOrNull(parent); - _env.AssertValueOrNull(schema); writer.WriteLine(); writer.WriteLine("*** MODEL STATISTICS SUMMARY *** "); writer.WriteLine("Count of training examples:\t{0}", _trainingExampleCount); @@ -361,7 +361,7 @@ internal void SaveText(TextWriter writer, LinearBinaryModelParameters parent, Ro if (parent == null) return; - var coeffStats = GetCoefficientStatistics(parent, schema, paramCountCap); + var coeffStats = GetCoefficientStatistics(parent, featureColumn, paramCountCap); if (coeffStats == null) return; @@ -387,7 +387,7 @@ internal void SaveText(TextWriter writer, LinearBinaryModelParameters parent, Ro /// Support method for linear models and . /// internal void SaveSummaryInKeyValuePairs(LinearBinaryModelParameters parent, - RoleMappedSchema schema, int paramCountCap, List> resultCollection) + Schema.Column featureColumn, int paramCountCap, List> resultCollection) { Contracts.AssertValue(_env); _env.AssertValue(resultCollection); @@ -400,7 +400,7 @@ internal void SaveSummaryInKeyValuePairs(LinearBinaryModelParameters parent, if (parent == null) return; - var coeffStats = GetCoefficientStatistics(parent, schema, paramCountCap); + var coeffStats = GetCoefficientStatistics(parent, featureColumn, paramCountCap); if (coeffStats == null) return; @@ -408,7 +408,7 @@ internal void SaveSummaryInKeyValuePairs(LinearBinaryModelParameters parent, { resultCollection.Add(new KeyValuePair( coeffStat.Name, - new Single[] { coeffStat.Estimate, coeffStat.StandardError, coeffStat.ZScore, coeffStat.PValue })); + new float[] { coeffStat.Estimate, coeffStat.StandardError, coeffStat.ZScore, coeffStat.PValue })); } } @@ -458,7 +458,7 @@ internal Schema.Metadata MakeStatisticsMetadata(LinearBinaryModelParameters pare return builder.GetMetadata(); } - private string DecorateProbabilityString(Single probZ) + private string DecorateProbabilityString(float probZ) { Contracts.AssertValue(_env); _env.Assert(0 <= probZ && probZ <= 1); diff --git a/test/Microsoft.ML.Tests/TrainerEstimators/LbfgsTests.cs b/test/Microsoft.ML.Tests/TrainerEstimators/LbfgsTests.cs index 30cab6c409..b70921066d 100644 --- a/test/Microsoft.ML.Tests/TrainerEstimators/LbfgsTests.cs +++ b/test/Microsoft.ML.Tests/TrainerEstimators/LbfgsTests.cs @@ -2,6 +2,7 @@ // The .NET Foundation licenses this file to you under the MIT license. // See the LICENSE file in the project root for more information. +using System.Linq; using Microsoft.ML.Core.Data; using Microsoft.ML.Data; using Microsoft.ML.Internal.Calibration; @@ -54,7 +55,7 @@ public void TestEstimatorPoissonRegression() } [Fact] - public void TestLogisticRegressionStats() + public void TestLogisticRegressionNoStats() { (IEstimator pipe, IDataView dataView) = GetBinaryClassificationPipeline(); @@ -70,7 +71,7 @@ public void TestLogisticRegressionStats() } [Fact] - public void TestLogisticRegressionStats_MKL() + public void TestLogisticRegressionWithStats() { (IEstimator pipe, IDataView dataView) = GetBinaryClassificationPipeline(); @@ -80,14 +81,24 @@ public void TestLogisticRegressionStats_MKL() s.StdComputer = new ComputeLRTrainingStdThroughHal(); })); - var transformerChain = pipe.Fit(dataView) as TransformerChain>; + var transformer = pipe.Fit(dataView) as TransformerChain>; - var linearModel = transformerChain.LastTransformer.Model.SubPredictor as LinearBinaryModelParameters; + var linearModel = transformer.LastTransformer.Model.SubPredictor as LinearBinaryModelParameters; var stats = linearModel.Statistics; LinearModelStatistics.TryGetBiasStatistics(stats, 2, out float stdError, out float zScore, out float pValue); CompareNumbersWithTolerance(stdError, 0.250672936); CompareNumbersWithTolerance(zScore, 7.97852373); + + var scoredData = transformer.Transform(dataView); + + var coeffcients = stats.GetCoefficientStatistics(linearModel, scoredData.Schema["Features"], 100); + + Assert.Equal(19, coeffcients.Length); + + foreach(var coefficient in coeffcients) + Assert.True(coefficient.StandardError < 1.0); + } } }