From a4554748ba74a75d604fa27db83325870814e625 Mon Sep 17 00:00:00 2001 From: artidoro Date: Thu, 3 Jan 2019 15:32:52 +0100 Subject: [PATCH 1/5] started working --- .../LegacyPredictionEngineBench.cs | 4 - ...sticDualCoordinateAscentClassifierBench.cs | 3 - test/Microsoft.ML.FSharp.Tests/SmokeTests.fs | 3 - .../Microsoft.ML.TestFramework/ModelHelper.cs | 267 ++++-------------- 4 files changed, 57 insertions(+), 220 deletions(-) diff --git a/test/Microsoft.ML.Benchmarks/LegacyPredictionEngineBench.cs b/test/Microsoft.ML.Benchmarks/LegacyPredictionEngineBench.cs index 24ee741b72..6931159efb 100644 --- a/test/Microsoft.ML.Benchmarks/LegacyPredictionEngineBench.cs +++ b/test/Microsoft.ML.Benchmarks/LegacyPredictionEngineBench.cs @@ -3,10 +3,6 @@ // See the LICENSE file in the project root for more information. using BenchmarkDotNet.Attributes; -using Microsoft.ML.Legacy; -using Microsoft.ML.Legacy.Data; -using Microsoft.ML.Legacy.Trainers; -using Microsoft.ML.Legacy.Transforms; namespace Microsoft.ML.Benchmarks { diff --git a/test/Microsoft.ML.Benchmarks/StochasticDualCoordinateAscentClassifierBench.cs b/test/Microsoft.ML.Benchmarks/StochasticDualCoordinateAscentClassifierBench.cs index fe16701fe7..76e99f7058 100644 --- a/test/Microsoft.ML.Benchmarks/StochasticDualCoordinateAscentClassifierBench.cs +++ b/test/Microsoft.ML.Benchmarks/StochasticDualCoordinateAscentClassifierBench.cs @@ -7,9 +7,6 @@ using BenchmarkDotNet.Attributes; using BenchmarkDotNet.Engines; using Microsoft.ML.Data; -using Microsoft.ML.Legacy.Models; -using Microsoft.ML.Legacy.Trainers; -using Microsoft.ML.Legacy.Transforms; using Microsoft.ML.Trainers; using Microsoft.ML.Transforms.Text; diff --git a/test/Microsoft.ML.FSharp.Tests/SmokeTests.fs b/test/Microsoft.ML.FSharp.Tests/SmokeTests.fs index fbd9a468ba..6ff16fe4c7 100644 --- a/test/Microsoft.ML.FSharp.Tests/SmokeTests.fs +++ b/test/Microsoft.ML.FSharp.Tests/SmokeTests.fs @@ -56,9 +56,6 @@ namespace Microsoft.ML.FSharp.Tests #nowarn "44" open System open Microsoft.ML -open Microsoft.ML.Legacy.Data -open Microsoft.ML.Legacy.Trainers -open Microsoft.ML.Legacy.Transforms open Microsoft.ML.Data open Xunit diff --git a/test/Microsoft.ML.TestFramework/ModelHelper.cs b/test/Microsoft.ML.TestFramework/ModelHelper.cs index 3f6811185e..c0dc2ccb0b 100644 --- a/test/Microsoft.ML.TestFramework/ModelHelper.cs +++ b/test/Microsoft.ML.TestFramework/ModelHelper.cs @@ -3,6 +3,7 @@ // See the LICENSE file in the project root for more information. using System.IO; +using Microsoft.ML.Core.Data; using Microsoft.ML.Data; using Microsoft.ML.EntryPoints; using Microsoft.ML.Legacy.Data; @@ -12,8 +13,8 @@ namespace Microsoft.ML.TestFramework #pragma warning disable 612, 618 public static class ModelHelper { - private static MLContext s_environment = new MLContext(seed: 1); - private static TransformModel s_housePriceModel; + private static MLContext mlContext = new MLContext(seed: 1); + private static ITransformer s_housePriceModel; public static void WriteKcHousePriceModel(string dataPath, string outputModelPath) { @@ -34,12 +35,12 @@ public static void WriteKcHousePriceModel(string dataPath, Stream stream) { s_housePriceModel = CreateKcHousePricePredictorModel(dataPath); } - s_housePriceModel.Save(s_environment, stream); + mlContext.Model.Save(s_housePriceModel, stream); } public static IDataView GetKcHouseDataView(string dataPath) { - return s_environment.Data.ReadFromTextFile(dataPath, + return mlContext.Data.ReadFromTextFile(dataPath, columns: new[] { new Data.TextLoader.Column("Id", Data.DataKind.TX, 0), @@ -69,213 +70,59 @@ public static IDataView GetKcHouseDataView(string dataPath) ); } - private static TransformModel CreateKcHousePricePredictorModel(string dataPath) + private static ITransformer CreateKcHousePricePredictorModel(string dataPath) { - Experiment experiment = s_environment.CreateExperiment(); - var importData = new Legacy.Data.TextLoader(dataPath) - { - Arguments = new TextLoaderArguments - { - Separator = new[] { ',' }, - HasHeader = true, - Column = new[] - { - new TextLoaderColumn() - { - Name = "Id", - Source = new [] { new TextLoaderRange(0) }, - Type = Legacy.Data.DataKind.Text - }, - - new TextLoaderColumn() - { - Name = "Date", - Source = new [] { new TextLoaderRange(1) }, - Type = Legacy.Data.DataKind.Text - }, - - new TextLoaderColumn() - { - Name = "Label", - Source = new [] { new TextLoaderRange(2) }, - Type = Legacy.Data.DataKind.Num - }, - - new TextLoaderColumn() - { - Name = "Bedrooms", - Source = new [] { new TextLoaderRange(3) }, - Type = Legacy.Data.DataKind.Num - }, - - new TextLoaderColumn() - { - Name = "Bathrooms", - Source = new [] { new TextLoaderRange(4) }, - Type = Legacy.Data.DataKind.Num - }, - - new TextLoaderColumn() - { - Name = "SqftLiving", - Source = new [] { new TextLoaderRange(5) }, - Type = Legacy.Data.DataKind.Num - }, - - new TextLoaderColumn() - { - Name = "SqftLot", - Source = new [] { new TextLoaderRange(6) }, - Type = Legacy.Data.DataKind.Num - }, - - new TextLoaderColumn() - { - Name = "Floors", - Source = new [] { new TextLoaderRange(7) }, - Type = Legacy.Data.DataKind.Num - }, - - new TextLoaderColumn() - { - Name = "Waterfront", - Source = new [] { new TextLoaderRange(8) }, - Type = Legacy.Data.DataKind.Num - }, - - new TextLoaderColumn() - { - Name = "View", - Source = new [] { new TextLoaderRange(9) }, - Type = Legacy.Data.DataKind.Num - }, - - new TextLoaderColumn() - { - Name = "Condition", - Source = new [] { new TextLoaderRange(10) }, - Type = Legacy.Data.DataKind.Num - }, - - new TextLoaderColumn() - { - Name = "Grade", - Source = new [] { new TextLoaderRange(11) }, - Type = Legacy.Data.DataKind.Num - }, - - new TextLoaderColumn() - { - Name = "SqftAbove", - Source = new [] { new TextLoaderRange(12) }, - Type = Legacy.Data.DataKind.Num - }, - - new TextLoaderColumn() - { - Name = "SqftBasement", - Source = new [] { new TextLoaderRange(13) }, - Type = Legacy.Data.DataKind.Num - }, - - new TextLoaderColumn() - { - Name = "YearBuilt", - Source = new [] { new TextLoaderRange(14) }, - Type = Legacy.Data.DataKind.Num - }, - - new TextLoaderColumn() - { - Name = "YearRenovated", - Source = new [] { new TextLoaderRange(15) }, - Type = Legacy.Data.DataKind.Num - }, - - new TextLoaderColumn() - { - Name = "Zipcode", - Source = new [] { new TextLoaderRange(16) }, - Type = Legacy.Data.DataKind.Num - }, - - new TextLoaderColumn() - { - Name = "Lat", - Source = new [] { new TextLoaderRange(17) }, - Type = Legacy.Data.DataKind.Num - }, - - new TextLoaderColumn() - { - Name = "Long", - Source = new [] { new TextLoaderRange(18) }, - Type = Legacy.Data.DataKind.Num - }, - - new TextLoaderColumn() - { - Name = "SqftLiving15", - Source = new [] { new TextLoaderRange(19) }, - Type = Legacy.Data.DataKind.Num - }, - - new TextLoaderColumn() - { - Name = "SqftLot15", - Source = new [] { new TextLoaderRange(20) }, - Type = Legacy.Data.DataKind.Num - }, - } - } - - //new Data.CustomTextLoader(); - // importData.CustomSchema = dataSchema; - // - }; - - Legacy.Data.TextLoader.Output imported = experiment.Add(importData); - var numericalConcatenate = new Legacy.Transforms.ColumnConcatenator(); - numericalConcatenate.Data = imported.Data; - numericalConcatenate.AddColumn("NumericalFeatures", "SqftLiving", "SqftLot", "SqftAbove", "SqftBasement", "Lat", "Long", "SqftLiving15", "SqftLot15"); - Legacy.Transforms.ColumnConcatenator.Output numericalConcatenated = experiment.Add(numericalConcatenate); - - var categoryConcatenate = new Legacy.Transforms.ColumnConcatenator(); - categoryConcatenate.Data = numericalConcatenated.OutputData; - categoryConcatenate.AddColumn("CategoryFeatures", "Bedrooms", "Bathrooms", "Floors", "Waterfront", "View", "Condition", "Grade", "YearBuilt", "YearRenovated", "Zipcode"); - Legacy.Transforms.ColumnConcatenator.Output categoryConcatenated = experiment.Add(categoryConcatenate); - - var categorize = new Legacy.Transforms.CategoricalOneHotVectorizer(); - categorize.AddColumn("CategoryFeatures"); - categorize.Data = categoryConcatenated.OutputData; - Legacy.Transforms.CategoricalOneHotVectorizer.Output categorized = experiment.Add(categorize); - - var featuresConcatenate = new Legacy.Transforms.ColumnConcatenator(); - featuresConcatenate.Data = categorized.OutputData; - featuresConcatenate.AddColumn("Features", "NumericalFeatures", "CategoryFeatures"); - Legacy.Transforms.ColumnConcatenator.Output featuresConcatenated = experiment.Add(featuresConcatenate); - - var learner = new Legacy.Trainers.StochasticDualCoordinateAscentRegressor(); - learner.TrainingData = featuresConcatenated.OutputData; - learner.NumThreads = 1; - Legacy.Trainers.StochasticDualCoordinateAscentRegressor.Output learnerOutput = experiment.Add(learner); - - var combineModels = new Legacy.Transforms.ManyHeterogeneousModelCombiner(); - combineModels.TransformModels = new ArrayVar(numericalConcatenated.Model, categoryConcatenated.Model, categorized.Model, featuresConcatenated.Model); - combineModels.PredictorModel = learnerOutput.PredictorModel; - Legacy.Transforms.ManyHeterogeneousModelCombiner.Output combinedModels = experiment.Add(combineModels); - - var scorer = new Legacy.Transforms.Scorer - { - PredictorModel = combinedModels.PredictorModel - }; - - var scorerOutput = experiment.Add(scorer); - experiment.Compile(); - experiment.SetInput(importData.InputFile, new SimpleFileHandle(s_environment, dataPath, false, false)); - experiment.Run(); - - return experiment.GetOutput(scorerOutput.ScoringTransform); + Experiment experiment = mlContext.CreateExperiment(); + + var data = GetKcHouseDataView(dataPath); + var pipeline = mlContext.Transforms.Concatenate("NumericalFeatures", "SqftLiving", "SqftLot", "SqftAbove", "SqftBasement", "Lat", "Long", "SqftLiving15", "SqftLot15") + .Append(mlContext.Transforms.Concatenate("CategoryFeatures", "Bedrooms", "Bathrooms", "Floors", "Waterfront", "View", "Condition", "Grade", "YearBuilt", "YearRenovated", "Zipcode")) + .Append(mlContext.Transforms.Categorical.OneHotEncoding("CategoryFeatures")) + .Append(mlContext.Transforms.Concatenate("Features", "NumericalFeatures", "CategoryFeatures")) + .Append(mlContext.Regression.Trainers.StochasticDualCoordinateAscent(advancedSettings: s => { s.NumThreads = 1; })); + + //var numericalConcatenate = new Legacy.Transforms.ColumnConcatenator(); + //numericalConcatenate.Data = GetKcHouseDataView(dataPath); + //numericalConcatenate.AddColumn("NumericalFeatures", "SqftLiving", "SqftLot", "SqftAbove", "SqftBasement", "Lat", "Long", "SqftLiving15", "SqftLot15"); + //Legacy.Transforms.ColumnConcatenator.Output numericalConcatenated = experiment.Add(numericalConcatenate); + + //var categoryConcatenate = new Legacy.Transforms.ColumnConcatenator(); + //categoryConcatenate.Data = numericalConcatenated.OutputData; + //categoryConcatenate.AddColumn("CategoryFeatures", "Bedrooms", "Bathrooms", "Floors", "Waterfront", "View", "Condition", "Grade", "YearBuilt", "YearRenovated", "Zipcode"); + //Legacy.Transforms.ColumnConcatenator.Output categoryConcatenated = experiment.Add(categoryConcatenate); + + //var categorize = new Legacy.Transforms.CategoricalOneHotVectorizer(); + //categorize.AddColumn("CategoryFeatures"); + //categorize.Data = categoryConcatenated.OutputData; + //Legacy.Transforms.CategoricalOneHotVectorizer.Output categorized = experiment.Add(categorize); + + //var featuresConcatenate = new Legacy.Transforms.ColumnConcatenator(); + //featuresConcatenate.Data = categorized.OutputData; + //featuresConcatenate.AddColumn("Features", "NumericalFeatures", "CategoryFeatures"); + //Legacy.Transforms.ColumnConcatenator.Output featuresConcatenated = experiment.Add(featuresConcatenate); + + //var learner = new Legacy.Trainers.StochasticDualCoordinateAscentRegressor(); + //learner.TrainingData = featuresConcatenated.OutputData; + //learner.NumThreads = 1; + //Legacy.Trainers.StochasticDualCoordinateAscentRegressor.Output learnerOutput = experiment.Add(learner); + + //var combineModels = new Legacy.Transforms.ManyHeterogeneousModelCombiner(); + //combineModels.TransformModels = new ArrayVar(numericalConcatenated.Model, categoryConcatenated.Model, categorized.Model, featuresConcatenated.Model); + //combineModels.PredictorModel = learnerOutput.PredictorModel; + //Legacy.Transforms.ManyHeterogeneousModelCombiner.Output combinedModels = experiment.Add(combineModels); + + //var scorer = new Legacy.Transforms.Scorer + //{ + // PredictorModel = combinedModels.PredictorModel + //}; + + //var scorerOutput = experiment.Add(scorer); + //experiment.Compile(); + //experiment.SetInput(importData.InputFile, new SimpleFileHandle(mlContext, dataPath, false, false)); + //experiment.Run(); + + //return experiment.GetOutput(scorerOutput.ScoringTransform); + return pipeline.Fit(data); } } #pragma warning restore 612, 618 From f864ec92480f79e46e10f61d28e0b8e8a6659f0f Mon Sep 17 00:00:00 2001 From: artidoro Date: Fri, 4 Jan 2019 10:47:30 +0100 Subject: [PATCH 2/5] model helper and related testsd --- .../LegacyPredictionEngineBench.cs | 4 + ...sticDualCoordinateAscentClassifierBench.cs | 3 + test/Microsoft.ML.FSharp.Tests/SmokeTests.fs | 3 + .../Microsoft.ML.TestFramework/ModelHelper.cs | 83 +--------------- .../PredictionModelTests.cs | 97 +++++++------------ .../Scenarios/HousePricePredictionTests.cs | 14 +-- 6 files changed, 58 insertions(+), 146 deletions(-) diff --git a/test/Microsoft.ML.Benchmarks/LegacyPredictionEngineBench.cs b/test/Microsoft.ML.Benchmarks/LegacyPredictionEngineBench.cs index 6931159efb..24ee741b72 100644 --- a/test/Microsoft.ML.Benchmarks/LegacyPredictionEngineBench.cs +++ b/test/Microsoft.ML.Benchmarks/LegacyPredictionEngineBench.cs @@ -3,6 +3,10 @@ // See the LICENSE file in the project root for more information. using BenchmarkDotNet.Attributes; +using Microsoft.ML.Legacy; +using Microsoft.ML.Legacy.Data; +using Microsoft.ML.Legacy.Trainers; +using Microsoft.ML.Legacy.Transforms; namespace Microsoft.ML.Benchmarks { diff --git a/test/Microsoft.ML.Benchmarks/StochasticDualCoordinateAscentClassifierBench.cs b/test/Microsoft.ML.Benchmarks/StochasticDualCoordinateAscentClassifierBench.cs index 76e99f7058..fe16701fe7 100644 --- a/test/Microsoft.ML.Benchmarks/StochasticDualCoordinateAscentClassifierBench.cs +++ b/test/Microsoft.ML.Benchmarks/StochasticDualCoordinateAscentClassifierBench.cs @@ -7,6 +7,9 @@ using BenchmarkDotNet.Attributes; using BenchmarkDotNet.Engines; using Microsoft.ML.Data; +using Microsoft.ML.Legacy.Models; +using Microsoft.ML.Legacy.Trainers; +using Microsoft.ML.Legacy.Transforms; using Microsoft.ML.Trainers; using Microsoft.ML.Transforms.Text; diff --git a/test/Microsoft.ML.FSharp.Tests/SmokeTests.fs b/test/Microsoft.ML.FSharp.Tests/SmokeTests.fs index 6ff16fe4c7..fbd9a468ba 100644 --- a/test/Microsoft.ML.FSharp.Tests/SmokeTests.fs +++ b/test/Microsoft.ML.FSharp.Tests/SmokeTests.fs @@ -56,6 +56,9 @@ namespace Microsoft.ML.FSharp.Tests #nowarn "44" open System open Microsoft.ML +open Microsoft.ML.Legacy.Data +open Microsoft.ML.Legacy.Trainers +open Microsoft.ML.Legacy.Transforms open Microsoft.ML.Data open Xunit diff --git a/test/Microsoft.ML.TestFramework/ModelHelper.cs b/test/Microsoft.ML.TestFramework/ModelHelper.cs index c0dc2ccb0b..94c11c3340 100644 --- a/test/Microsoft.ML.TestFramework/ModelHelper.cs +++ b/test/Microsoft.ML.TestFramework/ModelHelper.cs @@ -2,43 +2,14 @@ // The .NET Foundation licenses this file to you under the MIT license. // See the LICENSE file in the project root for more information. -using System.IO; using Microsoft.ML.Core.Data; using Microsoft.ML.Data; -using Microsoft.ML.EntryPoints; -using Microsoft.ML.Legacy.Data; namespace Microsoft.ML.TestFramework { -#pragma warning disable 612, 618 public static class ModelHelper { - private static MLContext mlContext = new MLContext(seed: 1); - private static ITransformer s_housePriceModel; - - public static void WriteKcHousePriceModel(string dataPath, string outputModelPath) - { - if (File.Exists(outputModelPath)) - { - File.Delete(outputModelPath); - } - - using (var saveStream = File.OpenWrite(outputModelPath)) - { - WriteKcHousePriceModel(dataPath, saveStream); - } - } - - public static void WriteKcHousePriceModel(string dataPath, Stream stream) - { - if (s_housePriceModel == null) - { - s_housePriceModel = CreateKcHousePricePredictorModel(dataPath); - } - mlContext.Model.Save(s_housePriceModel, stream); - } - - public static IDataView GetKcHouseDataView(string dataPath) + public static IDataView GetKcHouseDataView(MLContext mlContext, string dataPath) { return mlContext.Data.ReadFromTextFile(dataPath, columns: new[] @@ -70,60 +41,14 @@ public static IDataView GetKcHouseDataView(string dataPath) ); } - private static ITransformer CreateKcHousePricePredictorModel(string dataPath) + public static IEstimator GetKcHousePipeline(MLContext mlContext) { - Experiment experiment = mlContext.CreateExperiment(); - - var data = GetKcHouseDataView(dataPath); - var pipeline = mlContext.Transforms.Concatenate("NumericalFeatures", "SqftLiving", "SqftLot", "SqftAbove", "SqftBasement", "Lat", "Long", "SqftLiving15", "SqftLot15") + // Define pipeline. + return mlContext.Transforms.Concatenate("NumericalFeatures", "SqftLiving", "SqftLot", "SqftAbove", "SqftBasement", "Lat", "Long", "SqftLiving15", "SqftLot15") .Append(mlContext.Transforms.Concatenate("CategoryFeatures", "Bedrooms", "Bathrooms", "Floors", "Waterfront", "View", "Condition", "Grade", "YearBuilt", "YearRenovated", "Zipcode")) .Append(mlContext.Transforms.Categorical.OneHotEncoding("CategoryFeatures")) .Append(mlContext.Transforms.Concatenate("Features", "NumericalFeatures", "CategoryFeatures")) .Append(mlContext.Regression.Trainers.StochasticDualCoordinateAscent(advancedSettings: s => { s.NumThreads = 1; })); - - //var numericalConcatenate = new Legacy.Transforms.ColumnConcatenator(); - //numericalConcatenate.Data = GetKcHouseDataView(dataPath); - //numericalConcatenate.AddColumn("NumericalFeatures", "SqftLiving", "SqftLot", "SqftAbove", "SqftBasement", "Lat", "Long", "SqftLiving15", "SqftLot15"); - //Legacy.Transforms.ColumnConcatenator.Output numericalConcatenated = experiment.Add(numericalConcatenate); - - //var categoryConcatenate = new Legacy.Transforms.ColumnConcatenator(); - //categoryConcatenate.Data = numericalConcatenated.OutputData; - //categoryConcatenate.AddColumn("CategoryFeatures", "Bedrooms", "Bathrooms", "Floors", "Waterfront", "View", "Condition", "Grade", "YearBuilt", "YearRenovated", "Zipcode"); - //Legacy.Transforms.ColumnConcatenator.Output categoryConcatenated = experiment.Add(categoryConcatenate); - - //var categorize = new Legacy.Transforms.CategoricalOneHotVectorizer(); - //categorize.AddColumn("CategoryFeatures"); - //categorize.Data = categoryConcatenated.OutputData; - //Legacy.Transforms.CategoricalOneHotVectorizer.Output categorized = experiment.Add(categorize); - - //var featuresConcatenate = new Legacy.Transforms.ColumnConcatenator(); - //featuresConcatenate.Data = categorized.OutputData; - //featuresConcatenate.AddColumn("Features", "NumericalFeatures", "CategoryFeatures"); - //Legacy.Transforms.ColumnConcatenator.Output featuresConcatenated = experiment.Add(featuresConcatenate); - - //var learner = new Legacy.Trainers.StochasticDualCoordinateAscentRegressor(); - //learner.TrainingData = featuresConcatenated.OutputData; - //learner.NumThreads = 1; - //Legacy.Trainers.StochasticDualCoordinateAscentRegressor.Output learnerOutput = experiment.Add(learner); - - //var combineModels = new Legacy.Transforms.ManyHeterogeneousModelCombiner(); - //combineModels.TransformModels = new ArrayVar(numericalConcatenated.Model, categoryConcatenated.Model, categorized.Model, featuresConcatenated.Model); - //combineModels.PredictorModel = learnerOutput.PredictorModel; - //Legacy.Transforms.ManyHeterogeneousModelCombiner.Output combinedModels = experiment.Add(combineModels); - - //var scorer = new Legacy.Transforms.Scorer - //{ - // PredictorModel = combinedModels.PredictorModel - //}; - - //var scorerOutput = experiment.Add(scorer); - //experiment.Compile(); - //experiment.SetInput(importData.InputFile, new SimpleFileHandle(mlContext, dataPath, false, false)); - //experiment.Run(); - - //return experiment.GetOutput(scorerOutput.ScoringTransform); - return pipeline.Fit(data); } } -#pragma warning restore 612, 618 } diff --git a/test/Microsoft.ML.Tests/PredictionModelTests.cs b/test/Microsoft.ML.Tests/PredictionModelTests.cs index 7d8d565445..4aaa194605 100644 --- a/test/Microsoft.ML.Tests/PredictionModelTests.cs +++ b/test/Microsoft.ML.Tests/PredictionModelTests.cs @@ -2,8 +2,6 @@ // The .NET Foundation licenses this file to you under the MIT license. // See the LICENSE file in the project root for more information. -using System.IO; -using System.Threading.Tasks; using Microsoft.ML.Data; using Microsoft.ML.TestFramework; using Xunit; @@ -11,7 +9,6 @@ namespace Microsoft.ML.EntryPoints.Tests { -#pragma warning disable 612 public class PredictionModelTests : BaseTestClass { public class HousePriceData @@ -43,67 +40,48 @@ public class HousePricePrediction } [Fact(Skip = "Missing data set. See https://github.com/dotnet/machinelearning/issues/3")] - public async Task ReadStrongTypeModelFromStream() + public void ReadStrongTypeModelFromStream() { - using (var memoryStream = new MemoryStream()) - { - ModelHelper.WriteKcHousePriceModel(GetDataPath("kc_house_data.csv"), memoryStream); - memoryStream.Position = 0; - - var model = await Legacy.PredictionModel.ReadAsync(memoryStream); - - HousePricePrediction prediction = model.Predict(new HousePriceData() - { - Bedrooms = 3, - Bathrooms = 1.75f, - SqftLiving = 2450, - SqftLot = 2691, - Floors = 2, - Waterfront = 0, - View = 0, - Condition = 3, - Grade = 8, - SqftAbove = 1750, - SqftBasement = 700, - YearBuilt = 1915, - YearRenovated = 0, - Zipcode = 98119, - Lat = 47.6386f, - Long = -122.36f, - SqftLiving15 = 1760, - SqftLot15 = 3573 - }); + var mlContext = new MLContext(seed: 1); + var data = ModelHelper.GetKcHouseDataView(mlContext, GetDataPath("kc_house_data.csv")); + var pipeline = ModelHelper.GetKcHousePipeline(mlContext); + var model = pipeline.Fit(data); - Assert.InRange(prediction.Price, 790_000, 850_000); + var engine = model.CreatePredictionEngine(mlContext); + HousePricePrediction prediction = engine.Predict(new HousePriceData() + { + Bedrooms = 3, + Bathrooms = 1.75f, + SqftLiving = 2450, + SqftLot = 2691, + Floors = 2, + Waterfront = 0, + View = 0, + Condition = 3, + Grade = 8, + SqftAbove = 1750, + SqftBasement = 700, + YearBuilt = 1915, + YearRenovated = 0, + Zipcode = 98119, + Lat = 47.6386f, + Long = -122.36f, + SqftLiving15 = 1760, + SqftLot15 = 3573 + }); - var dataView = model.Predict(ModelHelper.GetKcHouseDataView(GetDataPath("kc_house_data.csv"))); - dataView.Schema.TryGetColumnIndex("Score", out int scoreColumn); - using (var cursor = dataView.GetRowCursor((int col) => col == scoreColumn)) - { - var scoreGetter = cursor.GetGetter(scoreColumn); - float score = 0; - cursor.MoveNext(); - scoreGetter(ref score); - Assert.InRange(score, 100_000, 200_000); - } - - Legacy.PredictionModel nonGenericModel; - using (var anotherStream = new MemoryStream()) - { - await model.WriteAsync(anotherStream); - nonGenericModel = await Legacy.PredictionModel.ReadAsync(anotherStream); - } + Assert.InRange(prediction.Price, 790_000, 850_000); - dataView = nonGenericModel.Predict(ModelHelper.GetKcHouseDataView(GetDataPath("kc_house_data.csv"))); - using (var cursor = dataView.GetRowCursor((int col) => col == scoreColumn)) - { - var scoreGetter = cursor.GetGetter(scoreColumn); - float score = 0; - cursor.MoveNext(); - scoreGetter(ref score); - Assert.InRange(score, 100_000, 200_000); - } + var dataView = model.Transform(data); + dataView.Schema.TryGetColumnIndex("Score", out int scoreColumn); + using (var cursor = dataView.GetRowCursor((int col) => col == scoreColumn)) + { + var scoreGetter = cursor.GetGetter(scoreColumn); + float score = 0; + cursor.MoveNext(); + scoreGetter(ref score); + Assert.InRange(score, 100_000, 200_000); } } @@ -112,5 +90,4 @@ public PredictionModelTests(ITestOutputHelper output) { } } -#pragma warning restore 612 } diff --git a/test/Microsoft.ML.Tests/Scenarios/HousePricePredictionTests.cs b/test/Microsoft.ML.Tests/Scenarios/HousePricePredictionTests.cs index ef8f704f4d..bc88045562 100644 --- a/test/Microsoft.ML.Tests/Scenarios/HousePricePredictionTests.cs +++ b/test/Microsoft.ML.Tests/Scenarios/HousePricePredictionTests.cs @@ -16,16 +16,17 @@ A real-estate firm Contoso wants to add a house price prediction to their ASP.NE The application will let users submit information about their house, and see a price they could expect if they put the house for sale. Because real estate transaction data is public, Contoso has historical data they intend to use to train Machine Learning prediction engine. */ -#pragma warning disable 612 [Fact(Skip = "Missing data set. See https://github.com/dotnet/machinelearning/issues/3")] - public async void PredictHousePriceModelTest() + public void PredictHousePriceModelTest() { - string modelFilePath = GetOutputPath("PredictHousePriceModelTest.zip"); - ModelHelper.WriteKcHousePriceModel(GetDataPath("kc_house_data.csv"), modelFilePath); + var mlContext = new MLContext(seed: 1); + var data = ModelHelper.GetKcHouseDataView(mlContext, GetDataPath("kc_house_data.csv")); + var pipeline = ModelHelper.GetKcHousePipeline(mlContext); + var model = pipeline.Fit(data); - var model = await Legacy.PredictionModel.ReadAsync(modelFilePath); + var engine = model.CreatePredictionEngine(mlContext); - HousePricePrediction prediction = model.Predict(new HousePriceData() + HousePricePrediction prediction = engine.Predict(new HousePriceData() { Bedrooms = 3, Bathrooms = 2, @@ -49,7 +50,6 @@ public async void PredictHousePriceModelTest() Assert.InRange(prediction.Price, 260_000, 330_000); } -#pragma warning restore 612 public class HousePriceData { From fab927902082e070ac1b6ef00a55e9498da551d8 Mon Sep 17 00:00:00 2001 From: artidoro Date: Fri, 4 Jan 2019 13:20:41 +0100 Subject: [PATCH 3/5] fsharp tests --- .../Microsoft.ML.FSharp.Tests.fsproj | 14 +- test/Microsoft.ML.FSharp.Tests/SmokeTests.fs | 177 +++++------------- .../EnvironmentExtensions.cs | 3 - .../Microsoft.ML.TestFramework.csproj | 1 - .../Microsoft.ML.TestFramework/ModelHelper.cs | 42 ++--- 5 files changed, 67 insertions(+), 170 deletions(-) diff --git a/test/Microsoft.ML.FSharp.Tests/Microsoft.ML.FSharp.Tests.fsproj b/test/Microsoft.ML.FSharp.Tests/Microsoft.ML.FSharp.Tests.fsproj index bdf496e780..d3db320af4 100644 --- a/test/Microsoft.ML.FSharp.Tests/Microsoft.ML.FSharp.Tests.fsproj +++ b/test/Microsoft.ML.FSharp.Tests/Microsoft.ML.FSharp.Tests.fsproj @@ -22,20 +22,8 @@ - - - - - - - - - - - - @@ -43,4 +31,4 @@ - + \ No newline at end of file diff --git a/test/Microsoft.ML.FSharp.Tests/SmokeTests.fs b/test/Microsoft.ML.FSharp.Tests/SmokeTests.fs index fbd9a468ba..df0bfd2185 100644 --- a/test/Microsoft.ML.FSharp.Tests/SmokeTests.fs +++ b/test/Microsoft.ML.FSharp.Tests/SmokeTests.fs @@ -56,19 +56,16 @@ namespace Microsoft.ML.FSharp.Tests #nowarn "44" open System open Microsoft.ML -open Microsoft.ML.Legacy.Data -open Microsoft.ML.Legacy.Trainers -open Microsoft.ML.Legacy.Transforms open Microsoft.ML.Data open Xunit module SmokeTest1 = type SentimentData() = - [] + [] + val mutable Sentiment : bool + [] val mutable SentimentText : string - [] - val mutable Sentiment : float32 type SentimentPrediction() = [] @@ -77,51 +74,23 @@ module SmokeTest1 = [] let ``FSharp-Sentiment-Smoke-Test`` () = - // See https://github.com/dotnet/machinelearning/issues/401: forces the loading of ML.NET component assemblies - let _load = - [ typeof; - typeof; - typeof] // ML.EntryPoints - let testDataPath = __SOURCE_DIRECTORY__ + @"/../data/wikipedia-detox-250-line-data.tsv" - let pipeline = Legacy.LearningPipeline() - - pipeline.Add( - Microsoft.ML.Legacy.Data.TextLoader(testDataPath).CreateFrom( - Arguments = - TextLoaderArguments( - HasHeader = true, - Column = [| TextLoaderColumn(Name = "Label", - Source = [| TextLoaderRange(0) |], - Type = Nullable (Legacy.Data.DataKind.Num)) - TextLoaderColumn(Name = "SentimentText", - Source = [| TextLoaderRange(1) |], - Type = Nullable (Legacy.Data.DataKind.Text)) |] - ))) - - pipeline.Add( - TextFeaturizer( - "Features", [| "SentimentText" |], - KeepPunctuations = false, - OutputTokens = true, - VectorNormalizer = TextFeaturizingEstimatorTextNormKind.L2 - )) - - pipeline.Add( - FastTreeBinaryClassifier( - NumLeaves = 5, - NumTrees = 5, - MinDocumentsInLeafs = 2 - )) - - let model = pipeline.Train() + let ml = MLContext(seed = new System.Nullable(1), conc = 1) + let data = ml.Data.ReadFromTextFile(testDataPath, hasHeader = true) + + let pipeline = ml.Transforms.Text.FeaturizeText("SentimentText", "Features") + .Append(ml.BinaryClassification.Trainers.FastTree(numLeaves = 5, numTrees = 5)) + + let model = pipeline.Fit(data) + let engine = model.CreatePredictionEngine(ml) + let predictions = [ SentimentData(SentimentText = "This is a gross exaggeration. Nobody is setting a kangaroo court. There was a simple addition.") SentimentData(SentimentText = "Sort of ok") SentimentData(SentimentText = "Joe versus the Volcano Coffee Company is a great film.") ] - |> model.Predict + |> List.map engine.Predict let predictionResults = [ for p in predictions -> p.Sentiment ] Assert.Equal(predictionResults, [ false; true; true ]) @@ -131,11 +100,11 @@ module SmokeTest2 = [] type SentimentData = - { [] - SentimentText : string - - [] - Sentiment : float32 } + { [] + Sentiment : bool + + [] + SentimentText : string } [] type SentimentPrediction = @@ -145,51 +114,23 @@ module SmokeTest2 = [] let ``FSharp-Sentiment-Smoke-Test`` () = - // See https://github.com/dotnet/machinelearning/issues/401: forces the loading of ML.NET component assemblies - let _load = - [ typeof; - typeof; - typeof] // ML.EntryPoints - let testDataPath = __SOURCE_DIRECTORY__ + @"/../data/wikipedia-detox-250-line-data.tsv" + + let ml = MLContext(seed = new System.Nullable(1), conc = 1) + let data = ml.Data.ReadFromTextFile(testDataPath, hasHeader = true) + + let pipeline = ml.Transforms.Text.FeaturizeText("SentimentText", "Features") + .Append(ml.BinaryClassification.Trainers.FastTree(numLeaves = 5, numTrees = 5)) + + let model = pipeline.Fit(data) - let pipeline = Legacy.LearningPipeline() - - pipeline.Add( - Microsoft.ML.Legacy.Data.TextLoader(testDataPath).CreateFrom( - Arguments = - TextLoaderArguments( - HasHeader = true, - Column = [| TextLoaderColumn(Name = "Label", - Source = [| TextLoaderRange(0) |], - Type = Nullable (Legacy.Data.DataKind.Num)) - TextLoaderColumn(Name = "SentimentText", - Source = [| TextLoaderRange(1) |], - Type = Nullable (Legacy.Data.DataKind.Text)) |] - ))) - - pipeline.Add( - TextFeaturizer( - "Features", [| "SentimentText" |], - KeepPunctuations = false, - OutputTokens = true, - VectorNormalizer = TextFeaturizingEstimatorTextNormKind.L2 - )) - - pipeline.Add( - FastTreeBinaryClassifier( - NumLeaves = 5, - NumTrees = 5, - MinDocumentsInLeafs = 2 - )) - - let model = pipeline.Train() + let engine = model.CreatePredictionEngine(ml) let predictions = - [ { SentimentText = "This is a gross exaggeration. Nobody is setting a kangaroo court. There was a simple addition."; Sentiment = 0.0f } - { SentimentText = "Sort of ok"; Sentiment = 0.0f } - { SentimentText = "Joe versus the Volcano Coffee Company is a great film."; Sentiment = 0.0f } ] - |> model.Predict + [ { SentimentText = "This is a gross exaggeration. Nobody is setting a kangaroo court. There was a simple addition."; Sentiment = false } + { SentimentText = "Sort of ok"; Sentiment = false } + { SentimentText = "Joe versus the Volcano Coffee Company is a great film."; Sentiment = false } ] + |> List.map engine.Predict let predictionResults = [ for p in predictions -> p.Sentiment ] Assert.Equal(predictionResults, [ false; true; true ]) @@ -197,11 +138,11 @@ module SmokeTest2 = module SmokeTest3 = type SentimentData() = - [] - member val SentimentText = "".AsMemory() with get, set + [] + member val Sentiment = false with get, set - [] - member val Sentiment = 0.0 with get, set + [] + member val SentimentText = "".AsMemory() with get, set type SentimentPrediction() = [] @@ -210,51 +151,23 @@ module SmokeTest3 = [] let ``FSharp-Sentiment-Smoke-Test`` () = - // See https://github.com/dotnet/machinelearning/issues/401: forces the loading of ML.NET component assemblies - let _load = - [ typeof; - typeof; - typeof] // ML.EntryPoints - let testDataPath = __SOURCE_DIRECTORY__ + @"/../data/wikipedia-detox-250-line-data.tsv" - let pipeline = Legacy.LearningPipeline() - - pipeline.Add( - Microsoft.ML.Legacy.Data.TextLoader(testDataPath).CreateFrom( - Arguments = - TextLoaderArguments( - HasHeader = true, - Column = [| TextLoaderColumn(Name = "Label", - Source = [| TextLoaderRange(0) |], - Type = Nullable (Legacy.Data.DataKind.Num)) - TextLoaderColumn(Name = "SentimentText", - Source = [| TextLoaderRange(1) |], - Type = Nullable (Legacy.Data.DataKind.Text)) |] - ))) - - pipeline.Add( - TextFeaturizer( - "Features", [| "SentimentText" |], - KeepPunctuations = false, - OutputTokens = true, - VectorNormalizer = TextFeaturizingEstimatorTextNormKind.L2 - )) - - pipeline.Add( - FastTreeBinaryClassifier( - NumLeaves = 5, - NumTrees = 5, - MinDocumentsInLeafs = 2 - )) - - let model = pipeline.Train() + let ml = MLContext(seed = new System.Nullable(1), conc = 1) + let data = ml.Data.ReadFromTextFile(testDataPath, hasHeader = true) + + let pipeline = ml.Transforms.Text.FeaturizeText("SentimentText", "Features") + .Append(ml.BinaryClassification.Trainers.FastTree(numLeaves = 5, numTrees = 5)) + + let model = pipeline.Fit(data) + + let engine = model.CreatePredictionEngine(ml) let predictions = [ SentimentData(SentimentText = "This is a gross exaggeration. Nobody is setting a kangaroo court. There was a simple addition.".AsMemory()) SentimentData(SentimentText = "Sort of ok".AsMemory()) SentimentData(SentimentText = "Joe versus the Volcano Coffee Company is a great film.".AsMemory()) ] - |> model.Predict + |> List.map engine.Predict let predictionResults = [ for p in predictions -> p.Sentiment ] Assert.Equal(predictionResults, [ false; true; true ]) diff --git a/test/Microsoft.ML.TestFramework/EnvironmentExtensions.cs b/test/Microsoft.ML.TestFramework/EnvironmentExtensions.cs index 0302b6c2a4..1b8cbdcb60 100644 --- a/test/Microsoft.ML.TestFramework/EnvironmentExtensions.cs +++ b/test/Microsoft.ML.TestFramework/EnvironmentExtensions.cs @@ -25,9 +25,6 @@ public static TEnvironment AddStandardComponents(this TEnvironment env.ComponentCatalog.RegisterAssembly(typeof(EnsembleModelParameters).Assembly); // ML.Ensemble env.ComponentCatalog.RegisterAssembly(typeof(KMeansModelParameters).Assembly); // ML.KMeansClustering env.ComponentCatalog.RegisterAssembly(typeof(PcaModelParameters).Assembly); // ML.PCA -#pragma warning disable 612 - env.ComponentCatalog.RegisterAssembly(typeof(Experiment).Assembly); // ML.Legacy -#pragma warning restore 612 env.ComponentCatalog.RegisterAssembly(typeof(CVSplit).Assembly); // ML.EntryPoints return env; } diff --git a/test/Microsoft.ML.TestFramework/Microsoft.ML.TestFramework.csproj b/test/Microsoft.ML.TestFramework/Microsoft.ML.TestFramework.csproj index 62ae1cea6a..3854fc5f67 100644 --- a/test/Microsoft.ML.TestFramework/Microsoft.ML.TestFramework.csproj +++ b/test/Microsoft.ML.TestFramework/Microsoft.ML.TestFramework.csproj @@ -15,7 +15,6 @@ - diff --git a/test/Microsoft.ML.TestFramework/ModelHelper.cs b/test/Microsoft.ML.TestFramework/ModelHelper.cs index 94c11c3340..953b5cab70 100644 --- a/test/Microsoft.ML.TestFramework/ModelHelper.cs +++ b/test/Microsoft.ML.TestFramework/ModelHelper.cs @@ -14,27 +14,27 @@ public static IDataView GetKcHouseDataView(MLContext mlContext, string dataPath) return mlContext.Data.ReadFromTextFile(dataPath, columns: new[] { - new Data.TextLoader.Column("Id", Data.DataKind.TX, 0), - new Data.TextLoader.Column("Date", Data.DataKind.TX, 1), - new Data.TextLoader.Column("Label", Data.DataKind.R4, 2), - new Data.TextLoader.Column("BedRooms", Data.DataKind.R4, 3), - new Data.TextLoader.Column("BathRooms", Data.DataKind.R4, 4), - new Data.TextLoader.Column("SqftLiving", Data.DataKind.R4, 5), - new Data.TextLoader.Column("SqftLot", Data.DataKind.R4, 6), - new Data.TextLoader.Column("Floors", Data.DataKind.R4, 7), - new Data.TextLoader.Column("WaterFront", Data.DataKind.R4, 8), - new Data.TextLoader.Column("View", Data.DataKind.R4, 9), - new Data.TextLoader.Column("Condition", Data.DataKind.R4, 10), - new Data.TextLoader.Column("Grade", Data.DataKind.R4, 11), - new Data.TextLoader.Column("SqftAbove", Data.DataKind.R4, 12), - new Data.TextLoader.Column("SqftBasement", Data.DataKind.R4, 13), - new Data.TextLoader.Column("YearBuilt", Data.DataKind.R4, 14), - new Data.TextLoader.Column("YearRenovated", Data.DataKind.R4, 15), - new Data.TextLoader.Column("Zipcode", Data.DataKind.R4, 16), - new Data.TextLoader.Column("Lat", Data.DataKind.R4, 17), - new Data.TextLoader.Column("Long", Data.DataKind.R4, 18), - new Data.TextLoader.Column("SqftLiving15", Data.DataKind.R4, 19), - new Data.TextLoader.Column("SqftLot15", Data.DataKind.R4, 20) + new TextLoader.Column("Id", DataKind.TX, 0), + new TextLoader.Column("Date", DataKind.TX, 1), + new TextLoader.Column("Label", DataKind.R4, 2), + new TextLoader.Column("BedRooms", DataKind.R4, 3), + new TextLoader.Column("BathRooms", DataKind.R4, 4), + new TextLoader.Column("SqftLiving", DataKind.R4, 5), + new TextLoader.Column("SqftLot", DataKind.R4, 6), + new TextLoader.Column("Floors", DataKind.R4, 7), + new TextLoader.Column("WaterFront", DataKind.R4, 8), + new TextLoader.Column("View", DataKind.R4, 9), + new TextLoader.Column("Condition", DataKind.R4, 10), + new TextLoader.Column("Grade", DataKind.R4, 11), + new TextLoader.Column("SqftAbove", DataKind.R4, 12), + new TextLoader.Column("SqftBasement", DataKind.R4, 13), + new TextLoader.Column("YearBuilt", DataKind.R4, 14), + new TextLoader.Column("YearRenovated", DataKind.R4, 15), + new TextLoader.Column("Zipcode", DataKind.R4, 16), + new TextLoader.Column("Lat", DataKind.R4, 17), + new TextLoader.Column("Long", DataKind.R4, 18), + new TextLoader.Column("SqftLiving15", DataKind.R4, 19), + new TextLoader.Column("SqftLot15", DataKind.R4, 20) }, hasHeader: true, separatorChar: ',' From fd8905b2b3b23d5718eb7d2fa91738ee30484c00 Mon Sep 17 00:00:00 2001 From: Artidoro Pagnoni Date: Fri, 4 Jan 2019 16:27:39 +0100 Subject: [PATCH 4/5] entrypoint catalog --- .../Common/EntryPoints/core_ep-list.tsv | 1 - .../Common/EntryPoints/core_manifest.json | 354 ------------------ 2 files changed, 355 deletions(-) diff --git a/test/BaselineOutput/Common/EntryPoints/core_ep-list.tsv b/test/BaselineOutput/Common/EntryPoints/core_ep-list.tsv index 75f83c5030..68efd39400 100644 --- a/test/BaselineOutput/Common/EntryPoints/core_ep-list.tsv +++ b/test/BaselineOutput/Common/EntryPoints/core_ep-list.tsv @@ -2,7 +2,6 @@ Data.CustomTextLoader Import a dataset from a text file Microsoft.ML.EntryPoints Data.DataViewReference Pass dataview from memory to experiment Microsoft.ML.EntryPoints.DataViewReference ImportData Microsoft.ML.EntryPoints.DataViewReference+Input Microsoft.ML.EntryPoints.DataViewReference+Output Data.IDataViewArrayConverter Create an array variable of IDataView Microsoft.ML.EntryPoints.MacroUtils MakeArray Microsoft.ML.EntryPoints.MacroUtils+ArrayIDataViewInput Microsoft.ML.EntryPoints.MacroUtils+ArrayIDataViewOutput Data.PredictorModelArrayConverter Create an array variable of PredictorModel Microsoft.ML.EntryPoints.MacroUtils MakeArray Microsoft.ML.EntryPoints.MacroUtils+ArrayIPredictorModelInput Microsoft.ML.EntryPoints.MacroUtils+ArrayIPredictorModelOutput -Data.TextLoader Import a dataset from a text file Microsoft.ML.Legacy.EntryPoints.ImportTextData TextLoader Microsoft.ML.Legacy.EntryPoints.ImportTextData+LoaderInput Microsoft.ML.EntryPoints.ImportTextData+Output Models.AnomalyDetectionEvaluator Evaluates an anomaly detection scored dataset. Microsoft.ML.Data.Evaluate AnomalyDetection Microsoft.ML.Data.AnomalyDetectionMamlEvaluator+Arguments Microsoft.ML.EntryPoints.CommonOutputs+CommonEvaluateOutput Models.AnomalyPipelineEnsemble Combine anomaly detection models into an ensemble Microsoft.ML.EntryPoints.EnsembleCreator CreateAnomalyPipelineEnsemble Microsoft.ML.EntryPoints.EnsembleCreator+PipelineAnomalyInput Microsoft.ML.EntryPoints.CommonOutputs+AnomalyDetectionOutput Models.BinaryClassificationEvaluator Evaluates a binary classification scored dataset. Microsoft.ML.Data.Evaluate Binary Microsoft.ML.Data.BinaryClassifierMamlEvaluator+Arguments Microsoft.ML.EntryPoints.CommonOutputs+ClassificationEvaluateOutput diff --git a/test/BaselineOutput/Common/EntryPoints/core_manifest.json b/test/BaselineOutput/Common/EntryPoints/core_manifest.json index 3e95b0e09d..eead6163ee 100644 --- a/test/BaselineOutput/Common/EntryPoints/core_manifest.json +++ b/test/BaselineOutput/Common/EntryPoints/core_manifest.json @@ -119,356 +119,6 @@ } ] }, - { - "Name": "Data.TextLoader", - "Desc": "Import a dataset from a text file", - "FriendlyName": null, - "ShortName": null, - "Inputs": [ - { - "Name": "InputFile", - "Type": "FileHandle", - "Desc": "Location of the input file", - "Aliases": [ - "data" - ], - "Required": true, - "SortOrder": 1.0, - "IsNullable": false - }, - { - "Name": "Arguments", - "Type": { - "Kind": "Struct", - "Fields": [ - { - "Name": "Column", - "Type": { - "Kind": "Array", - "ItemType": { - "Kind": "Struct", - "Fields": [ - { - "Name": "Name", - "Type": "String", - "Desc": "Name of the column", - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": null - }, - { - "Name": "Type", - "Type": { - "Kind": "Enum", - "Values": [ - "I1", - "U1", - "I2", - "U2", - "I4", - "U4", - "I8", - "U8", - "R4", - "Num", - "R8", - "TX", - "Text", - "TXT", - "BL", - "Bool", - "TimeSpan", - "TS", - "DT", - "DateTime", - "DZ", - "DateTimeZone", - "UG", - "U16" - ] - }, - "Desc": "Type of the items in the column", - "Required": false, - "SortOrder": 150.0, - "IsNullable": true, - "Default": null - }, - { - "Name": "Source", - "Type": { - "Kind": "Array", - "ItemType": { - "Kind": "Struct", - "Fields": [ - { - "Name": "Min", - "Type": "Int", - "Desc": "First index in the range", - "Required": true, - "SortOrder": 150.0, - "IsNullable": false, - "Default": 0 - }, - { - "Name": "Max", - "Type": "Int", - "Desc": "Last index in the range", - "Required": false, - "SortOrder": 150.0, - "IsNullable": true, - "Default": null - }, - { - "Name": "AutoEnd", - "Type": "Bool", - "Desc": "This range extends to the end of the line, but should be a fixed number of items", - "Aliases": [ - "auto" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": false - }, - { - "Name": "VariableEnd", - "Type": "Bool", - "Desc": "This range extends to the end of the line, which can vary from line to line", - "Aliases": [ - "var" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": false - }, - { - "Name": "AllOther", - "Type": "Bool", - "Desc": "This range includes only other indices not specified", - "Aliases": [ - "other" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": false - }, - { - "Name": "ForceVector", - "Type": "Bool", - "Desc": "Force scalar columns to be treated as vectors of length one", - "Aliases": [ - "vector" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": false - } - ] - } - }, - "Desc": "Source index range(s) of the column", - "Aliases": [ - "src" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": null - }, - { - "Name": "KeyRange", - "Type": { - "Kind": "Struct", - "Fields": [ - { - "Name": "Min", - "Type": "UInt", - "Desc": "First index in the range", - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": 0 - }, - { - "Name": "Max", - "Type": "UInt", - "Desc": "Last index in the range", - "Required": false, - "SortOrder": 150.0, - "IsNullable": true, - "Default": null - }, - { - "Name": "Contiguous", - "Type": "Bool", - "Desc": "Whether the key is contiguous", - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": true - } - ] - }, - "Desc": "For a key column, this defines the range of values", - "Aliases": [ - "key" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": null - } - ] - } - }, - "Desc": "Column groups. Each group is specified as name:type:numeric-ranges, eg, col=Features:R4:1-17,26,35-40", - "Aliases": [ - "col" - ], - "Required": false, - "SortOrder": 1.0, - "IsNullable": false, - "Default": null - }, - { - "Name": "UseThreads", - "Type": "Bool", - "Desc": "Use separate parsing threads?", - "Aliases": [ - "threads" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": true - }, - { - "Name": "HeaderFile", - "Type": "String", - "Desc": "File containing a header with feature names. If specified, header defined in the data file (header+) is ignored.", - "Aliases": [ - "hf" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": null - }, - { - "Name": "MaxRows", - "Type": "Int", - "Desc": "Maximum number of rows to produce", - "Aliases": [ - "rows" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": true, - "Default": null - }, - { - "Name": "AllowQuoting", - "Type": "Bool", - "Desc": "Whether the input may include quoted values, which can contain separator characters, colons, and distinguish empty values from missing values. When true, consecutive separators denote a missing value and an empty value is denoted by \"\". When false, consecutive separators denote an empty value.", - "Aliases": [ - "quote" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": true - }, - { - "Name": "AllowSparse", - "Type": "Bool", - "Desc": "Whether the input may include sparse representations", - "Aliases": [ - "sparse" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": true - }, - { - "Name": "InputSize", - "Type": "Int", - "Desc": "Number of source columns in the text data. Default is that sparse rows contain their size information.", - "Aliases": [ - "size" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": true, - "Default": null - }, - { - "Name": "Separator", - "Type": { - "Kind": "Array", - "ItemType": "Char" - }, - "Desc": "Source column separator.", - "Aliases": [ - "sep" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": [ - "\t" - ] - }, - { - "Name": "TrimWhitespace", - "Type": "Bool", - "Desc": "Remove trailing whitespace from lines", - "Aliases": [ - "trim" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": false - }, - { - "Name": "HasHeader", - "Type": "Bool", - "Desc": "Data file has header with feature names. Header is read only if options 'hs' and 'hf' are not specified.", - "Aliases": [ - "header" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": false - } - ] - }, - "Desc": "Arguments", - "Aliases": [ - "args" - ], - "Required": true, - "SortOrder": 2.0, - "IsNullable": false - } - ], - "Outputs": [ - { - "Name": "Data", - "Type": "DataView", - "Desc": "The resulting data view" - } - ], - "InputKind": [ - "ILearningPipelineLoader" - ] - }, { "Name": "Models.AnomalyDetectionEvaluator", "Desc": "Evaluates an anomaly detection scored dataset.", @@ -29196,10 +28846,6 @@ } ] }, - { - "Kind": "ILearningPipelineLoader", - "Settings": [] - }, { "Kind": "IMulticlassClassificationOutput", "Settings": [] From 97e81828a6381d5dc5ed71d0a6b26853f304fb40 Mon Sep 17 00:00:00 2001 From: Artidoro Pagnoni Date: Fri, 4 Jan 2019 20:01:49 +0100 Subject: [PATCH 5/5] reverting changes in favor of PR 2024 which will remove the tests on housing dataset --- .../Common/EntryPoints/core_ep-list.tsv | 1 + .../Common/EntryPoints/core_manifest.json | 354 ++++++++++++++++++ .../EnvironmentExtensions.cs | 3 + .../Microsoft.ML.TestFramework.csproj | 1 + .../Microsoft.ML.TestFramework/ModelHelper.cs | 290 ++++++++++++-- .../PredictionModelTests.cs | 97 +++-- .../Scenarios/HousePricePredictionTests.cs | 14 +- 7 files changed, 685 insertions(+), 75 deletions(-) diff --git a/test/BaselineOutput/Common/EntryPoints/core_ep-list.tsv b/test/BaselineOutput/Common/EntryPoints/core_ep-list.tsv index 68efd39400..75f83c5030 100644 --- a/test/BaselineOutput/Common/EntryPoints/core_ep-list.tsv +++ b/test/BaselineOutput/Common/EntryPoints/core_ep-list.tsv @@ -2,6 +2,7 @@ Data.CustomTextLoader Import a dataset from a text file Microsoft.ML.EntryPoints Data.DataViewReference Pass dataview from memory to experiment Microsoft.ML.EntryPoints.DataViewReference ImportData Microsoft.ML.EntryPoints.DataViewReference+Input Microsoft.ML.EntryPoints.DataViewReference+Output Data.IDataViewArrayConverter Create an array variable of IDataView Microsoft.ML.EntryPoints.MacroUtils MakeArray Microsoft.ML.EntryPoints.MacroUtils+ArrayIDataViewInput Microsoft.ML.EntryPoints.MacroUtils+ArrayIDataViewOutput Data.PredictorModelArrayConverter Create an array variable of PredictorModel Microsoft.ML.EntryPoints.MacroUtils MakeArray Microsoft.ML.EntryPoints.MacroUtils+ArrayIPredictorModelInput Microsoft.ML.EntryPoints.MacroUtils+ArrayIPredictorModelOutput +Data.TextLoader Import a dataset from a text file Microsoft.ML.Legacy.EntryPoints.ImportTextData TextLoader Microsoft.ML.Legacy.EntryPoints.ImportTextData+LoaderInput Microsoft.ML.EntryPoints.ImportTextData+Output Models.AnomalyDetectionEvaluator Evaluates an anomaly detection scored dataset. Microsoft.ML.Data.Evaluate AnomalyDetection Microsoft.ML.Data.AnomalyDetectionMamlEvaluator+Arguments Microsoft.ML.EntryPoints.CommonOutputs+CommonEvaluateOutput Models.AnomalyPipelineEnsemble Combine anomaly detection models into an ensemble Microsoft.ML.EntryPoints.EnsembleCreator CreateAnomalyPipelineEnsemble Microsoft.ML.EntryPoints.EnsembleCreator+PipelineAnomalyInput Microsoft.ML.EntryPoints.CommonOutputs+AnomalyDetectionOutput Models.BinaryClassificationEvaluator Evaluates a binary classification scored dataset. Microsoft.ML.Data.Evaluate Binary Microsoft.ML.Data.BinaryClassifierMamlEvaluator+Arguments Microsoft.ML.EntryPoints.CommonOutputs+ClassificationEvaluateOutput diff --git a/test/BaselineOutput/Common/EntryPoints/core_manifest.json b/test/BaselineOutput/Common/EntryPoints/core_manifest.json index eead6163ee..3e95b0e09d 100644 --- a/test/BaselineOutput/Common/EntryPoints/core_manifest.json +++ b/test/BaselineOutput/Common/EntryPoints/core_manifest.json @@ -119,6 +119,356 @@ } ] }, + { + "Name": "Data.TextLoader", + "Desc": "Import a dataset from a text file", + "FriendlyName": null, + "ShortName": null, + "Inputs": [ + { + "Name": "InputFile", + "Type": "FileHandle", + "Desc": "Location of the input file", + "Aliases": [ + "data" + ], + "Required": true, + "SortOrder": 1.0, + "IsNullable": false + }, + { + "Name": "Arguments", + "Type": { + "Kind": "Struct", + "Fields": [ + { + "Name": "Column", + "Type": { + "Kind": "Array", + "ItemType": { + "Kind": "Struct", + "Fields": [ + { + "Name": "Name", + "Type": "String", + "Desc": "Name of the column", + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": null + }, + { + "Name": "Type", + "Type": { + "Kind": "Enum", + "Values": [ + "I1", + "U1", + "I2", + "U2", + "I4", + "U4", + "I8", + "U8", + "R4", + "Num", + "R8", + "TX", + "Text", + "TXT", + "BL", + "Bool", + "TimeSpan", + "TS", + "DT", + "DateTime", + "DZ", + "DateTimeZone", + "UG", + "U16" + ] + }, + "Desc": "Type of the items in the column", + "Required": false, + "SortOrder": 150.0, + "IsNullable": true, + "Default": null + }, + { + "Name": "Source", + "Type": { + "Kind": "Array", + "ItemType": { + "Kind": "Struct", + "Fields": [ + { + "Name": "Min", + "Type": "Int", + "Desc": "First index in the range", + "Required": true, + "SortOrder": 150.0, + "IsNullable": false, + "Default": 0 + }, + { + "Name": "Max", + "Type": "Int", + "Desc": "Last index in the range", + "Required": false, + "SortOrder": 150.0, + "IsNullable": true, + "Default": null + }, + { + "Name": "AutoEnd", + "Type": "Bool", + "Desc": "This range extends to the end of the line, but should be a fixed number of items", + "Aliases": [ + "auto" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": false + }, + { + "Name": "VariableEnd", + "Type": "Bool", + "Desc": "This range extends to the end of the line, which can vary from line to line", + "Aliases": [ + "var" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": false + }, + { + "Name": "AllOther", + "Type": "Bool", + "Desc": "This range includes only other indices not specified", + "Aliases": [ + "other" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": false + }, + { + "Name": "ForceVector", + "Type": "Bool", + "Desc": "Force scalar columns to be treated as vectors of length one", + "Aliases": [ + "vector" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": false + } + ] + } + }, + "Desc": "Source index range(s) of the column", + "Aliases": [ + "src" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": null + }, + { + "Name": "KeyRange", + "Type": { + "Kind": "Struct", + "Fields": [ + { + "Name": "Min", + "Type": "UInt", + "Desc": "First index in the range", + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": 0 + }, + { + "Name": "Max", + "Type": "UInt", + "Desc": "Last index in the range", + "Required": false, + "SortOrder": 150.0, + "IsNullable": true, + "Default": null + }, + { + "Name": "Contiguous", + "Type": "Bool", + "Desc": "Whether the key is contiguous", + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": true + } + ] + }, + "Desc": "For a key column, this defines the range of values", + "Aliases": [ + "key" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": null + } + ] + } + }, + "Desc": "Column groups. Each group is specified as name:type:numeric-ranges, eg, col=Features:R4:1-17,26,35-40", + "Aliases": [ + "col" + ], + "Required": false, + "SortOrder": 1.0, + "IsNullable": false, + "Default": null + }, + { + "Name": "UseThreads", + "Type": "Bool", + "Desc": "Use separate parsing threads?", + "Aliases": [ + "threads" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": true + }, + { + "Name": "HeaderFile", + "Type": "String", + "Desc": "File containing a header with feature names. If specified, header defined in the data file (header+) is ignored.", + "Aliases": [ + "hf" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": null + }, + { + "Name": "MaxRows", + "Type": "Int", + "Desc": "Maximum number of rows to produce", + "Aliases": [ + "rows" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": true, + "Default": null + }, + { + "Name": "AllowQuoting", + "Type": "Bool", + "Desc": "Whether the input may include quoted values, which can contain separator characters, colons, and distinguish empty values from missing values. When true, consecutive separators denote a missing value and an empty value is denoted by \"\". When false, consecutive separators denote an empty value.", + "Aliases": [ + "quote" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": true + }, + { + "Name": "AllowSparse", + "Type": "Bool", + "Desc": "Whether the input may include sparse representations", + "Aliases": [ + "sparse" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": true + }, + { + "Name": "InputSize", + "Type": "Int", + "Desc": "Number of source columns in the text data. Default is that sparse rows contain their size information.", + "Aliases": [ + "size" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": true, + "Default": null + }, + { + "Name": "Separator", + "Type": { + "Kind": "Array", + "ItemType": "Char" + }, + "Desc": "Source column separator.", + "Aliases": [ + "sep" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": [ + "\t" + ] + }, + { + "Name": "TrimWhitespace", + "Type": "Bool", + "Desc": "Remove trailing whitespace from lines", + "Aliases": [ + "trim" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": false + }, + { + "Name": "HasHeader", + "Type": "Bool", + "Desc": "Data file has header with feature names. Header is read only if options 'hs' and 'hf' are not specified.", + "Aliases": [ + "header" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": false + } + ] + }, + "Desc": "Arguments", + "Aliases": [ + "args" + ], + "Required": true, + "SortOrder": 2.0, + "IsNullable": false + } + ], + "Outputs": [ + { + "Name": "Data", + "Type": "DataView", + "Desc": "The resulting data view" + } + ], + "InputKind": [ + "ILearningPipelineLoader" + ] + }, { "Name": "Models.AnomalyDetectionEvaluator", "Desc": "Evaluates an anomaly detection scored dataset.", @@ -28846,6 +29196,10 @@ } ] }, + { + "Kind": "ILearningPipelineLoader", + "Settings": [] + }, { "Kind": "IMulticlassClassificationOutput", "Settings": [] diff --git a/test/Microsoft.ML.TestFramework/EnvironmentExtensions.cs b/test/Microsoft.ML.TestFramework/EnvironmentExtensions.cs index 1b8cbdcb60..0302b6c2a4 100644 --- a/test/Microsoft.ML.TestFramework/EnvironmentExtensions.cs +++ b/test/Microsoft.ML.TestFramework/EnvironmentExtensions.cs @@ -25,6 +25,9 @@ public static TEnvironment AddStandardComponents(this TEnvironment env.ComponentCatalog.RegisterAssembly(typeof(EnsembleModelParameters).Assembly); // ML.Ensemble env.ComponentCatalog.RegisterAssembly(typeof(KMeansModelParameters).Assembly); // ML.KMeansClustering env.ComponentCatalog.RegisterAssembly(typeof(PcaModelParameters).Assembly); // ML.PCA +#pragma warning disable 612 + env.ComponentCatalog.RegisterAssembly(typeof(Experiment).Assembly); // ML.Legacy +#pragma warning restore 612 env.ComponentCatalog.RegisterAssembly(typeof(CVSplit).Assembly); // ML.EntryPoints return env; } diff --git a/test/Microsoft.ML.TestFramework/Microsoft.ML.TestFramework.csproj b/test/Microsoft.ML.TestFramework/Microsoft.ML.TestFramework.csproj index 3854fc5f67..62ae1cea6a 100644 --- a/test/Microsoft.ML.TestFramework/Microsoft.ML.TestFramework.csproj +++ b/test/Microsoft.ML.TestFramework/Microsoft.ML.TestFramework.csproj @@ -15,6 +15,7 @@ + diff --git a/test/Microsoft.ML.TestFramework/ModelHelper.cs b/test/Microsoft.ML.TestFramework/ModelHelper.cs index 953b5cab70..3f6811185e 100644 --- a/test/Microsoft.ML.TestFramework/ModelHelper.cs +++ b/test/Microsoft.ML.TestFramework/ModelHelper.cs @@ -2,53 +2,281 @@ // The .NET Foundation licenses this file to you under the MIT license. // See the LICENSE file in the project root for more information. -using Microsoft.ML.Core.Data; +using System.IO; using Microsoft.ML.Data; +using Microsoft.ML.EntryPoints; +using Microsoft.ML.Legacy.Data; namespace Microsoft.ML.TestFramework { +#pragma warning disable 612, 618 public static class ModelHelper { - public static IDataView GetKcHouseDataView(MLContext mlContext, string dataPath) + private static MLContext s_environment = new MLContext(seed: 1); + private static TransformModel s_housePriceModel; + + public static void WriteKcHousePriceModel(string dataPath, string outputModelPath) + { + if (File.Exists(outputModelPath)) + { + File.Delete(outputModelPath); + } + + using (var saveStream = File.OpenWrite(outputModelPath)) + { + WriteKcHousePriceModel(dataPath, saveStream); + } + } + + public static void WriteKcHousePriceModel(string dataPath, Stream stream) { - return mlContext.Data.ReadFromTextFile(dataPath, + if (s_housePriceModel == null) + { + s_housePriceModel = CreateKcHousePricePredictorModel(dataPath); + } + s_housePriceModel.Save(s_environment, stream); + } + + public static IDataView GetKcHouseDataView(string dataPath) + { + return s_environment.Data.ReadFromTextFile(dataPath, columns: new[] { - new TextLoader.Column("Id", DataKind.TX, 0), - new TextLoader.Column("Date", DataKind.TX, 1), - new TextLoader.Column("Label", DataKind.R4, 2), - new TextLoader.Column("BedRooms", DataKind.R4, 3), - new TextLoader.Column("BathRooms", DataKind.R4, 4), - new TextLoader.Column("SqftLiving", DataKind.R4, 5), - new TextLoader.Column("SqftLot", DataKind.R4, 6), - new TextLoader.Column("Floors", DataKind.R4, 7), - new TextLoader.Column("WaterFront", DataKind.R4, 8), - new TextLoader.Column("View", DataKind.R4, 9), - new TextLoader.Column("Condition", DataKind.R4, 10), - new TextLoader.Column("Grade", DataKind.R4, 11), - new TextLoader.Column("SqftAbove", DataKind.R4, 12), - new TextLoader.Column("SqftBasement", DataKind.R4, 13), - new TextLoader.Column("YearBuilt", DataKind.R4, 14), - new TextLoader.Column("YearRenovated", DataKind.R4, 15), - new TextLoader.Column("Zipcode", DataKind.R4, 16), - new TextLoader.Column("Lat", DataKind.R4, 17), - new TextLoader.Column("Long", DataKind.R4, 18), - new TextLoader.Column("SqftLiving15", DataKind.R4, 19), - new TextLoader.Column("SqftLot15", DataKind.R4, 20) + new Data.TextLoader.Column("Id", Data.DataKind.TX, 0), + new Data.TextLoader.Column("Date", Data.DataKind.TX, 1), + new Data.TextLoader.Column("Label", Data.DataKind.R4, 2), + new Data.TextLoader.Column("BedRooms", Data.DataKind.R4, 3), + new Data.TextLoader.Column("BathRooms", Data.DataKind.R4, 4), + new Data.TextLoader.Column("SqftLiving", Data.DataKind.R4, 5), + new Data.TextLoader.Column("SqftLot", Data.DataKind.R4, 6), + new Data.TextLoader.Column("Floors", Data.DataKind.R4, 7), + new Data.TextLoader.Column("WaterFront", Data.DataKind.R4, 8), + new Data.TextLoader.Column("View", Data.DataKind.R4, 9), + new Data.TextLoader.Column("Condition", Data.DataKind.R4, 10), + new Data.TextLoader.Column("Grade", Data.DataKind.R4, 11), + new Data.TextLoader.Column("SqftAbove", Data.DataKind.R4, 12), + new Data.TextLoader.Column("SqftBasement", Data.DataKind.R4, 13), + new Data.TextLoader.Column("YearBuilt", Data.DataKind.R4, 14), + new Data.TextLoader.Column("YearRenovated", Data.DataKind.R4, 15), + new Data.TextLoader.Column("Zipcode", Data.DataKind.R4, 16), + new Data.TextLoader.Column("Lat", Data.DataKind.R4, 17), + new Data.TextLoader.Column("Long", Data.DataKind.R4, 18), + new Data.TextLoader.Column("SqftLiving15", Data.DataKind.R4, 19), + new Data.TextLoader.Column("SqftLot15", Data.DataKind.R4, 20) }, hasHeader: true, separatorChar: ',' ); } - public static IEstimator GetKcHousePipeline(MLContext mlContext) + private static TransformModel CreateKcHousePricePredictorModel(string dataPath) { - // Define pipeline. - return mlContext.Transforms.Concatenate("NumericalFeatures", "SqftLiving", "SqftLot", "SqftAbove", "SqftBasement", "Lat", "Long", "SqftLiving15", "SqftLot15") - .Append(mlContext.Transforms.Concatenate("CategoryFeatures", "Bedrooms", "Bathrooms", "Floors", "Waterfront", "View", "Condition", "Grade", "YearBuilt", "YearRenovated", "Zipcode")) - .Append(mlContext.Transforms.Categorical.OneHotEncoding("CategoryFeatures")) - .Append(mlContext.Transforms.Concatenate("Features", "NumericalFeatures", "CategoryFeatures")) - .Append(mlContext.Regression.Trainers.StochasticDualCoordinateAscent(advancedSettings: s => { s.NumThreads = 1; })); + Experiment experiment = s_environment.CreateExperiment(); + var importData = new Legacy.Data.TextLoader(dataPath) + { + Arguments = new TextLoaderArguments + { + Separator = new[] { ',' }, + HasHeader = true, + Column = new[] + { + new TextLoaderColumn() + { + Name = "Id", + Source = new [] { new TextLoaderRange(0) }, + Type = Legacy.Data.DataKind.Text + }, + + new TextLoaderColumn() + { + Name = "Date", + Source = new [] { new TextLoaderRange(1) }, + Type = Legacy.Data.DataKind.Text + }, + + new TextLoaderColumn() + { + Name = "Label", + Source = new [] { new TextLoaderRange(2) }, + Type = Legacy.Data.DataKind.Num + }, + + new TextLoaderColumn() + { + Name = "Bedrooms", + Source = new [] { new TextLoaderRange(3) }, + Type = Legacy.Data.DataKind.Num + }, + + new TextLoaderColumn() + { + Name = "Bathrooms", + Source = new [] { new TextLoaderRange(4) }, + Type = Legacy.Data.DataKind.Num + }, + + new TextLoaderColumn() + { + Name = "SqftLiving", + Source = new [] { new TextLoaderRange(5) }, + Type = Legacy.Data.DataKind.Num + }, + + new TextLoaderColumn() + { + Name = "SqftLot", + Source = new [] { new TextLoaderRange(6) }, + Type = Legacy.Data.DataKind.Num + }, + + new TextLoaderColumn() + { + Name = "Floors", + Source = new [] { new TextLoaderRange(7) }, + Type = Legacy.Data.DataKind.Num + }, + + new TextLoaderColumn() + { + Name = "Waterfront", + Source = new [] { new TextLoaderRange(8) }, + Type = Legacy.Data.DataKind.Num + }, + + new TextLoaderColumn() + { + Name = "View", + Source = new [] { new TextLoaderRange(9) }, + Type = Legacy.Data.DataKind.Num + }, + + new TextLoaderColumn() + { + Name = "Condition", + Source = new [] { new TextLoaderRange(10) }, + Type = Legacy.Data.DataKind.Num + }, + + new TextLoaderColumn() + { + Name = "Grade", + Source = new [] { new TextLoaderRange(11) }, + Type = Legacy.Data.DataKind.Num + }, + + new TextLoaderColumn() + { + Name = "SqftAbove", + Source = new [] { new TextLoaderRange(12) }, + Type = Legacy.Data.DataKind.Num + }, + + new TextLoaderColumn() + { + Name = "SqftBasement", + Source = new [] { new TextLoaderRange(13) }, + Type = Legacy.Data.DataKind.Num + }, + + new TextLoaderColumn() + { + Name = "YearBuilt", + Source = new [] { new TextLoaderRange(14) }, + Type = Legacy.Data.DataKind.Num + }, + + new TextLoaderColumn() + { + Name = "YearRenovated", + Source = new [] { new TextLoaderRange(15) }, + Type = Legacy.Data.DataKind.Num + }, + + new TextLoaderColumn() + { + Name = "Zipcode", + Source = new [] { new TextLoaderRange(16) }, + Type = Legacy.Data.DataKind.Num + }, + + new TextLoaderColumn() + { + Name = "Lat", + Source = new [] { new TextLoaderRange(17) }, + Type = Legacy.Data.DataKind.Num + }, + + new TextLoaderColumn() + { + Name = "Long", + Source = new [] { new TextLoaderRange(18) }, + Type = Legacy.Data.DataKind.Num + }, + + new TextLoaderColumn() + { + Name = "SqftLiving15", + Source = new [] { new TextLoaderRange(19) }, + Type = Legacy.Data.DataKind.Num + }, + + new TextLoaderColumn() + { + Name = "SqftLot15", + Source = new [] { new TextLoaderRange(20) }, + Type = Legacy.Data.DataKind.Num + }, + } + } + + //new Data.CustomTextLoader(); + // importData.CustomSchema = dataSchema; + // + }; + + Legacy.Data.TextLoader.Output imported = experiment.Add(importData); + var numericalConcatenate = new Legacy.Transforms.ColumnConcatenator(); + numericalConcatenate.Data = imported.Data; + numericalConcatenate.AddColumn("NumericalFeatures", "SqftLiving", "SqftLot", "SqftAbove", "SqftBasement", "Lat", "Long", "SqftLiving15", "SqftLot15"); + Legacy.Transforms.ColumnConcatenator.Output numericalConcatenated = experiment.Add(numericalConcatenate); + + var categoryConcatenate = new Legacy.Transforms.ColumnConcatenator(); + categoryConcatenate.Data = numericalConcatenated.OutputData; + categoryConcatenate.AddColumn("CategoryFeatures", "Bedrooms", "Bathrooms", "Floors", "Waterfront", "View", "Condition", "Grade", "YearBuilt", "YearRenovated", "Zipcode"); + Legacy.Transforms.ColumnConcatenator.Output categoryConcatenated = experiment.Add(categoryConcatenate); + + var categorize = new Legacy.Transforms.CategoricalOneHotVectorizer(); + categorize.AddColumn("CategoryFeatures"); + categorize.Data = categoryConcatenated.OutputData; + Legacy.Transforms.CategoricalOneHotVectorizer.Output categorized = experiment.Add(categorize); + + var featuresConcatenate = new Legacy.Transforms.ColumnConcatenator(); + featuresConcatenate.Data = categorized.OutputData; + featuresConcatenate.AddColumn("Features", "NumericalFeatures", "CategoryFeatures"); + Legacy.Transforms.ColumnConcatenator.Output featuresConcatenated = experiment.Add(featuresConcatenate); + + var learner = new Legacy.Trainers.StochasticDualCoordinateAscentRegressor(); + learner.TrainingData = featuresConcatenated.OutputData; + learner.NumThreads = 1; + Legacy.Trainers.StochasticDualCoordinateAscentRegressor.Output learnerOutput = experiment.Add(learner); + + var combineModels = new Legacy.Transforms.ManyHeterogeneousModelCombiner(); + combineModels.TransformModels = new ArrayVar(numericalConcatenated.Model, categoryConcatenated.Model, categorized.Model, featuresConcatenated.Model); + combineModels.PredictorModel = learnerOutput.PredictorModel; + Legacy.Transforms.ManyHeterogeneousModelCombiner.Output combinedModels = experiment.Add(combineModels); + + var scorer = new Legacy.Transforms.Scorer + { + PredictorModel = combinedModels.PredictorModel + }; + + var scorerOutput = experiment.Add(scorer); + experiment.Compile(); + experiment.SetInput(importData.InputFile, new SimpleFileHandle(s_environment, dataPath, false, false)); + experiment.Run(); + + return experiment.GetOutput(scorerOutput.ScoringTransform); } } +#pragma warning restore 612, 618 } diff --git a/test/Microsoft.ML.Tests/PredictionModelTests.cs b/test/Microsoft.ML.Tests/PredictionModelTests.cs index 4aaa194605..7d8d565445 100644 --- a/test/Microsoft.ML.Tests/PredictionModelTests.cs +++ b/test/Microsoft.ML.Tests/PredictionModelTests.cs @@ -2,6 +2,8 @@ // The .NET Foundation licenses this file to you under the MIT license. // See the LICENSE file in the project root for more information. +using System.IO; +using System.Threading.Tasks; using Microsoft.ML.Data; using Microsoft.ML.TestFramework; using Xunit; @@ -9,6 +11,7 @@ namespace Microsoft.ML.EntryPoints.Tests { +#pragma warning disable 612 public class PredictionModelTests : BaseTestClass { public class HousePriceData @@ -40,48 +43,67 @@ public class HousePricePrediction } [Fact(Skip = "Missing data set. See https://github.com/dotnet/machinelearning/issues/3")] - public void ReadStrongTypeModelFromStream() + public async Task ReadStrongTypeModelFromStream() { - var mlContext = new MLContext(seed: 1); - var data = ModelHelper.GetKcHouseDataView(mlContext, GetDataPath("kc_house_data.csv")); - var pipeline = ModelHelper.GetKcHousePipeline(mlContext); - var model = pipeline.Fit(data); + using (var memoryStream = new MemoryStream()) + { + ModelHelper.WriteKcHousePriceModel(GetDataPath("kc_house_data.csv"), memoryStream); + memoryStream.Position = 0; - var engine = model.CreatePredictionEngine(mlContext); + var model = await Legacy.PredictionModel.ReadAsync(memoryStream); - HousePricePrediction prediction = engine.Predict(new HousePriceData() - { - Bedrooms = 3, - Bathrooms = 1.75f, - SqftLiving = 2450, - SqftLot = 2691, - Floors = 2, - Waterfront = 0, - View = 0, - Condition = 3, - Grade = 8, - SqftAbove = 1750, - SqftBasement = 700, - YearBuilt = 1915, - YearRenovated = 0, - Zipcode = 98119, - Lat = 47.6386f, - Long = -122.36f, - SqftLiving15 = 1760, - SqftLot15 = 3573 - }); + HousePricePrediction prediction = model.Predict(new HousePriceData() + { + Bedrooms = 3, + Bathrooms = 1.75f, + SqftLiving = 2450, + SqftLot = 2691, + Floors = 2, + Waterfront = 0, + View = 0, + Condition = 3, + Grade = 8, + SqftAbove = 1750, + SqftBasement = 700, + YearBuilt = 1915, + YearRenovated = 0, + Zipcode = 98119, + Lat = 47.6386f, + Long = -122.36f, + SqftLiving15 = 1760, + SqftLot15 = 3573 + }); - Assert.InRange(prediction.Price, 790_000, 850_000); + Assert.InRange(prediction.Price, 790_000, 850_000); - var dataView = model.Transform(data); - dataView.Schema.TryGetColumnIndex("Score", out int scoreColumn); - using (var cursor = dataView.GetRowCursor((int col) => col == scoreColumn)) - { - var scoreGetter = cursor.GetGetter(scoreColumn); - float score = 0; - cursor.MoveNext(); - scoreGetter(ref score); - Assert.InRange(score, 100_000, 200_000); + + var dataView = model.Predict(ModelHelper.GetKcHouseDataView(GetDataPath("kc_house_data.csv"))); + dataView.Schema.TryGetColumnIndex("Score", out int scoreColumn); + using (var cursor = dataView.GetRowCursor((int col) => col == scoreColumn)) + { + var scoreGetter = cursor.GetGetter(scoreColumn); + float score = 0; + cursor.MoveNext(); + scoreGetter(ref score); + Assert.InRange(score, 100_000, 200_000); + } + + Legacy.PredictionModel nonGenericModel; + using (var anotherStream = new MemoryStream()) + { + await model.WriteAsync(anotherStream); + nonGenericModel = await Legacy.PredictionModel.ReadAsync(anotherStream); + } + + dataView = nonGenericModel.Predict(ModelHelper.GetKcHouseDataView(GetDataPath("kc_house_data.csv"))); + using (var cursor = dataView.GetRowCursor((int col) => col == scoreColumn)) + { + var scoreGetter = cursor.GetGetter(scoreColumn); + float score = 0; + cursor.MoveNext(); + scoreGetter(ref score); + Assert.InRange(score, 100_000, 200_000); + } } } @@ -90,4 +112,5 @@ public PredictionModelTests(ITestOutputHelper output) { } } +#pragma warning restore 612 } diff --git a/test/Microsoft.ML.Tests/Scenarios/HousePricePredictionTests.cs b/test/Microsoft.ML.Tests/Scenarios/HousePricePredictionTests.cs index bc88045562..ef8f704f4d 100644 --- a/test/Microsoft.ML.Tests/Scenarios/HousePricePredictionTests.cs +++ b/test/Microsoft.ML.Tests/Scenarios/HousePricePredictionTests.cs @@ -16,17 +16,16 @@ A real-estate firm Contoso wants to add a house price prediction to their ASP.NE The application will let users submit information about their house, and see a price they could expect if they put the house for sale. Because real estate transaction data is public, Contoso has historical data they intend to use to train Machine Learning prediction engine. */ +#pragma warning disable 612 [Fact(Skip = "Missing data set. See https://github.com/dotnet/machinelearning/issues/3")] - public void PredictHousePriceModelTest() + public async void PredictHousePriceModelTest() { - var mlContext = new MLContext(seed: 1); - var data = ModelHelper.GetKcHouseDataView(mlContext, GetDataPath("kc_house_data.csv")); - var pipeline = ModelHelper.GetKcHousePipeline(mlContext); - var model = pipeline.Fit(data); + string modelFilePath = GetOutputPath("PredictHousePriceModelTest.zip"); + ModelHelper.WriteKcHousePriceModel(GetDataPath("kc_house_data.csv"), modelFilePath); - var engine = model.CreatePredictionEngine(mlContext); + var model = await Legacy.PredictionModel.ReadAsync(modelFilePath); - HousePricePrediction prediction = engine.Predict(new HousePriceData() + HousePricePrediction prediction = model.Predict(new HousePriceData() { Bedrooms = 3, Bathrooms = 2, @@ -50,6 +49,7 @@ public void PredictHousePriceModelTest() Assert.InRange(prediction.Price, 260_000, 330_000); } +#pragma warning restore 612 public class HousePriceData {