From f699cb0a0fe212e12bcfbbef2b1d4e2d50de4fb0 Mon Sep 17 00:00:00 2001 From: Senja Filipi Date: Tue, 11 Dec 2018 15:59:22 -0800 Subject: [PATCH 01/11] moving reflection-based read out of the legacy project. --- .../DataLoadSave/Text/TextLoader.cs | 19 ++- .../Text/TextLoaderSaverCatalog.cs | 118 +++++++++++++++++- 2 files changed, 127 insertions(+), 10 deletions(-) diff --git a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs index a3d5260531..2396343602 100644 --- a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs +++ b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs @@ -343,10 +343,10 @@ public class ArgumentsCore " missing value and an empty value is denoted by \"\". When false, consecutive separators" + " denote an empty value.", ShortName = "quote")] - public bool AllowQuoting = true; + public bool AllowQuoting = DefaultArguments.AllowQuoting; [Argument(ArgumentType.AtMostOnce, HelpText = "Whether the input may include sparse representations", ShortName = "sparse")] - public bool AllowSparse = true; + public bool AllowSparse = DefaultArguments.AllowSparse; [Argument(ArgumentType.AtMostOnce, HelpText = "Number of source columns in the text data. Default is that sparse rows contain their size information.", @@ -354,17 +354,17 @@ public class ArgumentsCore public int? InputSize; [Argument(ArgumentType.AtMostOnce, Visibility = ArgumentAttribute.VisibilityType.CmdLineOnly, HelpText = "Source column separator. 
Options: tab, space, comma, single character", ShortName = "sep")] - public string Separator = "tab"; + public string Separator = "tab"; //DefaultArguments.Separator [Argument(ArgumentType.AtMostOnce, Name = nameof(Separator), Visibility = ArgumentAttribute.VisibilityType.EntryPointsOnly, HelpText = "Source column separator.", ShortName = "sep")] - public char[] SeparatorChars = new[] { '\t' }; + public char[] SeparatorChars = new[] { DefaultArguments.Separator }; [Argument(ArgumentType.Multiple, HelpText = "Column groups. Each group is specified as name:type:numeric-ranges, eg, col=Features:R4:1-17,26,35-40", ShortName = "col", SortOrder = 1)] public Column[] Column; [Argument(ArgumentType.AtMostOnce, HelpText = "Remove trailing whitespace from lines", ShortName = "trim")] - public bool TrimWhitespace; + public bool TrimWhitespace = DefaultArguments.TrimWhitespace; [Argument(ArgumentType.AtMostOnce, ShortName = "header", HelpText = "Data file has header with feature names. Header is read only if options 'hs' and 'hf' are not specified.")] @@ -392,6 +392,15 @@ public sealed class Arguments : ArgumentsCore public long? MaxRows; } + internal static class DefaultArguments + { + internal const bool AllowQuoting = true; + internal const bool AllowSparse = true; + internal const char Separator = '\t'; + internal const bool HasHeader = false; + internal const bool TrimWhitespace = false; + } + /// /// Used as an input column range. /// A variable length segment (extending to the end of the input line) is represented by Lim == SrcLim. diff --git a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderSaverCatalog.cs b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderSaverCatalog.cs index b4cf936a38..f84d78c1d7 100644 --- a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderSaverCatalog.cs +++ b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderSaverCatalog.cs @@ -2,12 +2,16 @@ // The .NET Foundation licenses this file to you under the MIT license. 
// See the LICENSE file in the project root for more information. +using Microsoft.ML.Data; using Microsoft.ML.Runtime; using Microsoft.ML.Runtime.Data; using Microsoft.ML.Runtime.Data.IO; using System; +using System.Collections.Generic; using System.IO; -using static Microsoft.ML.Runtime.Data.TextLoader; +using System.Linq; +using System.Reflection; +using System.Text.RegularExpressions; namespace Microsoft.ML { @@ -22,7 +26,7 @@ public static class TextLoaderSaverCatalog /// The character used as separator between data points in a row. By default the tab character is used as separator. /// The optional location of a data sample. public static TextLoader CreateTextReader(this DataOperations catalog, - Column[] columns, bool hasHeader = false, char separatorChar = '\t', IMultiStreamSource dataSample = null) + TextLoader.Column[] columns, bool hasHeader = false, char separatorChar = '\t', IMultiStreamSource dataSample = null) => new TextLoader(CatalogUtils.GetEnvironment(catalog), columns, hasHeader, separatorChar, dataSample); /// @@ -31,9 +35,113 @@ public static TextLoader CreateTextReader(this DataOperations catalog, /// The catalog. /// Defines the settings of the load operation. /// Allows to expose items that can be used for reading. - public static TextLoader CreateTextReader(this DataOperations catalog, Arguments args, IMultiStreamSource dataSample = null) + public static TextLoader CreateTextReader(this DataOperations catalog, TextLoader.Arguments args, IMultiStreamSource dataSample = null) => new TextLoader(CatalogUtils.GetEnvironment(catalog), args, dataSample); + /// + /// Create a text reader . + /// + /// The catalog. 
+ /// + /// + /// + /// + /// + public static TextLoader CreateTextReader(this DataOperations catalog, + bool hasHeader = TextLoader.DefaultArguments.HasHeader, + char separator = TextLoader.DefaultArguments.Separator, + bool allowQuotedStrings = TextLoader.DefaultArguments.AllowQuoting, + bool supportSparse = TextLoader.DefaultArguments.AllowSparse, + bool trimWhitespace = TextLoader.DefaultArguments.TrimWhitespace) + { + var userType = typeof(TInput); + + var fieldInfos = userType.GetFields(BindingFlags.Public | BindingFlags.Instance); + + var propertyInfos = + userType + .GetProperties(BindingFlags.Public | BindingFlags.Instance) + .Where(x => x.CanRead && x.CanWrite && x.GetGetMethod() != null && x.GetSetMethod() != null && x.GetIndexParameters().Length == 0); + + var memberInfos = (fieldInfos as IEnumerable).Concat(propertyInfos).ToArray(); + + var columns = new TextLoader.Column[memberInfos.Length]; + + for (int index = 0; index < memberInfos.Length; index++) + { + var memberInfo = memberInfos[index]; + var mappingAttr = memberInfo.GetCustomAttribute(); + var mptr = memberInfo.GetCustomAttributes(); + + if (mappingAttr == null) + throw Contracts.Except($"Field or property {memberInfo.Name} is missing ColumnAttribute"); + + if (Regex.Match(mappingAttr.Ordinal, @"[^(0-9,\*\-~)]+").Success) + throw Contracts.Except($"{mappingAttr.Ordinal} contains invalid characters. " + + $"Valid characters are 0-9, *, - and ~"); + + var mappingNameAttr = memberInfo.GetCustomAttribute(); + var name = mappingAttr.Name ?? mappingNameAttr?.Name ?? 
memberInfo.Name; + + TextLoader.Range[] sources; + if (!TextLoader.Column.TryParseSourceEx(mappingAttr.Ordinal, out sources)) + throw Contracts.Except($"{mappingAttr.Ordinal} could not be parsed."); + + Contracts.Assert(sources != null); + + var column = new TextLoader.Column(); + column.Name = name; + column.Source = new TextLoader.Range[sources.Length]; + DataKind dk; + switch (memberInfo) + { + case FieldInfo field: + if (!DataKindExtensions.TryGetDataKind(field.FieldType.IsArray ? field.FieldType.GetElementType() : field.FieldType, out dk)) + throw Contracts.Except($"Field {name} is of unsupported type."); + + break; + + case PropertyInfo property: + if (!DataKindExtensions.TryGetDataKind(property.PropertyType.IsArray ? property.PropertyType.GetElementType() : property.PropertyType, out dk)) + throw Contracts.Except($"Property {name} is of unsupported type."); + break; + + default: + Contracts.Assert(false); + throw Contracts.ExceptNotSupp("Expected a FieldInfo or a PropertyInfo"); + } + + column.Type = dk; + + for (int indexLocal = 0; indexLocal < column.Source.Length; indexLocal++) + { + column.Source[indexLocal] = new TextLoader.Range + { + AllOther = sources[indexLocal].AllOther, + AutoEnd = sources[indexLocal].AutoEnd, + ForceVector = sources[indexLocal].ForceVector, + VariableEnd = sources[indexLocal].VariableEnd, + Max = sources[indexLocal].Max, + Min = sources[indexLocal].Min + }; + } + + columns[index] = column; + } + + TextLoader.Arguments args = new TextLoader.Arguments + { + HasHeader = hasHeader, + SeparatorChars = new[] { separator }, + AllowQuoting = allowQuotedStrings, + AllowSparse = supportSparse, + TrimWhitespace = trimWhitespace, + Column = columns + }; + + return new TextLoader(CatalogUtils.GetEnvironment(catalog), args); + } + /// /// Read a data view from a text file using . /// @@ -44,7 +152,7 @@ public static TextLoader CreateTextReader(this DataOperations catalog, Arguments /// The path to the file. /// The data view. 
public static IDataView ReadFromTextFile(this DataOperations catalog, - string path, Column[] columns, bool hasHeader = false, char separatorChar = '\t') + string path, TextLoader.Column[] columns, bool hasHeader = false, char separatorChar = '\t') { Contracts.CheckNonEmpty(path, nameof(path)); @@ -62,7 +170,7 @@ public static IDataView ReadFromTextFile(this DataOperations catalog, /// The catalog. /// Specifies a file from which to read. /// Defines the settings of the load operation. - public static IDataView ReadFromTextFile(this DataOperations catalog, string path, Arguments args = null) + public static IDataView ReadFromTextFile(this DataOperations catalog, string path, TextLoader.Arguments args = null) { Contracts.CheckNonEmpty(path, nameof(path)); From 3931d1330c8c8bf2866fb6d54239a7d3554cc04e Mon Sep 17 00:00:00 2001 From: Senja Filipi Date: Thu, 13 Dec 2018 15:54:27 -0800 Subject: [PATCH 02/11] adding more members to LoadColumnAttribute --- .../Data/SchemaDefinition.cs | 56 ++++++++++++++++--- .../Text/TextLoaderSaverCatalog.cs | 10 +--- src/Microsoft.ML.Legacy/Data/TextLoader.cs | 12 ++-- .../PredictionEngineBench.cs | 4 +- ...sticDualCoordinateAscentClassifierBench.cs | 10 ++-- test/Microsoft.ML.FSharp.Tests/SmokeTests.fs | 12 ++-- .../CollectionDataSourceTests.cs | 24 ++++---- .../LearningPipelineTests.cs | 2 +- .../Scenarios/Api/ApiScenariosTests.cs | 13 ++++- .../Api/Estimators/CrossValidation.cs | 3 +- .../Estimators/DecomposableTrainAndPredict.cs | 2 +- .../Scenarios/ClusteringTests.cs | 8 +-- .../Scenarios/HousePricePredictionTests.cs | 42 +++++++------- .../Scenarios/IrisPlantClassificationTests.cs | 10 ++-- ...PlantClassificationWithStringLabelTests.cs | 10 ++-- .../PipelineApi/PipelineApiScenarioTests.cs | 14 ++--- .../Scenarios/SentimentPredictionTests.cs | 4 +- .../Scenarios/TensorflowTests.cs | 8 +-- .../TensorflowTests.cs | 4 +- test/Microsoft.ML.Tests/TextLoaderTests.cs | 30 +++++----- 20 files changed, 163 insertions(+), 115 deletions(-) 
diff --git a/src/Microsoft.ML.Data/Data/SchemaDefinition.cs b/src/Microsoft.ML.Data/Data/SchemaDefinition.cs index d0f72b42a4..44644cfba5 100644 --- a/src/Microsoft.ML.Data/Data/SchemaDefinition.cs +++ b/src/Microsoft.ML.Data/Data/SchemaDefinition.cs @@ -68,12 +68,23 @@ public VectorTypeAttribute(params int[] dims) /// column encapsulates. /// [AttributeUsage(AttributeTargets.Field | AttributeTargets.Property, AllowMultiple = false, Inherited = true)] - public sealed class ColumnAttribute : Attribute + public sealed class LoadColumnAttribute : Attribute { - public ColumnAttribute(string ordinal, string name = null) + public LoadColumnAttribute(string range = null, + string name = null, + bool loadInverseRange = false, + string start=null, + string end = null, + int[] columnsIndexes = null, + int[] inverseColumnsIndexes = null) { + Range = range; Name = name; - Ordinal = ordinal; + LoadInverseRange = loadInverseRange; + Start = start; + End = end; + ColumnsIndexes = columnsIndexes; + InverseColumnsIndexes = inverseColumnsIndexes; } /// @@ -82,16 +93,47 @@ public ColumnAttribute(string ordinal, string name = null) public string Name { get; } /// - /// Contains positions of indices of source columns in the form - /// of ranges. Examples of range: if we want to include just column + /// Contains the position of the column in the files. + /// It can be a single column index, or a range. + /// Examples of range: if we want to include just column /// with index 1 we can write the range as 1, if we want to include /// columns 1 to 10 then we can write the range as 1-10 and we want to include all the /// columns from column with index 1 until end then we can write 1-*. /// /// This takes sequence of ranges that are comma seperated, example: - /// 1,2-5,10-* + /// 1,2-5,10-*. + /// All the other parameters are alternatives to this one. 
+ /// + public string Range { get; } + + /// + /// If this is set to true, the columns defined in the range will be + /// excluded from loading, and all the other ones will be the ones loaded. + /// + public bool LoadInverseRange { get; } + + public string Start { get; } + public string End { get; } + public int[] ColumnsIndexes { get; } + public int[] InverseColumnsIndexes { get; } + } + + /// + /// Describes column information such as name and the source columns indicies that this + /// column encapsulates. + /// + [AttributeUsage(AttributeTargets.Field | AttributeTargets.Property, AllowMultiple = false, Inherited = true)] + public sealed class ColumnAttribute : Attribute + { + public ColumnAttribute(string ordinal, string name = null) + { + Name = name; + } + + /// + /// Column name. /// - public string Ordinal { get; } + public string Name { get; } } /// diff --git a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderSaverCatalog.cs b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderSaverCatalog.cs index f84d78c1d7..666391bc4b 100644 --- a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderSaverCatalog.cs +++ b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderSaverCatalog.cs @@ -70,22 +70,18 @@ public static TextLoader CreateTextReader(this DataOperations catalog, for (int index = 0; index < memberInfos.Length; index++) { var memberInfo = memberInfos[index]; - var mappingAttr = memberInfo.GetCustomAttribute(); + var mappingAttr = memberInfo.GetCustomAttribute(); var mptr = memberInfo.GetCustomAttributes(); if (mappingAttr == null) throw Contracts.Except($"Field or property {memberInfo.Name} is missing ColumnAttribute"); - if (Regex.Match(mappingAttr.Ordinal, @"[^(0-9,\*\-~)]+").Success) - throw Contracts.Except($"{mappingAttr.Ordinal} contains invalid characters. " + - $"Valid characters are 0-9, *, - and ~"); - var mappingNameAttr = memberInfo.GetCustomAttribute(); var name = mappingAttr.Name ?? mappingNameAttr?.Name ?? 
memberInfo.Name; TextLoader.Range[] sources; - if (!TextLoader.Column.TryParseSourceEx(mappingAttr.Ordinal, out sources)) - throw Contracts.Except($"{mappingAttr.Ordinal} could not be parsed."); + if (!TextLoader.Column.TryParseSourceEx(mappingAttr.Range, out sources)) + throw Contracts.Except($"{mappingAttr.Range} could not be parsed."); Contracts.Assert(sources != null); diff --git a/src/Microsoft.ML.Legacy/Data/TextLoader.cs b/src/Microsoft.ML.Legacy/Data/TextLoader.cs index 434af8f3f5..b2be3d309f 100644 --- a/src/Microsoft.ML.Legacy/Data/TextLoader.cs +++ b/src/Microsoft.ML.Legacy/Data/TextLoader.cs @@ -86,20 +86,20 @@ public TextLoader CreateFrom(bool useHeader = false, for (int index = 0; index < memberInfos.Length; index++) { var memberInfo = memberInfos[index]; - var mappingAttr = memberInfo.GetCustomAttribute(); + var mappingAttr = memberInfo.GetCustomAttribute(); if (mappingAttr == null) - throw Contracts.Except($"Field or property {memberInfo.Name} is missing ColumnAttribute"); + throw Contracts.Except($"Field or property {memberInfo.Name} is missing LoadColumnAttributeAttribute"); - if (Regex.Match(mappingAttr.Ordinal, @"[^(0-9,\*\-~)]+").Success) - throw Contracts.Except($"{mappingAttr.Ordinal} contains invalid characters. " + + if (Regex.Match(mappingAttr.Range, @"[^(0-9,\*\-~)]+").Success) + throw Contracts.Except($"{mappingAttr.Range} contains invalid characters. " + $"Valid characters are 0-9, *, - and ~"); var mappingNameAttr = memberInfo.GetCustomAttribute(); var name = mappingAttr.Name ?? mappingNameAttr?.Name ?? 
memberInfo.Name; Runtime.Data.TextLoader.Range[] sources; - if (!Runtime.Data.TextLoader.Column.TryParseSourceEx(mappingAttr.Ordinal, out sources)) - throw Contracts.Except($"{mappingAttr.Ordinal} could not be parsed."); + if (!Runtime.Data.TextLoader.Column.TryParseSourceEx(mappingAttr.Range, out sources)) + throw Contracts.Except($"{mappingAttr.Range} could not be parsed."); Contracts.Assert(sources != null); diff --git a/test/Microsoft.ML.Benchmarks/PredictionEngineBench.cs b/test/Microsoft.ML.Benchmarks/PredictionEngineBench.cs index 2ba6435eb1..5ac76ef35f 100644 --- a/test/Microsoft.ML.Benchmarks/PredictionEngineBench.cs +++ b/test/Microsoft.ML.Benchmarks/PredictionEngineBench.cs @@ -146,10 +146,10 @@ public void MakeBreastCancerPredictions() public class SentimentData { - [ColumnName("Label"), Column("0")] + [ColumnName("Label"), LoadColumn("0")] public bool Sentiment; - [Column("1")] + [LoadColumn("1")] public string SentimentText; } diff --git a/test/Microsoft.ML.Benchmarks/StochasticDualCoordinateAscentClassifierBench.cs b/test/Microsoft.ML.Benchmarks/StochasticDualCoordinateAscentClassifierBench.cs index 3ff43d0c21..a7fd5a32fb 100644 --- a/test/Microsoft.ML.Benchmarks/StochasticDualCoordinateAscentClassifierBench.cs +++ b/test/Microsoft.ML.Benchmarks/StochasticDualCoordinateAscentClassifierBench.cs @@ -167,19 +167,19 @@ private void Consume(IEnumerable predictions) public class IrisData { - [Column("0")] + [LoadColumn("0")] public float Label; - [Column("1")] + [LoadColumn("1")] public float SepalLength; - [Column("2")] + [LoadColumn("2")] public float SepalWidth; - [Column("3")] + [LoadColumn("3")] public float PetalLength; - [Column("4")] + [LoadColumn("4")] public float PetalWidth; } diff --git a/test/Microsoft.ML.FSharp.Tests/SmokeTests.fs b/test/Microsoft.ML.FSharp.Tests/SmokeTests.fs index ebff475fb8..39f6bc62e1 100644 --- a/test/Microsoft.ML.FSharp.Tests/SmokeTests.fs +++ b/test/Microsoft.ML.FSharp.Tests/SmokeTests.fs @@ -65,9 +65,9 @@ open Xunit 
module SmokeTest1 = type SentimentData() = - [] + [] val mutable SentimentText : string - [] + [] val mutable Sentiment : float32 type SentimentPrediction() = @@ -130,10 +130,10 @@ module SmokeTest2 = [] type SentimentData = - { [] + { [] SentimentText : string - [] + [] Sentiment : float32 } [] @@ -195,10 +195,10 @@ module SmokeTest2 = module SmokeTest3 = type SentimentData() = - [] + [] member val SentimentText = "".AsMemory() with get, set - [] + [] member val Sentiment = 0.0 with get, set type SentimentPrediction() = diff --git a/test/Microsoft.ML.Tests/CollectionDataSourceTests.cs b/test/Microsoft.ML.Tests/CollectionDataSourceTests.cs index 1fc6e151a0..b5cdfbe922 100644 --- a/test/Microsoft.ML.Tests/CollectionDataSourceTests.cs +++ b/test/Microsoft.ML.Tests/CollectionDataSourceTests.cs @@ -216,28 +216,28 @@ public void CanTrainProperties() public class Input { - [Column("0")] + [LoadColumn("0")] public float Number1; - [Column("1")] + [LoadColumn("1")] public string String1; } public class IrisData { - [Column("0")] + [LoadColumn("0")] public float Label; - [Column("1")] + [LoadColumn("1")] public float SepalLength; - [Column("2")] + [LoadColumn("2")] public float SepalWidth; - [Column("3")] + [LoadColumn("3")] public float PetalLength; - [Column("4")] + [LoadColumn("4")] public float PetalWidth; } @@ -255,19 +255,19 @@ public class IrisDataProperties private float _PetalLength; private float _PetalWidth; - [Column("0")] + [LoadColumn("0")] public float Label { get { return _Label; } set { _Label = value; } } - [Column("1")] + [LoadColumn("1")] public float SepalLength { get { return _SepalLength; } set { _SepalLength = value; } } - [Column("2")] + [LoadColumn("2")] public float SepalWidth { get { return _SepalWidth; } set { _SepalWidth = value; } } - [Column("3")] + [LoadColumn("3")] public float PetalLength { get { return _PetalLength; } set { _PetalLength = value; } } - [Column("4")] + [LoadColumn("4")] public float PetalWidth { get { return _PetalWidth; } 
set { _PetalWidth = value; } } } diff --git a/test/Microsoft.ML.Tests/LearningPipelineTests.cs b/test/Microsoft.ML.Tests/LearningPipelineTests.cs index 64c6abd8cc..ed80853906 100644 --- a/test/Microsoft.ML.Tests/LearningPipelineTests.cs +++ b/test/Microsoft.ML.Tests/LearningPipelineTests.cs @@ -49,7 +49,7 @@ public void CanAddAndRemoveFromPipeline() private class InputData { - [Column(ordinal: "1")] + [LoadColumn(range: "1")] public string F1; } diff --git a/test/Microsoft.ML.Tests/Scenarios/Api/ApiScenariosTests.cs b/test/Microsoft.ML.Tests/Scenarios/Api/ApiScenariosTests.cs index eb79c12aa2..b4ca1306a1 100644 --- a/test/Microsoft.ML.Tests/Scenarios/Api/ApiScenariosTests.cs +++ b/test/Microsoft.ML.Tests/Scenarios/Api/ApiScenariosTests.cs @@ -3,7 +3,6 @@ // See the LICENSE file in the project root for more information. using Microsoft.ML.Data; -using Microsoft.ML.Runtime.Data; using Microsoft.ML.TestFramework; using Xunit.Abstractions; @@ -20,14 +19,22 @@ public ApiScenariosTests(ITestOutputHelper output) : base(output) public class IrisData : IrisDataNoLabel { + [LoadColumn("4", name: "Label")] public string Label; } public class IrisDataNoLabel { + [LoadColumn("0")] public float SepalLength; + + [LoadColumn("1")] public float SepalWidth; + + [LoadColumn("2")] public float PetalLength; + + [LoadColumn("3")] public float PetalWidth; } @@ -39,8 +46,10 @@ public class IrisPrediction public class SentimentData { - [ColumnName("Label")] + [LoadColumn("0", name: "Label")] public bool Sentiment; + + [LoadColumn("1")] public string SentimentText; } diff --git a/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/CrossValidation.cs b/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/CrossValidation.cs index b660abcf00..aac5cdcd37 100644 --- a/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/CrossValidation.cs +++ b/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/CrossValidation.cs @@ -27,7 +27,8 @@ void New_CrossValidation() { var ml = new MLContext(seed: 1, conc: 1); - var 
data = ml.Data.CreateTextReader(TestDatasets.Sentiment.GetLoaderColumns(), hasHeader: true).Read(GetDataPath(TestDatasets.Sentiment.trainFilename)); + var data = ml.Data.CreateTextReader(hasHeader: true) + .Read(GetDataPath(TestDatasets.Sentiment.trainFilename)); // Pipeline. var pipeline = ml.Transforms.Text.FeaturizeText("SentimentText", "Features") .Append(ml.BinaryClassification.Trainers.StochasticDualCoordinateAscent("Label", "Features", advancedSettings: (s) => { s.ConvergenceTolerance = 1f; s.NumThreads = 1; })); diff --git a/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/DecomposableTrainAndPredict.cs b/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/DecomposableTrainAndPredict.cs index 7eeab00c4e..4b59b84552 100644 --- a/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/DecomposableTrainAndPredict.cs +++ b/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/DecomposableTrainAndPredict.cs @@ -29,7 +29,7 @@ void New_DecomposableTrainAndPredict() var dataPath = GetDataPath(TestDatasets.irisData.trainFilename); var ml = new MLContext(); - var data = ml.Data.CreateTextReader(TestDatasets.irisData.GetLoaderColumns(), separatorChar: ',') + var data = ml.Data.CreateTextReader(separator: ',') .Read(dataPath); var pipeline = new ColumnConcatenatingEstimator (ml, "Features", "SepalLength", "SepalWidth", "PetalLength", "PetalWidth") diff --git a/test/Microsoft.ML.Tests/Scenarios/ClusteringTests.cs b/test/Microsoft.ML.Tests/Scenarios/ClusteringTests.cs index 9e796428de..f50ae3029e 100644 --- a/test/Microsoft.ML.Tests/Scenarios/ClusteringTests.cs +++ b/test/Microsoft.ML.Tests/Scenarios/ClusteringTests.cs @@ -47,16 +47,16 @@ shall not be infringed."" public class NewsData { - [Column(ordinal: "0")] + [LoadColumn(range: "0")] public string Id; - [Column(ordinal: "1", name: "Label")] + [LoadColumn(range: "1", name: "Label")] public string Topic; - [Column(ordinal: "2")] + [LoadColumn(range: "2")] public string Subject; - [Column(ordinal: "3")] + [LoadColumn(range: "3")] 
public string Content; } diff --git a/test/Microsoft.ML.Tests/Scenarios/HousePricePredictionTests.cs b/test/Microsoft.ML.Tests/Scenarios/HousePricePredictionTests.cs index 180de2b766..efa6325617 100644 --- a/test/Microsoft.ML.Tests/Scenarios/HousePricePredictionTests.cs +++ b/test/Microsoft.ML.Tests/Scenarios/HousePricePredictionTests.cs @@ -53,67 +53,67 @@ public async void PredictHousePriceModelTest() public class HousePriceData { - [Column(ordinal: "0")] + [LoadColumn(range: "0")] public string Id; - [Column(ordinal: "1")] + [LoadColumn(range: "1")] public string Date; - [Column(ordinal: "2", name: "Label")] + [LoadColumn(range: "2", name: "Label")] public float Price; - [Column(ordinal: "3")] + [LoadColumn(range: "3")] public float Bedrooms; - [Column(ordinal: "4")] + [LoadColumn(range: "4")] public float Bathrooms; - [Column(ordinal: "5")] + [LoadColumn(range: "5")] public float SqftLiving; - [Column(ordinal: "6")] + [LoadColumn(range: "6")] public float SqftLot; - [Column(ordinal: "7")] + [LoadColumn(range: "7")] public float Floors; - [Column(ordinal: "8")] + [LoadColumn(range: "8")] public float Waterfront; - [Column(ordinal: "9")] + [LoadColumn(range: "9")] public float View; - [Column(ordinal: "10")] + [LoadColumn(range: "10")] public float Condition; - [Column(ordinal: "11")] + [LoadColumn(range: "11")] public float Grade; - [Column(ordinal: "12")] + [LoadColumn(range: "12")] public float SqftAbove; - [Column(ordinal: "13")] + [LoadColumn(range: "13")] public float SqftBasement; - [Column(ordinal: "14")] + [LoadColumn(range: "14")] public float YearBuilt; - [Column(ordinal: "15")] + [LoadColumn(range: "15")] public float YearRenovated; - [Column(ordinal: "16")] + [LoadColumn(range: "16")] public float Zipcode; - [Column(ordinal: "17")] + [LoadColumn(range: "17")] public float Lat; - [Column(ordinal: "18")] + [LoadColumn(range: "18")] public float Long; - [Column(ordinal: "19")] + [LoadColumn(range: "19")] public float SqftLiving15; - [Column(ordinal: 
"20")] + [LoadColumn(range: "20")] public float SqftLot15; } diff --git a/test/Microsoft.ML.Tests/Scenarios/IrisPlantClassificationTests.cs b/test/Microsoft.ML.Tests/Scenarios/IrisPlantClassificationTests.cs index ac76ae71f3..06d3b949f9 100644 --- a/test/Microsoft.ML.Tests/Scenarios/IrisPlantClassificationTests.cs +++ b/test/Microsoft.ML.Tests/Scenarios/IrisPlantClassificationTests.cs @@ -117,19 +117,19 @@ public void TrainAndPredictIrisModelTest() public class IrisData { - [Column("0")] + [LoadColumn("0")] public float Label; - [Column("1")] + [LoadColumn("1")] public float SepalLength; - [Column("2")] + [LoadColumn("2")] public float SepalWidth; - [Column("3")] + [LoadColumn("3")] public float PetalLength; - [Column("4")] + [LoadColumn("4")] public float PetalWidth; } diff --git a/test/Microsoft.ML.Tests/Scenarios/IrisPlantClassificationWithStringLabelTests.cs b/test/Microsoft.ML.Tests/Scenarios/IrisPlantClassificationWithStringLabelTests.cs index 5a5f0ebe7a..060a362c74 100644 --- a/test/Microsoft.ML.Tests/Scenarios/IrisPlantClassificationWithStringLabelTests.cs +++ b/test/Microsoft.ML.Tests/Scenarios/IrisPlantClassificationWithStringLabelTests.cs @@ -127,19 +127,19 @@ public void TrainAndPredictIrisModelWithStringLabelTest() public class IrisDataWithStringLabel { - [Column("0")] + [LoadColumn("0")] public float SepalLength; - [Column("1")] + [LoadColumn("1")] public float SepalWidth; - [Column("2")] + [LoadColumn("2")] public float PetalLength; - [Column("3")] + [LoadColumn("3")] public float PetalWidth; - [Column("4", name: "Label")] + [LoadColumn("4", name: "Label")] public string IrisPlantType; } } diff --git a/test/Microsoft.ML.Tests/Scenarios/PipelineApi/PipelineApiScenarioTests.cs b/test/Microsoft.ML.Tests/Scenarios/PipelineApi/PipelineApiScenarioTests.cs index 93438995ef..da74eab830 100644 --- a/test/Microsoft.ML.Tests/Scenarios/PipelineApi/PipelineApiScenarioTests.cs +++ b/test/Microsoft.ML.Tests/Scenarios/PipelineApi/PipelineApiScenarioTests.cs @@ 
-21,22 +21,22 @@ public PipelineApiScenarioTests(ITestOutputHelper output) : base(output) public class IrisData : IrisDataNoLabel { - [Column("0")] + [LoadColumn("0")] public string Label; } public class IrisDataNoLabel { - [Column("1")] + [LoadColumn("1")] public float SepalLength; - [Column("2")] + [LoadColumn("2")] public float SepalWidth; - [Column("3")] + [LoadColumn("3")] public float PetalLength; - [Column("4")] + [LoadColumn("4")] public float PetalWidth; } @@ -47,9 +47,9 @@ public class IrisPrediction public class SentimentData { - [Column("0", name: "Label")] + [LoadColumn("0", name: "Label")] public bool Sentiment; - [Column("1")] + [LoadColumn("1")] public string SentimentText; } diff --git a/test/Microsoft.ML.Tests/Scenarios/SentimentPredictionTests.cs b/test/Microsoft.ML.Tests/Scenarios/SentimentPredictionTests.cs index cb81c685fc..4f4b52c595 100644 --- a/test/Microsoft.ML.Tests/Scenarios/SentimentPredictionTests.cs +++ b/test/Microsoft.ML.Tests/Scenarios/SentimentPredictionTests.cs @@ -504,9 +504,9 @@ private IEnumerable GetTestData() public class SentimentData { - [Column(ordinal: "0", name: "Label")] + [LoadColumn(range: "0", name: "Label")] public float Sentiment; - [Column(ordinal: "1")] + [LoadColumn(range: "1")] public string SentimentText; } diff --git a/test/Microsoft.ML.Tests/Scenarios/TensorflowTests.cs b/test/Microsoft.ML.Tests/Scenarios/TensorflowTests.cs index 1bbcbd19b1..f8729e1653 100644 --- a/test/Microsoft.ML.Tests/Scenarios/TensorflowTests.cs +++ b/test/Microsoft.ML.Tests/Scenarios/TensorflowTests.cs @@ -73,10 +73,10 @@ public void TensorFlowTransforCifarEndToEndTest() public class CifarData { - [Column("0")] + [LoadColumn("0")] public string ImagePath; - [Column("1")] + [LoadColumn("1")] public string Label; } @@ -88,10 +88,10 @@ public class CifarPrediction public class ImageNetData { - [Column("0")] + [LoadColumn("0")] public string ImagePath; - [Column("1")] + [LoadColumn("1")] public string Label; } diff --git 
a/test/Microsoft.ML.Tests/ScenariosWithDirectInstantiation/TensorflowTests.cs b/test/Microsoft.ML.Tests/ScenariosWithDirectInstantiation/TensorflowTests.cs index 7cf3ae14c4..e2814b6d54 100644 --- a/test/Microsoft.ML.Tests/ScenariosWithDirectInstantiation/TensorflowTests.cs +++ b/test/Microsoft.ML.Tests/ScenariosWithDirectInstantiation/TensorflowTests.cs @@ -582,10 +582,10 @@ private int GetMaxIndexForOnePrediction(MNISTPrediction onePrediction) public class MNISTData { - [Column("0")] + [LoadColumn("0")] public long Label; - [Column(ordinal: "1-784")] + [LoadColumn(range: "1-784")] [VectorType(784)] public float[] Placeholder; } diff --git a/test/Microsoft.ML.Tests/TextLoaderTests.cs b/test/Microsoft.ML.Tests/TextLoaderTests.cs index bc3b1db7e2..a5c626e4a5 100644 --- a/test/Microsoft.ML.Tests/TextLoaderTests.cs +++ b/test/Microsoft.ML.Tests/TextLoaderTests.cs @@ -336,7 +336,7 @@ public void CanSuccessfullyTrimSpaces() public void ThrowsExceptionWithPropertyName() { Exception ex = Assert.Throws(() => new Legacy.Data.TextLoader("fakefile.txt").CreateFrom()); - Assert.StartsWith("Field or property String1 is missing ColumnAttribute", ex.Message); + Assert.StartsWith("Field or property String1 is missing LoadColumn attribute", ex.Message); } [Fact] @@ -350,46 +350,46 @@ public void CanSuccessfullyColumnNameProperty() public class QuoteInput { - [Column("0")] + [LoadColumn("0")] public float ID; - [Column("1")] + [LoadColumn("1")] public string Text; } public class SparseInput { - [Column("0")] + [LoadColumn("0")] public float C1; - [Column("1")] + [LoadColumn("1")] public float C2; - [Column("2")] + [LoadColumn("2")] public float C3; - [Column("3")] + [LoadColumn("3")] public float C4; - [Column("4")] + [LoadColumn("4")] public float C5; } public class Input { - [Column("0")] + [LoadColumn("0")] public string String1; - [Column("1")] + [LoadColumn("1")] public float Number1; } public class InputWithUnderscore { - [Column("0")] + [LoadColumn("0")] public string 
String_1; - [Column("1")] + [LoadColumn("1")] public float Number_1; } @@ -400,12 +400,12 @@ public class ModelWithoutColumnAttribute public class ModelWithColumnNameAttribute { - [Column("0", "Col1")] + [LoadColumn("0", "Col1")] public string String_1; - [Column("1")] + [LoadColumn("1")] [ColumnName("Col2")] public string String_2; - [Column("3")] + [LoadColumn("3")] public string String_3; } } From 7cacf58c1a5cf4a028448ce59ab271ee54bf34e9 Mon Sep 17 00:00:00 2001 From: Senja Filipi Date: Mon, 17 Dec 2018 23:03:24 -0800 Subject: [PATCH 03/11] Breaking down the ordinal/range into ordinal, start, end, columnIndices. Introducing test. --- .../Data/SchemaDefinition.cs | 94 ++++++--- .../Text/TextLoaderSaverCatalog.cs | 33 +-- src/Microsoft.ML.Legacy/Data/TextLoader.cs | 8 +- src/Native/MatrixFactorizationNative/libmf | 2 +- .../PredictionEngineBench.cs | 4 +- ...sticDualCoordinateAscentClassifierBench.cs | 10 +- .../CollectionDataSourceTests.cs | 54 ++--- .../LearningPipelineTests.cs | 2 +- .../Scenarios/Api/ApiScenariosTests.cs | 14 +- .../Scenarios/ClusteringTests.cs | 8 +- .../Scenarios/HousePricePredictionTests.cs | 42 ++-- .../Scenarios/IrisPlantClassificationTests.cs | 10 +- ...PlantClassificationWithStringLabelTests.cs | 10 +- .../PipelineApi/PipelineApiScenarioTests.cs | 10 +- .../Scenarios/SentimentPredictionTests.cs | 4 +- .../Scenarios/TensorflowTests.cs | 8 +- .../TensorflowTests.cs | 3 +- test/Microsoft.ML.Tests/TextLoaderTests.cs | 188 ++++++++++++++++-- 18 files changed, 327 insertions(+), 177 deletions(-) diff --git a/src/Microsoft.ML.Data/Data/SchemaDefinition.cs b/src/Microsoft.ML.Data/Data/SchemaDefinition.cs index 44644cfba5..f56c8d6a93 100644 --- a/src/Microsoft.ML.Data/Data/SchemaDefinition.cs +++ b/src/Microsoft.ML.Data/Data/SchemaDefinition.cs @@ -70,52 +70,90 @@ public VectorTypeAttribute(params int[] dims) [AttributeUsage(AttributeTargets.Field | AttributeTargets.Property, AllowMultiple = false, Inherited = true)] public sealed class 
LoadColumnAttribute : Attribute { - public LoadColumnAttribute(string range = null, - string name = null, - bool loadInverseRange = false, - string start=null, - string end = null, - int[] columnsIndexes = null, - int[] inverseColumnsIndexes = null) + + public LoadColumnAttribute(int ordinal, string name = null, bool loadAllOthers = false) + { + Start = ordinal.ToString(); + Sources = new List(); + var range = new TextLoader.Range(ordinal); + range.AllOther = loadAllOthers; + Sources.Add(range); + } + + public LoadColumnAttribute(string start, string end = null, string name = null, int[] columnIndexes = null) + { + Name = name; + Start = start; + End = end; + ColumnIndexes = columnIndexes; + + Sources = new List(); + + bool hasEnd = int.TryParse(end, out int endIndex); + var range = hasEnd ? new TextLoader.Range(int.Parse(start), endIndex) : new TextLoader.Range(int.Parse(start)); + Sources.Add(range); + + if (columnIndexes != null) + { + foreach (var col in columnIndexes) + Sources.Add(new TextLoader.Range(col)); + } + } + + // REVIEW : AllOther seems to work only for a single column. Verify. + public LoadColumnAttribute(string start, string end, string name = null, bool loadInverseRange = false) { - Range = range; Name = name; LoadInverseRange = loadInverseRange; Start = start; End = end; - ColumnsIndexes = columnsIndexes; - InverseColumnsIndexes = inverseColumnsIndexes; + + Sources = new List(); + var range = new TextLoader.Range(int.Parse(start), int.Parse(end)); + range.AllOther = loadInverseRange; + Sources.Add(range); + } + + public LoadColumnAttribute(int[] columnIndexes, string name = null) + { + Name = name; + ColumnIndexes = columnIndexes; + + Sources = new List(); + foreach (var col in columnIndexes) + Sources.Add(new TextLoader.Range(col)); } + internal List Sources; + /// /// Column name. /// public string Name { get; } /// - /// Contains the position of the column in the files. - /// It can be a single column index, or a range. 
- /// Examples of range: if we want to include just column - /// with index 1 we can write the range as 1, if we want to include - /// columns 1 to 10 then we can write the range as 1-10 and we want to include all the - /// columns from column with index 1 until end then we can write 1-*. - /// - /// This takes sequence of ranges that are comma seperated, example: - /// 1,2-5,10-*. - /// All the other parameters are alternatives to this one. + /// The optional start index for loading a contiguous range of columns, or the single index in the case + /// of loading a single column. + /// Either this parameters, or the should be specified. /// - public string Range { get; } + public string Start { get; } /// - /// If this is set to true, the columns defined in the range will be - /// excluded from loading, and all the other ones will be the ones loaded. + /// Optional field, used to set the dataset columns range end index when loading a range of columns. /// - public bool LoadInverseRange { get; } - - public string Start { get; } public string End { get; } - public int[] ColumnsIndexes { get; } - public int[] InverseColumnsIndexes { get; } + + /// + /// Optional field used to specify the distinct indices of the dataset columns that need to be loaded, and mapped to this + /// . + /// + public int[] ColumnIndexes { get; } + + /// + /// If this is set to true, the columns defined in the range through either the , or the + /// will be excluded from loading, and all the other ones will loaded and mapped to the . 
+ /// + public bool LoadInverseRange { get; } } /// diff --git a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderSaverCatalog.cs b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderSaverCatalog.cs index 666391bc4b..4e4a0e66e4 100644 --- a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderSaverCatalog.cs +++ b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderSaverCatalog.cs @@ -73,33 +73,23 @@ public static TextLoader CreateTextReader(this DataOperations catalog, var mappingAttr = memberInfo.GetCustomAttribute(); var mptr = memberInfo.GetCustomAttributes(); - if (mappingAttr == null) - throw Contracts.Except($"Field or property {memberInfo.Name} is missing ColumnAttribute"); - - var mappingNameAttr = memberInfo.GetCustomAttribute(); - var name = mappingAttr.Name ?? mappingNameAttr?.Name ?? memberInfo.Name; - - TextLoader.Range[] sources; - if (!TextLoader.Column.TryParseSourceEx(mappingAttr.Range, out sources)) - throw Contracts.Except($"{mappingAttr.Range} could not be parsed."); - - Contracts.Assert(sources != null); + Contracts.Assert(mappingAttr != null, $"Field or property {memberInfo.Name} is missing the LoadColumn attribute"); var column = new TextLoader.Column(); - column.Name = name; - column.Source = new TextLoader.Range[sources.Length]; + column.Name = mappingAttr.Name ?? memberInfo.Name; + column.Source = mappingAttr.Sources.ToArray(); DataKind dk; switch (memberInfo) { case FieldInfo field: if (!DataKindExtensions.TryGetDataKind(field.FieldType.IsArray ? field.FieldType.GetElementType() : field.FieldType, out dk)) - throw Contracts.Except($"Field {name} is of unsupported type."); + throw Contracts.Except($"Field {memberInfo.Name} is of unsupported type."); break; case PropertyInfo property: if (!DataKindExtensions.TryGetDataKind(property.PropertyType.IsArray ? 
property.PropertyType.GetElementType() : property.PropertyType, out dk)) - throw Contracts.Except($"Property {name} is of unsupported type."); + throw Contracts.Except($"Property {memberInfo.Name} is of unsupported type."); break; default: @@ -109,19 +99,6 @@ public static TextLoader CreateTextReader(this DataOperations catalog, column.Type = dk; - for (int indexLocal = 0; indexLocal < column.Source.Length; indexLocal++) - { - column.Source[indexLocal] = new TextLoader.Range - { - AllOther = sources[indexLocal].AllOther, - AutoEnd = sources[indexLocal].AutoEnd, - ForceVector = sources[indexLocal].ForceVector, - VariableEnd = sources[indexLocal].VariableEnd, - Max = sources[indexLocal].Max, - Min = sources[indexLocal].Min - }; - } - columns[index] = column; } diff --git a/src/Microsoft.ML.Legacy/Data/TextLoader.cs b/src/Microsoft.ML.Legacy/Data/TextLoader.cs index b2be3d309f..e949fae037 100644 --- a/src/Microsoft.ML.Legacy/Data/TextLoader.cs +++ b/src/Microsoft.ML.Legacy/Data/TextLoader.cs @@ -90,16 +90,16 @@ public TextLoader CreateFrom(bool useHeader = false, if (mappingAttr == null) throw Contracts.Except($"Field or property {memberInfo.Name} is missing LoadColumnAttributeAttribute"); - if (Regex.Match(mappingAttr.Range, @"[^(0-9,\*\-~)]+").Success) - throw Contracts.Except($"{mappingAttr.Range} contains invalid characters. " + + if (Regex.Match(mappingAttr.Start, @"[^(0-9,\*\-~)]+").Success) + throw Contracts.Except($"{mappingAttr.Start} contains invalid characters. " + $"Valid characters are 0-9, *, - and ~"); var mappingNameAttr = memberInfo.GetCustomAttribute(); var name = mappingAttr.Name ?? mappingNameAttr?.Name ?? 
memberInfo.Name; Runtime.Data.TextLoader.Range[] sources; - if (!Runtime.Data.TextLoader.Column.TryParseSourceEx(mappingAttr.Range, out sources)) - throw Contracts.Except($"{mappingAttr.Range} could not be parsed."); + if (!Runtime.Data.TextLoader.Column.TryParseSourceEx(mappingAttr.Start, out sources)) + throw Contracts.Except($"{mappingAttr.Start} could not be parsed."); Contracts.Assert(sources != null); diff --git a/src/Native/MatrixFactorizationNative/libmf b/src/Native/MatrixFactorizationNative/libmf index f92a18161b..1ecc365249 160000 --- a/src/Native/MatrixFactorizationNative/libmf +++ b/src/Native/MatrixFactorizationNative/libmf @@ -1 +1 @@ -Subproject commit f92a18161b6824fda4c4ab698a69d299a836841a +Subproject commit 1ecc365249e5cac5e72c66317a141298dc52f6e3 diff --git a/test/Microsoft.ML.Benchmarks/PredictionEngineBench.cs b/test/Microsoft.ML.Benchmarks/PredictionEngineBench.cs index 5ac76ef35f..9373724f50 100644 --- a/test/Microsoft.ML.Benchmarks/PredictionEngineBench.cs +++ b/test/Microsoft.ML.Benchmarks/PredictionEngineBench.cs @@ -146,10 +146,10 @@ public void MakeBreastCancerPredictions() public class SentimentData { - [ColumnName("Label"), LoadColumn("0")] + [ColumnName("Label"), LoadColumn(0)] public bool Sentiment; - [LoadColumn("1")] + [LoadColumn(1)] public string SentimentText; } diff --git a/test/Microsoft.ML.Benchmarks/StochasticDualCoordinateAscentClassifierBench.cs b/test/Microsoft.ML.Benchmarks/StochasticDualCoordinateAscentClassifierBench.cs index a7fd5a32fb..930c38af10 100644 --- a/test/Microsoft.ML.Benchmarks/StochasticDualCoordinateAscentClassifierBench.cs +++ b/test/Microsoft.ML.Benchmarks/StochasticDualCoordinateAscentClassifierBench.cs @@ -167,19 +167,19 @@ private void Consume(IEnumerable predictions) public class IrisData { - [LoadColumn("0")] + [LoadColumn(0)] public float Label; - [LoadColumn("1")] + [LoadColumn(1)] public float SepalLength; - [LoadColumn("2")] + [LoadColumn(2)] public float SepalWidth; - [LoadColumn("3")] + 
[LoadColumn(3)] public float PetalLength; - [LoadColumn("4")] + [LoadColumn(4)] public float PetalWidth; } diff --git a/test/Microsoft.ML.Tests/CollectionDataSourceTests.cs b/test/Microsoft.ML.Tests/CollectionDataSourceTests.cs index b5cdfbe922..17ba426483 100644 --- a/test/Microsoft.ML.Tests/CollectionDataSourceTests.cs +++ b/test/Microsoft.ML.Tests/CollectionDataSourceTests.cs @@ -175,10 +175,10 @@ public void CanTrain() public void CanTrainProperties() { var pipeline = new Legacy.LearningPipeline(); - var data = new List() { - new IrisDataProperties { SepalLength = 1f, SepalWidth = 1f, PetalLength=0.3f, PetalWidth=5.1f, Label=1}, - new IrisDataProperties { SepalLength = 1f, SepalWidth = 1f, PetalLength=0.3f, PetalWidth=5.1f, Label=1}, - new IrisDataProperties { SepalLength = 1.2f, SepalWidth = 0.5f, PetalLength=0.3f, PetalWidth=5.1f, Label=0} + var data = new List() { + new IrisData { SepalLength = 1f, SepalWidth = 1f, PetalLength=0.3f, PetalWidth=5.1f, Label=1}, + new IrisData { SepalLength = 1f, SepalWidth = 1f, PetalLength=0.3f, PetalWidth=5.1f, Label=1}, + new IrisData { SepalLength = 1.2f, SepalWidth = 0.5f, PetalLength=0.3f, PetalWidth=5.1f, Label=0} }; var collection = CollectionDataSource.Create(data); @@ -186,9 +186,9 @@ public void CanTrainProperties() pipeline.Add(new ColumnConcatenator(outputColumn: "Features", "SepalLength", "SepalWidth", "PetalLength", "PetalWidth")); pipeline.Add(new StochasticDualCoordinateAscentClassifier()); - var model = pipeline.Train(); + var model = pipeline.Train(); - IrisPredictionProperties prediction = model.Predict(new IrisDataProperties() + IrisPredictionProperties prediction = model.Predict(new IrisData { SepalLength = 3.3f, SepalWidth = 1.6f, @@ -202,9 +202,9 @@ public void CanTrainProperties() pipeline.Add(new ColumnConcatenator(outputColumn: "Features", "SepalLength", "SepalWidth", "PetalLength", "PetalWidth")); pipeline.Add(new StochasticDualCoordinateAscentClassifier()); - model = pipeline.Train(); + model = 
pipeline.Train(); - prediction = model.Predict(new IrisDataProperties() + prediction = model.Predict(new IrisData { SepalLength = 3.3f, SepalWidth = 1.6f, @@ -216,28 +216,28 @@ public void CanTrainProperties() public class Input { - [LoadColumn("0")] + [LoadColumn(0)] public float Number1; - [LoadColumn("1")] + [LoadColumn(1)] public string String1; } public class IrisData { - [LoadColumn("0")] + [LoadColumn(0)] public float Label; - [LoadColumn("1")] + [LoadColumn(1)] public float SepalLength; - [LoadColumn("2")] + [LoadColumn(2)] public float SepalWidth; - [LoadColumn("3")] + [LoadColumn(3)] public float PetalLength; - [LoadColumn("4")] + [LoadColumn(4)] public float PetalWidth; } @@ -247,30 +247,6 @@ public class IrisPrediction public float[] PredictedLabels; } - public class IrisDataProperties - { - private float _Label; - private float _SepalLength; - private float _SepalWidth; - private float _PetalLength; - private float _PetalWidth; - - [LoadColumn("0")] - public float Label { get { return _Label; } set { _Label = value; } } - - [LoadColumn("1")] - public float SepalLength { get { return _SepalLength; } set { _SepalLength = value; } } - - [LoadColumn("2")] - public float SepalWidth { get { return _SepalWidth; } set { _SepalWidth = value; } } - - [LoadColumn("3")] - public float PetalLength { get { return _PetalLength; } set { _PetalLength = value; } } - - [LoadColumn("4")] - public float PetalWidth { get { return _PetalWidth; } set { _PetalWidth = value; } } - } - public class IrisPredictionProperties { private float[] _PredictedLabels; diff --git a/test/Microsoft.ML.Tests/LearningPipelineTests.cs b/test/Microsoft.ML.Tests/LearningPipelineTests.cs index ed80853906..2dda2124ef 100644 --- a/test/Microsoft.ML.Tests/LearningPipelineTests.cs +++ b/test/Microsoft.ML.Tests/LearningPipelineTests.cs @@ -49,7 +49,7 @@ public void CanAddAndRemoveFromPipeline() private class InputData { - [LoadColumn(range: "1")] + [LoadColumn(ordinal: "1")] public string F1; } diff 
--git a/test/Microsoft.ML.Tests/Scenarios/Api/ApiScenariosTests.cs b/test/Microsoft.ML.Tests/Scenarios/Api/ApiScenariosTests.cs index b4ca1306a1..712bb959c9 100644 --- a/test/Microsoft.ML.Tests/Scenarios/Api/ApiScenariosTests.cs +++ b/test/Microsoft.ML.Tests/Scenarios/Api/ApiScenariosTests.cs @@ -19,22 +19,22 @@ public ApiScenariosTests(ITestOutputHelper output) : base(output) public class IrisData : IrisDataNoLabel { - [LoadColumn("4", name: "Label")] + [LoadColumn(4, name: "Label")] public string Label; } public class IrisDataNoLabel { - [LoadColumn("0")] + [LoadColumn(0)] public float SepalLength; - [LoadColumn("1")] + [LoadColumn(1)] public float SepalWidth; - [LoadColumn("2")] + [LoadColumn(2)] public float PetalLength; - [LoadColumn("3")] + [LoadColumn(3)] public float PetalWidth; } @@ -46,10 +46,10 @@ public class IrisPrediction public class SentimentData { - [LoadColumn("0", name: "Label")] + [LoadColumn(0, name: "Label")] public bool Sentiment; - [LoadColumn("1")] + [LoadColumn(1)] public string SentimentText; } diff --git a/test/Microsoft.ML.Tests/Scenarios/ClusteringTests.cs b/test/Microsoft.ML.Tests/Scenarios/ClusteringTests.cs index f50ae3029e..da6bbd5d00 100644 --- a/test/Microsoft.ML.Tests/Scenarios/ClusteringTests.cs +++ b/test/Microsoft.ML.Tests/Scenarios/ClusteringTests.cs @@ -47,16 +47,16 @@ shall not be infringed."" public class NewsData { - [LoadColumn(range: "0")] + [LoadColumn(0)] public string Id; - [LoadColumn(range: "1", name: "Label")] + [LoadColumn(1, name: "Label")] public string Topic; - [LoadColumn(range: "2")] + [LoadColumn(2)] public string Subject; - [LoadColumn(range: "3")] + [LoadColumn(3)] public string Content; } diff --git a/test/Microsoft.ML.Tests/Scenarios/HousePricePredictionTests.cs b/test/Microsoft.ML.Tests/Scenarios/HousePricePredictionTests.cs index efa6325617..b724fc23f2 100644 --- a/test/Microsoft.ML.Tests/Scenarios/HousePricePredictionTests.cs +++ b/test/Microsoft.ML.Tests/Scenarios/HousePricePredictionTests.cs @@ 
-53,67 +53,67 @@ public async void PredictHousePriceModelTest() public class HousePriceData { - [LoadColumn(range: "0")] + [LoadColumn(0)] public string Id; - [LoadColumn(range: "1")] + [LoadColumn(1)] public string Date; - [LoadColumn(range: "2", name: "Label")] + [LoadColumn(2, name: "Label")] public float Price; - [LoadColumn(range: "3")] + [LoadColumn(3)] public float Bedrooms; - [LoadColumn(range: "4")] + [LoadColumn(4)] public float Bathrooms; - [LoadColumn(range: "5")] + [LoadColumn(5)] public float SqftLiving; - [LoadColumn(range: "6")] + [LoadColumn(6)] public float SqftLot; - [LoadColumn(range: "7")] + [LoadColumn(7)] public float Floors; - [LoadColumn(range: "8")] + [LoadColumn(8)] public float Waterfront; - [LoadColumn(range: "9")] + [LoadColumn(9)] public float View; - [LoadColumn(range: "10")] + [LoadColumn(10)] public float Condition; - [LoadColumn(range: "11")] + [LoadColumn(11)] public float Grade; - [LoadColumn(range: "12")] + [LoadColumn(12)] public float SqftAbove; - [LoadColumn(range: "13")] + [LoadColumn(13)] public float SqftBasement; - [LoadColumn(range: "14")] + [LoadColumn(14)] public float YearBuilt; - [LoadColumn(range: "15")] + [LoadColumn(15)] public float YearRenovated; - [LoadColumn(range: "16")] + [LoadColumn(16)] public float Zipcode; - [LoadColumn(range: "17")] + [LoadColumn(17)] public float Lat; - [LoadColumn(range: "18")] + [LoadColumn(18)] public float Long; - [LoadColumn(range: "19")] + [LoadColumn(19)] public float SqftLiving15; - [LoadColumn(range: "20")] + [LoadColumn(20)] public float SqftLot15; } diff --git a/test/Microsoft.ML.Tests/Scenarios/IrisPlantClassificationTests.cs b/test/Microsoft.ML.Tests/Scenarios/IrisPlantClassificationTests.cs index 06d3b949f9..ab295ddca5 100644 --- a/test/Microsoft.ML.Tests/Scenarios/IrisPlantClassificationTests.cs +++ b/test/Microsoft.ML.Tests/Scenarios/IrisPlantClassificationTests.cs @@ -117,19 +117,19 @@ public void TrainAndPredictIrisModelTest() public class IrisData { - 
[LoadColumn("0")] + [LoadColumn(0)] public float Label; - [LoadColumn("1")] + [LoadColumn(1)] public float SepalLength; - [LoadColumn("2")] + [LoadColumn(2)] public float SepalWidth; - [LoadColumn("3")] + [LoadColumn(3)] public float PetalLength; - [LoadColumn("4")] + [LoadColumn(4)] public float PetalWidth; } diff --git a/test/Microsoft.ML.Tests/Scenarios/IrisPlantClassificationWithStringLabelTests.cs b/test/Microsoft.ML.Tests/Scenarios/IrisPlantClassificationWithStringLabelTests.cs index 060a362c74..a1ecf43e67 100644 --- a/test/Microsoft.ML.Tests/Scenarios/IrisPlantClassificationWithStringLabelTests.cs +++ b/test/Microsoft.ML.Tests/Scenarios/IrisPlantClassificationWithStringLabelTests.cs @@ -127,19 +127,19 @@ public void TrainAndPredictIrisModelWithStringLabelTest() public class IrisDataWithStringLabel { - [LoadColumn("0")] + [LoadColumn(0)] public float SepalLength; - [LoadColumn("1")] + [LoadColumn(1)] public float SepalWidth; - [LoadColumn("2")] + [LoadColumn(2)] public float PetalLength; - [LoadColumn("3")] + [LoadColumn(3)] public float PetalWidth; - [LoadColumn("4", name: "Label")] + [LoadColumn(4, name: "Label")] public string IrisPlantType; } } diff --git a/test/Microsoft.ML.Tests/Scenarios/PipelineApi/PipelineApiScenarioTests.cs b/test/Microsoft.ML.Tests/Scenarios/PipelineApi/PipelineApiScenarioTests.cs index da74eab830..8925c9ecec 100644 --- a/test/Microsoft.ML.Tests/Scenarios/PipelineApi/PipelineApiScenarioTests.cs +++ b/test/Microsoft.ML.Tests/Scenarios/PipelineApi/PipelineApiScenarioTests.cs @@ -21,22 +21,22 @@ public PipelineApiScenarioTests(ITestOutputHelper output) : base(output) public class IrisData : IrisDataNoLabel { - [LoadColumn("0")] + [LoadColumn(0)] public string Label; } public class IrisDataNoLabel { - [LoadColumn("1")] + [LoadColumn(1)] public float SepalLength; - [LoadColumn("2")] + [LoadColumn(2)] public float SepalWidth; - [LoadColumn("3")] + [LoadColumn(3)] public float PetalLength; - [LoadColumn("4")] + [LoadColumn(4)] public 
float PetalWidth; } diff --git a/test/Microsoft.ML.Tests/Scenarios/SentimentPredictionTests.cs b/test/Microsoft.ML.Tests/Scenarios/SentimentPredictionTests.cs index 4f4b52c595..4124c3d21d 100644 --- a/test/Microsoft.ML.Tests/Scenarios/SentimentPredictionTests.cs +++ b/test/Microsoft.ML.Tests/Scenarios/SentimentPredictionTests.cs @@ -504,9 +504,9 @@ private IEnumerable GetTestData() public class SentimentData { - [LoadColumn(range: "0", name: "Label")] + [LoadColumn(0, name: "Label")] public float Sentiment; - [LoadColumn(range: "1")] + [LoadColumn(1)] public string SentimentText; } diff --git a/test/Microsoft.ML.Tests/Scenarios/TensorflowTests.cs b/test/Microsoft.ML.Tests/Scenarios/TensorflowTests.cs index f8729e1653..6d9480a78e 100644 --- a/test/Microsoft.ML.Tests/Scenarios/TensorflowTests.cs +++ b/test/Microsoft.ML.Tests/Scenarios/TensorflowTests.cs @@ -73,10 +73,10 @@ public void TensorFlowTransforCifarEndToEndTest() public class CifarData { - [LoadColumn("0")] + [LoadColumn(0)] public string ImagePath; - [LoadColumn("1")] + [LoadColumn(1)] public string Label; } @@ -88,10 +88,10 @@ public class CifarPrediction public class ImageNetData { - [LoadColumn("0")] + [LoadColumn(0)] public string ImagePath; - [LoadColumn("1")] + [LoadColumn(1)] public string Label; } diff --git a/test/Microsoft.ML.Tests/ScenariosWithDirectInstantiation/TensorflowTests.cs b/test/Microsoft.ML.Tests/ScenariosWithDirectInstantiation/TensorflowTests.cs index e2814b6d54..a868d0bcc0 100644 --- a/test/Microsoft.ML.Tests/ScenariosWithDirectInstantiation/TensorflowTests.cs +++ b/test/Microsoft.ML.Tests/ScenariosWithDirectInstantiation/TensorflowTests.cs @@ -582,10 +582,9 @@ private int GetMaxIndexForOnePrediction(MNISTPrediction onePrediction) public class MNISTData { - [LoadColumn("0")] + [Column("0")] public long Label; - [LoadColumn(range: "1-784")] [VectorType(784)] public float[] Placeholder; } diff --git a/test/Microsoft.ML.Tests/TextLoaderTests.cs 
b/test/Microsoft.ML.Tests/TextLoaderTests.cs index a5c626e4a5..30e47bd0e5 100644 --- a/test/Microsoft.ML.Tests/TextLoaderTests.cs +++ b/test/Microsoft.ML.Tests/TextLoaderTests.cs @@ -8,6 +8,7 @@ using Microsoft.ML.Runtime.RunTests; using Microsoft.ML.TestFramework; using System; +using System.Collections.Generic; using System.IO; using Xunit; using Xunit.Abstractions; @@ -350,46 +351,46 @@ public void CanSuccessfullyColumnNameProperty() public class QuoteInput { - [LoadColumn("0")] + [LoadColumn(0)] public float ID; - [LoadColumn("1")] + [LoadColumn(1)] public string Text; } public class SparseInput { - [LoadColumn("0")] + [LoadColumn(0)] public float C1; - [LoadColumn("1")] + [LoadColumn(1)] public float C2; - [LoadColumn("2")] + [LoadColumn(2)] public float C3; - [LoadColumn("3")] + [LoadColumn(3)] public float C4; - [LoadColumn("4")] + [LoadColumn(4)] public float C5; } public class Input { - [LoadColumn("0")] + [LoadColumn(0)] public string String1; - [LoadColumn("1")] + [LoadColumn(1)] public float Number1; } public class InputWithUnderscore { - [LoadColumn("0")] + [LoadColumn(0)] public string String_1; - [LoadColumn("1")] + [LoadColumn(1)] public float Number_1; } @@ -400,14 +401,173 @@ public class ModelWithoutColumnAttribute public class ModelWithColumnNameAttribute { - [LoadColumn("0", "Col1")] + [LoadColumn(0, "Col1")] public string String_1; - [LoadColumn("1")] + + [LoadColumn(1)] [ColumnName("Col2")] public string String_2; - [LoadColumn("3")] + + [LoadColumn(3)] public string String_3; } } + + public class TextLoaderFromModelTests : BaseTestClass + { + public TextLoaderFromModelTests(ITestOutputHelper output) + : base(output) + { + + } + + public class Iris + { + [LoadColumn(0)] + public float SepalLength; + + [LoadColumn(1)] + public float SepalWidth; + + [LoadColumn(2)] + public float PetalLength; + + [LoadColumn(3)] + public float PetalWidth; + + [LoadColumn(4)] + public string Type; + } + + public class IrisAllOther + { + [LoadColumn(4, 
loadAllOthers: true)] + public string Features; + + [LoadColumn(4)] + public string Type; + } + + public class IrisStartEnd + { + [LoadColumn(start:"0", end:"3", name:"Features", columnIndexes:null)] + public float Features; + + [LoadColumn(4, name: "Label")] + public string Type; + } + + public class IrisStartEndInverse + { + [LoadColumn(start:"0", end:"2", loadInverseRange: true)] + public float Features; + + [LoadColumn(4, name: "Label")] + public string Type; + } + + public class IrisColumnIndices + { + [LoadColumn(columnIndexes: new[] { 0, 2 })] + public float Features; + + [LoadColumn(4, name: "Label")] + public string Type; + } + + [Fact] + public void LoaderColumnsFromIrisData() + { + var dataPath = GetDataPath(TestDatasets.irisData.trainFilename); + var ml = new MLContext(); + + var irisFirstRow = new Dictionary(); + irisFirstRow["SepalLength"] = 5.1f; + irisFirstRow["SepalWidth"] = 3.5f; + irisFirstRow["PetalLength"] = 1.4f; + irisFirstRow["PetalWidth"] = 0.2f; + + var irisFirstRowValues = irisFirstRow.Values.GetEnumerator(); + + // Simple load + var dataIris = ml.Data.CreateTextReader(separator: ',').Read(dataPath); + var previewIris = dataIris.Preview(1); + + Assert.Equal(5, previewIris.ColumnView.Length); + Assert.Equal("SepalLength", previewIris.Schema.GetColumnName(0)); + Assert.Equal(NumberType.R4, previewIris.Schema.GetColumnType(0)); + int index = 0; + foreach (var entry in irisFirstRow) + { + Assert.Equal(entry.Key, previewIris.RowView[0].Values[index].Key); + Assert.Equal(entry.Value, previewIris.RowView[0].Values[index++].Value); + } + Assert.Equal("Type", previewIris.RowView[0].Values[index].Key); + Assert.Equal("Iris-setosa", previewIris.RowView[0].Values[index].Value.ToString()); + + // Load allOther + var dataIrisAllOther = ml.Data.CreateTextReader(separator: ',').Read(dataPath); + var previewdataIrisAllOther = dataIrisAllOther.Preview(1); + + Assert.Equal(2, previewdataIrisAllOther.ColumnView.Length); + Assert.Equal("Features", 
previewdataIrisAllOther.RowView[0].Values[0].Key); + VBuffer featureValue = (VBuffer)previewdataIrisAllOther.RowView[0].Values[0].Value; + Assert.True(featureValue.IsDense); + Assert.Equal(4, featureValue.Length); + + foreach (var val in featureValue.GetValues()) + { + irisFirstRowValues.MoveNext(); + Assert.Equal(irisFirstRowValues.Current, val); + } + + // Load with start and end indexes + var dataIrisStartEnd = ml.Data.CreateTextReader(separator: ',').Read(dataPath); + var previewIrisStartEnd = dataIrisStartEnd.Preview(1); + + Assert.Equal(2, previewIrisStartEnd.ColumnView.Length); + Assert.Equal("Features", previewIrisStartEnd.RowView[0].Values[0].Key); + featureValue = (VBuffer)previewIrisStartEnd.RowView[0].Values[0].Value; + Assert.True(featureValue.IsDense); + Assert.Equal(4, featureValue.Length); + + irisFirstRowValues = irisFirstRow.Values.GetEnumerator(); + foreach (var val in featureValue.GetValues()) + { + irisFirstRowValues.MoveNext(); + Assert.Equal(irisFirstRowValues.Current, val); + } + + // load setting start, end and inverse = true + var dataIrisStartEndInverse = ml.Data.CreateTextReader(separator: ',').Read(dataPath); + var previewIrisStartEndInverse = dataIrisStartEndInverse.Preview(1); + + Assert.Equal(2, previewIrisStartEndInverse.ColumnView.Length); + + featureValue = (VBuffer)previewIrisStartEndInverse.RowView[0].Values[0].Value; + Assert.True(featureValue.IsDense); + // Assert.Equal(1, featureValue.Length); // TODO: failing. It loading all the columns. The columns created are correct. Look into. + var vals3 = featureValue.GetValues(); + irisFirstRowValues = irisFirstRow.Values.GetEnumerator(); irisFirstRowValues.MoveNext(); + irisFirstRowValues.MoveNext(); irisFirstRowValues.MoveNext(); irisFirstRowValues.MoveNext();//skip 0, 1, 2 + // Assert.Equal(vals3[1], irisFirstRowValues.Current); + + // load setting the distinct columns. 
Loading column 0 and 2 + var dataIrisColumnIndices = ml.Data.CreateTextReader(separator: ',').Read(dataPath); + var previewIrisColumnIndices = dataIrisColumnIndices.Preview(1); + + Assert.Equal(2, previewIrisColumnIndices.ColumnView.Length); + featureValue = (VBuffer)previewIrisColumnIndices.RowView[0].Values[0].Value; + Assert.True(featureValue.IsDense); + Assert.Equal(2, featureValue.Length); + var vals4 = featureValue.GetValues(); + + irisFirstRowValues = irisFirstRow.Values.GetEnumerator(); + irisFirstRowValues.MoveNext(); + Assert.Equal(vals4[0], irisFirstRowValues.Current); + irisFirstRowValues.MoveNext(); irisFirstRowValues.MoveNext(); // skip col 1 + Assert.Equal(vals4[1], irisFirstRowValues.Current); + } + } #pragma warning restore 612, 618 } From c69eb4bc3ec01b6d5bc32ec397745f58303dbf8a Mon Sep 17 00:00:00 2001 From: Senja Filipi Date: Mon, 17 Dec 2018 23:04:10 -0800 Subject: [PATCH 04/11] reverting updating the libmf module. --- src/Native/MatrixFactorizationNative/libmf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Native/MatrixFactorizationNative/libmf b/src/Native/MatrixFactorizationNative/libmf index 1ecc365249..f92a18161b 160000 --- a/src/Native/MatrixFactorizationNative/libmf +++ b/src/Native/MatrixFactorizationNative/libmf @@ -1 +1 @@ -Subproject commit 1ecc365249e5cac5e72c66317a141298dc52f6e3 +Subproject commit f92a18161b6824fda4c4ab698a69d299a836841a From 31e17d955e79a987c7bb3b1c22efcc206d0e47de Mon Sep 17 00:00:00 2001 From: Senja Filipi Date: Tue, 18 Dec 2018 10:14:25 -0800 Subject: [PATCH 05/11] Removing the option to have ranges + allOthers together. 
--- .../Data/SchemaDefinition.cs | 77 ++++++++----------- .../Text/TextLoaderSaverCatalog.cs | 53 +++++++++---- src/Microsoft.ML.Legacy/Data/TextLoader.cs | 6 +- .../LearningPipelineTests.cs | 2 +- test/Microsoft.ML.Tests/TextLoaderTests.cs | 23 ------ 5 files changed, 71 insertions(+), 90 deletions(-) diff --git a/src/Microsoft.ML.Data/Data/SchemaDefinition.cs b/src/Microsoft.ML.Data/Data/SchemaDefinition.cs index f56c8d6a93..16755494b8 100644 --- a/src/Microsoft.ML.Data/Data/SchemaDefinition.cs +++ b/src/Microsoft.ML.Data/Data/SchemaDefinition.cs @@ -70,23 +70,31 @@ public VectorTypeAttribute(params int[] dims) [AttributeUsage(AttributeTargets.Field | AttributeTargets.Property, AllowMultiple = false, Inherited = true)] public sealed class LoadColumnAttribute : Attribute { - + /// + /// Initializes new instance of . + /// + /// The index of the column in the text file. + /// The optional name of the column, if it should be different from the name of the field or property where the attribute is positioned. + /// Wheather to load all columns, besides the one specified in the ordinal. public LoadColumnAttribute(int ordinal, string name = null, bool loadAllOthers = false) + : this(ordinal.ToString(), name) { - Start = ordinal.ToString(); Sources = new List(); var range = new TextLoader.Range(ordinal); range.AllOther = loadAllOthers; Sources.Add(range); } + /// + /// Initializes new instance of . + /// + /// The starting column index, for the range. + /// The ending column index, for the range. + /// The optional name of the column, if it should be different from the name of the field or property where the attribute is positioned. + /// Distinct text file column indces to load as part of this column. 
public LoadColumnAttribute(string start, string end = null, string name = null, int[] columnIndexes = null) + : this(start, name) { - Name = name; - Start = start; - End = end; - ColumnIndexes = columnIndexes; - Sources = new List(); bool hasEnd = int.TryParse(end, out int endIndex); @@ -100,29 +108,26 @@ public LoadColumnAttribute(string start, string end = null, string name = null, } } - // REVIEW : AllOther seems to work only for a single column. Verify. - public LoadColumnAttribute(string start, string end, string name = null, bool loadInverseRange = false) + /// + /// Initializes new instance of . + /// + /// Distinct text file column indces to load as part of this column. + /// The optional name of the column, if it should be different from the name of the field or property where the attribute is positioned. + public LoadColumnAttribute(int[] columnIndexes, string name = null) + : this(columnIndexes[0].ToString(), name) { - Name = name; - LoadInverseRange = loadInverseRange; - Start = start; - End = end; - Sources = new List(); - var range = new TextLoader.Range(int.Parse(start), int.Parse(end)); - range.AllOther = loadInverseRange; - Sources.Add(range); + foreach (var col in columnIndexes) + Sources.Add(new TextLoader.Range(col)); } - public LoadColumnAttribute(int[] columnIndexes, string name = null) +#pragma warning disable 618 + private LoadColumnAttribute(string start, string name) { + Start = start; Name = name; - ColumnIndexes = columnIndexes; - - Sources = new List(); - foreach (var col in columnIndexes) - Sources.Add(new TextLoader.Range(col)); } +#pragma warning restore 618 internal List Sources; @@ -131,29 +136,9 @@ public LoadColumnAttribute(int[] columnIndexes, string name = null) /// public string Name { get; } - /// - /// The optional start index for loading a contiguous range of columns, or the single index in the case - /// of loading a single column. - /// Either this parameters, or the should be specified. 
- /// - public string Start { get; } - - /// - /// Optional field, used to set the dataset columns range end index when loading a range of columns. - /// - public string End { get; } - - /// - /// Optional field used to specify the distinct indices of the dataset columns that need to be loaded, and mapped to this - /// . - /// - public int[] ColumnIndexes { get; } - - /// - /// If this is set to true, the columns defined in the range through either the , or the - /// will be excluded from loading, and all the other ones will loaded and mapped to the . - /// - public bool LoadInverseRange { get; } + [Obsolete("Will be deleted together with the Legacy project.")] + [BestFriend] + internal string Start { get; } } /// diff --git a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderSaverCatalog.cs b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderSaverCatalog.cs index 4e4a0e66e4..0efa4b9c3b 100644 --- a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderSaverCatalog.cs +++ b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderSaverCatalog.cs @@ -20,33 +20,44 @@ public static class TextLoaderSaverCatalog /// /// Create a text reader . /// - /// The catalog. + /// The catalog. /// The columns of the schema. /// Whether the file has a header. /// The character used as separator between data points in a row. By default the tab character is used as separator. /// The optional location of a data sample. public static TextLoader CreateTextReader(this DataOperations catalog, - TextLoader.Column[] columns, bool hasHeader = false, char separatorChar = '\t', IMultiStreamSource dataSample = null) + TextLoader.Column[] columns, + bool hasHeader = false, + char separatorChar = '\t', + IMultiStreamSource dataSample = null) => new TextLoader(CatalogUtils.GetEnvironment(catalog), columns, hasHeader, separatorChar, dataSample); /// /// Create a text reader . /// - /// The catalog. + /// The catalog. /// Defines the settings of the load operation. 
/// Allows to expose items that can be used for reading. - public static TextLoader CreateTextReader(this DataOperations catalog, TextLoader.Arguments args, IMultiStreamSource dataSample = null) + public static TextLoader CreateTextReader(this DataOperations catalog, + TextLoader.Arguments args, + IMultiStreamSource dataSample = null) => new TextLoader(CatalogUtils.GetEnvironment(catalog), args, dataSample); /// - /// Create a text reader . + /// Create a text reader by inferencing the dataset schema from a data model type. /// - /// The catalog. - /// - /// - /// - /// - /// + /// The catalog. + /// Does the file contains header? + /// Column separator character. Default is '\t' + /// Whether the input may include quoted values, + /// which can contain separator characters, colons, + /// and distinguish empty values from missing values. When true, consecutive separators + /// denote a missing value and an empty value is denoted by \"\". + /// When false, consecutive separators denote an empty value. + /// Whether the input may include sparse representations for example, + /// if one of the row contains "5 2:6 4:3" that's mean there are 5 columns all zero + /// except for 3rd and 5th columns which have values 6 and 3 + /// Remove trailing whitespace from lines public static TextLoader CreateTextReader(this DataOperations catalog, bool hasHeader = TextLoader.DefaultArguments.HasHeader, char separator = TextLoader.DefaultArguments.Separator, @@ -118,14 +129,17 @@ public static TextLoader CreateTextReader(this DataOperations catalog, /// /// Read a data view from a text file using . /// - /// The catalog. + /// The catalog. /// The columns of the schema. /// Whether the file has a header. /// The character used as separator between data points in a row. By default the tab character is used as separator. /// The path to the file. /// The data view. 
public static IDataView ReadFromTextFile(this DataOperations catalog, - string path, TextLoader.Column[] columns, bool hasHeader = false, char separatorChar = '\t') + string path, + TextLoader.Column[] columns, + bool hasHeader = false, + char separatorChar = '\t') { Contracts.CheckNonEmpty(path, nameof(path)); @@ -140,7 +154,7 @@ public static IDataView ReadFromTextFile(this DataOperations catalog, /// /// Read a data view from a text file using . /// - /// The catalog. + /// The catalog. /// Specifies a file from which to read. /// Defines the settings of the load operation. public static IDataView ReadFromTextFile(this DataOperations catalog, string path, TextLoader.Arguments args = null) @@ -156,15 +170,20 @@ public static IDataView ReadFromTextFile(this DataOperations catalog, string pat /// /// Save the data view as text. /// - /// The catalog. + /// The catalog. /// The data view to save. /// The stream to write to. /// The column separator. /// Whether to write the header row. /// Whether to write the header comment with the schema. /// Whether to keep hidden columns in the dataset. 
- public static void SaveAsText(this DataOperations catalog, IDataView data, Stream stream, - char separator = '\t', bool headerRow = true, bool schema = true, bool keepHidden = false) + public static void SaveAsText(this DataOperations catalog, + IDataView data, + Stream stream, + char separator = '\t', + bool headerRow = true, + bool schema = true, + bool keepHidden = false) { Contracts.CheckValue(catalog, nameof(catalog)); Contracts.CheckValue(data, nameof(data)); diff --git a/src/Microsoft.ML.Legacy/Data/TextLoader.cs b/src/Microsoft.ML.Legacy/Data/TextLoader.cs index e949fae037..14e0d63bd5 100644 --- a/src/Microsoft.ML.Legacy/Data/TextLoader.cs +++ b/src/Microsoft.ML.Legacy/Data/TextLoader.cs @@ -89,9 +89,9 @@ public TextLoader CreateFrom(bool useHeader = false, var mappingAttr = memberInfo.GetCustomAttribute(); if (mappingAttr == null) throw Contracts.Except($"Field or property {memberInfo.Name} is missing LoadColumnAttributeAttribute"); - +#pragma warning disable 618 if (Regex.Match(mappingAttr.Start, @"[^(0-9,\*\-~)]+").Success) - throw Contracts.Except($"{mappingAttr.Start} contains invalid characters. " + + throw Contracts.Except($"{mappingAttr.Start} contains invalid characters. 
" + $"Valid characters are 0-9, *, - and ~"); var mappingNameAttr = memberInfo.GetCustomAttribute(); @@ -100,7 +100,7 @@ public TextLoader CreateFrom(bool useHeader = false, Runtime.Data.TextLoader.Range[] sources; if (!Runtime.Data.TextLoader.Column.TryParseSourceEx(mappingAttr.Start, out sources)) throw Contracts.Except($"{mappingAttr.Start} could not be parsed."); - +#pragma warning restore 618 Contracts.Assert(sources != null); TextLoaderColumn tlc = new TextLoaderColumn(); diff --git a/test/Microsoft.ML.Tests/LearningPipelineTests.cs b/test/Microsoft.ML.Tests/LearningPipelineTests.cs index 2dda2124ef..b9e83d18fa 100644 --- a/test/Microsoft.ML.Tests/LearningPipelineTests.cs +++ b/test/Microsoft.ML.Tests/LearningPipelineTests.cs @@ -49,7 +49,7 @@ public void CanAddAndRemoveFromPipeline() private class InputData { - [LoadColumn(ordinal: "1")] + [LoadColumn(ordinal: 1)] public string F1; } diff --git a/test/Microsoft.ML.Tests/TextLoaderTests.cs b/test/Microsoft.ML.Tests/TextLoaderTests.cs index 30e47bd0e5..32d26024e8 100644 --- a/test/Microsoft.ML.Tests/TextLoaderTests.cs +++ b/test/Microsoft.ML.Tests/TextLoaderTests.cs @@ -457,15 +457,6 @@ public class IrisStartEnd public string Type; } - public class IrisStartEndInverse - { - [LoadColumn(start:"0", end:"2", loadInverseRange: true)] - public float Features; - - [LoadColumn(4, name: "Label")] - public string Type; - } - public class IrisColumnIndices { [LoadColumn(columnIndexes: new[] { 0, 2 })] @@ -538,20 +529,6 @@ public void LoaderColumnsFromIrisData() Assert.Equal(irisFirstRowValues.Current, val); } - // load setting start, end and inverse = true - var dataIrisStartEndInverse = ml.Data.CreateTextReader(separator: ',').Read(dataPath); - var previewIrisStartEndInverse = dataIrisStartEndInverse.Preview(1); - - Assert.Equal(2, previewIrisStartEndInverse.ColumnView.Length); - - featureValue = (VBuffer)previewIrisStartEndInverse.RowView[0].Values[0].Value; - Assert.True(featureValue.IsDense); - // Assert.Equal(1, 
featureValue.Length); // TODO: failing. It loading all the columns. The columns created are correct. Look into. - var vals3 = featureValue.GetValues(); - irisFirstRowValues = irisFirstRow.Values.GetEnumerator(); irisFirstRowValues.MoveNext(); - irisFirstRowValues.MoveNext(); irisFirstRowValues.MoveNext(); irisFirstRowValues.MoveNext();//skip 0, 1, 2 - // Assert.Equal(vals3[1], irisFirstRowValues.Current); - // load setting the distinct columns. Loading column 0 and 2 var dataIrisColumnIndices = ml.Data.CreateTextReader(separator: ',').Read(dataPath); var previewIrisColumnIndices = dataIrisColumnIndices.Preview(1); From e1201bb40ed652af2840698b933de8591c3a2d96 Mon Sep 17 00:00:00 2001 From: Senja Filipi Date: Wed, 19 Dec 2018 12:57:36 -0800 Subject: [PATCH 06/11] Addressing most comments --- docs/code/MlNetCookBook.md | 171 +++++++++--------- .../Data/SchemaDefinition.cs | 78 -------- .../DataLoadSave/Text/LoadColumn.cs | 65 +++++++ .../DataLoadSave/Text/TextLoader.cs | 71 ++++++++ .../Text/TextLoaderSaverCatalog.cs | 105 ++++------- src/Microsoft.ML.Legacy/Data/TextLoader.cs | 2 +- test/Microsoft.ML.FSharp.Tests/SmokeTests.fs | 12 +- .../LearningPipelineTests.cs | 2 +- .../Scenarios/Api/ApiScenariosTests.cs | 4 +- .../CookbookSamplesDynamicApi.cs | 159 ++++++++-------- .../Api/Estimators/CrossValidation.cs | 4 +- .../Estimators/DecomposableTrainAndPredict.cs | 3 +- .../Api/Estimators/IntrospectiveTraining.cs | 3 +- .../Api/Estimators/Metacomponents.cs | 3 +- .../Api/Estimators/MultithreadedPrediction.cs | 5 +- .../Estimators/ReconfigurablePrediction.cs | 6 +- .../Api/Estimators/SimpleTrainAndPredict.cs | 6 +- .../Estimators/TrainSaveModelAndPredict.cs | 5 +- .../Estimators/TrainWithInitialPredictor.cs | 2 +- .../Api/Estimators/TrainWithValidationSet.cs | 6 +- .../Scenarios/ClusteringTests.cs | 2 +- .../Scenarios/HousePricePredictionTests.cs | 2 +- ...PlantClassificationWithStringLabelTests.cs | 2 +- .../PipelineApi/PipelineApiScenarioTests.cs | 4 +- 
.../Scenarios/SentimentPredictionTests.cs | 2 +- test/Microsoft.ML.Tests/TextLoaderTests.cs | 35 +--- .../TrainerEstimators/MetalinearEstimators.cs | 2 +- 27 files changed, 373 insertions(+), 388 deletions(-) create mode 100644 src/Microsoft.ML.Data/DataLoadSave/Text/LoadColumn.cs diff --git a/docs/code/MlNetCookBook.md b/docs/code/MlNetCookBook.md index 87e8b82647..ef0ad0a39d 100644 --- a/docs/code/MlNetCookBook.md +++ b/docs/code/MlNetCookBook.md @@ -131,6 +131,39 @@ var reader = mlContext.Data.CreateTextReader(new[] { var data = reader.Read(dataPath); ``` +You can also create a data model class, and read the data based on this type. + +```csharp +// The data model. This type will be used through the document. +private class InspectedRow +{ + [LoadColumn(0)] + public bool IsOver50K { get; set; } + + [LoadColumn(1)] + public string Workclass { get; set; } + + [LoadColumn(2)] + public string Education { get; set; } + + [LoadColumn(3)] + public string MaritalStatus { get; set; } + + public string[] AllFeatures { get; set; } +} + +// Create a new context for ML.NET operations. It can be used for exception tracking and logging, +// as a catalog of available operations and as the source of randomness. +var mlContext = new MLContext(); + +// Read the data into a data view. +var data = mlContext.Data.ReadFromTextFile(dataPath, + // First line of the file is a header, not a data row. + hasHeader: true +) + +``` + ## How do I load data from multiple files? You can again use the `TextLoader`, and specify an array of files to its Read method. @@ -231,9 +264,9 @@ var mlContext = new MLContext(); // Create the reader: define the data columns and where to find them in the text file. var reader = mlContext.Data.CreateTextReader(new[] { // We read the first 10 values as a single float vector. 
- new TextLoader.Column("FeatureVector", DataKind.R4, new[] {new TextLoader.Range(0, 9)}), + new TextLoader.Column("FeatureVector", DataKind.R4, new[] {new TextLoader.Range(0, 10)}), // Separately, read the target variable. - new TextLoader.Column("Target", DataKind.R4, 10) + new TextLoader.Column("Target", DataKind.R4, 11) }, // Default separator is tab, but we need a comma. s => s.Separator = ","); @@ -242,6 +275,30 @@ var reader = mlContext.Data.CreateTextReader(new[] { var data = reader.Read(dataPath); ``` +Or by creating a data model for it: + +```csharp +private class AdultData +{ + [LoadColumn("0", "10"), ColumnName("Features")] + public float FeatureVector { get; } + + [LoadColumn(11)] + public float Target { get; } +} + +// Create a new context for ML.NET operations. It can be used for exception tracking and logging, +// as a catalog of available operations and as the source of randomness. +var mlContext = new MLContext(); + +// Read the data into a data view. +var data = mlContext.Data.ReadFromTextFile(dataPath, + // First line of the file is a header, not a data row. + separator: ',' +); + +``` + ## How do I debug my experiment or preview my pipeline? Most ML.NET operations are 'lazy': they are not actually processing data, they just validate that the operation is possible, and then defer execution until the output data is actually requested. This provides good efficiency, but makes it hard to step through and debug the experiment. @@ -342,33 +399,14 @@ var sameFeatureColumns = dynamicData.GetColumn(mlContext, "AllFeatures .Take(20).ToArray(); ``` -The above code assumes that we defined our `InspectedRow` class as follows: -```csharp -private class InspectedRow -{ - public bool IsOver50K; - public string Workclass; - public string Education; - public string MaritalStatus; - public string[] AllFeatures; -} -``` - You can also use the dynamic API to create the equivalent of the previous pipeline. ```csharp // Create a new context for ML.NET operations. 
It can be used for exception tracking and logging, // as a catalog of available operations and as the source of randomness. var mlContext = new MLContext(); -// Create the reader: define the data columns and where to find them in the text file. -var reader = mlContext.Data.CreateTextReader(new[] { - // A boolean column depicting the 'label'. - new TextLoader.Column("IsOver50K", DataKind.BL, 0), - // Three text columns. - new TextLoader.Column("Workclass", DataKind.TX, 1), - new TextLoader.Column("Education", DataKind.TX, 2), - new TextLoader.Column("MaritalStatus", DataKind.TX, 3) - }, +// Read the data into a data view. +var data = mlContext.Data.ReadFromTextFile(dataPath, // First line of the file is a header, not a data row. hasHeader: true ); @@ -377,10 +415,6 @@ var reader = mlContext.Data.CreateTextReader(new[] { // together into one. var dynamicPipeline = mlContext.Transforms.Concatenate("AllFeatures", "Education", "MaritalStatus"); -// Let's verify that the data has been read correctly. -// First, we read the data file. -var data = reader.Read(dataPath); - // Fit our data pipeline and transform data with it. var transformedData = dynamicPipeline.Fit(data).Transform(data); @@ -476,22 +510,12 @@ var mlContext = new MLContext(); // Step one: read the data as an IDataView. // First, we define the reader: specify the data columns and where to find them in the text file. -var reader = mlContext.Data.CreateTextReader(new[] { - // We read the first 11 values as a single float vector. - new TextLoader.Column("FeatureVector", DataKind.R4, 0, 10), - - // Separately, read the target variable. - new TextLoader.Column("Target", DataKind.R4, 11), - }, - // First line of the file is a header, not a data row. - hasHeader: true, - // Default separator is tab, but we need a semicolon. - separatorChar: ';' +// Read the data into a data view. Remember though, readers are lazy, so the actual reading will happen when the data is accessed. 
+var trainData = mlContext.Data.ReadFromTextFile(dataPath, + // First line of the file is a header, not a data row. + separator: ',' ); -// Now read the file (remember though, readers are lazy, so the actual reading will happen when the data is accessed). -var trainData = reader.Read(trainDataPath); - // Sometime, caching data in-memory after its first access can save some loading time when the data is going to be used // several times somewhere. The caching mechanism is also lazy; it only caches things after being used. // User can replace all the subsequently uses of "trainData" with "cachedTrainData". We still use "trainData" because @@ -537,7 +561,10 @@ var metrics = mlContext.Regression.Evaluate(model.Transform(testData), label: r Calculating the metrics with the dynamic API is as follows. ```csharp // Read the test dataset. -var testData = reader.Read(testDataPath); +var testData = mlContext.Data.ReadFromTextFile(testDataPath, + // First line of the file is a header, not a data row. + separator: ',' +); // Calculate metrics of the model on the test data. var metrics = mlContext.Regression.Evaluate(model.Transform(testData), label: "Target"); ``` @@ -644,29 +671,19 @@ You can also use the dynamic API to create the equivalent of the previous pipeli var mlContext = new MLContext(); // Step one: read the data as an IDataView. -// First, we define the reader: specify the data columns and where to find them in the text file. -var reader = mlContext.Data.CreateTextReader(new[] { - new TextLoader.Column("SepalLength", DataKind.R4, 0), - new TextLoader.Column("SepalWidth", DataKind.R4, 1), - new TextLoader.Column("PetalLength", DataKind.R4, 2), - new TextLoader.Column("PetalWidth", DataKind.R4, 3), - // Label: kind of iris. - new TextLoader.Column("Label", DataKind.TX, 4), - }, + // Retrieve the training data. +var trainData = mlContext.Data.ReadFromTextFile(irisDataPath, // Default separator is tab, but the dataset has comma. 
- separatorChar: ',' + separator: ',' ); -// Retrieve the training data. -var trainData = reader.Read(irisDataPath); - // Build the training pipeline. var dynamicPipeline = // Concatenate all the features together into one column 'Features'. mlContext.Transforms.Concatenate("Features", "SepalLength", "SepalWidth", "PetalLength", "PetalWidth") // Note that the label is text, so it needs to be converted to key. .Append(mlContext.Transforms.Categorical.MapValueToKey("Label"), TransformerScope.TrainTest) - // Cache data in moemory for steps after the cache check point stage. + // Cache data in memory for steps after the cache check point stage. .AppendCacheCheckpoint(mlContext) // Use the multi-class SDCA model to predict the label using features. .Append(mlContext.MulticlassClassification.Trainers.StochasticDualCoordinateAscent()) @@ -937,24 +954,27 @@ var meanVarValues = normalizedData.GetColumn(r => r.MeanVarNormalized).ToArray() You can achieve the same results using the dynamic API. ```csharp +//data model for the Iris class +private class IrisInputAllFeatures +{ + // Unfortunately, we still need the dummy 'Label' column to be present. + [ColumnName("Label"), LoadColumn(4)] + public string IgnoredLabel { get; set; } + + [LoadColumn(4, loadAllOthers:true)] + public float Features { get; set; } +} + // Create a new context for ML.NET operations. It can be used for exception tracking and logging, // as a catalog of available operations and as the source of randomness. var mlContext = new MLContext(); -// Define the reader: specify the data columns and where to find them in the text file. -var reader = mlContext.Data.CreateTextReader(new[] { - // The four features of the Iris dataset will be grouped together as one Features column. - new TextLoader.Column("Features", DataKind.R4, 0, 3), - // Label: kind of iris. - new TextLoader.Column("Label", DataKind.TX, 4), - }, +// Read the training data. 
+var trainData = mlContext.Data.ReadFromTextFile(dataPath, // Default separator is tab, but the dataset has comma. - separatorChar: ',' + separator: ',' ); -// Read the training data. -var trainData = reader.Read(dataPath); - // Apply all kinds of standard ML.NET normalization to the raw features. var pipeline = mlContext.Transforms.Normalize( @@ -1315,24 +1335,11 @@ You can achieve the same results using the dynamic API. var mlContext = new MLContext(); // Step one: read the data as an IDataView. -// First, we define the reader: specify the data columns and where to find them in the text file. -var reader = mlContext.Data.CreateTextReader(new[] - { - // We read the first 11 values as a single float vector. - new TextLoader.Column("SepalLength", DataKind.R4, 0), - new TextLoader.Column("SepalWidth", DataKind.R4, 1), - new TextLoader.Column("PetalLength", DataKind.R4, 2), - new TextLoader.Column("PetalWidth", DataKind.R4, 3), - // Label: kind of iris. - new TextLoader.Column("Label", DataKind.TX, 4), - }, +var data = mlContext.Data.ReadFromTextFile(dataPath, // Default separator is tab, but the dataset has comma. - separatorChar: ',' + separator: ',' ); -// Read the data. -var data = reader.Read(dataPath); - // Build the training pipeline. var dynamicPipeline = // Concatenate all the features together into one column 'Features'. diff --git a/src/Microsoft.ML.Data/Data/SchemaDefinition.cs b/src/Microsoft.ML.Data/Data/SchemaDefinition.cs index 16755494b8..437d37d071 100644 --- a/src/Microsoft.ML.Data/Data/SchemaDefinition.cs +++ b/src/Microsoft.ML.Data/Data/SchemaDefinition.cs @@ -63,84 +63,6 @@ public VectorTypeAttribute(params int[] dims) } } - /// - /// Describes column information such as name and the source columns indicies that this - /// column encapsulates. 
- /// - [AttributeUsage(AttributeTargets.Field | AttributeTargets.Property, AllowMultiple = false, Inherited = true)] - public sealed class LoadColumnAttribute : Attribute - { - /// - /// Initializes new instance of . - /// - /// The index of the column in the text file. - /// The optional name of the column, if it should be different from the name of the field or property where the attribute is positioned. - /// Wheather to load all columns, besides the one specified in the ordinal. - public LoadColumnAttribute(int ordinal, string name = null, bool loadAllOthers = false) - : this(ordinal.ToString(), name) - { - Sources = new List(); - var range = new TextLoader.Range(ordinal); - range.AllOther = loadAllOthers; - Sources.Add(range); - } - - /// - /// Initializes new instance of . - /// - /// The starting column index, for the range. - /// The ending column index, for the range. - /// The optional name of the column, if it should be different from the name of the field or property where the attribute is positioned. - /// Distinct text file column indces to load as part of this column. - public LoadColumnAttribute(string start, string end = null, string name = null, int[] columnIndexes = null) - : this(start, name) - { - Sources = new List(); - - bool hasEnd = int.TryParse(end, out int endIndex); - var range = hasEnd ? new TextLoader.Range(int.Parse(start), endIndex) : new TextLoader.Range(int.Parse(start)); - Sources.Add(range); - - if (columnIndexes != null) - { - foreach (var col in columnIndexes) - Sources.Add(new TextLoader.Range(col)); - } - } - - /// - /// Initializes new instance of . - /// - /// Distinct text file column indces to load as part of this column. - /// The optional name of the column, if it should be different from the name of the field or property where the attribute is positioned. 
- public LoadColumnAttribute(int[] columnIndexes, string name = null) - : this(columnIndexes[0].ToString(), name) - { - Sources = new List(); - foreach (var col in columnIndexes) - Sources.Add(new TextLoader.Range(col)); - } - -#pragma warning disable 618 - private LoadColumnAttribute(string start, string name) - { - Start = start; - Name = name; - } -#pragma warning restore 618 - - internal List Sources; - - /// - /// Column name. - /// - public string Name { get; } - - [Obsolete("Will be deleted together with the Legacy project.")] - [BestFriend] - internal string Start { get; } - } - /// /// Describes column information such as name and the source columns indicies that this /// column encapsulates. diff --git a/src/Microsoft.ML.Data/DataLoadSave/Text/LoadColumn.cs b/src/Microsoft.ML.Data/DataLoadSave/Text/LoadColumn.cs new file mode 100644 index 0000000000..3cac2c255e --- /dev/null +++ b/src/Microsoft.ML.Data/DataLoadSave/Text/LoadColumn.cs @@ -0,0 +1,65 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using Microsoft.ML.Runtime.Data; +using System; +using System.Collections.Generic; + +namespace Microsoft.ML.Data +{ +#pragma warning disable 618 + /// + /// Describes column information such as name and the source columns indices that this + /// column encapsulates. + /// + [AttributeUsage(AttributeTargets.Field | AttributeTargets.Property, AllowMultiple = false, Inherited = true)] + public sealed class LoadColumnAttribute : Attribute + { + /// + /// Initializes new instance of . + /// + /// The index of the column in the text file. + public LoadColumnAttribute(int columnIndex) + : this(columnIndex.ToString()) + { + Sources.Add(new TextLoader.Range(columnIndex)); + } + + /// + /// Initializes new instance of . + /// + /// The starting column index, for the range. 
+ /// The ending column index, for the range. + public LoadColumnAttribute(int start, int end) + : this(start) //REVIEW this is incorrect, but it is just temporary there, until the Legacy API's TextLoader gets deleted. + { + Sources.Add(new TextLoader.Range(start, end)); + } + + /// + /// Initializes new instance of . + /// + /// Distinct text file column indices to load as part of this column. + public LoadColumnAttribute(int[] columnIndexes) + : this(columnIndexes[0]) // REVIEW: this is incorrect, but it is just temporary there, until the Legacy API's TextLoader gets deleted. + { + foreach (var col in columnIndexes) + Sources.Add(new TextLoader.Range(col)); + } + + [Obsolete("Should be deleted together with the Legacy project.")] + private LoadColumnAttribute(string start) + { + Sources = new List(); + Start = start; + } + + internal List Sources; + + [Obsolete("Should be deleted together with the Legacy project.")] + [BestFriend] + internal string Start { get; } + } +#pragma warning restore 618 +} diff --git a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs index 2396343602..0e92d1f6ce 100644 --- a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs +++ b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs @@ -12,6 +12,7 @@ using System; using System.Collections.Generic; using System.Linq; +using System.Reflection; using System.Text; using Float = System.Single; @@ -1362,6 +1363,76 @@ public void Save(ModelSaveContext ctx) public IDataView Read(string path) => Read(new MultiFileSource(path)); + internal static TextLoader CreateTextReader(IHostEnvironment host, + bool hasHeader = DefaultArguments.HasHeader, + char separator = DefaultArguments.Separator, + bool allowQuotedStrings = DefaultArguments.AllowQuoting, + bool supportSparse = DefaultArguments.AllowSparse, + bool trimWhitespace = DefaultArguments.TrimWhitespace) + { + var userType = typeof(TInput); + + var fieldInfos = 
userType.GetFields(BindingFlags.Public | BindingFlags.Instance); + + var propertyInfos = + userType + .GetProperties(BindingFlags.Public | BindingFlags.Instance) + .Where(x => x.CanRead && x.CanWrite && x.GetGetMethod() != null && x.GetSetMethod() != null && x.GetIndexParameters().Length == 0); + + var memberInfos = (fieldInfos as IEnumerable).Concat(propertyInfos).ToArray(); + + var columns = new List(); + + for (int index = 0; index < memberInfos.Length; index++) + { + var memberInfo = memberInfos[index]; + var mappingAttr = memberInfo.GetCustomAttribute(); + + if(mappingAttr == null) + continue; + + var mappingAttrName = memberInfo.GetCustomAttribute(); + + var column = new Column(); + column.Name = mappingAttrName?.Name ?? memberInfo.Name; + column.Source = mappingAttr.Sources.ToArray(); + DataKind dk; + switch (memberInfo) + { + case FieldInfo field: + if (!DataKindExtensions.TryGetDataKind(field.FieldType.IsArray ? field.FieldType.GetElementType() : field.FieldType, out dk)) + throw Contracts.Except($"Field {memberInfo.Name} is of unsupported type."); + + break; + + case PropertyInfo property: + if (!DataKindExtensions.TryGetDataKind(property.PropertyType.IsArray ? 
property.PropertyType.GetElementType() : property.PropertyType, out dk)) + throw Contracts.Except($"Property {memberInfo.Name} is of unsupported type."); + break; + + default: + Contracts.Assert(false); + throw Contracts.ExceptNotSupp("Expected a FieldInfo or a PropertyInfo"); + } + + column.Type = dk; + + columns.Add(column); + } + + Arguments args = new Arguments + { + HasHeader = hasHeader, + SeparatorChars = new[] { separator }, + AllowQuoting = allowQuotedStrings, + AllowSparse = supportSparse, + TrimWhitespace = trimWhitespace, + Column = columns.ToArray() + }; + + return new TextLoader(host, args); + } + private sealed class BoundLoader : IDataLoader { private readonly TextLoader _reader; diff --git a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderSaverCatalog.cs b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderSaverCatalog.cs index 0efa4b9c3b..3c19223663 100644 --- a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderSaverCatalog.cs +++ b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderSaverCatalog.cs @@ -2,16 +2,10 @@ // The .NET Foundation licenses this file to you under the MIT license. // See the LICENSE file in the project root for more information. 
-using Microsoft.ML.Data; using Microsoft.ML.Runtime; using Microsoft.ML.Runtime.Data; using Microsoft.ML.Runtime.Data.IO; -using System; -using System.Collections.Generic; using System.IO; -using System.Linq; -using System.Reflection; -using System.Text.RegularExpressions; namespace Microsoft.ML { @@ -64,67 +58,7 @@ public static TextLoader CreateTextReader(this DataOperations catalog, bool allowQuotedStrings = TextLoader.DefaultArguments.AllowQuoting, bool supportSparse = TextLoader.DefaultArguments.AllowSparse, bool trimWhitespace = TextLoader.DefaultArguments.TrimWhitespace) - { - var userType = typeof(TInput); - - var fieldInfos = userType.GetFields(BindingFlags.Public | BindingFlags.Instance); - - var propertyInfos = - userType - .GetProperties(BindingFlags.Public | BindingFlags.Instance) - .Where(x => x.CanRead && x.CanWrite && x.GetGetMethod() != null && x.GetSetMethod() != null && x.GetIndexParameters().Length == 0); - - var memberInfos = (fieldInfos as IEnumerable).Concat(propertyInfos).ToArray(); - - var columns = new TextLoader.Column[memberInfos.Length]; - - for (int index = 0; index < memberInfos.Length; index++) - { - var memberInfo = memberInfos[index]; - var mappingAttr = memberInfo.GetCustomAttribute(); - var mptr = memberInfo.GetCustomAttributes(); - - Contracts.Assert(mappingAttr != null, $"Field or property {memberInfo.Name} is missing the LoadColumn attribute"); - - var column = new TextLoader.Column(); - column.Name = mappingAttr.Name ?? memberInfo.Name; - column.Source = mappingAttr.Sources.ToArray(); - DataKind dk; - switch (memberInfo) - { - case FieldInfo field: - if (!DataKindExtensions.TryGetDataKind(field.FieldType.IsArray ? field.FieldType.GetElementType() : field.FieldType, out dk)) - throw Contracts.Except($"Field {memberInfo.Name} is of unsupported type."); - - break; - - case PropertyInfo property: - if (!DataKindExtensions.TryGetDataKind(property.PropertyType.IsArray ? 
property.PropertyType.GetElementType() : property.PropertyType, out dk)) - throw Contracts.Except($"Property {memberInfo.Name} is of unsupported type."); - break; - - default: - Contracts.Assert(false); - throw Contracts.ExceptNotSupp("Expected a FieldInfo or a PropertyInfo"); - } - - column.Type = dk; - - columns[index] = column; - } - - TextLoader.Arguments args = new TextLoader.Arguments - { - HasHeader = hasHeader, - SeparatorChars = new[] { separator }, - AllowQuoting = allowQuotedStrings, - AllowSparse = supportSparse, - TrimWhitespace = trimWhitespace, - Column = columns - }; - - return new TextLoader(CatalogUtils.GetEnvironment(catalog), args); - } + => TextLoader.CreateTextReader(CatalogUtils.GetEnvironment(catalog), hasHeader, separator, allowQuotedStrings, supportSparse, trimWhitespace); /// /// Read a data view from a text file using . @@ -138,8 +72,8 @@ public static TextLoader CreateTextReader(this DataOperations catalog, public static IDataView ReadFromTextFile(this DataOperations catalog, string path, TextLoader.Column[] columns, - bool hasHeader = false, - char separatorChar = '\t') + bool hasHeader = TextLoader.DefaultArguments.HasHeader, + char separatorChar = TextLoader.DefaultArguments.Separator) { Contracts.CheckNonEmpty(path, nameof(path)); @@ -151,6 +85,39 @@ public static IDataView ReadFromTextFile(this DataOperations catalog, return reader.Read(new MultiFileSource(path)); } + /// + /// Read a data view from a text file using . + /// + /// The catalog. + /// Does the file contains header? + /// Column separator character. Default is '\t' + /// Whether the input may include quoted values, + /// which can contain separator characters, colons, + /// and distinguish empty values from missing values. When true, consecutive separators + /// denote a missing value and an empty value is denoted by \"\". + /// When false, consecutive separators denote an empty value. 
+ /// Whether the input may include sparse representations for example, + /// if one of the row contains "5 2:6 4:3" that's mean there are 5 columns all zero + /// except for 3rd and 5th columns which have values 6 and 3 + /// Remove trailing whitespace from lines + /// The path to the file. + /// The data view. + public static IDataView ReadFromTextFile(this DataOperations catalog, + string path, + bool hasHeader = TextLoader.DefaultArguments.HasHeader, + char separator = TextLoader.DefaultArguments.Separator, + bool allowQuotedStrings = TextLoader.DefaultArguments.AllowQuoting, + bool supportSparse = TextLoader.DefaultArguments.AllowSparse, + bool trimWhitespace = TextLoader.DefaultArguments.TrimWhitespace) + { + Contracts.CheckNonEmpty(path, nameof(path)); + + // REVIEW: it is almost always a mistake to have a 'trainable' text loader here. + // Therefore, we are going to disallow data sample. + return TextLoader.CreateTextReader(CatalogUtils.GetEnvironment(catalog), hasHeader, separator, allowQuotedStrings, supportSparse, trimWhitespace) + .Read(new MultiFileSource(path)); + } + /// /// Read a data view from a text file using . /// diff --git a/src/Microsoft.ML.Legacy/Data/TextLoader.cs b/src/Microsoft.ML.Legacy/Data/TextLoader.cs index 14e0d63bd5..ee8ff63831 100644 --- a/src/Microsoft.ML.Legacy/Data/TextLoader.cs +++ b/src/Microsoft.ML.Legacy/Data/TextLoader.cs @@ -95,7 +95,7 @@ public TextLoader CreateFrom(bool useHeader = false, $"Valid characters are 0-9, *, - and ~"); var mappingNameAttr = memberInfo.GetCustomAttribute(); - var name = mappingAttr.Name ?? mappingNameAttr?.Name ?? memberInfo.Name; + var name = mappingNameAttr?.Name ?? 
memberInfo.Name; Runtime.Data.TextLoader.Range[] sources; if (!Runtime.Data.TextLoader.Column.TryParseSourceEx(mappingAttr.Start, out sources)) diff --git a/test/Microsoft.ML.FSharp.Tests/SmokeTests.fs b/test/Microsoft.ML.FSharp.Tests/SmokeTests.fs index 39f6bc62e1..f3ff3f1ab8 100644 --- a/test/Microsoft.ML.FSharp.Tests/SmokeTests.fs +++ b/test/Microsoft.ML.FSharp.Tests/SmokeTests.fs @@ -65,9 +65,9 @@ open Xunit module SmokeTest1 = type SentimentData() = - [] + [] val mutable SentimentText : string - [] + [] val mutable Sentiment : float32 type SentimentPrediction() = @@ -130,10 +130,10 @@ module SmokeTest2 = [] type SentimentData = - { [] + { [] SentimentText : string - [] + [] Sentiment : float32 } [] @@ -195,10 +195,10 @@ module SmokeTest2 = module SmokeTest3 = type SentimentData() = - [] + [] member val SentimentText = "".AsMemory() with get, set - [] + [] member val Sentiment = 0.0 with get, set type SentimentPrediction() = diff --git a/test/Microsoft.ML.Tests/LearningPipelineTests.cs b/test/Microsoft.ML.Tests/LearningPipelineTests.cs index b9e83d18fa..651d29a940 100644 --- a/test/Microsoft.ML.Tests/LearningPipelineTests.cs +++ b/test/Microsoft.ML.Tests/LearningPipelineTests.cs @@ -49,7 +49,7 @@ public void CanAddAndRemoveFromPipeline() private class InputData { - [LoadColumn(ordinal: 1)] + [LoadColumn(columnIndex: 1)] public string F1; } diff --git a/test/Microsoft.ML.Tests/Scenarios/Api/ApiScenariosTests.cs b/test/Microsoft.ML.Tests/Scenarios/Api/ApiScenariosTests.cs index 712bb959c9..47b33e313f 100644 --- a/test/Microsoft.ML.Tests/Scenarios/Api/ApiScenariosTests.cs +++ b/test/Microsoft.ML.Tests/Scenarios/Api/ApiScenariosTests.cs @@ -19,7 +19,7 @@ public ApiScenariosTests(ITestOutputHelper output) : base(output) public class IrisData : IrisDataNoLabel { - [LoadColumn(4, name: "Label")] + [LoadColumn(4), ColumnName("Label")] public string Label; } @@ -46,7 +46,7 @@ public class IrisPrediction public class SentimentData { - [LoadColumn(0, name: "Label")] + 
[LoadColumn(0), ColumnName("Label")] public bool Sentiment; [LoadColumn(1)] diff --git a/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamplesDynamicApi.cs b/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamplesDynamicApi.cs index 16d4248e24..51803ae899 100644 --- a/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamplesDynamicApi.cs +++ b/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamplesDynamicApi.cs @@ -40,15 +40,8 @@ private void IntermediateData(string dataPath) // as a catalog of available operations and as the source of randomness. var mlContext = new MLContext(); - // Create the reader: define the data columns and where to find them in the text file. - var reader = mlContext.Data.CreateTextReader(new[] { - // A boolean column depicting the 'label'. - new TextLoader.Column("IsOver50K", DataKind.BL, 0), - // Three text columns. - new TextLoader.Column("Workclass", DataKind.TX, 1), - new TextLoader.Column("Education", DataKind.TX, 2), - new TextLoader.Column("MaritalStatus", DataKind.TX, 3) - }, + // Read the data into a data view. + var data = mlContext.Data.ReadFromTextFile(dataPath, // First line of the file is a header, not a data row. hasHeader: true ); @@ -57,10 +50,6 @@ private void IntermediateData(string dataPath) // together into one. var dynamicPipeline = mlContext.Transforms.Concatenate("AllFeatures", "Education", "MaritalStatus"); - // Let's verify that the data has been read correctly. - // First, we read the data file. - var data = reader.Read(dataPath); - // Fit our data pipeline and transform data with it. var transformedData = dynamicPipeline.Fit(data).Transform(data); @@ -90,23 +79,14 @@ private void TrainRegression(string trainDataPath, string testDataPath, string m var mlContext = new MLContext(); // Step one: read the data as an IDataView. - // First, we define the reader: specify the data columns and where to find them in the text file. 
- var reader = mlContext.Data.CreateTextReader(new[] { - // We read the first 11 values as a single float vector. - new TextLoader.Column("FeatureVector", DataKind.R4, 0, 10), - - // Separately, read the target variable. - new TextLoader.Column("Target", DataKind.R4, 11), - }, + // Read the file (remember though, readers are lazy, so the actual reading will happen when the data is accessed). + var trainData = mlContext.Data.ReadFromTextFile(trainDataPath, // First line of the file is a header, not a data row. hasHeader: true, // Default separator is tab, but we need a semicolon. - separatorChar: ';' + separator: ';' ); - // Now read the file (remember though, readers are lazy, so the actual reading will happen when the data is accessed). - var trainData = reader.Read(trainDataPath); - // Sometime, caching data in-memory after its first access can save some loading time when the data is going to be used // several times somewhere. The caching mechanism is also lazy; it only caches things after being used. // User can replace all the subsequently uses of "trainData" with "cachedTrainData". We still use "trainData" because @@ -136,7 +116,13 @@ private void TrainRegression(string trainDataPath, string testDataPath, string m var model = dynamicPipeline.Fit(trainData); // Read the test dataset. - var testData = reader.Read(testDataPath); + var testData = mlContext.Data.ReadFromTextFile(testDataPath, + // First line of the file is a header, not a data row. + hasHeader: true, + // Default separator is tab, but we need a semicolon. + separator: ';' + ); + // Calculate metrics of the model on the test data. var metrics = mlContext.Regression.Evaluate(model.Transform(testData), label: "Target"); @@ -166,29 +152,19 @@ private ITransformer TrainOnIris(string irisDataPath) var mlContext = new MLContext(); // Step one: read the data as an IDataView. - // First, we define the reader: specify the data columns and where to find them in the text file. 
- var reader = mlContext.Data.CreateTextReader(new[] { - new TextLoader.Column("SepalLength", DataKind.R4, 0), - new TextLoader.Column("SepalWidth", DataKind.R4, 1), - new TextLoader.Column("PetalLength", DataKind.R4, 2), - new TextLoader.Column("PetalWidth", DataKind.R4, 3), - // Label: kind of iris. - new TextLoader.Column("Label", DataKind.TX, 4), - }, + // Retrieve the training data. + var trainData = mlContext.Data.ReadFromTextFile(irisDataPath, // Default separator is tab, but the dataset has comma. - separatorChar: ',' + separator: ',' ); - // Retrieve the training data. - var trainData = reader.Read(irisDataPath); - // Build the training pipeline. var dynamicPipeline = // Concatenate all the features together into one column 'Features'. mlContext.Transforms.Concatenate("Features", "SepalLength", "SepalWidth", "PetalLength", "PetalWidth") // Note that the label is text, so it needs to be converted to key. .Append(mlContext.Transforms.Conversion.MapValueToKey("Label"), TransformerScope.TrainTest) - // Cache data in moemory for steps after the cache check point stage. + // Cache data in memory for steps after the cache check point stage. .AppendCacheCheckpoint(mlContext) // Use the multi-class SDCA model to predict the label using features. .Append(mlContext.MulticlassClassification.Trainers.StochasticDualCoordinateAscent()) @@ -233,20 +209,12 @@ private void NormalizationWorkout(string dataPath) // as a catalog of available operations and as the source of randomness. var mlContext = new MLContext(); - // Define the reader: specify the data columns and where to find them in the text file. - var reader = mlContext.Data.CreateTextReader(new[] { - // The four features of the Iris dataset will be grouped together as one Features column. - new TextLoader.Column("Features", DataKind.R4, 0, 3), - // Label: kind of iris. - new TextLoader.Column("Label", DataKind.TX, 4), - }, + // Read the training data. 
+ var trainData = mlContext.Data.ReadFromTextFile(dataPath, // Default separator is tab, but the dataset has comma. - separatorChar: ',' + separator: ',' ); - // Read the training data. - var trainData = reader.Read(dataPath); - // Apply all kinds of standard ML.NET normalization to the raw features. var pipeline = mlContext.Transforms.Normalize( @@ -265,17 +233,6 @@ private void NormalizationWorkout(string dataPath) public void Normalization() => NormalizationWorkout(GetDataPath("iris.data")); - private class IrisInput - { - // Unfortunately, we still need the dummy 'Label' column to be present. - [ColumnName("Label")] - public string IgnoredLabel { get; set; } - public float SepalLength { get; set; } - public float SepalWidth { get; set; } - public float PetalLength { get; set; } - public float PetalWidth { get; set; } - } - private IEnumerable GetChurnInfo() { var r = new Random(454); @@ -425,24 +382,11 @@ private void CrossValidationOn(string dataPath) var mlContext = new MLContext(); // Step one: read the data as an IDataView. - // First, we define the reader: specify the data columns and where to find them in the text file. - var reader = mlContext.Data.CreateTextReader(new[] - { - // We read the first 11 values as a single float vector. - new TextLoader.Column("SepalLength", DataKind.R4, 0), - new TextLoader.Column("SepalWidth", DataKind.R4, 1), - new TextLoader.Column("PetalLength", DataKind.R4, 2), - new TextLoader.Column("PetalWidth", DataKind.R4, 3), - // Label: kind of iris. - new TextLoader.Column("Label", DataKind.TX, 4), - }, + var data = mlContext.Data.ReadFromTextFile(dataPath, // Default separator is tab, but the dataset has comma. - separatorChar: ',' + separator: ',' ); - // Read the data. - var data = reader.Read(dataPath); - // Build the training pipeline. var dynamicPipeline = // Concatenate all the features together into one column 'Features'. 
@@ -486,18 +430,10 @@ private void ReadDataDynamic(string dataPath) // as a catalog of available operations and as the source of randomness. var mlContext = new MLContext(); - // Create the reader: define the data columns and where to find them in the text file. - var reader = mlContext.Data.CreateTextReader(new[] { - // We read the first 10 values as a single float vector. - new TextLoader.Column("FeatureVector", DataKind.R4, new[] {new TextLoader.Range(0, 9)}), - // Separately, read the target variable. - new TextLoader.Column("Target", DataKind.R4, 10) - }, - // Default separator is tab, but we need a comma. - separatorChar: ',' ); - // Now read the file (remember though, readers are lazy, so the actual reading will happen when the data is accessed). - var data = reader.Read(dataPath); + var reader = mlContext.Data.ReadFromTextFile(dataPath, + // Default separator is tab, but we need a comma. + separator: ',' ); } // Define a class for all the input columns that we intend to consume. @@ -616,11 +552,58 @@ private class IrisPrediction private class InspectedRow { + [LoadColumn(0)] public bool IsOver50K { get; set; } + + [LoadColumn(1)] public string Workclass { get; set; } + + [LoadColumn(2)] public string Education { get; set; } + + [LoadColumn(3)] public string MaritalStatus { get; set; } + public string[] AllFeatures { get; set; } } + + private class IrisInput + { + // Unfortunately, we still need the dummy 'Label' column to be present. + [ColumnName("Label"), LoadColumn(4)] + public string IgnoredLabel { get; set; } + + [LoadColumn(0)] + public float SepalLength { get; set; } + + [LoadColumn(1)] + public float SepalWidth { get; set; } + + [LoadColumn(2)] + public float PetalLength { get; set; } + + [LoadColumn(3)] + public float PetalWidth { get; set; } + } + + private class IrisInputAllFeatures + { + // Unfortunately, we still need the dummy 'Label' column to be present. 
+ [ColumnName("Label"), LoadColumn(4)] + public string IgnoredLabel { get; set; } + + [LoadColumn(0, 3)] + public float Features { get; set; } + } + + private class AdultData + { + [LoadColumn(0, 10), ColumnName("FeatureVector")] + public float Features { get; set; } + + [LoadColumn(11)] + public float Target { get; set; } + } + } } diff --git a/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/CrossValidation.cs b/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/CrossValidation.cs index aac5cdcd37..057c9c3bf5 100644 --- a/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/CrossValidation.cs +++ b/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/CrossValidation.cs @@ -27,8 +27,8 @@ void New_CrossValidation() { var ml = new MLContext(seed: 1, conc: 1); - var data = ml.Data.CreateTextReader(hasHeader: true) - .Read(GetDataPath(TestDatasets.Sentiment.trainFilename)); + var data = ml.Data.ReadFromTextFile(GetDataPath(TestDatasets.Sentiment.trainFilename), hasHeader: true); + // Pipeline. var pipeline = ml.Transforms.Text.FeaturizeText("SentimentText", "Features") .Append(ml.BinaryClassification.Trainers.StochasticDualCoordinateAscent("Label", "Features", advancedSettings: (s) => { s.ConvergenceTolerance = 1f; s.NumThreads = 1; })); diff --git a/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/DecomposableTrainAndPredict.cs b/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/DecomposableTrainAndPredict.cs index 4b59b84552..8aa1f627c2 100644 --- a/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/DecomposableTrainAndPredict.cs +++ b/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/DecomposableTrainAndPredict.cs @@ -29,8 +29,7 @@ void New_DecomposableTrainAndPredict() var dataPath = GetDataPath(TestDatasets.irisData.trainFilename); var ml = new MLContext(); - var data = ml.Data.CreateTextReader(separator: ',') - .Read(dataPath); + var data = ml.Data.ReadFromTextFile(dataPath, separator: ','); var pipeline = new ColumnConcatenatingEstimator (ml, "Features", "SepalLength", 
"SepalWidth", "PetalLength", "PetalWidth") .Append(new ValueToKeyMappingEstimator(ml, "Label"), TransformerScope.TrainTest) diff --git a/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/IntrospectiveTraining.cs b/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/IntrospectiveTraining.cs index 49c30579e7..d0c40aaee4 100644 --- a/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/IntrospectiveTraining.cs +++ b/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/IntrospectiveTraining.cs @@ -34,8 +34,7 @@ public partial class ApiScenariosTests public void New_IntrospectiveTraining() { var ml = new MLContext(seed: 1, conc: 1); - var data = ml.Data.CreateTextReader(TestDatasets.Sentiment.GetLoaderColumns(), hasHeader: true) - .Read(GetDataPath(TestDatasets.Sentiment.trainFilename)); + var data = ml.Data.ReadFromTextFile(GetDataPath(TestDatasets.Sentiment.trainFilename), hasHeader: true); var pipeline = ml.Transforms.Text.FeaturizeText("SentimentText", "Features") .AppendCacheCheckpoint(ml) diff --git a/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/Metacomponents.cs b/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/Metacomponents.cs index 099a8f5484..91bc71df32 100644 --- a/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/Metacomponents.cs +++ b/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/Metacomponents.cs @@ -26,8 +26,7 @@ public partial class ApiScenariosTests public void New_Metacomponents() { var ml = new MLContext(); - var data = ml.Data.CreateTextReader(TestDatasets.irisData.GetLoaderColumns(), separatorChar: ',') - .Read(GetDataPath(TestDatasets.irisData.trainFilename)); + var data = ml.Data.ReadFromTextFile(GetDataPath(TestDatasets.irisData.trainFilename), separator: ','); var sdcaTrainer = ml.BinaryClassification.Trainers.StochasticDualCoordinateAscent("Label", "Features", advancedSettings: (s) => { s.MaxIterations = 100; s.Shuffle = true; s.NumThreads = 1; }); diff --git a/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/MultithreadedPrediction.cs 
b/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/MultithreadedPrediction.cs index 912b5f50d1..fea876b738 100644 --- a/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/MultithreadedPrediction.cs +++ b/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/MultithreadedPrediction.cs @@ -26,8 +26,7 @@ public partial class ApiScenariosTests void New_MultithreadedPrediction() { var ml = new MLContext(seed: 1, conc: 1); - var reader = ml.Data.CreateTextReader(TestDatasets.Sentiment.GetLoaderColumns(), hasHeader: true); - var data = reader.Read(new MultiFileSource(GetDataPath(TestDatasets.Sentiment.trainFilename))); + var data = ml.Data.ReadFromTextFile(GetDataPath(TestDatasets.Sentiment.trainFilename), hasHeader: true); // Pipeline. var pipeline = ml.Transforms.Text.FeaturizeText("SentimentText", "Features") @@ -41,7 +40,7 @@ void New_MultithreadedPrediction() var engine = model.MakePredictionFunction(ml); // Take a couple examples out of the test data and run predictions on top. - var testData = reader.Read(new MultiFileSource(GetDataPath(TestDatasets.Sentiment.testFilename))) + var testData = ml.Data.ReadFromTextFile(GetDataPath(TestDatasets.Sentiment.testFilename), hasHeader: true) .AsEnumerable(ml, false); Parallel.ForEach(testData, (input) => diff --git a/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/ReconfigurablePrediction.cs b/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/ReconfigurablePrediction.cs index 8d95868a4a..a3d0ef5223 100644 --- a/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/ReconfigurablePrediction.cs +++ b/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/ReconfigurablePrediction.cs @@ -22,10 +22,10 @@ public partial class ApiScenariosTests public void New_ReconfigurablePrediction() { var ml = new MLContext(seed: 1, conc: 1); - var dataReader = ml.Data.CreateTextReader(TestDatasets.Sentiment.GetLoaderColumns(), hasHeader: true); + var dataReader = ml.Data.ReadFromTextFile(GetDataPath(TestDatasets.Sentiment.trainFilename), hasHeader: true); - var 
data = dataReader.Read(GetDataPath(TestDatasets.Sentiment.trainFilename)); - var testData = dataReader.Read(GetDataPath(TestDatasets.Sentiment.testFilename)); + var data = ml.Data.ReadFromTextFile(GetDataPath(TestDatasets.Sentiment.trainFilename), hasHeader: true); + var testData = ml.Data.ReadFromTextFile(GetDataPath(TestDatasets.Sentiment.testFilename), hasHeader: true); // Pipeline. var pipeline = ml.Transforms.Text.FeaturizeText("SentimentText", "Features") diff --git a/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/SimpleTrainAndPredict.cs b/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/SimpleTrainAndPredict.cs index d5b43f36dc..8f0c18ed9a 100644 --- a/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/SimpleTrainAndPredict.cs +++ b/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/SimpleTrainAndPredict.cs @@ -22,8 +22,8 @@ public partial class ApiScenariosTests public void New_SimpleTrainAndPredict() { var ml = new MLContext(seed: 1, conc: 1); - var reader = ml.Data.CreateTextReader(TestDatasets.Sentiment.GetLoaderColumns(), hasHeader: true); - var data = reader.Read(GetDataPath(TestDatasets.Sentiment.trainFilename)); + var data = ml.Data.ReadFromTextFile(GetDataPath(TestDatasets.Sentiment.trainFilename), hasHeader: true); + // Pipeline. var pipeline = ml.Transforms.Text.FeaturizeText("SentimentText", "Features") .AppendCacheCheckpoint(ml) @@ -36,7 +36,7 @@ public void New_SimpleTrainAndPredict() var engine = model.MakePredictionFunction(ml); // Take a couple examples out of the test data and run predictions on top. 
- var testData = reader.Read(GetDataPath(TestDatasets.Sentiment.testFilename)) + var testData = ml.Data.ReadFromTextFile(GetDataPath(TestDatasets.Sentiment.testFilename), hasHeader: true) .AsEnumerable(ml, false); foreach (var input in testData.Take(5)) { diff --git a/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/TrainSaveModelAndPredict.cs b/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/TrainSaveModelAndPredict.cs index 4d1606ee9e..fd9d856e2a 100644 --- a/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/TrainSaveModelAndPredict.cs +++ b/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/TrainSaveModelAndPredict.cs @@ -24,8 +24,7 @@ public partial class ApiScenariosTests public void New_TrainSaveModelAndPredict() { var ml = new MLContext(seed: 1, conc: 1); - var reader = ml.Data.CreateTextReader(TestDatasets.Sentiment.GetLoaderColumns(), hasHeader: true); - var data = reader.Read(GetDataPath(TestDatasets.Sentiment.trainFilename)); + var data = ml.Data.ReadFromTextFile(GetDataPath(TestDatasets.Sentiment.trainFilename), hasHeader: true); // Pipeline. var pipeline = ml.Transforms.Text.FeaturizeText("SentimentText", "Features") @@ -49,7 +48,7 @@ public void New_TrainSaveModelAndPredict() var engine = loadedModel.MakePredictionFunction(ml); // Take a couple examples out of the test data and run predictions on top. 
- var testData = reader.Read(GetDataPath(TestDatasets.Sentiment.testFilename)) + var testData = ml.Data.ReadFromTextFile(GetDataPath(TestDatasets.Sentiment.testFilename), hasHeader: true) .AsEnumerable(ml, false); foreach (var input in testData.Take(5)) { diff --git a/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/TrainWithInitialPredictor.cs b/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/TrainWithInitialPredictor.cs index a471bb7858..6b10338782 100644 --- a/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/TrainWithInitialPredictor.cs +++ b/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/TrainWithInitialPredictor.cs @@ -22,7 +22,7 @@ public void New_TrainWithInitialPredictor() var ml = new MLContext(seed: 1, conc: 1); - var data = ml.Data.CreateTextReader(TestDatasets.Sentiment.GetLoaderColumns(), hasHeader: true).Read(GetDataPath(TestDatasets.Sentiment.trainFilename)); + var data = ml.Data.ReadFromTextFile(GetDataPath(TestDatasets.Sentiment.trainFilename), hasHeader: true); // Pipeline. var pipeline = ml.Transforms.Text.FeaturizeText("SentimentText", "Features"); diff --git a/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/TrainWithValidationSet.cs b/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/TrainWithValidationSet.cs index 14593d0b85..b5550c2f73 100644 --- a/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/TrainWithValidationSet.cs +++ b/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/TrainWithValidationSet.cs @@ -20,14 +20,14 @@ public void New_TrainWithValidationSet() { var ml = new MLContext(seed: 1, conc: 1); // Pipeline. - var reader = ml.Data.CreateTextReader(TestDatasets.Sentiment.GetLoaderColumns(), hasHeader: true); + var data = ml.Data.ReadFromTextFile(GetDataPath(TestDatasets.Sentiment.trainFilename), hasHeader: true); var pipeline = ml.Transforms.Text.FeaturizeText("SentimentText", "Features"); // Train the pipeline, prepare train and validation set. 
- var data = reader.Read(GetDataPath(TestDatasets.Sentiment.trainFilename)); var preprocess = pipeline.Fit(data); var trainData = preprocess.Transform(data); - var validData = preprocess.Transform(reader.Read(GetDataPath(TestDatasets.Sentiment.testFilename))); + var validDataSource = ml.Data.ReadFromTextFile(GetDataPath(TestDatasets.Sentiment.testFilename), hasHeader: true); + var validData = preprocess.Transform(validDataSource); // Train model with validation set. var trainer = ml.BinaryClassification.Trainers.FastTree("Label","Features"); diff --git a/test/Microsoft.ML.Tests/Scenarios/ClusteringTests.cs b/test/Microsoft.ML.Tests/Scenarios/ClusteringTests.cs index da6bbd5d00..049ff574d1 100644 --- a/test/Microsoft.ML.Tests/Scenarios/ClusteringTests.cs +++ b/test/Microsoft.ML.Tests/Scenarios/ClusteringTests.cs @@ -50,7 +50,7 @@ public class NewsData [LoadColumn(0)] public string Id; - [LoadColumn(1, name: "Label")] + [LoadColumn(1) , ColumnName("Label")] public string Topic; [LoadColumn(2)] diff --git a/test/Microsoft.ML.Tests/Scenarios/HousePricePredictionTests.cs b/test/Microsoft.ML.Tests/Scenarios/HousePricePredictionTests.cs index b724fc23f2..ef8f704f4d 100644 --- a/test/Microsoft.ML.Tests/Scenarios/HousePricePredictionTests.cs +++ b/test/Microsoft.ML.Tests/Scenarios/HousePricePredictionTests.cs @@ -59,7 +59,7 @@ public class HousePriceData [LoadColumn(1)] public string Date; - [LoadColumn(2, name: "Label")] + [LoadColumn(2), ColumnName("Label")] public float Price; [LoadColumn(3)] diff --git a/test/Microsoft.ML.Tests/Scenarios/IrisPlantClassificationWithStringLabelTests.cs b/test/Microsoft.ML.Tests/Scenarios/IrisPlantClassificationWithStringLabelTests.cs index a1ecf43e67..6ad0059032 100644 --- a/test/Microsoft.ML.Tests/Scenarios/IrisPlantClassificationWithStringLabelTests.cs +++ b/test/Microsoft.ML.Tests/Scenarios/IrisPlantClassificationWithStringLabelTests.cs @@ -139,7 +139,7 @@ public class IrisDataWithStringLabel [LoadColumn(3)] public float PetalWidth; - 
[LoadColumn(4, name: "Label")] + [LoadColumn(4), ColumnName("Label")] public string IrisPlantType; } } diff --git a/test/Microsoft.ML.Tests/Scenarios/PipelineApi/PipelineApiScenarioTests.cs b/test/Microsoft.ML.Tests/Scenarios/PipelineApi/PipelineApiScenarioTests.cs index 8925c9ecec..488827881c 100644 --- a/test/Microsoft.ML.Tests/Scenarios/PipelineApi/PipelineApiScenarioTests.cs +++ b/test/Microsoft.ML.Tests/Scenarios/PipelineApi/PipelineApiScenarioTests.cs @@ -47,9 +47,9 @@ public class IrisPrediction public class SentimentData { - [LoadColumn("0", name: "Label")] + [LoadColumn(0), ColumnName("Label")] public bool Sentiment; - [LoadColumn("1")] + [LoadColumn(1)] public string SentimentText; } diff --git a/test/Microsoft.ML.Tests/Scenarios/SentimentPredictionTests.cs b/test/Microsoft.ML.Tests/Scenarios/SentimentPredictionTests.cs index 4124c3d21d..542a3dab97 100644 --- a/test/Microsoft.ML.Tests/Scenarios/SentimentPredictionTests.cs +++ b/test/Microsoft.ML.Tests/Scenarios/SentimentPredictionTests.cs @@ -504,7 +504,7 @@ private IEnumerable GetTestData() public class SentimentData { - [LoadColumn(0, name: "Label")] + [LoadColumn(0), ColumnName("Label")] public float Sentiment; [LoadColumn(1)] public string SentimentText; diff --git a/test/Microsoft.ML.Tests/TextLoaderTests.cs b/test/Microsoft.ML.Tests/TextLoaderTests.cs index 32d26024e8..17cb187f65 100644 --- a/test/Microsoft.ML.Tests/TextLoaderTests.cs +++ b/test/Microsoft.ML.Tests/TextLoaderTests.cs @@ -401,7 +401,7 @@ public class ModelWithoutColumnAttribute public class ModelWithColumnNameAttribute { - [LoadColumn(0, "Col1")] + [LoadColumn(0), ColumnName("Col1")] public string String_1; [LoadColumn(1)] @@ -439,21 +439,12 @@ public class Iris public string Type; } - public class IrisAllOther - { - [LoadColumn(4, loadAllOthers: true)] - public string Features; - - [LoadColumn(4)] - public string Type; - } - public class IrisStartEnd { - [LoadColumn(start:"0", end:"3", name:"Features", columnIndexes:null)] + 
[LoadColumn(start:0, end:3), ColumnName("Features")] public float Features; - [LoadColumn(4, name: "Label")] + [LoadColumn(4), ColumnName("Label")] public string Type; } @@ -462,7 +453,7 @@ public class IrisColumnIndices [LoadColumn(columnIndexes: new[] { 0, 2 })] public float Features; - [LoadColumn(4, name: "Label")] + [LoadColumn(4), ColumnName("Label")] public string Type; } @@ -496,29 +487,13 @@ public void LoaderColumnsFromIrisData() Assert.Equal("Type", previewIris.RowView[0].Values[index].Key); Assert.Equal("Iris-setosa", previewIris.RowView[0].Values[index].Value.ToString()); - // Load allOther - var dataIrisAllOther = ml.Data.CreateTextReader(separator: ',').Read(dataPath); - var previewdataIrisAllOther = dataIrisAllOther.Preview(1); - - Assert.Equal(2, previewdataIrisAllOther.ColumnView.Length); - Assert.Equal("Features", previewdataIrisAllOther.RowView[0].Values[0].Key); - VBuffer featureValue = (VBuffer)previewdataIrisAllOther.RowView[0].Values[0].Value; - Assert.True(featureValue.IsDense); - Assert.Equal(4, featureValue.Length); - - foreach (var val in featureValue.GetValues()) - { - irisFirstRowValues.MoveNext(); - Assert.Equal(irisFirstRowValues.Current, val); - } - // Load with start and end indexes var dataIrisStartEnd = ml.Data.CreateTextReader(separator: ',').Read(dataPath); var previewIrisStartEnd = dataIrisStartEnd.Preview(1); Assert.Equal(2, previewIrisStartEnd.ColumnView.Length); Assert.Equal("Features", previewIrisStartEnd.RowView[0].Values[0].Key); - featureValue = (VBuffer)previewIrisStartEnd.RowView[0].Values[0].Value; + var featureValue = (VBuffer)previewIrisStartEnd.RowView[0].Values[0].Value; Assert.True(featureValue.IsDense); Assert.Equal(4, featureValue.Length); diff --git a/test/Microsoft.ML.Tests/TrainerEstimators/MetalinearEstimators.cs b/test/Microsoft.ML.Tests/TrainerEstimators/MetalinearEstimators.cs index 0fb1e167ac..2599df8bdd 100644 --- a/test/Microsoft.ML.Tests/TrainerEstimators/MetalinearEstimators.cs +++ 
b/test/Microsoft.ML.Tests/TrainerEstimators/MetalinearEstimators.cs @@ -70,7 +70,7 @@ public void Pkpd() } [Fact] - public void New_MetacomponentsFeaturesRenamed() + public void MetacomponentsFeaturesRenamed() { var data = new TextLoader(Env, TestDatasets.irisData.GetLoaderColumns(), separatorChar: ',') .Read(GetDataPath(TestDatasets.irisData.trainFilename)); From 55e1bdda0a3a3637d2821d04329953783a9150c6 Mon Sep 17 00:00:00 2001 From: Senja Filipi Date: Wed, 19 Dec 2018 13:41:23 -0800 Subject: [PATCH 07/11] post merge fixes --- .../Text/{LoadColumn.cs => LoadColumnAttribute.cs} | 4 ++-- test/Microsoft.ML.Tests/TextLoaderTests.cs | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) rename src/Microsoft.ML.Data/DataLoadSave/Text/{LoadColumn.cs => LoadColumnAttribute.cs} (91%) diff --git a/src/Microsoft.ML.Data/DataLoadSave/Text/LoadColumn.cs b/src/Microsoft.ML.Data/DataLoadSave/Text/LoadColumnAttribute.cs similarity index 91% rename from src/Microsoft.ML.Data/DataLoadSave/Text/LoadColumn.cs rename to src/Microsoft.ML.Data/DataLoadSave/Text/LoadColumnAttribute.cs index 3cac2c255e..a0e741a551 100644 --- a/src/Microsoft.ML.Data/DataLoadSave/Text/LoadColumn.cs +++ b/src/Microsoft.ML.Data/DataLoadSave/Text/LoadColumnAttribute.cs @@ -32,7 +32,7 @@ public LoadColumnAttribute(int columnIndex) /// The starting column index, for the range. /// The ending column index, for the range. public LoadColumnAttribute(int start, int end) - : this(start) //REVIEW this is incorrect, but it is just temporary there, until the Legacy API's TextLoader gets deleted. + : this(start.ToString()) //REVIEW this is incorrect, but it is just temporary there, until the Legacy API's TextLoader gets deleted. { Sources.Add(new TextLoader.Range(start, end)); } @@ -42,7 +42,7 @@ public LoadColumnAttribute(int start, int end) /// /// Distinct text file column indices to load as part of this column. 
public LoadColumnAttribute(int[] columnIndexes) - : this(columnIndexes[0]) // REVIEW: this is incorrect, but it is just temporary there, until the Legacy API's TextLoader gets deleted. + : this(columnIndexes[0].ToString()) // REVIEW: this is incorrect, but it is just temporary there, until the Legacy API's TextLoader gets deleted. { foreach (var col in columnIndexes) Sources.Add(new TextLoader.Range(col)); diff --git a/test/Microsoft.ML.Tests/TextLoaderTests.cs b/test/Microsoft.ML.Tests/TextLoaderTests.cs index 17cb187f65..5ab971e98d 100644 --- a/test/Microsoft.ML.Tests/TextLoaderTests.cs +++ b/test/Microsoft.ML.Tests/TextLoaderTests.cs @@ -476,8 +476,8 @@ public void LoaderColumnsFromIrisData() var previewIris = dataIris.Preview(1); Assert.Equal(5, previewIris.ColumnView.Length); - Assert.Equal("SepalLength", previewIris.Schema.GetColumnName(0)); - Assert.Equal(NumberType.R4, previewIris.Schema.GetColumnType(0)); + Assert.Equal("SepalLength", previewIris.Schema[0].Name); + Assert.Equal(NumberType.R4, previewIris.Schema[0].Type); int index = 0; foreach (var entry in irisFirstRow) { From 01fab7a80653c0c8b16441b2a50a7775b5d54faf Mon Sep 17 00:00:00 2001 From: Senja Filipi Date: Wed, 19 Dec 2018 14:20:39 -0800 Subject: [PATCH 08/11] this test is not needed if we don't process data models without the LoadColumn attribute. 
--- test/Microsoft.ML.Tests/TextLoaderTests.cs | 7 ------- 1 file changed, 7 deletions(-) diff --git a/test/Microsoft.ML.Tests/TextLoaderTests.cs b/test/Microsoft.ML.Tests/TextLoaderTests.cs index 5ab971e98d..04bd50cd4b 100644 --- a/test/Microsoft.ML.Tests/TextLoaderTests.cs +++ b/test/Microsoft.ML.Tests/TextLoaderTests.cs @@ -333,13 +333,6 @@ public void CanSuccessfullyTrimSpaces() } } - [Fact] - public void ThrowsExceptionWithPropertyName() - { - Exception ex = Assert.Throws(() => new Legacy.Data.TextLoader("fakefile.txt").CreateFrom()); - Assert.StartsWith("Field or property String1 is missing LoadColumn attribute", ex.Message); - } - [Fact] public void CanSuccessfullyColumnNameProperty() { From c540d7f0cb792288538293e17012e097ae220896 Mon Sep 17 00:00:00 2001 From: Senja Filipi Date: Thu, 20 Dec 2018 09:48:59 -0800 Subject: [PATCH 09/11] removing tabs from the cookbook. positioning comments in the LoadColumnAttribute. --- docs/code/MlNetCookBook.md | 44 +++++++++---------- .../DataLoadSave/Text/LoadColumnAttribute.cs | 8 +++- 2 files changed, 29 insertions(+), 23 deletions(-) diff --git a/docs/code/MlNetCookBook.md b/docs/code/MlNetCookBook.md index 51a827179c..4fb801e03a 100644 --- a/docs/code/MlNetCookBook.md +++ b/docs/code/MlNetCookBook.md @@ -137,19 +137,19 @@ You can also create a data model class, and read the data based on this type. // The data model. This type will be used through the document. 
private class InspectedRow { - [LoadColumn(0)] - public bool IsOver50K { get; set; } + [LoadColumn(0)] + public bool IsOver50K { get; set; } - [LoadColumn(1)] - public string Workclass { get; set; } + [LoadColumn(1)] + public string Workclass { get; set; } - [LoadColumn(2)] - public string Education { get; set; } + [LoadColumn(2)] + public string Education { get; set; } - [LoadColumn(3)] - public string MaritalStatus { get; set; } + [LoadColumn(3)] + public string MaritalStatus { get; set; } - public string[] AllFeatures { get; set; } + public string[] AllFeatures { get; set; } } // Create a new context for ML.NET operations. It can be used for exception tracking and logging, @@ -158,10 +158,10 @@ var mlContext = new MLContext(); // Read the data into a data view. var data = mlContext.Data.ReadFromTextFile(dataPath, - // First line of the file is a header, not a data row. - hasHeader: true + // First line of the file is a header, not a data row. + hasHeader: true ) - + ``` ## How do I load data from multiple files? @@ -280,11 +280,11 @@ Or by creating a data model for it: ```csharp private class AdultData { - [LoadColumn("0", "10"), ColumnName("Features")] - public float FeatureVector { get; } + [LoadColumn("0", "10"), ColumnName("Features")] + public float FeatureVector { get; } - [LoadColumn(11)] - public float Target { get; } + [LoadColumn(11)] + public float Target { get; } } // Create a new context for ML.NET operations. It can be used for exception tracking and logging, @@ -293,8 +293,8 @@ var mlContext = new MLContext(); // Read the data into a data view. var data = mlContext.Data.ReadFromTextFile(dataPath, - // First line of the file is a header, not a data row. - separator: ',' + // First line of the file is a header, not a data row. + separator: ',' ); ``` @@ -512,8 +512,8 @@ var mlContext = new MLContext(); // First, we define the reader: specify the data columns and where to find them in the text file. // Read the data into a data view. 
Remember though, readers are lazy, so the actual reading will happen when the data is accessed. var trainData = mlContext.Data.ReadFromTextFile(dataPath, - // First line of the file is a header, not a data row. - separator: ',' + // First line of the file is a header, not a data row. + separator: ',' ); // Sometime, caching data in-memory after its first access can save some loading time when the data is going to be used @@ -562,8 +562,8 @@ Calculating the metrics with the dynamic API is as follows. ```csharp // Read the test dataset. var testData = mlContext.Data.ReadFromTextFile(testDataPath, - // First line of the file is a header, not a data row. - separator: ',' + // First line of the file is a header, not a data row. + separator: ',' ); // Calculate metrics of the model on the test data. var metrics = mlContext.Regression.Evaluate(model.Transform(testData), label: "Target"); diff --git a/src/Microsoft.ML.Data/DataLoadSave/Text/LoadColumnAttribute.cs b/src/Microsoft.ML.Data/DataLoadSave/Text/LoadColumnAttribute.cs index a0e741a551..b5b0d3d626 100644 --- a/src/Microsoft.ML.Data/DataLoadSave/Text/LoadColumnAttribute.cs +++ b/src/Microsoft.ML.Data/DataLoadSave/Text/LoadColumnAttribute.cs @@ -20,6 +20,8 @@ public sealed class LoadColumnAttribute : Attribute /// Initializes new instance of . /// /// The index of the column in the text file. + // REVIEW: Remove calling the private constructor with just the start parameter, + // when the Legacy API's TextLoader gets deleted, and with it the Start field here. public LoadColumnAttribute(int columnIndex) : this(columnIndex.ToString()) { @@ -31,8 +33,10 @@ public LoadColumnAttribute(int columnIndex) /// /// The starting column index, for the range. /// The ending column index, for the range. + // REVIEW: Calling the private constructor with just the start parameter, is incorrect, + // but it is just temporary there, until the Legacy API's TextLoader gets deleted, together with the Start field. 
public LoadColumnAttribute(int start, int end) - : this(start.ToString()) //REVIEW this is incorrect, but it is just temporary there, until the Legacy API's TextLoader gets deleted. + : this(start.ToString()) { Sources.Add(new TextLoader.Range(start, end)); } @@ -41,6 +45,8 @@ public LoadColumnAttribute(int start, int end) /// Initializes new instance of . /// /// Distinct text file column indices to load as part of this column. + // REVIEW: Calling the private constructor with just the columnIndexes[0] parameter, is incorrect, + // but it is just temporary there, until the Legacy API's TextLoader gets deleted together with the Start field. public LoadColumnAttribute(int[] columnIndexes) : this(columnIndexes[0].ToString()) // REVIEW: this is incorrect, but it is just temporary there, until the Legacy API's TextLoader gets deleted. { From ce25b693bda9c73b47b9123356ac2f5870f2ffed Mon Sep 17 00:00:00 2001 From: Senja Filipi Date: Fri, 21 Dec 2018 09:04:00 -0800 Subject: [PATCH 10/11] addressing feedback --- docs/code/MlNetCookBook.md | 37 +++++++++++-------- .../DataLoadSave/Text/LoadColumnAttribute.cs | 2 + .../DataLoadSave/Text/TextLoader.cs | 5 +-- .../Text/TextLoaderSaverCatalog.cs | 24 ++++++------ src/Native/MatrixFactorizationNative/libmf | 2 +- .../CookbookSamplesDynamicApi.cs | 18 +++++---- .../Estimators/DecomposableTrainAndPredict.cs | 2 +- .../Api/Estimators/Metacomponents.cs | 2 +- test/Microsoft.ML.Tests/TextLoaderTests.cs | 6 +-- 9 files changed, 54 insertions(+), 44 deletions(-) diff --git a/docs/code/MlNetCookBook.md b/docs/code/MlNetCookBook.md index 4fb801e03a..908010c721 100644 --- a/docs/code/MlNetCookBook.md +++ b/docs/code/MlNetCookBook.md @@ -152,6 +152,11 @@ private class InspectedRow public string[] AllFeatures { get; set; } } +private class InspectedRowWithAllFeatures : InspectedRow +{ + public string[] AllFeatures { get; set; } +} + // Create a new context for ML.NET operations. 
It can be used for exception tracking and logging, // as a catalog of available operations and as the source of randomness. var mlContext = new MLContext(); @@ -247,7 +252,7 @@ var reader = mlContext.Data.CreateTextReader(ctx => ( Target: ctx.LoadFloat(11) ), // Default separator is tab, but we need a comma. - separator: ','); + separatorChar: ','); // Now read the file (remember though, readers are lazy, so the actual reading will happen when the data is accessed). @@ -269,7 +274,7 @@ var reader = mlContext.Data.CreateTextReader(new[] { new TextLoader.Column("Target", DataKind.R4, 11) }, // Default separator is tab, but we need a comma. - s => s.Separator = ","); + separatorChar: ','); // Now read the file (remember though, readers are lazy, so the actual reading will happen when the data is accessed). var data = reader.Read(dataPath); @@ -294,7 +299,7 @@ var mlContext = new MLContext(); // Read the data into a data view. var data = mlContext.Data.ReadFromTextFile(dataPath, // First line of the file is a header, not a data row. - separator: ',' + separatorChar: ',' ); ``` @@ -382,7 +387,7 @@ var transformedData = dataPipeline.Fit(data).Transform(data); // 'transformedData' is a 'promise' of data. Let's actually read it. var someRows = transformedData.AsDynamic // Convert to an enumerable of user-defined type. - .AsEnumerable(mlContext, reuseRowObject: false) + .AsEnumerable(mlContext, reuseRowObject: false) // Take a couple values as an array. .Take(4).ToArray(); @@ -421,7 +426,7 @@ var transformedData = dynamicPipeline.Fit(data).Transform(data); // 'transformedData' is a 'promise' of data. Let's actually read it. var someRows = transformedData // Convert to an enumerable of user-defined type. - .AsEnumerable(mlContext, reuseRowObject: false) + .AsEnumerable(mlContext, reuseRowObject: false) // Take a couple values as an array. .Take(4).ToArray(); @@ -465,7 +470,7 @@ var reader = mlContext.Data.CreateTextReader(ctx => ( // The data file has header. 
hasHeader: true, // Default separator is tab, but we need a semicolon. - separator: ';'); + separatorChar: ';'); // Now read the file (remember though, readers are lazy, so the actual reading will happen when the data is accessed). @@ -513,7 +518,7 @@ var mlContext = new MLContext(); // Read the data into a data view. Remember though, readers are lazy, so the actual reading will happen when the data is accessed. var trainData = mlContext.Data.ReadFromTextFile(dataPath, // First line of the file is a header, not a data row. - separator: ',' + separatorChar: ',' ); // Sometime, caching data in-memory after its first access can save some loading time when the data is going to be used @@ -563,7 +568,7 @@ Calculating the metrics with the dynamic API is as follows. // Read the test dataset. var testData = mlContext.Data.ReadFromTextFile(testDataPath, // First line of the file is a header, not a data row. - separator: ',' + separatorChar: ',' ); // Calculate metrics of the model on the test data. var metrics = mlContext.Regression.Evaluate(model.Transform(testData), label: "Target"); @@ -632,7 +637,7 @@ var reader = mlContext.Data.CreateTextReader(ctx => ( Label: ctx.LoadText(4) ), // Default separator is tab, but the dataset has comma. - separator: ','); + separatorChar: ','); // Retrieve the training data. var trainData = reader.Read(irisDataPath); @@ -674,7 +679,7 @@ var mlContext = new MLContext(); // Retrieve the training data. var trainData = mlContext.Data.ReadFromTextFile(irisDataPath, // Default separator is tab, but the dataset has comma. - separator: ',' + separatorChar: ',' ); // Build the training pipeline. @@ -838,7 +843,7 @@ var reader = mlContext.Data.CreateTextReader(ctx => ( Label: ctx.LoadText(4) ), // Default separator is tab, but the dataset has comma. - separator: ','); + separatorChar: ','); // Retrieve the training data. 
var trainData = reader.Read(dataPath); @@ -931,7 +936,7 @@ var reader = mlContext.Data.CreateTextReader(ctx => ( Label: ctx.LoadText(4) ), // Default separator is tab, but the dataset has comma. - separator: ','); + separatorChar: ','); // Read the training data. var trainData = reader.Read(dataPath); @@ -972,7 +977,7 @@ var mlContext = new MLContext(); // Read the training data. var trainData = mlContext.Data.ReadFromTextFile(dataPath, // Default separator is tab, but the dataset has comma. - separator: ',' + separatorChar: ',' ); // Apply all kinds of standard ML.NET normalization to the raw features. @@ -1290,7 +1295,7 @@ var reader = mlContext.Data.CreateTextReader(ctx => ( Label: ctx.LoadText(4) ), // Default separator is tab, but the dataset has comma. - separator: ','); + separatorChar: ','); // Read the data. var data = reader.Read(dataPath); @@ -1337,7 +1342,7 @@ var mlContext = new MLContext(); // Step one: read the data as an IDataView. var data = mlContext.Data.ReadFromTextFile(dataPath, // Default separator is tab, but the dataset has comma. - separator: ',' + separatorChar: ',' ); // Build the training pipeline. @@ -1397,7 +1402,7 @@ var reader = mlContext.Data.CreateTextReader(ctx => ( Label: ctx.LoadText(4) ), // Default separator is tab, but the dataset has comma. - separator: ','); + separatorChar: ','); // Read the data. var data = reader.Read(dataPath); diff --git a/src/Microsoft.ML.Data/DataLoadSave/Text/LoadColumnAttribute.cs b/src/Microsoft.ML.Data/DataLoadSave/Text/LoadColumnAttribute.cs index b5b0d3d626..fcf0cbae3f 100644 --- a/src/Microsoft.ML.Data/DataLoadSave/Text/LoadColumnAttribute.cs +++ b/src/Microsoft.ML.Data/DataLoadSave/Text/LoadColumnAttribute.cs @@ -8,6 +8,8 @@ namespace Microsoft.ML.Data { +// REVIEW: The Start field is decorated with [Obsolete], and this warning disables using Obsolete for this class. +// The Start field should get deleted together with the Legacy API. 
#pragma warning disable 618 /// /// Describes column information such as name and the source columns indices that this diff --git a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs index 8118aaaa2c..f47e2e8118 100644 --- a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs +++ b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs @@ -355,7 +355,7 @@ public class ArgumentsCore public int? InputSize; [Argument(ArgumentType.AtMostOnce, Visibility = ArgumentAttribute.VisibilityType.CmdLineOnly, HelpText = "Source column separator. Options: tab, space, comma, single character", ShortName = "sep")] - public string Separator = "tab"; //DefaultArguments.Separator + public string Separator = DefaultArguments.Separator.ToString(); [Argument(ArgumentType.AtMostOnce, Name = nameof(Separator), Visibility = ArgumentAttribute.VisibilityType.EntryPointsOnly, HelpText = "Source column separator.", ShortName = "sep")] public char[] SeparatorChars = new[] { DefaultArguments.Separator }; @@ -1387,8 +1387,7 @@ internal static TextLoader CreateTextReader(IHostEnvironment host, var memberInfo = memberInfos[index]; var mappingAttr = memberInfo.GetCustomAttribute(); - if(mappingAttr == null) - continue; + host.Assert(mappingAttr != null, $"Field or property {memberInfo.Name} is missing the {nameof(LoadColumnAttribute)} attribute"); var mappingAttrName = memberInfo.GetCustomAttribute(); diff --git a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderSaverCatalog.cs b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderSaverCatalog.cs index 3c19223663..0c8fc1b574 100644 --- a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderSaverCatalog.cs +++ b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderSaverCatalog.cs @@ -21,8 +21,8 @@ public static class TextLoaderSaverCatalog /// The optional location of a data sample. 
public static TextLoader CreateTextReader(this DataOperations catalog, TextLoader.Column[] columns, - bool hasHeader = false, - char separatorChar = '\t', + bool hasHeader = TextLoader.DefaultArguments.HasHeader, + char separatorChar = TextLoader.DefaultArguments.Separator, IMultiStreamSource dataSample = null) => new TextLoader(CatalogUtils.GetEnvironment(catalog), columns, hasHeader, separatorChar, dataSample); @@ -42,7 +42,7 @@ public static TextLoader CreateTextReader(this DataOperations catalog, /// /// The catalog. /// Does the file contains header? - /// Column separator character. Default is '\t' + /// Column separator character. Default is '\t' /// Whether the input may include quoted values, /// which can contain separator characters, colons, /// and distinguish empty values from missing values. When true, consecutive separators @@ -54,11 +54,11 @@ public static TextLoader CreateTextReader(this DataOperations catalog, /// Remove trailing whitespace from lines public static TextLoader CreateTextReader(this DataOperations catalog, bool hasHeader = TextLoader.DefaultArguments.HasHeader, - char separator = TextLoader.DefaultArguments.Separator, + char separatorChar = TextLoader.DefaultArguments.Separator, bool allowQuotedStrings = TextLoader.DefaultArguments.AllowQuoting, bool supportSparse = TextLoader.DefaultArguments.AllowSparse, bool trimWhitespace = TextLoader.DefaultArguments.TrimWhitespace) - => TextLoader.CreateTextReader(CatalogUtils.GetEnvironment(catalog), hasHeader, separator, allowQuotedStrings, supportSparse, trimWhitespace); + => TextLoader.CreateTextReader(CatalogUtils.GetEnvironment(catalog), hasHeader, separatorChar, allowQuotedStrings, supportSparse, trimWhitespace); /// /// Read a data view from a text file using . @@ -90,7 +90,7 @@ public static IDataView ReadFromTextFile(this DataOperations catalog, /// /// The catalog. /// Does the file contains header? - /// Column separator character. Default is '\t' + /// Column separator character. 
Default is '\t' /// Whether the input may include quoted values, /// which can contain separator characters, colons, /// and distinguish empty values from missing values. When true, consecutive separators @@ -105,7 +105,7 @@ public static IDataView ReadFromTextFile(this DataOperations catalog, public static IDataView ReadFromTextFile(this DataOperations catalog, string path, bool hasHeader = TextLoader.DefaultArguments.HasHeader, - char separator = TextLoader.DefaultArguments.Separator, + char separatorChar = TextLoader.DefaultArguments.Separator, bool allowQuotedStrings = TextLoader.DefaultArguments.AllowQuoting, bool supportSparse = TextLoader.DefaultArguments.AllowSparse, bool trimWhitespace = TextLoader.DefaultArguments.TrimWhitespace) @@ -114,7 +114,7 @@ public static IDataView ReadFromTextFile(this DataOperations catalog, // REVIEW: it is almost always a mistake to have a 'trainable' text loader here. // Therefore, we are going to disallow data sample. - return TextLoader.CreateTextReader(CatalogUtils.GetEnvironment(catalog), hasHeader, separator, allowQuotedStrings, supportSparse, trimWhitespace) + return TextLoader.CreateTextReader(CatalogUtils.GetEnvironment(catalog), hasHeader, separatorChar, allowQuotedStrings, supportSparse, trimWhitespace) .Read(new MultiFileSource(path)); } @@ -140,15 +140,15 @@ public static IDataView ReadFromTextFile(this DataOperations catalog, string pat /// The catalog. /// The data view to save. /// The stream to write to. - /// The column separator. + /// The column separator. /// Whether to write the header row. /// Whether to write the header comment with the schema. /// Whether to keep hidden columns in the dataset. 
public static void SaveAsText(this DataOperations catalog, IDataView data, Stream stream, - char separator = '\t', - bool headerRow = true, + char separatorChar = TextLoader.DefaultArguments.Separator, + bool headerRow = TextLoader.DefaultArguments.HasHeader, bool schema = true, bool keepHidden = false) { @@ -157,7 +157,7 @@ public static void SaveAsText(this DataOperations catalog, Contracts.CheckValue(stream, nameof(stream)); var env = catalog.GetEnvironment(); - var saver = new TextSaver(env, new TextSaver.Arguments { Separator = separator.ToString(), OutputHeader = headerRow, OutputSchema = schema }); + var saver = new TextSaver(env, new TextSaver.Arguments { Separator = separatorChar.ToString(), OutputHeader = headerRow, OutputSchema = schema }); using (var ch = env.Start("Saving data")) DataSaverUtils.SaveDataView(ch, saver, data, stream, keepHidden); diff --git a/src/Native/MatrixFactorizationNative/libmf b/src/Native/MatrixFactorizationNative/libmf index f92a18161b..1ecc365249 160000 --- a/src/Native/MatrixFactorizationNative/libmf +++ b/src/Native/MatrixFactorizationNative/libmf @@ -1 +1 @@ -Subproject commit f92a18161b6824fda4c4ab698a69d299a836841a +Subproject commit 1ecc365249e5cac5e72c66317a141298dc52f6e3 diff --git a/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamplesDynamicApi.cs b/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamplesDynamicApi.cs index 21c2613c4b..b0075f9975 100644 --- a/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamplesDynamicApi.cs +++ b/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamplesDynamicApi.cs @@ -56,7 +56,7 @@ private void IntermediateData(string dataPath) // 'transformedData' is a 'promise' of data. Let's actually read it. var someRows = transformedData // Convert to an enumerable of user-defined type. - .AsEnumerable(mlContext, reuseRowObject: false) + .AsEnumerable(mlContext, reuseRowObject: false) // Take a couple values as an array. 
.Take(4).ToArray(); @@ -84,7 +84,7 @@ private void TrainRegression(string trainDataPath, string testDataPath, string m // First line of the file is a header, not a data row. hasHeader: true, // Default separator is tab, but we need a semicolon. - separator: ';' + separatorChar: ';' ); // Sometime, caching data in-memory after its first access can save some loading time when the data is going to be used @@ -120,7 +120,7 @@ private void TrainRegression(string trainDataPath, string testDataPath, string m // First line of the file is a header, not a data row. hasHeader: true, // Default separator is tab, but we need a semicolon. - separator: ';' + separatorChar: ';' ); // Calculate metrics of the model on the test data. @@ -155,7 +155,7 @@ private ITransformer TrainOnIris(string irisDataPath) // Retrieve the training data. var trainData = mlContext.Data.ReadFromTextFile(irisDataPath, // Default separator is tab, but the dataset has comma. - separator: ',' + separatorChar: ',' ); // Build the training pipeline. @@ -212,7 +212,7 @@ private void NormalizationWorkout(string dataPath) // Read the training data. var trainData = mlContext.Data.ReadFromTextFile(dataPath, // Default separator is tab, but the dataset has comma. - separator: ',' + separatorChar: ',' ); // Apply all kinds of standard ML.NET normalization to the raw features. @@ -384,7 +384,7 @@ private void CrossValidationOn(string dataPath) // Step one: read the data as an IDataView. var data = mlContext.Data.ReadFromTextFile(dataPath, // Default separator is tab, but the dataset has comma. - separator: ',' + separatorChar: ',' ); // Build the training pipeline. @@ -433,7 +433,7 @@ private void ReadDataDynamic(string dataPath) // Now read the file (remember though, readers are lazy, so the actual reading will happen when the data is accessed). var reader = mlContext.Data.ReadFromTextFile(dataPath, // Default separator is tab, but we need a comma. 
- separator: ',' ); + separatorChar: ',' ); } // Define a class for all the input columns that we intend to consume. @@ -564,6 +564,10 @@ private class InspectedRow [LoadColumn(3)] public string MaritalStatus { get; set; } + } + + private class InspectedRowWithAllFeatures : InspectedRow + { public string[] AllFeatures { get; set; } } diff --git a/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/DecomposableTrainAndPredict.cs b/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/DecomposableTrainAndPredict.cs index 5d02d19eea..3fe1bf3db9 100644 --- a/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/DecomposableTrainAndPredict.cs +++ b/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/DecomposableTrainAndPredict.cs @@ -29,7 +29,7 @@ void New_DecomposableTrainAndPredict() var dataPath = GetDataPath(TestDatasets.irisData.trainFilename); var ml = new MLContext(); - var data = ml.Data.ReadFromTextFile(dataPath, separator: ','); + var data = ml.Data.ReadFromTextFile(dataPath, separatorChar: ','); var pipeline = new ColumnConcatenatingEstimator (ml, "Features", "SepalLength", "SepalWidth", "PetalLength", "PetalWidth") .Append(new ValueToKeyMappingEstimator(ml, "Label"), TransformerScope.TrainTest) diff --git a/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/Metacomponents.cs b/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/Metacomponents.cs index 91bc71df32..410a26f6fa 100644 --- a/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/Metacomponents.cs +++ b/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/Metacomponents.cs @@ -26,7 +26,7 @@ public partial class ApiScenariosTests public void New_Metacomponents() { var ml = new MLContext(); - var data = ml.Data.ReadFromTextFile(GetDataPath(TestDatasets.irisData.trainFilename), separator: ','); + var data = ml.Data.ReadFromTextFile(GetDataPath(TestDatasets.irisData.trainFilename), separatorChar: ','); var sdcaTrainer = ml.BinaryClassification.Trainers.StochasticDualCoordinateAscent("Label", "Features", advancedSettings: (s) => { 
s.MaxIterations = 100; s.Shuffle = true; s.NumThreads = 1; }); diff --git a/test/Microsoft.ML.Tests/TextLoaderTests.cs b/test/Microsoft.ML.Tests/TextLoaderTests.cs index 04bd50cd4b..e23f4c8c80 100644 --- a/test/Microsoft.ML.Tests/TextLoaderTests.cs +++ b/test/Microsoft.ML.Tests/TextLoaderTests.cs @@ -465,7 +465,7 @@ public void LoaderColumnsFromIrisData() var irisFirstRowValues = irisFirstRow.Values.GetEnumerator(); // Simple load - var dataIris = ml.Data.CreateTextReader(separator: ',').Read(dataPath); + var dataIris = ml.Data.CreateTextReader(separatorChar: ',').Read(dataPath); var previewIris = dataIris.Preview(1); Assert.Equal(5, previewIris.ColumnView.Length); @@ -481,7 +481,7 @@ public void LoaderColumnsFromIrisData() Assert.Equal("Iris-setosa", previewIris.RowView[0].Values[index].Value.ToString()); // Load with start and end indexes - var dataIrisStartEnd = ml.Data.CreateTextReader(separator: ',').Read(dataPath); + var dataIrisStartEnd = ml.Data.CreateTextReader(separatorChar: ',').Read(dataPath); var previewIrisStartEnd = dataIrisStartEnd.Preview(1); Assert.Equal(2, previewIrisStartEnd.ColumnView.Length); @@ -498,7 +498,7 @@ public void LoaderColumnsFromIrisData() } // load setting the distinct columns. Loading column 0 and 2 - var dataIrisColumnIndices = ml.Data.CreateTextReader(separator: ',').Read(dataPath); + var dataIrisColumnIndices = ml.Data.CreateTextReader(separatorChar: ',').Read(dataPath); var previewIrisColumnIndices = dataIrisColumnIndices.Preview(1); Assert.Equal(2, previewIrisColumnIndices.ColumnView.Length); From 0392712eb0e0e2a117dbaa6952b1520a8653980f Mon Sep 17 00:00:00 2001 From: Senja Filipi Date: Fri, 21 Dec 2018 09:11:24 -0800 Subject: [PATCH 11/11] adding back the test about the exception. 
Reverting the changes to libmf --- src/Native/MatrixFactorizationNative/libmf | 2 +- test/Microsoft.ML.Tests/TextLoaderTests.cs | 7 +++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/src/Native/MatrixFactorizationNative/libmf b/src/Native/MatrixFactorizationNative/libmf index 1ecc365249..f92a18161b 160000 --- a/src/Native/MatrixFactorizationNative/libmf +++ b/src/Native/MatrixFactorizationNative/libmf @@ -1 +1 @@ -Subproject commit 1ecc365249e5cac5e72c66317a141298dc52f6e3 +Subproject commit f92a18161b6824fda4c4ab698a69d299a836841a diff --git a/test/Microsoft.ML.Tests/TextLoaderTests.cs b/test/Microsoft.ML.Tests/TextLoaderTests.cs index e23f4c8c80..d425bd8bdf 100644 --- a/test/Microsoft.ML.Tests/TextLoaderTests.cs +++ b/test/Microsoft.ML.Tests/TextLoaderTests.cs @@ -333,6 +333,13 @@ public void CanSuccessfullyTrimSpaces() } } + [Fact] + public void ThrowsExceptionWithPropertyName() + { + Exception ex = Assert.Throws(() => new Legacy.Data.TextLoader("fakefile.txt").CreateFrom()); + Assert.StartsWith($"Field or property String1 is missing {nameof(LoadColumnAttribute)}", ex.Message); + } + [Fact] public void CanSuccessfullyColumnNameProperty() {