diff --git a/src/Microsoft.ML.Data/Data/Conversion.cs b/src/Microsoft.ML.Data/Data/Conversion.cs index cb2791e585..23344d9e32 100644 --- a/src/Microsoft.ML.Data/Data/Conversion.cs +++ b/src/Microsoft.ML.Data/Data/Conversion.cs @@ -565,7 +565,7 @@ public ValueMapper GetKeyStringConversion(KeyType key) } } - public TryParseMapper GetParseConversion(ColumnType typeDst) + public TryParseMapper GetTryParseConversion(ColumnType typeDst) { Contracts.CheckValue(typeDst, nameof(typeDst)); Contracts.CheckParam(typeDst.IsStandardScalar || typeDst.IsKey, nameof(typeDst), @@ -1196,7 +1196,7 @@ public bool TryParseKey(in TX src, U8 min, U8 max, out U8 dst) if (min > uu || uu > max) { dst = 0; - return true; + return false; } dst = uu - min + 1; @@ -1428,8 +1428,7 @@ public bool TryParse(in TX src, out TS dst) if (TimeSpan.TryParse(src.ToString(), CultureInfo.InvariantCulture, out dst)) return true; dst = default; - var span = src.Span; - return IsStdMissing(ref span); + return false; } public bool TryParse(in TX src, out DT dst) @@ -1443,8 +1442,7 @@ public bool TryParse(in TX src, out DT dst) if (DateTime.TryParse(src.ToString(), CultureInfo.InvariantCulture, DateTimeStyles.AssumeUniversal | DateTimeStyles.AdjustToUniversal, out dst)) return true; dst = default; - var span = src.Span; - return IsStdMissing(ref span); + return false; } public bool TryParse(in TX src, out DZ dst) @@ -1459,8 +1457,7 @@ public bool TryParse(in TX src, out DZ dst) return true; dst = default; - var span = src.Span; - return IsStdMissing(ref span); + return false; } // These throw an exception for unparsable and overflow values. @@ -1547,14 +1544,6 @@ public bool TryParse(in TX src, out BL dst) { var span = src.Span; - if (!span.IsEmpty && IsStdMissing(ref span)) - { - dst = false; - return false; - } - - Contracts.Assert(!IsStdMissing(ref span)); - char ch; switch (src.Length) { diff --git a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderParser.cs b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderParser.cs index 3226d18446..7883b828d0 100644 --- a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderParser.cs +++ b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderParser.cs @@ -74,8 +74,8 @@ private Func GetCreatorOneCore(PrimitiveType type) { Contracts.Assert(type.IsStandardScalar || type.IsKey); Contracts.Assert(typeof(T) == type.RawType); - var fn = _conv.GetParseConversion(type); - return rows => new PrimitivePipe(rows, fn); + var fn = _conv.GetTryParseConversion(type); + return rows => new PrimitivePipe(rows, type, fn); } private Func GetCreatorVecCore(PrimitiveType type) @@ -88,8 +88,8 @@ private Func GetCreatorVecCore(PrimitiveType type) { Contracts.Assert(type.IsStandardScalar || type.IsKey); Contracts.Assert(typeof(T) == type.RawType); - var fn = _conv.GetParseConversion(type); - return rows => new VectorPipe(rows, fn); + var fn = _conv.GetTryParseConversion(type); + return rows => new VectorPipe(rows, type, fn); } public Func GetCreatorOne(KeyType key) @@ -218,6 +218,8 @@ private abstract class ColumnPipe { public readonly RowSet Rows; + public abstract bool HasNA { get; } + protected ColumnPipe(RowSet rows) { Contracts.AssertValue(rows); @@ -239,12 +241,16 @@ private sealed class PrimitivePipe : ColumnPipe // Has length Rows.Count, so indexed by irow. private TResult[] _values; - public PrimitivePipe(RowSet rows, TryParseMapper conv) + public override bool HasNA { get; } + + public PrimitivePipe(RowSet rows, PrimitiveType type, TryParseMapper conv) : base(rows) { Contracts.AssertValue(conv); + Contracts.Assert(typeof(TResult) == type.RawType); _conv = conv; _values = new TResult[Rows.Count]; + HasNA = Conversions.Instance.TryGetIsNAPredicate(type, out var del); } public override void Reset(int irow, int size) @@ -279,6 +285,8 @@ private sealed class VectorPipe : ColumnPipe { private readonly TryParseMapper _conv; + public override bool HasNA { get; } + private class VectorValue { private readonly VectorPipe _pipe; @@ -421,14 +429,16 @@ public void Get(ref VBuffer dst) // Has length Rows.Count, so indexed by irow. private VectorValue[] _values; - public VectorPipe(RowSet rows, TryParseMapper conv) + public VectorPipe(RowSet rows, PrimitiveType type, TryParseMapper conv) : base(rows) { Contracts.AssertValue(conv); + Contracts.Assert(typeof(TItem) == type.RawType); _conv = conv; _values = new VectorValue[Rows.Count]; for (int i = 0; i < _values.Length; i++) _values[i] = new VectorValue(this); + HasNA = Conversions.Instance.TryGetIsNAPredicate(type, out var del); } public override void Reset(int irow, int size) @@ -1330,7 +1340,11 @@ private void ProcessVec(int srcLim, FieldSet fields, ColInfo info, ColumnPipe v, var srcCur = fields.Indices[isrc]; Contracts.Assert(min <= srcCur & srcCur < lim); if (!v.Consume(irow, indexBase + srcCur, ref fields.Spans[isrc])) - throw Contracts.Except($"Could not parse value {fields.Spans[isrc]} in slot {indexBase + srcCur} of column {info.Name} in line {line}"); + { + if (!v.HasNA) + throw Contracts.Except($"Could not parse value {fields.Spans[isrc]} in slot {indexBase + srcCur} of column {info.Name} in line {line}"); + v.Rows.Stats.LogBadValue(line, info.Name, indexBase + srcCur); + } } } ivDst += sizeSeg; @@ -1349,7 +1363,11 @@ private void ProcessOne(FieldSet vs, ColInfo info, ColumnPipe v, int irow, long if (isrc < vs.Count && vs.Indices[isrc] == src) { if (!v.Consume(irow, 0, ref vs.Spans[isrc])) - throw Contracts.Except($"Could not parse value {vs.Spans[isrc]} in line {line}, column {info.Name}"); + { + if (!v.HasNA) + throw Contracts.Except($"Could not parse value {vs.Spans[isrc]} in line {line}, column {info.Name}"); + v.Rows.Stats.LogBadValue(line, info.Name); + } } else v.Reset(irow, 0); diff --git a/src/Microsoft.ML.Data/Transforms/TermTransformImpl.cs b/src/Microsoft.ML.Data/Transforms/TermTransformImpl.cs index 7cc2f18c6a..3b774d1f85 100644 --- a/src/Microsoft.ML.Data/Transforms/TermTransformImpl.cs +++ b/src/Microsoft.ML.Data/Transforms/TermTransformImpl.cs @@ -205,7 +205,7 @@ protected Builder(PrimitiveType type) public override void ParseAddTermArg(ref ReadOnlyMemory terms, IChannel ch) { T val; - var tryParse = Runtime.Data.Conversion.Conversions.Instance.GetParseConversion(ItemType); + var tryParse = Runtime.Data.Conversion.Conversions.Instance.GetTryParseConversion(ItemType); for (bool more = true; more;) { ReadOnlyMemory term; @@ -231,7 +231,7 @@ public override void ParseAddTermArg(ref ReadOnlyMemory terms, IChannel ch public override void ParseAddTermArg(string[] terms, IChannel ch) { T val; - var tryParse = Runtime.Data.Conversion.Conversions.Instance.GetParseConversion(ItemType); + var tryParse = Runtime.Data.Conversion.Conversions.Instance.GetTryParseConversion(ItemType); foreach (var sterm in terms) { ReadOnlyMemory term = sterm.AsMemory(); diff --git a/test/Microsoft.ML.TestFramework/DataPipe/TestDataPipe.cs b/test/Microsoft.ML.TestFramework/DataPipe/TestDataPipe.cs index 9edb7bd312..2aff8417e6 100644 --- a/test/Microsoft.ML.TestFramework/DataPipe/TestDataPipe.cs +++ b/test/Microsoft.ML.TestFramework/DataPipe/TestDataPipe.cs @@ -32,7 +32,7 @@ public sealed partial class TestDataPipe : TestDataPipeBase private static VBuffer dataDoubleSparse = new VBuffer(5, 3, new double[] { -0.0, 0, 1 }, new[] { 0, 3, 4 }); private static uint[] resultsDoubleSparse = new uint[] { 21, 21, 21, 21, 31 }; - [Fact(Skip = "Schema baseline comparison fails")] + [Fact()] public void SavePipeLabelParsers() { string pathData = GetDataPath(@"lm.sample.txt"); @@ -44,7 +44,7 @@ public void SavePipeLabelParsers() "xf=AutoLabel{col=AutoLabel:RawLabel}", "xf=Term{col=StringLabel:RawLabel terms={Wirtschaft,Gesundheit,Deutschland,Ausland,Unterhaltung,Sport,Technik & Wissen}}", string.Format("xf=TermLookup{{col=FileLabel:RawLabel data={{{0}}}}}", mappingPathData), - "xf=SelectColumns{keepcol=RawLabel keepcol=AutoLabel keepcol=StringLabel keepcol=FileLabel}" + "xf=SelectColumns{keepcol=RawLabel keepcol=AutoLabel keepcol=StringLabel keepcol=FileLabel hidden=-}" }); mappingPathData = DeleteOutputPath("SavePipe", "Mapping.txt"); @@ -64,7 +64,7 @@ public void SavePipeLabelParsers() new[] { "loader=Text{col=RawLabel:TXT:0 col=Names:TXT:1-2 col=Features:TXT:3-4 header+}", string.Format("xf=TermLookup{{col=FileLabel:RawLabel data={{{0}}}}}", mappingPathData), - "xf=SelectColumns{keepcol=RawLabel keepcol=FileLabel}" + "xf=SelectColumns{keepcol=RawLabel keepcol=FileLabel hidden=-}" }, suffix: "1"); mappingPathData = DeleteOutputPath("SavePipe", "Mapping.txt"); @@ -84,7 +84,7 @@ public void SavePipeLabelParsers() new[] { "loader=Text{col=RawLabel:TXT:0 col=Names:TXT:1-2 col=Features:TXT:3-4 header+}", string.Format("xf=TermLookup{{col=FileLabel:RawLabel data={{{0}}}}}", mappingPathData), - "xf=SelectColumns{keepcol=RawLabel keepcol=FileLabel}" + "xf=SelectColumns{keepcol=RawLabel keepcol=FileLabel hidden=-}" }, suffix: "2"); mappingPathData = DeleteOutputPath("SavePipe", "Mapping.txt"); @@ -104,7 +104,7 @@ public void SavePipeLabelParsers() new[] { "loader=Text{col=RawLabel:TXT:0 col=Names:TXT:1-2 col=Features:TXT:3-4 header+}", string.Format("xf=TermLookup{{key=- col=FileLabel:RawLabel data={{{0}}}}}", mappingPathData), - "xf=SelectColumns{keepcol=RawLabel keepcol=FileLabel}" + "xf=SelectColumns{keepcol=RawLabel keepcol=FileLabel hidden=-}" }, suffix: "3"); mappingPathData = DeleteOutputPath("SavePipe", "Mapping.txt"); @@ -127,15 +127,14 @@ public void SavePipeLabelParsers() { TestCore(pathData, true, new[] { - "loader=Text{col=RawLabel:TXT:0 col=Names:TXT:1-2 col=Features:TXT:3-4 header+}", - string.Format("xf=TermLookup{{key=- col=FileLabelNum:RawLabel data={{{0}}}}}", mappingPathData), - string.Format("xf=TermLookup{{col=FileLabelKey:RawLabel data={{{0}}}}}", mappingPathData), - "xf=SelectColumns{keepcol=RawLabel keepcol=FileLabelNum keepcol=FileLabelKey}" + "loader=Text{col=RawLabel:TXT:0 col=Names:TXT:1-2 col=Features:TXT:3-4 header+}", + string.Format("xf=TermLookup{{key=- col=FileLabelNum:RawLabel data={{{0}}}}}", mappingPathData), + string.Format("xf=TermLookup{{col=FileLabelKey:RawLabel data={{{0}}}}}", mappingPathData), + "xf=SelectColumns{keepcol=RawLabel keepcol=FileLabelNum keepcol=FileLabelKey hidden=-}" }, suffix: "4"); writer.WriteLine(ProgressLogLine); Env.PrintProgress(); } - CheckEqualityNormalized("SavePipe", name); mappingPathData = DeleteOutputPath("SavePipe", "Mapping.txt"); @@ -155,7 +154,7 @@ public void SavePipeLabelParsers() new[] { "loader=Text{col=RawLabel:TXT:0 col=Names:TXT:1-2 col=Features:TXT:3-4 header+}", string.Format("xf=TermLookup{{col=FileLabel:RawLabel data={{{0}}}}}", mappingPathData), - "xf=SelectColumns{keepcol=RawLabel keepcol=FileLabel}" + "xf=SelectColumns{keepcol=RawLabel keepcol=FileLabel hidden=-}" }, suffix: "5"); Done(); @@ -421,7 +420,7 @@ public void SavePipeCat() Done(); } - [Fact(Skip = "Schema baseline comparison fails")] + [Fact()] public void SavePipeHash() { string pathData = DeleteOutputPath("SavePipe", "HashTransform.txt");