From a8655862003f29d36799d9b456a855ba7f65c1b0 Mon Sep 17 00:00:00 2001 From: Yael Dekel Date: Tue, 30 Oct 2018 15:29:46 -0700 Subject: [PATCH 1/4] Fix TermLookup bug and enable two unit tests --- .../TermLookupTransform.cs | 2 +- .../Common/SavePipe/SavePipeHash-CursLog.txt | 7 - .../SavePipe/SavePipeLabelParsers4-Data.txt | 127 ------------------ .../SavePipe/SavePipeLabelParsers4-Schema.txt | 35 ----- .../SavePipe/SavePipeLabelParsers4-out.txt | 83 ------------ .../DataPipe/TestDataPipe.cs | 42 +++--- 6 files changed, 25 insertions(+), 271 deletions(-) delete mode 100644 test/BaselineOutput/Common/SavePipe/SavePipeLabelParsers4-Data.txt delete mode 100644 test/BaselineOutput/Common/SavePipe/SavePipeLabelParsers4-Schema.txt delete mode 100644 test/BaselineOutput/Common/SavePipe/SavePipeLabelParsers4-out.txt diff --git a/src/Microsoft.ML.Transforms/TermLookupTransform.cs b/src/Microsoft.ML.Transforms/TermLookupTransform.cs index 48c3b9a9d9..e153494bfa 100644 --- a/src/Microsoft.ML.Transforms/TermLookupTransform.cs +++ b/src/Microsoft.ML.Transforms/TermLookupTransform.cs @@ -389,7 +389,7 @@ private static IComponentFactory GetLoaderFacto // computing max and min. if (Conversions.Instance.TryParseKey(in txt, 1, ulong.MaxValue, out res)) { - if (res < min && res != 0) + if (res < min) min = res; if (res > max) max = res; diff --git a/test/BaselineOutput/Common/SavePipe/SavePipeHash-CursLog.txt b/test/BaselineOutput/Common/SavePipe/SavePipeHash-CursLog.txt index 0168b9883e..c3374d87ab 100644 --- a/test/BaselineOutput/Common/SavePipe/SavePipeHash-CursLog.txt +++ b/test/BaselineOutput/Common/SavePipe/SavePipeHash-CursLog.txt @@ -1,8 +1 @@ - Bad value at line 1 in column CatU1 at slot 2 - Bad value at line 1 in column CatU2 at slot 0 - Bad value at line 2 in column CatU1 at slot 1 - Bad value at line 3 in column CatU2 at slot 0 - Bad value at line 3 in column CatU2 at slot 1 - Bad value at line 4 in column CatU2 at slot 2 -Processed 4 rows with 6 bad values and 0 format errors Cursored through 4 rows diff --git a/test/BaselineOutput/Common/SavePipe/SavePipeLabelParsers4-Data.txt b/test/BaselineOutput/Common/SavePipe/SavePipeLabelParsers4-Data.txt deleted file mode 100644 index fcc2345716..0000000000 --- a/test/BaselineOutput/Common/SavePipe/SavePipeLabelParsers4-Data.txt +++ /dev/null @@ -1,127 +0,0 @@ -#@ TextLoader{ -#@ header+ -#@ sep=tab -#@ col=RawLabel:TX:0 -#@ col=FileLabelNum:R4:1 -#@ col=FileLabelKey:U4[0-*]:2 -#@ } -RawLabel FileLabelNum FileLabelKey -Wirtschaft 3.14 -Wirtschaft 3.14 -Wirtschaft 3.14 -Wirtschaft 3.14 -Wirtschaft 3.14 -Wirtschaft 3.14 -Wirtschaft 3.14 -Gesundheit 0.1 -Gesundheit 0.1 -Gesundheit 0.1 -Gesundheit 0.1 -Gesundheit 0.1 -Gesundheit 0.1 -Gesundheit 0.1 -Gesundheit 0.1 -Deutschland 1.5 -Deutschland 1.5 -Ausland 0.5 -Ausland 0.5 -Unterhaltung ? -Unterhaltung ? -Unterhaltung ? -Unterhaltung ? -Unterhaltung ? -Sport 2.71 -Sport 2.71 -Sport 2.71 -Sport 2.71 -Sport 2.71 -Sport 2.71 -Sport 2.71 -Sport 2.71 -Sport 2.71 -Sport 2.71 -Technik & Wissen 0.01 -Technik & Wissen 0.01 -Technik & Wissen 0.01 -Technik & Wissen 0.01 -Technik & Wissen 0.01 -Technik & Wissen 0.01 -Technik & Wissen 0.01 -Technik & Wissen 0.01 -Technik & Wissen 0.01 -Technik & Wissen 0.01 -Technik & Wissen 0.01 -Technik & Wissen 0.01 -Technik & Wissen 0.01 -Technik & Wissen 0.01 -Technik & Wissen 0.01 -Technik & Wissen 0.01 -Technik & Wissen 0.01 -Technik & Wissen 0.01 -Technik & Wissen 0.01 -Technik & Wissen 0.01 -Technik & Wissen 0.01 -Technik & Wissen 0.01 -Technik & Wissen 0.01 -Gesundheit 0.1 -Gesundheit 0.1 -Ausland 0.5 -Ausland 0.5 -Ausland 0.5 -Ausland 0.5 -Ausland 0.5 -Ausland 0.5 -Ausland 0.5 -Ausland 0.5 -Unterhaltung ? -Unterhaltung ? -Unterhaltung ? -Sport 2.71 -Sport 2.71 -Technik & Wissen 0.01 -Technik & Wissen 0.01 -Technik & Wissen 0.01 -Technik & Wissen 0.01 -Technik & Wissen 0.01 -Wirtschaft 3.14 -Gesundheit 0.1 -Wirtschaft 3.14 -Deutschland 1.5 -Wirtschaft 3.14 -Wirtschaft 3.14 -Wirtschaft 3.14 -Wirtschaft 3.14 -Wirtschaft 3.14 -Unterhaltung ? -Deutschland 1.5 -Gesundheit 0.1 -Gesundheit 0.1 -Deutschland 1.5 -Wirtschaft 3.14 -Unterhaltung ? -Wirtschaft 3.14 -Wirtschaft 3.14 -Gesundheit 0.1 -Deutschland 1.5 -Gesundheit 0.1 -Wirtschaft 3.14 -Wirtschaft 3.14 -Deutschland 1.5 -Wirtschaft 3.14 -Wirtschaft 3.14 -Gesundheit 0.1 -Unterhaltung ? -Wirtschaft 3.14 -Deutschland 1.5 -Wirtschaft 3.14 -Unterhaltung ? -Gesundheit 0.1 -Deutschland 1.5 -Wirtschaft 3.14 -Unterhaltung ? -Deutschland 1.5 -Wirtschaft 3.14 -Deutschland 1.5 -Wirtschaft 3.14 -Wirtschaft 3.14 -Deutschland 1.5 diff --git a/test/BaselineOutput/Common/SavePipe/SavePipeLabelParsers4-Schema.txt b/test/BaselineOutput/Common/SavePipe/SavePipeLabelParsers4-Schema.txt deleted file mode 100644 index e54b1a5652..0000000000 --- a/test/BaselineOutput/Common/SavePipe/SavePipeLabelParsers4-Schema.txt +++ /dev/null @@ -1,35 +0,0 @@ ----- BoundLoader ---- -3 columns: - RawLabel: Text - Names: Vec - Metadata 'SlotNames': Vec: Length=2, Count=2 - [0] 'de-DE', [1] 'url-pathpart-wirtschaft url-pathpart-soziales url-domainname-www.spiegel.de url-domainprefix-www url-domainprefix-www.spiegel url-domaintype-de url-domainsuffix-spiegel.de url-firstpartpagename-0,1518,793077,00 url-pagepartsplitname-0,1518,793077,00 url-lastpartpagename-html#ref=rss' - Features: Vec - Metadata 'SlotNames': Vec: Length=2, Count=2 - [0] 'weg fuer milliardenhilfe frei', [1] 'vor dem parlamentsgebaeude toben strassenkaempfe zwischen demonstranten drinnen haben die griechischen abgeordneten das drastische sparpaket am abend endgueltig beschlossen die entscheidung ist eine wichtige voraussetzung fuer die auszahlung von weiteren acht milliarden euro hilfsgeldern athen das griechische parlament hat einem umfassenden sparpaket endgueltig zugestimmt' ----- TermLookupTransform ---- -4 columns: - RawLabel: Text - Names: Vec - Metadata 'SlotNames': Vec: Length=2, Count=2 - [0] 'de-DE', [1] 'url-pathpart-wirtschaft url-pathpart-soziales url-domainname-www.spiegel.de url-domainprefix-www url-domainprefix-www.spiegel url-domaintype-de url-domainsuffix-spiegel.de url-firstpartpagename-0,1518,793077,00 url-pagepartsplitname-0,1518,793077,00 url-lastpartpagename-html#ref=rss' - Features: Vec - Metadata 'SlotNames': Vec: Length=2, Count=2 - [0] 'weg fuer milliardenhilfe frei', [1] 'vor dem parlamentsgebaeude toben strassenkaempfe zwischen demonstranten drinnen haben die griechischen abgeordneten das drastische sparpaket am abend endgueltig beschlossen die entscheidung ist eine wichtige voraussetzung fuer die auszahlung von weiteren acht milliarden euro hilfsgeldern athen das griechische parlament hat einem umfassenden sparpaket endgueltig zugestimmt' - FileLabelNum: R4 ----- TermLookupTransform ---- -5 columns: - RawLabel: Text - Names: Vec - Metadata 'SlotNames': Vec: Length=2, Count=2 - [0] 'de-DE', [1] 'url-pathpart-wirtschaft url-pathpart-soziales url-domainname-www.spiegel.de url-domainprefix-www url-domainprefix-www.spiegel url-domaintype-de url-domainsuffix-spiegel.de url-firstpartpagename-0,1518,793077,00 url-pagepartsplitname-0,1518,793077,00 url-lastpartpagename-html#ref=rss' - Features: Vec - Metadata 'SlotNames': Vec: Length=2, Count=2 - [0] 'weg fuer milliardenhilfe frei', [1] 'vor dem parlamentsgebaeude toben strassenkaempfe zwischen demonstranten drinnen haben die griechischen abgeordneten das drastische sparpaket am abend endgueltig beschlossen die entscheidung ist eine wichtige voraussetzung fuer die auszahlung von weiteren acht milliarden euro hilfsgeldern athen das griechische parlament hat einem umfassenden sparpaket endgueltig zugestimmt' - FileLabelNum: R4 - FileLabelKey: Key ----- SelectColumnsDataTransform ---- -3 columns: - RawLabel: Text - FileLabelNum: R4 - FileLabelKey: Key diff --git a/test/BaselineOutput/Common/SavePipe/SavePipeLabelParsers4-out.txt b/test/BaselineOutput/Common/SavePipe/SavePipeLabelParsers4-out.txt deleted file mode 100644 index bf71ea5899..0000000000 --- a/test/BaselineOutput/Common/SavePipe/SavePipeLabelParsers4-out.txt +++ /dev/null @@ -1,83 +0,0 @@ - Bad value at line 5 in column Value -Processed 7 rows with 1 bad values and 0 format errors - Bad value at line 5 in column Value -Processed 7 rows with 1 bad values and 0 format errors -Wrote 7 rows across 2 columns in %Time% -Warning: Term 'Wirtschaft' in mapping file is mapped to non key value '3.14' -Warning: Term 'Gesundheit' in mapping file is mapped to non key value '0.1' -Warning: Term 'Deutschland' in mapping file is mapped to non key value '1.5' -Warning: Term 'Ausland' in mapping file is mapped to non key value '0.5' -Warning: Term 'Unterhaltung' in mapping file is mapped to non key value '1a' -Warning: Found 7 non key values in the file '%Output% -Warning: did not find any valid key values in the file '%Output% - Bad value at line 1 in column Value - Bad value at line 2 in column Value - Bad value at line 3 in column Value - Bad value at line 4 in column Value - Bad value at line 5 in column Value - Bad value at line 6 in column Value - Bad value at line 7 in column Value -Processed 7 rows with 7 bad values and 0 format errors - Bad value at line 1 in column Value - Bad value at line 2 in column Value - Bad value at line 3 in column Value - Bad value at line 4 in column Value - Bad value at line 5 in column Value - Bad value at line 6 in column Value - Bad value at line 7 in column Value -Processed 7 rows with 7 bad values and 0 format errors -Wrote 7 rows across 2 columns in %Time% -Wrote 119 rows of length 3 -Wrote 119 rows across 3 columns in %Time% ---- Progress log --- -[1] 'Building term dictionary' started. -[1] (%Time%) 119 examples Total Terms: 7 -[1] 'Building term dictionary' finished in %Time%. -[2] 'BinarySaver' started. -[2] (%Time%) 7 rows -[2] 'BinarySaver' finished in %Time%. -[3] 'TextSaver: saving data' started. -[3] (%Time%) 119 rows -[3] 'TextSaver: saving data' finished in %Time%. -[4] 'BinarySaver #2' started. -[4] (%Time%) 119 rows -[4] 'BinarySaver #2' finished in %Time%. -[5] 'BinarySaver #3' started. -[5] (%Time%) 7 rows -[5] 'BinarySaver #3' finished in %Time%. -[6] 'TextSaver: saving data #2' started. -[6] (%Time%) 119 rows -[6] 'TextSaver: saving data #2' finished in %Time%. -[7] 'BinarySaver #4' started. -[7] (%Time%) 119 rows -[7] 'BinarySaver #4' finished in %Time%. -[8] 'BinarySaver #5' started. -[8] (%Time%) 7 rows -[8] 'BinarySaver #5' finished in %Time%. -[9] 'TextSaver: saving data #3' started. -[9] (%Time%) 119 rows -[9] 'TextSaver: saving data #3' finished in %Time%. -[10] 'BinarySaver #6' started. -[10] (%Time%) 119 rows -[10] 'BinarySaver #6' finished in %Time%. -[11] 'BinarySaver #7' started. -[11] (%Time%) 7 rows -[11] 'BinarySaver #7' finished in %Time%. -[12] 'TextSaver: saving data #4' started. -[12] (%Time%) 119 rows -[12] 'TextSaver: saving data #4' finished in %Time%. -[13] 'BinarySaver #8' started. -[13] (%Time%) 119 rows -[13] 'BinarySaver #8' finished in %Time%. -[14] 'BinarySaver #9' started. -[14] (%Time%) 7 rows -[14] 'BinarySaver #9' finished in %Time%. -[15] 'BinarySaver #10' started. -[15] (%Time%) 7 rows -[15] 'BinarySaver #10' finished in %Time%. -[16] 'TextSaver: saving data #5' started. -[16] (%Time%) 119 rows -[16] 'TextSaver: saving data #5' finished in %Time%. -[17] 'BinarySaver #11' started. -[17] (%Time%) 119 rows -[17] 'BinarySaver #11' finished in %Time%. diff --git a/test/Microsoft.ML.TestFramework/DataPipe/TestDataPipe.cs b/test/Microsoft.ML.TestFramework/DataPipe/TestDataPipe.cs index 9edb7bd312..a8f1e76e67 100644 --- a/test/Microsoft.ML.TestFramework/DataPipe/TestDataPipe.cs +++ b/test/Microsoft.ML.TestFramework/DataPipe/TestDataPipe.cs @@ -32,7 +32,7 @@ public sealed partial class TestDataPipe : TestDataPipeBase private static VBuffer dataDoubleSparse = new VBuffer(5, 3, new double[] { -0.0, 0, 1 }, new[] { 0, 3, 4 }); private static uint[] resultsDoubleSparse = new uint[] { 21, 21, 21, 21, 31 }; - [Fact(Skip = "Schema baseline comparison fails")] + [Fact()] public void SavePipeLabelParsers() { string pathData = GetDataPath(@"lm.sample.txt"); @@ -44,7 +44,7 @@ public void SavePipeLabelParsers() "xf=AutoLabel{col=AutoLabel:RawLabel}", "xf=Term{col=StringLabel:RawLabel terms={Wirtschaft,Gesundheit,Deutschland,Ausland,Unterhaltung,Sport,Technik & Wissen}}", string.Format("xf=TermLookup{{col=FileLabel:RawLabel data={{{0}}}}}", mappingPathData), - "xf=SelectColumns{keepcol=RawLabel keepcol=AutoLabel keepcol=StringLabel keepcol=FileLabel}" + "xf=SelectColumns{keepcol=RawLabel keepcol=AutoLabel keepcol=StringLabel keepcol=FileLabel hidden=-}" }); mappingPathData = DeleteOutputPath("SavePipe", "Mapping.txt"); @@ -64,7 +64,7 @@ public void SavePipeLabelParsers() new[] { "loader=Text{col=RawLabel:TXT:0 col=Names:TXT:1-2 col=Features:TXT:3-4 header+}", string.Format("xf=TermLookup{{col=FileLabel:RawLabel data={{{0}}}}}", mappingPathData), - "xf=SelectColumns{keepcol=RawLabel keepcol=FileLabel}" + "xf=SelectColumns{keepcol=RawLabel keepcol=FileLabel hidden=-}" }, suffix: "1"); mappingPathData = DeleteOutputPath("SavePipe", "Mapping.txt"); @@ -84,7 +84,7 @@ public void SavePipeLabelParsers() new[] { "loader=Text{col=RawLabel:TXT:0 col=Names:TXT:1-2 col=Features:TXT:3-4 header+}", string.Format("xf=TermLookup{{col=FileLabel:RawLabel data={{{0}}}}}", mappingPathData), - "xf=SelectColumns{keepcol=RawLabel keepcol=FileLabel}" + "xf=SelectColumns{keepcol=RawLabel keepcol=FileLabel hidden=-}" }, suffix: "2"); mappingPathData = DeleteOutputPath("SavePipe", "Mapping.txt"); @@ -104,7 +104,7 @@ public void SavePipeLabelParsers() new[] { "loader=Text{col=RawLabel:TXT:0 col=Names:TXT:1-2 col=Features:TXT:3-4 header+}", string.Format("xf=TermLookup{{key=- col=FileLabel:RawLabel data={{{0}}}}}", mappingPathData), - "xf=SelectColumns{keepcol=RawLabel keepcol=FileLabel}" + "xf=SelectColumns{keepcol=RawLabel keepcol=FileLabel hidden=-}" }, suffix: "3"); mappingPathData = DeleteOutputPath("SavePipe", "Mapping.txt"); @@ -125,19 +125,25 @@ public void SavePipeLabelParsers() using (var writer = OpenWriter(pathOut)) using (Env.RedirectChannelOutput(writer, writer)) { - TestCore(pathData, true, - new[] { - "loader=Text{col=RawLabel:TXT:0 col=Names:TXT:1-2 col=Features:TXT:3-4 header+}", - string.Format("xf=TermLookup{{key=- col=FileLabelNum:RawLabel data={{{0}}}}}", mappingPathData), - string.Format("xf=TermLookup{{col=FileLabelKey:RawLabel data={{{0}}}}}", mappingPathData), - "xf=SelectColumns{keepcol=RawLabel keepcol=FileLabelNum keepcol=FileLabelKey}" - }, suffix: "4"); - writer.WriteLine(ProgressLogLine); - Env.PrintProgress(); + try + { + TestCore(pathData, true, + new[] { + "loader=Text{col=RawLabel:TXT:0 col=Names:TXT:1-2 col=Features:TXT:3-4 header+}", + string.Format("xf=TermLookup{{key=- col=FileLabelNum:RawLabel data={{{0}}}}}", mappingPathData), + string.Format("xf=TermLookup{{col=FileLabelKey:RawLabel data={{{0}}}}}", mappingPathData), + "xf=SelectColumns{keepcol=RawLabel keepcol=FileLabelNum keepcol=FileLabelKey hidden=-}" + }, suffix: "4"); + Assert.True(false); + } + catch (Exception e) + { + while (e.InnerException != null) + e = e.InnerException; + Assert.Equal("Could not parse value 1a in line 5, column Value", e.Message); + } } - CheckEqualityNormalized("SavePipe", name); - mappingPathData = DeleteOutputPath("SavePipe", "Mapping.txt"); File.WriteAllLines(mappingPathData, new[] { @@ -155,7 +161,7 @@ public void SavePipeLabelParsers() new[] { "loader=Text{col=RawLabel:TXT:0 col=Names:TXT:1-2 col=Features:TXT:3-4 header+}", string.Format("xf=TermLookup{{col=FileLabel:RawLabel data={{{0}}}}}", mappingPathData), - "xf=SelectColumns{keepcol=RawLabel keepcol=FileLabel}" + "xf=SelectColumns{keepcol=RawLabel keepcol=FileLabel hidden=-}" }, suffix: "5"); Done(); @@ -421,7 +427,7 @@ public void SavePipeCat() Done(); } - [Fact(Skip = "Schema baseline comparison fails")] + [Fact()] public void SavePipeHash() { string pathData = DeleteOutputPath("SavePipe", "HashTransform.txt"); From c3f7d3a584d0d5f445d25af2eda559162b52bccc Mon Sep 17 00:00:00 2001 From: Yael Dekel Date: Wed, 31 Oct 2018 14:09:22 -0700 Subject: [PATCH 2/4] Change TryParseKey logic to return false when parsed value is not in range --- src/Microsoft.ML.Data/Data/Conversion.cs | 4 +- .../DataLoadSave/Text/TextLoaderParser.cs | 34 +++-- .../Transforms/TermTransformImpl.cs | 4 +- .../TermLookupTransform.cs | 2 +- .../Common/SavePipe/SavePipeHash-CursLog.txt | 7 + .../SavePipe/SavePipeLabelParsers4-Data.txt | 127 ++++++++++++++++++ .../SavePipe/SavePipeLabelParsers4-Schema.txt | 35 +++++ .../SavePipe/SavePipeLabelParsers4-out.txt | 83 ++++++++++++ .../DataPipe/TestDataPipe.cs | 19 +-- 9 files changed, 289 insertions(+), 26 deletions(-) create mode 100644 test/BaselineOutput/Common/SavePipe/SavePipeLabelParsers4-Data.txt create mode 100644 test/BaselineOutput/Common/SavePipe/SavePipeLabelParsers4-Schema.txt create mode 100644 test/BaselineOutput/Common/SavePipe/SavePipeLabelParsers4-out.txt diff --git a/src/Microsoft.ML.Data/Data/Conversion.cs b/src/Microsoft.ML.Data/Data/Conversion.cs index cb2791e585..377014aff2 100644 --- a/src/Microsoft.ML.Data/Data/Conversion.cs +++ b/src/Microsoft.ML.Data/Data/Conversion.cs @@ -565,7 +565,7 @@ public ValueMapper GetKeyStringConversion(KeyType key) } } - public TryParseMapper GetParseConversion(ColumnType typeDst) + public TryParseMapper GetTryParseConversion(ColumnType typeDst) { Contracts.CheckValue(typeDst, nameof(typeDst)); Contracts.CheckParam(typeDst.IsStandardScalar || typeDst.IsKey, nameof(typeDst), @@ -1196,7 +1196,7 @@ public bool TryParseKey(in TX src, U8 min, U8 max, out U8 dst) if (min > uu || uu > max) { dst = 0; - return true; + return false; } dst = uu - min + 1; diff --git a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderParser.cs b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderParser.cs index 3226d18446..7883b828d0 100644 --- a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderParser.cs +++ b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderParser.cs @@ -74,8 +74,8 @@ private Func GetCreatorOneCore(PrimitiveType type) { Contracts.Assert(type.IsStandardScalar || type.IsKey); Contracts.Assert(typeof(T) == type.RawType); - var fn = _conv.GetParseConversion(type); - return rows => new PrimitivePipe(rows, fn); + var fn = _conv.GetTryParseConversion(type); + return rows => new PrimitivePipe(rows, type, fn); } private Func GetCreatorVecCore(PrimitiveType type) @@ -88,8 +88,8 @@ private Func GetCreatorVecCore(PrimitiveType type) { Contracts.Assert(type.IsStandardScalar || type.IsKey); Contracts.Assert(typeof(T) == type.RawType); - var fn = _conv.GetParseConversion(type); - return rows => new VectorPipe(rows, fn); + var fn = _conv.GetTryParseConversion(type); + return rows => new VectorPipe(rows, type, fn); } public Func GetCreatorOne(KeyType key) @@ -218,6 +218,8 @@ private abstract class ColumnPipe { public readonly RowSet Rows; + public abstract bool HasNA { get; } + protected ColumnPipe(RowSet rows) { Contracts.AssertValue(rows); @@ -239,12 +241,16 @@ private sealed class PrimitivePipe : ColumnPipe // Has length Rows.Count, so indexed by irow. private TResult[] _values; - public PrimitivePipe(RowSet rows, TryParseMapper conv) + public override bool HasNA { get; } + + public PrimitivePipe(RowSet rows, PrimitiveType type, TryParseMapper conv) : base(rows) { Contracts.AssertValue(conv); + Contracts.Assert(typeof(TResult) == type.RawType); _conv = conv; _values = new TResult[Rows.Count]; + HasNA = Conversions.Instance.TryGetIsNAPredicate(type, out var del); } public override void Reset(int irow, int size) @@ -279,6 +285,8 @@ private sealed class VectorPipe : ColumnPipe { private readonly TryParseMapper _conv; + public override bool HasNA { get; } + private class VectorValue { private readonly VectorPipe _pipe; @@ -421,14 +429,16 @@ public void Get(ref VBuffer dst) // Has length Rows.Count, so indexed by irow. private VectorValue[] _values; - public VectorPipe(RowSet rows, TryParseMapper conv) + public VectorPipe(RowSet rows, PrimitiveType type, TryParseMapper conv) : base(rows) { Contracts.AssertValue(conv); + Contracts.Assert(typeof(TItem) == type.RawType); _conv = conv; _values = new VectorValue[Rows.Count]; for (int i = 0; i < _values.Length; i++) _values[i] = new VectorValue(this); + HasNA = Conversions.Instance.TryGetIsNAPredicate(type, out var del); } public override void Reset(int irow, int size) @@ -1330,7 +1340,11 @@ private void ProcessVec(int srcLim, FieldSet fields, ColInfo info, ColumnPipe v, var srcCur = fields.Indices[isrc]; Contracts.Assert(min <= srcCur & srcCur < lim); if (!v.Consume(irow, indexBase + srcCur, ref fields.Spans[isrc])) - throw Contracts.Except($"Could not parse value {fields.Spans[isrc]} in slot {indexBase + srcCur} of column {info.Name} in line {line}"); + { + if (!v.HasNA) + throw Contracts.Except($"Could not parse value {fields.Spans[isrc]} in slot {indexBase + srcCur} of column {info.Name} in line {line}"); + v.Rows.Stats.LogBadValue(line, info.Name, indexBase + srcCur); + } } } ivDst += sizeSeg; @@ -1349,7 +1363,11 @@ private void ProcessOne(FieldSet vs, ColInfo info, ColumnPipe v, int irow, long if (isrc < vs.Count && vs.Indices[isrc] == src) { if (!v.Consume(irow, 0, ref vs.Spans[isrc])) - throw Contracts.Except($"Could not parse value {vs.Spans[isrc]} in line {line}, column {info.Name}"); + { + if (!v.HasNA) + throw Contracts.Except($"Could not parse value {vs.Spans[isrc]} in line {line}, column {info.Name}"); + v.Rows.Stats.LogBadValue(line, info.Name); + } } else v.Reset(irow, 0); diff --git a/src/Microsoft.ML.Data/Transforms/TermTransformImpl.cs b/src/Microsoft.ML.Data/Transforms/TermTransformImpl.cs index 7cc2f18c6a..3b774d1f85 100644 --- a/src/Microsoft.ML.Data/Transforms/TermTransformImpl.cs +++ b/src/Microsoft.ML.Data/Transforms/TermTransformImpl.cs @@ -205,7 +205,7 @@ protected Builder(PrimitiveType type) public override void ParseAddTermArg(ref ReadOnlyMemory terms, IChannel ch) { T val; - var tryParse = Runtime.Data.Conversion.Conversions.Instance.GetParseConversion(ItemType); + var tryParse = Runtime.Data.Conversion.Conversions.Instance.GetTryParseConversion(ItemType); for (bool more = true; more;) { ReadOnlyMemory term; @@ -231,7 +231,7 @@ public override void ParseAddTermArg(ref ReadOnlyMemory terms, IChannel ch public override void ParseAddTermArg(string[] terms, IChannel ch) { T val; - var tryParse = Runtime.Data.Conversion.Conversions.Instance.GetParseConversion(ItemType); + var tryParse = Runtime.Data.Conversion.Conversions.Instance.GetTryParseConversion(ItemType); foreach (var sterm in terms) { ReadOnlyMemory term = sterm.AsMemory(); diff --git a/src/Microsoft.ML.Transforms/TermLookupTransform.cs b/src/Microsoft.ML.Transforms/TermLookupTransform.cs index e153494bfa..48c3b9a9d9 100644 --- a/src/Microsoft.ML.Transforms/TermLookupTransform.cs +++ b/src/Microsoft.ML.Transforms/TermLookupTransform.cs @@ -389,7 +389,7 @@ private static IComponentFactory GetLoaderFacto // computing max and min. if (Conversions.Instance.TryParseKey(in txt, 1, ulong.MaxValue, out res)) { - if (res < min) + if (res < min && res != 0) min = res; if (res > max) max = res; diff --git a/test/BaselineOutput/Common/SavePipe/SavePipeHash-CursLog.txt b/test/BaselineOutput/Common/SavePipe/SavePipeHash-CursLog.txt index c3374d87ab..0168b9883e 100644 --- a/test/BaselineOutput/Common/SavePipe/SavePipeHash-CursLog.txt +++ b/test/BaselineOutput/Common/SavePipe/SavePipeHash-CursLog.txt @@ -1 +1,8 @@ + Bad value at line 1 in column CatU1 at slot 2 + Bad value at line 1 in column CatU2 at slot 0 + Bad value at line 2 in column CatU1 at slot 1 + Bad value at line 3 in column CatU2 at slot 0 + Bad value at line 3 in column CatU2 at slot 1 + Bad value at line 4 in column CatU2 at slot 2 +Processed 4 rows with 6 bad values and 0 format errors Cursored through 4 rows diff --git a/test/BaselineOutput/Common/SavePipe/SavePipeLabelParsers4-Data.txt b/test/BaselineOutput/Common/SavePipe/SavePipeLabelParsers4-Data.txt new file mode 100644 index 0000000000..fcc2345716 --- /dev/null +++ b/test/BaselineOutput/Common/SavePipe/SavePipeLabelParsers4-Data.txt @@ -0,0 +1,127 @@ +#@ TextLoader{ +#@ header+ +#@ sep=tab +#@ col=RawLabel:TX:0 +#@ col=FileLabelNum:R4:1 +#@ col=FileLabelKey:U4[0-*]:2 +#@ } +RawLabel FileLabelNum FileLabelKey +Wirtschaft 3.14 +Wirtschaft 3.14 +Wirtschaft 3.14 +Wirtschaft 3.14 +Wirtschaft 3.14 +Wirtschaft 3.14 +Wirtschaft 3.14 +Gesundheit 0.1 +Gesundheit 0.1 +Gesundheit 0.1 +Gesundheit 0.1 +Gesundheit 0.1 +Gesundheit 0.1 +Gesundheit 0.1 +Gesundheit 0.1 +Deutschland 1.5 +Deutschland 1.5 +Ausland 0.5 +Ausland 0.5 +Unterhaltung ? +Unterhaltung ? +Unterhaltung ? +Unterhaltung ? +Unterhaltung ? +Sport 2.71 +Sport 2.71 +Sport 2.71 +Sport 2.71 +Sport 2.71 +Sport 2.71 +Sport 2.71 +Sport 2.71 +Sport 2.71 +Sport 2.71 +Technik & Wissen 0.01 +Technik & Wissen 0.01 +Technik & Wissen 0.01 +Technik & Wissen 0.01 +Technik & Wissen 0.01 +Technik & Wissen 0.01 +Technik & Wissen 0.01 +Technik & Wissen 0.01 +Technik & Wissen 0.01 +Technik & Wissen 0.01 +Technik & Wissen 0.01 +Technik & Wissen 0.01 +Technik & Wissen 0.01 +Technik & Wissen 0.01 +Technik & Wissen 0.01 +Technik & Wissen 0.01 +Technik & Wissen 0.01 +Technik & Wissen 0.01 +Technik & Wissen 0.01 +Technik & Wissen 0.01 +Technik & Wissen 0.01 +Technik & Wissen 0.01 +Technik & Wissen 0.01 +Gesundheit 0.1 +Gesundheit 0.1 +Ausland 0.5 +Ausland 0.5 +Ausland 0.5 +Ausland 0.5 +Ausland 0.5 +Ausland 0.5 +Ausland 0.5 +Ausland 0.5 +Unterhaltung ? +Unterhaltung ? +Unterhaltung ? +Sport 2.71 +Sport 2.71 +Technik & Wissen 0.01 +Technik & Wissen 0.01 +Technik & Wissen 0.01 +Technik & Wissen 0.01 +Technik & Wissen 0.01 +Wirtschaft 3.14 +Gesundheit 0.1 +Wirtschaft 3.14 +Deutschland 1.5 +Wirtschaft 3.14 +Wirtschaft 3.14 +Wirtschaft 3.14 +Wirtschaft 3.14 +Wirtschaft 3.14 +Unterhaltung ? +Deutschland 1.5 +Gesundheit 0.1 +Gesundheit 0.1 +Deutschland 1.5 +Wirtschaft 3.14 +Unterhaltung ? +Wirtschaft 3.14 +Wirtschaft 3.14 +Gesundheit 0.1 +Deutschland 1.5 +Gesundheit 0.1 +Wirtschaft 3.14 +Wirtschaft 3.14 +Deutschland 1.5 +Wirtschaft 3.14 +Wirtschaft 3.14 +Gesundheit 0.1 +Unterhaltung ? +Wirtschaft 3.14 +Deutschland 1.5 +Wirtschaft 3.14 +Unterhaltung ? +Gesundheit 0.1 +Deutschland 1.5 +Wirtschaft 3.14 +Unterhaltung ? +Deutschland 1.5 +Wirtschaft 3.14 +Deutschland 1.5 +Wirtschaft 3.14 +Wirtschaft 3.14 +Deutschland 1.5 diff --git a/test/BaselineOutput/Common/SavePipe/SavePipeLabelParsers4-Schema.txt b/test/BaselineOutput/Common/SavePipe/SavePipeLabelParsers4-Schema.txt new file mode 100644 index 0000000000..e54b1a5652 --- /dev/null +++ b/test/BaselineOutput/Common/SavePipe/SavePipeLabelParsers4-Schema.txt @@ -0,0 +1,35 @@ +---- BoundLoader ---- +3 columns: + RawLabel: Text + Names: Vec + Metadata 'SlotNames': Vec: Length=2, Count=2 + [0] 'de-DE', [1] 'url-pathpart-wirtschaft url-pathpart-soziales url-domainname-www.spiegel.de url-domainprefix-www url-domainprefix-www.spiegel url-domaintype-de url-domainsuffix-spiegel.de url-firstpartpagename-0,1518,793077,00 url-pagepartsplitname-0,1518,793077,00 url-lastpartpagename-html#ref=rss' + Features: Vec + Metadata 'SlotNames': Vec: Length=2, Count=2 + [0] 'weg fuer milliardenhilfe frei', [1] 'vor dem parlamentsgebaeude toben strassenkaempfe zwischen demonstranten drinnen haben die griechischen abgeordneten das drastische sparpaket am abend endgueltig beschlossen die entscheidung ist eine wichtige voraussetzung fuer die auszahlung von weiteren acht milliarden euro hilfsgeldern athen das griechische parlament hat einem umfassenden sparpaket endgueltig zugestimmt' +---- TermLookupTransform ---- +4 columns: + RawLabel: Text + Names: Vec + Metadata 'SlotNames': Vec: Length=2, Count=2 + [0] 'de-DE', [1] 'url-pathpart-wirtschaft url-pathpart-soziales url-domainname-www.spiegel.de url-domainprefix-www url-domainprefix-www.spiegel url-domaintype-de url-domainsuffix-spiegel.de url-firstpartpagename-0,1518,793077,00 url-pagepartsplitname-0,1518,793077,00 url-lastpartpagename-html#ref=rss' + Features: Vec + Metadata 'SlotNames': Vec: Length=2, Count=2 + [0] 'weg fuer milliardenhilfe frei', [1] 'vor dem parlamentsgebaeude toben strassenkaempfe zwischen demonstranten drinnen haben die griechischen abgeordneten das drastische sparpaket am abend endgueltig beschlossen die entscheidung ist eine wichtige voraussetzung fuer die auszahlung von weiteren acht milliarden euro hilfsgeldern athen das griechische parlament hat einem umfassenden sparpaket endgueltig zugestimmt' + FileLabelNum: R4 +---- TermLookupTransform ---- +5 columns: + RawLabel: Text + Names: Vec + Metadata 'SlotNames': Vec: Length=2, Count=2 + [0] 'de-DE', [1] 'url-pathpart-wirtschaft url-pathpart-soziales url-domainname-www.spiegel.de url-domainprefix-www url-domainprefix-www.spiegel url-domaintype-de url-domainsuffix-spiegel.de url-firstpartpagename-0,1518,793077,00 url-pagepartsplitname-0,1518,793077,00 url-lastpartpagename-html#ref=rss' + Features: Vec + Metadata 'SlotNames': Vec: Length=2, Count=2 + [0] 'weg fuer milliardenhilfe frei', [1] 'vor dem parlamentsgebaeude toben strassenkaempfe zwischen demonstranten drinnen haben die griechischen abgeordneten das drastische sparpaket am abend endgueltig beschlossen die entscheidung ist eine wichtige voraussetzung fuer die auszahlung von weiteren acht milliarden euro hilfsgeldern athen das griechische parlament hat einem umfassenden sparpaket endgueltig zugestimmt' + FileLabelNum: R4 + FileLabelKey: Key +---- SelectColumnsDataTransform ---- +3 columns: + RawLabel: Text + FileLabelNum: R4 + FileLabelKey: Key diff --git a/test/BaselineOutput/Common/SavePipe/SavePipeLabelParsers4-out.txt b/test/BaselineOutput/Common/SavePipe/SavePipeLabelParsers4-out.txt new file mode 100644 index 0000000000..bf71ea5899 --- /dev/null +++ b/test/BaselineOutput/Common/SavePipe/SavePipeLabelParsers4-out.txt @@ -0,0 +1,83 @@ + Bad value at line 5 in column Value +Processed 7 rows with 1 bad values and 0 format errors + Bad value at line 5 in column Value +Processed 7 rows with 1 bad values and 0 format errors +Wrote 7 rows across 2 columns in %Time% +Warning: Term 'Wirtschaft' in mapping file is mapped to non key value '3.14' +Warning: Term 'Gesundheit' in mapping file is mapped to non key value '0.1' +Warning: Term 'Deutschland' in mapping file is mapped to non key value '1.5' +Warning: Term 'Ausland' in mapping file is mapped to non key value '0.5' +Warning: Term 'Unterhaltung' in mapping file is mapped to non key value '1a' +Warning: Found 7 non key values in the file '%Output% +Warning: did not find any valid key values in the file '%Output% + Bad value at line 1 in column Value + Bad value at line 2 in column Value + Bad value at line 3 in column Value + Bad value at line 4 in column Value + Bad value at line 5 in column Value + Bad value at line 6 in column Value + Bad value at line 7 in column Value +Processed 7 rows with 7 bad values and 0 format errors + Bad value at line 1 in column Value + Bad value at line 2 in column Value + Bad value at line 3 in column Value + Bad value at line 4 in column Value + Bad value at line 5 in column Value + Bad value at line 6 in column Value + Bad value at line 7 in column Value +Processed 7 rows with 7 bad values and 0 format errors +Wrote 7 rows across 2 columns in %Time% +Wrote 119 rows of length 3 +Wrote 119 rows across 3 columns in %Time% +--- Progress log --- +[1] 'Building term dictionary' started. +[1] (%Time%) 119 examples Total Terms: 7 +[1] 'Building term dictionary' finished in %Time%. +[2] 'BinarySaver' started. +[2] (%Time%) 7 rows +[2] 'BinarySaver' finished in %Time%. +[3] 'TextSaver: saving data' started. +[3] (%Time%) 119 rows +[3] 'TextSaver: saving data' finished in %Time%. +[4] 'BinarySaver #2' started. +[4] (%Time%) 119 rows +[4] 'BinarySaver #2' finished in %Time%. +[5] 'BinarySaver #3' started. +[5] (%Time%) 7 rows +[5] 'BinarySaver #3' finished in %Time%. +[6] 'TextSaver: saving data #2' started. +[6] (%Time%) 119 rows +[6] 'TextSaver: saving data #2' finished in %Time%. +[7] 'BinarySaver #4' started. +[7] (%Time%) 119 rows +[7] 'BinarySaver #4' finished in %Time%. +[8] 'BinarySaver #5' started. +[8] (%Time%) 7 rows +[8] 'BinarySaver #5' finished in %Time%. +[9] 'TextSaver: saving data #3' started. +[9] (%Time%) 119 rows +[9] 'TextSaver: saving data #3' finished in %Time%. +[10] 'BinarySaver #6' started. +[10] (%Time%) 119 rows +[10] 'BinarySaver #6' finished in %Time%. +[11] 'BinarySaver #7' started. +[11] (%Time%) 7 rows +[11] 'BinarySaver #7' finished in %Time%. +[12] 'TextSaver: saving data #4' started. +[12] (%Time%) 119 rows +[12] 'TextSaver: saving data #4' finished in %Time%. +[13] 'BinarySaver #8' started. +[13] (%Time%) 119 rows +[13] 'BinarySaver #8' finished in %Time%. +[14] 'BinarySaver #9' started. +[14] (%Time%) 7 rows +[14] 'BinarySaver #9' finished in %Time%. +[15] 'BinarySaver #10' started. +[15] (%Time%) 7 rows +[15] 'BinarySaver #10' finished in %Time%. +[16] 'TextSaver: saving data #5' started. +[16] (%Time%) 119 rows +[16] 'TextSaver: saving data #5' finished in %Time%. +[17] 'BinarySaver #11' started. +[17] (%Time%) 119 rows +[17] 'BinarySaver #11' finished in %Time%. diff --git a/test/Microsoft.ML.TestFramework/DataPipe/TestDataPipe.cs b/test/Microsoft.ML.TestFramework/DataPipe/TestDataPipe.cs index a8f1e76e67..2aff8417e6 100644 --- a/test/Microsoft.ML.TestFramework/DataPipe/TestDataPipe.cs +++ b/test/Microsoft.ML.TestFramework/DataPipe/TestDataPipe.cs @@ -125,24 +125,17 @@ public void SavePipeLabelParsers() using (var writer = OpenWriter(pathOut)) using (Env.RedirectChannelOutput(writer, writer)) { - try - { - TestCore(pathData, true, - new[] { + TestCore(pathData, true, + new[] { "loader=Text{col=RawLabel:TXT:0 col=Names:TXT:1-2 col=Features:TXT:3-4 header+}", string.Format("xf=TermLookup{{key=- col=FileLabelNum:RawLabel data={{{0}}}}}", mappingPathData), string.Format("xf=TermLookup{{col=FileLabelKey:RawLabel data={{{0}}}}}", mappingPathData), "xf=SelectColumns{keepcol=RawLabel keepcol=FileLabelNum keepcol=FileLabelKey hidden=-}" - }, suffix: "4"); - Assert.True(false); - } - catch (Exception e) - { - while (e.InnerException != null) - e = e.InnerException; - Assert.Equal("Could not parse value 1a in line 5, column Value", e.Message); - } + }, suffix: "4"); + writer.WriteLine(ProgressLogLine); + Env.PrintProgress(); } + CheckEqualityNormalized("SavePipe", name); mappingPathData = DeleteOutputPath("SavePipe", "Mapping.txt"); File.WriteAllLines(mappingPathData, From 168df5e9fca4071726e9b31c17d1df4adc281b1a Mon Sep 17 00:00:00 2001 From: Yael Dekel Date: Wed, 31 Oct 2018 17:12:21 -0700 Subject: [PATCH 3/4] Change bool TryParse --- src/Microsoft.ML.Data/Data/Conversion.cs | 8 -------- 1 file changed, 8 deletions(-) diff --git a/src/Microsoft.ML.Data/Data/Conversion.cs b/src/Microsoft.ML.Data/Data/Conversion.cs index 377014aff2..6255440e9d 100644 --- a/src/Microsoft.ML.Data/Data/Conversion.cs +++ b/src/Microsoft.ML.Data/Data/Conversion.cs @@ -1547,14 +1547,6 @@ public bool TryParse(in TX src, out BL dst) { var span = src.Span; - if (!span.IsEmpty && IsStdMissing(ref span)) - { - dst = false; - return false; - } - - Contracts.Assert(!IsStdMissing(ref span)); - char ch; switch (src.Length) { From da16609e6dfd8134941566c5233de5cefa793f10 Mon Sep 17 00:00:00 2001 From: Yael Dekel Date: Thu, 1 Nov 2018 11:51:19 -0700 Subject: [PATCH 4/4] Fix DateTime/DateTimeOffset/TimeSpan TryParse --- src/Microsoft.ML.Data/Data/Conversion.cs | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/src/Microsoft.ML.Data/Data/Conversion.cs b/src/Microsoft.ML.Data/Data/Conversion.cs index 6255440e9d..23344d9e32 100644 --- a/src/Microsoft.ML.Data/Data/Conversion.cs +++ b/src/Microsoft.ML.Data/Data/Conversion.cs @@ -1428,8 +1428,7 @@ public bool TryParse(in TX src, out TS dst) if (TimeSpan.TryParse(src.ToString(), CultureInfo.InvariantCulture, out dst)) return true; dst = default; - var span = src.Span; - return IsStdMissing(ref span); + return false; } public bool TryParse(in TX src, out DT dst) @@ -1443,8 +1442,7 @@ public bool TryParse(in TX src, out DT dst) if (DateTime.TryParse(src.ToString(), CultureInfo.InvariantCulture, DateTimeStyles.AssumeUniversal | DateTimeStyles.AdjustToUniversal, out dst)) return true; dst = default; - var span = src.Span; - return IsStdMissing(ref span); + return false; } public bool TryParse(in TX src, out DZ dst) @@ -1459,8 +1457,7 @@ public bool TryParse(in TX src, out DZ dst) return true; dst = default; - var span = src.Span; - return IsStdMissing(ref span); + return false; } // These throw an exception for unparsable and overflow values.