Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 5 additions & 16 deletions src/Microsoft.ML.Data/Data/Conversion.cs
Original file line number Diff line number Diff line change
Expand Up @@ -565,7 +565,7 @@ public ValueMapper<TSrc, SB> GetKeyStringConversion<TSrc>(KeyType key)
}
}

public TryParseMapper<TDst> GetParseConversion<TDst>(ColumnType typeDst)
public TryParseMapper<TDst> GetTryParseConversion<TDst>(ColumnType typeDst)
{
Contracts.CheckValue(typeDst, nameof(typeDst));
Contracts.CheckParam(typeDst.IsStandardScalar || typeDst.IsKey, nameof(typeDst),
Expand Down Expand Up @@ -1196,7 +1196,7 @@ public bool TryParseKey(in TX src, U8 min, U8 max, out U8 dst)
if (min > uu || uu > max)
{
dst = 0;
return true;
return false;
}

dst = uu - min + 1;
Expand Down Expand Up @@ -1428,8 +1428,7 @@ public bool TryParse(in TX src, out TS dst)
if (TimeSpan.TryParse(src.ToString(), CultureInfo.InvariantCulture, out dst))
return true;
dst = default;
var span = src.Span;
return IsStdMissing(ref span);
return false;
}

public bool TryParse(in TX src, out DT dst)
Expand All @@ -1443,8 +1442,7 @@ public bool TryParse(in TX src, out DT dst)
if (DateTime.TryParse(src.ToString(), CultureInfo.InvariantCulture, DateTimeStyles.AssumeUniversal | DateTimeStyles.AdjustToUniversal, out dst))
return true;
dst = default;
var span = src.Span;
return IsStdMissing(ref span);
return false;
}

public bool TryParse(in TX src, out DZ dst)
Expand All @@ -1459,8 +1457,7 @@ public bool TryParse(in TX src, out DZ dst)
return true;

dst = default;
var span = src.Span;
return IsStdMissing(ref span);
return false;
}

// These throw an exception for unparsable and overflow values.
Expand Down Expand Up @@ -1547,14 +1544,6 @@ public bool TryParse(in TX src, out BL dst)
{
var span = src.Span;

if (!span.IsEmpty && IsStdMissing(ref span))
{
dst = false;
return false;
}

Contracts.Assert(!IsStdMissing(ref span));

char ch;
switch (src.Length)
{
Expand Down
34 changes: 26 additions & 8 deletions src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderParser.cs
Original file line number Diff line number Diff line change
Expand Up @@ -74,8 +74,8 @@ private Func<RowSet, ColumnPipe> GetCreatorOneCore<T>(PrimitiveType type)
{
Contracts.Assert(type.IsStandardScalar || type.IsKey);
Contracts.Assert(typeof(T) == type.RawType);
var fn = _conv.GetParseConversion<T>(type);
return rows => new PrimitivePipe<T>(rows, fn);
var fn = _conv.GetTryParseConversion<T>(type);
return rows => new PrimitivePipe<T>(rows, type, fn);
}

private Func<RowSet, ColumnPipe> GetCreatorVecCore(PrimitiveType type)
Expand All @@ -88,8 +88,8 @@ private Func<RowSet, ColumnPipe> GetCreatorVecCore<T>(PrimitiveType type)
{
Contracts.Assert(type.IsStandardScalar || type.IsKey);
Contracts.Assert(typeof(T) == type.RawType);
var fn = _conv.GetParseConversion<T>(type);
return rows => new VectorPipe<T>(rows, fn);
var fn = _conv.GetTryParseConversion<T>(type);
return rows => new VectorPipe<T>(rows, type, fn);
}

public Func<RowSet, ColumnPipe> GetCreatorOne(KeyType key)
Expand Down Expand Up @@ -218,6 +218,8 @@ private abstract class ColumnPipe
{
public readonly RowSet Rows;

public abstract bool HasNA { get; }

protected ColumnPipe(RowSet rows)
{
Contracts.AssertValue(rows);
Expand All @@ -239,12 +241,16 @@ private sealed class PrimitivePipe<TResult> : ColumnPipe
// Has length Rows.Count, so indexed by irow.
private TResult[] _values;

public PrimitivePipe(RowSet rows, TryParseMapper<TResult> conv)
public override bool HasNA { get; }

// Creates a pipe that parses a single primitive value per row.
//   rows: the row set this pipe belongs to; one value slot is allocated per row (Rows.Count).
//   type: the primitive column type; its RawType is asserted to be TResult.
//   conv: the try-parse delegate stored for converting text into TResult.
public PrimitivePipe(RowSet rows, PrimitiveType type, TryParseMapper<TResult> conv)
    : base(rows)
{
    Contracts.AssertValue(conv);
    Contracts.Assert(typeof(TResult) == type.RawType);
    _conv = conv;
    _values = new TResult[Rows.Count];
    // Only the success flag is recorded: HasNA is true when an NA predicate can be
    // obtained for the type. The predicate delegate itself is not used, so discard it.
    HasNA = Conversions.Instance.TryGetIsNAPredicate(type, out _);
}

public override void Reset(int irow, int size)
Expand Down Expand Up @@ -279,6 +285,8 @@ private sealed class VectorPipe<TItem> : ColumnPipe
{
private readonly TryParseMapper<TItem> _conv;

public override bool HasNA { get; }

private class VectorValue
{
private readonly VectorPipe<TItem> _pipe;
Expand Down Expand Up @@ -421,14 +429,16 @@ public void Get(ref VBuffer<TItem> dst)
// Has length Rows.Count, so indexed by irow.
private VectorValue[] _values;

public VectorPipe(RowSet rows, TryParseMapper<TItem> conv)
// Creates a pipe that parses a vector of primitive items per row.
//   rows: the row set this pipe belongs to; one VectorValue is allocated per row (Rows.Count).
//   type: the primitive item type; its RawType is asserted to be TItem.
//   conv: the try-parse delegate stored for converting text into TItem.
public VectorPipe(RowSet rows, PrimitiveType type, TryParseMapper<TItem> conv)
    : base(rows)
{
    Contracts.AssertValue(conv);
    Contracts.Assert(typeof(TItem) == type.RawType);
    _conv = conv;
    _values = new VectorValue[Rows.Count];
    // Each row gets its own VectorValue, constructed with a reference to this pipe.
    for (int i = 0; i < _values.Length; i++)
        _values[i] = new VectorValue(this);
    // Only the success flag is recorded: HasNA is true when an NA predicate can be
    // obtained for the item type. The predicate delegate itself is not used, so discard it.
    HasNA = Conversions.Instance.TryGetIsNAPredicate(type, out _);
}

public override void Reset(int irow, int size)
Expand Down Expand Up @@ -1330,7 +1340,11 @@ private void ProcessVec(int srcLim, FieldSet fields, ColInfo info, ColumnPipe v,
var srcCur = fields.Indices[isrc];
Contracts.Assert(min <= srcCur & srcCur < lim);
if (!v.Consume(irow, indexBase + srcCur, ref fields.Spans[isrc]))
throw Contracts.Except($"Could not parse value {fields.Spans[isrc]} in slot {indexBase + srcCur} of column {info.Name} in line {line}");
{
if (!v.HasNA)
throw Contracts.Except($"Could not parse value {fields.Spans[isrc]} in slot {indexBase + srcCur} of column {info.Name} in line {line}");
v.Rows.Stats.LogBadValue(line, info.Name, indexBase + srcCur);
}
}
}
ivDst += sizeSeg;
Expand All @@ -1349,7 +1363,11 @@ private void ProcessOne(FieldSet vs, ColInfo info, ColumnPipe v, int irow, long
if (isrc < vs.Count && vs.Indices[isrc] == src)
{
if (!v.Consume(irow, 0, ref vs.Spans[isrc]))
throw Contracts.Except($"Could not parse value {vs.Spans[isrc]} in line {line}, column {info.Name}");
{
if (!v.HasNA)
throw Contracts.Except($"Could not parse value {vs.Spans[isrc]} in line {line}, column {info.Name}");
v.Rows.Stats.LogBadValue(line, info.Name);
}
}
else
v.Reset(irow, 0);
Expand Down
4 changes: 2 additions & 2 deletions src/Microsoft.ML.Data/Transforms/TermTransformImpl.cs
Original file line number Diff line number Diff line change
Expand Up @@ -205,7 +205,7 @@ protected Builder(PrimitiveType type)
public override void ParseAddTermArg(ref ReadOnlyMemory<char> terms, IChannel ch)
{
T val;
var tryParse = Runtime.Data.Conversion.Conversions.Instance.GetParseConversion<T>(ItemType);
var tryParse = Runtime.Data.Conversion.Conversions.Instance.GetTryParseConversion<T>(ItemType);
for (bool more = true; more;)
{
ReadOnlyMemory<char> term;
Expand All @@ -231,7 +231,7 @@ public override void ParseAddTermArg(ref ReadOnlyMemory<char> terms, IChannel ch
public override void ParseAddTermArg(string[] terms, IChannel ch)
{
T val;
var tryParse = Runtime.Data.Conversion.Conversions.Instance.GetParseConversion<T>(ItemType);
var tryParse = Runtime.Data.Conversion.Conversions.Instance.GetTryParseConversion<T>(ItemType);
foreach (var sterm in terms)
{
ReadOnlyMemory<char> term = sterm.AsMemory();
Expand Down
23 changes: 11 additions & 12 deletions test/Microsoft.ML.TestFramework/DataPipe/TestDataPipe.cs
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ public sealed partial class TestDataPipe : TestDataPipeBase
private static VBuffer<Double> dataDoubleSparse = new VBuffer<Double>(5, 3, new double[] { -0.0, 0, 1 }, new[] { 0, 3, 4 });
private static uint[] resultsDoubleSparse = new uint[] { 21, 21, 21, 21, 31 };

[Fact(Skip = "Schema baseline comparison fails")]
[Fact()]
public void SavePipeLabelParsers()
{
string pathData = GetDataPath(@"lm.sample.txt");
Expand All @@ -44,7 +44,7 @@ public void SavePipeLabelParsers()
"xf=AutoLabel{col=AutoLabel:RawLabel}",
"xf=Term{col=StringLabel:RawLabel terms={Wirtschaft,Gesundheit,Deutschland,Ausland,Unterhaltung,Sport,Technik & Wissen}}",
string.Format("xf=TermLookup{{col=FileLabel:RawLabel data={{{0}}}}}", mappingPathData),
"xf=SelectColumns{keepcol=RawLabel keepcol=AutoLabel keepcol=StringLabel keepcol=FileLabel}"
"xf=SelectColumns{keepcol=RawLabel keepcol=AutoLabel keepcol=StringLabel keepcol=FileLabel hidden=-}"
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

keepHidden=false is default.

});

mappingPathData = DeleteOutputPath("SavePipe", "Mapping.txt");
Expand All @@ -64,7 +64,7 @@ public void SavePipeLabelParsers()
new[] {
"loader=Text{col=RawLabel:TXT:0 col=Names:TXT:1-2 col=Features:TXT:3-4 header+}",
string.Format("xf=TermLookup{{col=FileLabel:RawLabel data={{{0}}}}}", mappingPathData),
"xf=SelectColumns{keepcol=RawLabel keepcol=FileLabel}"
"xf=SelectColumns{keepcol=RawLabel keepcol=FileLabel hidden=-}"
}, suffix: "1");

mappingPathData = DeleteOutputPath("SavePipe", "Mapping.txt");
Expand All @@ -84,7 +84,7 @@ public void SavePipeLabelParsers()
new[] {
"loader=Text{col=RawLabel:TXT:0 col=Names:TXT:1-2 col=Features:TXT:3-4 header+}",
string.Format("xf=TermLookup{{col=FileLabel:RawLabel data={{{0}}}}}", mappingPathData),
"xf=SelectColumns{keepcol=RawLabel keepcol=FileLabel}"
"xf=SelectColumns{keepcol=RawLabel keepcol=FileLabel hidden=-}"
}, suffix: "2");

mappingPathData = DeleteOutputPath("SavePipe", "Mapping.txt");
Expand All @@ -104,7 +104,7 @@ public void SavePipeLabelParsers()
new[] {
"loader=Text{col=RawLabel:TXT:0 col=Names:TXT:1-2 col=Features:TXT:3-4 header+}",
string.Format("xf=TermLookup{{key=- col=FileLabel:RawLabel data={{{0}}}}}", mappingPathData),
"xf=SelectColumns{keepcol=RawLabel keepcol=FileLabel}"
"xf=SelectColumns{keepcol=RawLabel keepcol=FileLabel hidden=-}"
}, suffix: "3");

mappingPathData = DeleteOutputPath("SavePipe", "Mapping.txt");
Expand All @@ -127,15 +127,14 @@ public void SavePipeLabelParsers()
{
TestCore(pathData, true,
new[] {
"loader=Text{col=RawLabel:TXT:0 col=Names:TXT:1-2 col=Features:TXT:3-4 header+}",
string.Format("xf=TermLookup{{key=- col=FileLabelNum:RawLabel data={{{0}}}}}", mappingPathData),
string.Format("xf=TermLookup{{col=FileLabelKey:RawLabel data={{{0}}}}}", mappingPathData),
"xf=SelectColumns{keepcol=RawLabel keepcol=FileLabelNum keepcol=FileLabelKey}"
"loader=Text{col=RawLabel:TXT:0 col=Names:TXT:1-2 col=Features:TXT:3-4 header+}",
string.Format("xf=TermLookup{{key=- col=FileLabelNum:RawLabel data={{{0}}}}}", mappingPathData),
string.Format("xf=TermLookup{{col=FileLabelKey:RawLabel data={{{0}}}}}", mappingPathData),
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this meant to be duplicated?

"xf=SelectColumns{keepcol=RawLabel keepcol=FileLabelNum keepcol=FileLabelKey hidden=-}"
}, suffix: "4");
writer.WriteLine(ProgressLogLine);
Env.PrintProgress();
}

CheckEqualityNormalized("SavePipe", name);

mappingPathData = DeleteOutputPath("SavePipe", "Mapping.txt");
Expand All @@ -155,7 +154,7 @@ public void SavePipeLabelParsers()
new[] {
"loader=Text{col=RawLabel:TXT:0 col=Names:TXT:1-2 col=Features:TXT:3-4 header+}",
string.Format("xf=TermLookup{{col=FileLabel:RawLabel data={{{0}}}}}", mappingPathData),
"xf=SelectColumns{keepcol=RawLabel keepcol=FileLabel}"
"xf=SelectColumns{keepcol=RawLabel keepcol=FileLabel hidden=-}"
}, suffix: "5");

Done();
Expand Down Expand Up @@ -421,7 +420,7 @@ public void SavePipeCat()
Done();
}

[Fact(Skip = "Schema baseline comparison fails")]
[Fact()]
public void SavePipeHash()
{
string pathData = DeleteOutputPath("SavePipe", "HashTransform.txt");
Expand Down