Skip to content
This repository has been archived by the owner on Aug 2, 2023. It is now read-only.

Commit

Permalink
Missing values default to a StringDataFrameColumn (#2982)
Browse files Browse the repository at this point in the history
* Make LoadCsv more robust

* Test empty string column

* Retain prev guess where possible
  • Loading branch information
Prashanth Govindarajan committed Oct 20, 2020
1 parent bdafc55 commit 3bd4a55
Show file tree
Hide file tree
Showing 2 changed files with 54 additions and 22 deletions.
32 changes: 10 additions & 22 deletions src/Microsoft.Data.Analysis/DataFrame.IO.cs
Original file line number Diff line number Diff line change
Expand Up @@ -32,38 +32,26 @@ private static Type GuessKind(int col, List<string[]> read)
continue;
}

bool boolParse = bool.TryParse(val, out bool boolResult);
if (boolParse)
if (!string.IsNullOrEmpty(val))
{
res = DetermineType(nbline == 0, typeof(bool), res);
++nbline;
continue;
}
else
{
if (string.IsNullOrEmpty(val))
bool boolParse = bool.TryParse(val, out bool boolResult);
if (boolParse)
{
res = DetermineType(nbline == 0, typeof(bool), res);
++nbline;
continue;
}
}
bool floatParse = float.TryParse(val, out float floatResult);
if (floatParse)
{
res = DetermineType(nbline == 0, typeof(float), res);
++nbline;
continue;
}
else
{
if (string.IsNullOrEmpty(val))
bool floatParse = float.TryParse(val, out float floatResult);
if (floatParse)
{
res = DetermineType(nbline == 0, typeof(float), res);
++nbline;
continue;
}

res = DetermineType(nbline == 0, typeof(string), res);
++nbline;
}
res = DetermineType(nbline == 0, typeof(string), res);
++nbline;
}
return res;
}
Expand Down
44 changes: 44 additions & 0 deletions tests/Microsoft.Data.Analysis.Tests/DataFrame.IOTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -760,5 +760,49 @@ public void TestWriteCsvWithSemicolonSeparator()
Assert.Equal(1F, readIn[1, 9]);
Assert.Equal(1F, readIn[1, 10]);
}

// Loads a two-column CSV whose first column mixes "null"/"Null" markers, a number,
// a boolean, an empty field and plain text, and whose second column is empty on
// every row, then checks that both columns come back as string columns.
[Fact]
public void TestMixedDataTypesInCsv()
{
    // NOTE: the continuation lines of the verbatim string are deliberately unindented;
    // any leading whitespace would become part of the CSV fields.
    string data = @"vendor_id,empty
null,
1,
true,
Null,
,
CMT,";

    DataFrame df;
    using (Stream csvStream = new MemoryStream(Encoding.Default.GetBytes(data)))
    {
        df = DataFrame.LoadCsv(csvStream);
    }

    Assert.Equal(6, df.Rows.Count);
    Assert.Equal(2, df.Columns.Count);

    // Assert.Equal (rather than Assert.True on ==) reports the actual type on failure.
    Assert.Equal(typeof(string), df.Columns[0].DataType);
    Assert.Equal(typeof(string), df.Columns[1].DataType);

    Assert.Equal("vendor_id", df.Columns[0].Name);
    Assert.Equal("empty", df.Columns[1].Name);
    VerifyColumnTypes(df);

    // Only the literal "null" / "Null" entries (rows 0 and 3) are expected to count as nulls;
    // empty fields are expected to be kept as empty strings.
    Assert.Equal(2, df.Columns[0].NullCount);
    Assert.Equal(0, df.Columns[1].NullCount);

    var nullRow = df.Rows[3];
    Assert.Null(nullRow[0]);

    // Row 4 has an empty vendor_id field: expected to be "", not null.
    var emptyValueRow = df.Rows[4];
    Assert.Equal("", emptyValueRow[0]);

    Assert.Null(df[0, 0]);
    Assert.Null(df[3, 0]);

    // Every value of the all-empty column is expected to be an empty string.
    StringDataFrameColumn emptyColumn = (StringDataFrameColumn)df.Columns[1];
    for (long i = 0; i < emptyColumn.Length; i++)
    {
        Assert.Equal("", emptyColumn[i]);
    }
}
}
}

0 comments on commit 3bd4a55

Please sign in to comment.