Skip to content

Commit

Permalink
Merge pull request #1070 from great-expectations/eugene/0.9.0_value_c…
Browse files Browse the repository at this point in the history
…ount_fix

PandasDataset value_count fix for mixed types in object columns
  • Loading branch information
eugmandel committed Feb 12, 2020
2 parents f504158 + dd16f10 commit ae651a0
Show file tree
Hide file tree
Showing 2 changed files with 22 additions and 1 deletion.
9 changes: 8 additions & 1 deletion great_expectations/dataset/pandas_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -369,7 +369,14 @@ def get_column_value_counts(self, column, sort="value", collate=None):
)
counts = self[column].value_counts()
if sort == "value":
counts.sort_index(inplace=True)
try:
counts.sort_index(inplace=True)
except TypeError:
# Having values of multiple types in an object dtype column (e.g., strings and floats)
# raises a TypeError when the sorting method performs comparisons.
if self[column].dtype == object:
counts.index = counts.index.astype(str)
counts.sort_index(inplace=True)
elif sort == "counts":
counts.sort_values(inplace=True)
counts.name = "count"
Expand Down
14 changes: 14 additions & 0 deletions tests/dataset/test_pandas_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -577,3 +577,17 @@ def test_pandas_deepcopy():
assert df2.expect_column_to_exist("a").success == True
assert list(df["a"]) == [2, 3, 4]
assert list(df2["a"]) == [1, 2, 3]

def test_ge_value_count_of_object_dtype_column_with_mixed_types():
    """
    Regression test for value counts on an object dtype column that holds
    values of mixed types (e.g., strings and floats).

    Sorting the value counts of such a column used to raise a TypeError;
    this checks that the counts are returned and that the lone string
    value is counted exactly once.
    """
    string_value = "I am a string in an otherwise float column"
    mixed_column = [1.5, 0.009, 0.5, string_value]
    dataset = ge.dataset.PandasDataset({"A": mixed_column})

    counts = dataset.get_column_value_counts("A")
    assert counts[string_value] == 1

0 comments on commit ae651a0

Please sign in to comment.