Skip to content

Commit

Permalink
Merge pull request #560 from great-expectations/feature/pandas_types
Browse files Browse the repository at this point in the history
Feature/pandas types
  • Loading branch information
jcampbell committed Aug 1, 2019
2 parents c6422f0 + d58d03c commit 06a7966
Show file tree
Hide file tree
Showing 10 changed files with 222 additions and 30 deletions.
6 changes: 6 additions & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,9 @@ matrix:
- dist: trusty
python: 2.7
env: PANDAS=0.23.4
- dist: trusty
python: 2.7
env: PANDAS=0.24.2
- dist: trusty
python: 2.7
env: PANDAS=latest
Expand All @@ -19,6 +22,9 @@ matrix:
- dist: trusty
python: 3.6
env: PANDAS=0.23.4
- dist: trusty
python: 3.6
env: PANDAS=0.24.2
- dist: trusty
python: 3.6
env: PANDAS=latest
Expand Down
3 changes: 2 additions & 1 deletion docs/roadmap_changelog.rst
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,8 @@ Planned Features

v.0.7.4__develop
-----------------

* Add support for pandas extension dtypes in pandas backend of expect_column_values_to_be_of_type and
expect_column_values_to_be_in_type_list and fix bug affecting some dtype-based checks.

v.0.7.3
-----------------
Expand Down
3 changes: 2 additions & 1 deletion great_expectations/dataset/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -1758,7 +1758,8 @@ def expect_column_parameterized_distribution_ks_test_p_value_to_be_greater_than(
column (str): \
The column name.
distribution (str): \
The scipy distribution name. See: `<https://docs.scipy.org/doc/scipy/reference/stats.html>`_
The scipy distribution name. See: `<https://docs.scipy.org/doc/scipy/reference/stats.html>`_ Currently
supported distributions are listed in the Notes section below.
p_value (float): \
The threshold p-value for a passing test. Default is 0.05.
params (dict or list) : \
Expand Down
73 changes: 63 additions & 10 deletions great_expectations/dataset/pandas_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
import pandas as pd
from dateutil.parser import parse
from scipy import stats
from six import PY3, integer_types, string_types
from six import PY2, PY3, integer_types, string_types

from great_expectations.data_asset import DataAsset
from .dataset import Dataset
Expand Down Expand Up @@ -528,17 +528,29 @@ def _expect_column_values_to_be_of_type__aggregate(
try:
comp_types.append(np.dtype(type_).type)
except TypeError:
pass
try:
pd_type = getattr(pd, type_)
if isinstance(pd_type, type):
comp_types.append(pd_type)
except AttributeError:
pass

try:
pd_type = getattr(pd.core.dtypes.dtypes, type_)
if isinstance(pd_type, type):
comp_types.append(pd_type)
except AttributeError:
pass

native_type = self._native_type_type_map(type_)
if native_type is not None:
comp_types.extend(native_type)
success = (self[column].dtype in comp_types)
success = (self[column].dtype.type in comp_types)

return {
"success": success,
"result": {
"observed_value": self[column].dtype.name
"observed_value": self[column].dtype.type.__name__
}
}

Expand All @@ -559,7 +571,12 @@ def _native_type_type_map(type_):
return complex,
elif type_.lower() == "str":
return str,
elif type_.lower() in ["string_types", "unicode"]:
elif type_.lower() == "unicode":
if PY2:
return unicode,
else:
return None
elif type_.lower() in ["string_types"]:
return string_types

@MetaPandasDataset.column_map_expectation
Expand All @@ -573,7 +590,19 @@ def _expect_column_values_to_be_of_type__map(
try:
comp_types.append(np.dtype(type_).type)
except TypeError:
pass
try:
pd_type = getattr(pd, type_)
if isinstance(pd_type, type):
comp_types.append(pd_type)
except AttributeError:
pass

try:
pd_type = getattr(pd.core.dtypes.dtypes, type_)
if isinstance(pd_type, type):
comp_types.append(pd_type)
except AttributeError:
pass

native_type = self._native_type_type_map(type_)
if native_type is not None:
Expand Down Expand Up @@ -682,18 +711,30 @@ def _expect_column_values_to_be_in_type_list__aggregate(
try:
comp_types.append(np.dtype(type_).type)
except TypeError:
pass
try:
pd_type = getattr(pd, type_)
if isinstance(pd_type, type):
comp_types.append(pd_type)
except AttributeError:
pass

try:
pd_type = getattr(pd.core.dtypes.dtypes, type_)
if isinstance(pd_type, type):
comp_types.append(pd_type)
except AttributeError:
pass

native_type = self._native_type_type_map(type_)
if native_type is not None:
comp_types.extend(native_type)

success = (self[column].dtype in comp_types)
success = (self[column].dtype.type in comp_types)

return {
"success": success,
"result": {
"observed_value": self[column].dtype.name
"observed_value": self[column].dtype.type.__name__
}
}

Expand All @@ -709,7 +750,19 @@ def _expect_column_values_to_be_in_type_list__map(
try:
comp_types.append(np.dtype(type_).type)
except TypeError:
pass
try:
pd_type = getattr(pd, type_)
if isinstance(pd_type, type):
comp_types.append(pd_type)
except AttributeError:
pass

try:
pd_type = getattr(pd.core.dtypes.dtypes, type_)
if isinstance(pd_type, type):
comp_types.append(pd_type)
except AttributeError:
pass

native_type = self._native_type_type_map(type_)
if native_type is not None:
Expand Down
3 changes: 0 additions & 3 deletions great_expectations/profile/basic_dataset_profiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,9 +99,6 @@ def _profile(cls, dataset):

for column in df.get_table_columns():

if column == 'sizes':
print("sizes")

# df.expect_column_to_exist(column)

type_ = cls._get_column_type(df, column)
Expand Down
17 changes: 11 additions & 6 deletions tests/datasource/test_datasources.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,14 +105,19 @@ def test_standalone_sqlalchemy_datasource(test_db_connection_string):
def test_create_sqlalchemy_datasource(data_context):
name = "test_sqlalchemy_datasource"
type_ = "sqlalchemy"

# Use sqlite so we don't require postgres for this test.
connection_kwargs = {
"drivername": "postgresql",
"username": "postgres",
"password": "",
"host": "localhost",
"port": 5432,
"database": "test_ci",
"drivername": "sqlite"
}
# connection_kwargs = {
# "drivername": "postgresql",
# "username": "postgres",
# "password": "",
# "host": "localhost",
# "port": 5432,
# "database": "test_ci",
# }

# It should be possible to create a sqlalchemy source using these params without
# saving a profile
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -275,6 +275,86 @@
},
"only_for": ["spark"]
}
]
}]
]
},
{
"data": {
"datetime" : ["2016-01-01T12:34:56", "2016-01-02T11:59:58", "2016-02-02T10:48:36", "2016-02-02T09:33:21", "2016-03-01T08:56:45", "2017-02-01T00:01:02", null, null, null, null],
"datetime_tz" : ["2016-01-01T12:34:56Z", "2016-01-02T11:59:58Z", "2016-02-02T10:48:36Z", "2016-02-02T09:33:21Z", "2016-03-01T08:56:45Z", "2017-02-01T00:01:02Z", null, null, null, null]
},
"schemas": {
"pandas" : {
"datetime": "datetime64[ns]",
"datetime_tz": "timestamp"
},
"postgresql": {
"datetime": "TIMESTAMP",
"datetime_tz": "TIMESTAMP"
},
"spark": {
"datetime": "TimestampType",
"datetime_tz": "TimestampType"
},
"sqlite": {
"datetime": "DATETIME",
"datetime_tz": "DATETIME"
},
"mysql": {
"datetime": "DATETIME",
"datetime_tz": "DATETIME"
}
},
"tests": [
{
"title": "positive_pandas_datetime_no_timezone",
"exact_match_out": false,
"in":{"column":"datetime","type_":"datetime64"},
"out":{
"success":true,
"observed_value": "datetime64"
},
"only_for": ["pandas"]
},
{
"title": "positive_pandas_datetime_with_timezone",
"exact_match_out": false,
"in":{"column":"datetime_tz", "type_":"Timestamp"},
"out":{
"success":true,
"observed_value": "Timestamp"
},
"only_for": ["pandas>=024"]
},
{
"title": "positive_pandas_datetime_with_timezone_pd_022_pd_023",
"exact_match_out": false,
"in":{"column":"datetime_tz", "type_":"DatetimeTZDtypeType"},
"out":{
"success":true,
"observed_value": "DatetimeTZDtypeType"
},
"only_for": ["pandas_022", "pandas_023"]
},
{
"title": "negative_pandas_datetime_with_timezone",
"exact_match_out": false,
"in":{"column":"datetime_tz","type_":"datetime64"},
"out":{
"success":false,
"observed_value_list": ["Timestamp", "DatetimeTZDtypeType"]
},
"only_for": ["pandas"]
},
{
"title": "negative_pandas_datetime_expected_int",
"exact_match_out": false,
"in":{"column":"datetime_tz","type_":"int"},
"out":{
"success":false,
"observed_value_list": ["Timestamp", "DatetimeTZDtypeType"]
},
"only_for": ["pandas"]
}
]
}]
}
8 changes: 8 additions & 0 deletions tests/test_definitions/test_expectations.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
import logging
from collections import OrderedDict

import pandas as pd

from sqlalchemy.dialects.sqlite import dialect as sqliteDialect
from sqlalchemy.dialects.postgresql import dialect as postgresqlDialect
from sqlalchemy.dialects.mysql import dialect as mysqlDialect
Expand Down Expand Up @@ -72,6 +74,12 @@ def pytest_generate_tests(metafunc):
elif isinstance(data_asset, PandasDataset):
if "pandas" in test["only_for"]:
generate_test = True
if (("pandas_022" in test["only_for"] or "pandas_023" in test["only_for"]) and
int(pd.__version__.split(".")[1]) in [22, 23]):
generate_test = True
if (("pandas>=024" in test["only_for"]) and
int(pd.__version__.split(".")[1]) >= 24):
generate_test = True
elif isinstance(data_asset, SparkDFDataset):
if "spark" in test["only_for"]:
generate_test = True
Expand Down
Loading

0 comments on commit 06a7966

Please sign in to comment.