Skip to content

Commit

Permalink
Merge pull request #560 from great-expectations/feature/pandas_types
Browse files Browse the repository at this point in the history
Feature/pandas types
  • Loading branch information
jcampbell committed Aug 1, 2019
2 parents c6422f0 + d58d03c commit 06a7966
Show file tree
Hide file tree
Showing 10 changed files with 222 additions and 30 deletions.
6 changes: 6 additions & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,9 @@ matrix:
- dist: trusty
python: 2.7
env: PANDAS=0.23.4
- dist: trusty
python: 2.7
env: PANDAS=0.24.2
- dist: trusty
python: 2.7
env: PANDAS=latest
Expand All @@ -19,6 +22,9 @@ matrix:
- dist: trusty
python: 3.6
env: PANDAS=0.23.4
- dist: trusty
python: 3.6
env: PANDAS=0.24.2
- dist: trusty
python: 3.6
env: PANDAS=latest
Expand Down
3 changes: 2 additions & 1 deletion docs/roadmap_changelog.rst
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,8 @@ Planned Features

v.0.7.4__develop
-----------------

* Add support for pandas extension dtypes in pandas backend of expect_column_values_to_be_of_type and
expect_column_values_to_be_in_type_list and fix bug affecting some dtype-based checks.

v.0.7.3
-----------------
Expand Down
3 changes: 2 additions & 1 deletion great_expectations/dataset/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -1758,7 +1758,8 @@ def expect_column_parameterized_distribution_ks_test_p_value_to_be_greater_than(
column (str): \
The column name.
distribution (str): \
The scipy distribution name. See: `<https://docs.scipy.org/doc/scipy/reference/stats.html>`_
The scipy distribution name. See: `<https://docs.scipy.org/doc/scipy/reference/stats.html>`_ Currently
supported distributions are listed in the Notes section below.
p_value (float): \
The threshold p-value for a passing test. Default is 0.05.
params (dict or list) : \
Expand Down
73 changes: 63 additions & 10 deletions great_expectations/dataset/pandas_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
import pandas as pd
from dateutil.parser import parse
from scipy import stats
from six import PY3, integer_types, string_types
from six import PY2, PY3, integer_types, string_types

from great_expectations.data_asset import DataAsset
from .dataset import Dataset
Expand Down Expand Up @@ -528,17 +528,29 @@ def _expect_column_values_to_be_of_type__aggregate(
try:
comp_types.append(np.dtype(type_).type)
except TypeError:
pass
try:
pd_type = getattr(pd, type_)
if isinstance(pd_type, type):
comp_types.append(pd_type)
except AttributeError:
pass

try:
pd_type = getattr(pd.core.dtypes.dtypes, type_)
if isinstance(pd_type, type):
comp_types.append(pd_type)
except AttributeError:
pass

native_type = self._native_type_type_map(type_)
if native_type is not None:
comp_types.extend(native_type)
success = (self[column].dtype in comp_types)
success = (self[column].dtype.type in comp_types)

return {
"success": success,
"result": {
"observed_value": self[column].dtype.name
"observed_value": self[column].dtype.type.__name__
}
}

Expand All @@ -559,7 +571,12 @@ def _native_type_type_map(type_):
return complex,
elif type_.lower() == "str":
return str,
elif type_.lower() in ["string_types", "unicode"]:
elif type_.lower() == "unicode":
if PY2:
return unicode,
else:
return None
elif type_.lower() in ["string_types"]:
return string_types

@MetaPandasDataset.column_map_expectation
Expand All @@ -573,7 +590,19 @@ def _expect_column_values_to_be_of_type__map(
try:
comp_types.append(np.dtype(type_).type)
except TypeError:
pass
try:
pd_type = getattr(pd, type_)
if isinstance(pd_type, type):
comp_types.append(pd_type)
except AttributeError:
pass

try:
pd_type = getattr(pd.core.dtypes.dtypes, type_)
if isinstance(pd_type, type):
comp_types.append(pd_type)
except AttributeError:
pass

native_type = self._native_type_type_map(type_)
if native_type is not None:
Expand Down Expand Up @@ -682,18 +711,30 @@ def _expect_column_values_to_be_in_type_list__aggregate(
try:
comp_types.append(np.dtype(type_).type)
except TypeError:
pass
try:
pd_type = getattr(pd, type_)
if isinstance(pd_type, type):
comp_types.append(pd_type)
except AttributeError:
pass

try:
pd_type = getattr(pd.core.dtypes.dtypes, type_)
if isinstance(pd_type, type):
comp_types.append(pd_type)
except AttributeError:
pass

native_type = self._native_type_type_map(type_)
if native_type is not None:
comp_types.extend(native_type)

success = (self[column].dtype in comp_types)
success = (self[column].dtype.type in comp_types)

return {
"success": success,
"result": {
"observed_value": self[column].dtype.name
"observed_value": self[column].dtype.type.__name__
}
}

Expand All @@ -709,7 +750,19 @@ def _expect_column_values_to_be_in_type_list__map(
try:
comp_types.append(np.dtype(type_).type)
except TypeError:
pass
try:
pd_type = getattr(pd, type_)
if isinstance(pd_type, type):
comp_types.append(pd_type)
except AttributeError:
pass

try:
pd_type = getattr(pd.core.dtypes.dtypes, type_)
if isinstance(pd_type, type):
comp_types.append(pd_type)
except AttributeError:
pass

native_type = self._native_type_type_map(type_)
if native_type is not None:
Expand Down
3 changes: 0 additions & 3 deletions great_expectations/profile/basic_dataset_profiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,9 +99,6 @@ def _profile(cls, dataset):

for column in df.get_table_columns():

if column == 'sizes':
print("sizes")

# df.expect_column_to_exist(column)

type_ = cls._get_column_type(df, column)
Expand Down
17 changes: 11 additions & 6 deletions tests/datasource/test_datasources.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,14 +105,19 @@ def test_standalone_sqlalchemy_datasource(test_db_connection_string):
def test_create_sqlalchemy_datasource(data_context):
name = "test_sqlalchemy_datasource"
type_ = "sqlalchemy"

# Use sqlite so we don't require postgres for this test.
connection_kwargs = {
"drivername": "postgresql",
"username": "postgres",
"password": "",
"host": "localhost",
"port": 5432,
"database": "test_ci",
"drivername": "sqlite"
}
# connection_kwargs = {
# "drivername": "postgresql",
# "username": "postgres",
# "password": "",
# "host": "localhost",
# "port": 5432,
# "database": "test_ci",
# }

# It should be possible to create a sqlalchemy source using these params without
# saving a profile
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -275,6 +275,86 @@
},
"only_for": ["spark"]
}
]
}]
]
},
{
"data": {
"datetime" : ["2016-01-01T12:34:56", "2016-01-02T11:59:58", "2016-02-02T10:48:36", "2016-02-02T09:33:21", "2016-03-01T08:56:45", "2017-02-01T00:01:02", null, null, null, null],
"datetime_tz" : ["2016-01-01T12:34:56Z", "2016-01-02T11:59:58Z", "2016-02-02T10:48:36Z", "2016-02-02T09:33:21Z", "2016-03-01T08:56:45Z", "2017-02-01T00:01:02Z", null, null, null, null]
},
"schemas": {
"pandas" : {
"datetime": "datetime64[ns]",
"datetime_tz": "timestamp"
},
"postgresql": {
"datetime": "TIMESTAMP",
"datetime_tz": "TIMESTAMP"
},
"spark": {
"datetime": "TimestampType",
"datetime_tz": "TimestampType"
},
"sqlite": {
"datetime": "DATETIME",
"datetime_tz": "DATETIME"
},
"mysql": {
"datetime": "DATETIME",
"datetime_tz": "DATETIME"
}
},
"tests": [
{
"title": "positive_pandas_datetime_no_timezone",
"exact_match_out": false,
"in":{"column":"datetime","type_":"datetime64"},
"out":{
"success":true,
"observed_value": "datetime64"
},
"only_for": ["pandas"]
},
{
"title": "positive_pandas_datetime_with_timezone",
"exact_match_out": false,
"in":{"column":"datetime_tz", "type_":"Timestamp"},
"out":{
"success":true,
"observed_value": "Timestamp"
},
"only_for": ["pandas>=024"]
},
{
"title": "positive_pandas_datetime_with_timezone_pd_022_pd_023",
"exact_match_out": false,
"in":{"column":"datetime_tz", "type_":"DatetimeTZDtypeType"},
"out":{
"success":true,
"observed_value": "DatetimeTZDtypeType"
},
"only_for": ["pandas_022", "pandas_023"]
},
{
"title": "negative_pandas_datetime_with_timezone",
"exact_match_out": false,
"in":{"column":"datetime_tz","type_":"datetime64"},
"out":{
"success":false,
"observed_value_list": ["Timestamp", "DatetimeTZDtypeType"]
},
"only_for": ["pandas"]
},
{
"title": "negative_pandas_datetime_expected_int",
"exact_match_out": false,
"in":{"column":"datetime_tz","type_":"int"},
"out":{
"success":false,
"observed_value_list": ["Timestamp", "DatetimeTZDtypeType"]
},
"only_for": ["pandas"]
}
]
}]
}
8 changes: 8 additions & 0 deletions tests/test_definitions/test_expectations.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
import logging
from collections import OrderedDict

import pandas as pd

from sqlalchemy.dialects.sqlite import dialect as sqliteDialect
from sqlalchemy.dialects.postgresql import dialect as postgresqlDialect
from sqlalchemy.dialects.mysql import dialect as mysqlDialect
Expand Down Expand Up @@ -72,6 +74,12 @@ def pytest_generate_tests(metafunc):
elif isinstance(data_asset, PandasDataset):
if "pandas" in test["only_for"]:
generate_test = True
if (("pandas_022" in test["only_for"] or "pandas_023" in test["only_for"]) and
int(pd.__version__.split(".")[1]) in [22, 23]):
generate_test = True
if (("pandas>=024" in test["only_for"]) and
int(pd.__version__.split(".")[1]) >= 24):
generate_test = True
elif isinstance(data_asset, SparkDFDataset):
if "spark" in test["only_for"]:
generate_test = True
Expand Down
Loading

0 comments on commit 06a7966

Please sign in to comment.