Skip to content

Commit

Permalink
Merge fac8b34 into cb2060a
Browse files Browse the repository at this point in the history
  • Loading branch information
jcampbell committed Apr 23, 2019
2 parents cb2060a + fac8b34 commit 92be4a2
Show file tree
Hide file tree
Showing 6 changed files with 85 additions and 50 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ For quick, hands-on introductions to Great Expectations' key features, check out
What's the best way to get in touch with the Great Expectations team?
--------------------------------------------------------------------------------

[Issues on GitHub](https://github.com/great-expectations/great_expectations/issues). If you have questions, comments, feature requests, etc., [opening an issue](https://github.com/great-expectations/great_expectations/issues/new) is definitely the best path forward.
If you have questions, comments, feature requests, etc., [opening an issue](https://github.com/great-expectations/great_expectations/issues/new) is definitely the best path forward. We also have a Slack channel: if you email us at <team@greatexpectations.io> with the subject line "SLACK" we'll get you an invite.


Great Expectations doesn't do X. Is it right for my use case?
Expand Down
32 changes: 27 additions & 5 deletions great_expectations/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,14 @@
import sys
import os
import argparse
import logging

from great_expectations import read_csv
from great_expectations import __version__
from great_expectations.dataset import PandasDataset
from great_expectations.dataset import Dataset, PandasDataset
from great_expectations.data_asset import FileDataAsset

logger = logging.getLogger(__name__)

def dispatch(args):
parser = argparse.ArgumentParser(
Expand Down Expand Up @@ -69,6 +72,7 @@ def validate(parsed_args):
else:
evaluation_parameters = None

# Use a custom dataasset module and class if provided. Otherwise infer from the config.
if parsed_args["custom_dataset_module"]:
sys.path.insert(0, os.path.dirname(
parsed_args["custom_dataset_module"]))
Expand All @@ -77,14 +81,27 @@ def validate(parsed_args):
custom_module = __import__(module_name)
dataset_class = getattr(
custom_module, parsed_args["custom_dataset_class"])

elif "data_asset_type" in expectations_config:
if expectations_config["data_asset_type"] == "Dataset" or expectations_config["data_asset_type"] == "PandasDataset":
dataset_class = PandasDataset
elif expectations_config["data_asset_type"].endswith("Dataset"):
logger.info("Using PandasDataset to validate dataset of type %s." % expectations_config["data_asset_type"])
dataset_class = PandasDataset
elif expectations_config["data_asset_type"] == "FileDataAsset":
dataset_class = FileDataAsset
else:
logger.critical("Unrecognized data_asset_type %s. You may need to specifcy custom_dataset_module and custom_dataset_class." % expectations_config["data_asset_type"])
return -1
else:
dataset_class = PandasDataset

df = read_csv(data_set, expectations_config=expectations_config,
dataset_class=dataset_class)
if issubclass(dataset_class, Dataset):
da = read_csv(data_set, expectations_config=expectations_config,
dataset_class=dataset_class)
else:
da = dataset_class(data_set, config=expectations_config)

result = df.validate(
result = da.validate(
evaluation_parameters=evaluation_parameters,
result_format=parsed_args["result_format"],
catch_exceptions=parsed_args["catch_exceptions"],
Expand All @@ -103,6 +120,11 @@ def version(parsed_args):


def main():
    """CLI entry point: configure logging, run the dispatcher, and exit with its status."""
    # Emit timestamped INFO-and-above records from this module's logger to stderr.
    stream_handler = logging.StreamHandler()
    stream_handler.setFormatter(
        logging.Formatter('%(asctime)s %(name)-12s %(levelname)-8s %(message)s'))
    logger.addHandler(stream_handler)
    logger.setLevel(logging.INFO)
    # dispatch() returns a process exit code; propagate it via sys.exit.
    sys.exit(dispatch(sys.argv[1:]))

Expand Down
31 changes: 25 additions & 6 deletions great_expectations/data_asset/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from functools import wraps
import traceback
import warnings
import logging
from six import PY3, string_types
from collections import namedtuple

Expand All @@ -17,6 +18,7 @@
from great_expectations.data_asset.util import DotDict, recursively_convert_to_json_serializable, parse_result_format
from great_expectations.dataset.autoinspect import columns_exist

logger = logging.getLogger("DataAsset")

class DataAsset(object):

Expand All @@ -35,9 +37,11 @@ def __init__(self, *args, **kwargs):
"""
autoinspect_func = kwargs.pop("autoinspect_func", None)
initial_config = kwargs.pop("config", None)
data_asset_name = kwargs.pop("data_asset_name", None)

super(DataAsset, self).__init__(*args, **kwargs)
self._initialize_expectations()
self._initialize_expectations(config=initial_config, data_asset_name=data_asset_name)
if autoinspect_func is not None:
autoinspect_func(self)

Expand Down Expand Up @@ -198,20 +202,24 @@ def wrapper(self, *args, **kwargs):

return outer_wrapper

def _initialize_expectations(self, config=None, name=None):
def _initialize_expectations(self, config=None, data_asset_name=None):
"""Instantiates `_expectations_config` as empty by default or with a specified expectation `config`.
In addition, this always sets the `default_expectation_args` to:
`include_config`: False,
`catch_exceptions`: False,
`output_format`: 'BASIC'
By default, initializes data_asset_type to the name of the implementing class, but subclasses
that have interoperable semantics (e.g. Dataset) may override that parameter to clarify their
interoperability.
Args:
config (json): \
A json-serializable expectation config. \
If None, creates default `_expectations_config` with an empty list of expectations and \
key value `data_asset_name` as `name`.
key value `data_asset_name` as `data_asset_name`.
name (string): \
data_asset_name (string): \
The name to assign to `_expectations_config.data_asset_name` if `config` is not provided.
"""
Expand All @@ -224,14 +232,17 @@ def _initialize_expectations(self, config=None, name=None):
with warnings.catch_warnings():
warnings.simplefilter("ignore", category=UserWarning)
self._expectations_config = DotDict(copy.deepcopy(config))
if data_asset_name is not None:
self._expectations_config["data_asset_name"] = data_asset_name

else:
# Pandas incorrectly interprets this as an attempt to create a column and throws up a warning. Suppress it
# since we are subclassing.
with warnings.catch_warnings():
warnings.simplefilter("ignore", category=UserWarning)
self._expectations_config = DotDict({
"data_asset_name": name,
"data_asset_name": data_asset_name,
"data_asset_type": self.__class__.__name__,
"meta": {
"great_expectations.__version__": __version__
},
Expand Down Expand Up @@ -730,7 +741,7 @@ def validate(self, expectations_config=None, evaluation_parameters=None, catch_e
If True, the returned results include the config information associated with each expectation, if \
it exists.
only_return_failures (boolean): \
If True, expectation results are only returned when ``success = False``\.
If True, expectation results are only returned when ``success = False`` \
Returns:
A JSON-formatted dictionary containing a list of the validation results. \
Expand Down Expand Up @@ -912,6 +923,14 @@ def set_evaluation_parameter(self, parameter_name, parameter_value):
self._expectations_config['evaluation_parameters'].update(
{parameter_name: parameter_value})

def set_data_asset_name(self, data_asset_name):
    """Record *data_asset_name* in this data_asset's expectations configuration."""
    self._expectations_config["data_asset_name"] = data_asset_name

def get_data_asset_name(self):
    """Return the name currently recorded for this data_asset in its expectations configuration."""
    return self._expectations_config["data_asset_name"]

def _build_evaluation_parameters(self, expectation_args, evaluation_parameters):
"""Build a dictionary of parameters to evaluate, using the provided evaluation_paramters,
AND mutate expectation_args by removing any parameter values passed in as temporary values during
Expand Down
33 changes: 20 additions & 13 deletions great_expectations/data_asset/file_data_asset.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ def file_lines_map_expectation(cls, func):
@wraps(func)
def inner_wrapper(self, skip=None, mostly=None, null_lines_regex=r"^\s*$", result_format=None, *args, **kwargs):
try:
f = open(self.path, "r")
f = open(self._path, "r")
except:
raise

Expand Down Expand Up @@ -139,10 +139,9 @@ class FileDataAsset(MetaFileDataAsset):
"""


def __init__(self, file_path, *args, **kwargs):
def __init__(self, file_path=None, *args, **kwargs):
super(FileDataAsset, self).__init__(*args, **kwargs)
self.path = file_path

self._path = file_path

@MetaFileDataAsset.file_lines_map_expectation
def expect_file_line_regex_match_count_to_be_between(self,
Expand Down Expand Up @@ -375,7 +374,7 @@ def expect_file_hash_to_equal(self, value, hash_alg='md5', result_format=None,
# Limit file reads to 64 KB chunks at a time
BLOCKSIZE = 65536
try:
with open(self.path, 'rb') as file:
with open(self._path, 'rb') as file:
file_buffer = file.read(BLOCKSIZE)
while file_buffer:
hash.update(file_buffer)
Expand Down Expand Up @@ -424,7 +423,7 @@ def expect_file_size_to_be_between(self, minsize, maxsize, result_format=None,

success = False
try:
size = os.path.getsize(self.path)
size = os.path.getsize(self._path)
except OSError:
raise

Expand All @@ -448,13 +447,18 @@ def expect_file_size_to_be_between(self, minsize, maxsize, result_format=None,

return {"success":success}

@DataAsset.expectation([])
def expect_file_to_exist(self, result_format=None, include_config=False,
@DataAsset.expectation(["filepath"])
def expect_file_to_exist(self, filepath=None, result_format=None, include_config=False,
catch_exceptions=None, meta=None):

"""
Checks to see if a file specified by the user actually exists
Args:
filepath (str or None): \
The filepath to evaluate. If None, will check the currently-configured path object
of this FileDataAsset.
Keyword Args:
result_format (str or None): \
Expand All @@ -481,9 +485,12 @@ def expect_file_to_exist(self, result_format=None, include_config=False,
:ref:`include_config`, :ref:`catch_exceptions`, and :ref:`meta`.
"""

success = False
if os.path.isfile(self.path):
if filepath is not None and os.path.isfile(filepath):
success = True
elif self._path is not None and os.path.isfile(self._path):
success = True
else:
success = False

return {"success":success}

Expand Down Expand Up @@ -537,7 +544,7 @@ def expect_file_to_have_valid_table_header(self, regex, skip=None,
success = False

try:
with open(self.path, 'r') as f:
with open(self._path, 'r') as f:
lines = f.readlines() #Read in file lines

except IOError:
Expand Down Expand Up @@ -596,7 +603,7 @@ def expect_file_to_be_valid_json(self, schema=None, result_format=None,
success = False
if schema is None:
try:
with open(self.path, 'r') as f:
with open(self._path, 'r') as f:
json.load(f)
success = True
except ValueError:
Expand All @@ -606,7 +613,7 @@ def expect_file_to_be_valid_json(self, schema=None, result_format=None,
with open(schema, 'r') as s:
schema_data = s.read()
sdata = json.loads(schema_data)
with open(self.path, 'r') as f:
with open(self._path, 'r') as f:
json_data = f.read()
jdata = json.loads(json_data)
jsonschema.validate(jdata, sdata)
Expand Down
6 changes: 6 additions & 0 deletions great_expectations/dataset/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,12 @@ class Dataset(DataAsset):
def __init__(self, *args, **kwargs):
    # Pure pass-through: config / data_asset_name / autoinspect handling all
    # happens in DataAsset.__init__.
    super(Dataset, self).__init__(*args, **kwargs)

def _initialize_expectations(self, config=None, data_asset_name=None):
    """Delegate to DataAsset, then normalize data_asset_type to "Dataset".

    All Dataset subclasses share interoperable expectation semantics, so the
    stored type is forced to "Dataset" regardless of the concrete subclass.
    """
    super(Dataset, self)._initialize_expectations(
        config=config, data_asset_name=data_asset_name)
    self._expectations_config["data_asset_type"] = "Dataset"

@classmethod
def column_map_expectation(cls, func):
"""Constructs an expectation using column-map semantics.
Expand Down
31 changes: 6 additions & 25 deletions tests/test_data_asset.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,24 +32,11 @@ def test_data_asset(self):
D._expectations_config,
{
"data_asset_name": None,
"data_asset_type": "Dataset",
"meta": {
"great_expectations.__version__": ge.__version__
},
"expectations": []
# No longer expect autoinspection 20180920
# {
# "expectation_type" : "expect_column_to_exist",
# "kwargs" : { "column" : "x", 'result_format': 'BASIC'},
# 'success_on_last_run': True
# },{
# "expectation_type" : "expect_column_to_exist",
# "kwargs" : { "column" : "y", 'result_format': 'BASIC'},
# 'success_on_last_run': True
# },{
# "expectation_type" : "expect_column_to_exist",
# "kwargs" : { "column" : "z", 'result_format': 'BASIC'},
# 'success_on_last_run': True
# }]
}
)

Expand All @@ -58,21 +45,11 @@ def test_data_asset(self):
D.get_expectations_config(),
{
"data_asset_name": None,
"data_asset_type": "Dataset",
"meta": {
"great_expectations.__version__": ge.__version__
},
"expectations": []
# No longer expect autoinspection 20180920
# {
# "expectation_type" : "expect_column_to_exist",
# "kwargs" : { "column" : "x"}
# },{
# "expectation_type" : "expect_column_to_exist",
# "kwargs" : { "column" : "y"}
# },{
# "expectation_type" : "expect_column_to_exist",
# "kwargs" : { "column" : "z"}
# }]
}
)

Expand Down Expand Up @@ -189,6 +166,7 @@ def test_get_and_save_expectation_config(self):
}
],
"data_asset_name": None,
"data_asset_type": "Dataset",
"meta": {
"great_expectations.__version__": ge.__version__
}
Expand Down Expand Up @@ -261,6 +239,7 @@ def test_get_and_save_expectation_config(self):
}
],
"data_asset_name": None,
"data_asset_type": "Dataset",
"meta": {
"great_expectations.__version__": ge.__version__
}
Expand Down Expand Up @@ -332,6 +311,7 @@ def test_get_and_save_expectation_config(self):
}
],
"data_asset_name": None,
"data_asset_type": "Dataset",
"meta": {
"great_expectations.__version__": ge.__version__
}
Expand Down Expand Up @@ -996,6 +976,7 @@ def test_remove_expectation(self):
}
],
'data_asset_name': None,
"data_asset_type": "Dataset",
"meta": {
"great_expectations.__version__": ge.__version__
}
Expand Down

0 comments on commit 92be4a2

Please sign in to comment.