diff --git a/docs/source/evaluation_parameters.rst b/docs/source/evaluation_parameters.rst index 52ce49e29377..9cb44ba4defd 100644 --- a/docs/source/evaluation_parameters.rst +++ b/docs/source/evaluation_parameters.rst @@ -24,7 +24,7 @@ value that should be used during the initial evaluation of the expectation. } You can also store parameter values in a special dictionary called evaluation_parameters that is stored in the \ -expectations_config to be available to multiple expectations or while declaring additional expectations. +expectation_suite to be available to multiple expectations or while declaring additional expectations. .. code-block:: python @@ -42,7 +42,7 @@ When validating expectations, you can provide evaluation parameters based on ups .. code-block:: python - >> my_df.validate(expectations_config=my_dag_step_config, evaluation_parameters={"upstream_row_count": upstream_row_count}) + >> my_df.validate(expectation_suite=my_dag_step_config, evaluation_parameters={"upstream_row_count": upstream_row_count}) Finally, the command-line tool also allows you to provide a JSON file that contains parameters to use during evaluation: @@ -52,4 +52,4 @@ Finally, the command-line tool also allows you to provide a JSON file that conta { "upstream_row_count": 10 } - >> great_expectations validate --evaluation_paramters=my_parameters_file.json dataset_file.csv expectations_config.json + >> great_expectations validate --evaluation_parameters=my_parameters_file.json dataset_file.csv expectation_suite.json diff --git a/docs/source/expectations.rst b/docs/source/expectations.rst index 024500a33d93..2dc6c1966d70 100644 --- a/docs/source/expectations.rst +++ b/docs/source/expectations.rst @@ -125,13 +125,13 @@ You can also add notes or even structured metadata to expectations to describe t Saving Expectations ------------------------------------------------------------------------------ -At the end of your exploration, call `save_expectations` to store all Expectations from your session to your pipeline test files. +At the end of your exploration, call `save_expectation_suite` to store all Expectations from your session to your pipeline test files. This is how you always know what to expect from your data. .. code-block:: bash - >> my_df.save_expectations("my_titanic_expectations.json") + >> my_df.save_expectation_suite("my_titanic_expectations.json") For more detail on how to control expectation output, please see :ref:`standard_arguments` and :ref:`result_format`. diff --git a/docs/source/profiling.rst b/docs/source/profiling.rst index 75e4dbe87215..aadcd797b895 100644 --- a/docs/source/profiling.rst +++ b/docs/source/profiling.rst @@ -16,7 +16,7 @@ profiler class that will evaluate a dataset object and add expectations to it. 
>> import great_expectations as ge >> df = ge.dataset.PandasDataset({"col": [1, 2, 3, 4, 5]}) >> df.profile(ge.profile.ColumnsExistProfiler) - >> df.get_expectations() + >> df.get_expectation_suite() {'dataset_name': None, 'meta': {'great_expectations.__version__': '0.4.4__develop'}, 'expectations': [ diff --git a/docs/source/standard_arguments.rst b/docs/source/standard_arguments.rst index 77733537e661..59b1fa396c26 100644 --- a/docs/source/standard_arguments.rst +++ b/docs/source/standard_arguments.rst @@ -7,7 +7,7 @@ Standard arguments for expectations All Expectations return a json-serializable dictionary when evaluated, and share four standard (optional) arguments: - :ref:`result_format`: controls what information is returned from the evaluation of the expectation expectation. - - :ref:`include_config`: If true, then the expectation config itself is returned as part of the result object. + - :ref:`include_config`: If true, then the expectation suite itself is returned as part of the result object. - :ref:`catch_exceptions`: If true, execution will not fail if the Expectation encounters an error. Instead, it will \ return success = False and provide an informative error message. - :ref:`meta`: allows user-supplied meta-data to be stored with an expectation. @@ -23,7 +23,7 @@ See :ref:`result_format` for more information. `include_config` ------------------------------------------------------------------------------ -All Expectations accept a boolean `include_config` parameter. If true, then the expectation config itself is returned as part of the result object +All Expectations accept a boolean `include_config` parameter. If true, then the expectation suite itself is returned as part of the result object .. code-block:: bash diff --git a/docs/source/validation.rst b/docs/source/validation.rst index f43839c3b60d..0987503adbbc 100644 --- a/docs/source/validation.rst +++ b/docs/source/validation.rst @@ -10,10 +10,10 @@ Once you've constructed and stored Expectations, you can use them to validate ne >> import json >> import great_expectations as ge - >> my_expectations_config = json.load(file("my_titanic_expectations.json")) + >> my_expectation_suite = json.load(file("my_titanic_expectations.json")) >> my_df = ge.read_csv( "./tests/examples/titanic.csv", - expectations_config=my_expectations_config + expectation_suite=my_expectation_suite ) >> my_df.validate() diff --git a/examples/integrations/airflow/operators/expectation_operator.py b/examples/integrations/airflow/operators/expectation_operator.py index 86b0e780288b..8fa55310699b 100644 --- a/examples/integrations/airflow/operators/expectation_operator.py +++ b/examples/integrations/airflow/operators/expectation_operator.py @@ -32,7 +32,7 @@ def __init__(self, Validate provided dataset using great_expectations. :param dataset: Name of the dataset being loaded :type str - :param expectations_json: file pointing to expectation config or json string + :param expectations_json: file pointing to expectation suite or json string :type str :param fail_on_error: True if airflow job should fail when expectations fail :type bool @@ -137,15 +137,15 @@ def _get_dataframe(self): def _load_json(self): """ - Load expectation config based on operator parameters. If provided expectations_json is a file the config will - be loaded from this file. Otherwise we'll try to load the config as a string. + Load expectation suite based on operator parameters. If provided expectations_json is a file the suite will + be loaded from this file. 
Otherwise we'll try to load the expectation suite as a string. :return: """ if os.path.isfile(self.expectations_json): - self.log.info("Loading expectation config from file {file}".format(file=self.expectations_json)) + self.log.info("Loading expectation suite from file {file}".format(file=self.expectations_json)) return json.load(open(self.expectations_json)) else: - self.log.info("Loading expectation config from string") + self.log.info("Loading expectation suite from string") return json.loads(self.expectations_json) def _store_results(self, results): @@ -154,9 +154,9 @@ def _store_results(self, results): def execute(self, context): df = self._get_dataframe() - config = self._load_json() + suite = self._load_json() self.log.info("Start dataset validation for set {set}".format(set=self.dataset_name)) - results = df.validate(expectations_config=config) + results = df.validate(expectation_suite=suite) self.log.info(pformat(results)) diff --git a/examples/notebooks/Crop_Expectations_With_Reshape.ipynb b/examples/notebooks/Crop_Expectations_With_Reshape.ipynb index 9c760683173f..885676299651 100644 --- a/examples/notebooks/Crop_Expectations_With_Reshape.ipynb +++ b/examples/notebooks/Crop_Expectations_With_Reshape.ipynb @@ -166,7 +166,7 @@ }, "outputs": [], "source": [ - "print(json.dumps(df.get_expectations(), indent = 2))" + "print(json.dumps(df.get_expectation_suite(), indent = 2))" ] } ], diff --git a/examples/notebooks/Distributional_Expectations_Demo.ipynb b/examples/notebooks/Distributional_Expectations_Demo.ipynb index ecbf32226d69..4a658d9417dd 100644 --- a/examples/notebooks/Distributional_Expectations_Demo.ipynb +++ b/examples/notebooks/Distributional_Expectations_Demo.ipynb @@ -364,7 +364,7 @@ }, "outputs": [], "source": [ - "df.get_expectations()" + "df.get_expectation_suite()" ] }, { @@ -459,7 +459,7 @@ }, "outputs": [], "source": [ - "my_expectations = df.get_expectations()" + "my_expectations = df.get_expectation_suite()" ] }, { @@ -481,7 +481,7 @@ }, "outputs": [], "source": [ - "results = df_test.validate(expectations_config=my_expectations)\n", + "results = df_test.validate(expectation_suite=my_expectations)\n", "results" ] }, @@ -493,7 +493,7 @@ }, "outputs": [], "source": [ - "failures = df_test.validate(expectations_config=my_expectations, only_return_failures=True)\n", + "failures = df_test.validate(expectation_suite=my_expectations, only_return_failures=True)\n", "failures" ] }, diff --git a/examples/notebooks/explore_titanic_data.ipynb b/examples/notebooks/explore_titanic_data.ipynb index 24cf3f423969..df0e41620a65 100644 --- a/examples/notebooks/explore_titanic_data.ipynb +++ b/examples/notebooks/explore_titanic_data.ipynb @@ -136,8 +136,8 @@ }, "outputs": [], "source": [ - "print json.dumps(titanic_df.get_expectations(), indent=2)\n", - "# titanic_df.save_expectations('titanic_expectations.json')" + "print json.dumps(titanic_df.get_expectation_suite(), indent=2)\n", + "# titanic_df.save_expectation_suite('titanic_expectations.json')" ] }, { diff --git a/great_expectations/cli/cli.py b/great_expectations/cli/cli.py index d2cdba99bc4c..e72ccf3c39d7 100644 --- a/great_expectations/cli/cli.py +++ b/great_expectations/cli/cli.py @@ -45,7 +45,7 @@ def cli(): @cli.command() @click.argument('dataset') -@click.argument('expectations_config_file') +@click.argument('expectation_suite_file') @click.option('--evaluation_parameters', '-p', default=None, help='Path to a file containing JSON object used to evaluate parameters in expectations config.') 
@click.option('--result_format', '-o', default="SUMMARY", @@ -59,32 +59,32 @@ def cli(): help='Path to a python module containing a custom dataset class.') @click.option('--custom_dataset_class', '-c', default=None, help='Name of the custom dataset class to use during evaluation.') -def validate(dataset, expectations_config_file, evaluation_parameters, result_format, +def validate(dataset, expectation_suite_file, evaluation_parameters, result_format, catch_exceptions, only_return_failures, custom_dataset_module, custom_dataset_class): - """Validate a CSV file against an expectations configuration. + """Validate a CSV file against an expectation suite. - DATASET: Path to a file containing a CSV file to validate using the provided expectations_config_file. + DATASET: Path to a file containing a CSV file to validate using the provided expectation_suite_file. - EXPECTATIONS_CONFIG_FILE: Path to a file containing a valid great_expectations expectations config to use to \ + EXPECTATION_SUITE_FILE: Path to a file containing a valid great_expectations expectations suite to use to \ validate the data. """ """ - Read a dataset file and validate it using a config saved in another file. Uses parameters defined in the dispatch + Read a dataset file and validate it using an expectation suite saved in another file. Uses parameters defined in the dispatch method. :param parsed_args: A Namespace object containing parsed arguments from the dispatch method. :return: The number of unsucessful expectations """ - expectations_config_file = expectations_config_file + expectation_suite_file = expectation_suite_file - expectations_config = json.load(open(expectations_config_file)) + expectation_suite = json.load(open(expectation_suite_file)) if evaluation_parameters is not None: evaluation_parameters = json.load( open(evaluation_parameters, "r")) - # Use a custom dataasset module and class if provided. Otherwise infer from the config. + # Use a custom dataasset module and class if provided. Otherwise infer from the expectation suite if custom_dataset_module: sys.path.insert(0, os.path.dirname( custom_dataset_module)) @@ -93,28 +93,28 @@ def validate(dataset, expectations_config_file, evaluation_parameters, result_fo custom_module = __import__(str(module_name)) dataset_class = getattr( custom_module, custom_dataset_class) - elif "data_asset_type" in expectations_config: - if (expectations_config["data_asset_type"] == "Dataset" or - expectations_config["data_asset_type"] == "PandasDataset"): + elif "data_asset_type" in expectation_suite: + if (expectation_suite["data_asset_type"] == "Dataset" or + expectation_suite["data_asset_type"] == "PandasDataset"): dataset_class = PandasDataset - elif expectations_config["data_asset_type"].endswith("Dataset"): + elif expectation_suite["data_asset_type"].endswith("Dataset"): logger.info("Using PandasDataset to validate dataset of type %s." % - expectations_config["data_asset_type"]) + expectation_suite["data_asset_type"]) dataset_class = PandasDataset - elif expectations_config["data_asset_type"] == "FileDataAsset": + elif expectation_suite["data_asset_type"] == "FileDataAsset": dataset_class = FileDataAsset else: logger.critical("Unrecognized data_asset_type %s. You may need to specifcy custom_dataset_module and \ - custom_dataset_class." % expectations_config["data_asset_type"]) + custom_dataset_class." 
% expectation_suite["data_asset_type"]) return -1 else: dataset_class = PandasDataset if issubclass(dataset_class, Dataset): - da = read_csv(dataset, expectations_config=expectations_config, + da = read_csv(dataset, expectation_suite=expectation_suite, dataset_class=dataset_class) else: - da = dataset_class(dataset, config=expectations_config) + da = dataset_class(dataset, config=expectation_suite) result = da.validate( evaluation_parameters=evaluation_parameters, diff --git a/great_expectations/data_asset/data_asset.py b/great_expectations/data_asset/data_asset.py index 3aea0ecef6d7..3b9ae23b6590 100644 --- a/great_expectations/data_asset/data_asset.py +++ b/great_expectations/data_asset/data_asset.py @@ -39,7 +39,7 @@ def __init__(self, *args, **kwargs): """ interactive_evaluation = kwargs.pop("interactive_evaluation", True) profiler = kwargs.pop("profiler", None) - expectations_config = kwargs.pop("expectations_config", None) + expectation_suite = kwargs.pop("expectation_suite", None) data_asset_name = kwargs.pop("data_asset_name", None) data_context = kwargs.pop("data_context", None) batch_kwargs = kwargs.pop("batch_kwargs", None) @@ -47,7 +47,7 @@ def __init__(self, *args, **kwargs): warnings.warn("Autoinspect_func is no longer supported; use a profiler instead (migration is easy!).") super(DataAsset, self).__init__(*args, **kwargs) self._interactive_evaluation = interactive_evaluation - self._initialize_expectations(config=expectations_config, data_asset_name=data_asset_name) + self._initialize_expectations(expectation_suite=expectation_suite, data_asset_name=data_asset_name) self._data_context = data_context self._batch_kwargs = batch_kwargs if profiler is not None: @@ -147,9 +147,9 @@ def wrapper(self, *args, **kwargs): # This will become the stored config expectation_args = copy.deepcopy(all_args) - if "evaluation_parameters" in self._expectations_config: + if "evaluation_parameters" in self._expectation_suite: evaluation_args = self._build_evaluation_parameters(expectation_args, - self._expectations_config["evaluation_parameters"]) # This will be passed to the evaluation + self._expectation_suite["evaluation_parameters"]) # This will be passed to the evaluation else: evaluation_args = self._build_evaluation_parameters( expectation_args, None) @@ -189,7 +189,6 @@ def wrapper(self, *args, **kwargs): else: return_obj = {"stored_configuration": expectation_config} - # Append the expectation to the config. self._append_expectation(expectation_config) @@ -225,8 +224,8 @@ def wrapper(self, *args, **kwargs): return outer_wrapper - def _initialize_expectations(self, config=None, data_asset_name=None): - """Instantiates `_expectations_config` as empty by default or with a specified expectation `config`. + def _initialize_expectations(self, expectation_suite=None, data_asset_name=None): + """Instantiates `_expectation_suite` as empty by default or with a specified expectation `config`. In addition, this always sets the `default_expectation_args` to: `include_config`: False, `catch_exceptions`: False, @@ -237,23 +236,23 @@ def _initialize_expectations(self, config=None, data_asset_name=None): interoperability. Args: - config (json): \ + expectation_suite (json): \ A json-serializable expectation config. \ - If None, creates default `_expectations_config` with an empty list of expectations and \ + If None, creates default `_expectation_suite` with an empty list of expectations and \ key value `data_asset_name` as `data_asset_name`. 
data_asset_name (string): \ - The name to assign to `_expectations_config.data_asset_name` if `config` is not provided. + The name to assign to `_expectation_suite.data_asset_name` if `config` is not provided. """ - if config != None: - #!!! Should validate the incoming config with jsonschema here - self._expectations_config = DotDict(copy.deepcopy(config)) + if expectation_suite is not None: + # TODO: validate the incoming expectation_suite with jsonschema here + self._expectation_suite = DotDict(copy.deepcopy(expectation_suite)) if data_asset_name is not None: - self._expectations_config["data_asset_name"] = data_asset_name + self._expectation_suite["data_asset_name"] = data_asset_name else: - self._expectations_config = DotDict({ + self._expectation_suite = DotDict({ "data_asset_name": data_asset_name, "data_asset_type": self.__class__.__name__, "meta": { @@ -269,16 +268,16 @@ def _initialize_expectations(self, config=None, data_asset_name=None): } def _append_expectation(self, expectation_config): - """Appends an expectation to `DataAsset._expectations_config` and drops existing expectations of the same type. + """Appends an expectation to `DataAsset._expectation_suite` and drops existing expectations of the same type. If `expectation_config` is a column expectation, this drops existing expectations that are specific to \ that column and only if it is the same expectation type as `expectation_config`. Otherwise, if it's not a \ column expectation, this drops existing expectations of the same type as `expectation config`. \ - After expectations of the same type are dropped, `expectation_config` is appended to `DataAsset._expectations_config`. + After expectations of the same type are dropped, `expectation_config` is appended to `DataAsset._expectation_suite`. Args: expectation_config (json): \ - The JSON-serializable expectation to be added to the DataAsset expectations in `_expectations_config`. + The JSON-serializable expectation to be added to the DataAsset expectations in `_expectation_suite`. Notes: May raise future errors once json-serializable tests are implemented to check for correct arg formatting @@ -302,18 +301,18 @@ def _append_expectation(self, expectation_config): if 'column' in expectation_config['kwargs']: column = expectation_config['kwargs']['column'] - self._expectations_config.expectations = [f for f in filter( + self._expectation_suite.expectations = [f for f in filter( lambda exp: (exp['expectation_type'] != expectation_type) or ( 'column' in exp['kwargs'] and exp['kwargs']['column'] != column), - self._expectations_config.expectations + self._expectation_suite.expectations )] else: - self._expectations_config.expectations = [f for f in filter( + self._expectation_suite.expectations = [f for f in filter( lambda exp: exp['expectation_type'] != expectation_type, - self._expectations_config.expectations + self._expectation_suite.expectations )] - self._expectations_config.expectations.append(expectation_config) + self._expectation_suite.expectations.append(expectation_config) def _copy_and_clean_up_expectation(self, expectation, @@ -368,10 +367,10 @@ def _copy_and_clean_up_expectations_from_indexes( discard_include_configs_kwargs=True, discard_catch_exceptions_kwargs=True, ): - """Copies and cleans all expectations provided by their index in DataAsset._expectations_config.expectations. + """Copies and cleans all expectations provided by their index in DataAsset._expectation_suite.expectations. 
Applies the _copy_and_clean_up_expectation method to multiple expectations, provided by their index in \ - `DataAsset,_expectations_config.expectations`. Returns a list of the copied and cleaned expectations. + `DataAsset,_expectation_suite.expectations`. Returns a list of the copied and cleaned expectations. Args: match_indexes (List): \ @@ -394,7 +393,7 @@ def _copy_and_clean_up_expectations_from_indexes( for i in match_indexes: rval.append( self._copy_and_clean_up_expectation( - self._expectations_config.expectations[i], + self._expectation_suite.expectations[i], discard_result_format_kwargs, discard_include_configs_kwargs, discard_catch_exceptions_kwargs, @@ -429,7 +428,7 @@ def find_expectation_indexes(self, expectation_kwargs["column"] = column match_indexes = [] - for i, exp in enumerate(self._expectations_config.expectations): + for i, exp in enumerate(self._expectation_suite.expectations): if expectation_type == None or (expectation_type == exp['expectation_type']): # if column == None or ('column' not in exp['kwargs']) or (exp['kwargs']['column'] == column) or (exp['kwargs']['column']==: match = True @@ -522,18 +521,18 @@ def remove_expectation(self, else: if not dry_run: - self._expectations_config.expectations = [i for j, i in enumerate( - self._expectations_config.expectations) if j not in match_indexes] + self._expectation_suite.expectations = [i for j, i in enumerate( + self._expectation_suite.expectations) if j not in match_indexes] else: return self._copy_and_clean_up_expectations_from_indexes(match_indexes) else: # Exactly one match expectation = self._copy_and_clean_up_expectation( - self._expectations_config.expectations[match_indexes[0]] + self._expectation_suite.expectations[match_indexes[0]] ) if not dry_run: - del self._expectations_config.expectations[match_indexes[0]] + del self._expectation_suite.expectations[match_indexes[0]] else: if remove_multiple_matches: @@ -598,17 +597,17 @@ def get_expectations_config(self, suppress_warnings=False ): warnings.warn("get_expectations_config is deprecated, and will be removed in a future release. " + - "Please use get_expectations instead.", DeprecationWarning) - return self.get_expectations(discard_failed_expectations, discard_result_format_kwargs, - discard_include_configs_kwargs, discard_catch_exceptions_kwargs, suppress_warnings) - - def get_expectations(self, - discard_failed_expectations=True, - discard_result_format_kwargs=True, - discard_include_configs_kwargs=True, - discard_catch_exceptions_kwargs=True, - suppress_warnings=False - ): + "Please use get_expectation_suite instead.", DeprecationWarning) + return self.get_expectation_suite(discard_failed_expectations, discard_result_format_kwargs, + discard_include_configs_kwargs, discard_catch_exceptions_kwargs, suppress_warnings) + + def get_expectation_suite(self, + discard_failed_expectations=True, + discard_result_format_kwargs=True, + discard_include_configs_kwargs=True, + discard_catch_exceptions_kwargs=True, + suppress_warnings=False + ): """Returns _expectation_config as a JSON object, and perform some cleaning along the way. Args: @@ -625,9 +624,9 @@ def get_expectations(self, An expectation config. Note: - get_expectations does not affect the underlying config at all. The returned config is a copy of _expectations_config, not the original object. + get_expectation_suite does not affect the underlying config at all. The returned config is a copy of _expectation_suite, not the original object. 
""" - config = dict(self._expectations_config) + config = dict(self._expectation_suite) config = copy.deepcopy(config) expectations = config["expectations"] @@ -670,7 +669,7 @@ def get_expectations(self, if not suppress_warnings: """ -WARNING: get_expectations discarded +WARNING: get_expectation_suite discarded 12 failing expectations 44 result_format kwargs 0 include_config kwargs @@ -678,7 +677,7 @@ def get_expectations(self, If you wish to change this behavior, please set discard_failed_expectations, discard_result_format_kwargs, discard_include_configs_kwargs, and discard_catch_exceptions_kwargs appropirately. """ if any([discard_failed_expectations, discard_result_format_kwargs, discard_include_configs_kwargs, discard_catch_exceptions_kwargs]): - print("WARNING: get_expectations discarded") + print("WARNING: get_expectation_suite discarded") if discard_failed_expectations: print("\t%d failing expectations" % discards["failed_expectations"]) @@ -706,11 +705,11 @@ def save_expectations_config( suppress_warnings=False ): warnings.warn("save_expectations_config is deprecated, and will be removed in a future release. " + - "Please use save_expectations instead.", DeprecationWarning) - self.save_expectations(filepath, discard_failed_expectations, discard_result_format_kwargs, + "Please use save_expectation_suite instead.", DeprecationWarning) + self.save_expectation_suite(filepath, discard_failed_expectations, discard_result_format_kwargs, discard_include_configs_kwargs, discard_catch_exceptions_kwargs, suppress_warnings) - def save_expectations( + def save_expectation_suite( self, filepath=None, discard_failed_expectations=True, @@ -744,7 +743,7 @@ def save_expectations( suppressed. """ - expectations_config = self.get_expectations( + expectation_suite = self.get_expectation_suite( discard_failed_expectations, discard_result_format_kwargs, discard_include_configs_kwargs, @@ -752,15 +751,15 @@ def save_expectations( suppress_warnings ) if filepath is None and self._data_context is not None: - self._data_context.save_expectations(expectations_config) + self._data_context.save_expectation_suite(expectation_suite) elif filepath is not None: - expectation_config_str = json.dumps(expectations_config, indent=2) + expectation_config_str = json.dumps(expectation_suite, indent=2) open(filepath, 'w').write(expectation_config_str) else: raise ValueError("Unable to save config: filepath or data_context must be available.") def validate(self, - expectations_config=None, + expectation_suite=None, run_id=None, data_context=None, evaluation_parameters=None, @@ -769,14 +768,14 @@ def validate(self, only_return_failures=False): """Generates a JSON-formatted report describing the outcome of all expectations. - Use the default expectations_config=None to validate the expectations config associated with the DataAsset. + Use the default expectation_suite=None to validate the expectations config associated with the DataAsset. Args: - expectations_config (json or None): \ + expectation_suite (json or None): \ If None, uses the expectations config generated with the DataAsset during the current session. \ If a JSON file, validates those expectations. evaluation_parameters (dict or None): \ - If None, uses the evaluation_paramters from the expectations_config provided or as part of the data_asset. + If None, uses the evaluation_paramters from the expectation_suite provided or as part of the data_asset. If a dict, uses the evaluation parameters in the dictionary. 
catch_exceptions (boolean): \ If True, exceptions raised by tests will not end validation and will be described in the returned report. @@ -842,15 +841,15 @@ def validate(self, results = [] - if expectations_config is None: - expectations_config = self.get_expectations( + if expectation_suite is None: + expectation_suite = self.get_expectation_suite( discard_failed_expectations=False, discard_result_format_kwargs=False, discard_include_configs_kwargs=False, discard_catch_exceptions_kwargs=False, ) - elif isinstance(expectations_config, string_types): - expectations_config = json.load(open(expectations_config, 'r')) + elif isinstance(expectation_suite, string_types): + expectation_suite = json.load(open(expectation_suite, 'r')) # Evaluation parameter priority is # 1. from provided parameters @@ -859,26 +858,26 @@ def validate(self, # So, we load them in reverse order if data_context is not None: - runtime_evaluation_parameters = data_context.bind_evaluation_parameters(run_id, expectations_config) + runtime_evaluation_parameters = data_context.bind_evaluation_parameters(run_id, expectation_suite) else: runtime_evaluation_parameters = {} - if "evaluation_parameters" in expectations_config: - runtime_evaluation_parameters.update(expectations_config["evaluation_parameters"]) + if "evaluation_parameters" in expectation_suite: + runtime_evaluation_parameters.update(expectation_suite["evaluation_parameters"]) if evaluation_parameters is not None: runtime_evaluation_parameters.update(evaluation_parameters) # Warn if our version is different from the version in the configuration try: - if expectations_config['meta']['great_expectations.__version__'] != __version__: + if expectation_suite['meta']['great_expectations.__version__'] != __version__: warnings.warn( - "WARNING: This configuration object was built using version %s of great_expectations, but is currently being valided by version %s." % (expectations_config['meta']['great_expectations.__version__'], __version__)) + "WARNING: This configuration object was built using version %s of great_expectations, but is currently being valided by version %s." 
% (expectation_suite['meta']['great_expectations.__version__'], __version__)) except KeyError: warnings.warn( "WARNING: No great_expectations version found in configuration object.") - for expectation in expectations_config['expectations']: + for expectation in expectation_suite['expectations']: try: expectation_method = getattr( @@ -937,15 +936,15 @@ def validate(self, # TODO: refactor this once we've settled on the correct naming convetion everywhere data_asset_name = None - if "data_asset_name" in expectations_config: - data_asset_name = expectations_config["data_asset_name"] - elif "dataset_name" in expectations_config: - data_asset_name = expectations_config["dataset_name"] - elif "meta" in expectations_config: - if "data_asset_name" in expectations_config["meta"]: - data_asset_name = expectations_config["meta"]["data_asset_name"] - elif "dataset_name" in expectations_config["meta"]: - data_asset_name = expectations_config["meta"]["dataset_name"] + if "data_asset_name" in expectation_suite: + data_asset_name = expectation_suite["data_asset_name"] + elif "dataset_name" in expectation_suite: + data_asset_name = expectation_suite["dataset_name"] + elif "meta" in expectation_suite: + if "data_asset_name" in expectation_suite["meta"]: + data_asset_name = expectation_suite["meta"]["data_asset_name"] + elif "dataset_name" in expectation_suite["meta"]: + data_asset_name = expectation_suite["meta"]["dataset_name"] result = { "results": results, @@ -992,9 +991,9 @@ def get_evaluation_parameter(self, parameter_name, default_value=None): Returns: The current value of the evaluation parameter. """ - if "evaluation_parameters" in self._expectations_config and \ - parameter_name in self._expectations_config['evaluation_parameters']: - return self._expectations_config['evaluation_parameters'][parameter_name] + if "evaluation_parameters" in self._expectation_suite and \ + parameter_name in self._expectation_suite['evaluation_parameters']: + return self._expectation_suite['evaluation_parameters'][parameter_name] else: return default_value @@ -1007,20 +1006,20 @@ def set_evaluation_parameter(self, parameter_name, parameter_value): parameter_value (any): The value to be used """ - if 'evaluation_parameters' not in self._expectations_config: - self._expectations_config['evaluation_parameters'] = {} + if 'evaluation_parameters' not in self._expectation_suite: + self._expectation_suite['evaluation_parameters'] = {} - self._expectations_config['evaluation_parameters'].update( + self._expectation_suite['evaluation_parameters'].update( {parameter_name: parameter_value}) def set_data_asset_name(self, data_asset_name): """Sets the name of this data_asset as stored in the expectations configuration.""" - self._expectations_config['data_asset_name'] = data_asset_name + self._expectation_suite['data_asset_name'] = data_asset_name def get_data_asset_name(self): """Gets the current name of this data_asset as stored in the expectations configuration.""" - if "data_asset_name" in self._expectations_config: - return self._expectations_config['data_asset_name'] + if "data_asset_name" in self._expectation_suite: + return self._expectation_suite['data_asset_name'] else: return None diff --git a/great_expectations/data_context/data_context.py b/great_expectations/data_context/data_context.py index 1dcc9ceb368a..d08ec18499bf 100644 --- a/great_expectations/data_context/data_context.py +++ b/great_expectations/data_context/data_context.py @@ -9,7 +9,7 @@ from six import string_types import datetime -from .util import 
get_slack_callback, safe_mmkdir +from .util import NormalizedDataAssetName, get_slack_callback, safe_mmkdir from great_expectations.version import __version__ from great_expectations.exceptions import DataContextError, ConfigNotFoundError, ProfilerError @@ -31,11 +31,11 @@ from .expectation_explorer import ExpectationExplorer logger = logging.getLogger(__name__) -debug_view = widgets.Output(layout={'border': '3 px solid pink'}) yaml = YAML() yaml.indent(mapping=2, sequence=4, offset=2) yaml.default_flow_style = False +ALLOWED_DELIMITERS = ['.', '/'] class DataContext(object): """A DataContext represents a Great Expectations project. It captures essential information such as @@ -61,8 +61,7 @@ def create(cls, context_root_dir=None): return cls(context_root_dir) - - def __init__(self, context_root_dir=None, expectation_explorer=False): + def __init__(self, context_root_dir=None, expectation_explorer=False, data_asset_name_delimiter = '/'): self._expectation_explorer = expectation_explorer self._datasources = {} if expectation_explorer: @@ -93,6 +92,9 @@ def __init__(self, context_root_dir=None, expectation_explorer=False): self._load_evaluation_parameter_store() self._compiled = False + if data_asset_name_delimiter not in ALLOWED_DELIMITERS: + raise DataContextError("Invalid delimiter: delimiter must be '.' or '/'") + self._data_asset_name_delimiter = data_asset_name_delimiter def get_context_root_directory(self): return self.context_root_directory @@ -101,17 +103,60 @@ def _load_project_config(self): try: with open(os.path.join(self.context_root_directory, "great_expectations/great_expectations.yml"), "r") as data: return yaml.load(data) - except IOError as e: + except IOError: raise ConfigNotFoundError(self.context_root_directory) + @property + def data_asset_name_delimiter(self): + return self._data_asset_name_delimiter + + @data_asset_name_delimiter.setter + def data_asset_name_delimiter(self, new_delimiter): + if new_delimiter not in ALLOWED_DELIMITERS: + raise DataContextError("Invalid delimiter: delimiter must be '.' or '/'") + else: + self._data_asset_name_delimiter = new_delimiter + + ##### + # + # Internal helper methods + # + ##### + + def _get_normalized_data_asset_name_filepath(self, data_asset_name, base_path=None): + """Get the path where the project-normalized data_asset_name expectations are stored.""" + if base_path is None: + base_path = os.path.join(self.get_context_root_directory(), "great_expectations/expectations") + + # We need to ensure data_asset_name is a valid filepath no matter its current state + if isinstance(data_asset_name, NormalizedDataAssetName): + name_parts = [name_part.replace("/", "__") for name_part in data_asset_name] + relative_path = "/".join(name_parts) + elif isinstance(data_asset_name, string_types): + # if our delimiter is not '/', we need to first replace any slashes that exist in the name + # to avoid extra layers of nesting (e.g. 
for dbt models) + relative_path = data_asset_name + if self.data_asset_name_delimiter != "/": + relative_path.replace("/", "__") + relative_path = relative_path.replace(self.data_asset_name_delimiter, "/") + else: + raise DataContextError("data_assset_name must be a NormalizedDataAssetName or string") + + relative_path += ".json" + return os.path.join( + base_path, + relative_path + ) + def _save_project_config(self): with open(os.path.join(self.context_root_directory, "great_expectations/great_expectations.yml"), "w") as data: yaml.dump(self._project_config, data) def _get_all_profile_credentials(self): try: - with open(os.path.join(self.context_root_directory, "great_expectations/uncommitted/credentials/profiles.yml"), "r") as profiles_file: + with open(os.path.join(self.context_root_directory, + "great_expectations/uncommitted/credentials/profiles.yml"), "r") as profiles_file: return yaml.load(profiles_file) or {} except IOError as e: if e.errno != errno.ENOENT: @@ -178,7 +223,7 @@ def get_datasource_config(self, datasource_name): return datasource_config def get_available_data_asset_names(self, datasource_names=None, generator_names=None): - data_asset_names = [] + data_asset_names = {} if datasource_names is None: datasource_names = [datasource["name"] for datasource in self.list_datasources()] elif isinstance(datasource_names, string_types): @@ -194,22 +239,20 @@ def get_available_data_asset_names(self, datasource_names=None, generator_names= for idx, datasource_name in enumerate(datasource_names): datasource = self.get_datasource(datasource_name) - data_asset_names.append( - { - "datasource": datasource_name, - "generators": datasource.get_available_data_asset_names(generator_names[idx] if generator_names is not None else None) - } - ) + data_asset_names[datasource_name] = \ + datasource.get_available_data_asset_names(generator_names[idx] if generator_names is not None else None) return data_asset_names - def get_batch(self, datasource_name, data_asset_name, batch_kwargs=None, **kwargs): - data_asset_name = self._normalize_data_asset_name(data_asset_name) - # datasource_name = find(data_asset_name.split("/")[0] - datasource = self.get_datasource(datasource_name) + def get_batch(self, data_asset_name, batch_kwargs=None, **kwargs): + normalized_data_asset_name = self._normalize_data_asset_name(data_asset_name) + + datasource = self.get_datasource(normalized_data_asset_name.datasource) if not datasource: - raise Exception("Can't find datasource {0:s} in the config - please check your great_expectations.yml") + raise DataContextError("Can't find datasource {0:s} in the config - please check your great_expectations.yml") - data_asset = datasource.get_data_asset(data_asset_name, batch_kwargs, **kwargs) + data_asset = datasource.get_batch(normalized_data_asset_name, + batch_kwargs, + **kwargs) return data_asset def add_datasource(self, name, type_, **kwargs): @@ -280,7 +323,8 @@ def get_run_parameters(self, run_id): return self.dict[run_id] else: return {} - + ##### + # # If user wishes to provide their own implementation for this key value store (e.g., # Redis-based), they should specify the following in the project config file: # @@ -300,6 +344,8 @@ def get_run_parameters(self, run_id): # 3. 
def set(self, name, value) # # We will load the module dynamically + # + ##### try: config_block = self._project_config.get("evaluation_parameter_store") if not config_block or not config_block.get("type"): @@ -318,9 +364,10 @@ def get_run_parameters(self, run_id): logger.exception("Failed to load evaluation_parameter_store class") raise - def list_expectations_configs(self): + def list_expectation_suites(self): root_path = self.expectations_directory - result = [os.path.splitext(os.path.relpath(y, root_path))[0] for x in os.walk(root_path) for y in glob(os.path.join(x[0], '*.json'))] + result = [os.path.relpath(y, root_path)[:-5] for x in os.walk(root_path) for y in glob(os.path.join(x[0], '*.json'))] + # result = [os.path.splitext(os.path.relpath(y, root_path))[0] for x in os.walk(root_path) for y in glob(os.path.join(x[0], '*.json'))] return result def list_datasources(self): @@ -330,53 +377,231 @@ def _normalize_data_asset_name(self, data_asset_name, batch_kwargs=None): """Normalizes data_asset_names for a data context A data_asset_name is defined per-project and consists of four components: - - a datasouce name - - a data_asset_name - - a sub-name, which by default is the name of the generator from which the data_asset is derived + - a datasource name + - a generator_name + - a generator_asset + - an expectation_suite name - a generator name It has a string representation consisting of each of those components delimited by a slash - """ - - configs = self.list_expectations_configs() - if data_asset_name in configs: + """ + if isinstance(data_asset_name, NormalizedDataAssetName): return data_asset_name - else: - last_found_config = None - options = 0 - for config in configs: - config_components = config.split("/") - if data_asset_name in config: - options += 1 - last_found_config = config - if options == 1: - return last_found_config - - # We allow "new" configs to be considered normalized out of the box - return data_asset_name - # raise ExpectationsConfigNotFoundError(data_asset_name) - - def get_expectations(self, data_asset_name, batch_kwargs=None): - config_file_path = os.path.join(self.expectations_directory, data_asset_name + '.json') + + split_name = data_asset_name.split(self.data_asset_name_delimiter) + + if len(split_name) > 4: + raise DataContextError("Invalid data_asset_name {data_asset_name}: found too many components using delimiter '{delimiter}'".format( + data_asset_name=data_asset_name, + delimiter=self.data_asset_name_delimiter + )) + + elif len(split_name) == 1: + # In this case, the name *must* refer to a unique data_asset_name + provider_names = [] + generator_asset = split_name[0] + for normalized_identifier in self.list_expectation_suites(): + normalized_split = normalized_identifier.split(self.data_asset_name_delimiter) + curr_generator_asset = normalized_split[2] + if generator_asset == curr_generator_asset: + provider_names.append( + NormalizedDataAssetName(*normalized_split) + ) + if len(provider_names) == 1: + return provider_names[0] + elif len(provider_names) > 1: + raise DataContextError("Ambiguous data_asset_name {data_asset_name}. 
Multiple candidates found: {provider_names}" + .format(data_asset_name=data_asset_name, provider_names=provider_names)) + + # If we haven't found a match, see if this is provided by exactly one datasource and generator + available_names = self.get_available_data_asset_names() + for datasource_name in available_names.keys(): + for generator in available_names[datasource_name].keys(): + names_set = available_names[datasource_name][generator] + if generator_asset in names_set: + provider_names.append( + NormalizedDataAssetName(datasource_name, generator, generator_asset, "default") + ) + + if len(provider_names) == 1: + return provider_names[0] + + elif len(provider_names) > 1: + raise DataContextError("Ambiguous data_asset_name {data_asset_name}. Multiple candidates found: {provider_names}" + .format(data_asset_name=data_asset_name, provider_names=provider_names)) + + # Finally, if the name *would be* unambiguous but for not havding been defined yet allow the normalization + if len(available_names) == 1: + # in this case, datasource_name from the loop above is still valid + if len(available_names[datasource_name]) == 1: + # in this case, generator from the inner loop above is also still valid + return NormalizedDataAssetName(datasource_name, generator, generator_asset, "default") + else: + raise DataContextError("Cannot find {data_asset_name} among currently-defined data assets.".format(data_asset_name=data_asset_name)) + + elif len(split_name) == 2: + # In this case, the name must be one of the following options: + # (a) datasource_name/generator_asset + # (b) generator_asset/suite + + # If the data_asset_name is already defined by a config in that datasource, return that normalized name. + provider_names = [] + for normalized_identifier in self.list_expectation_suites(): + normalized_split = normalized_identifier.split(self._data_asset_name_delimiter) + curr_datasource_name = normalized_split[0] + curr_generator_name = normalized_split[1] + curr_generator_asset = normalized_split[2] + curr_expectation_suite = normalized_split[3] + # Option 1: + if curr_datasource_name == split_name[0] and curr_generator_asset == split_name[1]: + provider_names.append(NormalizedDataAssetName(*normalized_split)) + # Option 2: + if curr_generator_asset == split_name[0] and curr_expectation_suite == split_name[1]: + provider_names.append(NormalizedDataAssetName(*normalized_split)) + + if len(provider_names) == 1: + return provider_names[0] + elif len(provider_names) > 1: + raise DataContextError("Ambiguous data_asset_name {data_asset_name}. Multiple candidates found: {provider_names}" + .format(data_asset_name=data_asset_name, provider_names=provider_names)) + + # If we haven't found a match, see if this is provided by exactly one datasource and generator + available_names = self.get_available_data_asset_names() + for datasource_name in available_names.keys(): + for generator in available_names[datasource_name].keys(): + generator_assets = available_names[datasource_name][generator] + if split_name[0] == datasource_name and split_name[1] in generator_assets: + provider_names.append(NormalizedDataAssetName(datasource_name, generator, split_name[1], "default")) + + if split_name[0] in generator_assets: + provider_names.append(NormalizedDataAssetName(datasource_name, generator, split_name[0], split_name[1])) + + if len(provider_names) == 1: + return provider_names[0] + + elif len(provider_names) > 1: + raise DataContextError("Ambiguous data_asset_name {data_asset_name}. 
Multiple candidates found: {provider_names}" + .format(data_asset_name=data_asset_name, provider_names=provider_names)) + + else: + raise ConfigNotFoundError("No generator available to produce data_asset_name {data_asset_name} with datasource {datasource_name}" + .format(data_asset_name=data_asset_name, datasource_name=datasource_name)) + + + elif len(split_name) == 3: + # In this case, the name could refer to + # (a) a datasource, generator, and data_asset_name, or + # (b) a datasource, data_asset_name, and purpose + # If a generator is specified, there must be exactly one defined + # purpose with that name and generator + # If suite is defined, there must be exactly one + # defined generator with that name and purpose + datasource_name = split_name[0] + generator_assets = set([split_name[1], split_name[2]]) + provider_names = [] + for normalized_identifier in self.list_expectation_suites(): + normalized_split = normalized_identifier.split(self._data_asset_name_delimiter) + curr_datasource_name = normalized_split[0] + if datasource_name != curr_datasource_name: + continue + curr_generator_name = normalized_split[1] + curr_data_asset_name = normalized_split[2] + curr_curr_suite = normalized_split[3] + if ((curr_data_asset_name in generator_assets) and + ( + curr_generator_name in generator_assets or + curr_curr_suite in generator_assets + )): + provider_names.append( + NormalizedDataAssetName(*normalized_split) + ) + + if len(provider_names) == 1: + return provider_names[0] + + elif len(provider_names) > 1: + raise DataContextError("Ambiguous data_asset_name {data_asset_name}: multiple providers found." + .format(data_asset_name=data_asset_name)) + + # If the data_asset_name is not already defined, but it exists among the valid names from exactly one generator, or the named generator, provide that name + available_names = self.get_available_data_asset_names(datasource_name) + for generator in available_names[datasource_name].keys(): + names_set = available_names[datasource_name][generator] + intersection = generator_assets.intersection(names_set) + if len(intersection) > 1: + raise DataContextError("Ambiguous data_asset_name {data_asset_name}: multiple possible providers found." + .format(data_asset_name=data_asset_name)) + elif len(intersection) == 1: + possible_name = intersection.pop() + if possible_name == split_name[1]: # we were given a name and purpose + provider_names.append( + NormalizedDataAssetName(datasource_name, generator, possible_name, split_name[2]) + ) + elif possible_name == split_name[2] and split_name[1] == generator: # possible_name == split_name[2], we were given a generator and name + provider_names.append( + NormalizedDataAssetName(datasource_name, generator, possible_name, "default") + ) + if len(provider_names) == 1: + return provider_names[0] + + elif len(provider_names) > 1: + raise DataContextError("Ambiguous data_asset_name {data_asset_name}. 
Multiple candidates found: {provider_names}" + .format(data_asset_name=data_asset_name, provider_names=provider_names)) + + else: + raise ConfigNotFoundError("No generator available to produce data_asset_name {data_asset_name} with datasource {datasource_name}" + .format(data_asset_name=data_asset_name, datasource_name=datasource_name)) + + elif len(split_name) == 4: + return NormalizedDataAssetName(*split_name) + # # This must match an existing config or available data_asset + # for normalized_identifier in self.list_expectation_suites(): + # normalized_split = normalized_identifier.split(self._data_asset_name_delimiter) + # if (split_name[0] == normalized_split[0] and split_name[1] == normalized_split[1] and + # split_name[2] == normalized_split[2] and split_name[3] == normalized_split[3]): + # return normalized_identifier + + # datasource_name = split_name[0] + # generator_name = split_name[1] + # generator_asset = split_name[2] + # purpose = split_name[3] + # # If we haven't found a match yet, look in the available_data_assets + # available_names = self.get_available_data_asset_names(datasource_name) + # if generator_name in available_names[datasource_name] and + # generator_asset in available_names[datasource_name][generator_name]: + # return NormalizedDataAssetName(datasource_name, generator_name, generator_asset, purpose) + + # raise DataContextError("Data asset {data_asset_name} could not be resolved in this DataContext.".format(data_asset_name=data_asset_name)) + + def get_expectation_suite(self, data_asset_name, batch_kwargs=None): + if not isinstance(data_asset_name, NormalizedDataAssetName): + data_asset_name = self._normalize_data_asset_name(data_asset_name) + + config_file_path = self._get_normalized_data_asset_name_filepath(data_asset_name) if os.path.isfile(config_file_path): - with open(os.path.join(self.expectations_directory, data_asset_name + '.json')) as json_file: - return json.load(json_file) + with open(config_file_path, 'r') as json_file: + read_config = json.load(json_file) + # update the data_asset_name to correspond to the current name (in case the config has been moved/renamed) + read_config["data_asset_name"] = self.data_asset_name_delimiter.join(data_asset_name) + return read_config else: # TODO: Should this return None? 
Currently this method acts as get_or_create return { - 'data_asset_name': data_asset_name, + 'data_asset_name': self.data_asset_name_delimiter.join(data_asset_name), 'meta': { 'great_expectations.__version__': __version__ }, - 'expectations': [], + 'expectations': [] } - def save_expectations(self, expectations, data_asset_name=None): + def save_expectation_suite(self, expectations, data_asset_name=None): if data_asset_name is None: data_asset_name = expectations['data_asset_name'] - config_file_path = os.path.join(self.expectations_directory, data_asset_name + '.json') - safe_mmkdir(os.path.split(config_file_path)[0], exist_ok=True) + if not isinstance(data_asset_name, NormalizedDataAssetName): + data_asset_name = self._normalize_data_asset_name(data_asset_name) + config_file_path = self._get_normalized_data_asset_name_filepath(data_asset_name) + safe_mmkdir(os.path.dirname(config_file_path), exist_ok=True) with open(config_file_path, 'w') as outfile: json.dump(expectations, outfile) self._compiled = False @@ -399,10 +624,15 @@ def register_validation_results(self, run_id, validation_results, data_asset=Non if "result_store" in self._project_config: result_store = self._project_config["result_store"] if isinstance(result_store, dict) and "filesystem" in result_store: - validation_filepath = os.path.join(self.context_root_directory, "great_expectations", result_store["filesystem"]["base_directory"], - run_id, data_asset_name + ".json") + validation_filepath = self._get_normalized_data_asset_name_filepath( + data_asset_name, + base_path=os.path.join(self.context_root_directory, + "great_expectations", + result_store["filesystem"]["base_directory"], + run_id) + ) logger.info("Storing validation result: %s" % validation_filepath) - safe_mmkdir(os.path.join(self.context_root_directory, "great_expectations", result_store["filesystem"]["base_directory"], run_id)) + safe_mmkdir(os.path.dirname(validation_filepath)) with open(validation_filepath, "w") as outfile: json.dump(validation_results, outfile) if isinstance(result_store, dict) and "s3" in result_store: @@ -556,7 +786,7 @@ def _compile(self): "data_assets": {} } - known_assets = self.list_expectations_configs() + known_assets = self.list_expectation_suites() config_paths = [y for x in os.walk(self.expectations_directory) for y in glob(os.path.join(x[0], '*.json'))] for config_file in config_paths: @@ -672,16 +902,18 @@ def update_return_obj(self, data_asset, return_obj): else: return return_obj - def profile_datasource(self, datasource_name, profiler=BasicDatasetProfiler, max_data_assets=10): + def profile_datasource(self, datasource_name, generator_name=None, profiler=BasicDatasetProfiler, max_data_assets=10): logger.info("Profiling %s with %s" % (datasource_name, profiler.__name__)) - datasource = self.get_datasource(datasource_name) - data_asset_names = datasource.get_available_data_asset_names() - # TODO: This is fixed in a different PR. JPC -- merge - #!!! Abe 2019/06/11: This seems brittle. I don't understand why this object is packaged this way. - #!!! Note: need to review this to make sure the names are properly qualified. 
- data_asset_name_list = list(data_asset_names[0]["available_data_asset_names"]) + data_asset_names = self.get_available_data_asset_names(datasource_name) + if generator_name is None: + if len(data_asset_names[datasource_name].keys()) == 1: + generator_name = list(data_asset_names[datasource_name].keys())[0] + if generator_name not in data_asset_names[datasource_name]: + raise ProfilerError("Generator %s not found for datasource %s" % (generator_name, datasource_name)) + + data_asset_name_list = list(data_asset_names[datasource_name][generator_name]) total_data_assets = len(data_asset_name_list) - logger.info("Found %d named data assets" % (total_data_assets)) + logger.info("Found %d data assets using generator %s" % (total_data_assets, generator_name)) if max_data_assets == None or max_data_assets >= len(data_asset_name_list): logger.info("Profiling all %d." % (len(data_asset_name_list))) @@ -697,27 +929,25 @@ def profile_datasource(self, datasource_name, profiler=BasicDatasetProfiler, max start_time = datetime.datetime.now() #FIXME: There needs to be an affordance here to limit to 100 rows, or downsample, etc. - batch = self.get_batch(datasource_name=datasource_name, data_asset_name=name) + batch = self.get_batch(data_asset_name=NormalizedDataAssetName(datasource_name, generator_name, name, profiler.__name__)) if not profiler.validate(batch): raise ProfilerError("batch %s is not a valid batch for the %s profiler" % (name, profiler.__name__)) #Note: This logic is specific to DatasetProfilers, which profile a single batch. Multi-batch profilers will have more to unpack. - expectations_config, validation_result = profiler.profile(batch) + expectation_suite, validation_result = profiler.profile(batch) if isinstance(batch, Dataset): # For datasets, we can produce some more detailed statistics row_count = batch.get_row_count() total_rows += row_count - new_column_count = len(set([exp["kwargs"]["column"] for exp in expectations_config["expectations"] if "column" in exp["kwargs"]])) + new_column_count = len(set([exp["kwargs"]["column"] for exp in expectation_suite["expectations"] if "column" in exp["kwargs"]])) total_columns += new_column_count - new_expectation_count = len(expectations_config["expectations"]) + new_expectation_count = len(expectation_suite["expectations"]) total_expectations += new_expectation_count - #We should be able to pass a parameter to make this a `_candidate_` file - self.save_expectations(expectations_config)#, name) - # self.save_validation_result(validation_result, name) + self.save_expectation_suite(expectation_suite) duration = (datetime.datetime.now() - start_time).total_seconds() @@ -727,7 +957,7 @@ def profile_datasource(self, datasource_name, profiler=BasicDatasetProfiler, max # TODO: ^^^ except ProfilerError as err: logger.warning(err.message) - except: + except Exception as exc: logger.warning("\tSomething went wrong when profiling %s. (Perhaps a loading error?) Skipping." % (name)) skipped_data_assets += 1 @@ -755,6 +985,11 @@ def profile_datasource(self, datasource_name, profiler=BasicDatasetProfiler, max # For more help configuring great expectations, # see the documentation at: https://greatexpectations.io/config_file.html +# NOTE: GE uses the names of configured datasources and generators to manage +# how expectations and other configuration artifacts are stored in the +# expectations/ and datasources/ folders. If you need to rename an existing +# datasource or generator, be sure to also update the paths for related artifacts. 
+ """ PROJECT_OPTIONAL_CONFIG_COMMENT = """ @@ -805,4 +1040,4 @@ def profile_datasource(self, datasource_name, profiler=BasicDatasetProfiler, max Otherwise, all credential options specified here for a given profile will be passed to sqlalchemy's create URL function. -""" \ No newline at end of file +""" diff --git a/great_expectations/data_context/util.py b/great_expectations/data_context/util.py index b24bf47c1f44..7ba8fa2eac19 100644 --- a/great_expectations/data_context/util.py +++ b/great_expectations/data_context/util.py @@ -5,9 +5,17 @@ import os import json import errno +from collections import namedtuple logger = logging.getLogger(__name__) +NormalizedDataAssetName = namedtuple("NormalizedDataAssetName", [ + "datasource", + "generator", + "generator_asset", + "suite" +]) + def build_slack_notification_request(validation_json=None): # Defaults @@ -78,9 +86,7 @@ def build_slack_notification_request(validation_json=None): def get_slack_callback(webhook): def send_slack_notification(validation_json=None): - """ - Post a slack notification. - """ + """Post a slack notification.""" session = requests.Session() query = build_slack_notification_request(validation_json) diff --git a/great_expectations/dataset/dataset.py b/great_expectations/dataset/dataset.py index f6b935396666..0c3031a1875c 100644 --- a/great_expectations/dataset/dataset.py +++ b/great_expectations/dataset/dataset.py @@ -204,11 +204,12 @@ def get_column_count_in_range(self, column, min_val=None, max_val=None, min_stri """Returns: int""" raise NotImplementedError - def _initialize_expectations(self, config=None, data_asset_name=None): + def _initialize_expectations(self, expectation_suite=None, data_asset_name=None): """Override data_asset_type with "Dataset" """ - super(Dataset, self)._initialize_expectations(config=config, data_asset_name=data_asset_name) - self._expectations_config["data_asset_type"] = "Dataset" + super(Dataset, self)._initialize_expectations(expectation_suite=expectation_suite, + data_asset_name=data_asset_name) + self._expectation_suite["data_asset_type"] = "Dataset" @classmethod def column_map_expectation(cls, func): @@ -283,7 +284,11 @@ def test_column_aggregate_expectation_function(self, function, *args, **kwargs): new_function = self.column_aggregate_expectation(function) return new_function(self, *args, **kwargs) - ##### Table shape expectations ##### + ##### + # + # Table shape expectations + # + ##### @DocInherit @DataAsset.expectation(["column"]) diff --git a/great_expectations/dataset/pandas_dataset.py b/great_expectations/dataset/pandas_dataset.py index 81b1256228b0..d770673ad34c 100644 --- a/great_expectations/dataset/pandas_dataset.py +++ b/great_expectations/dataset/pandas_dataset.py @@ -278,7 +278,7 @@ class PandasDataset(MetaPandasDataset, pd.DataFrame): # get an attribute error when trying to access them (I think this could be done in __finalize__?) 
_internal_names = pd.DataFrame._internal_names + [ '_batch_kwargs', - '_expectations_config', + '_expectation_suite', 'caching', 'default_expectation_args', 'discard_subset_failing_expectations' @@ -294,7 +294,7 @@ def _constructor(self): def __finalize__(self, other, method=None, **kwargs): if isinstance(other, PandasDataset): - self._initialize_expectations(other.get_expectations( + self._initialize_expectations(other.get_expectation_suite( discard_failed_expectations=False, discard_result_format_kwargs=False, discard_include_configs_kwargs=False, diff --git a/great_expectations/datasource/__init__.py b/great_expectations/datasource/__init__.py index 3a4fac852446..e75bfd807f55 100644 --- a/great_expectations/datasource/__init__.py +++ b/great_expectations/datasource/__init__.py @@ -1,2 +1,2 @@ from .spark_source import SparkDFDatasource -from .pandas_source import PandasDatasource, FilesystemPathGenerator \ No newline at end of file +from .pandas_source import PandasDatasource, SubdirReaderGenerator \ No newline at end of file diff --git a/great_expectations/datasource/batch_generator.py b/great_expectations/datasource/batch_generator.py index 9fa25d91b7f6..036f11591919 100644 --- a/great_expectations/datasource/batch_generator.py +++ b/great_expectations/datasource/batch_generator.py @@ -1,10 +1,23 @@ +# -*- coding: utf-8 -*- + import os import copy import logging logger = logging.getLogger(__name__) + class BatchGenerator(object): + """Generators produce identifying information, called "batch_kwargs" that datasources + can use to get individual batches of data. They add flexibility in how to obtain data + such as with time-based partitioning, downsampling, or other techniques appropriate + for the datasource. + + For example, a generator could produce a SQL query that logically represents "rows in + the Events table with a timestamp on February 7, 2012," which a SqlAlchemyDatasource + could use to materialize a SqlAlchemyDataset corresponding to that batch of data and + ready for validation. 
+ """ def __init__(self, name, type_, datasource=None): self._name = name @@ -25,7 +38,7 @@ def get_config(self): def _save_config(self): if self._datasource is not None: - self._datasource._save_config() + self._datasource.save_config() else: logger.warning("Unable to save generator config without a datasource attached.") @@ -53,3 +66,12 @@ def yield_batch_kwargs(self, data_asset_name): # If we don't actually have an iterator we can generate, even after reseting, just return empty logger.warning("Unable to generate batch_kwargs for data_asset_name %s" % data_asset_name) return {} + + +class EmptyGenerator(BatchGenerator): + + def _get_iterator(self, data_asset_name, **kwargs): + return iter([]) + + def get_available_data_asset_names(self): + return set() \ No newline at end of file diff --git a/great_expectations/datasource/databricks_generator.py b/great_expectations/datasource/databricks_generator.py index 9f3728a712c7..d72ff424f856 100644 --- a/great_expectations/datasource/databricks_generator.py +++ b/great_expectations/datasource/databricks_generator.py @@ -2,7 +2,6 @@ import logging from .batch_generator import BatchGenerator -from great_expectations.dataset.sparkdf_dataset import SparkDFDataset logger = logging.getLogger(__name__) @@ -11,6 +10,7 @@ except ImportError: logger.debug("Unable to load spark context; install optional spark dependency for support.") + class DatabricksTableGenerator(BatchGenerator): """Meant to be used in a Databricks notebook diff --git a/great_expectations/datasource/datasource.py b/great_expectations/datasource/datasource.py index 635478aad44d..4e4fc9f77155 100644 --- a/great_expectations/datasource/datasource.py +++ b/great_expectations/datasource/datasource.py @@ -3,13 +3,23 @@ import copy from six import string_types +from ..data_context.util import NormalizedDataAssetName + import logging logger = logging.getLogger(__name__) yaml = YAML() yaml.default_flow_style = False + class Datasource(object): + """Datasources are responsible for connecting to data infrastructure. + Each Datasource (within your DataContext) is a source of materialized data, such as a SQL database, S3 bucket, + or local file directory. + + Since opinionated DAG managers such as airflow, dbt, prefect.io, dagster can also act as sources of + materialized data, they can also act as Datasources. 
+ """ @classmethod def from_configuration(cls, **kwargs): @@ -26,28 +36,37 @@ def __init__(self, name, type_, data_context=None, generators=None): "generators": generators } - extra_config = self._load_datasource_config() - self._datasource_config.update(extra_config) - + # extra_config = self._load_datasource_config() + # self._datasource_config.update(extra_config) + + @property + def data_context(self): + return self._data_context + + @property + def name(self): + return self._name + def _build_generators(self): for generator in self._datasource_config["generators"].keys(): self.get_generator(generator) - def _load_datasource_config(self): - # For now, just use the data context config - return {} - # if self._data_context is None: - # # Setup is done; no additional config to read - # return {} - # try: - # config_path = os.path.join(self._data_context.context_root_directory, "great_expectations/datasources", self._name, "config.yml") - # with open(config_path, "r") as data: - # extra_config = yaml.load(data) or {} - # logger.info("Loading config from %s" % str(config_path)) - # return extra_config - # except FileNotFoundError: - # logger.debug("No additional config file found.") - # return {} + # def _load_datasource_config(self): + # # For now, just use the data context config + # return {} + # # if self._data_context is None: + # # # Setup is done; no additional config to read + # # return {} + # # try: + # # config_path = os.path.join(self._data_context.context_root_directory, + # "great_expectations/datasources", self._name, "config.yml") + # # with open(config_path, "r") as data: + # # extra_config = yaml.load(data) or {} + # # logger.info("Loading config from %s" % str(config_path)) + # # return extra_config + # # except FileNotFoundError: + # # logger.debug("No additional config file found.") + # # return {} def get_credentials(self, profile_name): if self._data_context is not None: @@ -56,10 +75,10 @@ def get_credentials(self, profile_name): def get_config(self): if self._data_context is not None: - self._save_config() + self.save_config() return self._datasource_config - def _save_config(self): + def save_config(self): # For now, just use the data context config if self._data_context is not None: self._data_context._save_project_config() @@ -71,9 +90,11 @@ def _save_config(self): # if self._data_context is not None: # base_config = copy.deepcopy(self._datasource_config) # if "config_file" in base_config: - # config_filepath = os.path.join(self._data_context.context_root_directory, base_config.pop["config_file"]) + # config_filepath = os.path.join(self._data_context.context_root_directory, + # base_config.pop["config_file"]) # else: - # config_filepath = os.path.join(self._data_context.context_root_directory, "great_expectations/datasources", self._name, "config.yml") + # config_filepath = os.path.join(self._data_context.context_root_directory, + # "great_expectations/datasources", self._name, "config.yml") # else: # logger.warning("Unable to save config with no data context attached.") @@ -85,11 +106,11 @@ def add_generator(self, name, type_, **kwargs): data_asset_generator_class = self._get_generator_class(type_) generator = data_asset_generator_class(name=name, datasource=self, **kwargs) self._generators[name] = generator - if not "generators" in self._datasource_config: + if "generators" not in self._datasource_config: self._datasource_config["generators"] = {} self._datasource_config["generators"][name] = generator.get_config() if self._data_context is not None: - 
self._save_config() + self.save_config() return generator def get_generator(self, generator_name="default"): @@ -104,7 +125,9 @@ def get_generator(self, generator_name="default"): generator_name = list(self._datasource_config["generators"])[0] generator_config = copy.deepcopy(self._datasource_config["generators"][generator_name]) else: - raise ValueError("Unable to load generator %s -- no configuration found or invalid configuration." % generator_name) + raise ValueError( + "Unable to load generator %s -- no configuration found or invalid configuration." % generator_name + ) type_ = generator_config.pop("type") generator_class = self._get_generator_class(type_) generator = generator_class(name=generator_name, datasource=self, **generator_config) @@ -114,30 +137,51 @@ def get_generator(self, generator_name="default"): def list_generators(self): return [{"name": key, "type": value["type"]} for key, value in self._datasource_config["generators"].items()] - def get_data_asset(self, data_asset_name, batch_kwargs=None, **kwargs): + def get_batch(self, data_asset_name, batch_kwargs=None, **kwargs): + if isinstance(data_asset_name, NormalizedDataAssetName): # this richer type can include more metadata + generator_name = data_asset_name.generator + generator_asset = data_asset_name.generator_asset + if self._data_context is not None: + expectation_suite = self._data_context.get_expectation_suite( + data_asset_name, + batch_kwargs) + # In this case, we want to ensure we don't overwrite the name below; use the full data_asset_name + data_asset_name = self._data_context.data_asset_name_delimiter.join(data_asset_name) + else: + expectation_suite = None + # If data_context is not set, we cannot definitely use a fully normalized data_asset reference. + # This would mean someone got a normalized name without a data context which is unusual + logger.warning("Using NormalizedDataAssetName type without a data_context could result in unexpected behavior: \ + using '/' as a default delimiter.") + data_asset_name = "/".join(data_asset_name) + else: + generator_name = "default" + generator_asset = data_asset_name + expectation_suite = None + if self._data_context is not None: + logger.warning( + "Requesting a data_asset without a normalized data_asset_name; expectation_suite will not be set" + ) + if batch_kwargs is None: - generator = self.get_generator() + generator = self.get_generator(generator_name) if generator is not None: - batch_kwargs = generator.yield_batch_kwargs(data_asset_name) + batch_kwargs = generator.yield_batch_kwargs(generator_asset) else: raise ValueError("No generator or batch_kwargs available to provide a dataset.") + elif not isinstance(batch_kwargs, dict): + batch_kwargs = self.build_batch_kwargs(batch_kwargs) - if self._data_context is not None: - expectations_config = self._data_context.get_expectations(data_asset_name, batch_kwargs) - else: - expectations_config = None - - return self._get_data_asset(data_asset_name, batch_kwargs, expectations_config, **kwargs) + return self._get_data_asset(data_asset_name, batch_kwargs, expectation_suite, **kwargs) - - def _get_data_asset(self, data_asset_name, batch_kwargs, expectations_config, **kwargs): + def _get_data_asset(self, data_asset_name, batch_kwargs, expectation_suite, **kwargs): raise NotImplementedError def _get_generator_class(self, type_): raise NotImplementedError def get_available_data_asset_names(self, generator_names=None): - available_data_asset_names = [] + available_data_asset_names = {} if generator_names is None: 
generator_names = [generator["name"] for generator in self.list_generators()] elif isinstance(generator_names, string_types): @@ -145,18 +189,11 @@ def get_available_data_asset_names(self, generator_names=None): for generator_name in generator_names: generator = self.get_generator(generator_name) - available_data_asset_names.append( - { - "generator": generator_name, - "available_data_asset_names": generator.get_available_data_asset_names() - } - ) + available_data_asset_names[generator_name] = generator.get_available_data_asset_names() return available_data_asset_names - def build_batch_kwargs(self, **kwargs): + def build_batch_kwargs(self, *args, **kwargs): raise NotImplementedError def get_data_context(self): return self._data_context - - diff --git a/great_expectations/datasource/dbt_source.py b/great_expectations/datasource/dbt_source.py index 0d1052c7d555..2f332c5659a5 100644 --- a/great_expectations/datasource/dbt_source.py +++ b/great_expectations/datasource/dbt_source.py @@ -1,14 +1,14 @@ import os import time import logging +import errno from ruamel.yaml import YAML -yaml = YAML(typ='safe') -from .datasource import Datasource +from .sqlalchemy_source import SqlAlchemyDatasource from .batch_generator import BatchGenerator -from great_expectations.dataset.sqlalchemy_dataset import SqlAlchemyDataset +yaml = YAML(typ='safe') logger = logging.getLogger(__name__) try: @@ -17,20 +17,21 @@ except ImportError: logger.debug("Unable to import sqlalchemy.") + class DBTModelGenerator(BatchGenerator): """This is a helper class that makes using great expectations with dbt easy!""" - def __init__(self, name="default", datasource=None): + def __init__(self, name="dbt_models", datasource=None): super(DBTModelGenerator, self).__init__(name, type_="dbt_models", datasource=datasource) self.dbt_target_path = datasource.dbt_target_path - def _get_iterator(self, data_asset_name): + def _get_iterator(self, data_asset_name, **kwargs): """ Read compiled SQL of a dbt model. - :param model_name: model name. For model file blah/boo/mymodel.sql, pass the value "blah/boo/mymodel" + :param data_asset_name: model name. For model file blah/boo/mymodel.sql, pass the value "blah/boo/mymodel" - :return: compiled SQL ready to be executed + :return: iterator over batch_kwargs with a query parameter equal to the content of the relevant model file """ try: with open(os.path.join(self.dbt_target_path, data_asset_name) + ".sql", "r") as data: @@ -50,24 +51,25 @@ def get_available_data_asset_names(self): return set([path for path in os.walk(self.dbt_target_path) if path.endswith(".sql")]) -class DBTDatasource(Datasource): +class DBTDatasource(SqlAlchemyDatasource): """ - A DBTDataSource create a SQLAlchemy connection to the database used by a dbt project + A DBTDataSource creates a SQLAlchemy connection to the database used by a dbt project. + and allows to create, manage and validate expectations on the models that exist in that dbt project. 
""" - def __init__(self, - name="default", - data_context=None, - generators=None, - profile="default", - project_filepath="dbt_project.yml", - profiles_filepath="~/.dbt/profiles.yml", - **kwargs + def __init__(self, + name="dbt", + data_context=None, + generators=None, + profile="default", + project_filepath="dbt_project.yml", + profiles_filepath="~/.dbt/profiles.yml", + **kwargs ): if generators is None: generators = { - "default": {"type": "dbt_models"} + "dbt_models": {"type": "dbt_models"} } super(DBTDatasource, self).__init__(name, type_="dbt", data_context=data_context, generators=generators) self._datasource_config.update({ @@ -75,9 +77,10 @@ def __init__(self, "project_filepath": project_filepath, "profiles_filepath": profiles_filepath }) + self._datasource_config.update(kwargs) - self.meta = MetaData() - with open(os.path.join(self._data_context.get_context_root_directory(), self._datasource_config["project_filepath"]), "r") as f: + with open(os.path.join(self._data_context.get_context_root_directory(), + self._datasource_config["project_filepath"]), "r") as f: self._dbt_project = yaml.load(f) or {} self.dbt_target_path = os.path.join( @@ -88,13 +91,10 @@ def __init__(self, ) self._options = self._get_sqlalchemy_connection_options() - self._connect(self._get_sqlalchemy_connection_options()) + self._connect(self._get_sqlalchemy_connection_options(**kwargs)) self._build_generators() - def _connect(self, options, *args, **kwargs): - self.engine = create_engine(options, *args, **kwargs) - - def _get_sqlalchemy_connection_options(self): + def _get_sqlalchemy_connection_options(self, **kwargs): with open(os.path.expanduser(self._datasource_config["profiles_filepath"]), "r") as data: profiles_config = yaml.load(data) or {} @@ -117,24 +117,17 @@ def _get_generator_class(self, type_): else: raise ValueError("Unrecognized DataAssetGenerator type %s" % type_) - def _get_data_asset(self, data_asset_name, batch_kwargs, expectations_config): - """ - Get a data asset object that will allow to create, manage and validate expectations on a dbt model. - - Args: - data_asset_name (string): \ - Name of an existing dbt model. - If your model sql file is models/myfolder1/my_model1.sql, pass "myfolder1/my_model1". 
- - Notes: - This method will read the compiled SQL for this model from dbt's "compiled" folder - make sure that - it is up to date after modifying the model's SQL source - recompile or rerun your dbt pipeline - """ - custom_sql = batch_kwargs["query"] - return SqlAlchemyDataset(table_name=data_asset_name, - engine=self.engine, - data_context=self._data_context, - data_asset_name=data_asset_name, - expectations_config=expectations_config, - custom_sql=custom_sql, - batch_kwargs=batch_kwargs) \ No newline at end of file + def build_batch_kwargs(self, *args, **kwargs): + if len(args) > 0: + # Allow a model name here + generator = self.get_generator() + if isinstance(generator, DBTModelGenerator): + batch_kwargs = generator.yield_batch_kwargs(args[0]) + else: + batch_kwargs = {} + else: + batch_kwargs = {} + batch_kwargs.update({ + "timestamp": time.time() + }) + return batch_kwargs diff --git a/great_expectations/datasource/filesystem_path_generator.py b/great_expectations/datasource/filesystem_path_generator.py index 6e3e1a9c94c2..a1d094e95613 100644 --- a/great_expectations/datasource/filesystem_path_generator.py +++ b/great_expectations/datasource/filesystem_path_generator.py @@ -1,22 +1,55 @@ import os -import errno -import hashlib +import time +import re from .batch_generator import BatchGenerator -class FilesystemPathGenerator(BatchGenerator): - """ + +class SubdirReaderGenerator(BatchGenerator): + """The SubdirReaderGenerator inspects a filesytem and produces batch_kwargs with a path and timestamp. + + SubdirReaderGenerator recognizes data_asset_name using two criteria: + - for files directly in 'base_directory' with recognized extensions (.csv), it uses the name of the file without + the extension + - for other files or directories in 'base_directory', is uses the file or directory name + + For directories in 'base_directory', SubdirReaderGenerator iterates over + + SubdirReaderGenerator also uses + SubdirReaderGenerator /data/users/users_20180101.csv /data/users/users_20180102.csv """ - def __init__(self, name="default", datasource=None, base_directory="/data"): - super(FilesystemPathGenerator, self).__init__(name, type_="filesystem", datasource=datasource) + def __init__(self, name="default", + datasource=None, + base_directory="/data", + reader_options=None): + super(SubdirReaderGenerator, self).__init__(name, type_="subdir_reader", datasource=datasource) + if reader_options is None: + reader_options = {} + + self._reader_options = reader_options self._base_directory = base_directory + @property + def reader_options(self): + return self._reader_options + + @property + def base_directory(self): + # If base directory is a relative path, interpret it as relative to the data context's + # context root directory (parent directory of great_expectation dir) + if os.path.isabs(self._base_directory) or self._datasource.get_data_context() is None: + return self._base_directory + else: + return os.path.join(self._datasource.get_data_context().get_context_root_directory(), self._base_directory) + def get_available_data_asset_names(self): known_assets = set() - file_options = os.listdir(self._get_current_base_directory()) + if not os.path.isdir(self.base_directory): + return known_assets + file_options = os.listdir(self.base_directory) for file_option in file_options: if file_option.endswith(".csv"): known_assets.add(file_option[:-4]) @@ -24,53 +57,39 @@ def get_available_data_asset_names(self): known_assets.add(file_option) return known_assets - def _get_iterator(self, data_asset_name): + def 
_get_iterator(self, data_asset_name, **kwargs): # If the data_asset_name is a file, then return the path. # Otherwise, use files in a subdir as batches - if os.path.isdir(os.path.join(self._get_current_base_directory(), data_asset_name)): + if os.path.isdir(os.path.join(self.base_directory, data_asset_name)): return self._build_batch_kwargs_path_iter( [ - os.path.join(self._get_current_base_directory(), data_asset_name, path) - for path in os.listdir(os.path.join(self._get_current_base_directory(), data_asset_name)) + os.path.join(self.base_directory, data_asset_name, path) + for path in os.listdir(os.path.join(self.base_directory, data_asset_name)) ] - - ) - # return self._build_batch_kwargs_path_iter(os.scandir(os.path.join(self._get_current_base_directory(), data_asset_name))) + ) + # return self._build_batch_kwargs_path_iter(os.scandir(os.path.join(self.base_directory, data_asset_name))) # return iter([{ - # "path": os.path.join(self._get_current_base_directory(), data_asset_name, x) - # } for x in os.listdir(os.path.join(self._get_current_base_directory(), data_asset_name))]) - elif os.path.isfile(os.path.join(self._get_current_base_directory(), data_asset_name)): - path = os.path.join(self._get_current_base_directory(), data_asset_name) + # "path": os.path.join(self.base_directory, data_asset_name, x) + # } for x in os.listdir(os.path.join(self.base_directory, data_asset_name))]) + elif os.path.isfile(os.path.join(self.base_directory, data_asset_name)): + path = os.path.join(self.base_directory, data_asset_name) # with open(path,'rb') as f: # md5 = hashlib.md5(f.read()).hexdigest() - return iter([ - { - "path": path, - # "md5": md5 - } - ]) - elif os.path.isfile(os.path.join(self._get_current_base_directory(), data_asset_name + ".csv")): - path = os.path.join(self._get_current_base_directory(), data_asset_name + ".csv") + return iter([self._build_batch_kwargs(path)]) + elif os.path.isfile(os.path.join(self.base_directory, data_asset_name + ".csv")): + path = os.path.join(self.base_directory, data_asset_name + ".csv") # with open(path,'rb') as f: # md5 = hashlib.md5(f.read()).hexdigest() - return iter([ - { - "path": path, - # "md5": md5 - } - ]) + return iter([self._build_batch_kwargs(path)]) else: - raise IOError(os.path.join(self._base_directory, data_asset_name)) + raise IOError(os.path.join(self.base_directory, data_asset_name)) # def _build_batch_kwargs_path_iter(self, path_iter): def _build_batch_kwargs_path_iter(self, path_list): for path in path_list: # with open(path,'rb') as f: # md5 = hashlib.md5(f.read()).hexdigest() - yield { - "path": path, - # "md5": md5 - } + yield self._build_batch_kwargs(path) # try: # while True: # yield { @@ -79,10 +98,10 @@ def _build_batch_kwargs_path_iter(self, path_list): # except StopIteration: # return - # If base directory is a relative path, interpret it as relative to the data context's - # context root directory (parent directory of great_expectation dir) - def _get_current_base_directory(self): - if os.path.isabs(self._base_directory) or self._datasource.get_data_context() is None: - return self._base_directory - else: - return os.path.join(self._datasource.get_data_context().get_context_root_directory(), self._base_directory) + def _build_batch_kwargs(self, path): + batch_kwargs = { + "path": path, + "timestamp": time.time() + } + batch_kwargs.update(self.reader_options) + return batch_kwargs diff --git a/great_expectations/datasource/pandas_source.py b/great_expectations/datasource/pandas_source.py index c117d77dcd65..4fd02554755d 
100644 --- a/great_expectations/datasource/pandas_source.py +++ b/great_expectations/datasource/pandas_source.py @@ -1,10 +1,12 @@ -import os +import copy import time +from six import string_types import pandas as pd from .datasource import Datasource -from .filesystem_path_generator import FilesystemPathGenerator +from .filesystem_path_generator import SubdirReaderGenerator +from .batch_generator import EmptyGenerator from great_expectations.dataset.pandas_dataset import PandasDataset from great_expectations.exceptions import BatchKwargsError @@ -12,54 +14,79 @@ class PandasDatasource(Datasource): """ - A PandasDataSource makes it easy to create, manage and validate expectations on + A PandasDatasource makes it easy to create, manage and validate expectations on Pandas dataframes. - Use with the FilesystemPathGenerator for simple cases. + Use with the SubdirReaderGenerator for simple cases. """ - def __init__(self, name="default", data_context=None, generators=None, read_csv_kwargs=None, **kwargs): + def __init__(self, name="pandas", data_context=None, generators=None, **kwargs): if generators is None: - # Provide a gentle way to build a datasource with a sane default, including ability to specify the base_directory + # Provide a gentle way to build a datasource with a sane default, + # including ability to specify the base_directory and reader_options base_directory = kwargs.pop("base_directory", "/data") + reader_options = kwargs.pop("reader_options", {}) generators = { - "default": {"type": "filesystem", "base_directory": base_directory} - } - super(PandasDatasource, self).__init__(name, type_="pandas", data_context=data_context, generators=generators) - self._datasource_config.update( - { - "read_csv_kwargs": read_csv_kwargs or {} + "default": { + "type": "subdir_reader", + "base_directory": base_directory, + "reader_options": reader_options + } } - ) + super(PandasDatasource, self).__init__(name, type_="pandas", + data_context=data_context, + generators=generators) self._build_generators() def _get_generator_class(self, type_): - if type_ == "filesystem": - return FilesystemPathGenerator + if type_ == "subdir_reader": + return SubdirReaderGenerator + elif type_ == "memory": + return EmptyGenerator else: raise ValueError("Unrecognized BatchGenerator type %s" % type_) - def _get_data_asset(self, data_asset_name, batch_kwargs, expectations_config, **kwargs): - try: - full_path = os.path.join(batch_kwargs["path"]) - except KeyError: - raise BatchKwargsError("Invalid batch_kwargs: path is required for a PandasDatasource", batch_kwargs) - - all_kwargs = dict(**self._datasource_config["read_csv_kwargs"]) - all_kwargs.update(**kwargs) + def _get_data_asset(self, data_asset_name, batch_kwargs, expectation_suite, **kwargs): + batch_kwargs.update(kwargs) + if "path" in batch_kwargs: + reader_options = batch_kwargs.copy() + path = reader_options.pop("path") # We need to remove from the reader + reader_options.pop("timestamp") # ditto timestamp + if path.endswith((".csv", ".tsv")): + df = pd.read_csv(path, **reader_options) + elif path.endswith(".parquet"): + df = pd.read_parquet(path, **reader_options) + elif path.endswith((".xls", ".xlsx")): + df = pd.read_excel(path, **reader_options) + else: + raise BatchKwargsError("Unrecognized path: no available reader.", + batch_kwargs) + elif "df" in batch_kwargs and isinstance(batch_kwargs["df"], (pd.DataFrame, pd.Series)): + df = batch_kwargs.pop("df") # We don't want to store the actual dataframe in kwargs + else: + raise BatchKwargsError("Invalid 
batch_kwargs: path or df is required for a PandasDatasource", + batch_kwargs) - df = pd.read_csv(full_path, **all_kwargs) - - return PandasDataset(df, - expectations_config=expectations_config, - data_context=self._data_context, - data_asset_name=data_asset_name, - batch_kwargs=batch_kwargs) + return PandasDataset(df, + expectation_suite=expectation_suite, + data_context=self._data_context, + data_asset_name=data_asset_name, + batch_kwargs=batch_kwargs) - def build_batch_kwargs(self, filepath, **kwargs): - batch_kwargs = { - "path": filepath, - "timestamp": time.time() - } - batch_kwargs.update(dict(**kwargs)) - return batch_kwargs \ No newline at end of file + def build_batch_kwargs(self, *args, **kwargs): + if len(args) > 0: + if isinstance(args[0], (pd.DataFrame, pd.Series)): + kwargs.update({ + "df": args[0], + "timestamp": time.time() + }) + elif isinstance(args[0], string_types): + kwargs.update({ + "path": args[0], + "timestamp": time.time() + }) + else: + kwargs.update({ + "timestamp": time.time() + }) + return kwargs diff --git a/great_expectations/datasource/spark_source.py b/great_expectations/datasource/spark_source.py index 0bd2c5484e2b..73fa08c0faee 100644 --- a/great_expectations/datasource/spark_source.py +++ b/great_expectations/datasource/spark_source.py @@ -2,7 +2,7 @@ import logging from .datasource import Datasource -from .filesystem_path_generator import FilesystemPathGenerator +from .filesystem_path_generator import SubdirReaderGenerator from .databricks_generator import DatabricksTableGenerator logger = logging.getLogger(__name__) @@ -14,23 +14,26 @@ # TODO: review logging more detail here logger.debug("Unable to load pyspark; install optional spark dependency for support.") + class SparkDFDatasource(Datasource): - """For now, functions like PandasCSVDataContext + """The SparkDFDatasource produces spark dataframes and supports generators capable of interacting with local + filesystem (the default subdir_reader generator) and databricks notebooks. 
""" - def __init__(self, name="default", data_context=None, generators=None, reader_options=None, **kwargs): + def __init__(self, name="default", data_context=None, generators=None, **kwargs): if generators is None: - # Provide a gentle way to build a datasource with a sane default, including ability to specify the base_directory + # Provide a gentle way to build a datasource with a sane default, + # including ability to specify the base_directory base_directory = kwargs.pop("base_directory", "/data") + reader_options = kwargs.pop("reader_options", {}) generators = { - "default": {"type": "filesystem", "base_directory": base_directory} + "default": { + "type": "subdir_reader", + "base_directory": base_directory, + "reader_options": reader_options + } } super(SparkDFDatasource, self).__init__(name, type_="spark", data_context=data_context, generators=generators) - self._datasource_config.update( - { - "reader_options": reader_options or {} - } - ) try: self.spark = SparkSession.builder.getOrCreate() except Exception: @@ -40,34 +43,33 @@ def __init__(self, name="default", data_context=None, generators=None, reader_op self._build_generators() def _get_generator_class(self, type_): - if type_ == "filesystem": - return FilesystemPathGenerator + if type_ == "subdir_reader": + return SubdirReaderGenerator elif type_ == "databricks": return DatabricksTableGenerator else: raise ValueError("Unrecognized BatchGenerator type %s" % type_) - - def _get_data_asset(self, data_asset_name, batch_kwargs, expectations_config, caching=False, **kwargs): + def _get_data_asset(self, data_asset_name, batch_kwargs, expectation_suite, caching=False, **kwargs): if self.spark is None: logger.error("No spark session available") return None if "path" in batch_kwargs: + path = batch_kwargs.pop("path") # We remove this so it is not used as a reader option reader = self.spark.read - all_reader_options = dict(**self._datasource_config["reader_options"]) - all_reader_options.update(**kwargs) + batch_kwargs.update(kwargs) - for option in all_reader_options.items(): + for option in batch_kwargs.items(): reader = reader.option(*option) - df = reader.csv(os.path.join(batch_kwargs["path"])) + df = reader.csv(os.path.join(path)) elif "query" in batch_kwargs: df = self.spark.sql(batch_kwargs.query) return SparkDFDataset(df, - expectations_config=expectations_config, - data_context=self._data_context, - data_asset_name=data_asset_name, - batch_kwargs=batch_kwargs, - caching=caching) + expectation_suite=expectation_suite, + data_context=self._data_context, + data_asset_name=data_asset_name, + batch_kwargs=batch_kwargs, + caching=caching) diff --git a/great_expectations/datasource/sqlalchemy_source.py b/great_expectations/datasource/sqlalchemy_source.py index 86d0df02d2fb..2e8d041d6fde 100644 --- a/great_expectations/datasource/sqlalchemy_source.py +++ b/great_expectations/datasource/sqlalchemy_source.py @@ -13,22 +13,26 @@ import sqlalchemy from sqlalchemy import create_engine, MetaData except ImportError: + sqlalchemy = None + create_engine = None + MetaData = None logger.debug("Unable to import sqlalchemy.") + class QueryGenerator(BatchGenerator): - """ + """Produce query-style batch_kwargs from sql files stored on disk """ - def __init__(self, name="default", datasource=None, engine=None): + def __init__(self, datasource, name="default"): super(QueryGenerator, self).__init__(name=name, type_="queries", datasource=datasource) self.meta = MetaData() - if datasource is not None and datasource._data_context is not None: - 
self._queries_path = os.path.join(self._datasource._data_context.context_root_directory, - "great_expectations/datasources", - self._datasource._name, - "generators", - self._name, - "queries") + if datasource is not None and datasource.data_context is not None: + self._queries_path = os.path.join(self._datasource.data_context.context_root_directory, + "great_expectations/datasources", + self._datasource.name, + "generators", + self._name, + "queries") else: self._queries_path = None self._queries = {} @@ -36,9 +40,9 @@ def __init__(self, name="default", datasource=None, engine=None): if datasource is not None: self.engine = datasource.engine - def _get_iterator(self, data_asset_name): + def _get_iterator(self, data_asset_name, **kwargs): if self._queries_path: - if data_asset_name in [path for path in os.walk(self._queries_path) if path.endswith(".sql")]: + if data_asset_name in [path for path in os.walk(self._queries_path) if str(path).endswith(".sql")]: with open(os.path.join(self._queries_path, data_asset_name) + ".sql", "r") as data: return iter([{ "query": data.read(), @@ -72,7 +76,7 @@ def add_query(self, data_asset_name, query): def get_available_data_asset_names(self): if self._queries_path: - defined_queries = [path for path in os.walk(self._queries_path) if path.endswith(".sql")] + defined_queries = [path for path in os.walk(self._queries_path) if str(path).endswith(".sql")] else: defined_queries = list(self._queries.keys()) if self.engine is not None: @@ -83,6 +87,7 @@ def get_available_data_asset_names(self): return set(defined_queries + tables) + class SqlAlchemyDatasource(Datasource): """ A SqlAlchemyDatasource will provide data_assets converting batch_kwargs using the following rules: @@ -90,27 +95,26 @@ class SqlAlchemyDatasource(Datasource): to that table - if the batch_kwargs include a query key, the datasource will create a temporary table using that that query. The query can be parameterized according to the standard python Template engine, which - uses $parameter, with additional kwargs passed to the get_data_asset method. + uses $parameter, with additional kwargs passed to the get_batch method. 
""" def __init__(self, name="default", data_context=None, profile=None, generators=None, **kwargs): if generators is None: generators = { "default": {"type": "queries"} - } - super(SqlAlchemyDatasource, self).__init__(name, type_="sqlalchemy", data_context=data_context, generators=generators) + } + super(SqlAlchemyDatasource, self).__init__(name, + type_="sqlalchemy", + data_context=data_context, + generators=generators) if profile is not None: self._datasource_config.update({ "profile": profile }) - credentials = data_context.get_profile_credentials(profile) - else: - credentials = {} - # if an engine was provided, use that - kwarg_engine = kwargs.pop("engine", None) - if kwarg_engine is not None: - self.engine = kwarg_engine + if "engine" in kwargs: + self.engine = kwargs.pop("engine") + # if a connection string or url was provided, use that elif "connection_string" in kwargs: connection_string = kwargs.pop("connection_string") @@ -118,56 +122,64 @@ def __init__(self, name="default", data_context=None, profile=None, generators=N elif "url" in kwargs: url = kwargs.pop("url") self.engine = create_engine(url, **kwargs) - else: - # Update credentials with anything passed during connection time - credentials.update(dict(**kwargs)) - drivername = credentials.pop("drivername") - options = sqlalchemy.engine.url.URL(drivername, **credentials) - self.engine = create_engine(options) - self.meta = MetaData() + # Otherwise, connect using remaining kwargs + else: + self._connect(self._get_sqlalchemy_connection_options(**kwargs)) + self._build_generators() + def _get_sqlalchemy_connection_options(self, **kwargs): + if "profile" in self._datasource_config: + profile = self._datasource_config["profile"] + credentials = self.data_context.get_profile_credentials(profile) + else: + credentials = {} + + # Update credentials with anything passed during connection time + credentials.update(dict(**kwargs)) + drivername = credentials.pop("drivername") + options = sqlalchemy.engine.url.URL(drivername, **credentials) + return options + + def _connect(self, options): + self.engine = create_engine(options) + self.meta = MetaData() + def _get_generator_class(self, type_): if type_ == "queries": return QueryGenerator else: raise ValueError("Unrecognized DataAssetGenerator type %s" % type_) - def _get_data_asset(self, data_asset_name, batch_kwargs, expectations_config, schema=None, **kwargs): + def _get_data_asset(self, data_asset_name, batch_kwargs, expectation_suite, schema=None, **kwargs): if "table" in batch_kwargs: return SqlAlchemyDataset(table_name=batch_kwargs["table"], - engine=self.engine, - schema=schema, - data_context=self._data_context, - data_asset_name=data_asset_name, - expectations_config=expectations_config, - batch_kwargs=batch_kwargs) + engine=self.engine, + schema=schema, + data_context=self._data_context, + data_asset_name=data_asset_name, + expectation_suite=expectation_suite, + batch_kwargs=batch_kwargs) elif "query" in batch_kwargs: query = Template(batch_kwargs["query"]).safe_substitute(**kwargs) return SqlAlchemyDataset(table_name=data_asset_name, - engine=self.engine, - data_context=self._data_context, - data_asset_name=data_asset_name, - expectations_config=expectations_config, - custom_sql=query, - batch_kwargs=batch_kwargs) + engine=self.engine, + data_context=self._data_context, + data_asset_name=data_asset_name, + expectation_suite=expectation_suite, + custom_sql=query, + batch_kwargs=batch_kwargs) else: raise ValueError("Invalid batch_kwargs: exactly one of 'table' or 'query' must be 
specified") - def build_batch_kwargs(self, table=None, query=None): - if (table is None and query is None) or (table is not None and query is not None): - raise ValueError("Exactly one of 'table' or 'query' must be specified.") - - if table is not None: - return { - "table": table, + def build_batch_kwargs(self, *args, **kwargs): + """Magically build batch_kwargs by guessing that the first non-keyword argument is a table name""" + if len(args) > 0: + kwargs.update({ + "table": args[0], "timestamp": time.time() - } - else: - return { - "query": query, - "timestamp": time.time() - } + }) + return kwargs diff --git a/great_expectations/exceptions.py b/great_expectations/exceptions.py index 6046cf1b1857..810d4fa1906f 100644 --- a/great_expectations/exceptions.py +++ b/great_expectations/exceptions.py @@ -2,7 +2,7 @@ class GreatExpectationsError(Exception): def __init__(self, message): - self. message = message + self.message = message class DataContextError(GreatExpectationsError): pass @@ -14,13 +14,12 @@ class ConfigNotFoundError(DataContextError): def __init__(self, context_root_directory): self.message = "No configuration found in %s" % str(os.path.join(context_root_directory, "great_expectations")) -class ExpectationsConfigNotFoundError(GreatExpectationsError): +class ExpectationSuiteNotFoundError(GreatExpectationsError): def __init__(self, data_asset_name): self.data_asset_name = data_asset_name - self.message = "No expectations config found for data_asset_name %s" % data_asset_name + self.message = "No expectation suite found for data_asset_name %s" % data_asset_name - -class BatchKwargsError(GreatExpectationsError): +class BatchKwargsError(DataContextError): def __init__(self, message, batch_kwargs): self.message = message self.batch_kwargs = batch_kwargs \ No newline at end of file diff --git a/great_expectations/init_notebooks/create_expectations_for_csv_files.ipynb b/great_expectations/init_notebooks/create_expectations_for_csv_files.ipynb index 4189d2fdef0d..5c2aee8e92cd 100644 --- a/great_expectations/init_notebooks/create_expectations_for_csv_files.ipynb +++ b/great_expectations/init_notebooks/create_expectations_for_csv_files.ipynb @@ -240,7 +240,7 @@ "metadata": {}, "outputs": [], "source": [ - "df.get_expectations_config()" + "df.get_expectation_suite()" ] }, { @@ -263,7 +263,7 @@ "metadata": {}, "outputs": [], "source": [ - "df.save_expectations_config()" + "df.save_expectation_suite()" ] }, { diff --git a/great_expectations/init_notebooks/create_expectations_for_spark_dataframes.ipynb b/great_expectations/init_notebooks/create_expectations_for_spark_dataframes.ipynb index e2f5ae83701e..4a0cda7c2088 100644 --- a/great_expectations/init_notebooks/create_expectations_for_spark_dataframes.ipynb +++ b/great_expectations/init_notebooks/create_expectations_for_spark_dataframes.ipynb @@ -239,7 +239,7 @@ "metadata": {}, "outputs": [], "source": [ - "df.get_expectations()" + "df.get_expectation_suite()" ] }, { @@ -262,7 +262,7 @@ "metadata": {}, "outputs": [], "source": [ - "df.save_expectations()" + "df.save_expectation_suite()" ] }, { diff --git a/great_expectations/init_notebooks/create_initial_expectations_pandas.ipynb b/great_expectations/init_notebooks/create_initial_expectations_pandas.ipynb index 6c85e510bd95..4820c9e4005b 100644 --- a/great_expectations/init_notebooks/create_initial_expectations_pandas.ipynb +++ b/great_expectations/init_notebooks/create_initial_expectations_pandas.ipynb @@ -66,7 +66,7 @@ "metadata": {}, "outputs": [], "source": [ - "df = 
context.get_data_asset(\"local-data\", \"Titanic.csv\")" + "df = context.get_batch(\"local-data\", \"Titanic.csv\")" ] }, { @@ -78,7 +78,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "WARNING: get_expectations_config discarded\n", + "WARNING: get_expectation_suite discarded\n", "\t0 failing expectations\n", "\t0 result_format kwargs\n", "\t0 include_configs kwargs\n", @@ -101,7 +101,7 @@ } ], "source": [ - "df.get_expectations_config()" + "df.get_expectation_suite()" ] }, { @@ -244,7 +244,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "WARNING: get_expectations_config discarded\n", + "WARNING: get_expectation_suite discarded\n", "\t0 failing expectations\n", "\t1 result_format kwargs\n", "\t0 include_configs kwargs\n", @@ -254,7 +254,7 @@ } ], "source": [ - "df.save_expectations_config()" + "df.save_expectation_suite()" ] }, { @@ -263,7 +263,7 @@ "metadata": {}, "outputs": [], "source": [ - "df_every_visit_per_day.save_expectations_config()" + "df_every_visit_per_day.save_expectation_suite()" ] }, { diff --git a/great_expectations/init_notebooks/integrate_validation_into_pipeline.ipynb b/great_expectations/init_notebooks/integrate_validation_into_pipeline.ipynb index d24b1cfc992f..5594b0bd304f 100644 --- a/great_expectations/init_notebooks/integrate_validation_into_pipeline.ipynb +++ b/great_expectations/init_notebooks/integrate_validation_into_pipeline.ipynb @@ -87,7 +87,7 @@ "metadata": {}, "outputs": [], "source": [ - "context.list_expectations_configs() # ????" + "context.list_expectation_suites() # ????" ] }, { diff --git a/great_expectations/init_notebooks/using_great_expectations_with_dbt.ipynb b/great_expectations/init_notebooks/using_great_expectations_with_dbt.ipynb index 63d9c036a831..aff0449e62aa 100644 --- a/great_expectations/init_notebooks/using_great_expectations_with_dbt.ipynb +++ b/great_expectations/init_notebooks/using_great_expectations_with_dbt.ipynb @@ -80,7 +80,7 @@ "metadata": {}, "outputs": [], "source": [ - "df_base_scheduleappointment = context.get_data_asset(\"mydbt\", \"staging/staging_schedule_appointments\")" + "df_base_scheduleappointment = context.get_batch(\"mydbt\", \"staging/staging_schedule_appointments\")" ] }, { @@ -92,7 +92,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "WARNING: get_expectations discarded\n", + "WARNING: get_expectation_suite discarded\n", "\t0 failing expectations\n", "\t0 result_format kwargs\n", "\t0 include_configs kwargs\n", @@ -115,7 +115,7 @@ } ], "source": [ - "df_base_scheduleappointment.get_expectations()" + "df_base_scheduleappointment.get_expectation_suite()" ] }, { @@ -161,7 +161,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "WARNING: get_expectations discarded\n", + "WARNING: get_expectation_suite discarded\n", "\t0 failing expectations\n", "\t1 result_format kwargs\n", "\t0 include_configs kwargs\n", @@ -171,7 +171,7 @@ } ], "source": [ - "df_base_scheduleappointment.save_expectations()" + "df_base_scheduleappointment.save_expectation_suite()" ] }, { @@ -415,7 +415,7 @@ "metadata": {}, "outputs": [], "source": [ - "df_base_scheduleappointment.save_expectations()" + "df_base_scheduleappointment.save_expectation_suite()" ] }, { @@ -445,7 +445,7 @@ "metadata": {}, "outputs": [], "source": [ - "df_every_visit_per_day = context.get_data_asset(\"mydbt\", \"schedule_appointments\")" + "df_every_visit_per_day = context.get_batch(\"mydbt\", \"schedule_appointments\")" ] }, { @@ -568,7 +568,7 @@ "metadata": {}, "outputs": [], "source": [ - 
"df_every_visit_per_day.save_expectations()" + "df_every_visit_per_day.save_expectation_suite()" ] }, { diff --git a/great_expectations/init_notebooks/using_great_expectations_with_sql.ipynb b/great_expectations/init_notebooks/using_great_expectations_with_sql.ipynb index 77c834bfc292..7542d851e4ac 100644 --- a/great_expectations/init_notebooks/using_great_expectations_with_sql.ipynb +++ b/great_expectations/init_notebooks/using_great_expectations_with_sql.ipynb @@ -73,7 +73,7 @@ "metadata": {}, "outputs": [], "source": [ - "df = context.get_data_asset(\"mydb\", data_asset_name=\"myquery1\", custom_sql=\"select * from scheduleappointment\")" + "df = context.get_batch(\"mydb\", data_asset_name=\"myquery1\", custom_sql=\"select * from scheduleappointment\")" ] }, { @@ -82,7 +82,7 @@ "metadata": {}, "outputs": [], "source": [ - "df.get_expectations()" + "df.get_expectation_suite()" ] }, { @@ -107,7 +107,7 @@ "metadata": {}, "outputs": [], "source": [ - "df.save_expectations()" + "df.save_expectation_suite()" ] }, { @@ -116,7 +116,7 @@ "metadata": {}, "outputs": [], "source": [ - "df_every_visit_per_day.save_expectations()" + "df_every_visit_per_day.save_expectation_suite()" ] }, { diff --git a/great_expectations/profile/basic_dataset_profiler.py b/great_expectations/profile/basic_dataset_profiler.py index a020edce73fa..4f07e7508bde 100644 --- a/great_expectations/profile/basic_dataset_profiler.py +++ b/great_expectations/profile/basic_dataset_profiler.py @@ -115,4 +115,4 @@ def _profile(cls, dataset): # print("??????", column, type_, cardinality) pass - return df.get_expectations(suppress_warnings=True) + return df.get_expectation_suite(suppress_warnings=True) diff --git a/great_expectations/profile/columns_exist.py b/great_expectations/profile/columns_exist.py index 29d1476e487c..28e2ac9108fa 100644 --- a/great_expectations/profile/columns_exist.py +++ b/great_expectations/profile/columns_exist.py @@ -34,4 +34,4 @@ def _profile(cls, dataset): create_multiple_expectations( dataset, table_columns, "expect_column_to_exist") - return dataset.get_expectations(suppress_warnings=True) + return dataset.get_expectation_suite(suppress_warnings=True) diff --git a/great_expectations/render/README.md b/great_expectations/render/README.md index e232c5a3d317..f480f13dd15a 100644 --- a/great_expectations/render/README.md +++ b/great_expectations/render/README.md @@ -208,7 +208,7 @@ The relationship from Expectation (or EVR) to snippet methods is one to one or n ``` ge.render( - expectations_config=my_expectation_config, + expectation_suite=my_expectation_config, view_model=ge.render.view_models.default_html.prescriptive_data_docs ) ``` @@ -219,7 +219,7 @@ ge.render( ``` ge.render( - expectations_config=my_expectation_config, + expectation_suite=my_expectation_config, view_model=ge.render.view_models.slack render_to="json" ) diff --git a/great_expectations/util.py b/great_expectations/util.py index 00013fb8c524..4fb3e73b46ed 100644 --- a/great_expectations/util.py +++ b/great_expectations/util.py @@ -14,14 +14,14 @@ logger = logging.getLogger(__name__) -def _convert_to_dataset_class(df, dataset_class, expectations_config=None, profiler=None): +def _convert_to_dataset_class(df, dataset_class, expectation_suite=None, profiler=None): """ - Convert a (pandas) dataframe to a great_expectations dataset, with (optional) expectations_config + Convert a (pandas) dataframe to a great_expectations dataset, with (optional) expectation_suite """ - if expectations_config is not None: - # Create a dataset of the new class 
type, and manually initialize expectations according to the provided configuration + if expectation_suite is not None: + # Create a dataset of the new class type, and manually initialize expectations according to the provided expectation suite new_df = dataset_class.from_dataset(df) - new_df._initialize_expectations(expectations_config) + new_df._initialize_expectations(expectation_suite) else: # Instantiate the new Dataset with default expectations new_df = dataset_class.from_dataset(df) @@ -34,20 +34,20 @@ def _convert_to_dataset_class(df, dataset_class, expectations_config=None, profi def read_csv( filename, dataset_class=dataset.pandas_dataset.PandasDataset, - expectations_config=None, + expectation_suite=None, profiler=None, *args, **kwargs ): df = pd.read_csv(filename, *args, **kwargs) df = _convert_to_dataset_class( - df, dataset_class, expectations_config, profiler) + df, dataset_class, expectation_suite, profiler) return df def read_json( filename, dataset_class=dataset.pandas_dataset.PandasDataset, - expectations_config=None, + expectation_suite=None, accessor_func=None, profiler=None, *args, **kwargs @@ -61,14 +61,14 @@ def read_json( df = pd.read_json(filename, *args, **kwargs) df = _convert_to_dataset_class( - df, dataset_class, expectations_config, profiler) + df, dataset_class, expectation_suite, profiler) return df def read_excel( filename, dataset_class=dataset.pandas_dataset.PandasDataset, - expectations_config=None, + expectation_suite=None, profiler=None, *args, **kwargs ): @@ -77,7 +77,7 @@ def read_excel( Args: filename (string): path to file to read dataset_class (Dataset class): class to which to convert resulting Pandas df - expectations_config (string): path to great_expectations config file + expectation_suite (string): path to great_expectations expectation suite file Returns: great_expectations dataset or ordered dict of great_expectations datasets, @@ -87,17 +87,17 @@ def read_excel( if isinstance(df, dict): for key in df: df[key] = _convert_to_dataset_class( - df[key], dataset_class, expectations_config, profiler) + df[key], dataset_class, expectation_suite, profiler) else: df = _convert_to_dataset_class( - df, dataset_class, expectations_config, profiler) + df, dataset_class, expectation_suite, profiler) return df def read_table( filename, dataset_class=dataset.pandas_dataset.PandasDataset, - expectations_config=None, + expectation_suite=None, profiler=None, *args, **kwargs ): @@ -106,21 +106,21 @@ def read_table( Args: filename (string): path to file to read dataset_class (Dataset class): class to which to convert resulting Pandas df - expectations_config (string): path to great_expectations config file + expectation_suite (string): path to great_expectations expectation suite file Returns: great_expectations dataset """ df = pd.read_table(filename, *args, **kwargs) df = _convert_to_dataset_class( - df, dataset_class, expectations_config, profiler) + df, dataset_class, expectation_suite, profiler) return df def read_parquet( filename, dataset_class=dataset.pandas_dataset.PandasDataset, - expectations_config=None, + expectation_suite=None, profiler=None, *args, **kwargs ): @@ -129,20 +129,20 @@ def read_parquet( Args: filename (string): path to file to read dataset_class (Dataset class): class to which to convert resulting Pandas df - expectations_config (string): path to great_expectations config file + expectation_suite (string): path to great_expectations expectation suite file Returns: great_expectations dataset """ df = pd.read_parquet(filename, *args, 
**kwargs) df = _convert_to_dataset_class( - df, dataset_class, expectations_config, profiler) + df, dataset_class, expectation_suite, profiler) return df def from_pandas(pandas_df, dataset_class=dataset.pandas_dataset.PandasDataset, - expectations_config=None, + expectation_suite=None, profiler=None ): """Read a Pandas data frame and return a great_expectations dataset. @@ -151,7 +151,7 @@ def from_pandas(pandas_df, pandas_df (Pandas df): Pandas data frame dataset_class (Dataset class) = dataset.pandas_dataset.PandasDataset: class to which to convert resulting Pandas df - expectations_config (string) = None: path to great_expectations config file + expectation_suite (string) = None: path to great_expectations expectation suite file profiler (profiler class) = None: The profiler that should be run on the dataset to establish a baseline expectation suite. @@ -161,34 +161,34 @@ class to which to convert resulting Pandas df return _convert_to_dataset_class( pandas_df, dataset_class, - expectations_config, + expectation_suite, profiler ) -def validate(data_asset, expectations_config=None, data_asset_name=None, data_context=None, data_asset_type=None, *args, **kwargs): - """Validate the provided data asset using the provided config""" - if expectations_config is None and data_context is None: +def validate(data_asset, expectation_suite=None, data_asset_name=None, data_context=None, data_asset_type=None, *args, **kwargs): + """Validate the provided data asset using the provided expectation suite""" + if expectation_suite is None and data_context is None: raise ValueError( - "Either an expectations config or a DataContext is required for validation.") + "Either an expectation suite or a DataContext is required for validation.") - if expectations_config is None: - logger.info("Using expectations config from DataContext.") + if expectation_suite is None: + logger.info("Using expectation suite from DataContext.") # Allow data_context to be a string, and try loading it from path in that case if isinstance(data_context, string_types): data_context = DataContext(data_context) - expectations_config = data_context.get_expectations(data_asset_name) + expectation_suite = data_context.get_expectation_suite(data_asset_name) else: - if data_asset_name in expectations_config: - logger.info("Using expectations config with name %s" % - expectations_config["data_asset_name"]) + if data_asset_name in expectation_suite: + logger.info("Using expectation suite with name %s" % + expectation_suite["data_asset_name"]) else: - logger.info("Using expectations config with no data_asset_name") + logger.info("Using expectation suite with no data_asset_name") # If the object is already a Dataset type, then this is purely a convenience method # and no conversion is needed if isinstance(data_asset, dataset.Dataset) and data_asset_type is None: - return data_asset.validate(expectations_config=expectations_config, data_context=data_context, *args, **kwargs) + return data_asset.validate(expectation_suite=expectation_suite, data_context=data_context, *args, **kwargs) elif data_asset_type is None: # Guess the GE data_asset_type based on the type of the data_asset if isinstance(data_asset, pd.DataFrame): @@ -209,7 +209,7 @@ def validate(data_asset, expectations_config=None, data_asset_name=None, data_co "The validate util method only supports validation for subtypes of the provided data_asset_type.") data_asset_ = _convert_to_dataset_class( - data_asset, data_asset_type, expectations_config) + data_asset, data_asset_type, 
expectation_suite) return data_asset_.validate(*args, data_context=data_context, **kwargs)
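A minimal sketch of how the module-level `validate` helper might be called after this rename, assuming the signature shown above; the paths and the data asset name below are hypothetical.

.. code-block:: python

    import json
    import pandas as pd
    import great_expectations as ge

    df = pd.read_csv("Titanic.csv")  # hypothetical path

    # Option 1: validate a plain pandas DataFrame against an explicit expectation suite
    with open("my_titanic_expectations.json") as f:  # hypothetical suite file
        suite = json.load(f)
    results = ge.validate(df, expectation_suite=suite)

    # Option 2: pass a DataContext (here as a project path) and a data asset name,
    # and let the context look up the expectation suite
    results = ge.validate(
        df,
        data_asset_name="my_asset",        # hypothetical data asset name
        data_context="/path/to/project",   # hypothetical project directory
    )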
diff --git a/test.html b/test.html
new file mode 100644
index 000000000000..dbc6e5621e2f
--- /dev/null
+++ b/test.html
@@ -0,0 +1,1450 @@
[test.html contents omitted: roughly 1,450 lines of autogenerated "Data documentation compiled by Great Expectations" HTML for the Titanic data set. The page documents each column (Unnamed: 0, Name, PClass, Age, Sex, Survived, SexCode) with its detected type, example values, and summary statistics such as Unexpected (%), Missing (%), Unique (%), and leading/trailing whitespace counts.]
+ + + + + \ No newline at end of file diff --git a/tests/conftest.py b/tests/conftest.py index e6994d5cc724..0570eff2d6b5 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -17,19 +17,19 @@ @pytest.fixture -def empty_expectations_config(): - config = { - 'dataset_name': "empty_config_fixture", +def empty_expectation_suite(): + expectation_suite = { + 'dataset_name': "empty_suite_fixture", 'meta': {}, 'expectations': [] } - return config + return expectation_suite @pytest.fixture -def basic_expectations_config(): - config = { - 'dataset_name': "basic_config_fixture", +def basic_expectation_suite(): + expectation_suite = { + 'dataset_name': "basic_suite_fixture", 'meta': {}, 'expectations': [ { @@ -58,7 +58,7 @@ def basic_expectations_config(): } ] } - return config + return expectation_suite @pytest.fixture @@ -126,19 +126,17 @@ def data_context(tmp_path_factory): # This data_context is *manually* created to have the config we want, vs created with DataContext.create context_path = tmp_path_factory.mktemp('data_context') context_path = str(context_path) - asset_config_path = os.path.join( - context_path, "great_expectations/expectations") - safe_mmkdir(asset_config_path, exist_ok=True) - shutil.copy("./tests/test_fixtures/great_expectations_basic.yml", - str(os.path.join(context_path, "great_expectations/great_expectations.yml"))) - shutil.copy("./tests/test_fixtures/expectations/parameterized_expectations_config_fixture.json", - str(asset_config_path)) + asset_config_path = os.path.join(context_path, "great_expectations/expectations") + safe_mmkdir(os.path.join(asset_config_path, "mydatasource/mygenerator/parameterized_expectation_suite_fixture"), exist_ok=True) + shutil.copy("./tests/test_fixtures/great_expectations_basic.yml", str(os.path.join(context_path, "great_expectations/great_expectations.yml"))) + shutil.copy("./tests/test_fixtures/expectation_suites/parameterized_expectation_suite_fixture.json", + os.path.join(asset_config_path, "mydatasource/mygenerator/parameterized_expectation_suite_fixture/default.json")) return ge.data_context.DataContext(context_path) @pytest.fixture() def filesystem_csv(tmp_path_factory): - base_dir = tmp_path_factory.mktemp('test_file_kwargs_generator') + base_dir = tmp_path_factory.mktemp('filesystem_csv') base_dir = str(base_dir) # Put a few files in the directory with open(os.path.join(base_dir, "f1.csv"), "w") as outfile: diff --git a/tests/data_context/test_data_context.py b/tests/data_context/test_data_context.py index 8681d153e5f3..e9f4965e97a4 100644 --- a/tests/data_context/test_data_context.py +++ b/tests/data_context/test_data_context.py @@ -12,14 +12,14 @@ import sqlalchemy as sa import pandas as pd +from great_expectations.exceptions import DataContextError from great_expectations.data_context import DataContext -from great_expectations.data_context.util import safe_mmkdir -# get_data_context +from great_expectations.data_context.util import safe_mmkdir, NormalizedDataAssetName from great_expectations.dataset import PandasDataset, SqlAlchemyDataset @pytest.fixture() -def parameterized_expectations_config(): +def parameterized_expectation_suite(): return { "data_asset_name": "parameterized_expectaitons_config_fixture", "data_asset_type": "Dataset", @@ -53,7 +53,8 @@ def test_validate_saves_result_inserts_run_id(empty_data_context, filesystem_csv # we should now be able to validate, and have validations saved. 
assert not_so_empty_data_context._project_config["result_store"]["filesystem"]["base_directory"] == "uncommitted/validations/" - my_batch = not_so_empty_data_context.get_batch("my_datasource", "f1") + my_batch = not_so_empty_data_context.get_batch("f1") + my_batch = not_so_empty_data_context.get_batch("my_datasource/f1") my_batch.expect_column_to_exist("a") with mock.patch("uuid.uuid1") as mock_uuid: @@ -61,7 +62,7 @@ def test_validate_saves_result_inserts_run_id(empty_data_context, filesystem_csv validation_result = my_batch.validate() with open(os.path.join(not_so_empty_data_context.get_context_root_directory(), - "great_expectations/uncommitted/validations/__autogenerated_uuid_v1__/f1.json")) as infile: + "great_expectations/uncommitted/validations/__autogenerated_uuid_v1__/my_datasource/default/f1/default.json")) as infile: saved_validation_result = json.load(infile) assert validation_result == saved_validation_result @@ -70,30 +71,28 @@ def test_list_available_data_asset_names(empty_data_context, filesystem_csv): empty_data_context.add_datasource("my_datasource", "pandas", base_directory= str(filesystem_csv)) available_asset_names = empty_data_context.get_available_data_asset_names() - assert available_asset_names == [{ - "datasource": "my_datasource", - "generators": [{ - "generator": "default", - "available_data_asset_names": set(["f1", "f2", "f3"]) - }] - }] + assert available_asset_names == { + "my_datasource": { + "default": set(["f1", "f2", "f3"]) + } + } -def test_list_expectations_configs(data_context): - assert data_context.list_expectations_configs() == ['parameterized_expectations_config_fixture'] +def test_list_expectation_suites(data_context): + assert data_context.list_expectation_suites() == ['mydatasource/mygenerator/parameterized_expectation_suite_fixture/default'] def test_get_existing_data_asset_config(data_context): - data_asset_config = data_context.get_expectations('parameterized_expectations_config_fixture') - assert data_asset_config['data_asset_name'] == 'parameterized_expectations_config_fixture' + data_asset_config = data_context.get_expectation_suite('mydatasource/mygenerator/parameterized_expectation_suite_fixture/default') + assert data_asset_config['data_asset_name'] == 'mydatasource/mygenerator/parameterized_expectation_suite_fixture/default' assert len(data_asset_config['expectations']) == 2 def test_get_new_data_asset_config(data_context): - data_asset_config = data_context.get_expectations('this_data_asset_config_does_not_exist') - assert data_asset_config['data_asset_name'] == 'this_data_asset_config_does_not_exist' + data_asset_config = data_context.get_expectation_suite('this_data_asset_config_does_not_exist') + assert data_asset_config['data_asset_name'] == 'mydatasource/mygenerator/this_data_asset_config_does_not_exist/default' assert len(data_asset_config['expectations']) == 0 def test_save_data_asset_config(data_context): - data_asset_config = data_context.get_expectations('this_data_asset_config_does_not_exist') - assert data_asset_config['data_asset_name'] == 'this_data_asset_config_does_not_exist' + data_asset_config = data_context.get_expectation_suite('this_data_asset_config_does_not_exist') + assert data_asset_config['data_asset_name'] == 'mydatasource/mygenerator/this_data_asset_config_does_not_exist/default' assert len(data_asset_config['expectations']) == 0 data_asset_config['expectations'].append({ "expectation_type": "expect_table_row_count_to_equal", @@ -101,8 +100,8 @@ def test_save_data_asset_config(data_context): "value": 10 } 
}) - data_context.save_expectations(data_asset_config) - data_asset_config_saved = data_context.get_expectations('this_data_asset_config_does_not_exist') + data_context.save_expectation_suite(data_asset_config) + data_asset_config_saved = data_context.get_expectation_suite('this_data_asset_config_does_not_exist') assert data_asset_config['expectations'] == data_asset_config_saved['expectations'] def test_register_validation_results(data_context): @@ -199,25 +198,135 @@ def test_compile(data_context): } } -def test_normalize_data_asset_names(tmp_path_factory): - base_dir = tmp_path_factory.mktemp("test_normalize_data_asset_names") - base_dir = str(base_dir) - context = DataContext.create(base_dir) - # asset_dir = context_dir.join("expectations/ds1/gen1/data_asset_1/") - # os.makedirs(asset_dir) - # with open(asset_dir("default.json"), "w") as config: - # json.dump({"data_asset_name": "data_assset_1"}, config) +def test_normalize_data_asset_names_error(data_context): + with pytest.raises(DataContextError) as exc: + data_context._normalize_data_asset_name("this/should/never/work/because/it/is/so/long") + assert "found too many components using delimiter '/'" in exc.message + + print(data_context.get_available_data_asset_names()) + print(data_context.list_expectation_suites()) + + + assert True + +def test_normalize_data_asset_names_delimiters(data_context): + data_context.data_asset_name_delimiter = '.' + assert data_context._normalize_data_asset_name("this.should.be.okay") == \ + NormalizedDataAssetName("this", "should", "be", "okay") + + data_context.data_asset_name_delimiter = '/' + assert data_context._normalize_data_asset_name("this/should/be/okay") == \ + NormalizedDataAssetName("this", "should", "be", "okay") + + with pytest.raises(DataContextError) as exc: + data_context.data_asset_name_delimiter = "$" + assert "Invalid delimiter" in exc.message + + with pytest.raises(DataContextError) as exc: + data_context.data_asset_name_delimiter = "//" + assert "Invalid delimiter" in exc.message + +def test_normalize_data_asset_names_conditions_single_name(): + pass + + + # "mydatasource/mygenerator/myasset/mypurpose" + # "notadatasource/mygenerator/myasset/mypurpose" + # "mydatasource/myasset" + # "myasset" + # # Ok if only one generator has an asset with name myasset and purpose mypurpose + # # Bad if no such generator exists or multiple generators exist + # "mydatasource/myasset/mypurpose" + + # # Ok if only one purpose exists for myasset + # "mydatasource/mygenerator/myasset" + + # mydatasource/ + # default/ + # default/ + # default.json + # myotherdatasource/ + # default/ + # default/ + # default.json + + # "mydatasource/default/default" -> ok + # "mydatasource/default" -> ok + # "mydatasource/default/default/default" -> properly normaized + # "default" -> not ok; ambiguous + + # mydatasource/ + # default/ + # default/ + # default.json + # myotherasset/ + # default.json + # mythirdasset/ + # default.json + # different_purpose.json + # myotherdatasource/ + # default/ + # default/ + # default.json + # my_other_generator/ + # default/ + # default.json + # different_purpose.json + # mythirddatasource/ + # default/ + # default/ + # default.json + # my_other_generator/ + # default/ + # default.json + # my_third_generator/ + # default/ + # default.json + + # "myotherasset" -> ok. normalize to "mydatasource/default/myotherasset/default.json" + # "mythirdasset" -> ambigous. 
both default and different_purpose are available + # "myotherdatasource/default" -> ambiguous: two generators + # "myotherdatasource/my_other_generator/default" -> ok. normalize to "myotherdatasource/my_other_generator/default/default" + # "myotherdatasource/default/default" -> ambiguous (could be other_generator/default/default or default/default/default) + # "myotherdatasource/default/different_purpose" -> ok. normalizse to "myotherdatasource/my_other_generator/default/different_purpose" + + + # NO CONFIG, but a datasource produces: + # - "mydatasource/default/myasset" + # - "mydatasource/default/myotherasset" + # - "mydatasource/myothergenerator/myasset" + # "mydatasource/myasset/mypurpose" -> ambiguous + # "mydatasource/default/myasset" -> ok + # "mydatasource/default/myotherasset" -> ok + + # - "mydatasource/myname/myname" + # "mydatasource/myname/myname" -> ok -> "mydatasurce/myname/myname/default" + + + + # - "mydatasource/myname/myname" + # - "mydatasource/myother/myname" + # "mydatasource/myname/myname" -> ambigouous. could be "mydatasource/myname/myname/default" or could be "mydatasource/myother/myname/myname" + + # NO CONFIG, but a datasource produces: + # - "mydatasource/mygenerator/myasset" + # - "mydatasource/mygenerator/myotherasset" + # - "mydatasource/myothergenerator/myasset" + # "mydatasource/myasset/mypurpose" -> ambiguous + + + # NO CONFIG, but a datasource produces + # - "mydatasource/mygenerator/myasset" + # - "mydatasource/mygenerator/myotherasset" + # "mydatasource/myasset/mypurpose" -> "mydatasource/mygenerator/myasset/mypurpose" - # assert context._normalize_data_asset_name("data_asset_1") == "ds1/gen1/data_asset_1" - # NOTE: NORMALIZATION IS CURRENTLY A NO-OP - assert context._normalize_data_asset_name("data_asset_1") == "data_asset_1" def test_list_datasources(data_context): datasources = data_context.list_datasources() assert datasources == [ { - "name": "default", + "name": "mydatasource", "type": "pandas" } ] @@ -228,7 +337,7 @@ def test_list_datasources(data_context): assert datasources == [ { - "name": "default", + "name": "mydatasource", "type": "pandas" }, { diff --git a/tests/datasource/test_batch_generators.py b/tests/datasource/test_batch_generators.py index 664bbdb11a37..fbba8a5f247e 100644 --- a/tests/datasource/test_batch_generators.py +++ b/tests/datasource/test_batch_generators.py @@ -2,6 +2,8 @@ import os +from great_expectations.exceptions import DataContextError + def test_file_kwargs_generator(data_context, filesystem_csv): base_dir = filesystem_csv @@ -9,17 +11,19 @@ def test_file_kwargs_generator(data_context, filesystem_csv): generator = datasource.get_generator("default") known_data_asset_names = datasource.get_available_data_asset_names() - assert known_data_asset_names[0]["available_data_asset_names"] == set([ + assert known_data_asset_names["default"] == set([ "f1", "f2", "f3" ]) f1_batches = [batch_kwargs for batch_kwargs in generator.get_iterator("f1")] + assert len(f1_batches) == 1 + assert "timestamp" in f1_batches[0] + del f1_batches[0]["timestamp"] assert f1_batches[0] == { "path": os.path.join(base_dir, "f1.csv") } - assert len(f1_batches) == 1 - f3_batches = [batch_kwargs for batch_kwargs in generator.get_iterator("f3")] + f3_batches = [batch_kwargs["path"] for batch_kwargs in generator.get_iterator("f3")] expected_batches = [ { "path": os.path.join(base_dir, "f3", "f3_20190101.csv") @@ -29,12 +33,13 @@ def test_file_kwargs_generator(data_context, filesystem_csv): } ] for batch in expected_batches: - assert batch in f3_batches 
+ assert batch["path"] in f3_batches assert len(f3_batches) == 2 def test_file_kwargs_generator_error(data_context, filesystem_csv): base_dir = filesystem_csv data_context.add_datasource("default", "pandas", base_directory=str(base_dir)) - with pytest.raises(IOError, match="f4"): - data_context.get_batch("default", "f4") \ No newline at end of file + with pytest.raises(DataContextError) as exc: + data_context.get_batch("f4") + assert "f4" in exc.message \ No newline at end of file diff --git a/tests/datasource/test_datasources.py b/tests/datasource/test_datasources.py index 793b35dc16a4..7c3b5901e1af 100644 --- a/tests/datasource/test_datasources.py +++ b/tests/datasource/test_datasources.py @@ -63,17 +63,17 @@ def test_create_pandas_datasource(data_context, tmp_path_factory): def test_standalone_pandas_datasource(test_folder_connection_path): datasource = PandasDatasource('PandasCSV', base_directory=test_folder_connection_path) - assert datasource.get_available_data_asset_names() == [{"generator": "default", "available_data_asset_names": {"test"}}] + assert datasource.get_available_data_asset_names() == {"default": {"test"}} manual_batch_kwargs = datasource.build_batch_kwargs(os.path.join(str(test_folder_connection_path), "test.csv")) - # Get the default (filesystem) generator + # Get the default (subdir_path) generator generator = datasource.get_generator() auto_batch_kwargs = generator.yield_batch_kwargs("test") assert manual_batch_kwargs["path"] == auto_batch_kwargs["path"] # Include some extra kwargs... - dataset = datasource.get_data_asset("test", batch_kwargs=auto_batch_kwargs, sep=",", header=0, index_col=0) + dataset = datasource.get_batch("test", batch_kwargs=auto_batch_kwargs, sep=",", header=0, index_col=0) assert isinstance(dataset, PandasDataset) assert (dataset["col_1"] == [1, 2, 3, 4, 5]).all() @@ -81,13 +81,12 @@ def test_standalone_sqlalchemy_datasource(test_db_connection_string): datasource = SqlAlchemyDatasource( 'SqlAlchemy', connection_string=test_db_connection_string, echo=False) - assert datasource.get_available_data_asset_names() == [{"generator": "default", "available_data_asset_names": {"table_1", "table_2"}}] - dataset1 = datasource.get_data_asset("table_1") - dataset2 = datasource.get_data_asset("table_2", schema='main') + assert datasource.get_available_data_asset_names() == {"default": {"table_1", "table_2"}} + dataset1 = datasource.get_batch("table_1") + dataset2 = datasource.get_batch("table_2", schema='main') assert isinstance(dataset1, SqlAlchemyDataset) assert isinstance(dataset2, SqlAlchemyDataset) - def test_create_sqlalchemy_datasource(data_context): name = "test_sqlalchemy_datasource" type_ = "sqlalchemy" @@ -155,47 +154,45 @@ def test_create_sparkdf_datasource(data_context, tmp_path_factory): assert name in data_context_config["datasources"] assert data_context_config["datasources"][name]["type"] == type_ - assert data_context_config["datasources"][name]["reader_options"]["sep"] == "|" + assert data_context_config["datasources"][name]["generators"]["default"]["reader_options"]["sep"] == "|" # Note that pipe is special in yml, so let's also check to see that it was properly serialized with open(os.path.join(data_context.get_context_root_directory(), "great_expectations/great_expectations.yml"), "r") as configfile: lines = configfile.readlines() - assert " sep: '|'\n" in lines - assert " header: false\n" in lines + assert " sep: '|'\n" in lines + assert " header: false\n" in lines def test_sqlalchemysource_templating(sqlitedb_engine): datasource = 
SqlAlchemyDatasource(engine=sqlitedb_engine) generator = datasource.get_generator() generator.add_query("test", "select 'cat' as ${col_name};") - df = datasource.get_data_asset("test", col_name="animal_name") + df = datasource.get_batch("test", col_name="animal_name") res = df.expect_column_to_exist("animal_name") assert res["success"] == True - - def test_pandas_source_readcsv(data_context, tmp_path_factory): if not PY3: # We don't specifically test py2 unicode reading since this test is about our handling of kwargs *to* read_csv pytest.skip() basedir = tmp_path_factory.mktemp('test_create_pandas_datasource') shutil.copy("./tests/test_sets/unicode.csv", basedir) - data_context.add_datasource(name="mysource", type_="pandas", read_csv_kwargs={"encoding": "utf-8"}, base_directory=str(basedir)) + data_context.add_datasource(name="mysource", type_="pandas", reader_options={"encoding": "utf-8"}, base_directory=str(basedir)) - batch = data_context.get_batch("mysource", "unicode") + batch = data_context.get_batch("mysource/unicode") assert len(batch["Μ"] == 1) assert "😁" in list(batch["Μ"]) data_context.add_datasource(name="mysource2", type_="pandas", base_directory=str(basedir)) - batch = data_context.get_batch("mysource2", "unicode") + batch = data_context.get_batch("mysource2/unicode") assert "😁" in list(batch["Μ"]) - data_context.add_datasource(name="mysource3", type_="pandas", read_csv_kwargs={"encoding": "utf-16"}, base_directory=str(basedir)) + data_context.add_datasource(name="mysource3", type_="pandas", reader_options={"encoding": "utf-16"}, base_directory=str(basedir)) with pytest.raises(UnicodeError, match="UTF-16 stream does not start with BOM"): - batch = data_context.get_batch("mysource3", "unicode") + batch = data_context.get_batch("mysource3/unicode") with pytest.raises(LookupError, match="unknown encoding: blarg"): - batch = data_context.get_batch("mysource", "unicode", encoding='blarg') + batch = data_context.get_batch("mysource/unicode", encoding='blarg') - batch = data_context.get_batch("mysource2", "unicode", encoding='utf-8') - assert "😁" in list(batch["Μ"]) \ No newline at end of file + batch = data_context.get_batch("mysource2/unicode", encoding='utf-8') + assert "😁" in list(batch["Μ"]) diff --git a/tests/test_autoinspect.py b/tests/test_autoinspect.py index 8856e7f1d466..05df3561bdbf 100644 --- a/tests/test_autoinspect.py +++ b/tests/test_autoinspect.py @@ -11,31 +11,31 @@ def test_no_autoinspection(): df = ge.dataset.PandasDataset({"a": [1, 2, 3]}, profiler=None) - config = df.get_expectations() + suite = df.get_expectation_suite() - assert len(config["expectations"]) == 0 + assert len(suite["expectations"]) == 0 def test_default_no_autoinspection(): df = ge.dataset.PandasDataset({"a": [1, 2, 3]}) - config = df.get_expectations() + suite = df.get_expectation_suite() - assert len(config["expectations"]) == 0 + assert len(suite["expectations"]) == 0 @pytest.mark.parametrize("dataset_type", CONTEXTS) def test_autoinspect_existing_dataset(dataset_type): # Get a basic dataset with no expectations df = get_dataset(dataset_type, {"a": [1, 2, 3]}, profiler=None) - config = df.get_expectations() - assert len(config["expectations"]) == 0 + suite = df.get_expectation_suite() + assert len(suite["expectations"]) == 0 # Run autoinspect df.profile(ge.profile.ColumnsExistProfiler) - config = df.get_expectations() + suite = df.get_expectation_suite() # Ensure that autoinspect worked - assert config["expectations"] == \ + assert suite["expectations"] == \ [{'expectation_type': 
'expect_column_to_exist', 'kwargs': {'column': 'a'}}] @@ -43,10 +43,10 @@ def test_autoinspect_existing_dataset(dataset_type): def test_autoinspect_columns_exist(dataset_type): df = get_dataset( dataset_type, {"a": [1, 2, 3]}, profiler=ge.profile.ColumnsExistProfiler) - config = df.get_expectations() + suite = df.get_expectation_suite() - assert len(config["expectations"]) == 1 - assert config["expectations"] == \ + assert len(suite["expectations"]) == 1 + assert suite["expectations"] == \ [{'expectation_type': 'expect_column_to_exist', 'kwargs': {'column': 'a'}}] diff --git a/tests/test_cli.py b/tests/test_cli.py index 59258a88304c..5f0198d21207 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -9,6 +9,8 @@ import json import os import shutil +import logging +import sys from ruamel.yaml import YAML yaml = YAML() yaml.default_flow_style = False @@ -36,7 +38,7 @@ def test_cli_command_entrance(): init Initialize a new Great Expectations project. profile Profile a great expectations object. render Render a great expectations object. - validate Validate a CSV file against an expectations configuration. + validate Validate a CSV file against an expectation suite. """ @@ -58,15 +60,15 @@ def test_cli_validate_help(): result = runner.invoke(cli, ["validate", "--help"]) assert result.exit_code == 0 - expected_help_message = """Usage: cli validate [OPTIONS] DATASET EXPECTATIONS_CONFIG_FILE + expected_help_message = """Usage: cli validate [OPTIONS] DATASET EXPECTATION_SUITE_FILE - Validate a CSV file against an expectations configuration. + Validate a CSV file against an expectation suite. - DATASET: Path to a file containing a CSV file to validate using the - provided expectations_config_file. + DATASET: Path to a file containing a CSV file to validate using the + provided expectation_suite_file. - EXPECTATIONS_CONFIG_FILE: Path to a file containing a valid - great_expectations expectations config to use to validate the data. + EXPECTATION_SUITE_FILE: Path to a file containing a valid + great_expectations expectations suite to use to validate the data. Options: -p, --evaluation_parameters TEXT @@ -203,22 +205,29 @@ def test_cli_init(tmp_path_factory): # assert False -def test_cli_profile(empty_data_context, filesystem_csv_2): +def test_cli_profile(empty_data_context, filesystem_csv_2, capsys): empty_data_context.add_datasource( "my_datasource", "pandas", base_directory=str(filesystem_csv_2)) not_so_empty_data_context = empty_data_context - # print(not_so_empty_data_context.get_available_data_asset_names()) - project_root_dir = not_so_empty_data_context.get_context_root_directory() # print(project_root_dir) + # For some reason, even with this logging change (which is required and done in main of the cli) + # the click cli runner does not pick up output; capsys appears to intercept it first + logger = logging.getLogger("great_expectations") + handler = logging.StreamHandler(stream=sys.stdout) + formatter = logging.Formatter( + '%(levelname)s %(message)s') + handler.setFormatter(formatter) + logger.addHandler(handler) + logger.setLevel(logging.INFO) runner = CliRunner() result = runner.invoke( cli, ["profile", "my_datasource", "-d", project_root_dir]) - # print(result) - # print(result.output) - - assert "Profiling my_datasource with BasicDatasetProfiler" in result.output - assert "Note: You will need to review and revise Expectations before using them in production." 
in result.output + captured = capsys.readouterr() + + assert "Profiling my_datasource with BasicDatasetProfiler" in captured.out + assert "Note: You will need to review and revise Expectations before using them in production." in captured.out + logger.removeHandler(handler) \ No newline at end of file diff --git a/tests/test_data_asset.py b/tests/test_data_asset.py index 6418fc42f68a..de5dd35ea291 100644 --- a/tests/test_data_asset.py +++ b/tests/test_data_asset.py @@ -24,11 +24,11 @@ def test_data_asset(self): 'z': ['hello', 'jello', 'mello'], }) - # print D._expectations_config.keys() - # print json.dumps(D._expectations_config, indent=2) + # print D._expectation_suite.keys() + # print json.dumps(D._expectation_suite, indent=2) self.assertEqual( - D._expectations_config, + D._expectation_suite, { "data_asset_name": None, "data_asset_type": "Dataset", @@ -41,7 +41,7 @@ def test_data_asset(self): self.maxDiff = None self.assertEqual( - D.get_expectations(), + D.get_expectation_suite(), { "data_asset_name": None, "data_asset_type": "Dataset", @@ -63,7 +63,7 @@ def test_expectation_meta(self): 'x', 2, 2, meta={"notes": "This expectation is for lolz."}) k = 0 self.assertEqual(result['success'], True) - config = df.get_expectations() + config = df.get_expectation_suite() for expectation_config in config['expectations']: if expectation_config['expectation_type'] == 'expect_column_median_to_be_between': k += 1 @@ -172,11 +172,11 @@ def test_get_and_save_expectation_config(self): } self.assertEqual( - df.get_expectations(), + df.get_expectation_suite(), output_config, ) - df.save_expectations(directory_name + '/temp1.json') + df.save_expectation_suite(directory_name + '/temp1.json') temp_file = open(directory_name+'/temp1.json') self.assertEqual( json.load(temp_file), @@ -245,13 +245,13 @@ def test_get_and_save_expectation_config(self): } self.assertEqual( - df.get_expectations( + df.get_expectation_suite( discard_failed_expectations=False ), output_config ) - df.save_expectations( + df.save_expectation_suite( directory_name+'/temp2.json', discard_failed_expectations=False ) @@ -317,7 +317,7 @@ def test_get_and_save_expectation_config(self): } self.assertEqual( - df.get_expectations( + df.get_expectation_suite( discard_result_format_kwargs=False, discard_include_configs_kwargs=False, discard_catch_exceptions_kwargs=False, @@ -326,7 +326,7 @@ def test_get_and_save_expectation_config(self): msg="Second Test Set" ) - df.save_expectations( + df.save_expectation_suite( directory_name+'/temp3.json', discard_result_format_kwargs=False, discard_include_configs_kwargs=False, @@ -933,7 +933,7 @@ def test_remove_expectation(self): ) self.assertEqual( - len(my_df._expectations_config.expectations), + len(my_df._expectation_suite.expectations), 8 ) @@ -942,7 +942,7 @@ def test_remove_expectation(self): None ) self.assertEqual( - len(my_df._expectations_config.expectations), + len(my_df._expectation_suite.expectations), 7 ) @@ -951,18 +951,18 @@ def test_remove_expectation(self): None ) self.assertEqual( - len(my_df._expectations_config.expectations), + len(my_df._expectation_suite.expectations), 5 ) my_df.remove_expectation(column="z", remove_multiple_matches=True), self.assertEqual( - len(my_df._expectations_config.expectations), + len(my_df._expectation_suite.expectations), 2 ) self.assertEqual( - my_df.get_expectations(discard_failed_expectations=False), + my_df.get_expectation_suite(discard_failed_expectations=False), { 'expectations': [ { @@ -1185,13 +1185,13 @@ def test_meta_version_warning(self): with 
warnings.catch_warnings(record=True) as w: warnings.simplefilter("always") - out = D.validate(expectations_config={"expectations": []}) + out = D.validate(expectation_suite={"expectations": []}) self.assertEqual(str(w[0].message), "WARNING: No great_expectations version found in configuration object.") with warnings.catch_warnings(record=True) as w: warnings.simplefilter("always") - out = D.validate(expectations_config={ + out = D.validate(expectation_suite={ "meta": {"great_expectations.__version__": "0.0.0"}, "expectations": []}) self.assertEqual(str(w[0].message), "WARNING: This configuration object was built using version 0.0.0 of great_expectations, but is currently being valided by version %s." % ge.__version__) diff --git a/tests/test_data_asset_util.py b/tests/test_data_asset_util.py index 9c77ee26ba25..6f769720a047 100644 --- a/tests/test_data_asset_util.py +++ b/tests/test_data_asset_util.py @@ -22,7 +22,7 @@ def test_recursively_convert_to_json_serializable(self): D.expect_column_kl_divergence_to_be_less_than("x", part, .6) # Dumping this JSON object verifies that everything is serializable - json.dumps(D.get_expectations(), indent=2) + json.dumps(D.get_expectation_suite(), indent=2) x = { 'w': [ diff --git a/tests/test_expectation_decorators.py b/tests/test_expectation_decorators.py index ec1abe65f60b..190627990e15 100644 --- a/tests/test_expectation_decorators.py +++ b/tests/test_expectation_decorators.py @@ -29,7 +29,7 @@ def test_expectation_decorator_build_config(self): eds.no_op_expectation() eds.no_op_value_expectation('a') - config = eds.get_expectations() + config = eds.get_expectation_suite() self.assertEqual({'expectation_type': 'no_op_expectation', 'kwargs': {}}, config['expectations'][0]) @@ -49,7 +49,7 @@ def test_expectation_decorator_meta(self): metadata = {'meta_key': 'meta_value'} eds = ExpectationOnlyDataAsset() out = eds.no_op_value_expectation('a', meta=metadata) - config = eds.get_expectations() + config = eds.get_expectation_suite() self.assertEqual({'success': True, 'meta': metadata}, diff --git a/tests/test_filedata_asset.py b/tests/test_filedata_asset.py index cfb8f52fbdbf..f762d52254c3 100644 --- a/tests/test_filedata_asset.py +++ b/tests/test_filedata_asset.py @@ -23,7 +23,7 @@ def test_autoinspect_filedata_asset(): # raise -def test_expectation_config_filedata_asset(): +def test_expectation_suite_filedata_asset(): # Load in data files file_path = './tests/test_sets/toy_data_complete.csv' @@ -42,7 +42,7 @@ def test_expectation_config_filedata_asset(): include_config=True) # Test basic config output - complete_config = f_dat.get_expectations() + complete_config = f_dat.get_expectation_suite() expected_config_expectations = [{'expectation_type':'expect_file_line_regex_match_count_to_equal', 'kwargs': {'expected_count': 3, 'regex': ',\\S', @@ -50,8 +50,8 @@ def test_expectation_config_filedata_asset(): assertDeepAlmostEqual(complete_config["expectations"], expected_config_expectations) # Include result format kwargs - complete_config2 = f_dat.get_expectations(discard_result_format_kwargs=False, - discard_failed_expectations=False) + complete_config2 = f_dat.get_expectation_suite(discard_result_format_kwargs=False, + discard_failed_expectations=False) expected_config_expectations2 = [{'expectation_type': 'expect_file_line_regex_match_count_to_equal', 'kwargs': {'expected_count': 3, 'regex': ',\\S', @@ -66,8 +66,8 @@ def test_expectation_config_filedata_asset(): assertDeepAlmostEqual(complete_config2["expectations"], expected_config_expectations2) # 
Discard Failing Expectations - complete_config3 = f_dat.get_expectations(discard_result_format_kwargs=False, - discard_failed_expectations=True) + complete_config3 = f_dat.get_expectation_suite(discard_result_format_kwargs=False, + discard_failed_expectations=True) expected_config_expectations3 = [{'expectation_type': 'expect_file_line_regex_match_count_to_equal', 'kwargs': {'expected_count': 3, diff --git a/tests/test_fixtures/expectations/parameterized_expectations_config_fixture.json b/tests/test_fixtures/expectation_suites/parameterized_expectation_suite_fixture.json similarity index 89% rename from tests/test_fixtures/expectations/parameterized_expectations_config_fixture.json rename to tests/test_fixtures/expectation_suites/parameterized_expectation_suite_fixture.json index 5f5e4bc78f3e..1b85f50ad0a2 100644 --- a/tests/test_fixtures/expectations/parameterized_expectations_config_fixture.json +++ b/tests/test_fixtures/expectation_suites/parameterized_expectation_suite_fixture.json @@ -1,5 +1,5 @@ { - "data_asset_name": "parameterized_expectations_config_fixture", + "data_asset_name": "mydataset/mygenerator/parameterized_expectation_suite_fixture/default", "data_asset_type": "Dataset", "meta": { }, diff --git a/tests/test_fixtures/great_expectations_basic.yml b/tests/test_fixtures/great_expectations_basic.yml index 321286999c95..bc410291b373 100644 --- a/tests/test_fixtures/great_expectations_basic.yml +++ b/tests/test_fixtures/great_expectations_basic.yml @@ -2,10 +2,10 @@ # It has comments that should be preserved. datasources: # For example, this one. - default: + mydatasource: type: pandas generators: # The name default is read if no datasource or generator is specified - default: - type: filesystem - base_directory: /data \ No newline at end of file + mygenerator: + type: subdir_reader + base_directory: /data diff --git a/tests/test_ge_utils.py b/tests/test_ge_utils.py index 5194574f1a43..ae6978de098c 100644 --- a/tests/test_ge_utils.py +++ b/tests/test_ge_utils.py @@ -3,40 +3,40 @@ import great_expectations as ge -def test_validate_non_dataset(file_data_asset, empty_expectations_config): +def test_validate_non_dataset(file_data_asset, empty_expectation_suite): with pytest.raises(ValueError, match=r"The validate util method only supports dataset validations"): - ge.validate(file_data_asset, empty_expectations_config, data_asset_type=ge.data_asset.FileDataAsset) + ge.validate(file_data_asset, empty_expectation_suite, data_asset_type=ge.data_asset.FileDataAsset) -def test_validate_dataset(dataset, basic_expectations_config): - res = ge.validate(dataset, basic_expectations_config) +def test_validate_dataset(dataset, basic_expectation_suite): + res = ge.validate(dataset, basic_expectation_suite) assert res["success"] == True assert res["statistics"]["evaluated_expectations"] == 4 if isinstance(dataset, ge.dataset.PandasDataset): - res = ge.validate(dataset, basic_expectations_config, data_asset_type=ge.dataset.PandasDataset) + res = ge.validate(dataset, basic_expectation_suite, data_asset_type=ge.dataset.PandasDataset) assert res["success"] == True assert res["statistics"]["evaluated_expectations"] == 4 with pytest.raises(ValueError, match=r"The validate util method only supports validation for subtypes of the provided data_asset_type"): - ge.validate(dataset, basic_expectations_config, data_asset_type=ge.dataset.SqlAlchemyDataset) + ge.validate(dataset, basic_expectation_suite, data_asset_type=ge.dataset.SqlAlchemyDataset) elif isinstance(dataset, ge.dataset.SqlAlchemyDataset): - res 
= ge.validate(dataset, basic_expectations_config, data_asset_type=ge.dataset.SqlAlchemyDataset) + res = ge.validate(dataset, basic_expectation_suite, data_asset_type=ge.dataset.SqlAlchemyDataset) assert res["success"] == True assert res["statistics"]["evaluated_expectations"] == 4 with pytest.raises(ValueError, match=r"The validate util method only supports validation for subtypes of the provided data_asset_type"): - ge.validate(dataset, basic_expectations_config, data_asset_type=ge.dataset.PandasDataset) + ge.validate(dataset, basic_expectation_suite, data_asset_type=ge.dataset.PandasDataset) elif isinstance(dataset, ge.dataset.SparkDFDataset): - res = ge.validate(dataset, basic_expectations_config, data_asset_type=ge.dataset.SparkDFDataset) + res = ge.validate(dataset, basic_expectation_suite, data_asset_type=ge.dataset.SparkDFDataset) assert res["success"] == True assert res["statistics"]["evaluated_expectations"] == 4 with pytest.raises(ValueError, match=r"The validate util method only supports validation for subtypes of the provided data_asset_type"): - ge.validate(dataset, basic_expectations_config, data_asset_type=ge.dataset.PandasDataset) + ge.validate(dataset, basic_expectation_suite, data_asset_type=ge.dataset.PandasDataset) def test_validate_using_data_context(dataset, data_context): # Before running, the data context should not have compiled parameters assert data_context._compiled == False - res = ge.validate(dataset, data_asset_name="parameterized_expectations_config_fixture", data_context=data_context) + res = ge.validate(dataset, data_asset_name="parameterized_expectation_suite_fixture", data_context=data_context) # After handling a validation result registration, it should be assert data_context._compiled == True @@ -48,14 +48,14 @@ def test_validate_using_data_context(dataset, data_context): def test_validate_using_data_context_path(dataset, data_context): data_context_path = data_context.get_context_root_directory() - res = ge.validate(dataset, data_asset_name="parameterized_expectations_config_fixture", data_context=data_context_path) + res = ge.validate(dataset, data_asset_name="parameterized_expectation_suite_fixture", data_context=data_context_path) - # We should have now found the right config with expectations to evaluate + # We should have now found the right suite with expectations to evaluate assert res["success"] == False assert res["statistics"]["evaluated_expectations"] == 2 -def test_validate_invalid_parameters(dataset, basic_expectations_config, data_context): - with pytest.raises(ValueError, match="Either an expectations config or a DataContext is required for validation."): +def test_validate_invalid_parameters(dataset, basic_expectation_suite, data_context): + with pytest.raises(ValueError, match="Either an expectation suite or a DataContext is required for validation."): ge.validate(dataset) \ No newline at end of file diff --git a/tests/test_great_expectations.py b/tests/test_great_expectations.py index 96b75e8e2c9e..c5b97812e6c6 100644 --- a/tests/test_great_expectations.py +++ b/tests/test_great_expectations.py @@ -193,11 +193,11 @@ class TestValidation(unittest.TestCase): def test_validate(self): with open("./tests/test_sets/titanic_expectations.json") as f: - my_expectations_config = json.load(f) + my_expectation_suite = json.load(f) my_df = ge.read_csv( "./tests/test_sets/Titanic.csv", - expectations_config=my_expectations_config + expectation_suite=my_expectation_suite ) my_df.set_default_expectation_argument("result_format", "COMPLETE") @@ 
-274,7 +274,7 @@ def test_validate_catch_non_existent_expectation(self): }] } results = df.validate( - expectations_config=validation_config_non_existent_expectation)['results'] + expectation_suite=validation_config_non_existent_expectation)['results'] self.assertIn( "object has no attribute 'non_existent_expectation'", @@ -301,7 +301,7 @@ def test_validate_catch_invalid_parameter(self): }] } - results = df.validate(expectations_config=validation_config_invalid_parameter)[ + results = df.validate(expectation_suite=validation_config_invalid_parameter)[ 'results'] print(results[0]['exception_info']) self.assertIn( @@ -370,20 +370,20 @@ class TestRepeatedAppendExpectation(unittest.TestCase): def test_validate(self): with open("./tests/test_sets/titanic_expectations.json") as f: - my_expectations_config = json.load(f) + my_expectation_suite = json.load(f) my_df = ge.read_csv("./tests/test_sets/Titanic.csv", profiler=ge.profile.ColumnsExistProfiler) self.assertEqual( - len(my_df.get_expectations()['expectations']), + len(my_df.get_expectation_suite()['expectations']), 7 ) # For column_expectations, _append_expectation should only replace expectations where the expetation_type AND the column match my_df.expect_column_to_exist("PClass") self.assertEqual( - len(my_df.get_expectations()['expectations']), + len(my_df.get_expectation_suite()['expectations']), 7 ) diff --git a/tests/test_parameter_substitution.py b/tests/test_parameter_substitution.py index aacec32f3344..23fb9cc984f2 100644 --- a/tests/test_parameter_substitution.py +++ b/tests/test_parameter_substitution.py @@ -1,6 +1,6 @@ """ Test the expectation decorator's ability to substitute parameters -at evaluation time, and store parameters in expectations_config +at evaluation time, and store parameters in expectation_suite """ import pytest @@ -55,13 +55,13 @@ def test_parameter_substitution(single_expectation_custom_data_asset): # Establish our expectation using that parameter result = single_expectation_custom_data_asset.expect_nothing( expectation_argument={"$PARAMETER": "upstream_dag_key"}) - config = single_expectation_custom_data_asset.get_expectations() + suite = single_expectation_custom_data_asset.get_expectation_suite() - # Ensure our value has been substituted during evaluation, and set properly in the config + # Ensure our value has been substituted during evaluation, and set properly in the suite assert result["result"]["details"]["expectation_argument"] == "upstream_dag_value" - assert config["evaluation_parameters"] == { + assert suite["evaluation_parameters"] == { "upstream_dag_key": "upstream_dag_value"} - assert config["expectations"][0]["kwargs"] == { + assert suite["expectations"][0]["kwargs"] == { "expectation_argument": {"$PARAMETER": "upstream_dag_key"}} @@ -71,12 +71,12 @@ def test_exploratory_parameter_substitution(single_expectation_custom_data_asset result = single_expectation_custom_data_asset.expect_nothing( expectation_argument={"$PARAMETER": "upstream_dag_key", "$PARAMETER.upstream_dag_key": "temporary_value"}) - config = single_expectation_custom_data_asset.get_expectations() - # Ensure our value has been substituted during evaluation, and NOT stored in the config + suite = single_expectation_custom_data_asset.get_expectation_suite() + # Ensure our value has been substituted during evaluation, and NOT stored in the suite assert result["result"]["details"]["expectation_argument"] == "temporary_value" - assert "evaluation_parameters" not in config or config["evaluation_parameters"] == { + assert 
"evaluation_parameters" not in suite or suite["evaluation_parameters"] == { } - assert config["expectations"][0]["kwargs"] == { + assert suite["expectations"][0]["kwargs"] == { "expectation_argument": {"$PARAMETER": "upstream_dag_key"}} # Evaluating the expectation without the parameter should now fail, because no parameters were set diff --git a/tests/test_profile.py b/tests/test_profile.py index ffc8f000a076..6905c97de6b8 100644 --- a/tests/test_profile.py +++ b/tests/test_profile.py @@ -58,14 +58,14 @@ def test_ColumnsExistProfiler(): def test_BasicDatasetProfiler(): toy_dataset = PandasDataset({"x": [1, 2, 3]}) - assert len(toy_dataset.get_expectations( + assert len(toy_dataset.get_expectation_suite( suppress_warnings=True)["expectations"]) == 0 expectations_config, evr_config = BasicDatasetProfiler.profile(toy_dataset) # print(json.dumps(expectations_config, indent=2)) - assert len(toy_dataset.get_expectations( + assert len(toy_dataset.get_expectation_suite( suppress_warnings=True)["expectations"]) > 0 # We should add an additional test that instantiates the batch via context, so the data_asset_name will be populated. @@ -113,14 +113,14 @@ def test_BasicDatasetProfiler_with_context(empty_data_context, filesystem_csv_2) "my_datasource", "pandas", base_directory=str(filesystem_csv_2)) not_so_empty_data_context = empty_data_context - batch = not_so_empty_data_context.get_batch("my_datasource", "f1") + batch = not_so_empty_data_context.get_batch("my_datasource/f1") expectations_config, validation_results = BasicDatasetProfiler.profile( batch) # print(batch.get_batch_kwargs()) # print(json.dumps(expectations_config, indent=2)) - assert expectations_config["data_asset_name"] == "f1" + assert expectations_config["data_asset_name"] == "my_datasource/default/f1/default" assert "BasicDatasetProfiler" in expectations_config["meta"] assert set(expectations_config["meta"]["BasicDatasetProfiler"].keys()) == { "created_by", "created_at", "batch_kwargs" @@ -134,7 +134,7 @@ def test_BasicDatasetProfiler_with_context(empty_data_context, filesystem_csv_2) print(json.dumps(validation_results, indent=2)) - assert validation_results["meta"]["data_asset_name"] == "f1" + assert validation_results["meta"]["data_asset_name"] == "my_datasource/default/f1/default" assert set(validation_results["meta"].keys()) == { "great_expectations.__version__", "data_asset_name", "run_id", "batch_kwargs" } @@ -145,13 +145,13 @@ def test_context_profiler(empty_data_context, filesystem_csv_2): "my_datasource", "pandas", base_directory=str(filesystem_csv_2)) not_so_empty_data_context = empty_data_context - assert not_so_empty_data_context.list_expectations_configs() == [] + assert not_so_empty_data_context.list_expectation_suites() == [] not_so_empty_data_context.profile_datasource("my_datasource") - print(not_so_empty_data_context.list_expectations_configs()) - assert not_so_empty_data_context.list_expectations_configs() != [] + print(not_so_empty_data_context.list_expectation_suites()) + assert not_so_empty_data_context.list_expectation_suites() != [] - profiled_expectations = not_so_empty_data_context.get_expectations('f1') + profiled_expectations = not_so_empty_data_context.get_expectation_suite('f1') print(json.dumps(profiled_expectations, indent=2)) # FIXME: REVISIT THIS TEST FOR CONTENT @@ -161,7 +161,7 @@ def test_context_profiler(empty_data_context, filesystem_csv_2): # print(json.dumps(validation_results, indent=2)) # # Note: deliberately not testing context file storage in this test. 
- # context_expectations_config = not_so_empty_data_context.get_expectations( + # context_expectations_config = not_so_empty_data_context.get_expectation_suite( # "my_datasource", "f1") # assert context_expectations_config == profiled_expectations diff --git a/tests/test_render.py b/tests/test_render.py index 60619ed7d6b3..bdf04e46ffe6 100644 --- a/tests/test_render.py +++ b/tests/test_render.py @@ -28,11 +28,11 @@ # from great_expectations import render # def test_prescriptive_expectation_renderer(self): -# expectations_config = json.load( +# expectation_suite = json.load( # open('tests/test_fixtures/rendering_fixtures/expectation_suite_3.json') # ) # results = render.view_models.PrescriptiveExpectationPageRenderer().render( -# expectations_config, +# expectation_suite, # ) # assert results != None # assert "
  • is a required field.
  • " in results @@ -75,7 +75,7 @@ # def test_render_modes(self): # # df = ge.read_csv("examples/data/Meteorite_Landings.csv") # # df.autoinspect(ge.dataset.autoinspect.pseudo_pandas_profiling) -# # expectations_list = df.get_expectations_config()["expectations"] +# # expectations_list = df.get_expectation_suite()["expectations"] # expectations_list = json.load( # open('tests/test_fixtures/rendering_fixtures/expectation_suite_3.json') diff --git a/tests/test_spark_dataset.py b/tests/test_spark_dataset.py index 5984dfc6e0fe..23b088b28f4b 100644 --- a/tests/test_spark_dataset.py +++ b/tests/test_spark_dataset.py @@ -2,10 +2,9 @@ from great_expectations.datasource import SparkDFDatasource import pytest -# context = ge.get_data_context('SparkCSV', './tests/test_sets') -context = SparkDFDatasource(base_directory="./tests/test_sets") -titanic_dataset = context.get_data_asset('Titanic.csv', header=True) -strf_dataset = context.get_data_asset('strf_test.csv', header=True) +datasource = SparkDFDatasource(base_directory="./tests/test_sets") +titanic_dataset = datasource.get_batch('Titanic.csv', header=True) +strf_dataset = datasource.get_batch('strf_test.csv', header=True) def test_expect_column_values_to_be_unique():