diff --git a/.travis.yml b/.travis.yml
index d603aa3dc669..4abd3e7e809f 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -1,3 +1,4 @@
+# dist: xenial
 language: python
 os:
 - linux
@@ -27,15 +28,19 @@ matrix:
 #    - dist: xenial
 #      python: 3.7
 #      env: PANDAS=latest
+addons:
+  postgresql: "9.4"
 services:
   - postgresql
+  - mysql
 install:
-  # - ./travis-java.sh
+# - ./travis-java.sh
  - pip install --only-binary=numpy,scipy numpy scipy
  - if [ "$PANDAS" = "latest" ]; then pip install pandas; else pip install pandas==$PANDAS; fi
  - pip install -r requirements-dev.txt
 before_script:
   - psql -c 'create database test_ci;' -U postgres
+  - mysql -u root --password="" -e 'create database test_ci;'
 script:
   - pytest --cov=great_expectations tests/
 after_success:
diff --git a/README.md b/README.md
index 21f08d1fe67f..4cbe36402208 100644
--- a/README.md
+++ b/README.md
@@ -10,19 +10,6 @@ Great Expectations
 
 *Always know what to expect from your data.*
 
-Coming soon...! (Temporary notice June 2019)
---------------------------------------------------------------------------------
-
-We're making some major revisions to the project right now, so expect a BIG update to documentation by the end of June.
-
-In the meantime, the Great Expectations Slack channel is the best place to get up-to-date information:
-
-https://tinyurl.com/great-expectations-slack
-
-Teaser: the next round of revisions doesn't change the existing behavior of Great Expectations at all, but it does add tons of new support for profiling, documenting, and deploying Expectations. It significantly raises the bar for making Great Expectations fully production-ready.
-
-
-
 What is great_expectations?
 --------------------------------------------------------------------------------
 
@@ -46,9 +33,15 @@ To get more done with data, faster. Teams use great_expectations to
 How do I get started?
 --------------------------------------------------------------------------------
 
-It's easy! Just use pip install:
+It's easy!
+First, use pip install:
+
+    $ pip install great_expectations
+
+Then run this command in the root directory of the project you want to try Great Expectations on:
 
-    $ pip install great_expectations
+    $ great_expectations init
+
 
 You can also clone the repository, which includes examples of using
 great_expectations.
diff --git a/docs/source/autoinspection.rst b/docs/source/autoinspection.rst
deleted file mode 100644
index 5090fdfbc072..000000000000
--- a/docs/source/autoinspection.rst
+++ /dev/null
@@ -1,34 +0,0 @@
-.. _autoinspection:
-
-================================================================================
-Autoinspection
-================================================================================
-
-It can be very convenient to have great expectations automatically review a \
-dataset and suggest expectations that may be appropriate. Currently, there's \
-a very basic, but easily extensible, autoinspection capability available.
-
-Dataset objects have an `autoinspect` method which allows you to provide a \
-function that will evaluate a dataset object and add expectations to it. \
-By default `autoinspect` will call the autoinspect function \
-:func:`columns_exist ` \
-which will add an `expect_column_to_exist` expectation for each column \
-currently present on the dataset.
-
-To implement additional autoinspection functions, you simply take a single \
-parameter, a Dataset, and evaluate and add expectations to that object.
-
-
-.. code-block:: python
-
-    >> import great_expectations as ge
-    >> df = ge.dataset.PandasDataset({"col": [1, 2, 3, 4, 5]})
-    >> df.autoinspect(ge.dataset.autoinspect.columns_exist)
-    >> df.get_expectations_config()
-    {'dataset_name': None,
-     'meta': {'great_expectations.__version__': '0.4.4__develop'},
-     'expectations': [
-      {'expectation_type': 'expect_column_to_exist',
-       'kwargs': {'column': 'col'}
-      }]
-    }
diff --git a/docs/source/conf.py b/docs/source/conf.py
index 55064cdb59af..fcf749763b6c 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -52,7 +52,7 @@
 # General information about the project.
 project = u'great_expectations'
-copyright = u'2018, The Great Expectations Team'
+copyright = u'2019, The Great Expectations Team'
 author = u'The Great Expectations Team'
 
 # The version info for the project you're documenting, acts as replacement for
diff --git a/docs/source/contributing.rst b/docs/source/contributing.rst
new file mode 100644
index 000000000000..5c249a281298
--- /dev/null
+++ b/docs/source/contributing.rst
@@ -0,0 +1,16 @@
+.. _contributing:
+
+Contributing
+==================
+
+.. toctree::
+   :maxdepth: 2
+
+Can I contribute?
+-----------------
+
+Absolutely. Yes, please. Start
+`here `__,
+and don't be shy with questions!
+
+
diff --git a/docs/source/core_concepts.rst b/docs/source/core_concepts.rst
new file mode 100644
index 000000000000..c0825eb8c9d7
--- /dev/null
+++ b/docs/source/core_concepts.rst
@@ -0,0 +1,14 @@
+.. _core_concepts:
+
+Core Concepts
+==================
+
+.. toctree::
+   :maxdepth: 2
+
+   /core_concepts/expectations
+   /core_concepts/validation
+   /core_concepts/data_context
+   /core_concepts/datasource
+   /core_concepts/custom_expectations
+   /core_concepts/glossary
diff --git a/docs/source/custom_expectations.rst b/docs/source/core_concepts/custom_expectations.rst
similarity index 100%
rename from docs/source/custom_expectations.rst
rename to docs/source/core_concepts/custom_expectations.rst
diff --git a/docs/source/core_concepts/data_context.rst b/docs/source/core_concepts/data_context.rst
new file mode 100644
index 000000000000..a400c4397240
--- /dev/null
+++ b/docs/source/core_concepts/data_context.rst
@@ -0,0 +1,37 @@
+.. _data_context:
+
+Data Context
+===================
+
+A DataContext represents a Great Expectations project. It organizes storage and access for
+expectation suites, datasources, notification settings, and data fixtures.
+
+The DataContext is configured via a yml file stored in a directory called great_expectations; the configuration file
+as well as managed expectation suites should be stored in version control.
+
+DataContexts use data sources you're already familiar with. Generators help introspect data stores and data execution
+frameworks (such as airflow, Nifi, dbt, or dagster) to describe and produce batches of data ready for analysis. This
+enables fetching, validation, profiling, and documentation of your data in a way that is meaningful within your
+existing infrastructure and work environment.
+
+DataContexts use a datasource-based namespace, where each accessible type of data has a three-part
+normalized *data_asset_name*, consisting of *datasource/generator/generator_asset*.
+
+- The datasource actually connects to a source of materialized data and returns Great Expectations DataAssets \
+  connected to a compute environment and ready for validation.
+
+- The generator knows how to introspect datasources and produce identifying "batch_kwargs" that define \
+  particular slices of data.
+
+- The generator_asset is a specific name -- often a table name or other name familiar to users -- that \
+  generators can slice into batches.
+
+An expectation suite is a collection of expectations ready to be applied to a batch of data. Since
+in many projects it is useful to have different expectations evaluate in different contexts--profiling
+vs. testing; warning vs. error; high vs. low compute; ML model or dashboard--suites provide a namespace
+option for selecting which expectations a DataContext returns.
+
+In many simple projects, the datasource or generator name may be omitted and the DataContext will infer
+the correct name when there is no ambiguity.
+
+Similarly, if no expectation suite name is provided, the DataContext will assume the name "default".
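+
+For illustration, here is a minimal sketch of this namespace in use. The method
+names and signatures are assumptions based on the 0.7 API described in this
+document, and the datasource and table names are placeholders:
+
+.. code-block:: python
+
+    >> import great_expectations as ge
+    >> context = ge.data_context.DataContext()
+    >> # Fully-qualified name: datasource/generator/generator_asset
+    >> batch = context.get_batch("my_datasource/default/my_table")
+    >> batch.validate()
+    >> # When there is no ambiguity, a shorthand name resolves to the same batch:
+    >> batch = context.get_batch("my_table")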
diff --git a/docs/source/core_concepts/datasource.rst b/docs/source/core_concepts/datasource.rst
new file mode 100644
index 000000000000..2fe87ef9500d
--- /dev/null
+++ b/docs/source/core_concepts/datasource.rst
@@ -0,0 +1,29 @@
+.. _datasource:
+
+Datasources
+============
+
+Datasources are responsible for connecting to data infrastructure. Each Datasource is a source
+of materialized data, such as a SQL database, S3 bucket, or local file directory.
+
+Each Datasource also provides access to Great Expectations data assets that are connected to
+a specific compute environment, such as a SQL database, a Spark cluster, or a local in-memory
+Pandas Dataframe.
+
+To bridge the gap between those worlds, Datasources interact closely with *generators* which
+are aware of a source of data and can produce identifying information, called
+"batch_kwargs", that datasources can use to get individual batches of data. They add flexibility
+in how to obtain data such as with time-based partitioning, downsampling, or other techniques
+appropriate for the datasource.
+
+For example, a generator could produce a SQL query that logically represents "rows in the Events
+table with a timestamp on February 7, 2012," which a SqlAlchemyDatasource could use to materialize
+a SqlAlchemyDataset corresponding to that batch of data and ready for validation.
+
+Opinionated DAG managers such as airflow, dbt, prefect.io, and dagster can also act as datasources
+and/or generators for a more generic datasource.
+
+See :ref:`batch_generator` for more detail about how batch generators interact with datasources and DAG runners.
+
+See datasource module docs :ref:`datasource_module` for more detail about available datasources.
+
diff --git a/docs/source/expectations.rst b/docs/source/core_concepts/expectations.rst
similarity index 96%
rename from docs/source/expectations.rst
rename to docs/source/core_concepts/expectations.rst
index 51ae3b34b700..2dc6c1966d70 100644
--- a/docs/source/expectations.rst
+++ b/docs/source/core_concepts/expectations.rst
@@ -125,13 +125,13 @@ You can also add notes or even structured metadata to expectations to describe t
 Saving Expectations
 ------------------------------------------------------------------------------
 
-At the end of your exploration, call `save_expectations` to store all Expectations from your session to your pipeline test files.
+At the end of your exploration, call `save_expectation_suite` to store all Expectations from your session to your pipeline test files.
 
 This is how you always know what to expect from your data.
 
 .. code-block:: bash
 
-    >> my_df.save_expectations_config("my_titanic_expectations.json")
+    >> my_df.save_expectation_suite("my_titanic_expectations.json")
 
 For more detail on how to control expectation output, please see :ref:`standard_arguments` and :ref:`result_format`.
diff --git a/docs/source/glossary.rst b/docs/source/core_concepts/glossary.rst
similarity index 96%
rename from docs/source/glossary.rst
rename to docs/source/core_concepts/glossary.rst
index 3b2bc37e4596..e53b2e99bc31 100644
--- a/docs/source/glossary.rst
+++ b/docs/source/core_concepts/glossary.rst
@@ -58,10 +58,12 @@ Datetime and JSON parsing
 Aggregate functions
 --------------------------------------------------------------------------------
 
+* :func:`expect_column_distinct_values_to_be_in_set `
 * :func:`expect_column_distinct_values_to_contain_set `
 * :func:`expect_column_distinct_values_to_equal_set `
 * :func:`expect_column_mean_to_be_between `
 * :func:`expect_column_median_to_be_between `
+* :func:`expect_column_quantile_values_to_be_between `
 * :func:`expect_column_stdev_to_be_between `
 * :func:`expect_column_unique_value_count_to_be_between `
 * :func:`expect_column_proportion_of_unique_values_to_be_between `
diff --git a/docs/source/validation.rst b/docs/source/core_concepts/validation.rst
similarity index 83%
rename from docs/source/validation.rst
rename to docs/source/core_concepts/validation.rst
index f43839c3b60d..cb63814f8718 100644
--- a/docs/source/validation.rst
+++ b/docs/source/core_concepts/validation.rst
@@ -4,16 +4,31 @@
 Validation
 ================================================================================
 
-Once you've constructed and stored Expectations, you can use them to validate new data.
+Once you've constructed and stored Expectations, you can use them to validate new data. Validation generates a report
+that details any specific deviations from expected values.
+
+We recommend using a :ref:`data_context` to manage expectation suites and coordinate validation across runs.
+
+
+Validation Result
+----------------------------
+
+The report contains information about:
+
+ - the overall success (the `success` field),
+ - summary statistics of the expectations (the `statistics` field), and
+ - the detailed results of each expectation (the `results` field).
+
+An example report looks like the following:
 
 .. code-block:: bash
 
     >> import json
     >> import great_expectations as ge
-    >> my_expectations_config = json.load(file("my_titanic_expectations.json"))
+    >> my_expectation_suite = json.load(file("my_titanic_expectations.json"))
     >> my_df = ge.read_csv(
          "./tests/examples/titanic.csv",
-         expectations_config=my_expectations_config
+         expectation_suite=my_expectation_suite
     )
     >> my_df.validate()
@@ -86,13 +101,6 @@ Once you've constructed and stored Expectations, you can use them to validate ne
     }
 }
 
-Calling great_expectations's ``validate`` method generates a JSON-formatted report.
-The report contains information about:
-
- - the overall sucess (the `success` field),
- - summary statistics of the expectations (the `statistics` field), and
- - the detailed results of each expectation (the `results` field).
-
 Command-line validation
 ------------------------------------------------------------------------------
 
@@ -177,10 +185,15 @@ Deployment patterns
 
 Useful deployment patterns include:
 
-* Include validation at the end of a complex data transformation, to verify that no cases were lost, duplicated, or improperly merged.
-* Include validation at the *beginning* of a script applying a machine learning model to a new batch of data, to verify that its distributed similarly to the training and testing set.
-* Automatically trigger table-level validation when new data is dropped to an FTP site or S3 bucket, and send the validation report to the uploader and bucket owner by email.
+* Include validation at the end of a complex data transformation, to verify that \
+  no cases were lost, duplicated, or improperly merged.
+* Include validation at the *beginning* of a script applying a machine learning model to a new batch of data, to \
+  verify that it's distributed similarly to the training and testing set.
+* Automatically trigger table-level validation when new data is dropped to an FTP site or S3 bucket, and send the \
+  validation report to the uploader and bucket owner by email.
 * Schedule database validation jobs using cron, then capture errors and warnings (if any) and post them to Slack.
-* Validate as part of an Airflow task: if Expectations are violated, raise an error and stop DAG propagation until the problem is resolved. Alternatively, you can implement expectations that raise warnings without halting the DAG.
+* Validate as part of an Airflow task: if Expectations are violated, raise an error and stop DAG propagation until \
+  the problem is resolved. Alternatively, you can implement expectations that raise warnings without halting the DAG \
+  (see the sketch below).
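+
+As an illustration of the Airflow pattern above, here is a minimal sketch. It assumes
+Airflow's ``PythonOperator``; the DAG object, task names, and file paths are placeholders
+rather than part of Great Expectations:
+
+.. code-block:: python
+
+    import json
+    import great_expectations as ge
+    from airflow.operators.python_operator import PythonOperator
+
+    def validate_titanic():
+        # Load a stored expectation suite and validate a fresh batch of data.
+        with open("my_titanic_expectations.json") as f:
+            my_expectation_suite = json.load(f)
+        my_df = ge.read_csv(
+            "./tests/examples/titanic.csv",
+            expectation_suite=my_expectation_suite
+        )
+        result = my_df.validate()
+        if not result["success"]:
+            # Raising here stops DAG propagation until the problem is resolved.
+            raise ValueError("Expectation validation failed.")
+
+    validate_task = PythonOperator(
+        task_id="validate_titanic",
+        python_callable=validate_titanic,
+        dag=my_dag,  # placeholder: a DAG defined elsewhere
+    )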
 
-For certain deployment patterns, it may be useful to parameterize expectations, and supply evaluation parameters at validation time. See :ref:`evaluation_parameters` for more information.
+For certain deployment patterns, it may be useful to parameterize expectations, and supply evaluation parameters at \
+validation time. See :ref:`evaluation_parameters` for more information.
diff --git a/docs/source/data_context_module.rst b/docs/source/data_context_module.rst
deleted file mode 100644
index 39d817e0c76b..000000000000
--- a/docs/source/data_context_module.rst
+++ /dev/null
@@ -1,52 +0,0 @@
-.. _data_context_module:
-
-Data Context Module
-===================
-
-.. automodule:: great_expectations.data_context
-    :members:
-    :undoc-members:
-    :show-inheritance:
-
-great_expectations.data_context.base
----------------------------------------------
-
-.. automodule:: great_expectations.data_context.base
-    :members:
-    :undoc-members:
-    :show-inheritance:
-    :exclude-members: DataContext
-
-    .. autoclass:: great_expectations.data_context.base.DataContext
-        :members:
-        :undoc-members:
-        :show-inheritance:
-
-great_expectations.data_context.PandasCSVDataContext
-----------------------------------------------------
-
-.. automodule:: great_expectations.data_context.pandas_context
-    :members:
-    :undoc-members:
-    :show-inheritance:
-    :exclude-members: PandasCSVDataContext
-
-    .. autoclass:: great_expectations.data_context.pandas_context.PandasCSVDataContext
-        :members:
-        :undoc-members:
-        :show-inheritance:
-
-great_expectations.data_context.SqlAlchemyDataContext
------------------------------------------------------
-
-.. automodule:: great_expectations.data_context.sqlalchemy_context
-    :members:
-    :undoc-members:
-    :show-inheritance:
-    :exclude-members: SqlAlchemyDataContext
-
-    .. 
autoclass:: great_expectations.data_context.sqlalchemy_context.SqlAlchemyDataContext - :members: - :undoc-members: - :show-inheritance: - diff --git a/docs/source/data_contexts.rst b/docs/source/data_contexts.rst deleted file mode 100644 index c317bc17c4c3..000000000000 --- a/docs/source/data_contexts.rst +++ /dev/null @@ -1,70 +0,0 @@ -.. _data_contexts: - -================================================================================ -Data Contexts -================================================================================ - -Data Contexts manage connections to Great Expectations Datasets. Note: data contexts -will be changed significantly during the next release of GE. - -To get a data context, simply call `get_data_context()` on the ge object: - -.. code-block:: bash - - >> import great_expectations as ge - >> options = { ## my connection options } - >> sql_context = ge.get_data_context('sqlalchemy_context', options) - - >> sql_dataset = sql_context.get_dataset('table_name') - - -There are currently four types of data contexts: - - :ref:`PandasCSVDataContext`: The PandasCSVDataContext ('PandasCSV') exposes a local directory containing files as datasets. - - :ref:`SqlAlchemyDataContext`: The SqlAlchemyDataContext ('SqlAlchemy') exposes tables from a SQL-compliant database as datasets. - - :ref:`SparkCSVDataContext`: The SparkCSVDataContext ('SparkCSV') exposes csv files accessible from a SparkSQL context. - - :ref:`SparkParquetDataContext`: The SparkParquetDataContext ('SparkParquet') exposes parquet files accessible from a SparkSQL context. - - :ref:`DatabricksTableContext`: The DatabricksTableContext ('DatabricksTable') exposes tables from a databricks notebook. - -All data contexts expose the following methods: - - list_datasets(): lists datasets available in current context - - get_dataset(dataset_name): returns a dataset with the matching name (e.g. filename or tablename) - -.. _PandasCSVDataContext: - -`PandasCSVDataContext` ----------------------- - -The `options` paramater for a PandasCSVDataContext is simply the glob pattern matching the files to be available. - - -.. _SqlAlchemyDataContext: - -`SqlAlchemyDataContext` ------------------------ - -The `options` parameter for a SqlAlchemyDataContext is the sqlalchemy connection string to connect to the database. - - -.. _SparkCSVDataContext: - -`SparkCSVDataContext` ---------------------- - -The `options` parameter for a SparkCSVDataContext is a directory from which to read a CSV file, and options to pass to the reader. - - -.. _SparkParquetDataContext: - -`SparkParquetDataContext` -------------------------- - -The `options` parameter for a SparkParquetDataContext is a directory from which to read a Parquet file, and options to pass to the reader. - - -.. _DatabricksTableContext: - -`DatabricksTableContext` ---------------------- - -The `options` parameter for a _DatabricksTableContext is a dataase from which to expose tables; get_dataset optionally also accepts -a date partition. \ No newline at end of file diff --git a/docs/source/get_in_touch.rst b/docs/source/get_in_touch.rst new file mode 100644 index 000000000000..50172e326d91 --- /dev/null +++ b/docs/source/get_in_touch.rst @@ -0,0 +1,30 @@ +.. _get_in_touch: + +Get in Touch +================== + +.. toctree:: + :maxdepth: 2 + +What's the best way to get in touch with the Great Expectations team? +--------------------------------------------------------------------- + +`Issues on +GitHub `__. 
If you have questions, comments, feature requests, etc., `opening an
+issue `__
+is definitely the best path forward.
+
+We also have a slack channel, which you can join here: https://tinyurl.com/great-expectations-slack
+
+
+Great Expectations doesn't do X. Is it right for my use case?
+-------------------------------------------------------------
+
+It depends. If you have needs that the library doesn't meet yet, please
+`upvote an existing
+issue(s) `__
+or `open a new
+issue `__
+and we'll see what we can do. Great Expectations is under active
+development, so your use case might be supported soon.
diff --git a/docs/source/getting_started.rst b/docs/source/getting_started.rst
new file mode 100644
index 000000000000..c7d38cad5e67
--- /dev/null
+++ b/docs/source/getting_started.rst
@@ -0,0 +1,40 @@
+.. _getting_started:
+
+Getting Started
+==================
+
+Requirements
+------------
+
+Great Expectations requires a Python compute environment and access to data, either locally or \
+through a database or distributed cluster. The tutorials below walk you through getting started \
+with an example project.
+
+How do I get started?
+---------------------
+
+It's easy! Just use pip install:
+
+::
+
+    $ pip install great_expectations
+
+
+Then go to the root directory of the project you want to use Great Expectations in and run:
+::
+
+    great_expectations init
+
+
+:ref:`tutorial_init`
+
+
+:ref:`tutorial_create_expectations`
+
+
+.. toctree::
+   :maxdepth: 2
+   :hidden:
+
+   tutorial_init
+   tutorial_create_expectations
diff --git a/docs/source/guides.rst b/docs/source/guides.rst
new file mode 100644
index 000000000000..c24159c3d809
--- /dev/null
+++ b/docs/source/guides.rst
@@ -0,0 +1,18 @@
+.. _guides:
+
+Guides
+==================
+
+.. toctree::
+   :maxdepth: 2
+
+   /guides/profiling
+   /guides/data_documentation
+   /guides/batch_generator
+   /guides/implemented_expectations
+   /guides/distributional_expectations
+   /guides/standard_arguments
+   /guides/result_format
+   /guides/evaluation_parameters
+   /guides/conventions
+   /guides/migrating_versions
diff --git a/docs/source/guides/batch_generator.rst b/docs/source/guides/batch_generator.rst
new file mode 100644
index 000000000000..a0e27a3a4d22
--- /dev/null
+++ b/docs/source/guides/batch_generator.rst
@@ -0,0 +1,25 @@
+.. _batch_generator:
+
+Batch Generator
+==================
+
+Batch generators produce identifying information, called "batch_kwargs", that datasources
+can use to get individual batches of data. They add flexibility in how to obtain data
+such as with time-based partitioning, downsampling, or other techniques appropriate
+for the datasource.
+
+For example, a generator could produce a SQL query that logically represents "rows in
+the Events table with a timestamp on February 7, 2012," which a SqlAlchemyDatasource
+could use to materialize a SqlAlchemyDataset corresponding to that batch of data and
+ready for validation.
+
+A batch is a sample from a data asset, sliced according to a particular rule. For
+example, an hourly slice of the Events table or “most recent `users` records.”
+
+A Batch is the primary unit of validation in the Great Expectations DataContext.
+Batches include metadata that identifies how they were constructed--the same “batch_kwargs”
+assembled by the generator. While not every datasource will enable re-fetching a
+specific batch of data, GE can store snapshots of batches or store metadata from an
+external data version control system.
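+
+As an illustration only (the exact keys are generator-specific, and the names
+below are placeholders rather than a fixed schema), the "batch_kwargs" emitted
+by a SQL-oriented generator might look like:
+
+.. code-block:: python
+
+    >> batch_kwargs = {"query": "SELECT * FROM events WHERE date = '2012-02-07'"}
+    >> batch = context.get_batch("my_db/default/events", batch_kwargs=batch_kwargs)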
+ +See :py:mod:`great_expectations.datasource.batch_generator.BatchGenerator` diff --git a/docs/source/conventions.rst b/docs/source/guides/conventions.rst similarity index 100% rename from docs/source/conventions.rst rename to docs/source/guides/conventions.rst diff --git a/docs/source/guides/data_documentation.rst b/docs/source/guides/data_documentation.rst new file mode 100644 index 000000000000..37ba7bef9be9 --- /dev/null +++ b/docs/source/guides/data_documentation.rst @@ -0,0 +1,4 @@ +.. _data_documentation: + +Data Documentation +=================== \ No newline at end of file diff --git a/docs/source/distributional_expectations.rst b/docs/source/guides/distributional_expectations.rst similarity index 89% rename from docs/source/distributional_expectations.rst rename to docs/source/guides/distributional_expectations.rst index a09976439d03..9bb6f6daab48 100644 --- a/docs/source/distributional_expectations.rst +++ b/docs/source/guides/distributional_expectations.rst @@ -19,14 +19,14 @@ parameterized distributions. The design is motivated by the following assumptions: * Encoding expectations into a simple object that allows for portable data pipeline testing is the top priority. \ - In many circumstances the loss of precision associated with "compressing" data into an expectation may be beneficial \ - because of its intentional simplicity as well as because it adds a very light layer of obfuscation over the data \ - which may align with privacy preservation goals. + In many circumstances the loss of precision associated with "compressing" data into an expectation may be beneficial \ + because of its intentional simplicity as well as because it adds a very light layer of obfuscation over the data \ + which may align with privacy preservation goals. * While it should be possible to easily extend the framework with more rigorous statistical tests, great expectations \ - should provide simple, reasonable defaults. Care should be taken in cases where robust statistical guarantees are \ - expected. + should provide simple, reasonable defaults. Care should be taken in cases where robust statistical guarantees are \ + expected. * Building and interpreting expectations should be intuitive: a more precise partition object implies a more precise \ - expectation. + expectation. .. _partition_object: @@ -91,15 +91,15 @@ Distributional expectations rely on three tests for their work. Kullback-Leibler (KL) divergence is available as an expectation for both categorical and continuous data (continuous data will be discretized according to the provided partition prior to computing divergence). Unlike KS and Chi-Squared tests which can use a p-value, you must provide a threshold for the relative entropy to use KL divergence. Further, KL divergence is not symmetric. -* :func:`expect_column_kl_divergence_to_be_less_than ` +* :func:`expect_column_kl_divergence_to_be_less_than ` For continuous data, the expect_column_bootstrapped_ks_test_p_value_to_be_greater_than expectation uses the Kolmogorov-Smirnov (KS) test, which compares the actual and expected cumulative densities of the data. Because of the partition_object's piecewise uniform approximation of the expected distribution, the test would be overly sensitive to differences when used with a sample of data of much larger than the size of the partition. The expectation consequently uses a bootstrapping method to sample the provided data with tunable specificity. 
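+
+As a sketch of using this expectation (the column name and partition values are
+placeholders, and the argument names are assumptions based on this documentation;
+a partition object pairs ``bins`` with ``weights`` that sum to one):
+
+.. code-block:: python
+
+    >> partition = {"bins": [0, 18, 35, 60, 100], "weights": [0.2, 0.3, 0.3, 0.2]}
+    >> df.expect_column_bootstrapped_ks_test_p_value_to_be_greater_than(
+           "Age", partition_object=partition, p=0.05)
+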
-* :func:`expect_column_bootstrapped_ks_test_p_value_to_be_greater_than ` +* :func:`expect_column_bootstrapped_ks_test_p_value_to_be_greater_than ` For categorical data, the expect_column_chisquare_test_p_value_to_be_greater_than expectation uses the Chi-Squared test. The provided weights are scaled to the size of the data in the tested column at the time of the test. -* :func:`expect_column_chisquare_test_p_value_to_be_greater_than ` +* :func:`expect_column_chisquare_test_p_value_to_be_greater_than ` diff --git a/docs/source/evaluation_parameters.rst b/docs/source/guides/evaluation_parameters.rst similarity index 85% rename from docs/source/evaluation_parameters.rst rename to docs/source/guides/evaluation_parameters.rst index 52ce49e29377..9cb44ba4defd 100644 --- a/docs/source/evaluation_parameters.rst +++ b/docs/source/guides/evaluation_parameters.rst @@ -24,7 +24,7 @@ value that should be used during the initial evaluation of the expectation. } You can also store parameter values in a special dictionary called evaluation_parameters that is stored in the \ -expectations_config to be available to multiple expectations or while declaring additional expectations. +expectation_suite to be available to multiple expectations or while declaring additional expectations. .. code-block:: python @@ -42,7 +42,7 @@ When validating expectations, you can provide evaluation parameters based on ups .. code-block:: python - >> my_df.validate(expectations_config=my_dag_step_config, evaluation_parameters={"upstream_row_count": upstream_row_count}) + >> my_df.validate(expectation_suite=my_dag_step_config, evaluation_parameters={"upstream_row_count": upstream_row_count}) Finally, the command-line tool also allows you to provide a JSON file that contains parameters to use during evaluation: @@ -52,4 +52,4 @@ Finally, the command-line tool also allows you to provide a JSON file that conta { "upstream_row_count": 10 } - >> great_expectations validate --evaluation_paramters=my_parameters_file.json dataset_file.csv expectations_config.json + >> great_expectations validate --evaluation_parameters=my_parameters_file.json dataset_file.csv expectation_suite.json diff --git a/docs/source/implemented_expectations.rst b/docs/source/guides/implemented_expectations.rst similarity index 54% rename from docs/source/implemented_expectations.rst rename to docs/source/guides/implemented_expectations.rst index ee8ff2aabd17..41de3691fa3c 100644 --- a/docs/source/implemented_expectations.rst +++ b/docs/source/guides/implemented_expectations.rst @@ -1,99 +1,99 @@ .. _implemented_expectations: Implemented Expectations -===================== +======================== Because Great Expectations can run against different platforms, not all expectations have been implemented for all platforms. This table details which are implemented. Note we love pull-requests to help us fill out the missing implementations! 
-+-----------------------------------------------------------------------------+----------+----------+----------+ -|**Expectations** |**Pandas**|**SQL** |**Spark** | -+-----------------------------------------------------------------------------+----------+----------+----------+ -|`expect_column_to_exist` | True | True | True | -+-----------------------------------------------------------------------------+----------+----------+----------+ -|`expect_table_columns_to_match_ordered_list` | True | True | True | -+-----------------------------------------------------------------------------+----------+----------+----------+ -|`expect_table_row_count_to_be_between` | True | True | True | -+-----------------------------------------------------------------------------+----------+----------+----------+ -|`expect_table_row_count_to_equal` | True | True | True | -+-----------------------------------------------------------------------------+----------+----------+----------+ -|`expect_column_values_to_be_unique` | True | True | True | -+-----------------------------------------------------------------------------+----------+----------+----------+ -|`expect_column_values_to_not_be_null` | True | True | True | -+-----------------------------------------------------------------------------+----------+----------+----------+ -|`expect_column_values_to_be_null` | True | True | True | -+-----------------------------------------------------------------------------+----------+----------+----------+ -|`expect_column_values_to_be_of_type` | True | False | False | -+-----------------------------------------------------------------------------+----------+----------+----------+ -|`expect_column_values_to_be_in_type_list` | True | False | False | -+-----------------------------------------------------------------------------+----------+----------+----------+ -|`expect_column_values_to_be_in_set` | True | True | True | -+-----------------------------------------------------------------------------+----------+----------+----------+ -|`expect_column_values_to_not_be_in_set` | True | True | True | -+-----------------------------------------------------------------------------+----------+----------+----------+ -|`expect_column_values_to_be_between` | True | True | False | -+-----------------------------------------------------------------------------+----------+----------+----------+ -|`expect_column_values_to_be_increasing` | True | False | False | -+-----------------------------------------------------------------------------+----------+----------+----------+ -|`expect_column_values_to_be_decreasing` | True | False | False | -+-----------------------------------------------------------------------------+----------+----------+----------+ -|`expect_column_value_lengths_to_be_between` | True | True | False | -+-----------------------------------------------------------------------------+----------+----------+----------+ -|`expect_column_value_lengths_to_equal` | True | True | True | -+-----------------------------------------------------------------------------+----------+----------+----------+ -|`expect_column_values_to_match_regex` | True | False | True | -+-----------------------------------------------------------------------------+----------+----------+----------+ -|`expect_column_values_to_not_match_regex` | True | False | True | -+-----------------------------------------------------------------------------+----------+----------+----------+ -|`expect_column_values_to_match_regex_list` | True | False | False | 
-+-----------------------------------------------------------------------------+----------+----------+----------+ -|`expect_column_values_to_not_match_regex_list` | True | False | False | -+-----------------------------------------------------------------------------+----------+----------+----------+ -|`expect_column_values_to_match_strftime_format` | True | False | False | -+-----------------------------------------------------------------------------+----------+----------+----------+ -|`expect_column_values_to_be_dateutil_parseable` | True | False | False | -+-----------------------------------------------------------------------------+----------+----------+----------+ -|`expect_column_values_to_be_json_parseable` | True | False | False | -+-----------------------------------------------------------------------------+----------+----------+----------+ -|`expect_column_values_to_match_json_schema` | True | False | False | -+-----------------------------------------------------------------------------+----------+----------+----------+ -|`expect_column_parameterized_distribution_ks_test_p_value_to_be_greater_than`| True | False | False | -+-----------------------------------------------------------------------------+----------+----------+----------+ -|`expect_column_distinct_values_to_equal_set` | True | True | True | -+-----------------------------------------------------------------------------+----------+----------+----------+ -|`expect_column_distinct_values_to_contain_set` | True | True | True | -+-----------------------------------------------------------------------------+----------+----------+----------+ -|`expect_column_mean_to_be_between` | True | True | True | -+-----------------------------------------------------------------------------+----------+----------+----------+ -|`expect_column_median_to_be_between` | True | True | False | -+-----------------------------------------------------------------------------+----------+----------+----------+ -|`expect_column_stdev_to_be_between` | True | False | True | -+-----------------------------------------------------------------------------+----------+----------+----------+ -|`expect_column_unique_value_count_to_be_between` | True | True | True | -+-----------------------------------------------------------------------------+----------+----------+----------+ -|`expect_column_proportion_of_unique_values_to_be_between` | True | True | True | -+-----------------------------------------------------------------------------+----------+----------+----------+ -|`expect_column_most_common_value_to_be_in_set` | True | False | True | -+-----------------------------------------------------------------------------+----------+----------+----------+ -|`expect_column_sum_to_be_between` | True | True | True | -+-----------------------------------------------------------------------------+----------+----------+----------+ -|`expect_column_min_to_be_between` | True | True | True | -+-----------------------------------------------------------------------------+----------+----------+----------+ -|`expect_column_max_to_be_between` | True | True | True | -+-----------------------------------------------------------------------------+----------+----------+----------+ -|`expect_column_chisquare_test_p_value_to_be_greater_than` | True | True | True | -+-----------------------------------------------------------------------------+----------+----------+----------+ -|`expect_column_bootstrapped_ks_test_p_value_to_be_greater_than` | True | False | False | 
-+-----------------------------------------------------------------------------+----------+----------+----------+ -|`expect_column_kl_divergence_to_be_less_than` | True | True | True | -+-----------------------------------------------------------------------------+----------+----------+----------+ -|`expect_column_pair_values_to_be_equal` | True | False | False | -+-----------------------------------------------------------------------------+----------+----------+----------+ -|`expect_column_pair_values_A_to_be_greater_than_B` | True | False | False | -+-----------------------------------------------------------------------------+----------+----------+----------+ -|`expect_column_pair_values_to_be_in_set` | True | False | False | -+-----------------------------------------------------------------------------+----------+----------+----------+ -|`expect_multicolumn_values_to_be_unique` | True | False | False | -+-----------------------------------------------------------------------------+----------+----------+----------+ ++-----------------------------------------------------------------------------------+----------+----------+----------+ +|**Expectations** |**Pandas**|**SQL** |**Spark** | ++-----------------------------------------------------------------------------------+----------+----------+----------+ +|`expect_column_to_exist` | True | True | True | ++-----------------------------------------------------------------------------------+----------+----------+----------+ +|`expect_table_columns_to_match_ordered_list` | True | True | True | ++-----------------------------------------------------------------------------------+----------+----------+----------+ +|`expect_table_row_count_to_be_between` | True | True | True | ++-----------------------------------------------------------------------------------+----------+----------+----------+ +|`expect_table_row_count_to_equal` | True | True | True | ++-----------------------------------------------------------------------------------+----------+----------+----------+ +|`expect_column_values_to_be_unique` | True | True | True | ++-----------------------------------------------------------------------------------+----------+----------+----------+ +|`expect_column_values_to_not_be_null` | True | True | True | ++-----------------------------------------------------------------------------------+----------+----------+----------+ +|`expect_column_values_to_be_null` | True | True | True | ++-----------------------------------------------------------------------------------+----------+----------+----------+ +|`expect_column_values_to_be_of_type` | True | True | False | ++-----------------------------------------------------------------------------------+----------+----------+----------+ +|`expect_column_values_to_be_in_type_list` | True | True | False | ++-----------------------------------------------------------------------------------+----------+----------+----------+ +|`expect_column_values_to_be_in_set` | True | True | True | ++-----------------------------------------------------------------------------------+----------+----------+----------+ +|`expect_column_values_to_not_be_in_set` | True | True | True | ++-----------------------------------------------------------------------------------+----------+----------+----------+ +|`expect_column_values_to_be_between` | True | True | True | ++-----------------------------------------------------------------------------------+----------+----------+----------+ +|`expect_column_values_to_be_increasing` 
| True | False | False | ++-----------------------------------------------------------------------------------+----------+----------+----------+ +|`expect_column_values_to_be_decreasing` | True | False | False | ++-----------------------------------------------------------------------------------+----------+----------+----------+ +|`expect_column_value_lengths_to_be_between` | True | True | True | ++-----------------------------------------------------------------------------------+----------+----------+----------+ +|`expect_column_value_lengths_to_equal` | True | True | True | ++-----------------------------------------------------------------------------------+----------+----------+----------+ +|`expect_column_values_to_match_regex` | True | False | True | ++-----------------------------------------------------------------------------------+----------+----------+----------+ +|`expect_column_values_to_not_match_regex` | True | False | True | ++-----------------------------------------------------------------------------------+----------+----------+----------+ +|`expect_column_values_to_match_regex_list` | True | False | False | ++-----------------------------------------------------------------------------------+----------+----------+----------+ +|`expect_column_values_to_not_match_regex_list` | True | False | False | ++-----------------------------------------------------------------------------------+----------+----------+----------+ +|`expect_column_values_to_match_strftime_format` | True | False | False | ++-----------------------------------------------------------------------------------+----------+----------+----------+ +|`expect_column_values_to_be_dateutil_parseable` | True | False | False | ++-----------------------------------------------------------------------------------+----------+----------+----------+ +|`expect_column_values_to_be_json_parseable` | True | False | False | ++-----------------------------------------------------------------------------------+----------+----------+----------+ +|`expect_column_values_to_match_json_schema` | True | False | False | ++-----------------------------------------------------------------------------------+----------+----------+----------+ +|`expect_column_parameterized_distribution_ks_test_p_value_to_be_greater_than` | True | False | False | ++-----------------------------------------------------------------------------------+----------+----------+----------+ +|`expect_column_distinct_values_to_equal_set` | True | True | True | ++-----------------------------------------------------------------------------------+----------+----------+----------+ +|`expect_column_distinct_values_to_contain_set` | True | True | True | ++-----------------------------------------------------------------------------------+----------+----------+----------+ +|`expect_column_mean_to_be_between` | True | True | True | ++-----------------------------------------------------------------------------------+----------+----------+----------+ +|`expect_column_median_to_be_between` | True | True | True | ++-----------------------------------------------------------------------------------+----------+----------+----------+ +|`expect_column_stdev_to_be_between` | True | False | True | ++-----------------------------------------------------------------------------------+----------+----------+----------+ +|`expect_column_unique_value_count_to_be_between` | True | True | True | 
++-----------------------------------------------------------------------------------+----------+----------+----------+ +|`expect_column_proportion_of_unique_values_to_be_between` | True | True | True | ++-----------------------------------------------------------------------------------+----------+----------+----------+ +|`expect_column_most_common_value_to_be_in_set` | True | False | True | ++-----------------------------------------------------------------------------------+----------+----------+----------+ +|`expect_column_sum_to_be_between` | True | True | True | ++-----------------------------------------------------------------------------------+----------+----------+----------+ +|`expect_column_min_to_be_between` | True | True | True | ++-----------------------------------------------------------------------------------+----------+----------+----------+ +|`expect_column_max_to_be_between` | True | True | True | ++-----------------------------------------------------------------------------------+----------+----------+----------+ +|`expect_column_chisquare_test_p_value_to_be_greater_than` | True | True | True | ++-----------------------------------------------------------------------------------+----------+----------+----------+ +|`expect_column_bootstrapped_ks_test_p_value_to_be_greater_than` | True | False | False | ++-----------------------------------------------------------------------------------+----------+----------+----------+ +|`expect_column_kl_divergence_to_be_less_than` | True | True | True | ++-----------------------------------------------------------------------------------+----------+----------+----------+ +|`expect_column_pair_values_to_be_equal` | True | False | False | ++-----------------------------------------------------------------------------------+----------+----------+----------+ +|`expect_column_pair_values_A_to_be_greater_than_B` | True | False | False | ++-----------------------------------------------------------------------------------+----------+----------+----------+ +|`expect_column_pair_values_to_be_in_set` | True | False | False | ++-----------------------------------------------------------------------------------+----------+----------+----------+ +|`expect_multicolumn_values_to_be_unique` | True | False | False | ++-----------------------------------------------------------------------------------+----------+----------+----------+ diff --git a/docs/source/guides/migrating_versions.rst b/docs/source/guides/migrating_versions.rst new file mode 100644 index 000000000000..2eb6eeec65e7 --- /dev/null +++ b/docs/source/guides/migrating_versions.rst @@ -0,0 +1,23 @@ +.. _migrating_versions: + + +Migrating between Versions +=========================== + +Great Expectations provides a warning when the currently-installed version is different from the version stored in the +expectation suite. + +Since expectation semantics are usually consistent across versions, there is little change required when upgrading +great expectations, with some exceptions noted here. + +In version 0.7, GE introduced several new features, and significantly changed the way DataContext objects work: + + - A :ref:`data_context` object manages access to expectation suites and other configuration in addition to data assets. + It provides a flexible but opinionated structure for creating and storing configuration and expectations in version + control. 
+
+ - When upgrading from prior versions, the new :ref:`datasource` objects provide the same functionality that compute-
+   environment-specific data context objects provided before, but with significantly more flexibility.
+
+ - The term "autoinspect" is no longer used directly, having been replaced by a much more flexible :ref:`profiling`
+   feature.
\ No newline at end of file
diff --git a/docs/source/guides/profiling.rst b/docs/source/guides/profiling.rst
new file mode 100644
index 000000000000..0aa1383db785
--- /dev/null
+++ b/docs/source/guides/profiling.rst
@@ -0,0 +1,27 @@
+.. _profiling:
+
+================================================================================
+Profiling
+================================================================================
+
+It can be very convenient to have great expectations automatically review a \
+dataset and suggest expectations that may be appropriate. Currently, there's \
+a very basic, but easily extensible, profiling capability available.
+
+Dataset objects have a `profile` method which allows you to provide a \
+profiler class that will evaluate a dataset object and add expectations to it.
+
+.. code-block:: python
+
+    >> import great_expectations as ge
+    >> df = ge.dataset.PandasDataset({"col": [1, 2, 3, 4, 5]})
+    >> df.profile(ge.profile.ColumnsExistProfiler)
+    >> df.get_expectation_suite()
+    {'data_asset_name': None,
+     'expectation_suite_name': None,
+     'meta': {'great_expectations.__version__': '0.7.0'},
+     'expectations': [
+      {'expectation_type': 'expect_column_to_exist',
+       'kwargs': {'column': 'col'}
+      }]
+    }
diff --git a/docs/source/result_format.rst b/docs/source/guides/result_format.rst
similarity index 100%
rename from docs/source/result_format.rst
rename to docs/source/guides/result_format.rst
diff --git a/docs/source/standard_arguments.rst b/docs/source/guides/standard_arguments.rst
similarity index 96%
rename from docs/source/standard_arguments.rst
rename to docs/source/guides/standard_arguments.rst
index 77733537e661..59b1fa396c26 100644
--- a/docs/source/standard_arguments.rst
+++ b/docs/source/guides/standard_arguments.rst
@@ -7,7 +7,7 @@ Standard arguments for expectations
 All Expectations return a json-serializable dictionary when evaluated, and share four standard (optional) arguments:
 
  - :ref:`result_format`: controls what information is returned from the evaluation of the expectation.
- - :ref:`include_config`: If true, then the expectation config itself is returned as part of the result object.
+ - :ref:`include_config`: If true, then the expectation suite itself is returned as part of the result object.
  - :ref:`catch_exceptions`: If true, execution will not fail if the Expectation encounters an error. Instead, it will \
    return success = False and provide an informative error message.
  - :ref:`meta`: allows user-supplied meta-data to be stored with an expectation.
@@ -23,7 +23,7 @@ See :ref:`result_format` for more information.
 `include_config`
 ------------------------------------------------------------------------------
 
-All Expectations accept a boolean `include_config` parameter. If true, then the expectation config itself is returned as part of the result object
+All Expectations accept a boolean `include_config` parameter. If true, then the expectation suite itself is returned as part of the result object.
 
 .. 
code-block:: bash diff --git a/docs/source/index.rst b/docs/source/index.rst index 4176578be3dc..9ee026b59784 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -10,36 +10,13 @@ Welcome to Great Expectations! :maxdepth: 2 intro - data_contexts - expectations - distributional_expectations - validation - workflow_advantages - glossary - -Advanced ------------------- - -.. toctree:: - :maxdepth: 2 - - standard_arguments - result_format - autoinspection - evaluation_parameters - custom_expectations - conventions + getting_started + core_concepts + guides + get_in_touch + contributing roadmap_changelog - implemented_expectations - -Module Docs -------------- -.. toctree:: - :maxdepth: 2 - - data_asset_module - dataset_module - data_context_module + module_docs Indices and tables ------------------ diff --git a/docs/source/intro.rst b/docs/source/intro.rst index 8512723b8bbc..c57e4a31e8db 100644 --- a/docs/source/intro.rst +++ b/docs/source/intro.rst @@ -3,6 +3,9 @@ Introduction ================== +.. toctree:: + :maxdepth: 2 + *Always know what to expect from your data.* What is great\_expectations? @@ -31,75 +34,38 @@ To get more done with data, faster. Teams use great\_expectations to - Codify assumptions used to build models when sharing with distributed teams or other analysts. -How do I get started? ---------------------- - -It's easy! Just use pip install: - -:: - - $ pip install great_expectations - -You can also clone the repository, which includes examples of using -great\_expectations. - -:: +Workflow advantages +------------------- - $ git clone https://github.com/great-expectations/great_expectations.git - $ pip install great_expectations/ +Most data science and data engineering teams end up building some form of pipeline testing, eventually. Unfortunately, many teams don't get around to it until late in the game, long after early lessons from data exploration and model development have been forgotten. -What expectations are available? --------------------------------- +In the meantime, data pipelines often become deep stacks of unverified assumptions. Mysterious (and sometimes embarrassing) bugs crop up more and more frequently. Resolving them requires painstaking exploration of upstream data, often leading to frustrating negotiations about data specs across teams. -Expectations include: +It's not unusual to see data teams grind to a halt for weeks (or even months!) to pay down accumulated pipeline debt. This work is never fun---after all, it's just data cleaning: no new products shipped; no new insights kindled. Even worse, it's re-cleaning old data that you thought you'd already dealt with. In our experience, servicing pipeline debt is one of the biggest productivity and morale killers on data teams. -- ``expect_table_row_count_to_equal`` -- ``expect_column_values_to_be_unique`` -- ``expect_column_values_to_be_in_set`` -- ``expect_column_mean_to_be_between`` -- ...and many more +We strongly believe that most of this pain is avoidable. We built Great Expectations to make it very, very simple to -Visit the `glossary of -expectations `__ -for a complete list of expectations that are currently part of the great -expectations vocabulary. +1. set up your testing framework early, +2. capture those early learnings while they're still fresh, and +3. systematically validate new data against them. -Can I contribute? ------------------ +It's the best tool we know of for managing the complexity that inevitably grows within data pipelines. 
We hope it helps you as much as it's helped us.
 
-Absolutely. Yes, please. Start
-`here `__,
-and don't be shy with questions!
+Good night and good luck!
 
-How do I learn more?
---------------------
 
-For full documentation, visit `Great Expectations on
-readthedocs.io `__.
-
-`Down with Pipeline
-Debt! `__
-explains the core philosophy behind Great Expectations. Please give it a
-read, and clap, follow, and share while you're at it.
-
-For quick, hands-on introductions to Great Expectations' key features,
-check out our walkthrough videos:
-
-- `Introduction to Great
-  Expectations `__
-- `Using Distributional
-  Expectations `__
-
-What's the best way to get in touch with the Great Expectations team?
----------------------------------------------------------------------
-
-`Issues on
-GitHub `__.
-If you have questions, comments, feature requests, etc., `opening an
-issue `__
-is definitely the best path forward.
+Use Cases
+------------
 
-We also have a slack channel, which you can join here: https://tinyurl.com/great-expectations-slack
 
+* Automating verification of new data deliveries purchased from a vendor before using them for \
+  analytics.
+* Packaging tests for whether new data meets assumptions when sharing notebooks that generate \
+  reports or analysis.
+* Making implicit knowledge explicit and reducing the number of "data integration meetings" and \
+  round-trips to subject-matter experts to elicit necessary context and requirements while building \
+  a new pipeline.
+* Discovering new unit tests based on edge cases in data.
+* ... and many more
 
 Great Expectations doesn't do X. Is it right for my use case?
diff --git a/docs/source/module_docs.rst b/docs/source/module_docs.rst
new file mode 100644
index 000000000000..fdb61d3e57df
--- /dev/null
+++ b/docs/source/module_docs.rst
@@ -0,0 +1,15 @@
+.. _module_docs:
+
+Module Docs
+==================
+
+.. toctree::
+   :maxdepth: 2
+
+   /module_docs/data_asset_module
+   /module_docs/dataset_module
+   /module_docs/data_context_module
+   /module_docs/datasource_module
+   /module_docs/generator_module
+   /module_docs/profile_module
+   /module_docs/render_module
diff --git a/docs/source/data_asset_module.rst b/docs/source/module_docs/data_asset_module.rst
similarity index 76%
rename from docs/source/data_asset_module.rst
rename to docs/source/module_docs/data_asset_module.rst
index 000599613601..508be4b01c96 100644
--- a/docs/source/data_asset_module.rst
+++ b/docs/source/module_docs/data_asset_module.rst
@@ -3,16 +3,13 @@
 Data Asset Module
 ==================================
 
-great_expectations.data_asset.base
-----------------------------------
-
-.. automodule:: great_expectations.data_asset.base
+.. automodule:: great_expectations.data_asset.data_asset
     :members:
     :undoc-members:
     :show-inheritance:
     :exclude-members: DataAsset
 
-    .. autoclass:: great_expectations.data_asset.base.DataAsset
+    .. autoclass:: great_expectations.data_asset.data_asset.DataAsset
 
 great_expectations.data_asset.file_data_asset
 ---------------------------------------------
@@ -36,7 +33,7 @@ great_expectations.data_asset.file_data_asset
 great_expectations.data_asset.util
 ----------------------------------
 
-.. automodule:: great_expectations.dataset.util
+.. 
automodule:: great_expectations.data_asset.util :members: :undoc-members: :show-inheritance: \ No newline at end of file diff --git a/docs/source/module_docs/data_context_module.rst b/docs/source/module_docs/data_context_module.rst new file mode 100644 index 000000000000..c25594e71524 --- /dev/null +++ b/docs/source/module_docs/data_context_module.rst @@ -0,0 +1,24 @@ +.. _data_context_module: + + +Data Context Module +=================== + +.. automodule:: great_expectations.data_context + :members: + :undoc-members: + :show-inheritance: + :exclude-members: DataContext + +great_expectations.data_context.DataContext +-------------------------------------------- + + .. autoclass:: great_expectations.data_context.DataContext + :members: + :undoc-members: + :show-inheritance: + +.. automodule:: great_expectations.data_context.util + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/dataset_module.rst b/docs/source/module_docs/dataset_module.rst similarity index 97% rename from docs/source/dataset_module.rst rename to docs/source/module_docs/dataset_module.rst index d8abf9198ae2..4549f7382060 100644 --- a/docs/source/dataset_module.rst +++ b/docs/source/module_docs/dataset_module.rst @@ -79,11 +79,3 @@ great_expectations.dataset.util :undoc-members: :show-inheritance: - -great_expectations.dataset.autoinspect --------------------------------------- - -.. automodule:: great_expectations.dataset.autoinspect - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/source/module_docs/datasource_module.rst b/docs/source/module_docs/datasource_module.rst new file mode 100644 index 000000000000..2facb26a1f9c --- /dev/null +++ b/docs/source/module_docs/datasource_module.rst @@ -0,0 +1,41 @@ +.. _datasource_module: + +Datasource Module +================== + +.. automodule:: great_expectations.datasource + :members: + :undoc-members: + :show-inheritance: + :exclude-members: Datasource, BatchGenerator + + .. autoclass:: great_expectations.datasource.Datasource + :members: + :undoc-members: + + +great_expectations.datasource.pandas_source +--------------------------------------------- + +.. autoclass:: great_expectations.datasource.pandas_source.PandasDatasource + :members: + :undoc-members: + :show-inheritance: + + +great_expectations.datasource.sqlalchemy_source +------------------------------------------------ + +.. autoclass:: great_expectations.datasource.sqlalchemy_source.SqlAlchemyDatasource + :members: + :undoc-members: + :show-inheritance: + + +great_expectations.datasource.spark_source +--------------------------------------------- + +.. autoclass:: great_expectations.datasource.spark_source.SparkDFDatasource + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/module_docs/generator_module.rst b/docs/source/module_docs/generator_module.rst new file mode 100644 index 000000000000..5ce7412f87e0 --- /dev/null +++ b/docs/source/module_docs/generator_module.rst @@ -0,0 +1,60 @@ +.. _generator_module: + + +Generator Module +================== + +.. automodule:: great_expectations.datasource.generator + :members: + :undoc-members: + :show-inheritance: + :exclude-members: BatchGenerator + + .. autoclass:: great_expectations.datasource.generator.batch_generator.BatchGenerator + :members: + :undoc-members: + + +great_expectations.datasource.generator.in_memory_generator.InMemoryGenerator +------------------------------------------------------------------------------- + +.. 
autoclass:: great_expectations.datasource.generator.in_memory_generator.InMemoryGenerator + :members: + :undoc-members: + :show-inheritance: + + +great_expectations.datasource.generator.query_generator.QueryGenerator +------------------------------------------------------------------------ + +.. autoclass:: great_expectations.datasource.generator.query_generator.QueryGenerator + :members: + :undoc-members: + :show-inheritance: + + +great_expectations.datasource.generator.filesystem_path_generator.SubdirReaderGenerator +---------------------------------------------------------------------------------------- + +.. autoclass:: great_expectations.datasource.generator.filesystem_path_generator.SubdirReaderGenerator + :members: + :undoc-members: + :show-inheritance: + + +great_expectations.datasource.generator.filesystem_path_generator.GlobReaderGenerator +------------------------------------------------------------------------------------- + +.. autoclass:: great_expectations.datasource.generator.filesystem_path_generator.GlobReaderGenerator + :members: + :undoc-members: + :show-inheritance: + + +great_expectations.datasource.generator.databricks_generator.DatabricksTableGenerator +--------------------------------------------------------------------------------------- + +.. autoclass:: great_expectations.datasource.generator.databricks_generator.DatabricksTableGenerator + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/module_docs/profile_module.rst b/docs/source/module_docs/profile_module.rst new file mode 100644 index 000000000000..95a32ac74529 --- /dev/null +++ b/docs/source/module_docs/profile_module.rst @@ -0,0 +1,9 @@ +.. _profile_module: + +Profile Module +================================== + +.. automodule:: great_expectations.profile + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/module_docs/render_module.rst b/docs/source/module_docs/render_module.rst new file mode 100644 index 000000000000..91e3a3e0e131 --- /dev/null +++ b/docs/source/module_docs/render_module.rst @@ -0,0 +1,19 @@ +.. _render_module: + +Render Module +================================== + +.. automodule:: great_expectations.render + :members: + :undoc-members: + :show-inheritance: + +.. automodule:: great_expectations.render.renderer + :members: + :undoc-members: + :show-inheritance: + +.. automodule:: great_expectations.render.view + :members: + :undoc-members: + :show-inheritance: \ No newline at end of file diff --git a/docs/source/roadmap_changelog.rst b/docs/source/roadmap_changelog.rst index dfce7bd5bd82..71f718b4fe93 100644 --- a/docs/source/roadmap_changelog.rst +++ b/docs/source/roadmap_changelog.rst @@ -5,20 +5,88 @@ Changelog and Roadmap Planned Features ---------------- -* Improved project initialization and data contexts * Improved variable typing * Support for non-tabular datasources (e.g. JSON, XML, AVRO) +* Conditional expectations +* Multi-batch expectations -v.0.6.1__develop +v.0.7.0__develop ________________ - - +Version 0.7 of Great Expectations is HUGE. It introduces several major new features +and a large number of improvements, including breaking API changes. + +The core vocabulary of expectations remains consistent. Upgrading to +the new version of GE will primarily require changes to code that +uses data contexts; existing expectation suites will require only changes +to top-level names. + + * Major update of Data Contexts. 
Data Contexts now offer significantly \ + more support for building and maintaining expectation suites and \ + interacting with existing pipeline systems, including providing a namespace for objects.\ + They can handle integrating, registering, and storing validation results, and + provide a namespace for data assets, making **batches** first-class citizens in GE. + Read more: :ref:`data_context` or :py:mod:`great_expectations.data_context` + + * Major refactor of autoinspect. Autoinspect is now built around a module + called "profile" which provides a class-based structure for building + expectation suites. There is no longer a default "autoinspect_func" -- + calling autoinspect requires explicitly passing the desired profiler. See :ref:`profiling` + + * New "Compile to Docs" feature produces beautiful documentation from expectations and expectation + validation reports, helping keep teams on the same page. + + * Name clarifications: we've stopped using the overloaded terms "expectations + config" and "config" and instead use "expectation suite" to refer to a + collection (or suite!) of expectations that can be used for validating a + data asset. + + - Expectation Suites include several top level keys that are useful \ + for organizing content in a data context: data_asset_name, \ + expectation_suite_name, and data_asset_type. When a data_asset is \ + validated, those keys will be placed in the `meta` key of the \ + validation result. + + * Major enhancement to the CLI tool including `init`, `render` and more flexibility with `validate` + + * Added helper notebooks to make it easy to get started. Each notebook acts as a combination of \ + tutorial and code scaffolding, to help you quickly learn best practices by applying them to \ + your own data. + + * Relaxed constraints on expectation parameter values, making it possible to declare many column + aggregate expectations in a way that is always "vacuously" true, such as + ``expect_column_values_to_be_between`` ``None`` and ``None``. This makes it possible to progressively + tighten expectations while using them as the basis for profiling results and documentation. + + * Bugfixes and improvements: + + * New expectations: + + * expect_column_quantile_values_to_be_between + * expect_column_distinct_values_to_be_in_set + + * Added support for ``head`` method on all current backends, returning a PandasDataset + * More implemented expectations for SparkDF Dataset with optimizations + + * expect_column_values_to_be_between + * expect_column_median_to_be_between + * expect_column_value_lengths_to_be_between + + * Optimized histogram fetching for SqlalchemyDataset and SparkDFDataset + * Added cross-platform internal partition method, paving path for improved profiling + * Fixed bug with outputstrftime not being honored in PandasDataset + * Fixed series naming for column value counts + * Standardized naming for expect_column_values_to_be_of_type + * Standardized and made explicit use of sample normalization in stdev calculation + * Added from_dataset helper + * Internal testing improvements + * Documentation reorganization and improvements + * Introduce custom exceptions for more detailed error logs v.0.6.1 ________________ * Re-add testing (and support) for py2 -* NOTE: Support for SqlAlchemyDataset and SparkDFDataset is enabled via optional install - (e.g. `pip install great_expectations[sqlalchemy]` or `pip install great_expectations[spark]`) +* NOTE: Support for SqlAlchemyDataset and SparkDFDataset is enabled via optional install \ + (e.g. 
``pip install great_expectations[sqlalchemy]`` or ``pip install great_expectations[spark]``) v.0.6.0 ------------ @@ -33,22 +101,21 @@ v.0.6.0 v.0.5.1 --------------- -* Fix issue where no result_format available for expect_column_values_to_be_null caused error +* **Fix** issue where no result_format available for expect_column_values_to_be_null caused error * Use vectorized computation in pandas (#443, #445; thanks @RoyalTS) v.0.5.0 ---------------- * Restructured class hierarchy to have a more generic DataAsset parent that maintains expectation logic separate \ - from the tabular organization of Dataset expectations + from the tabular organization of Dataset expectations * Added new FileDataAsset and associated expectations (#416 thanks @anhollis) * Added support for date/datetime type columns in some SQLAlchemy expectations (#413) * Added support for a multicolumn expectation, expect multicolumn values to be unique (#408) -* Optimization: You can now disable `partial_unexpected_counts` by setting the \ - `partial_unexpected_count` value to 0 in the result_format argument, and we do not compute it when it would - not be returned. (#431, thanks @eugmandel) -* Fix: Correct error in unexpected_percent computations for sqlalchemy when unexpected values exceed limit (#424) -* Fix: Pass meta object to expectation result (#415, thanks @jseeman) +* **Optimization**: You can now disable `partial_unexpected_counts` by setting the `partial_unexpected_count` value to \ + 0 in the result_format argument, and we do not compute it when it would not be returned. (#431, thanks @eugmandel) +* **Fix**: Correct error in unexpected_percent computations for sqlalchemy when unexpected values exceed limit (#424) +* **Fix**: Pass meta object to expectation result (#415, thanks @jseeman) * Add support for multicolumn expectations, with `expect_multicolumn_values_to_be_unique` as an example (#406) * Add dataset class to from_pandas to simplify using custom datasets (#404, thanks @jtilly) * Add schema support for sqlalchemy data context (#410, thanks @rahulj51) @@ -73,7 +140,7 @@ v.0.4.5 * Add support for custom schema in SqlAlchemyDataset (#370, thanks @elsander) * Use getfullargspec to avoid deprecation warnings. 
* Add expect_column_values_to_be_unique to SqlAlchemyDataset
-* Fix map expectations for categorical columns (thanks @eugmandel)
+* **Fix** map expectations for categorical columns (thanks @eugmandel)
 * Improve internal testing suite (thanks @anhollis and @ccnobbli)
 * Consistently use value_set instead of mixing value_set and values_set (thanks @njsmith8)

 v.0.4.4
 ----------------
 * Improve CLI help and set CLI return value to the number of unmet expectations
 * Add error handling for empty columns to SqlAlchemyDataset, and associated tests
-* Fix broken support for older pandas versions (#346)
-* Fix pandas deepcopy issue (#342)
+* **Fix** broken support for older pandas versions (#346)
+* **Fix** pandas deepcopy issue (#342)

 v.0.4.3
 -------
@@ -95,16 +162,18 @@ v.0.4.3
 * Add support for parameterized expectations
 * Improve support for custom expectations with better error messages (thanks @syk0saje)
 * Implement expect_column_value_lengths_to_[be_between|equal] for SQLAlchemy (thanks @ccnobbli)
-* Fix PandasDataset subclasses to inherit child class
+* **Fix** PandasDataset subclasses to inherit child class

 v.0.4.2
 -------
-* Fix bugs in expect_column_values_to_[not]_be_null: computing unexpected value percentages and handling all-null (thanks @ccnobbli)
+* **Fix** bugs in expect_column_values_to_[not]_be_null: computing unexpected value percentages and handling all-null (thanks @ccnobbli)
 * Support mysql use of Decimal type (thanks @bouke-nederstigt)
 * Add new expectation expect_column_values_to_not_match_regex_list.
+
 * Change behavior of expect_column_values_to_match_regex_list to use python re.findall in PandasDataset, relaxing \
-  matching of individuals expressions to allow matches anywhere in the string.
-* Fix documentation errors and other small errors (thanks @roblim, @ccnobbli)
+  matching of individual expressions to allow matches anywhere in the string.
+
+* **Fix** documentation errors and other small errors (thanks @roblim, @ccnobbli)

 v.0.4.1
 -------
@@ -112,7 +181,9 @@ v.0.4.1
 v.0.4.0
 -------
-* Initial implementation of data context API and SqlAlchemyDataset including implementations of the following expectations:
+* Initial implementation of data context API and SqlAlchemyDataset including implementations of the following \
+  expectations:
+
  * expect_column_to_exist
  * expect_table_row_count_to_be
  * expect_table_row_count_to_be_between
  * expect_column_values_to_be_unique
  * expect_column_values_to_not_be_null
  * expect_column_values_to_be_in_set
  * expect_column_values_to_be_between
  * expect_column_value_lengths_to_be_between
  * expect_column_mean_to_be_between
  * expect_column_median_to_be_between
  * expect_column_sum_to_be
  * expect_column_unique_value_count_to_be_between
  * expect_column_proportion_of_unique_values_to_be_between
-* Major refactor of output_format to new result_format parameter. See docs for full details.
+
+* Major refactor of output_format to new result_format parameter. See docs for full details:
+
  * exception_list and related uses of the term exception have been renamed to unexpected
-  * the output formats are explicitly hierarchical now, with BOOLEAN_ONLY < BASIC < SUMMARY < COMPLETE. `column_aggregate_expectation`s now return element count and related information included at the BASIC level or higher.
+  * Output formats are explicitly hierarchical now, with BOOLEAN_ONLY < BASIC < SUMMARY < COMPLETE. \
+    All *column_aggregate_expectation* expectations now return element count and related information included at the \
+    BASIC level or higher (see the sketch below).
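+
+  A minimal sketch of these result formats (the column and data are hypothetical)::
+
+      import great_expectations as ge
+
+      df = ge.dataset.PandasDataset({"x": [1, 2, None]})
+
+      # BOOLEAN_ONLY returns only the success flag
+      df.expect_column_values_to_not_be_null("x", result_format="BOOLEAN_ONLY")
+
+      # SUMMARY adds element counts and partial lists of unexpected values
+      df.expect_column_values_to_not_be_null("x", result_format="SUMMARY")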
+ +* New expectation available for parameterized distributions--\ + expect_column_parameterized_distribution_ks_test_p_value_to_be_greater_than (what a name! :) -- (thanks @ccnobbli) * ge.from_pandas() utility (thanks @schrockn) * Pandas operations on a PandasDataset now return another PandasDataset (thanks @dlwhite5) * expect_column_to_exist now takes a column_index parameter to specify column order (thanks @louispotok) @@ -144,7 +221,7 @@ v.0.3.2 v.0.3.1 -------- -* Fix infinite recursion error when building custom expectations +* **Fix** infinite recursion error when building custom expectations * Catch dateutil parsing overflow errors v.0.2 diff --git a/docs/source/tutorial_create_expectations.rst b/docs/source/tutorial_create_expectations.rst new file mode 100644 index 000000000000..920b3896d13f --- /dev/null +++ b/docs/source/tutorial_create_expectations.rst @@ -0,0 +1,60 @@ +.. _tutorial_create_expectations: + +Tutorial - Create Expectations +============================== + +.. toctree:: + :maxdepth: 2 + + +Get DataContext object +----------------------- +TODO: content!!! + +What expectations are available? +-------------------------------- + +Expectations include: + +- ``expect_table_row_count_to_equal`` +- ``expect_column_values_to_be_unique`` +- ``expect_column_values_to_be_in_set`` +- ``expect_column_mean_to_be_between`` +- ...and many more + +Visit the `glossary of +expectations `__ +for a complete list of expectations that are currently part of the great +expectations vocabulary. + + +How do I learn more? +-------------------- + +For full documentation, visit `Great Expectations on +readthedocs.io `__. + +`Down with Pipeline +Debt! `__ +explains the core philosophy behind Great Expectations. Please give it a +read, and clap, follow, and share while you're at it. + +For quick, hands-on introductions to Great Expectations' key features, +check out our walkthrough videos: + +- `Introduction to Great + Expectations `__ +- `Using Distributional + Expectations `__ + + +Great Expectations doesn't do X. Is it right for my use case? +------------------------------------------------------------- + +It depends. If you have needs that the library doesn't meet yet, please +`upvote an existing +issue(s) `__ +or `open a new +issue `__ +and we'll see what we can do. Great Expectations is under active +development, so your use case might be supported soon. diff --git a/docs/source/tutorial_init.rst b/docs/source/tutorial_init.rst new file mode 100644 index 000000000000..7bcae302e055 --- /dev/null +++ b/docs/source/tutorial_init.rst @@ -0,0 +1,26 @@ +.. _tutorial_init: + +Tutorial - great_expectations init +================================== + +.. toctree:: + :maxdepth: 2 + +What can you do with great expectations + +What is a datasource? + +(data source -> datasource) + +"candidate expectations" language goes away + +What is profiling? + - general, docs; what can you do with profiling (create expectations, docs) + +Why profile? + +How to do it with sampling? + +Drop "as of 0.7.0"; drop "as a backup option" replace with, "if you're not ready now, visit ... for more information about +sampling and profiling" + diff --git a/docs/source/workflow_advantages.rst b/docs/source/workflow_advantages.rst deleted file mode 100644 index 88ad9f90b12a..000000000000 --- a/docs/source/workflow_advantages.rst +++ /dev/null @@ -1,23 +0,0 @@ -.. 
_workflow_advantages: - -================================================================================ -Workflow advantages -================================================================================ - -Most data science and data engineering teams end up building some form of pipeline testing, eventually. Unfortunately, many teams don't get around to it until late in the game, long after early lessons from data exploration and model development have been forgotten. - -In the meantime, data pipelines often become deep stacks of unverified assumptions. Mysterious (and sometimes embarrassing) bugs crop up more and more frequently. Resolving them requires painstaking exploration of upstream data, often leading to frustrating negotiations about data specs across teams. - -It's not unusual to see data teams grind to a halt for weeks (or even months!) to pay down accumulated pipeline debt. This work is never fun---after all, it's just data cleaning: no new products shipped; no new insights kindled. Even worse, it's re-cleaning old data that you thought you'd already dealt with. In our experience, servicing pipeline debt is one of the biggest productivity and morale killers on data teams. - -We strongly believe that most of this pain is avoidable. We built Great Expectations to make it very, very simple to - -1. set up your testing framework early, -2. capture those early learnings while they're still fresh, and -3. systematically validate new data against them. - -It's the best tool we know of for managing the complexity that inevitably grows within data pipelines. We hope it helps you as much as it's helped us. - -Good night and good luck! - - diff --git a/examples/integrations/airflow/hooks/db_hook.py b/examples/integrations/airflow/hooks/db_hook.py index 737aed399158..e1e571d6178a 100644 --- a/examples/integrations/airflow/hooks/db_hook.py +++ b/examples/integrations/airflow/hooks/db_hook.py @@ -1,6 +1,11 @@ import great_expectations as ge from airflow.hooks.mysql_hook import MySqlHook +#### +# +# NOTE: this code has not been updated for the new GE 0.7 naming conventions +# +#### class ExpectationMySQLHook(MySqlHook): diff --git a/examples/integrations/airflow/hooks/s3_csv_hook.py b/examples/integrations/airflow/hooks/s3_csv_hook.py index e0e69e557802..f6ed8e83f57d 100644 --- a/examples/integrations/airflow/hooks/s3_csv_hook.py +++ b/examples/integrations/airflow/hooks/s3_csv_hook.py @@ -5,6 +5,11 @@ from airflow.hooks.S3_hook import S3Hook import great_expectations as ge +#### +# +# NOTE: this code has not been updated for the new GE 0.7 naming conventions +# +#### class ExpectationS3CsvHook(S3Hook): diff --git a/examples/integrations/airflow/operators/expectation_operator.py b/examples/integrations/airflow/operators/expectation_operator.py index 86b0e780288b..cec6ebe34f3e 100644 --- a/examples/integrations/airflow/operators/expectation_operator.py +++ b/examples/integrations/airflow/operators/expectation_operator.py @@ -11,6 +11,11 @@ from examples.integrations.airflow.hooks.s3_csv_hook import ExpectationS3CsvHook from examples.integrations.airflow.hooks.db_hook import ExpectationMySQLHook +#### +# +# NOTE: this code has not been updated for the new GE 0.7 naming conventions +# +#### class ExpectationOperator(BaseOperator): @@ -32,7 +37,7 @@ def __init__(self, Validate provided dataset using great_expectations. 
:param dataset: Name of the dataset being loaded :type str - :param expectations_json: file pointing to expectation config or json string + :param expectations_json: file pointing to expectation suite or json string :type str :param fail_on_error: True if airflow job should fail when expectations fail :type bool @@ -137,15 +142,15 @@ def _get_dataframe(self): def _load_json(self): """ - Load expectation config based on operator parameters. If provided expectations_json is a file the config will - be loaded from this file. Otherwise we'll try to load the config as a string. + Load expectation suite based on operator parameters. If provided expectations_json is a file the suite will + be loaded from this file. Otherwise we'll try to load the expectation suite as a string. :return: """ if os.path.isfile(self.expectations_json): - self.log.info("Loading expectation config from file {file}".format(file=self.expectations_json)) + self.log.info("Loading expectation suite from file {file}".format(file=self.expectations_json)) return json.load(open(self.expectations_json)) else: - self.log.info("Loading expectation config from string") + self.log.info("Loading expectation suite from string") return json.loads(self.expectations_json) def _store_results(self, results): @@ -154,9 +159,9 @@ def _store_results(self, results): def execute(self, context): df = self._get_dataframe() - config = self._load_json() + suite = self._load_json() self.log.info("Start dataset validation for set {set}".format(set=self.dataset_name)) - results = df.validate(expectations_config=config) + results = df.validate(expectation_suite=suite) self.log.info(pformat(results)) diff --git a/examples/notebooks/Crop_Expectations_With_Reshape.ipynb b/examples/notebooks/Crop_Expectations_With_Reshape.ipynb index c23508ce4597..885676299651 100644 --- a/examples/notebooks/Crop_Expectations_With_Reshape.ipynb +++ b/examples/notebooks/Crop_Expectations_With_Reshape.ipynb @@ -166,7 +166,7 @@ }, "outputs": [], "source": [ - "print(json.dumps(df.get_expectations_config(), indent = 2))" + "print(json.dumps(df.get_expectation_suite(), indent = 2))" ] } ], diff --git a/examples/notebooks/Distributional_Expectations_Demo.ipynb b/examples/notebooks/Distributional_Expectations_Demo.ipynb index 9c8ec5c70002..4a658d9417dd 100644 --- a/examples/notebooks/Distributional_Expectations_Demo.ipynb +++ b/examples/notebooks/Distributional_Expectations_Demo.ipynb @@ -364,7 +364,7 @@ }, "outputs": [], "source": [ - "df.get_expectations_config()" + "df.get_expectation_suite()" ] }, { @@ -459,7 +459,7 @@ }, "outputs": [], "source": [ - "my_expectations = df.get_expectations_config()" + "my_expectations = df.get_expectation_suite()" ] }, { @@ -481,7 +481,7 @@ }, "outputs": [], "source": [ - "results = df_test.validate(expectations_config=my_expectations)\n", + "results = df_test.validate(expectation_suite=my_expectations)\n", "results" ] }, @@ -493,7 +493,7 @@ }, "outputs": [], "source": [ - "failures = df_test.validate(expectations_config=my_expectations, only_return_failures=True)\n", + "failures = df_test.validate(expectation_suite=my_expectations, only_return_failures=True)\n", "failures" ] }, diff --git a/examples/notebooks/explore_titanic_data.ipynb b/examples/notebooks/explore_titanic_data.ipynb index 22c7302be624..df0e41620a65 100644 --- a/examples/notebooks/explore_titanic_data.ipynb +++ b/examples/notebooks/explore_titanic_data.ipynb @@ -136,8 +136,8 @@ }, "outputs": [], "source": [ - "print json.dumps(titanic_df.get_expectations_config(), 
indent=2)\n", - "# titanic_df.save_expectations_config('titanic_expectations.json')" + "print json.dumps(titanic_df.get_expectation_suite(), indent=2)\n", + "# titanic_df.save_expectation_suite('titanic_expectations.json')" ] }, { diff --git a/great_expectations/__init__.py b/great_expectations/__init__.py index 94d1b4ec23d3..cdd3cb1d64c5 100644 --- a/great_expectations/__init__.py +++ b/great_expectations/__init__.py @@ -1,5 +1,5 @@ from .util import * from great_expectations import data_asset -from great_expectations.data_context import get_data_context +from great_expectations import data_context from .version import __version__ \ No newline at end of file diff --git a/great_expectations/cli.py b/great_expectations/cli.py deleted file mode 100755 index 5b306c4c6f6b..000000000000 --- a/great_expectations/cli.py +++ /dev/null @@ -1,133 +0,0 @@ -import json -import sys -import os -import argparse -import logging - -from great_expectations import read_csv -from great_expectations import __version__ -from great_expectations.dataset import Dataset, PandasDataset -from great_expectations.data_asset import FileDataAsset - -logger = logging.getLogger(__name__) - -def dispatch(args): - parser = argparse.ArgumentParser( - description='great_expectations command-line interface') - - subparsers = parser.add_subparsers(dest='command') - subparsers.required = True - - validate_parser = subparsers.add_parser( - 'validate', description='Validate expectations for your dataset.') - validate_parser.set_defaults(func=validate) - - validate_parser.add_argument('dataset', - help='Path to a file containing a CSV file to validate using the provided expectations_config_file.') - validate_parser.add_argument('expectations_config_file', - help='Path to a file containing a valid great_expectations expectations config to use to validate the data.') - - validate_parser.add_argument('--evaluation_parameters', '-p', default=None, - help='Path to a file containing JSON object used to evaluate parameters in expectations config.') - validate_parser.add_argument('--result_format', '-o', default="SUMMARY", - help='Result format to use when building evaluation responses.') - validate_parser.add_argument('--catch_exceptions', '-e', default=True, type=bool, - help='Specify whether to catch exceptions raised during evaluation of expectations (defaults to True).') - validate_parser.add_argument('--only_return_failures', '-f', default=False, type=bool, - help='Specify whether to only return expectations that are not met during evaluation (defaults to False).') - # validate_parser.add_argument('--no_catch_exceptions', '-e', default=True, action='store_false') - # validate_parser.add_argument('--only_return_failures', '-f', default=False, action='store_true') - custom_dataset_group = validate_parser.add_argument_group( - 'custom_dataset', description='Arguments defining a custom dataset to use for validation.') - custom_dataset_group.add_argument('--custom_dataset_module', '-m', default=None, - help='Path to a python module containing a custom dataset class.') - custom_dataset_group.add_argument('--custom_dataset_class', '-c', default=None, - help='Name of the custom dataset class to use during evaluation.') - - version_parser = subparsers.add_parser('version') - version_parser.set_defaults(func=version) - - parsed_args = parser.parse_args(args) - - return parsed_args.func(parsed_args) - - -def validate(parsed_args): - """ - Read a dataset file and validate it using a config saved in another file. 
Uses parameters defined in the dispatch - method. - - :param parsed_args: A Namespace object containing parsed arguments from the dispatch method. - :return: The number of unsucessful expectations - """ - parsed_args = vars(parsed_args) - data_set = parsed_args['dataset'] - expectations_config_file = parsed_args['expectations_config_file'] - - expectations_config = json.load(open(expectations_config_file)) - - if parsed_args["evaluation_parameters"] is not None: - evaluation_parameters = json.load( - open(parsed_args["evaluation_parameters"])) - else: - evaluation_parameters = None - - # Use a custom dataasset module and class if provided. Otherwise infer from the config. - if parsed_args["custom_dataset_module"]: - sys.path.insert(0, os.path.dirname( - parsed_args["custom_dataset_module"])) - module_name = os.path.basename( - parsed_args["custom_dataset_module"]).split('.')[0] - custom_module = __import__(module_name) - dataset_class = getattr( - custom_module, parsed_args["custom_dataset_class"]) - elif "data_asset_type" in expectations_config: - if expectations_config["data_asset_type"] == "Dataset" or expectations_config["data_asset_type"] == "PandasDataset": - dataset_class = PandasDataset - elif expectations_config["data_asset_type"].endswith("Dataset"): - logger.info("Using PandasDataset to validate dataset of type %s." % expectations_config["data_asset_type"]) - dataset_class = PandasDataset - elif expectations_config["data_asset_type"] == "FileDataAsset": - dataset_class = FileDataAsset - else: - logger.critical("Unrecognized data_asset_type %s. You may need to specifcy custom_dataset_module and custom_dataset_class." % expectations_config["data_asset_type"]) - return -1 - else: - dataset_class = PandasDataset - - if issubclass(dataset_class, Dataset): - da = read_csv(data_set, expectations_config=expectations_config, - dataset_class=dataset_class) - else: - da = dataset_class(data_set, config=expectations_config) - - result = da.validate( - evaluation_parameters=evaluation_parameters, - result_format=parsed_args["result_format"], - catch_exceptions=parsed_args["catch_exceptions"], - only_return_failures=parsed_args["only_return_failures"], - ) - - print(json.dumps(result, indent=2)) - return result['statistics']['unsuccessful_expectations'] - - -def version(parsed_args): - """ - Print the currently-running version of great expectations - """ - print(__version__) - - -def main(): - handler = logging.StreamHandler() - formatter = logging.Formatter('%(asctime)s %(name)-12s %(levelname)-8s %(message)s') - handler.setFormatter(formatter) - logger.addHandler(handler) - logger.setLevel(logging.INFO) - return_value = dispatch(sys.argv[1:]) - sys.exit(return_value) - - -if __name__ == '__main__': - main() diff --git a/great_expectations/cli/__init__.py b/great_expectations/cli/__init__.py new file mode 100755 index 000000000000..43f4372ce1dc --- /dev/null +++ b/great_expectations/cli/__init__.py @@ -0,0 +1 @@ +from .cli import main, cli diff --git a/great_expectations/cli/cli.py b/great_expectations/cli/cli.py new file mode 100644 index 000000000000..4c39f2b7ec1f --- /dev/null +++ b/great_expectations/cli/cli.py @@ -0,0 +1,231 @@ +# -*- coding: utf-8 -*- + +import click +import six +import os +import json +import logging +import sys +import warnings +warnings.filterwarnings('ignore') + +from pyfiglet import figlet_format +try: + from termcolor import colored +except ImportError: + colored = None + +from great_expectations import __version__, read_csv +from great_expectations.exceptions 
import DataContextError
+from great_expectations.dataset import Dataset, PandasDataset
+from great_expectations.data_asset import FileDataAsset
+from great_expectations.data_context import DataContext
+
+from great_expectations.render.renderer import DescriptivePageRenderer, PrescriptivePageRenderer
+from great_expectations.render.view import DescriptivePageView
+
+
+from .util import cli_message
+from .init import (
+    scaffold_directories_and_notebooks,
+    greeting_1,
+    msg_prompt_lets_begin,
+)
+from .datasource import (
+    add_datasource
+)
+
+# Take over the entire GE module logging namespace when running CLI
+logger = logging.getLogger("great_expectations")
+
+@click.group()
+@click.version_option(version=__version__)
+def cli():
+    """great_expectations command-line interface"""
+    pass
+
+
+@cli.command()
+@click.argument('dataset')
+@click.argument('expectation_suite_file')
+@click.option('--evaluation_parameters', '-p', default=None,
+              help='Path to a file containing a JSON object used to evaluate parameters in the expectation suite.')
+@click.option('--result_format', '-o', default="SUMMARY",
+              help='Result format to use when building evaluation responses.')
+@click.option('--catch_exceptions', '-e', default=True, type=bool,
+              help='Specify whether to catch exceptions raised during evaluation of expectations (defaults to True).')
+@click.option('--only_return_failures', '-f', default=False, type=bool,
+              help='Specify whether to only return expectations that are not met during evaluation \
+              (defaults to False).')
+@click.option('--custom_dataset_module', '-m', default=None,
+              help='Path to a python module containing a custom dataset class.')
+@click.option('--custom_dataset_class', '-c', default=None,
+              help='Name of the custom dataset class to use during evaluation.')
+def validate(dataset, expectation_suite_file, evaluation_parameters, result_format,
+             catch_exceptions, only_return_failures, custom_dataset_module, custom_dataset_class):
+    """Validate a CSV file against an expectation suite.
+
+    DATASET: Path to a file containing a CSV file to validate using the provided expectation_suite_file.
+
+    EXPECTATION_SUITE_FILE: Path to a file containing a valid great_expectations expectation suite to use to \
+validate the data.
+    """
+    # Read the dataset file and validate it against the expectation suite;
+    # the command exits with the number of unsuccessful expectations.
+    expectation_suite = json.load(open(expectation_suite_file))
+
+    if evaluation_parameters is not None:
+        evaluation_parameters = json.load(
+            open(evaluation_parameters, "r"))
+
+    # Use a custom data asset module and class if provided. Otherwise infer from the expectation suite
+    if custom_dataset_module:
+        sys.path.insert(0, os.path.dirname(
+            custom_dataset_module))
+        module_name = os.path.basename(
+            custom_dataset_module).split('.')[0]
+        custom_module = __import__(str(module_name))
+        dataset_class = getattr(
+            custom_module, custom_dataset_class)
+    elif "data_asset_type" in expectation_suite:
+        if (expectation_suite["data_asset_type"] == "Dataset" or
+                expectation_suite["data_asset_type"] == "PandasDataset"):
+            dataset_class = PandasDataset
+        elif expectation_suite["data_asset_type"].endswith("Dataset"):
+            logger.info("Using PandasDataset to validate dataset of type %s." 
%
+                        expectation_suite["data_asset_type"])
+            dataset_class = PandasDataset
+        elif expectation_suite["data_asset_type"] == "FileDataAsset":
+            dataset_class = FileDataAsset
+        else:
+            logger.critical("Unrecognized data_asset_type %s. You may need to specify custom_dataset_module and \
+                custom_dataset_class." % expectation_suite["data_asset_type"])
+            return -1
+    else:
+        dataset_class = PandasDataset
+
+    if issubclass(dataset_class, Dataset):
+        da = read_csv(dataset, expectation_suite=expectation_suite,
+                      dataset_class=dataset_class)
+    else:
+        da = dataset_class(dataset, config=expectation_suite)
+
+    result = da.validate(
+        evaluation_parameters=evaluation_parameters,
+        result_format=result_format,
+        catch_exceptions=catch_exceptions,
+        only_return_failures=only_return_failures,
+    )
+
+    # Note: Should this be rendered through cli_message?
+    # Probably not, on the off chance that the JSON object contains tags
+    print(json.dumps(result, indent=2))
+    sys.exit(result['statistics']['unsuccessful_expectations'])
+
+
+@cli.command()
+@click.option(
+    '--target_directory',
+    '-d',
+    default="./",
+    help='The root of the project directory where you want to initialize Great Expectations.'
+)
+def init(target_directory):
+    """Initialize a new Great Expectations project.
+
+    This guided input walks the user through setting up a project.
+
+    It scaffolds directories, sets up notebooks, creates a project file, and
+    appends to a `.gitignore` file.
+    """
+    try:
+        context = DataContext.create(target_directory)
+    except DataContextError as err:
+        logger.critical(err.message)
+        sys.exit(-1)
+
+    base_dir = os.path.join(target_directory, "great_expectations")
+
+    six.print_(colored(
+        figlet_format("Great Expectations", font="big"),
+        color="cyan"
+    ))
+
+    cli_message(greeting_1)
+
+    if not click.confirm(msg_prompt_lets_begin, default=True):
+        cli_message(
+            "OK - run great_expectations init again when ready. Exiting..."
+        )
+        exit(0)
+
+    scaffold_directories_and_notebooks(base_dir)
+    cli_message(
+        "\nDone.",
+    )
+
+    add_datasource(context)
+
+
+@cli.command()
+@click.argument('render_object')
+def render(render_object):
+    """Render a great expectations object.
+
+    RENDER_OBJECT: path to a GE object to render
+    """
+    with open(render_object, "r") as infile:
+        raw = json.load(infile)
+
+    model = DescriptivePageRenderer.render(raw)
+    # model = PrescriptivePageRenderer.render(raw)
+    print(DescriptivePageView.render(model))
+
+
+@cli.command()
+@click.argument('datasource_name')
+@click.option('--max_data_assets', '-m', default=10,
+              help='Maximum number of named data assets to profile.')
+@click.option('--profile_all_data_assets', '-A', is_flag=True, default=False,
+              help='Profile ALL data assets within the target data source. If True, this will override --max_data_assets.')
+@click.option('--target_directory', '-d', default="./great_expectations",
+              help='The root of a project directory containing a great_expectations/ config.')
+def profile(datasource_name, max_data_assets, profile_all_data_assets, target_directory):
+    """Profile a great expectations object.
+
+    datasource_name: A datasource within this GE context to profile.
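+
+    Example (the datasource name here is hypothetical; the options are those
+    defined above):
+
+        great_expectations profile my_datasource --max_data_assets 5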
+ """ + + if profile_all_data_assets: + max_data_assets = None + + # FIXME: By default, this should iterate over all datasources + context = DataContext(target_directory) + context.profile_datasource( + datasource_name, + max_data_assets=max_data_assets + ) + + +def main(): + handler = logging.StreamHandler() + # Just levelname and message Could re-add other info if we want + formatter = logging.Formatter( + ' %(message)s') + # '%(asctime)s %(name)-12s %(levelname)-8s %(message)s') + handler.setFormatter(formatter) + logger.addHandler(handler) + logger.setLevel(logging.INFO) + cli() + + +if __name__ == '__main__': + main() diff --git a/great_expectations/cli/datasource.py b/great_expectations/cli/datasource.py new file mode 100644 index 000000000000..bf3dfc51aeb1 --- /dev/null +++ b/great_expectations/cli/datasource.py @@ -0,0 +1,264 @@ +import os +import click + +from .util import cli_message +from great_expectations.render import DescriptivePageView + + +def add_datasource(context): + data_source_selection = click.prompt( + msg_prompt_choose_data_source, + type=click.Choice(["1", "2", "3", "4"]), + show_choices=False + ) + + cli_message(data_source_selection) + + if data_source_selection == "1": # pandas + print("This init script will configure a local ") + path = click.prompt( + msg_prompt_filesys_enter_base_path, + # default='/data/', + type=click.Path( + exists=False, + file_okay=False, + dir_okay=True, + readable=True + ), + show_default=True + ) + if path.startswith("./"): + path = path[2:] + + if path.endswith("/"): + basenamepath = path[:-1] + else: + basenamepath = path + + default_data_source_name = os.path.basename(basenamepath) + "__dir" + data_source_name = click.prompt( + msg_prompt_datasource_name, + default=default_data_source_name, + show_default=True + ) + + context.add_datasource(data_source_name, "pandas", base_directory=os.path.join("..", path)) + + elif data_source_selection == "2": # sqlalchemy + data_source_name = click.prompt( + msg_prompt_datasource_name, default="mydb", show_default=True) + + cli_message(msg_sqlalchemy_config_connection.format( + data_source_name)) + + drivername = click.prompt("What is the driver for the sqlalchemy connection?", default="postgres", + show_default=True) + host = click.prompt("What is the host for the sqlalchemy connection?", default="localhost", + show_default=True) + port = click.prompt("What is the port for the sqlalchemy connection?", default="5432", + show_default=True) + username = click.prompt("What is the username for the sqlalchemy connection?", default="postgres", + show_default=True) + password = click.prompt("What is the password for the sqlalchemy connection?", default="", + show_default=False, hide_input=True) + database = click.prompt("What is the database name for the sqlalchemy connection?", default="postgres", + show_default=True) + + credentials = { + "drivername": drivername, + "host": host, + "port": port, + "username": username, + "password": password, + "database": database + } + context.add_profile_credentials(data_source_name, **credentials) + + context.add_datasource( + data_source_name, "sqlalchemy", profile=data_source_name) + + elif data_source_selection == "3": # Spark + path = click.prompt( + msg_prompt_filesys_enter_base_path, + default='/data/', + type=click.Path( + exists=True, + file_okay=False, + dir_okay=True, + readable=True + ), + show_default=True + ) + if path.startswith("./"): + path = path[2:] + + if path.endswith("/"): + basenamepath = path[:-1] + default_data_source_name = 
os.path.basename(basenamepath) + data_source_name = click.prompt( + msg_prompt_datasource_name, default=default_data_source_name, show_default=True) + + context.add_datasource(data_source_name, "spark", base_directory=path) + + # if data_source_selection == "5": # dbt + # dbt_profile = click.prompt(msg_prompt_dbt_choose_profile) + # log_message(msg_dbt_go_to_notebook, color="blue") + # context.add_datasource("dbt", "dbt", profile=dbt_profile) + if data_source_selection == "4": # None of the above + cli_message(msg_unknown_data_source) + print("Skipping datasource configuration. You can add a datasource later by editing the great_expectations.yml file.") + return None + + if data_source_name != None: + + cli_message( +""" +Would you like to profile '%s' to create candidate expectations and documentation? + +Please note: +As of v0.7.0, profiling is still a beta feature in Great Expectations. +This generation of profilers will evaluate the entire data source (without sampling) and may be very time consuming. +As a rule of thumb, we recommend starting with data smaller than 100MB. + +As a backup option please visit https://great-expectations.readthedocs.io/en/latest/profiling.html for instructions for profiling limited subsets within data sources. + """ % (data_source_name) + ) + if click.confirm("Proceed?", + default=True + ): + profiling_results = context.profile_datasource( + data_source_name, + max_data_assets=20 + ) + + print("\nProfiling results are saved:") + for profiling_result in profiling_results: + data_asset_name = profiling_result[1]['meta']['data_asset_name'] + expectation_suite_name = profiling_result[1]['meta']['expectation_suite_name'] + run_id = profiling_result[1]['meta']['run_id'] + + print(" {0:s}".format(context.get_validation_location(data_asset_name, expectation_suite_name, run_id)['filepath'])) + + cli_message( +""" + +To generate documentation from the data you just profiled, the profiling results should be moved from +great_expectations/uncommitted (ignored by git) to great_expectations/fixtures. Before proceeding, +make sure that this data does not contain sensitive information. + +To learn more: https://great-expectations.readthedocs.io/en/latest/intro.html#data_documentation +""" + ) + if click.confirm("Proceed?", + default = True + ): + cli_message("Rendering...") + + for profiling_result in profiling_results: + data_asset_name = profiling_result[1]['meta']['data_asset_name'] + expectation_suite_name = profiling_result[1]['meta']['expectation_suite_name'] + run_id = profiling_result[1]['meta']['run_id'] + context.move_validation_to_fixtures(data_asset_name, expectation_suite_name, run_id) + + context.render_full_static_site() + cli_message( + """ +To view the generated data documentation, start a web server: +cd great_expectations/data_documentation; python -m SimpleHTTPServer (if Python 2) or +cd great_expectations/data_documentation; python3 -m http.server (if Python 3) +and open http://localhost:8000 in your browser +""") + + else: + cli_message( + "Okay, skipping profiling for now. You can always do this later by running `great_expectations profile`." + ) + + if data_source_selection == "1": # Pandas + cli_message(msg_filesys_go_to_notebook) + + elif data_source_selection == "2": # SQL + cli_message(msg_sqlalchemy_go_to_notebook) + + elif data_source_selection == "3": # Spark + cli_message(msg_spark_go_to_notebook) + + +msg_prompt_choose_data_source = """ +Configure a data source: + 1. Pandas data frames (including local filesystem) + 2. 
Relational database (SQL)
+    3. Spark DataFrames
+    4. Skip datasource configuration
+"""
+
+# msg_prompt_dbt_choose_profile = """
+# Please specify the name of the dbt profile (from your ~/.dbt/profiles.yml file Great Expectations \
+# should use to connect to the database
+# """
+
+# msg_dbt_go_to_notebook = """
+# To create expectations for your dbt models start Jupyter and open notebook
+# great_expectations/notebooks/using_great_expectations_with_dbt.ipynb -
+# it will walk you through next steps.
+# """
+
+msg_prompt_filesys_enter_base_path = """
+Enter the path of the root directory where the data files are stored
+(the path may be either absolute or relative to current directory)
+"""
+
+msg_prompt_datasource_name = """
+Give your new data source a short name
+"""
+
+msg_sqlalchemy_config_connection = """
+Great Expectations relies on sqlalchemy to connect to relational databases.
+Please make sure that you have it installed.
+
+Next, we will configure database credentials and store them in the "{0:s}" section
+of this config file: great_expectations/uncommitted/credentials/profiles.yml:
+"""
+
+msg_unknown_data_source = """
+We are looking for more types of data sources to support.
+Please create a GitHub issue here:
+https://github.com/great-expectations/great_expectations/issues/new
+In the meantime you can see what Great Expectations can do on CSV files.
+To create expectations for your CSV files start Jupyter and open the notebook
+great_expectations/notebooks/using_great_expectations_with_pandas.ipynb -
+it will walk you through next steps.
+"""
+
+msg_filesys_go_to_notebook = """
+To create expectations for your CSV files start Jupyter and open the notebook
+that will walk you through next steps.
+
+To launch with jupyter notebooks:
+    jupyter notebook great_expectations/notebooks/create_expectations.ipynb
+
+To launch with jupyter lab:
+    jupyter lab great_expectations/notebooks/create_expectations.ipynb
+"""
+
+msg_sqlalchemy_go_to_notebook = """
+To create expectations for your SQL data assets start Jupyter and open the notebook
+that will walk you through next steps.
+
+To launch with jupyter notebooks:
+    jupyter notebook great_expectations/notebooks/create_expectations.ipynb
+
+To launch with jupyter lab:
+    jupyter lab great_expectations/notebooks/create_expectations.ipynb
+"""
+
+msg_spark_go_to_notebook = """
+To create expectations for your CSV files start Jupyter and open the notebook
+that will walk you through next steps.
+
+To launch with jupyter notebooks:
+    jupyter notebook great_expectations/notebooks/create_expectations.ipynb
+
+To launch with jupyter lab:
+    jupyter lab great_expectations/notebooks/create_expectations.ipynb
+"""
diff --git a/great_expectations/cli/init.py b/great_expectations/cli/init.py
new file mode 100644
index 000000000000..b974f3821011
--- /dev/null
+++ b/great_expectations/cli/init.py
@@ -0,0 +1,76 @@
+# -*- coding: utf-8 -*-
+
+import os
+import glob
+import shutil
+
+from great_expectations.data_context.util import safe_mmkdir
+from great_expectations import __version__
+
+
+def script_relative_path(file_path):
+    '''
+    Useful for testing with local files. Use a path relative to where the
+    test resides and this function will return the absolute path
+    of that file. Otherwise it will be relative to the script that
+    ran the test.
+
+    Note this is expensive performance-wise, so if you are calling this many
+    times you may want to call it once and cache the base dir.
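+
+    Example (as used below to locate the notebooks packaged with this module):
+
+        notebooks = glob.glob(script_relative_path("../init_notebooks/*.ipynb"))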
+ ''' + # from http://bit.ly/2snyC6s + + import inspect + scriptdir = inspect.stack()[1][1] + return os.path.join(os.path.dirname(os.path.abspath(scriptdir)), file_path) + + +def scaffold_directories_and_notebooks(base_dir): + """Add basic directories for an initial, opinionated GE project.""" + + safe_mmkdir(base_dir, exist_ok=True) + notebook_dir_name = "notebooks" + + open(os.path.join(base_dir, ".gitignore"), 'w').write("uncommitted/") + + for directory in [notebook_dir_name, "expectations", "datasources", "data_documentation", "uncommitted", "plugins", "fixtures"]: + safe_mmkdir(os.path.join(base_dir, directory), exist_ok=True) + + for uncommitted_directory in ["validations", "credentials", "samples"]: + safe_mmkdir(os.path.join(base_dir, "uncommitted", + uncommitted_directory), exist_ok=True) + + for notebook in glob.glob(script_relative_path("../init_notebooks/*.ipynb")): + notebook_name = os.path.basename(notebook) + shutil.copyfile(notebook, os.path.join( + base_dir, notebook_dir_name, notebook_name)) + + +#!!! This injects a version tag into the docs. We should test that those versioned docs exist in RTD. +greeting_1 = """ +Always know what to expect from your data. + +If you're new to Great Expectations, this tutorial is a good place to start: + + https://great-expectations.readthedocs.io/en/latest/intro.html#how-do-i-get-started +""" + +msg_prompt_lets_begin = """ +Let's add Great Expectations to your project, by scaffolding a new great_expectations directory: + + great_expectations + ├── great_expectations.yml + ├── data_documentation + ├── datasources + ├── expectations + ├── fixtures + ├── notebooks + ├── plugins + ├── uncommitted + │  ├── validations + │  ├── credentials + │  └── samples + └── .gitignore + +OK to proceed? +""" diff --git a/great_expectations/cli/util.py b/great_expectations/cli/util.py new file mode 100644 index 000000000000..80547717b8ba --- /dev/null +++ b/great_expectations/cli/util.py @@ -0,0 +1,17 @@ +import six +import re + +try: + from termcolor import colored +except ImportError: + colored = None + + +def cli_message(string): + mod_string = re.sub( + "(.*?)", + colored("\g<1>", "blue"), + string + ) + + six.print_(colored(mod_string)) diff --git a/great_expectations/data_asset/__init__.py b/great_expectations/data_asset/__init__.py index a26da095e5a4..9de24563be37 100644 --- a/great_expectations/data_asset/__init__.py +++ b/great_expectations/data_asset/__init__.py @@ -1,2 +1,2 @@ -from .base import DataAsset +from .data_asset import DataAsset from .file_data_asset import FileDataAsset \ No newline at end of file diff --git a/great_expectations/data_asset/base.py b/great_expectations/data_asset/data_asset.py similarity index 69% rename from great_expectations/data_asset/base.py rename to great_expectations/data_asset/data_asset.py index a431838141e6..cce2721925e2 100644 --- a/great_expectations/data_asset/base.py +++ b/great_expectations/data_asset/data_asset.py @@ -6,6 +6,7 @@ import traceback import warnings import logging +import datetime from six import PY3, string_types from collections import namedtuple @@ -15,38 +16,60 @@ ) from great_expectations.version import __version__ -from great_expectations.data_asset.util import DotDict, recursively_convert_to_json_serializable, parse_result_format -from great_expectations.dataset.autoinspect import columns_exist +from great_expectations.data_asset.util import ( + DotDict, + recursively_convert_to_json_serializable, + parse_result_format, + get_empty_expectation_suite +) logger = 
logging.getLogger("DataAsset") + class DataAsset(object): def __init__(self, *args, **kwargs): """ Initialize the DataAsset. - :param autoinspect_func (function) = None: The autoinspection function that should be run on the data_asset to - establish baseline expectations. + :param profiler (profiler class) = None: The profiler that should be run on the data_asset to + build a baseline expectation suite. Note: DataAsset is designed to support multiple inheritance (e.g. PandasDataset inherits from both a - Pandas DataFrame and Dataset which inherits from DataAsset), so it accepts generic *args and **kwargs arguments so that they can also be - passed to other parent classes. In python 2, there isn't a clean way to include all of *args, **kwargs, and a - named kwarg...so we use the inelegant solution of popping from kwargs, leaving the support for the autoinspect_func - parameter not obvious from the signature. + Pandas DataFrame and Dataset which inherits from DataAsset), so it accepts generic *args and **kwargs arguments + so that they can also be passed to other parent classes. In python 2, there isn't a clean way to include all of + *args, **kwargs, and a named kwarg...so we use the inelegant solution of popping from kwargs, leaving the + support for the profiler parameter not obvious from the signature. """ - autoinspect_func = kwargs.pop("autoinspect_func", None) - initial_config = kwargs.pop("config", None) + interactive_evaluation = kwargs.pop("interactive_evaluation", True) + profiler = kwargs.pop("profiler", None) + expectation_suite = kwargs.pop("expectation_suite", None) data_asset_name = kwargs.pop("data_asset_name", None) - + expectation_suite_name = kwargs.pop("expectation_suite_name", None) + data_context = kwargs.pop("data_context", None) + batch_kwargs = kwargs.pop("batch_kwargs", None) + if "autoinspect_func" in kwargs: + warnings.warn("Autoinspect_func is no longer supported; use a profiler instead (migration is easy!).") super(DataAsset, self).__init__(*args, **kwargs) - self._initialize_expectations(config=initial_config, data_asset_name=data_asset_name) - if autoinspect_func is not None: - autoinspect_func(self) + self._interactive_evaluation = interactive_evaluation + self._initialize_expectations( + expectation_suite=expectation_suite, + data_asset_name=data_asset_name, + expectation_suite_name=expectation_suite_name + ) + self._data_context = data_context + self._batch_kwargs = batch_kwargs + if profiler is not None: + profiler.profile(self) - def autoinspect(self, autoinspect_func=columns_exist): - autoinspect_func(self) + def autoinspect(self, profiler): + warnings.warn("The term autoinspect is deprecated and will be removed in a future release. Please use 'profile'\ + instead.") + profiler.profile(self) + + def profile(self, profiler): + profiler.profile(self) @classmethod def expectation(cls, method_arg_names): @@ -77,8 +100,8 @@ def expectation(cls, method_arg_names): Which output mode to use: `BOOLEAN_ONLY`, `BASIC`, `COMPLETE`, or `SUMMARY`. For more detail, see :ref:`result_format `. * meta (dict or None): \ - A JSON-serializable dictionary (nesting allowed) that will be included in the output without modification. \ - For more detail, see :ref:`meta`. + A JSON-serializable dictionary (nesting allowed) that will be included in the output without \ + modification. For more detail, see :ref:`meta`. 
""" def outer_wrapper(func): @wraps(func) @@ -134,9 +157,11 @@ def wrapper(self, *args, **kwargs): # This will become the stored config expectation_args = copy.deepcopy(all_args) - if "evaluation_parameters" in self._expectations_config: - evaluation_args = self._build_evaluation_parameters(expectation_args, - self._expectations_config["evaluation_parameters"]) # This will be passed to the evaluation + if "evaluation_parameters" in self._expectation_suite: + evaluation_args = self._build_evaluation_parameters( + expectation_args, + self._expectation_suite["evaluation_parameters"] + ) else: evaluation_args = self._build_evaluation_parameters( expectation_args, None) @@ -156,21 +181,25 @@ def wrapper(self, *args, **kwargs): exception_message = None # Finally, execute the expectation method itself - try: - return_obj = func(self, **evaluation_args) - - except Exception as err: - if catch_exceptions: - raised_exception = True - exception_traceback = traceback.format_exc() - exception_message = str(err) - - return_obj = { - "success": False - } + if self._interactive_evaluation: + try: + return_obj = func(self, **evaluation_args) + + except Exception as err: + if catch_exceptions: + raised_exception = True + exception_traceback = traceback.format_exc() + exception_message = str(err) + + return_obj = { + "success": False + } + + else: + raise err - else: - raise(err) + else: + return_obj = {"stored_configuration": expectation_config} # Append the expectation to the config. self._append_expectation(expectation_config) @@ -179,6 +208,11 @@ def wrapper(self, *args, **kwargs): return_obj["expectation_config"] = copy.deepcopy( expectation_config) + # If there was no interactive evaluation, success will not have been computed. + if "success" in return_obj: + # Add a "success" object to the config + expectation_config["success_on_last_run"] = return_obj["success"] + if catch_exceptions: return_obj["exception_info"] = { "raised_exception": raised_exception, @@ -186,24 +220,24 @@ def wrapper(self, *args, **kwargs): "exception_traceback": exception_traceback } - # Add a "success" object to the config - expectation_config["success_on_last_run"] = return_obj["success"] - # Add meta to return object if meta is not None: return_obj['meta'] = meta - return_obj = recursively_convert_to_json_serializable( return_obj) + + if self._data_context is not None: + return_obj = self._data_context.update_return_obj(self, return_obj) + return return_obj return wrapper return outer_wrapper - def _initialize_expectations(self, config=None, data_asset_name=None): - """Instantiates `_expectations_config` as empty by default or with a specified expectation `config`. + def _initialize_expectations(self, expectation_suite=None, data_asset_name=None, expectation_suite_name=None): + """Instantiates `_expectation_suite` as empty by default or with a specified expectation `config`. In addition, this always sets the `default_expectation_args` to: `include_config`: False, `catch_exceptions`: False, @@ -214,62 +248,63 @@ def _initialize_expectations(self, config=None, data_asset_name=None): interoperability. Args: - config (json): \ + expectation_suite (json): \ A json-serializable expectation config. \ - If None, creates default `_expectations_config` with an empty list of expectations and \ + If None, creates default `_expectation_suite` with an empty list of expectations and \ key value `data_asset_name` as `data_asset_name`. 
data_asset_name (string): \ - The name to assign to `_expectations_config.data_asset_name` if `config` is not provided. + The name to assign to `_expectation_suite.data_asset_name` + expectation_suite_name (string): \ + The name to assign to the `expectation_suite.expectation_suite_name` + + Returns: + None """ - if config != None: - #!!! Should validate the incoming config with jsonschema here - - # Copy the original so that we don't overwrite it by accident - # Pandas incorrectly interprets this as an attempt to create a column and throws up a warning. Suppress it - # since we are subclassing. - with warnings.catch_warnings(): - warnings.simplefilter("ignore", category=UserWarning) - self._expectations_config = DotDict(copy.deepcopy(config)) - if data_asset_name is not None: - self._expectations_config["data_asset_name"] = data_asset_name + if expectation_suite is not None: + # TODO: validate the incoming expectation_suite with jsonschema here + self._expectation_suite = DotDict(copy.deepcopy(expectation_suite)) + + if data_asset_name is not None: + if self._expectation_suite["data_asset_name"] != data_asset_name: + logger.warning( + "Overriding existing data_asset_name {n1} with new name {n2}" + .format(n1=self._expectation_suite["data_asset_name"], n2=data_asset_name) + ) + self._expectation_suite["data_asset_name"] = data_asset_name + + if expectation_suite_name is not None: + if self._expectation_suite["expectation_suite_name"] != expectation_suite_name: + logger.warning( + "Overriding existing expectation_suite_name {n1} with new name {n2}" + .format(n1=self._expectation_suite["expectation_suite_name"], n2=expectation_suite_name) + ) + self._expectation_suite["expectation_suite_name"] = expectation_suite_name else: - # Pandas incorrectly interprets this as an attempt to create a column and throws up a warning. Suppress it - # since we are subclassing. - with warnings.catch_warnings(): - warnings.simplefilter("ignore", category=UserWarning) - self._expectations_config = DotDict({ - "data_asset_name": data_asset_name, - "data_asset_type": self.__class__.__name__, - "meta": { - "great_expectations.__version__": __version__ - }, - "expectations": [] - }) - - # Pandas incorrectly interprets this as an attempt to create a column and throws up a warning. Suppress it - # since we are subclassing. - with warnings.catch_warnings(): - warnings.simplefilter("ignore", category=UserWarning) - self.default_expectation_args = { - "include_config": False, - "catch_exceptions": False, - "result_format": 'BASIC', - } + if expectation_suite_name is None: + expectation_suite_name = "default" + self._expectation_suite = get_empty_expectation_suite(data_asset_name, expectation_suite_name) + + self.default_expectation_args = { + "include_config": False, + "catch_exceptions": False, + "result_format": 'BASIC', + } def _append_expectation(self, expectation_config): - """Appends an expectation to `DataAsset._expectations_config` and drops existing expectations of the same type. + """Appends an expectation to `DataAsset._expectation_suite` and drops existing expectations of the same type. If `expectation_config` is a column expectation, this drops existing expectations that are specific to \ that column and only if it is the same expectation type as `expectation_config`. Otherwise, if it's not a \ column expectation, this drops existing expectations of the same type as `expectation config`. 
\ - After expectations of the same type are dropped, `expectation_config` is appended to `DataAsset._expectations_config`. + After expectations of the same type are dropped, `expectation_config` is appended to \ + `DataAsset._expectation_suite`. Args: expectation_config (json): \ - The JSON-serializable expectation to be added to the DataAsset expectations in `_expectations_config`. + The JSON-serializable expectation to be added to the DataAsset expectations in `_expectation_suite`. Notes: May raise future errors once json-serializable tests are implemented to check for correct arg formatting @@ -286,25 +321,25 @@ def _append_expectation(self, expectation_config): # Drop existing expectations with the same expectation_type. # For column_expectations, _append_expectation should only replace expectations # where the expectation_type AND the column match - #!!! This is good default behavior, but - #!!! it needs to be documented, and - #!!! we need to provide syntax to override it. + # !!! This is good default behavior, but + # !!! it needs to be documented, and + # !!! we need to provide syntax to override it. if 'column' in expectation_config['kwargs']: column = expectation_config['kwargs']['column'] - self._expectations_config.expectations = [f for f in filter( + self._expectation_suite.expectations = [f for f in filter( lambda exp: (exp['expectation_type'] != expectation_type) or ( 'column' in exp['kwargs'] and exp['kwargs']['column'] != column), - self._expectations_config.expectations + self._expectation_suite.expectations )] else: - self._expectations_config.expectations = [f for f in filter( + self._expectation_suite.expectations = [f for f in filter( lambda exp: exp['expectation_type'] != expectation_type, - self._expectations_config.expectations + self._expectation_suite.expectations )] - self._expectations_config.expectations.append(expectation_config) + self._expectation_suite.expectations.append(expectation_config) def _copy_and_clean_up_expectation(self, expectation, @@ -359,10 +394,10 @@ def _copy_and_clean_up_expectations_from_indexes( discard_include_configs_kwargs=True, discard_catch_exceptions_kwargs=True, ): - """Copies and cleans all expectations provided by their index in DataAsset._expectations_config.expectations. + """Copies and cleans all expectations provided by their index in DataAsset._expectation_suite.expectations. Applies the _copy_and_clean_up_expectation method to multiple expectations, provided by their index in \ - `DataAsset,_expectations_config.expectations`. Returns a list of the copied and cleaned expectations. + `DataAsset,_expectation_suite.expectations`. Returns a list of the copied and cleaned expectations. Args: match_indexes (List): \ @@ -385,7 +420,7 @@ def _copy_and_clean_up_expectations_from_indexes( for i in match_indexes: rval.append( self._copy_and_clean_up_expectation( - self._expectations_config.expectations[i], + self._expectation_suite.expectations[i], discard_result_format_kwargs, discard_include_configs_kwargs, discard_catch_exceptions_kwargs, @@ -409,20 +444,21 @@ def find_expectation_indexes(self, A list of indexes for matching expectation objects. If there are no matches, the list will be empty. 
""" - if expectation_kwargs == None: + if expectation_kwargs is None: expectation_kwargs = {} - if "column" in expectation_kwargs and column != None and column != expectation_kwargs["column"]: + if "column" in expectation_kwargs and column is not None and column is not expectation_kwargs["column"]: raise ValueError("Conflicting column names in remove_expectation: %s and %s" % ( column, expectation_kwargs["column"])) - if column != None: + if column is not None: expectation_kwargs["column"] = column match_indexes = [] - for i, exp in enumerate(self._expectations_config.expectations): - if expectation_type == None or (expectation_type == exp['expectation_type']): - # if column == None or ('column' not in exp['kwargs']) or (exp['kwargs']['column'] == column) or (exp['kwargs']['column']==: + for i, exp in enumerate(self._expectation_suite.expectations): + if expectation_type is None or (expectation_type == exp['expectation_type']): + # if column == None or ('column' not in exp['kwargs']) or + # (exp['kwargs']['column'] == column) or (exp['kwargs']['column']==: match = True for k, v in expectation_kwargs.items(): @@ -449,9 +485,12 @@ def find_expectations(self, expectation_type=None : The name of the expectation type to be matched. column=None : The name of the column to be matched. expectation_kwargs=None : A dictionary of kwargs to match against. - discard_result_format_kwargs=True : In returned expectation object(s), suppress the `result_format` parameter. - discard_include_configs_kwargs=True : In returned expectation object(s), suppress the `include_configs` parameter. - discard_catch_exceptions_kwargs=True : In returned expectation object(s), suppress the `catch_exceptions` parameter. + discard_result_format_kwargs=True : In returned expectation object(s), \ + suppress the `result_format` parameter. + discard_include_configs_kwargs=True : In returned expectation object(s), \ + suppress the `include_configs` parameter. + discard_catch_exceptions_kwargs=True : In returned expectation object(s), \ + suppress the `catch_exceptions` parameter. Returns: A list of matching expectation objects. @@ -494,7 +533,8 @@ def remove_expectation(self, Note: If remove_expectation doesn't find any matches, it raises a ValueError. If remove_expectation finds more than one matches and remove_multiple_matches!=True, it raises a ValueError. 
- If dry_run=True, then `remove_expectation` acts as a thin layer to find_expectations, with the default values for discard_result_format_kwargs, discard_include_configs_kwargs, and discard_catch_exceptions_kwargs + If dry_run=True, then `remove_expectation` acts as a thin layer to find_expectations, with the default \ + values for discard_result_format_kwargs, discard_include_configs_kwargs, and discard_catch_exceptions_kwargs """ match_indexes = self.find_expectation_indexes( @@ -513,18 +553,18 @@ def remove_expectation(self, else: if not dry_run: - self._expectations_config.expectations = [i for j, i in enumerate( - self._expectations_config.expectations) if j not in match_indexes] + self._expectation_suite.expectations = [i for j, i in enumerate( + self._expectation_suite.expectations) if j not in match_indexes] else: return self._copy_and_clean_up_expectations_from_indexes(match_indexes) else: # Exactly one match expectation = self._copy_and_clean_up_expectation( - self._expectations_config.expectations[match_indexes[0]] + self._expectation_suite.expectations[match_indexes[0]] ) if not dry_run: - del self._expectations_config.expectations[match_indexes[0]] + del self._expectation_suite.expectations[match_indexes[0]] else: if remove_multiple_matches: @@ -532,6 +572,9 @@ def remove_expectation(self, else: return expectation + def get_batch_kwargs(self): + return self._batch_kwargs + def discard_failing_expectations(self): res = self.validate(only_return_failures=True).get('results') if any(res): @@ -574,7 +617,7 @@ def set_default_expectation_argument(self, argument, value): See also: get_default_expectation_arguments """ - #!!! Maybe add a validation check here? + # !!! Maybe add a validation check here? self.default_expectation_args[argument] = value @@ -585,6 +628,22 @@ def get_expectations_config(self, discard_catch_exceptions_kwargs=True, suppress_warnings=False ): + warnings.warn("get_expectations_config is deprecated, and will be removed in a future release. " + + "Please use get_expectation_suite instead.", DeprecationWarning) + return self.get_expectation_suite( + discard_failed_expectations, + discard_result_format_kwargs, + discard_include_configs_kwargs, + discard_catch_exceptions_kwargs, + suppress_warnings) + + def get_expectation_suite(self, + discard_failed_expectations=True, + discard_result_format_kwargs=True, + discard_include_configs_kwargs=True, + discard_catch_exceptions_kwargs=True, + suppress_warnings=False + ): """Returns _expectation_config as a JSON object, and perform some cleaning along the way. Args: @@ -596,14 +655,17 @@ def get_expectations_config(self, In returned expectation objects, suppress the `include_configs` parameter. Defaults to `True`. discard_catch_exceptions_kwargs (boolean): \ In returned expectation objects, suppress the `catch_exceptions` parameter. Defaults to `True`. + suppress_warnings (boolean): \ + If true, do not print warning message about information discarded before return Returns: An expectation config. Note: - get_expectations_config does not affect the underlying config at all. The returned config is a copy of _expectations_config, not the original object. + get_expectation_suite does not affect the underlying config at all. The returned config is a copy of \ + _expectation_suite, not the original object. 
""" - config = dict(self._expectations_config) + config = dict(self._expectation_suite) config = copy.deepcopy(config) expectations = config["expectations"] @@ -617,7 +679,7 @@ def get_expectations_config(self, # Instead of retaining expectations IFF success==True, it discard expectations IFF success==False. # In cases where expectation["success"] is missing or None, expectations are *retained*. # Such a case could occur if expectations were loaded from a config file and never run. - if "success_on_last_run" in expectation and expectation["success_on_last_run"] == False: + if "success_on_last_run" in expectation and expectation["success_on_last_run"] is False: discards["failed_expectations"] += 1 else: new_expectations.append(expectation) @@ -625,7 +687,8 @@ def get_expectations_config(self, expectations = new_expectations for expectation in expectations: - # FIXME: Factor this out into a new function. The logic is duplicated in remove_expectation, which calls _copy_and_clean_up_expectation + # FIXME: Factor this out into a new function. The logic is duplicated in remove_expectation, + # which calls _copy_and_clean_up_expectation if "success_on_last_run" in expectation: del expectation["success_on_last_run"] @@ -645,16 +708,11 @@ def get_expectations_config(self, discards["catch_exceptions"] += 1 if not suppress_warnings: - """ -WARNING: get_expectations_config discarded - 12 failing expectations - 44 result_format kwargs - 0 include_config kwargs - 1 catch_exceptions kwargs -If you wish to change this behavior, please set discard_failed_expectations, discard_result_format_kwargs, discard_include_configs_kwargs, and discard_catch_exceptions_kwargs appropirately. - """ - if any([discard_failed_expectations, discard_result_format_kwargs, discard_include_configs_kwargs, discard_catch_exceptions_kwargs]): - print("WARNING: get_expectations_config discarded") + if any([discard_failed_expectations, + discard_result_format_kwargs, + discard_include_configs_kwargs, + discard_catch_exceptions_kwargs]): + print("WARNING: get_expectation_suite discarded") if discard_failed_expectations: print("\t%d failing expectations" % discards["failed_expectations"]) @@ -667,7 +725,9 @@ def get_expectations_config(self, if discard_catch_exceptions_kwargs: print("\t%d catch_exceptions kwargs" % discards["catch_exceptions"]) - print("If you wish to change this behavior, please set discard_failed_expectations, discard_result_format_kwargs, discard_include_configs_kwargs, and discard_catch_exceptions_kwargs appropirately.") + print( + "If you wish to change this behavior, please set discard_failed_expectations, discard_result " + "format_kwargs, discard_include_configs_kwargs, and discard_catch_exceptions_kwargs appropriately.") config["expectations"] = expectations return config @@ -680,13 +740,28 @@ def save_expectations_config( discard_include_configs_kwargs=True, discard_catch_exceptions_kwargs=True, suppress_warnings=False + ): + warnings.warn("save_expectations_config is deprecated, and will be removed in a future release. 
" + + "Please use save_expectation_suite instead.", DeprecationWarning) + self.save_expectation_suite( + filepath, discard_failed_expectations, discard_result_format_kwargs, + discard_include_configs_kwargs, discard_catch_exceptions_kwargs, suppress_warnings) + + def save_expectation_suite( + self, + filepath=None, + discard_failed_expectations=True, + discard_result_format_kwargs=True, + discard_include_configs_kwargs=True, + discard_catch_exceptions_kwargs=True, + suppress_warnings=False ): """Writes ``_expectation_config`` to a JSON file. Writes the DataAsset's expectation config to the specified JSON ``filepath``. Failing expectations \ can be excluded from the JSON expectations config with ``discard_failed_expectations``. The kwarg key-value \ - pairs :ref:`result_format`, :ref:`include_config`, and :ref:`catch_exceptions` are optionally excluded from the JSON \ - expectations config. + pairs :ref:`result_format`, :ref:`include_config`, and :ref:`catch_exceptions` are optionally excluded from \ + the JSON expectations config. Args: filepath (string): \ @@ -695,51 +770,66 @@ def save_expectations_config( If True, excludes expectations that do not return ``success = True``. \ If False, all expectations are written to the JSON config file. discard_result_format_kwargs (boolean): \ - If True, the :ref:`result_format` attribute for each expectation is not written to the JSON config file. \ + If True, the :ref:`result_format` attribute for each expectation is not written to the JSON config \ + file. discard_include_configs_kwargs (boolean): \ - If True, the :ref:`include_config` attribute for each expectation is not written to the JSON config file.\ - discard_catch_exceptions_kwargs (boolean): \ - If True, the :ref:`catch_exceptions` attribute for each expectation is not written to the JSON config \ + If True, the :ref:`include_config` attribute for each expectation is not written to the JSON config \ file. + discard_catch_exceptions_kwargs (boolean): \ + If True, the :ref:`catch_exceptions` attribute for each expectation is not written to the JSON \ + config file. suppress_warnings (boolean): \ It True, all warnings raised by Great Expectations, as a result of dropped expectations, are \ suppressed. """ - if filepath == None: - # FIXME: Fetch the proper filepath from the project config - pass - - expectations_config = self.get_expectations_config( + expectation_suite = self.get_expectation_suite( discard_failed_expectations, discard_result_format_kwargs, discard_include_configs_kwargs, discard_catch_exceptions_kwargs, suppress_warnings ) - expectation_config_str = json.dumps(expectations_config, indent=2) - open(filepath, 'w').write(expectation_config_str) - - def validate(self, expectations_config=None, evaluation_parameters=None, catch_exceptions=True, result_format=None, only_return_failures=False): + if filepath is None and self._data_context is not None: + self._data_context.save_expectation_suite(expectation_suite) + elif filepath is not None: + expectation_config_str = json.dumps(expectation_suite, indent=2) + open(filepath, 'w').write(expectation_config_str) + else: + raise ValueError("Unable to save config: filepath or data_context must be available.") + + def validate(self, + expectation_suite=None, + run_id=None, + data_context=None, + evaluation_parameters=None, + catch_exceptions=True, + result_format=None, + only_return_failures=False): """Generates a JSON-formatted report describing the outcome of all expectations. 
-    def validate(self, expectations_config=None, evaluation_parameters=None, catch_exceptions=True, result_format=None, only_return_failures=False):
+    def validate(self,
+                 expectation_suite=None,
+                 run_id=None,
+                 data_context=None,
+                 evaluation_parameters=None,
+                 catch_exceptions=True,
+                 result_format=None,
+                 only_return_failures=False):
         """Generates a JSON-formatted report describing the outcome of all expectations.
 
-        Use the default expectations_config=None to validate the expectations config associated with the DataAsset.
+        Use the default expectation_suite=None to validate the expectations config associated with the DataAsset.
 
         Args:
-            expectations_config (json or None): \
+            expectation_suite (json or None): \
                 If None, uses the expectations config generated with the DataAsset during the current session. \
                 If a JSON file, validates those expectations.
+            run_id (str): \
+                A string used to identify this validation result as part of a collection of validations. See \
+                DataContext for more information.
+            data_context (DataContext): \
+                A DataContext object to use as part of validation for binding evaluation parameters and \
+                registering validation results.
             evaluation_parameters (dict or None): \
-                If None, uses the evaluation_paramters from the expectations_config provided or as part of the data_asset.
-                If a dict, uses the evaluation parameters in the dictionary.
+                If None, uses the evaluation_parameters from the expectation_suite provided or as part of the \
+                data_asset. If a dict, uses the evaluation parameters in the dictionary.
             catch_exceptions (boolean): \
-                If True, exceptions raised by tests will not end validation and will be described in the returned report.
+                If True, exceptions raised by tests will not end validation and will be described in the returned \
+                report.
             result_format (string or None): \
                 If None, uses the default value ('BASIC' or as specified). \
-                If string, the returned expectation output follows the specified format ('BOOLEAN_ONLY','BASIC', etc.).
-            include_config (boolean): \
-                If True, the returned results include the config information associated with each expectation, if \
-                it exists.
+                If string, the returned expectation output follows the specified format ('BOOLEAN_ONLY','BASIC', \
+                etc.).
             only_return_failures (boolean): \
                 If True, expectation results are only returned when ``success = False`` \
 
@@ -775,40 +865,66 @@ def validate(self, expectations_config=None, evaluation_parameters=None, catch_e
             }
 
         Notes:
-            If the configuration object was built with a different version of great expectations then the current environment. \
-            If no version was found in the configuration file.
+            Raises a warning if the configuration object was built with a different version of great expectations \
+            than the current environment, or if no version was found in the configuration file.
 
         Raises:
            AttributeError - if 'catch_exceptions'=None and an expectation throws an AttributeError
         """
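Before the implementation, a sketch of a typical call using the parameters documented above (the suite path and run_id value are hypothetical):

    results = df.validate(
        expectation_suite="expectations/my_suite.json",
        run_id="2019-06-25T120000Z",            # groups results within a DataContext
        evaluation_parameters={"expected_max": 5},
        catch_exceptions=True,
        only_return_failures=False)

    results["success"]       # overall boolean
    results["statistics"]    # counts and success_percent
    results["meta"]          # version, data_asset_name, suite name, run_id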
+        validate__interactive_evaluation = self._interactive_evaluation
+        if not self._interactive_evaluation:
+            # Turn this off for an explicit call to validate
+            self._interactive_evaluation = True
+
+        # If a different validation data context was provided, override
+        validate__data_context = self._data_context
+        if data_context is None and self._data_context is not None:
+            data_context = self._data_context
+        elif data_context is not None:
+            # temporarily set self._data_context so it is used inside the expectation decorator
+            self._data_context = data_context
 
         results = []
 
-        if expectations_config is None:
-            expectations_config = self.get_expectations_config(
+        if expectation_suite is None:
+            expectation_suite = self.get_expectation_suite(
                 discard_failed_expectations=False,
                 discard_result_format_kwargs=False,
                 discard_include_configs_kwargs=False,
                 discard_catch_exceptions_kwargs=False,
             )
-        elif isinstance(expectations_config, string_types):
-            expectations_config = json.load(open(expectations_config, 'r'))
+        elif isinstance(expectation_suite, string_types):
+            expectation_suite = json.load(open(expectation_suite, 'r'))
 
-        if evaluation_parameters is None:
-            # Use evaluation parameters from the (maybe provided) config
-            if "evaluation_parameters" in expectations_config:
-                evaluation_parameters = expectations_config["evaluation_parameters"]
+        # Evaluation parameter priority is
+        # 1. from provided parameters
+        # 2. from expectation configuration
+        # 3. from data context
+        # So, we load them in reverse order
+
+        if data_context is not None:
+            runtime_evaluation_parameters = data_context.bind_evaluation_parameters(run_id)  # , expectation_suite)
+        else:
+            runtime_evaluation_parameters = {}
+
+        if "evaluation_parameters" in expectation_suite:
+            runtime_evaluation_parameters.update(expectation_suite["evaluation_parameters"])
+
+        if evaluation_parameters is not None:
+            runtime_evaluation_parameters.update(evaluation_parameters)
 
         # Warn if our version is different from the version in the configuration
         try:
-            if expectations_config['meta']['great_expectations.__version__'] != __version__:
+            if expectation_suite['meta']['great_expectations.__version__'] != __version__:
                 warnings.warn(
-                    "WARNING: This configuration object was built using version %s of great_expectations, but is currently being valided by version %s." % (expectations_config['meta']['great_expectations.__version__'], __version__))
+                    "WARNING: This configuration object was built using version %s of great_expectations, but "
+                    "is currently being validated by version %s." 
+ % (expectation_suite['meta']['great_expectations.__version__'], __version__)) except KeyError: warnings.warn( "WARNING: No great_expectations version found in configuration object.") - for expectation in expectations_config['expectations']: + for expectation in expectation_suite['expectations']: try: expectation_method = getattr( @@ -819,7 +935,7 @@ def validate(self, expectations_config=None, evaluation_parameters=None, catch_e # A missing parameter should raise a KeyError evaluation_args = self._build_evaluation_parameters( - expectation['kwargs'], evaluation_parameters) + expectation['kwargs'], runtime_evaluation_parameters) result = expectation_method( catch_exceptions=catch_exceptions, @@ -841,7 +957,7 @@ def validate(self, expectations_config=None, evaluation_parameters=None, catch_e } else: - raise(err) + raise err # if include_config: result["expectation_config"] = copy.deepcopy(expectation) @@ -861,10 +977,13 @@ def validate(self, expectations_config=None, evaluation_parameters=None, catch_e if only_return_failures: abbrev_results = [] for exp in results: - if exp["success"] == False: + if not exp["success"]: abbrev_results.append(exp) results = abbrev_results + data_asset_name = expectation_suite.get("data_asset_name", None) + expectation_suite_name = expectation_suite.get("expectation_suite_name", "default") + result = { "results": results, "success": statistics.success, @@ -873,12 +992,32 @@ def validate(self, expectations_config=None, evaluation_parameters=None, catch_e "successful_expectations": statistics.successful_expectations, "unsuccessful_expectations": statistics.unsuccessful_expectations, "success_percent": statistics.success_percent, - } + }, + "meta": { + "great_expectations.__version__": __version__, + "data_asset_name": data_asset_name, + "expectation_suite_name": expectation_suite_name + } } if evaluation_parameters is not None: result.update({"evaluation_parameters": evaluation_parameters}) + if run_id is not None: + result["meta"].update({"run_id": run_id}) + else: + run_id = datetime.datetime.utcnow().isoformat() + result["meta"].update({"run_id": run_id}) + + if self._batch_kwargs is not None: + result["meta"].update({"batch_kwargs": self._batch_kwargs}) + + if data_context is not None: + result = data_context.register_validation_results(run_id, result, self) + + self._data_context = validate__data_context + self._interactive_evaluation = validate__interactive_evaluation + return result def get_evaluation_parameter(self, parameter_name, default_value=None): @@ -891,9 +1030,9 @@ def get_evaluation_parameter(self, parameter_name, default_value=None): Returns: The current value of the evaluation parameter. 
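A sketch of the $PARAMETER plumbing these accessors support (assuming a PandasDataset; the parameter name is hypothetical):

    # Store a parameter on the suite...
    df.set_evaluation_parameter("expected_max", 5)

    # ...and reference it from an expectation via the $PARAMETER convention
    df.expect_column_values_to_be_between(
        "col", min_value=0, max_value={"$PARAMETER": "expected_max"})

    df.get_evaluation_parameter("expected_max")   # -> 5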
""" - if "evaluation_parameters" in self._expectations_config and \ - parameter_name in self._expectations_config['evaluation_parameters']: - return self._expectations_config['evaluation_parameters'][parameter_name] + if "evaluation_parameters" in self._expectation_suite and \ + parameter_name in self._expectation_suite['evaluation_parameters']: + return self._expectation_suite['evaluation_parameters'][parameter_name] else: return default_value @@ -906,19 +1045,27 @@ def set_evaluation_parameter(self, parameter_name, parameter_value): parameter_value (any): The value to be used """ - if 'evaluation_parameters' not in self._expectations_config: - self._expectations_config['evaluation_parameters'] = {} + if 'evaluation_parameters' not in self._expectation_suite: + self._expectation_suite['evaluation_parameters'] = {} - self._expectations_config['evaluation_parameters'].update( + self._expectation_suite['evaluation_parameters'].update( {parameter_name: parameter_value}) def set_data_asset_name(self, data_asset_name): """Sets the name of this data_asset as stored in the expectations configuration.""" - self._expectations_config['data_asset_name'] = data_asset_name + self._expectation_suite['data_asset_name'] = data_asset_name def get_data_asset_name(self): """Gets the current name of this data_asset as stored in the expectations configuration.""" - return self._expectations_config['data_asset_name'] + return self._expectation_suite.get("data_asset_name", None) + + def set_expectation_suite_name(self, expectation_suite_name): + """Sets the expectation_suite name of this data_asset as stored in the expectations configuration.""" + self._expectation_suite["expectation_suite_name"] = expectation_suite_name + + def get_expectation_suite_name(self): + """Gets the current expectation_suite name of this data_asset as stored in the expectations configuration.""" + return self._expectation_suite.get("expectation_suite_name", None) def _build_evaluation_parameters(self, expectation_args, evaluation_parameters): """Build a dictionary of parameters to evaluate, using the provided evaluation_paramters, @@ -941,13 +1088,20 @@ def _build_evaluation_parameters(self, expectation_args, evaluation_parameters): value["$PARAMETER"]] elif evaluation_parameters is not None and value["$PARAMETER"] in evaluation_parameters: evaluation_args[key] = evaluation_parameters[value['$PARAMETER']] + elif not self._interactive_evaluation: + pass else: raise KeyError( "No value found for $PARAMETER " + value["$PARAMETER"]) return evaluation_args - ##### Output generation ##### + ### + # + # Output generation + # + ### + def _format_map_output( self, result_format, @@ -1031,7 +1185,6 @@ def _format_map_output( } ) - if result_format['result_format'] == 'SUMMARY': return return_obj @@ -1057,8 +1210,9 @@ def _calc_map_expectation_success(self, success_count, nonnull_count, mostly): nonnull_count (int): \ The number of nonnull values in the column mostly (float or None): \ - A value between 0 and 1 (or None), indicating the percentage of successes required to pass the expectation as a whole\ - If mostly=None, then all values must succeed in order for the expectation as a whole to succeed. + A value between 0 and 1 (or None), indicating the percentage of successes required to pass the \ + expectation as a whole. If mostly=None, then all values must succeed in order for the expectation as \ + a whole to succeed. 
Returns: success (boolean), percent_success (float) @@ -1068,7 +1222,7 @@ def _calc_map_expectation_success(self, success_count, nonnull_count, mostly): # percent_success = float(success_count)/nonnull_count percent_success = success_count / nonnull_count - if mostly != None: + if mostly is not None: success = bool(percent_success >= mostly) else: @@ -1080,7 +1234,11 @@ def _calc_map_expectation_success(self, success_count, nonnull_count, mostly): return success, percent_success - ##### Iterative testing for custom expectations ##### + ### + # + # Iterative testing for custom expectations + # + ### def test_expectation_function(self, function, *args, **kwargs): """Test a generic expectation function @@ -1094,8 +1252,9 @@ def test_expectation_function(self, function, *args, **kwargs): A JSON-serializable expectation result object. Notes: - This function is a thin layer to allow quick testing of new expectation functions, without having to define custom classes, etc. - To use developed expectations from the command-line tool, you'll still need to define custom classes, etc. + This function is a thin layer to allow quick testing of new expectation functions, without having to \ + define custom classes, etc. To use developed expectations from the command-line tool, you will still need \ + to define custom classes, etc. Check out :ref:`custom_expectations` for more information. """ @@ -1108,6 +1267,7 @@ def test_expectation_function(self, function, *args, **kwargs): new_function = self.expectation(argspec)(function) return new_function(self, *args, **kwargs) + ValidationStatistics = namedtuple("ValidationStatistics", [ "evaluated_expectations", "successful_expectations", @@ -1130,7 +1290,8 @@ def _calc_validation_statistics(validation_results): try: success_percent = successful_expectations / evaluated_expectations * 100 except ZeroDivisionError: - success_percent = float("nan") + # success_percent = float("nan") + success_percent = None return ValidationStatistics( successful_expectations=successful_expectations, diff --git a/great_expectations/data_asset/file_data_asset.py b/great_expectations/data_asset/file_data_asset.py index 5ab11b60f280..4caf02c85995 100644 --- a/great_expectations/data_asset/file_data_asset.py +++ b/great_expectations/data_asset/file_data_asset.py @@ -9,7 +9,7 @@ import numpy as np from six import PY3 from itertools import compress -from great_expectations.data_asset.base import DataAsset +from great_expectations.data_asset.data_asset import DataAsset from great_expectations.data_asset.util import parse_result_format @@ -589,32 +589,34 @@ def expect_file_to_be_valid_json(self, schema=None, result_format=None, meta=None): """ - schema : string - optional JSON schema file on which JSON data file is validated against + Args: + schema : string + optional JSON schema file on which JSON data file is validated against - result_format (str or None): - Which output mode to use: `BOOLEAN_ONLY`, `BASIC`, `COMPLETE`, or `SUMMARY`. - For more detail, see :ref:`result_format `. + result_format (str or None): + Which output mode to use: `BOOLEAN_ONLY`, `BASIC`, `COMPLETE`, or `SUMMARY`. \ + For more detail, see :ref:`result_format `. - include_config (boolean): - If True, then include the expectation config as part of the result object. \ - For more detail, see :ref:`include_config`. + include_config (boolean): + If True, then include the expectation config as part of the result object. \ + For more detail, see :ref:`include_config`. 
- catch_exceptions (boolean or None): - If True, then catch exceptions and include them as part of the result object. \ - For more detail, see :ref:`catch_exceptions`. + catch_exceptions (boolean or None): + If True, then catch exceptions and include them as part of the result object. \ + For more detail, see :ref:`catch_exceptions`. - meta (dict or None): - A JSON-serializable dictionary (nesting allowed) that will - be included in the output without modification. \ + meta (dict or None): + A JSON-serializable dictionary (nesting allowed) that will \ + be included in the output without modification. For more detail, see :ref:`meta`. Returns: A JSON-serializable expectation result object. - Exact fields vary depending on the values passed to :ref:`result_format ` and + Exact fields vary depending on the values passed to :ref:`result_format ` and \ :ref:`include_config`, :ref:`catch_exceptions`, and :ref:`meta`. + """ success = False if schema is None: diff --git a/great_expectations/data_asset/util.py b/great_expectations/data_asset/util.py index 495d6b3710b5..61b95310b426 100644 --- a/great_expectations/data_asset/util.py +++ b/great_expectations/data_asset/util.py @@ -14,6 +14,8 @@ from functools import wraps +from great_expectations.version import __version__ as __version__ + def parse_result_format(result_format): """This is a simple helper utility that can be used to parse a string result_format into the dict format used @@ -177,6 +179,16 @@ def recursively_convert_to_json_serializable(test_obj): # Note: Use np.floating to avoid FutureWarning from numpy return float(round(test_obj, sys.float_info.dig)) + elif isinstance(test_obj, pd.Series): + # Converting a series is tricky since the index may not be a string, but all json + # keys must be strings. So, we use a very ugly serialization strategy + index_name = test_obj.index.name or "index" + value_name = test_obj.name or "value" + return [{ + index_name: recursively_convert_to_json_serializable(idx), + value_name: recursively_convert_to_json_serializable(val) + } for idx, val in test_obj.iteritems()] + elif isinstance(test_obj, pd.DataFrame): return recursively_convert_to_json_serializable(test_obj.to_dict(orient='records')) @@ -193,3 +205,13 @@ def recursively_convert_to_json_serializable(test_obj): else: raise TypeError('%s is of type %s which cannot be serialized.' 
% ( str(test_obj), type(test_obj).__name__)) + +def get_empty_expectation_suite(data_asset_name=None, expectation_suite_name="default"): + return DotDict({ + 'data_asset_name': data_asset_name, + 'expectation_suite_name': expectation_suite_name, + 'meta': { + 'great_expectations.__version__': __version__ + }, + 'expectations': [] + }) diff --git a/great_expectations/data_context/__init__.py b/great_expectations/data_context/__init__.py index 46677150a5cb..1a42162ec426 100644 --- a/great_expectations/data_context/__init__.py +++ b/great_expectations/data_context/__init__.py @@ -1,39 +1,3 @@ -import logging +# -*- coding: utf-8 -*- -from .pandas_context import PandasCSVDataContext - -logger = logging.getLogger(__name__) - -try: - from .sqlalchemy_context import SqlAlchemyDataContext -except ImportError: - logger.info("Unable to load SqlAlchemy context; install optional sqlalchemy dependency for support") - -try: - from .spark_context import SparkCSVDataContext - from .spark_parquet_context import SparkParquetDataContext - from .databricks_context import DatabricksTableContext -except ImportError: - logger.info("Unable to load Spark contexts; install optional spark dependency for support") - - -def get_data_context(context_type, options, *args, **kwargs): - """Return a data_context object which exposes options to list datasets and get a dataset from - that context. This is a new API in Great Expectations 0.4, and is subject to rapid change. - - :param context_type: (string) one of "SqlAlchemy", "PandasCSV", "SparkCSV", or "DatabricksTable" - :param options: options to be passed to the data context's connect method. - :return: a new DataContext object - """ - if context_type == "SqlAlchemy": - return SqlAlchemyDataContext(options, *args, **kwargs) - elif context_type == "PandasCSV": - return PandasCSVDataContext(options, *args, **kwargs) - elif context_type == "SparkCSV": - return SparkCSVDataContext(options, *args, **kwargs) - elif context_type == "SparkParquet": - return SparkParquetDataContext(options, *args, **kwargs) - elif context_type == "DatabricksTable": - return DatabricksTableContext(options, *args, **kwargs) - else: - raise ValueError("Unknown data context.") +from .data_context import DataContext diff --git a/great_expectations/data_context/base.py b/great_expectations/data_context/base.py deleted file mode 100644 index 6a2099240ea5..000000000000 --- a/great_expectations/data_context/base.py +++ /dev/null @@ -1,18 +0,0 @@ -class DataContext(object): - """A generic DataContext, exposing the base API including constructor with `options` parameter, list_datasets, - and get_dataset. - - Warning: this feature is new in v0.4 and may change based on community feedback. 
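For reference, the shape of the object returned by the new `get_empty_expectation_suite` helper above (a sketch; the data asset name is hypothetical):

    from great_expectations.data_asset.util import get_empty_expectation_suite

    suite = get_empty_expectation_suite(
        data_asset_name="my_datasource/default/my_table",
        expectation_suite_name="default")

    suite["expectations"]    # -> []
    suite["meta"]            # -> {"great_expectations.__version__": ...}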
- """ - - def __init__(self, options, *args, **kwargs): - self.connect(options, *args, **kwargs) - - def connect(self, options): - return NotImplementedError - - def list_datasets(self): - return NotImplementedError - - def get_dataset(self, dataset_name, caching=False, **kwargs): - return NotImplementedError diff --git a/great_expectations/data_context/data_context.py b/great_expectations/data_context/data_context.py new file mode 100644 index 000000000000..bfd345152c8b --- /dev/null +++ b/great_expectations/data_context/data_context.py @@ -0,0 +1,1600 @@ +# -*- coding: utf-8 -*- + +import os +import json +import logging +from ruamel.yaml import YAML +import sys +import copy +import errno +from glob import glob +from six import string_types +import datetime +import shutil + +from .util import NormalizedDataAssetName, get_slack_callback, safe_mmkdir + +from great_expectations.exceptions import DataContextError, ConfigNotFoundError, ProfilerError + +try: + from urllib.parse import urlparse +except ImportError: + from urlparse import urlparse + +from great_expectations.data_asset.util import get_empty_expectation_suite +from great_expectations.dataset import Dataset, PandasDataset +from great_expectations.datasource import ( + PandasDatasource, + SqlAlchemyDatasource, + SparkDFDatasource, + DBTDatasource +) +from great_expectations.profile.basic_dataset_profiler import BasicDatasetProfiler +from great_expectations.render.renderer import DescriptivePageRenderer, PrescriptivePageRenderer +from great_expectations.render.view import DescriptivePageView + + +from .expectation_explorer import ExpectationExplorer + +logger = logging.getLogger(__name__) +yaml = YAML() +yaml.indent(mapping=2, sequence=4, offset=2) +yaml.default_flow_style = False + +ALLOWED_DELIMITERS = ['.', '/'] + + +class DataContext(object): + """A DataContext manages resources including datasources, generators, and expectation suites. + + Use the `create` classmethod to create a new empty config, or instantiate the DataContext + by passing the path to an existing data context root directory. See :py:mod:`great_expectations.data_context` + for more information. + """ + + @classmethod + def create(cls, project_root_dir=None): + """Build a new great_expectations directory and DataContext object in the provided project_root_dir. + + `create` will not create a new "great_expectations" directory in the provided folder, provided one does not + already exist. Then, it will initialize a new DataContext in that folder and write the resulting config. + + Args: + project_root_dir: path to the root directory in which to create a new great_expectations directory + + Returns: + DataContext + """ + if not os.path.isdir(project_root_dir): + raise DataContextError("project_root_dir must be a directory in which to initialize a new DataContext") + else: + try: + os.mkdir(os.path.join(project_root_dir, "great_expectations")) + except (FileExistsError, OSError): + raise DataContextError( + "Cannot create a DataContext object when a great_expectations directory " + "already exists at the provided root directory.") + + with open(os.path.join(project_root_dir, "great_expectations/great_expectations.yml"), "w") as template: + template.write(PROJECT_TEMPLATE) + + return cls(os.path.join(project_root_dir, "great_expectations")) + + def __init__(self, context_root_dir=None, expectation_explorer=False, data_asset_name_delimiter='/'): + """DataContext constructor + + Args: + context_root_dir: location to look for the ``great_expectations.yml`` file. 
If None, searches for the file \
+                based on conventions for project subdirectories.
+            expectation_explorer: If True, load the expectation explorer manager, which will modify GE return objects \
+                to include ipython notebook widgets.
+            data_asset_name_delimiter: the delimiter character to use when parsing data_asset_name parameters. \
+                Defaults to '/'
+
+        Returns:
+            None
+        """
+        self._expectation_explorer = expectation_explorer
+        self._datasources = {}
+        if expectation_explorer:
+            self._expectation_explorer_manager = ExpectationExplorer()
+
+        # determine the "context root directory" - this is the parent of "great_expectations" dir
+        if context_root_dir is None:
+            if os.path.isdir("../notebooks") and os.path.isfile("../great_expectations.yml"):
+                context_root_dir = "../"
+            elif os.path.isdir("./great_expectations") and \
+                    os.path.isfile("./great_expectations/great_expectations.yml"):
+                context_root_dir = "./great_expectations"
+            elif os.path.isdir("./") and os.path.isfile("./great_expectations.yml"):
+                context_root_dir = "./"
+            else:
+                raise DataContextError(
+                    "Unable to locate context root directory. Please provide a directory name."
+                )
+
+        self._context_root_directory = os.path.abspath(context_root_dir)
+
+        # TODO: these paths should be configurable
+        self.expectations_directory = os.path.join(self.root_directory, "expectations")
+        self.fixtures_validations_directory = os.path.join(self.root_directory, "fixtures/validations")
+        self.data_doc_directory = os.path.join(self.root_directory, "data_documentation")
+        self.plugin_store_directory = os.path.join(self.root_directory, "plugins/store")
+        sys.path.append(self.plugin_store_directory)
+
+        self._project_config = self._load_project_config()
+
+        if "datasources" not in self._project_config:
+            self._project_config["datasources"] = {}
+        for datasource in self._project_config["datasources"].keys():
+            self.get_datasource(datasource)
+
+        self._load_evaluation_parameter_store()
+        self._compiled = False
+        if data_asset_name_delimiter not in ALLOWED_DELIMITERS:
+            raise DataContextError("Invalid delimiter: delimiter must be '.' or '/'")
+        self._data_asset_name_delimiter = data_asset_name_delimiter
+
+    @property
+    def root_directory(self):
+        """The root directory for configuration objects in the data context; the location in which
+        ``great_expectations.yml`` is located."""
+        return self._context_root_directory
+
+    def _load_project_config(self):
+        """Loads the project configuration file."""
+        try:
+            with open(os.path.join(self.root_directory, "great_expectations.yml"), "r") as data:
+                return yaml.load(data)
+        except IOError:
+            raise ConfigNotFoundError(self.root_directory)
+
+    @property
+    def data_asset_name_delimiter(self):
+        """Configurable delimiter character used to parse data asset name strings into \
+        ``NormalizedDataAssetName`` objects."""
+        return self._data_asset_name_delimiter
+
+    @data_asset_name_delimiter.setter
+    def data_asset_name_delimiter(self, new_delimiter):
+        """data_asset_name_delimiter property setter method"""
+        if new_delimiter not in ALLOWED_DELIMITERS:
+            raise DataContextError("Invalid delimiter: delimiter must be '.' or '/'")
+        else:
+            self._data_asset_name_delimiter = new_delimiter
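Putting `create` and the constructor together, a sketch (the project path is hypothetical):

    from great_expectations.data_context import DataContext

    # Scaffold great_expectations/ and its config inside an existing project directory
    context = DataContext.create("/path/to/my_project")

    # In later sessions, point the constructor at the resulting root directory
    context = DataContext("/path/to/my_project/great_expectations")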
or '/'") + else: + self._data_asset_name_delimiter = new_delimiter + + def get_validation_location(self, data_asset_name, expectation_suite_name, run_id): + """Get the local path where a validation result is stored, given full asset name and run id + + Args: + data_asset_name: name of data asset for which to get validation location + expectation_suite_name: name of expectation suite for which to get validation location + run_id: run_id of validation to get. If no run_id is specified, fetch the latest run_id according to \ + alphanumeric sort (by default, the latest run_id if using ISO 8601 formatted timestamps for run_id + + Returns: + path (str): path to the validation location for the specified data_asset, expectation_suite and run_id + """ + result = {} + + if "result_store" not in self._project_config: + logger.warning("Unable to get validation results: no result store configured.") + return {} + + data_asset_name = self._normalize_data_asset_name(data_asset_name) + result_store = self._project_config["result_store"] + if "filesystem" in result_store and isinstance(result_store["filesystem"], dict): + if "base_directory" not in result_store["filesystem"]: + raise DataContextError( + "Invalid result_store configuration: 'base_directory' is required for a filesystem store.") + + base_directory = result_store["filesystem"]["base_directory"] + if not os.path.isabs(base_directory): + base_directory = os.path.join(self.root_directory, base_directory) + + if run_id is None: # Get most recent run_id + runs = [name for name in os.listdir(base_directory) if + os.path.isdir(os.path.join(base_directory, name))] + run_id = sorted(runs)[-1] + + validation_path = os.path.join( + base_directory, + run_id, + self._get_normalized_data_asset_name_filepath( + data_asset_name, + expectation_suite_name, + base_path="" + ) + ) + + result['filepath'] = validation_path + + elif "s3" in result_store and isinstance(result_store["s3"], dict): + # FIXME: this code is untested + if "bucket" not in result_store["s3"] or "key_prefix" not in result_store["s3"]: + raise DataContextError( + "Invalid result_store configuration: 'bucket' and 'key_prefix' are required for an s3 store.") + + try: + import boto3 + s3 = boto3.client('s3') + except ImportError: + raise ImportError("boto3 is required for retrieving a dataset from s3") + + bucket = result_store["s3"]["bucket"] + key_prefix = result_store["s3"]["key_prefix"] + + if run_id is None: # Get most recent run_id + all_objects = s3.list_objects(Bucket=bucket) + # Remove the key_prefix and first slash from the name + validations = [ + name[len(key_prefix) + 1:] + for name in all_objects + if name.startswith(key_prefix) and len(name) > len(key_prefix) + 1 + ] + # run id is the first section after the word "validations" + runs = [validation.split('/')[1] for validation in validations] + run_id = sorted(runs)[-1] + + key = os.path.join( + key_prefix, + "validations", + run_id, + self._get_normalized_data_asset_name_filepath( + data_asset_name, + expectation_suite_name, + base_path="" + ) + ) + + result['bucket'] = bucket + result['key'] = key + + else: + raise DataContextError("Invalid result_store configuration: only 'filesystem' and 's3' are supported.") + + return result + + def get_validation_doc_filepath(self, data_asset_name, expectation_suite_name): + """Get the local path where a the rendered html doc for a validation result is stored, given full asset name. 
+
+        Args:
+            data_asset_name: name of data asset for which to get documentation filepath
+            expectation_suite_name: name of expectation suite for which to get validation location
+
+        Returns:
+            path (str): Path to the location
+
+        """
+        # TODO: this path should be configurable or parameterized to support descriptive and prescriptive docs
+        validation_filepath = self._get_normalized_data_asset_name_filepath(
+            data_asset_name,
+            expectation_suite_name,
+            base_path=self.data_doc_directory,
+            file_extension=".html"
+        )
+
+        return validation_filepath
+
+    def move_validation_to_fixtures(self, data_asset_name, expectation_suite_name, run_id):
+        """
+        Move validation results from uncommitted to fixtures/validations to make available for the data doc renderer
+
+        Args:
+            data_asset_name: name of data asset for which to get documentation filepath
+            expectation_suite_name: name of expectation suite for which to get validation location
+            run_id: run_id of validation to get. If no run_id is specified, fetch the latest run_id according to \
+                alphanumeric sort (by default, the latest run_id if using ISO 8601 formatted timestamps for run_id)
+
+        Returns:
+            None
+        """
+        source_filepath = self.get_validation_location(data_asset_name, expectation_suite_name, run_id)['filepath']
+
+        destination_filepath = self._get_normalized_data_asset_name_filepath(
+            data_asset_name,
+            expectation_suite_name,
+            base_path=self.fixtures_validations_directory,
+            file_extension=".json"
+        )
+
+        safe_mmkdir(os.path.dirname(destination_filepath))
+        shutil.move(source_filepath, destination_filepath)
+
+    #####
+    #
+    # Internal helper methods
+    #
+    #####
+
+    def _get_normalized_data_asset_name_filepath(self, data_asset_name,
+                                                 expectation_suite_name,
+                                                 base_path=None,
+                                                 file_extension=".json"):
+        """Get the path where the project-normalized data_asset_name expectations are stored. This method is used
+        internally for constructing all absolute and relative paths for asset_name-based paths.
+
+        Args:
+            data_asset_name: name of data asset for which to construct the path
+            expectation_suite_name: name of expectation suite for which to construct the path
+            base_path: base path from which to construct the path. If None, uses the DataContext root directory
+            file_extension: the file extension to append to the path
+
+        Returns:
+            path (str): path for the requested object.
+        """
+        if base_path is None:
+            base_path = os.path.join(self.root_directory, "expectations")
+
+        # We need to ensure data_asset_name is a valid filepath no matter its current state
+        if isinstance(data_asset_name, NormalizedDataAssetName):
+            name_parts = [name_part.replace("/", "__") for name_part in data_asset_name]
+            relative_path = "/".join(name_parts)
+        elif isinstance(data_asset_name, string_types):
+            # if our delimiter is not '/', we need to first replace any slashes that exist in the name
+            # to avoid extra layers of nesting (e.g. for dbt models)
+            relative_path = data_asset_name
+            if self.data_asset_name_delimiter != "/":
+                relative_path = relative_path.replace("/", "__")
+                relative_path = relative_path.replace(self.data_asset_name_delimiter, "/")
+        else:
+            raise DataContextError("data_asset_name must be a NormalizedDataAssetName or string")
+
+        expectation_suite_name += file_extension
+
+        return os.path.join(
+            base_path,
+            relative_path,
+            expectation_suite_name
+        )
+
+    def _save_project_config(self):
+        """Save the current project to disk."""
+        with open(os.path.join(self.root_directory, "great_expectations.yml"), "w") as data:
+            yaml.dump(self._project_config, data)
+
+    def _get_all_profile_credentials(self):
+        """Get all profile credentials from the default location."""
+
+        # TODO: support parameterized additional store locations
+        try:
+            with open(os.path.join(self.root_directory,
+                                   "uncommitted/credentials/profiles.yml"), "r") as profiles_file:
+                return yaml.load(profiles_file) or {}
+        except IOError as e:
+            if e.errno != errno.ENOENT:
+                raise
+            logger.debug("Generating empty profile store.")
+            base_profile_store = yaml.load("{}")
+            base_profile_store.yaml_set_start_comment(PROFILE_COMMENT)
+            return base_profile_store
+
+    def get_profile_credentials(self, profile_name):
+        """Get named profile credentials.
+
+        Args:
+            profile_name (str): name of the profile for which to get credentials
+
+        Returns:
+            credentials (dict): dictionary of credentials
+        """
+        profiles = self._get_all_profile_credentials()
+        if profile_name in profiles:
+            return profiles[profile_name]
+        else:
+            return {}
+
+    def add_profile_credentials(self, profile_name, **kwargs):
+        """Add named profile credentials.
+
+        Args:
+            profile_name: name of the profile for which to add credentials
+            **kwargs: credential key-value pairs
+
+        Returns:
+            None
+        """
+        profiles = self._get_all_profile_credentials()
+        profiles[profile_name] = dict(**kwargs)
+        profiles_filepath = os.path.join(self.root_directory, "uncommitted/credentials/profiles.yml")
+        safe_mmkdir(os.path.dirname(profiles_filepath), exist_ok=True)
+        if not os.path.isfile(profiles_filepath):
+            logger.info("Creating new profiles store at {profiles_filepath}".format(
+                profiles_filepath=profiles_filepath)
+            )
+        with open(profiles_filepath, "w") as profiles_file:
+            yaml.dump(profiles, profiles_file)
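A sketch of the credentials round trip defined above (the profile name and keys are hypothetical; values are written to uncommitted/credentials/profiles.yml, which is kept out of version control):

    context.add_profile_credentials(
        "my_postgres_db",
        drivername="postgres",
        host="localhost",
        port=5432,
        username="postgres",
        password="",
        database="test_ci")

    creds = context.get_profile_credentials("my_postgres_db")   # -> the kwargs above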
+        # Any key duplicated across configs will be updated by the last key read (in the order above)
+        datasource_config = {}
+        defined_config_path = None
+        default_config_path = os.path.join(self.root_directory, "datasources", datasource_name, "config.yml")
+        if datasource_name in self._project_config["datasources"]:
+            base_datasource_config = copy.deepcopy(self._project_config["datasources"][datasource_name])
+            if "config_file" in base_datasource_config:
+                defined_config_path = os.path.join(self.root_directory, base_datasource_config.pop("config_file"))
+            datasource_config.update(base_datasource_config)
+
+        try:
+            with open(default_config_path, "r") as config_file:
+                default_path_datasource_config = yaml.load(config_file) or {}
+            datasource_config.update(default_path_datasource_config)
+        except IOError as e:
+            if e.errno != errno.ENOENT:
+                raise
+            logger.debug("No config file found in default location for datasource %s" % datasource_name)
+
+        if defined_config_path is not None:
+            try:
+                with open(defined_config_path, "r") as config_file:
+                    defined_path_datasource_config = yaml.load(config_file) or {}
+                datasource_config.update(defined_path_datasource_config)
+            except IOError as e:
+                if e.errno != errno.ENOENT:
+                    raise
+                logger.warning("No config file found in user-defined location for datasource %s" % datasource_name)
+
+        return datasource_config
+
+    def get_available_data_asset_names(self, datasource_names=None, generator_names=None):
+        """Inspect datasource and generators to provide available data_asset objects.
+
+        Args:
+            datasource_names: list of datasources for which to provide available data_asset_name objects. If None, \
+                return available data assets for all datasources.
+            generator_names: list of generators for which to provide available data_asset_name objects.
+
+        Returns:
+            data_asset_names (dict): Dictionary describing available data assets
+            ::
+
+                {
+                  datasource_name: {
+                    generator_name: [ data_asset_1, data_asset_2, ... ]
+                    ...
+                  }
+                  ...
+                }
+
+        """
+        data_asset_names = {}
+        if datasource_names is None:
+            datasource_names = [datasource["name"] for datasource in self.list_datasources()]
+        elif isinstance(datasource_names, string_types):
+            datasource_names = [datasource_names]
+        elif not isinstance(datasource_names, list):
+            raise ValueError(
+                "Datasource names must be a datasource name, list of datasource names or None (to list all datasources)"
+            )
+
+        if generator_names is not None:
+            if isinstance(generator_names, string_types):
+                generator_names = [generator_names]
+            if len(generator_names) == len(datasource_names):  # Iterate over both together
+                for idx, datasource_name in enumerate(datasource_names):
+                    datasource = self.get_datasource(datasource_name)
+                    data_asset_names[datasource_name] = \
+                        datasource.get_available_data_asset_names(generator_names[idx])
+
+            elif len(generator_names) == 1:
+                datasource = self.get_datasource(datasource_names[0])
+                data_asset_names[datasource_names[0]] = datasource.get_available_data_asset_names(generator_names)
+
+            else:
+                raise ValueError(
+                    "If providing generators, you must either specify one generator for each datasource or only "
+                    "one datasource."
+                )
+        else:  # generator_names is None
+            for datasource_name in datasource_names:
+                datasource = self.get_datasource(datasource_name)
+                data_asset_names[datasource_name] = datasource.get_available_data_asset_names(None)
+
+        return data_asset_names
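+    # Illustrative usage (hypothetical names): for a context configured with a single
+    # datasource "my_datasource" that has a "default" generator,
+    #
+    #     context.get_available_data_asset_names()
+    #
+    # might return a structure like
+    #
+    #     {"my_datasource": {"default": ["my_table", "other_table"]}}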
+    def get_batch(self, data_asset_name, expectation_suite_name="default", batch_kwargs=None, **kwargs):
+        """
+        Get a batch of data from the specified data_asset_name. Attaches the named expectation_suite, and uses the \
+        provided batch_kwargs.
+
+        Args:
+            data_asset_name: name of the data asset. The name will be normalized. \
+                (See :py:meth:`_normalize_data_asset_name` )
+            expectation_suite_name: name of the expectation suite to attach to the data_asset returned
+            batch_kwargs: key-value pairs describing the batch of data the datasource should fetch. \
+                (See :class:`BatchGenerator` ) If no batch_kwargs are specified, then the context will get the next
+                available batch_kwargs for the data_asset.
+            **kwargs: additional key-value pairs to pass to the datasource when fetching the batch.
+
+        Returns:
+            Great Expectations data_asset with attached expectation_suite and DataContext
+        """
+        normalized_data_asset_name = self._normalize_data_asset_name(data_asset_name)
+
+        datasource = self.get_datasource(normalized_data_asset_name.datasource)
+        if not datasource:
+            raise DataContextError(
+                "Can't find datasource {0:s} in the config - please check your great_expectations.yml"
+                .format(normalized_data_asset_name.datasource)
+            )
+
+        data_asset = datasource.get_batch(normalized_data_asset_name,
+                                          expectation_suite_name,
+                                          batch_kwargs,
+                                          **kwargs)
+        return data_asset
+
+    def add_datasource(self, name, type_, **kwargs):
+        """Add a new datasource to the data context.
+
+        The type\_ parameter must match one of the recognized types for the DataContext
+
+        Args:
+            name (str): the name for the new datasource to add
+            type_ (str): the type of datasource to add
+
+        Returns:
+            datasource (Datasource)
+        """
+        datasource_class = self._get_datasource_class(type_)
+        datasource = datasource_class(name=name, data_context=self, **kwargs)
+        self._datasources[name] = datasource
+        if "datasources" not in self._project_config:
+            self._project_config["datasources"] = {}
+        self._project_config["datasources"][name] = datasource.get_config()
+        self._save_project_config()
+
+        return datasource
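+    # Illustrative usage (hypothetical names), given an existing DataContext
+    # instance `context` for a project:
+    #
+    #     context.add_datasource("my_pandas_source", type_="pandas", base_directory="data/")
+    #     batch = context.get_batch("my_pandas_source/default/my_table")
+    #     batch.expect_column_to_exist("id")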
+    def get_config(self):
+        self._save_project_config()
+        return self._project_config
+
+    def _get_datasource_class(self, datasource_type):
+        if datasource_type == "pandas":
+            return PandasDatasource
+        elif datasource_type == "dbt":
+            return DBTDatasource
+        elif datasource_type == "sqlalchemy":
+            return SqlAlchemyDatasource
+        elif datasource_type == "spark":
+            return SparkDFDatasource
+        else:
+            try:
+                # TODO: Update to do dynamic loading based on plugin types
+                return PandasDatasource
+            except ImportError:
+                raise
+
+    def get_datasource(self, datasource_name="default"):
+        """Get the named datasource
+
+        Args:
+            datasource_name (str): the name of the datasource from the configuration
+
+        Returns:
+            datasource (Datasource)
+        """
+        if datasource_name in self._datasources:
+            return self._datasources[datasource_name]
+        elif datasource_name in self._project_config["datasources"]:
+            datasource_config = copy.deepcopy(self._project_config["datasources"][datasource_name])
+        # elif len(self._project_config["datasources"]) == 1:
+        #     datasource_name = list(self._project_config["datasources"])[0]
+        #     datasource_config = copy.deepcopy(self._project_config["datasources"][datasource_name])
+        else:
+            raise ValueError(
+                "Unable to load datasource %s -- no configuration found or invalid configuration." % datasource_name
+            )
+        type_ = datasource_config.pop("type")
+        datasource_class = self._get_datasource_class(type_)
+        datasource = datasource_class(name=datasource_name, data_context=self, **datasource_config)
+        self._datasources[datasource_name] = datasource
+        return datasource
+
+    def _load_evaluation_parameter_store(self):
+        """Load the evaluation parameter store to use for managing cross data-asset parameterized expectations.
+
+        By default, the Context uses an in-memory parameter store only suitable for evaluation on a single node.
+
+        Returns:
+            None
+        """
+        # This is a trivial class that implements an in-memory key-value store.
+        # We use it when the user does not specify a custom class in the config file.
+        class MemoryEvaluationParameterStore(object):
+            def __init__(self):
+                self.dict = {}
+
+            def get(self, run_id, name):
+                if run_id in self.dict:
+                    return self.dict[run_id][name]
+                else:
+                    return {}
+
+            def set(self, run_id, name, value):
+                if run_id not in self.dict:
+                    self.dict[run_id] = {}
+                self.dict[run_id][name] = value
+
+            def get_run_parameters(self, run_id):
+                if run_id in self.dict:
+                    return self.dict[run_id]
+                else:
+                    return {}
+
+        #####
+        #
+        # If the user wishes to provide their own implementation of this key-value store (e.g.,
+        # Redis-based), they should specify the following in the project config file:
+        #
+        # evaluation_parameter_store:
+        #   type: demostore
+        #   config:  (optional) kwargs to pass to the class constructor
+        #     param1: boo
+        #     param2: bah
+        #
+        # A module called "demostore" must be found in great_expectations/plugins/store.
+        # A class named "GreatExpectationsEvaluationParameterStore" must be defined in that module.
+        # The class must implement the following methods (matching the in-memory store above):
+        # 1. def __init__(self, **kwargs)
+        #
+        # 2. def get(self, run_id, name)
+        #
+        # 3. def set(self, run_id, name, value)
+        #
+        # 4. def get_run_parameters(self, run_id)
+        #
+        # We will load the module dynamically
+        #
+        #####
+        try:
+            config_block = self._project_config.get("evaluation_parameter_store")
+            if not config_block or not config_block.get("type"):
+                self._evaluation_parameter_store = MemoryEvaluationParameterStore()
+            else:
+                module_name = config_block.get("type")
+                class_name = "GreatExpectationsEvaluationParameterStore"
+
+                loaded_module = __import__(module_name, fromlist=[module_name])
+                loaded_class = getattr(loaded_module, class_name)
+                if config_block.get("config"):
+                    self._evaluation_parameter_store = loaded_class(**config_block.get("config"))
+                else:
+                    self._evaluation_parameter_store = loaded_class()
+        except Exception:
+            logger.exception("Failed to load evaluation_parameter_store class")
+            raise
+
+    def list_expectation_suites(self):
+        """Returns currently-defined expectation suites available in a nested dictionary structure
+        reflecting the namespace provided by this DataContext.
+
+        Returns:
+            Dictionary of currently-defined expectation suites::
+
+                {
+                  datasource: {
+                    generator: {
+                      generator_asset: [list_of_expectation_suites]
+                    }
+                  }
+                  ...
+ } + + """ + + expectation_suites_dict = {} + + # First, we construct the *actual* defined expectation suites + for datasource in os.listdir(self.expectations_directory): + datasource_path = os.path.join(self.expectations_directory, datasource) + if not os.path.isdir(datasource_path): + continue + if datasource not in expectation_suites_dict: + expectation_suites_dict[datasource] = {} + for generator in os.listdir(datasource_path): + generator_path = os.path.join(datasource_path, generator) + if not os.path.isdir(generator_path): + continue + if generator not in expectation_suites_dict[datasource]: + expectation_suites_dict[datasource][generator] = {} + for generator_asset in os.listdir(generator_path): + generator_asset_path = os.path.join(generator_path, generator_asset) + if os.path.isdir(generator_asset_path): + candidate_suites = os.listdir(generator_asset_path) + expectation_suites_dict[datasource][generator][generator_asset] = [ + suite_name[:-5] for suite_name in candidate_suites if suite_name.endswith(".json") + ] + + return expectation_suites_dict + + def list_datasources(self): + """List currently-configured datasources on this context. + + Returns: + List(dict): each dictionary includes "name" and "type" keys + """ + return [{"name": key, "type": value["type"]} for key, value in self._project_config["datasources"].items()] + + def _normalize_data_asset_name(self, data_asset_name): + """Normalizes data_asset_names for a data context. + + A data_asset_name is defined per-project and consists of three components that together define a "namespace" + for data assets, encompassing both expectation suites and batches. + + Within a namespace, an expectation suite effectively defines candidate "types" for batches of data, and + validating a batch of data determines whether that instance is of the candidate type. + + The data_asset_name namespace consists of three components: + + - a datasource name + - a generator_name + - a generator_asset + + It has a string representation consisting of each of those components delimited by a character defined in the + data_context ('/' by default). + + Args: + data_asset_name (str): The (unnormalized) data asset name to normalize. 
The name will be split \ + according to the currently-configured data_asset_name_delimiter + + Returns: + NormalizedDataAssetName + """ + if isinstance(data_asset_name, NormalizedDataAssetName): + return data_asset_name + + split_name = data_asset_name.split(self.data_asset_name_delimiter) + existing_expectation_suites = self.list_expectation_suites() + existing_namespaces = [] + for datasource in existing_expectation_suites.keys(): + for generator in existing_expectation_suites[datasource].keys(): + for generator_asset in existing_expectation_suites[datasource][generator]: + existing_namespaces.append( + NormalizedDataAssetName( + datasource, + generator, + generator_asset + ) + ) + + if len(split_name) > 3: + raise DataContextError( + "Invalid data_asset_name '{data_asset_name}': found too many components using delimiter '{delimiter}'" + .format( + data_asset_name=data_asset_name, + delimiter=self.data_asset_name_delimiter + ) + ) + + elif len(split_name) == 1: + # In this case, the name *must* refer to a unique data_asset_name + provider_names = set() + generator_asset = split_name[0] + for normalized_identifier in existing_namespaces: + curr_generator_asset = normalized_identifier[2] + if generator_asset == curr_generator_asset: + provider_names.add( + normalized_identifier + ) + + # NOTE: Current behavior choice is to continue searching to see whether the namespace is ambiguous + # based on configured generators *even* if there is *only one* namespace with expectation suites + # in it. + + # If generators' namespaces are enormous or if they are slow to provide all their available names, + # that behavior could become unwieldy, and perhaps should be revisited by using the escape hatch + # commented out below. + + # if len(provider_names) == 1: + # return provider_names[0] + # + # elif len(provider_names) > 1: + # raise DataContextError( + # "Ambiguous data_asset_name '{data_asset_name}'. Multiple candidates found: {provider_names}" + # .format(data_asset_name=data_asset_name, provider_names=provider_names) + # ) + + available_names = self.get_available_data_asset_names() + for datasource in available_names.keys(): + for generator in available_names[datasource].keys(): + names_set = available_names[datasource][generator] + if generator_asset in names_set: + provider_names.add( + NormalizedDataAssetName(datasource, generator, generator_asset) + ) + + if len(provider_names) == 1: + return provider_names.pop() + + elif len(provider_names) > 1: + raise DataContextError( + "Ambiguous data_asset_name '{data_asset_name}'. Multiple candidates found: {provider_names}" + .format(data_asset_name=data_asset_name, provider_names=provider_names) + ) + + # If we are here, then the data_asset_name does not belong to any configured datasource or generator + # If there is only a single datasource and generator, we assume the user wants to create a new + # namespace. 
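+            # For example (hypothetical names): with exactly one datasource "my_db" that has
+            # one generator "default", an unknown single-part name like "new_table" normalizes
+            # to NormalizedDataAssetName("my_db", "default", "new_table").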
+
+            if (len(available_names.keys()) == 1 and  # in this case, we know that the datasource name is valid
+                    len(available_names[datasource].keys()) == 1):
+                logger.info("Normalizing to a new generator name.")
+                return NormalizedDataAssetName(
+                    datasource,
+                    generator,
+                    generator_asset
+                )
+
+            if len(available_names.keys()) == 0:
+                raise DataContextError(
+                    "No datasource configured: a datasource is required to normalize an incomplete data_asset_name"
+                )
+
+            raise DataContextError(
+                "Ambiguous data_asset_name: no existing data_asset has the provided name, no generator provides it, "
+                "and there are multiple datasources and/or generators configured."
+            )
+
+        elif len(split_name) == 2:
+            # In this case, the name must be a datasource_name/generator_asset
+
+            # If the data_asset_name is already defined by a config in that datasource, return that normalized name.
+            provider_names = set()
+            for normalized_identifier in existing_namespaces:
+                curr_datasource_name = normalized_identifier[0]
+                curr_generator_asset = normalized_identifier[2]
+                if curr_datasource_name == split_name[0] and curr_generator_asset == split_name[1]:
+                    provider_names.add(normalized_identifier)
+
+            # NOTE: Current behavior choice is to continue searching to see whether the namespace is ambiguous
+            # based on configured generators *even* if there is *only one* namespace with expectation suites
+            # in it.
+
+            # If generators' namespaces are enormous or if they are slow to provide all their available names,
+            # that behavior could become unwieldy, and perhaps should be revisited by using the escape hatch
+            # commented out below.
+
+            # if len(provider_names) == 1:
+            #     return provider_names[0]
+            #
+            # elif len(provider_names) > 1:
+            #     raise DataContextError(
+            #         "Ambiguous data_asset_name '{data_asset_name}'. Multiple candidates found: {provider_names}"
+            #         .format(data_asset_name=data_asset_name, provider_names=provider_names)
+            #     )
+
+            available_names = self.get_available_data_asset_names()
+            for datasource_name in available_names.keys():
+                for generator in available_names[datasource_name].keys():
+                    generator_assets = available_names[datasource_name][generator]
+                    if split_name[0] == datasource_name and split_name[1] in generator_assets:
+                        provider_names.add(NormalizedDataAssetName(datasource_name, generator, split_name[1]))
+
+            if len(provider_names) == 1:
+                return provider_names.pop()
+
+            elif len(provider_names) > 1:
+                raise DataContextError(
+                    "Ambiguous data_asset_name '{data_asset_name}'. Multiple candidates found: {provider_names}"
+                    .format(data_asset_name=data_asset_name, provider_names=provider_names)
+                )
+
+            # If we are here, then the data_asset_name does not belong to any configured datasource or generator
+            # If there is only a single generator for their provided datasource, we allow the user to create a new
+            # namespace.
+            if split_name[0] in available_names and len(available_names[split_name[0]]) == 1:
+                logger.info("Normalizing to a new generator name.")
+                return NormalizedDataAssetName(
+                    split_name[0],
+                    list(available_names[split_name[0]].keys())[0],
+                    split_name[1]
+                )
+
+            if len(available_names.keys()) == 0:
+                raise DataContextError(
+                    "No datasource configured: a datasource is required to normalize an incomplete data_asset_name"
+                )
+
+            raise DataContextError(
+                "No generator available to produce data_asset_name '{data_asset_name}' "
+                "with datasource '{datasource_name}'"
+                .format(data_asset_name=data_asset_name, datasource_name=split_name[0])
+            )
+
+        elif len(split_name) == 3:
+            # In this case, we *do* check that the datasource and generator names are valid, but
+            # allow the user to define a new generator asset
+            datasources = [datasource["name"] for datasource in self.list_datasources()]
+            if split_name[0] in datasources:
+                datasource = self.get_datasource(split_name[0])
+                generators = [generator["name"] for generator in datasource.list_generators()]
+                if split_name[1] in generators:
+                    return NormalizedDataAssetName(*split_name)
+
+            raise DataContextError(
+                "Invalid data_asset_name: no configured datasource '{datasource_name}' "
+                "with generator '{generator_name}'"
+                .format(datasource_name=split_name[0], generator_name=split_name[1])
+            )
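+    # Illustrative examples (hypothetical names): in a project with one datasource
+    # "my_db" that has one generator "default",
+    #
+    #     context._normalize_data_asset_name("my_table")
+    #     context._normalize_data_asset_name("my_db/my_table")
+    #     context._normalize_data_asset_name("my_db/default/my_table")
+    #
+    # all normalize to NormalizedDataAssetName("my_db", "default", "my_table").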
+    def get_expectation_suite(self, data_asset_name, expectation_suite_name="default"):
+        """Get or create a named expectation suite for the provided data_asset_name.
+
+        Args:
+            data_asset_name (str or NormalizedDataAssetName): the data asset name to which the expectation suite belongs
+            expectation_suite_name (str): the name for the expectation suite
+
+        Returns:
+            expectation_suite
+        """
+        if not isinstance(data_asset_name, NormalizedDataAssetName):
+            data_asset_name = self._normalize_data_asset_name(data_asset_name)
+
+        config_file_path = self._get_normalized_data_asset_name_filepath(data_asset_name, expectation_suite_name)
+        if os.path.isfile(config_file_path):
+            with open(config_file_path, 'r') as json_file:
+                read_config = json.load(json_file)
+            # update the data_asset_name to correspond to the current name (in case the config has been moved/renamed)
+            read_config["data_asset_name"] = self.data_asset_name_delimiter.join(data_asset_name)
+            return read_config
+        else:
+            return get_empty_expectation_suite(
+                self.data_asset_name_delimiter.join(data_asset_name),
+                expectation_suite_name
+            )
+
+    def save_expectation_suite(self, expectation_suite, data_asset_name=None, expectation_suite_name=None):
+        """Save the provided expectation suite into the DataContext.
+
+        Args:
+            expectation_suite: the suite to save
+            data_asset_name: the data_asset_name for this expectation suite. If no name is provided, the name will \
+                be read from the suite
+            expectation_suite_name: the name of this expectation suite. If no name is provided, the name will \
+                be read from the suite
+
+        Returns:
+            None
+        """
+        if data_asset_name is None:
+            try:
+                data_asset_name = expectation_suite['data_asset_name']
+            except KeyError:
+                raise DataContextError(
+                    "data_asset_name must either be specified or present in the provided expectation suite")
+        if expectation_suite_name is None:
+            try:
+                expectation_suite_name = expectation_suite['expectation_suite_name']
+            except KeyError:
+                raise DataContextError(
+                    "expectation_suite_name must either be specified or present in the provided expectation suite")
+        if not isinstance(data_asset_name, NormalizedDataAssetName):
+            data_asset_name = self._normalize_data_asset_name(data_asset_name)
+        config_file_path = self._get_normalized_data_asset_name_filepath(data_asset_name, expectation_suite_name)
+        safe_mmkdir(os.path.dirname(config_file_path), exist_ok=True)
+        with open(config_file_path, 'w') as outfile:
+            json.dump(expectation_suite, outfile)
+        self._compiled = False
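+    # Illustrative round trip (hypothetical names), given an existing DataContext
+    # instance `context`:
+    #
+    #     suite = context.get_expectation_suite("my_db/default/my_table")
+    #     suite["expectations"].append({
+    #         "expectation_type": "expect_column_to_exist",
+    #         "kwargs": {"column": "id"}
+    #     })
+    #     context.save_expectation_suite(suite)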
+    def bind_evaluation_parameters(self, run_id):  # , expectations):
+        """Return current evaluation parameters stored for the provided run_id, ready to be bound to parameterized
+        expectation values.
+
+        Args:
+            run_id: the run_id for which to return evaluation parameters
+
+        Returns:
+            evaluation_parameters (dict)
+        """
+        # TODO: only return parameters requested by the given expectations
+        return self._evaluation_parameter_store.get_run_parameters(run_id)
+
+    def register_validation_results(self, run_id, validation_results, data_asset=None):
+        """Process results of a validation run. This method is called by data_asset objects that are connected to
+        a DataContext during validation. It performs several actions:
+          - store the validation results to a result_store, if one is configured
+          - store a snapshot of the data_asset, if so configured and a compatible data_asset is available
+          - perform a callback action using the validation results, if one is configured
+          - retrieve validation results referenced in other parameterized expectations and store them in the \
+            evaluation parameter store.
+
+        Args:
+            run_id: the run_id for which to register validation results
+            validation_results: the validation results object
+            data_asset: the data_asset to snapshot, if snapshot is configured
+
+        Returns:
+            validation_results: Validation results object, with updated meta information including references to \
+            stored data, if appropriate
+        """
+
+        try:
+            data_asset_name = validation_results["meta"]["data_asset_name"]
+        except KeyError:
+            logger.warning("No data_asset_name found in validation results; using '_untitled'")
+            data_asset_name = "_untitled"
+
+        try:
+            normalized_data_asset_name = self._normalize_data_asset_name(data_asset_name)
+        except DataContextError:
+            logger.warning(
+                "Registering validation results for a data_asset_name that cannot be normalized in this context."
+            )
+            # Fall back to the raw name so that store paths can still be constructed below
+            normalized_data_asset_name = data_asset_name
+
+        expectation_suite_name = validation_results["meta"].get("expectation_suite_name", "default")
+        if "result_store" in self._project_config:
+            result_store = self._project_config["result_store"]
+            if isinstance(result_store, dict) and "filesystem" in result_store:
+                validation_filepath = self._get_normalized_data_asset_name_filepath(
+                    normalized_data_asset_name,
+                    expectation_suite_name,
+                    base_path=os.path.join(
+                        self.root_directory,
+                        result_store["filesystem"]["base_directory"],
+                        run_id
+                    )
+                )
+                logger.debug("Storing validation result: %s" % validation_filepath)
+                safe_mmkdir(os.path.dirname(validation_filepath))
+                with open(validation_filepath, "w") as outfile:
+                    json.dump(validation_results, outfile)
+            if isinstance(result_store, dict) and "s3" in result_store:
+                bucket = result_store["s3"]["bucket"]
+                key_prefix = result_store["s3"]["key_prefix"]
+                key = os.path.join(
+                    key_prefix,
+                    "validations/{run_id}/{data_asset_name}".format(
+                        run_id=run_id,
+                        data_asset_name=self._get_normalized_data_asset_name_filepath(
+                            normalized_data_asset_name,
+                            expectation_suite_name,
+                            base_path=""
+                        )
+                    )
+                )
+                validation_results["meta"]["result_reference"] = "s3://{bucket}/{key}".format(bucket=bucket, key=key)
+                try:
+                    import boto3
+                    s3 = boto3.resource('s3')
+                    result_s3 = s3.Object(bucket, key)
+                    result_s3.put(Body=json.dumps(validation_results).encode('utf-8'))
+                except ImportError:
+                    logger.error("Error importing boto3 for AWS support.")
+                except Exception:
+                    raise
+
+        if "result_callback" in self._project_config:
+            result_callback = self._project_config["result_callback"]
+            if isinstance(result_callback, dict) and "slack" in result_callback:
+                get_slack_callback(result_callback["slack"])(validation_results)
+            else:
+                logger.warning("Unrecognized result_callback configuration.")
+
+        if "data_asset_snapshot_store" in self._project_config and validation_results["success"] is False:
+            data_asset_snapshot_store = self._project_config["data_asset_snapshot_store"]
+            if isinstance(data_asset, PandasDataset):
+                if isinstance(data_asset_snapshot_store, dict) and "filesystem" in data_asset_snapshot_store:
+                    logger.info("Storing dataset to file")
+                    safe_mmkdir(os.path.join(
+                        self.root_directory,
+                        data_asset_snapshot_store["filesystem"]["base_directory"],
+                        run_id)
+                    )
+                    data_asset.to_csv(
+                        self._get_normalized_data_asset_name_filepath(
+                            normalized_data_asset_name,
+                            expectation_suite_name,
+                            base_path=os.path.join(
+                                self.root_directory,
+                                data_asset_snapshot_store["filesystem"]["base_directory"],
+                                run_id
+                            ),
+                            file_extension=".csv.gz"
+                        ),
+                        compression="gzip"
+                    )
+
+                if isinstance(data_asset_snapshot_store, dict) and "s3" in data_asset_snapshot_store:
+                    bucket = data_asset_snapshot_store["s3"]["bucket"]
+                    key_prefix = data_asset_snapshot_store["s3"]["key_prefix"]
+                    key = os.path.join(
+                        key_prefix,
+                        "validations/{run_id}/{data_asset_name}.csv.gz".format(
+                            run_id=run_id,
+                            data_asset_name=self._get_normalized_data_asset_name_filepath(
+                                normalized_data_asset_name,
+                                expectation_suite_name,
+                                base_path="",
+                                file_extension=".csv.gz"
+                            )
+                        )
+                    )
+                    validation_results["meta"]["data_asset_snapshot"] = "s3://{bucket}/{key}".format(
+                        bucket=bucket,
+                        key=key)
+
+                    try:
+                        import boto3
+                        s3 = boto3.resource('s3')
+                        result_s3 = s3.Object(bucket, key)
+                        result_s3.put(Body=data_asset.to_csv(compression="gzip").encode('utf-8'))
+                    except ImportError:
+                        logger.error("Error importing boto3 for AWS support. Unable to save to result store.")
+                    except Exception:
+                        raise
+            else:
+                logger.warning(
+                    "Unable to save data_asset of type: %s. Only PandasDataset is supported." % type(data_asset))
+        if not self._compiled:
+            self._compile()
+
+        if "meta" not in validation_results or "data_asset_name" not in validation_results["meta"]:
+            logger.warning("No data_asset_name found in validation results; evaluation parameters cannot be registered.")
+            return validation_results
+        elif validation_results["meta"]["data_asset_name"] not in self._compiled_parameters["data_assets"]:
+            # This is fine; short-circuit since we do not need to register any results from this dataset.
+            return validation_results
+
+        for result in validation_results['results']:
+            # Unoptimized: loop over all results and check if each is needed
+            expectation_type = result['expectation_config']['expectation_type']
+            if expectation_type in self._compiled_parameters["data_assets"][data_asset_name]:
+                # First, bind column-style parameters
+                if (("column" in result['expectation_config']['kwargs']) and
+                        ("columns" in self._compiled_parameters["data_assets"][data_asset_name][expectation_type]) and
+                        (result['expectation_config']['kwargs']["column"] in
+                         self._compiled_parameters["data_assets"][data_asset_name][expectation_type]["columns"])):
+
+                    column = result['expectation_config']['kwargs']["column"]
+                    # Now that we have a small search space, invert logic, and look for the parameters in our result
+                    for type_key, desired_parameters in self._compiled_parameters["data_assets"][data_asset_name][expectation_type]["columns"][column].items():
+                        # value here is the set of desired parameters under the type_key
+                        for desired_param in desired_parameters:
+                            desired_key = desired_param.split(":")[-1]
+                            if type_key == "result" and desired_key in result['result']:
+                                self.store_validation_param(run_id, desired_param, result["result"][desired_key])
+                            elif type_key == "details" and desired_key in result["result"]["details"]:
+                                self.store_validation_param(run_id, desired_param, result["result"]["details"])
+                            else:
+                                logger.warning("Unrecognized key for parameter %s" % desired_param)
+
+                # Next, bind parameters that do not have a column parameter
+                for type_key, desired_parameters in self._compiled_parameters["data_assets"][data_asset_name][expectation_type].items():
+                    if type_key == "columns":
+                        continue
+                    for desired_param in desired_parameters:
+                        desired_key = desired_param.split(":")[-1]
+                        if type_key == "result" and desired_key in result['result']:
+                            self.store_validation_param(run_id, desired_param, result["result"][desired_key])
+                        elif type_key == "details" and desired_key in result["result"]["details"]:
+                            self.store_validation_param(run_id, desired_param, result["result"]["details"])
+                        else:
+                            logger.warning("Unrecognized key for parameter %s" % desired_param)
+
+        return validation_results
+
+    def store_validation_param(self, run_id, key, value):
+        """Store a new validation parameter.
+
+        Args:
+            run_id: current run_id
+            key: parameter key
+            value: parameter value
+
+        Returns:
+            None
+        """
+        self._evaluation_parameter_store.set(run_id, key, value)
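+    # Illustrative flow (hypothetical names): an expectation suite for "my_db/default/child_table"
+    # can reference a value produced while validating another asset via a $PARAMETER URN:
+    #
+    #     {
+    #         "expectation_type": "expect_table_row_count_to_equal",
+    #         "kwargs": {
+    #             "value": {
+    #                 "$PARAMETER": "urn:great_expectations:validations:my_db/default/parent_table:expectations:expect_table_row_count_to_be_between:result:observed_value"
+    #             }
+    #         }
+    #     }
+    #
+    # When the parent asset is validated, register_validation_results stores the observed
+    # value under that URN for the run_id, and bind_evaluation_parameters(run_id) returns it.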
+    def get_validation_param(self, run_id, key):
+        """Get a stored validation parameter.
+
+        Args:
+            run_id: run_id for desired value
+            key: parameter key
+
+        Returns:
+            value stored in evaluation_parameter_store for the provided run_id and key
+        """
+        return self._evaluation_parameter_store.get(run_id, key)
+
+    def _compile(self):
+        """Compiles all current expectation configurations in this context to be ready for result registration.
+
+        Compilation only respects parameters with a URN structure beginning with urn:great_expectations:validations
+        It splits parameters by the : (colon) character; valid URNs must have one of the following structures to be
+        automatically recognized.
+
+        "urn" : "great_expectations" : "validations" : data_asset_name : "expectations" : expectation_name : "columns" : column_name : "result" : result_key
+         [0]           [1]                 [2]              [3]               [4]              [5]              [6]          [7]         [8]        [9]
+
+        "urn" : "great_expectations" : "validations" : data_asset_name : "expectations" : expectation_name : "columns" : column_name : "details" : details_key
+         [0]           [1]                 [2]              [3]               [4]              [5]              [6]          [7]         [8]         [9]
+
+        "urn" : "great_expectations" : "validations" : data_asset_name : "expectations" : expectation_name : "result" : result_key
+         [0]           [1]                 [2]              [3]               [4]              [5]             [6]        [7]
+
+        "urn" : "great_expectations" : "validations" : data_asset_name : "expectations" : expectation_name : "details" : details_key
+         [0]           [1]                 [2]              [3]               [4]              [5]             [6]         [7]
+
+        Parameters are compiled to the following structure::
+
+            {
+                "raw": <set of all parameter URNs found>,
+                "data_assets": {
+                    data_asset_name: {
+                        expectation_name: {
+                            "details": <set of details parameter URNs>,
+                            "result": <set of result parameter URNs>,
+                            column_name: {
+                                "details": <set of details parameter URNs>,
+                                "result": <set of result parameter URNs>
+                            }
+                        }
+                    }
+                }
+            }
+
+        """
+
+        # Full recompilation every time
+        self._compiled_parameters = {
+            "raw": set(),
+            "data_assets": {}
+        }
+
+        known_assets = self.list_expectation_suites()
+        config_paths = [y for x in os.walk(self.expectations_directory) for y in glob(os.path.join(x[0], '*.json'))]
+
+        for config_file in config_paths:
+            with open(config_file, 'r') as json_file:
+                config = json.load(json_file)
+            for expectation in config["expectations"]:
+                for _, value in expectation["kwargs"].items():
+                    if isinstance(value, dict) and '$PARAMETER' in value:
+                        # Compile only respects parameters in urn structure beginning with urn:great_expectations:validations
+                        if value["$PARAMETER"].startswith("urn:great_expectations:validations:"):
+                            column_expectation = False
+                            parameter = value["$PARAMETER"]
+                            self._compiled_parameters["raw"].add(parameter)
+                            param_parts = parameter.split(":")
+                            try:
+                                data_asset = param_parts[3]
+                                expectation_name = param_parts[5]
+                                if param_parts[6] == "columns":
+                                    column_expectation = True
+                                    column_name = param_parts[7]
+                                    param_key = param_parts[8]
+                                else:
+                                    param_key = param_parts[6]
+                            except IndexError:
+                                logger.warning("Invalid parameter urn (not enough parts): %s" % parameter)
+                                continue
+
+                            if data_asset not in known_assets:
+                                logger.warning("Adding parameter %s for unknown data asset config" % parameter)
+
+                            if data_asset not in self._compiled_parameters["data_assets"]:
+                                self._compiled_parameters["data_assets"][data_asset] = {}
+
+                            if expectation_name not in self._compiled_parameters["data_assets"][data_asset]:
+                                self._compiled_parameters["data_assets"][data_asset][expectation_name] = {}
+
+                            if column_expectation:
+                                if "columns" not in self._compiled_parameters["data_assets"][data_asset][expectation_name]:
+                                    self._compiled_parameters["data_assets"][data_asset][expectation_name]["columns"] = {}
+                                if column_name not in self._compiled_parameters["data_assets"][data_asset][expectation_name]["columns"]:
+                                    self._compiled_parameters["data_assets"][data_asset][expectation_name]["columns"][column_name] = {}
+                                if
param_key not in self._compiled_parameters["data_assets"][data_asset][expectation_name]["columns"][column_name]: + self._compiled_parameters["data_assets"][data_asset][expectation_name]["columns"][column_name][param_key] = set() + self._compiled_parameters["data_assets"][data_asset][expectation_name]["columns"][column_name][param_key].add(parameter) + + elif param_key in ["result", "details"]: + if param_key not in self._compiled_parameters["data_assets"][data_asset][expectation_name]: + self._compiled_parameters["data_assets"][data_asset][expectation_name][param_key] = set() + self._compiled_parameters["data_assets"][data_asset][expectation_name][param_key].add(parameter) + + else: + logger.warning("Invalid parameter urn (unrecognized structure): %s" % parameter) + + self._compiled = True + + def get_validation_result(self, data_asset_name, expectation_suite_name="default", run_id=None, failed_only=False): + """Get validation results from a configured store. + + Args: + data_asset_name: name of data asset for which to get validation result + expectation_suite_name: expectation_suite name for which to get validation result (default: "default") + run_id: run_id for which to get validation result (if None, fetch the latest result by alphanumeric sort) + failed_only: if True, filter the result to return only failed expectations + + Returns: + validation_result + + """ + + validation_location = self.get_validation_location(data_asset_name, expectation_suite_name, run_id) + + if 'filepath' in validation_location: + validation_path = validation_location['filepath'] + with open(validation_path, "r") as infile: + results_dict = json.load(infile) + + if failed_only: + failed_results_list = [result for result in results_dict["results"] if not result["success"]] + results_dict["results"] = failed_results_list + return results_dict + else: + return results_dict + + elif 'bucket' in validation_location: # s3 + + try: + import boto3 + s3 = boto3.client('s3') + except ImportError: + raise ImportError("boto3 is required for retrieving a dataset from s3") + + bucket = validation_location["bucket"] + key = validation_location["key"] + s3_response_object = s3.get_object(Bucket=bucket, Key=key) + object_content = s3_response_object['Body'].read() + + results_dict = json.loads(object_content) + + if failed_only: + failed_results_list = [result for result in results_dict["results"] if not result["success"]] + results_dict["results"] = failed_results_list + return results_dict + else: + return results_dict + else: + raise DataContextError("Invalid result_store configuration: only 'filesystem' and 's3' are supported.") + + # TODO: refactor this into a snapshot getter based on project_config + # def get_failed_dataset(self, validation_result, **kwargs): + # try: + # reference_url = validation_result["meta"]["dataset_reference"] + # except KeyError: + # raise ValueError("Validation result must have a dataset_reference in the meta object to fetch") + + # if reference_url.startswith("s3://"): + # try: + # import boto3 + # s3 = boto3.client('s3') + # except ImportError: + # raise ImportError("boto3 is required for retrieving a dataset from s3") + + # parsed_url = urlparse(reference_url) + # bucket = parsed_url.netloc + # key = parsed_url.path[1:] + + # s3_response_object = s3.get_object(Bucket=bucket, Key=key) + # if key.endswith(".csv"): + # # Materialize as dataset + # # TODO: check the associated config for the correct data_asset_type to use + # return read_csv(s3_response_object['Body'], **kwargs) + # else: + # 
return s3_response_object['Body']
+
+        # else:
+        #     raise ValueError("Only s3 urls are supported.")
+
+    def update_return_obj(self, data_asset, return_obj):
+        """Helper called by data_asset.
+
+        Args:
+            data_asset: The data_asset whose validation produced the current return object
+            return_obj: the return object to update
+
+        Returns:
+            return_obj: the return object, potentially changed into a widget by the configured expectation explorer
+        """
+        if self._expectation_explorer:
+            return self._expectation_explorer_manager.create_expectation_widget(data_asset, return_obj)
+        else:
+            return return_obj
+
+    def render_full_static_site(self):
+        """
+        Render the static site for the project.
+
+        Returns:
+            None
+        """
+
+        # TODO: this is a temporary implementation and should be replaced with a renderer specific to this purpose
+        validation_filepaths = [y for x in os.walk(self.fixtures_validations_directory) for y in glob(os.path.join(x[0], '*.json'))]
+        for validation_filepath in validation_filepaths:
+            with open(validation_filepath, "r") as infile:
+                validation = json.load(infile)
+
+            data_asset_name = validation['meta']['data_asset_name']
+            expectation_suite_name = validation['meta']['expectation_suite_name']
+            model = DescriptivePageRenderer.render(validation)
+            out_filepath = self.get_validation_doc_filepath(data_asset_name, expectation_suite_name)
+            safe_mmkdir(os.path.dirname(out_filepath))
+            with open(out_filepath, 'w') as writer:
+                writer.write(DescriptivePageView.render(model))
+
+    def profile_datasource(self,
+                           datasource_name,
+                           generator_name=None,
+                           profiler=BasicDatasetProfiler,
+                           max_data_assets=10):
+        """Profile the named datasource using the named profiler.
+
+        Args:
+            datasource_name: the name of the datasource for which to profile data_assets
+            generator_name: the name of the generator to use to get batches
+            profiler: the profiler to use
+            max_data_assets: the maximum number of data_assets to profile from the datasource
+
+        Returns:
+            List of (expectation_suite, EVR) tuples for each of the data_assets found in the datasource
+        """
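+        # Illustrative usage (hypothetical names), given an existing DataContext
+        # instance `context` with a configured "my_db" datasource:
+        #
+        #     results = context.profile_datasource("my_db", max_data_assets=5)
+        #     for expectation_suite, validation_result in results:
+        #         print(expectation_suite["expectation_suite_name"], validation_result["success"])
+        #
+        # Each profiled asset's candidate suite is also saved via save_expectation_suite.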
+        logger.info("Profiling '%s' with '%s'" % (datasource_name, profiler.__name__))
+        profiling_results = []
+        data_asset_names = self.get_available_data_asset_names(datasource_name)
+        if generator_name is None:
+            if len(data_asset_names[datasource_name].keys()) == 1:
+                generator_name = list(data_asset_names[datasource_name].keys())[0]
+        if generator_name not in data_asset_names[datasource_name]:
+            raise ProfilerError("Generator %s not found for datasource %s" % (generator_name, datasource_name))
+
+        data_asset_name_list = list(data_asset_names[datasource_name][generator_name])
+        total_data_assets = len(data_asset_name_list)
+        logger.info("Found %d data assets using generator '%s'" % (total_data_assets, generator_name))
+
+        if max_data_assets is None or max_data_assets >= len(data_asset_name_list):
+            logger.info("Profiling all %d." % (len(data_asset_name_list)))
+        else:
+            logger.info("Profiling the first %d, alphabetically." % (max_data_assets))
+            data_asset_name_list.sort()
+            data_asset_name_list = data_asset_name_list[:max_data_assets]
+
+        total_columns, total_expectations, total_rows, skipped_data_assets = 0, 0, 0, 0
+        total_start_time = datetime.datetime.now()
+        run_id = total_start_time.isoformat()
+        for name in data_asset_name_list:
+            try:
+                start_time = datetime.datetime.now()
+
+                # FIXME: There needs to be an affordance here to limit to 100 rows, or downsample, etc.
+                batch = self.get_batch(
+                    data_asset_name=NormalizedDataAssetName(datasource_name, generator_name, name),
+                    expectation_suite_name=profiler.__name__
+                )
+
+                if not profiler.validate(batch):
+                    raise ProfilerError(
+                        "batch '%s' is not a valid batch for the '%s' profiler" % (name, profiler.__name__)
+                    )
+
+                # Note: This logic is specific to DatasetProfilers, which profile a single batch. Multi-batch profilers
+                # will have more to unpack.
+                expectation_suite, validation_result = profiler.profile(batch, run_id=run_id)
+                profiling_results.append((expectation_suite, validation_result))
+
+                if isinstance(batch, Dataset):
+                    # For datasets, we can produce some more detailed statistics
+                    row_count = batch.get_row_count()
+                    total_rows += row_count
+                    new_column_count = len(set([exp["kwargs"]["column"] for exp in expectation_suite["expectations"] if "column" in exp["kwargs"]]))
+                    total_columns += new_column_count
+
+                new_expectation_count = len(expectation_suite["expectations"])
+                total_expectations += new_expectation_count
+
+                self.save_expectation_suite(expectation_suite)
+                duration = (datetime.datetime.now() - start_time).total_seconds()
+                logger.info("\tProfiled %d rows from %s (%.3f sec)" % (row_count, name, duration))
+
+            except ProfilerError as err:
+                logger.warning(err.message)
+            except IOError as exc:
+                logger.warning("IOError while profiling %s. (Perhaps a loading error?) Skipping." % (name))
+                logger.debug(str(exc))
+                skipped_data_assets += 1
+
+        total_duration = (datetime.datetime.now() - total_start_time).total_seconds()
+        logger.info("""
+Profiled %d of %d named data assets, with %d total rows and %d columns in %.2f seconds.
+Generated, evaluated, and stored %d candidate Expectations.
+Note: You will need to review and revise Expectations before using them in production.""" % (
+            len(data_asset_name_list),
+            total_data_assets,
+            total_rows,
+            total_columns,
+            total_duration,
+            total_expectations,
+        ))
+        if skipped_data_assets > 0:
+            logger.warning("Skipped %d data assets due to errors." % skipped_data_assets)
+
+        return profiling_results
+
+
+PROJECT_HELP_COMMENT = """# Welcome to Great Expectations.
+# This project configuration file allows you to define datasources,
+# generators, integrations, and other configuration artifacts that
+# make it easier to use Great Expectations.
+
+# For more help configuring Great Expectations,
+# see the documentation at: https://greatexpectations.io/config_file.html
+
+# NOTE: GE uses the names of configured datasources and generators to manage
+# how expectations and other configuration artifacts are stored in the
+# expectations/ and datasources/ folders. If you need to rename an existing
+# datasource or generator, be sure to also update the paths for related artifacts.
+
+"""
+
+PROJECT_OPTIONAL_CONFIG_COMMENT = """
+
+# Configure additional data context options here.
+
+# Uncomment the lines below to enable s3 as a result store. If a result store is enabled,
+# validation results will be saved in the store according to run id.
+
+# For S3, ensure that appropriate credentials or assume_role permissions are set where
+# validation happens.
+
+
+result_store:
+  filesystem:
+    base_directory: uncommitted/validations/
+#   s3:
+#     bucket:
+#     key_prefix:
+#
+
+# Uncomment the lines below to enable a result callback.
+
+# result_callback:
+#   slack: https://slack.com/replace_with_your_webhook
+
+
+# Uncomment the lines below to save snapshots of data assets that fail validation.
+
+# data_asset_snapshot_store:
+#   filesystem:
+#     base_directory: uncommitted/snapshots/
+#   s3:
+#     bucket:
+#     key_prefix:
+
+"""
+
+PROJECT_TEMPLATE = PROJECT_HELP_COMMENT + "datasources: {}\n" + PROJECT_OPTIONAL_CONFIG_COMMENT
+
+
+PROFILE_COMMENT = """This file stores profiles with database access credentials.
+Do not commit this file to version control.
+
+A profile can optionally have a single parameter called
+"url" which will be passed to sqlalchemy's create_engine.
+
+Otherwise, all credential options specified here for a
+given profile will be passed to sqlalchemy's create URL function.
+
+"""
diff --git a/great_expectations/data_context/expectation_explorer.py b/great_expectations/data_context/expectation_explorer.py
new file mode 100644
index 000000000000..3da04aee65aa
--- /dev/null
+++ b/great_expectations/data_context/expectation_explorer.py
@@ -0,0 +1,474 @@
+import logging
+
+from IPython.display import display
+import ipywidgets as widgets
+
+logger = logging.getLogger(__name__)
+debug_view = widgets.Output(layout={'border': '3px solid pink'})
+
+
+class ExpectationExplorer(object):
+    def __init__(self):
+        self.expectation_widgets = {}
+        self.expectation_kwarg_field_names = {
+            'expect_column_values_to_be_unique': ['mostly'],
+            'expect_column_unique_value_count_to_be_between': ['min_value', 'max_value'],
+            'expect_column_values_to_be_in_set': ['value_set', 'mostly'],
+            'expect_column_to_exist': ['column_index'],
+            'expect_column_values_to_not_be_null': ['mostly'],
+            'expect_column_values_to_be_null': ['mostly'],
+            'expect_column_values_to_not_be_in_set': ['value_set', 'mostly'],
+            'expect_column_values_to_match_regex': ['regex', 'mostly'],
+            'expect_column_values_to_not_match_regex': ['regex', 'mostly'],
+            'expect_column_values_to_match_regex_list': ['regex_list', 'match_on', 'mostly'],
+            'expect_column_values_to_not_match_regex_list': ['regex_list', 'mostly'],
+            'expect_column_values_to_match_strftime_format': ['strftime_format', 'mostly'],
+            'expect_column_values_to_be_json_parseable': ['mostly'],
+            'expect_column_values_to_match_json_schema': ['json_schema', 'mostly'],
+            'expect_column_value_lengths_to_equal': ['value', 'mostly'],
+            'expect_column_value_lengths_to_be_between': ['min_value', 'max_value', 'mostly'],
+            'expect_column_values_to_be_between': ['min_value', 'max_value', 'allow_cross_type_comparisons', 'parse_strings_as_datetimes', 'output_strftime_format', 'mostly'],
+            'expect_column_max_to_be_between': ['min_value', 'max_value', 'parse_strings_as_datetimes', 'output_strftime_format'],
+            'expect_column_min_to_be_between': ['min_value', 'max_value', 'parse_strings_as_datetimes', 'output_strftime_format'],
+            'expect_table_row_count_to_equal': ['value'],
+            'expect_table_row_count_to_be_between': ['min_value', 'max_value'],
+            'expect_table_columns_to_match_ordered_list': ['column_list'],
+            'expect_column_proportion_of_unique_values_to_be_between': ['min_value', 'max_value'],
+            'expect_column_values_to_be_dateutil_parseable': ['mostly'],
+            'expect_column_values_to_be_increasing': ['strictly', 'parse_strings_as_datetimes', 'mostly'],
+            'expect_column_values_to_be_decreasing': ['strictly', 'parse_strings_as_datetimes', 'mostly'],
+            'expect_column_median_to_be_between': ['min_value', 'max_value'],
+            'expect_column_mean_to_be_between': ['min_value', 'max_value'],
+            'expect_column_stdev_to_be_between': ['min_value', 'max_value'],
+            'expect_column_kl_divergence_to_be_less_than': ['partition_object', 'threshold', 'internal_weight_holdout', 'tail_weight_holdout'],
+            'expect_column_sum_to_be_between': ['min_value', 'max_value'],
+            'expect_column_most_common_value_to_be_in_set': ['value_set', 'ties_okay'],
+            'expect_column_pair_values_to_be_equal': ['ignore_row_if'],
+            'expect_column_pair_values_A_to_be_greater_than_B': ['or_equal', 'allow_cross_type_comparisons', 'ignore_row_if'],
+            'expect_column_pair_values_to_be_in_set': ['value_pairs_set', 'ignore_row_if'],
+            ####
+            'expect_column_values_to_be_of_type': ['type_', 'mostly'],
+            'expect_column_values_to_be_in_type_list': ['type_list', 'mostly'],
+            'expect_multicolumn_values_to_be_unique': ['ignore_row_if'],
+            'expect_column_chisquare_test_p_value_to_be_greater_than': ['partition_object', 'p'],
+            'expect_column_bootstrapped_ks_test_p_value_to_be_greater_than': ['partition_object', 'p', 'bootstrap_samples', 'bootstrap_sample_size'],
+            'expect_column_parameterized_distribution_ks_test_p_value_to_be_greater_than': ['distribution', 'p_value', 'params'],
+        }
+        self.kwarg_widget_exclusions = [
+            'column', 'result_format', 'include_config']
+        # debug_view is for debugging ipywidgets callback functions
+        self.debug_view = widgets.Output(layout={'border': '3px solid pink'})
+
+    # @debug_view.capture(clear_output=True)
+    # def update_result(self, *, new_result, expectation_type, column=None):
+    def update_result(self, new_result, expectation_type, column=None):
+        new_success_value = new_result.get('success')
+        new_result_widgets = self.generate_expectation_result_detail_widgets(
+            new_result['result'], expectation_type)
+        new_border_color = 'green' if new_success_value else 'red'
+
+        if column:
+            self.expectation_widgets[column][expectation_type][
+                'success'].value = 'Success: {new_success_value}'.format(new_success_value=new_success_value)
+            self.expectation_widgets[column][expectation_type]['result_detail_widget']\
+                .children = new_result_widgets
+            self.expectation_widgets[column][expectation_type]['editor_widget']\
+                .layout\
+                .border = '2px solid {new_border_color}'.format(new_border_color=new_border_color)
+        else:
+            self.expectation_widgets['non_column_expectations'][expectation_type][
+                'success'].value = 'Success: {new_success_value}'.format(new_success_value=new_success_value)
+            self.expectation_widgets['non_column_expectations'][expectation_type]['result_detail_widget']\
+                .children = new_result_widgets
+            self.expectation_widgets['non_column_expectations'][expectation_type]['editor_widget']\
+                .layout\
+                .border = '2px solid {new_border_color}'.format(new_border_color=new_border_color)
+
+    def get_expectation_state(self, expectation_type, column=None):
+        if column:
+            column_expectations = self.expectation_widgets.get(column)
+            if not column_expectations:
+                return None
+            return column_expectations.get(expectation_type)
+        else:
+            non_column_expectations = self.expectation_widgets.get(
+                'non_column_expectations')
+            if not non_column_expectations:
+                return None
+            return non_column_expectations.get(expectation_type)
+
+    def set_expectation_state(self, expectation_type, expectation_state, column=None):
+        if column:
+            column_expectations = self.expectation_widgets.get(column, {})
+            column_expectations[expectation_type] = expectation_state
+            self.expectation_widgets[column] = column_expectations
+        else:
+            non_column_expectations = self.expectation_widgets.get(
+                'non_column_expectations', {})
+            non_column_expectations[expectation_type] = expectation_state
+            self.expectation_widgets['non_column_expectations'] = non_column_expectations
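+    # The widget state tracked above is a nested dict, roughly (illustrative shape;
+    # 'my_column' is a hypothetical column name and widget classes are inferred from usage):
+    #
+    #     {
+    #         'my_column': {
+    #             'expect_column_values_to_not_be_null': {
+    #                 'success': <widgets.HTML>,
+    #                 'result_detail_widget': <container widget>,
+    #                 'editor_widget': <container widget>,
+    #                 'kwargs': {'column': 'my_column', 'mostly': <widgets.FloatSlider>}
+    #             }
+    #         },
+    #         'non_column_expectations': { ... }
+    #     }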
+    @debug_view.capture(clear_output=True)
+    def update_expectation_state(self, ge_df, existing_expectation_state, expectation_validation_result):
+        expectation_editor_widget = existing_expectation_state.get(
+            'editor_widget')
+        expectation_type = expectation_validation_result['expectation_config']['expectation_type']
+        new_expectation_kwargs = expectation_validation_result['expectation_config']['kwargs']
+        existing_expectation_kwarg_widgets = existing_expectation_state['kwargs']
+        column = existing_expectation_kwarg_widgets.get('column')
+
+        new_kwarg_widget_values = self.ge_kwargs_to_widget_values(
+            new_expectation_kwargs)
+
+        for kwarg_name, kwarg_value in new_kwarg_widget_values.items():
+            if kwarg_name in self.kwarg_widget_exclusions:
+                continue
+            existing_widget = existing_expectation_kwarg_widgets.get(
+                kwarg_name)
+
+            if existing_widget:
+                if getattr(existing_widget, 'value', None) or getattr(
+                        existing_widget, 'value', None) == 0:
+                    existing_widget.value = kwarg_value
+                else:
+                    existing_expectation_kwarg_widgets[kwarg_name] = kwarg_value
+            else:
+                widget_generator = getattr(
+                    self, 'generate_{kwarg_name}_widget'.format(kwarg_name=kwarg_name), None)
+                widget = widget_generator(ge_df=ge_df, expectation_type=expectation_type, **new_expectation_kwargs) if widget_generator \
+                    else self.generate_expectation_kwarg_fallback_widget(expectation_kwarg_name=kwarg_name, **new_expectation_kwargs)
+                existing_expectation_kwarg_widgets[kwarg_name] = widget
+                expectation_editor_widget.children[0].children[1].children += (
+                    widget,)
+
+        self.update_result(new_result=expectation_validation_result,
+                           expectation_type=expectation_type, column=column)
+        return expectation_editor_widget
+
+    # interconvert expectation kwargs
+    def expectation_kwarg_widgets_to_ge_kwargs(self, kwarg_widgets):
+        def kwarg_transformer(kwarg_key, kwarg_value):
+            kwarg_value = kwarg_value.value if (getattr(
+                kwarg_value, 'value', None) or getattr(
+                kwarg_value, 'value', None) == 0) else kwarg_value
+            transformers = {
+                'value_set': lambda value_set_string: [item.strip() for item in value_set_string.split(',')]
+            }
+            return transformers.get(kwarg_key, lambda kwarg_value: kwarg_value)(kwarg_value)
+
+        expectation_kwargs = {}
+
+        for key, value in kwarg_widgets.items():
+            if not getattr(self, 'generate_{key}_widget'.format(key=key), None):
+                continue
+            expectation_kwargs[key] = kwarg_transformer(key, value)
+
+        return expectation_kwargs
+
+    def ge_kwargs_to_widget_values(self, ge_kwargs):
+        def kwarg_transformer(kwarg_key, kwarg_value):
+            transformers = {
+                'value_set': lambda value_set_list: ', '.join(value_set_list)
+            }
+            return transformers.get(kwarg_key, lambda kwarg_value: kwarg_value)(kwarg_value)
+
+        expectation_kwargs = ge_kwargs.copy()
+
+        for key, value in expectation_kwargs.items():
+            if not getattr(self, 'generate_{key}_widget'.format(key=key), None):
+                continue
+            expectation_kwargs[key] = kwarg_transformer(key, value)
+
+        return expectation_kwargs
+
+    # widget generators for input fields
+    # def generate_mostly_widget(self, *, ge_df, mostly=1, expectation_type, column=None, **expectation_kwargs):
+    def generate_mostly_widget(self, ge_df, mostly=1, expectation_type=None, column=None, **expectation_kwargs):
+        mostly_widget = widgets.FloatSlider(
+            value=mostly,
+            min=0,
+            max=1.0,
+            step=0.01,
+            description='mostly: ',
+            continuous_update=True,
+            orientation='horizontal',
+            readout=True,
+            readout_format='.2f'
+        )
+
+        @debug_view.capture(clear_output=True)
+        def on_mostly_change(change):
+            expectation_state = self.get_expectation_state(
+                expectation_type, column)
ge_expectation_kwargs = self.expectation_kwarg_widgets_to_ge_kwargs( + expectation_state['kwargs']) + + new_result = getattr(ge_df, expectation_type)( + include_config=True, **ge_expectation_kwargs) + # self.update_result(new_result=new_result, expectation_type=expectation_type, column=column) + + mostly_widget.observe(on_mostly_change, names='value') + return mostly_widget + + # def generate_min_value_widget(self, *, ge_df, expectation_type, min_value=None, column=None, **expectation_kwargs): + def generate_min_value_widget(self, ge_df, expectation_type, min_value=None, column=None, **expectation_kwargs): + + expectation_state = self.get_expectation_state( + expectation_type, column) or {'kwargs': {}} + min_value_widget = expectation_state['kwargs'].get('min_value') + max_value_widget = expectation_state['kwargs'].get('max_value') + + if expectation_type == 'expect_column_unique_value_count_to_be_between': + if min_value_widget: + min_value_widget.value = min_value or int(-9e300) + else: + min_value_widget = widgets.BoundedIntText( + value=min_value or 0, + min=0, + description='min_value: ', + disabled=False + ) + if not max_value_widget: + max_value_widget = widgets.BoundedIntText( + description='max_value: ', + value=int(9e300), + max=int(9e300), + disabled=False + ) + + @debug_view.capture(clear_output=True) + def on_min_value_change(change): + expectation_state = self.get_expectation_state( + expectation_type, column) + ge_expectation_kwargs = self.expectation_kwarg_widgets_to_ge_kwargs( + expectation_state['kwargs']) + new_result = getattr(ge_df, expectation_type)(include_config=True, + **ge_expectation_kwargs) + + min_value_widget.observe(on_min_value_change, names='value') + max_dl = widgets.link((max_value_widget, 'value'), + (min_value_widget, 'max')) + + expectation_state['kwargs']['min_value'] = min_value_widget + expectation_state['kwargs']['max_value'] = max_value_widget + self.set_expectation_state(expectation_type, expectation_state, column) + + return min_value_widget + + # def generate_max_value_widget(self, *, ge_df, expectation_type, max_value=None, column=None, **expectation_kwargs): + def generate_max_value_widget(self, ge_df, expectation_type, max_value=None, column=None, **expectation_kwargs): + expectation_state = self.get_expectation_state( + expectation_type, column) or {'kwargs': {}} + min_value_widget = expectation_state['kwargs'].get('min_value') + max_value_widget = expectation_state['kwargs'].get('max_value') + + if expectation_type == 'expect_column_unique_value_count_to_be_between': + if max_value_widget: + max_value_widget.value = max_value or int(9e300) + else: + max_value_widget = widgets.BoundedIntText( + value=max_value or int(9e300), + max=int(9e300), + description='max_value: ', + disabled=False + ) + if not min_value_widget: + min_value_widget = widgets.BoundedIntText( + min=0, + value=0, + description='min_value: ', + disabled=False + ) + + @debug_view.capture(clear_output=True) + def on_max_value_change(change): + expectation_state = self.get_expectation_state( + expectation_type, column) + ge_expectation_kwargs = self.expectation_kwarg_widgets_to_ge_kwargs( + expectation_state['kwargs']) + new_result = getattr(ge_df, expectation_type)(include_config=True, + **ge_expectation_kwargs) + + max_value_widget.observe(on_max_value_change, names='value') + min_dl = widgets.link((min_value_widget, 'value'), + (max_value_widget, 'min')) + + expectation_state['kwargs']['min_value'] = min_value_widget + expectation_state['kwargs']['max_value'] = 
max_value_widget
+        self.set_expectation_state(expectation_type, expectation_state, column)
+
+        return max_value_widget
+
+# def generate_value_set_widget(self, *, ge_df, expectation_type, value_set, column, **expectation_kwargs):
+    def generate_value_set_widget(self, ge_df, expectation_type, value_set, column, **expectation_kwargs):
+        expectation_state = self.get_expectation_state(
+            expectation_type, column)
+        value_set_widget = widgets.Textarea(
+            value=', '.join(value_set),
+            placeholder='Please enter comma-separated set values.',
+            description='value_set: ',
+            disabled=False
+        )
+
+        @debug_view.capture(clear_output=True)
+        def on_value_set_change(change):
+            expectation_state = self.get_expectation_state(
+                expectation_type, column)
+            ge_expectation_kwargs = self.expectation_kwarg_widgets_to_ge_kwargs(
+                expectation_state['kwargs'])
+
+            new_result = getattr(ge_df, expectation_type)(include_config=True,
+                                                          **ge_expectation_kwargs)
+
+            self.update_result(new_result=new_result,
+                               expectation_type=expectation_type, column=column)
+
+        value_set_widget.observe(on_value_set_change, names='value')
+        return value_set_widget
+
+# def generate_expectation_kwarg_fallback_widget(self, *, expectation_kwarg_name, **expectation_kwargs):
+    def generate_expectation_kwarg_fallback_widget(self, expectation_kwarg_name, **expectation_kwargs):
+        expectation_kwarg_value = expectation_kwargs.get(
+            expectation_kwarg_name)
+        warning_message = widgets.HTML(
+            value='Warning: Cannot find dynamic widget for expectation kwarg "{expectation_kwarg_name}". To change kwarg value, please call expectation again with the modified value.'.format(
+                expectation_kwarg_name=expectation_kwarg_name)
+        )
+        static_widget = widgets.Textarea(value=str(
+            expectation_kwarg_value), description='{expectation_kwarg_name}: '.format(expectation_kwarg_name=expectation_kwarg_name), disabled=True)
+        return widgets.VBox([warning_message, static_widget])
+
+    # widget generators for general info shared between all expectations
+    def generate_column_widget(self, column, **kwargs):
+        # return widgets.HTML(value=f'Column: {column}') if column else None
+        return widgets.HTML(value='Column: {column}'.format(column=column)) if column else None
diff --git a/great_expectations/render/view/templates/page.j2 b/great_expectations/render/view/templates/page.j2
new file mode 100644
--- /dev/null
+++ b/great_expectations/render/view/templates/page.j2
@@ -0,0 +1,14 @@
+<!DOCTYPE html>
+<html>
+  <head>
+    {% include 'ge_info.j2' %}
+  </head>
+  <body>
+    <div>
+      {% block navbar %}{% include 'navbar.j2' %}{% endblock %}
+    </div>
+    <div>
+      {% block sections %}{% include 'sections.j2' %}{% endblock %}
+    </div>
+  </body>
+</html>
\ No newline at end of file
diff --git a/great_expectations/render/view/templates/sections.j2 b/great_expectations/render/view/templates/sections.j2
new file mode 100644
index 000000000000..d1365f0b8cb1
--- /dev/null
+++ b/great_expectations/render/view/templates/sections.j2
@@ -0,0 +1,106 @@
+{# {% if not nowrap %}{% extends "widget.j2" %}{% endif %} #}
+{% block sections %}
+  {% for section in sections %}
+    {% with content_blocks = section["content_blocks"] %}
+    <div>
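+      {# a section renders its named content blocks (header, column type, overview/stats tables, value list, value graph, summary list) first, then any extra blocks under "other_blocks" #}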
+      {% set section_loop = loop %}
+      {% if "header" in content_blocks %}
+        {% with content_block = content_blocks["header"] %}
+          {% include 'header.j2' %}
+        {% endwith %}
+      {% endif %}
+      {% if "column_type" in content_blocks %}
+        {% with content_block = content_blocks["column_type"] %}
+          <div>
+            {{content_block["content"][0]}}
+          </div>
+        {% endwith %}
+      {% endif %}
+      {% if "overview_table" in content_blocks %}
+        <div>  <!-- left side -->
+          {% with content_block = content_blocks["overview_table"] %}
+            {% include 'table.j2' %}
+          {% endwith %}
+        </div>
+      {% endif %}
+      {% if "stats_table" in content_blocks %}
+        <div>  <!-- right side -->
+          {% with content_block = content_blocks["stats_table"] %}
+            {% include 'table.j2' %}
+          {% endwith %}
+        </div>
+      {% endif %}
+      {% if "value_list" in content_blocks %}
+        {% with content_block = content_blocks["value_list"] %}
+          {% include 'value_list.j2' %}
+        {% endwith %}
+      {% endif %}
+      {% if "value_graph" in content_blocks %}
+        {% with content_block = content_blocks["value_graph"] %}
+          <div>
+          </div>
+        {% endwith %}
+      {% endif %}
+      {% if "summary_list" in content_blocks %}
+        {% with content_block = content_blocks["summary_list"] %}
+          <div>
+            {% for bullet_point in content_block["bullet_list"] %}
+              <li>• {{ bullet_point | render_template }}</li>
+            {% endfor %}
+          </div>
+        {% endwith %}
+      {% endif %}
+      {% if "other_blocks" in content_blocks %}
+        {% for content_block in content_blocks["other_blocks"] %}
+          {% set content_block_loop = loop %}
+          {% if content_block["content_block_type"] == "header" %}
+            {% include 'header.j2' %}
+          {% elif content_block["content_block_type"] == "text" %}
+            <div>
+              {{content_block["content"][0]}}
+            </div>
+          {% elif content_block["content_block_type"] == "value_list" %}
+            {% include 'value_list.j2' %}
+          {% elif content_block["content_block_type"] == "bullet_list" %}
+            <div>
+              {% for bullet_point in content_block["bullet_list"] %}
+                <li>• {{ bullet_point | render_template }}</li>
+              {% endfor %}
+            </div>
+          {% elif content_block["content_block_type"] == "graph" %}
+            <div>
+            </div>
+          {% elif content_block["content_block_type"] == "table" %}
+            {% include 'table.j2' %}
+          {% elif content_block["content_block_type"] == "example_list" %}
+          {% endif %}
+        {% endfor %}
+      {% endif %}
+    </div>
+    {% endwith %}
+  {% endfor %}
+{% endblock %}
\ No newline at end of file
diff --git a/great_expectations/render/view/templates/table.j2 b/great_expectations/render/view/templates/table.j2
new file mode 100644
index 000000000000..0428e306a602
--- /dev/null
+++ b/great_expectations/render/view/templates/table.j2
@@ -0,0 +1,12 @@
+<table>
+  {% for row in content_block["table_rows"] %}
+    <tr>
+      {% set rowloop = loop %}
+      {% for cell in row %}
+        <td>{{ cell }}</td>
+      {% endfor %}
+    </tr>
+  {% endfor %}
+</table>
\ No newline at end of file
diff --git a/great_expectations/render/view/templates/value_list.j2 b/great_expectations/render/view/templates/value_list.j2
new file mode 100644
index 000000000000..54056854dd5e
--- /dev/null
+++ b/great_expectations/render/view/templates/value_list.j2
@@ -0,0 +1,7 @@
+Example values:
+<br/>
+<div>
+{% for value_count in content_block["value_list"] %}
+  <span>{{value_count["value"]}}</span>
+{% endfor %}
+</div>
\ No newline at end of file diff --git a/great_expectations/render/view/view.py b/great_expectations/render/view/view.py new file mode 100644 index 000000000000..01028cd34a5b --- /dev/null +++ b/great_expectations/render/view/view.py @@ -0,0 +1,83 @@ +import json +from string import Template as pTemplate + +from jinja2 import ( + Template, Environment, BaseLoader, PackageLoader, select_autoescape +) + + +def render_template(template): + return pTemplate(template["template"]).substitute(template["params"]) + + +class NoOpTemplate(object): + @classmethod + def render(cls, document): + return document + + +class PrettyPrintTemplate(object): + @classmethod + def render(cls, document, indent=2): + print(json.dumps(document, indent=indent)) + + +class View(object): + """Defines a method for converting a document to human-consumable form""" + + _template = NoOpTemplate + + @classmethod + def render(cls, document, template=None): + if template is None: + template = cls._template + + t = cls._get_template(template) + return t.render(document) + + @classmethod + def _get_template(cls, template): + if template is None: + return NoOpTemplate + + env = Environment( + loader=PackageLoader( + 'great_expectations', + 'render/view/templates' + ), + autoescape=select_autoescape(['html', 'xml']) + ) + env.filters['render_template'] = render_template + return env.get_template(template) + + +class EVRView(View): + pass + + +class ExpectationsView(View): + pass + + +class DataProfileView(View): + pass + + +class ColumnHeaderView(View): + _template = "header.j2" + + +class ValueListView(View): + _template = "value_list.j2" + + +class ColumnSectionView(View): + _template = "sections.j2" + + +class PageView(View): + _template = "page.j2" + + +class DescriptivePageView(PageView): + pass diff --git a/great_expectations/util.py b/great_expectations/util.py index 8171fe0c2e17..4fb3e73b46ed 100644 --- a/great_expectations/util.py +++ b/great_expectations/util.py @@ -1,47 +1,55 @@ +import os + import pandas as pd import json +import logging +import uuid +import errno + +from six import string_types import great_expectations.dataset as dataset +from great_expectations.data_context import DataContext +logger = logging.getLogger(__name__) -def _convert_to_dataset_class(df, dataset_class, expectations_config=None, autoinspect_func=None): + +def _convert_to_dataset_class(df, dataset_class, expectation_suite=None, profiler=None): """ - Convert a (pandas) dataframe to a great_expectations dataset, with (optional) expectations_config + Convert a (pandas) dataframe to a great_expectations dataset, with (optional) expectation_suite """ - if expectations_config is not None: - # Cast the dataframe into the new class, and manually initialize expectations according to the provided configuration - df = dataset_class(df) - df._initialize_expectations(expectations_config) + if expectation_suite is not None: + # Create a dataset of the new class type, and manually initialize expectations according to the provided expectation suite + new_df = dataset_class.from_dataset(df) + new_df._initialize_expectations(expectation_suite) else: # Instantiate the new Dataset with default expectations - try: - df = dataset_class(df, autoinspect_func=autoinspect_func) - except: - raise NotImplementedError( - "read_csv requires a Dataset class that can be instantiated from a Pandas DataFrame") + new_df = dataset_class.from_dataset(df) + if profiler is not None: + new_df.profile(profiler) - return df + return new_df def read_csv( filename, 
dataset_class=dataset.pandas_dataset.PandasDataset, - expectations_config=None, - autoinspect_func=None, + expectation_suite=None, + profiler=None, *args, **kwargs ): df = pd.read_csv(filename, *args, **kwargs) df = _convert_to_dataset_class( - df, dataset_class, expectations_config, autoinspect_func) + df, dataset_class, expectation_suite, profiler) return df def read_json( filename, dataset_class=dataset.pandas_dataset.PandasDataset, - expectations_config=None, + expectation_suite=None, accessor_func=None, - autoinspect_func=None, + profiler=None, *args, **kwargs ): if accessor_func != None: @@ -53,15 +61,15 @@ def read_json( df = pd.read_json(filename, *args, **kwargs) df = _convert_to_dataset_class( - df, dataset_class, expectations_config, autoinspect_func) + df, dataset_class, expectation_suite, profiler) return df def read_excel( filename, dataset_class=dataset.pandas_dataset.PandasDataset, - expectations_config=None, - autoinspect_func=None, + expectation_suite=None, + profiler=None, *args, **kwargs ): """Read a file using Pandas read_excel and return a great_expectations dataset. @@ -69,7 +77,7 @@ def read_excel( Args: filename (string): path to file to read dataset_class (Dataset class): class to which to convert resulting Pandas df - expectations_config (string): path to great_expectations config file + expectation_suite (string): path to great_expectations expectation suite file Returns: great_expectations dataset or ordered dict of great_expectations datasets, @@ -79,18 +87,18 @@ def read_excel( if isinstance(df, dict): for key in df: df[key] = _convert_to_dataset_class( - df[key], dataset_class, expectations_config, autoinspect_func) + df[key], dataset_class, expectation_suite, profiler) else: df = _convert_to_dataset_class( - df, dataset_class, expectations_config, autoinspect_func) + df, dataset_class, expectation_suite, profiler) return df def read_table( filename, dataset_class=dataset.pandas_dataset.PandasDataset, - expectations_config=None, - autoinspect_func=None, + expectation_suite=None, + profiler=None, *args, **kwargs ): """Read a file using Pandas read_table and return a great_expectations dataset. @@ -98,22 +106,22 @@ def read_table( Args: filename (string): path to file to read dataset_class (Dataset class): class to which to convert resulting Pandas df - expectations_config (string): path to great_expectations config file + expectation_suite (string): path to great_expectations expectation suite file Returns: great_expectations dataset """ df = pd.read_table(filename, *args, **kwargs) df = _convert_to_dataset_class( - df, dataset_class, expectations_config, autoinspect_func) + df, dataset_class, expectation_suite, profiler) return df def read_parquet( filename, dataset_class=dataset.pandas_dataset.PandasDataset, - expectations_config=None, - autoinspect_func=None, + expectation_suite=None, + profiler=None, *args, **kwargs ): """Read a file using Pandas read_parquet and return a great_expectations dataset. 
@@ -121,31 +129,31 @@ read_parquet(
     Args:
         filename (string): path to file to read
        dataset_class (Dataset class): class to which to convert resulting Pandas df
-        expectations_config (string): path to great_expectations config file
+        expectation_suite (string): path to great_expectations expectation suite file
 
     Returns:
         great_expectations dataset
     """
     df = pd.read_parquet(filename, *args, **kwargs)
     df = _convert_to_dataset_class(
-        df, dataset_class, expectations_config, autoinspect_func)
+        df, dataset_class, expectation_suite, profiler)
     return df
 
 
-def from_pandas(pandas_df, 
+def from_pandas(pandas_df,
                 dataset_class=dataset.pandas_dataset.PandasDataset,
-                expectations_config=None,
-                autoinspect_func=None
-):
+                expectation_suite=None,
+                profiler=None
+                ):
     """Read a Pandas data frame and return a great_expectations dataset.
 
     Args:
         pandas_df (Pandas df): Pandas data frame
        dataset_class (Dataset class) = dataset.pandas_dataset.PandasDataset:
            class to which to convert resulting Pandas df
-        expectations_config (string) = None: path to great_expectations config file
-        autoinspect_func (function) = None: The autoinspection function that should
-            be run on the dataset to establish baseline expectations.
+        expectation_suite (string) = None: path to great_expectations expectation suite file
+        profiler (profiler class) = None: The profiler that should
+            be run on the dataset to establish a baseline expectation suite.
 
     Returns:
         great_expectations dataset
@@ -153,18 +161,56 @@ class to which to convert resulting Pandas df
     return _convert_to_dataset_class(
         pandas_df,
         dataset_class,
-        expectations_config,
-        autoinspect_func
+        expectation_suite,
+        profiler
     )
 
 
-def validate(df, expectations_config, *args, **kwargs):
-    # FIXME: I'm not sure that this should always default to PandasDataset
-    dataset_ = _convert_to_dataset_class(df,
-                                         dataset.pandas_dataset.PandasDataset,
-                                         expectations_config
-                                         )
-    return dataset_.validate(*args, **kwargs)
+def validate(data_asset, expectation_suite=None, data_asset_name=None, data_context=None, data_asset_type=None, *args, **kwargs):
+    """Validate the provided data asset, using either the expectation suite passed in directly
+    or one fetched from the provided DataContext."""
+    if expectation_suite is None and data_context is None:
+        raise ValueError(
+            "Either an expectation suite or a DataContext is required for validation.")
+
+    if expectation_suite is None:
+        logger.info("Using expectation suite from DataContext.")
+        # Allow data_context to be a string, and try loading it from path in that case
+        if isinstance(data_context, string_types):
+            data_context = DataContext(data_context)
+        expectation_suite = data_context.get_expectation_suite(data_asset_name)
+    else:
+        if "data_asset_name" in expectation_suite:
+            logger.info("Using expectation suite with name %s" %
+                        expectation_suite["data_asset_name"])
+        else:
+            logger.info("Using expectation suite with no data_asset_name")
+
+    # If the object is already a Dataset type, then this is purely a convenience method
+    # and no conversion is needed
+    if isinstance(data_asset, dataset.Dataset) and data_asset_type is None:
+        return data_asset.validate(expectation_suite=expectation_suite, data_context=data_context, *args, **kwargs)
+    elif data_asset_type is None:
+        # Guess the GE data_asset_type based on the type of the data_asset
+        if isinstance(data_asset, pd.DataFrame):
+            data_asset_type = dataset.PandasDataset
+        # Add other data_asset_type conditions here as needed
+
+    # Otherwise, we will convert for the user to a subclass of the
+    # existing class to enable new expectations, but only for datasets
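+    # Coercion rules, as enforced below: a plain pandas DataFrame may be upgraded to the
+    # requested Dataset subclass; any other object must already be an instance of a
+    # compatible Dataset subclass. +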
if not isinstance(data_asset, (dataset.Dataset, pd.DataFrame)): + raise ValueError( + "The validate util method only supports dataset validations, including custom subclasses. For other data asset types, use the object's own validate method.") + + if not issubclass(type(data_asset), data_asset_type): + if isinstance(data_asset, (pd.DataFrame)) and issubclass(data_asset_type, dataset.PandasDataset): + pass # This is a special type of allowed coercion + else: + raise ValueError( + "The validate util method only supports validation for subtypes of the provided data_asset_type.") + + data_asset_ = _convert_to_dataset_class( + data_asset, data_asset_type, expectation_suite) + return data_asset_.validate(*args, data_context=data_context, **kwargs) class DotDict(dict): diff --git a/great_expectations/version.py b/great_expectations/version.py index 7a33dd5f4296..336bf5d0f218 100644 --- a/great_expectations/version.py +++ b/great_expectations/version.py @@ -1 +1 @@ -__version__ = "0.6.1__develop" +__version__ = "0.7.0-beta" diff --git a/requirements-dev.txt b/requirements-dev.txt index 89ca1796c44a..8fa5f87d72b0 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -3,17 +3,26 @@ scipy>=0.19.0 pandas>=0.22.0 python-dateutil>=2.4.2 pytz>=2015.6 -six>=1.9.0 +six>=1.12.0 jsonschema>=2.5.1 backports.functools_lru_cache>=1.5 +ruamel.yaml>=0.15.24 +ipywidgets>=7.4.2 +requests>=2.20 +Click>=7.0 +pyfiglet>=0.8 +termcolor>=1.1.0 sqlalchemy>=1.2 +pyspark>=2.3.2 +psycopg2>=2.7.6,<2.8 +mysqlclient>=1.4.0 xlrd>=1.1.0 pyarrow>=0.12.0 sphinxcontrib-napoleon>=0.6.1 sphinx_rtd_theme>=0.4.3 pypandoc>=1.4 pytest>=4.1.1 +mock>=3.0.5 pytest-cov>=2.6.1 coveralls>=1.3 -pyspark>=2.3.2 -psycopg2>=2.7.6,<2.8 +altair>=3.1.0 diff --git a/requirements.txt b/requirements.txt index 258268fafe77..c02c58e34c58 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,6 +3,13 @@ scipy>=0.19.0 pandas>=0.22.0 python-dateutil>=2.4.2 pytz>=2015.6 -six>=1.9.0 +six>=1.12.0 jsonschema>=2.5.1 -backports.functools_lru_cache>=1.5 \ No newline at end of file +altair>=3.1.0 +backports.functools_lru_cache>=1.5 +ruamel.yaml>=0.15.24 +ipywidgets>=7.4.2 +requests>=2.20 +Click>=7.0 +pyfiglet>=0.8 +termcolor>=1.1.0 \ No newline at end of file diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 000000000000..754a14b7f077 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,223 @@ +import pytest + +import shutil +import os +import json +import warnings + +import numpy as np +import sqlalchemy as sa + +import great_expectations as ge +from great_expectations.dataset.pandas_dataset import PandasDataset +from great_expectations.data_context.util import safe_mmkdir + +from .test_utils import get_dataset + +CONTEXTS = ['PandasDataset', 'sqlite', 'SparkDFDataset'] + +### TODO: make it easier to turn off Spark as well + +##### +# +# Postgresql Context +# +##### +try: + engine = sa.create_engine('postgresql://postgres@localhost/test_ci') + conn = engine.connect() + CONTEXTS += ['postgresql'] + conn.close() +except (ImportError, sa.exc.SQLAlchemyError): + warnings.warn("No postgres context available for testing.") + +##### +# +# MySQL context -- TODO FIXME enable these tests +# +##### + +# try: +# engine = sa.create_engine('mysql://root@localhost/test_ci') +# conn = engine.connect() +# CONTEXTS += ['mysql'] +# conn.close() +# except (ImportError, sa.exc.SQLAlchemyError): +# warnings.warn("No mysql context available for testing.") + +@pytest.fixture +def empty_expectation_suite(): + expectation_suite = { + 'data_asset_name': 
"empty_suite_fixture", + 'expectation_suite_name': "default", + 'meta': {}, + 'expectations': [] + } + return expectation_suite + + +@pytest.fixture +def basic_expectation_suite(): + expectation_suite = { + 'data_asset_name': "basic_suite_fixture", + 'expectation_suite_name': "default", + 'meta': {}, + 'expectations': [ + # Removing this from list of expectations, since mysql doesn't support infinities and we want generic fixtures + # TODO: mysql cannot handle columns with infinities....re-handle this case + { + "expectation_type": "expect_column_to_exist", + "kwargs": { + "column": "infinities" + } + }, + { + "expectation_type": "expect_column_to_exist", + "kwargs": { + "column": "nulls" + } + }, + { + "expectation_type": "expect_column_to_exist", + "kwargs": { + "column": "naturals" + } + }, + { + "expectation_type": "expect_column_values_to_be_unique", + "kwargs": { + "column": "naturals" + } + } + ] + } + return expectation_suite + + +@pytest.fixture +def file_data_asset(tmp_path): + tmp_path = str(tmp_path) + path = os.path.join(tmp_path, 'file_data_asset.txt') + with open(path, 'w+') as file: + file.write(json.dumps([0, 1, 2, 3, 4])) + + return ge.data_asset.FileDataAsset(file_path=path) + + +@pytest.fixture(params=CONTEXTS) +def dataset(request): + """Provide dataset fixtures that have special values and/or are otherwise useful outside + the standard json testing framework""" + # No infinities for mysql + if request.param == "mysql": + data = { + # "infinities": [-np.inf, -10, -np.pi, 0, np.pi, 10/2.2, np.inf], + "nulls": [np.nan, None, 0, 1.1, 2.2, 3.3, None], + "naturals": [1, 2, 3, 4, 5, 6, 7] + } + else: + data = { + "infinities": [-np.inf, -10, -np.pi, 0, np.pi, 10/2.2, np.inf], + "nulls": [np.nan, None, 0, 1.1, 2.2, 3.3, None], + "naturals": [1, 2, 3, 4, 5, 6, 7] + } + schemas = { + "pandas": { + "infinities": "float64", + "nulls": "float64", + "naturals": "float64" + }, + "postgresql": { + "infinities": "DOUBLE_PRECISION", + "nulls": "DOUBLE_PRECISION", + "naturals": "DOUBLE_PRECISION" + }, + "sqlite": { + "infinities": "FLOAT", + "nulls": "FLOAT", + "naturals": "FLOAT" + }, + "mysql": { + "infinities": "FLOAT", + "nulls": "FLOAT", + "naturals": "FLOAT" + }, + "spark": { + "infinities": "FloatType", + "nulls": "FloatType", + "naturals": "FloatType" + } + } + return get_dataset(request.param, data, schemas=schemas) + + +@pytest.fixture() +def sqlitedb_engine(): + return sa.create_engine('sqlite://') + + +@pytest.fixture() +def empty_data_context(tmp_path_factory): + project_path = str(tmp_path_factory.mktemp('empty_data_context')) + context = ge.data_context.DataContext.create(project_path) + context_path = os.path.join(project_path, "great_expectations") + asset_config_path = os.path.join( + context_path, "expectations") + safe_mmkdir(asset_config_path, exist_ok=True) + return context + + +@pytest.fixture +def titanic_data_context(tmp_path_factory): + project_path = str(tmp_path_factory.mktemp('titanic_data_context')) + context_path = os.path.join(project_path, "great_expectations") + safe_mmkdir(os.path.join(context_path, "expectations"), exist_ok=True) + safe_mmkdir(os.path.join(context_path, "unexpected/validations"), exist_ok=True) + data_path = os.path.join(context_path, "../data") + safe_mmkdir(os.path.join(data_path), exist_ok=True) + shutil.copy("./tests/test_fixtures/great_expectations_titanic.yml", str(os.path.join(context_path, "great_expectations.yml"))) + shutil.copy("./tests/test_sets/Titanic.csv", str(os.path.join(context_path, "../data/Titanic.csv"))) + return 
ge.data_context.DataContext(context_path) + + +@pytest.fixture() +def data_context(tmp_path_factory): + # This data_context is *manually* created to have the config we want, vs created with DataContext.create + project_path = str(tmp_path_factory.mktemp('data_context')) + context_path = os.path.join(project_path, "great_expectations") + asset_config_path = os.path.join(context_path, "expectations") + safe_mmkdir(os.path.join(asset_config_path, "mydatasource/mygenerator/parameterized_expectation_suite_fixture"), exist_ok=True) + shutil.copy("./tests/test_fixtures/great_expectations_basic.yml", str(os.path.join(context_path, "great_expectations.yml"))) + shutil.copy("./tests/test_fixtures/expectation_suites/parameterized_expectation_suite_fixture.json", + os.path.join(asset_config_path, "mydatasource/mygenerator/parameterized_expectation_suite_fixture/default.json")) + return ge.data_context.DataContext(context_path) + + +@pytest.fixture() +def filesystem_csv(tmp_path_factory): + base_dir = tmp_path_factory.mktemp('filesystem_csv') + base_dir = str(base_dir) + # Put a few files in the directory + with open(os.path.join(base_dir, "f1.csv"), "w") as outfile: + outfile.writelines(["a,b,c\n"]) + with open(os.path.join(base_dir, "f2.csv"), "w") as outfile: + outfile.writelines(["a,b,c\n"]) + + safe_mmkdir(os.path.join(base_dir, "f3")) + with open(os.path.join(base_dir, "f3", "f3_20190101.csv"), "w") as outfile: + outfile.writelines(["a,b,c\n"]) + with open(os.path.join(base_dir, "f3", "f3_20190102.csv"), "w") as outfile: + outfile.writelines(["a,b,c\n"]) + + return base_dir + + +@pytest.fixture() +def filesystem_csv_2(tmp_path_factory): + base_dir = tmp_path_factory.mktemp('test_files') + base_dir = str(base_dir) + + # Put a file in the directory + toy_dataset = PandasDataset({"x": [1, 2, 3]}) + toy_dataset.to_csv(os.path.join(base_dir, "f1.csv"), index=None) + return base_dir diff --git a/tests/data_context/__init__.py b/tests/data_context/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/tests/data_context/test_configuration_storage.py b/tests/data_context/test_configuration_storage.py new file mode 100644 index 000000000000..4338ed495764 --- /dev/null +++ b/tests/data_context/test_configuration_storage.py @@ -0,0 +1,32 @@ +import pytest + +import os + +@pytest.fixture() +def data_context_config_string(): + config_str = \ +""" +# This is a comment +# it should be preserved. 
+datasources:
+  # these comments should also be preserved
+  default:
+    type: pandas
+    generators:
+      # The name default is read if no datasource or generator is specified
+      default:
+        type: filesystem
+        base_dir: /data
+"""
+    return config_str
+
+def test_preserve_comments(data_context):
+    data_context.add_datasource("test_datasource", "pandas")
+
+    context_root_dir = data_context.root_directory
+
+    with open(os.path.join(context_root_dir, "great_expectations.yml"), "r") as infile:
+        lines = infile.readlines()
+
+    assert lines[0] == "# This is a basic configuration for testing.\n"
+    assert lines[2] == "datasources:\n"
+    assert lines[3] == " # For example, this one.\n"
\ No newline at end of file
diff --git a/tests/data_context/test_data_context.py b/tests/data_context/test_data_context.py
new file mode 100644
index 000000000000..f9bd4f4948b5
--- /dev/null
+++ b/tests/data_context/test_data_context.py
@@ -0,0 +1,399 @@
+import pytest
+
+from datetime import datetime
+try:
+    from unittest import mock
+except ImportError:
+    import mock
+
+import os
+import shutil
+import json
+
+import sqlalchemy as sa
+import pandas as pd
+
+from great_expectations.exceptions import DataContextError
+from great_expectations.data_context import DataContext
+from great_expectations.data_context.util import safe_mmkdir, NormalizedDataAssetName
+from great_expectations.dataset import PandasDataset, SqlAlchemyDataset
+
+
+@pytest.fixture()
+def parameterized_expectation_suite():
+    return {
+        "data_asset_name": "parameterized_expectations_config_fixture",
+        "data_asset_type": "Dataset",
+        "meta": {
+        },
+        "expectations": [
+            {
+                "expectation_type": "expect_table_row_count_to_equal",
+                "kwargs": {
+                    "value": {
+                        "$PARAMETER": "urn:great_expectations:validations:source_diabetes_data:expectations:expect_column_unique_value_count_to_be_between:columns:patient_nbr:result:observed_value"
+                    }
+                }
+            },
+            {
+                "expectation_type": "expect_column_unique_value_count_to_be_between",
+                "kwargs": {
+                    "value": {
+                        "$PARAMETER": "urn:great_expectations:validations:source_patient_data:expectations:expect_table_row_count_to_equal:result:observed_value"
+                    }
+                }
+            }
+        ]
+    }
+
+def test_validate_saves_result_inserts_run_id(empty_data_context, filesystem_csv):
+    empty_data_context.add_datasource(
+        "my_datasource", "pandas", base_directory=str(filesystem_csv))
+    not_so_empty_data_context = empty_data_context
+
+    # we should now be able to validate, and have validations saved.
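+    # results are written under uncommitted/validations/<run_id>/<datasource>/<generator>/<asset>/<suite>.json,
+    # with the run_id taken from the (mocked) validation time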
+ assert not_so_empty_data_context._project_config["result_store"]["filesystem"]["base_directory"] == "uncommitted/validations/" + + my_batch = not_so_empty_data_context.get_batch("my_datasource/f1") + + my_batch.expect_column_to_exist("a") + + with mock.patch("datetime.datetime") as mock_datetime: + mock_datetime.utcnow.return_value = datetime(1955, 11, 5) + validation_result = my_batch.validate() + + with open(os.path.join(not_so_empty_data_context.root_directory, + "uncommitted/validations/1955-11-05T00:00:00/my_datasource/default/f1/default.json")) as infile: + saved_validation_result = json.load(infile) + + assert validation_result == saved_validation_result + +def test_list_available_data_asset_names(empty_data_context, filesystem_csv): + empty_data_context.add_datasource("my_datasource", "pandas", base_directory= str(filesystem_csv)) + available_asset_names = empty_data_context.get_available_data_asset_names() + + assert available_asset_names == { + "my_datasource": { + "default": set(["f1", "f2", "f3"]) + } + } + +def test_list_expectation_suites(data_context): + assert data_context.list_expectation_suites() == { + "mydatasource" : { + "mygenerator": { + "parameterized_expectation_suite_fixture": ["default"] + } + } + } + +def test_get_existing_data_asset_config(data_context): + data_asset_config = data_context.get_expectation_suite('mydatasource/mygenerator/parameterized_expectation_suite_fixture', 'default') + assert data_asset_config['data_asset_name'] == 'mydatasource/mygenerator/parameterized_expectation_suite_fixture' + assert data_asset_config['expectation_suite_name'] == 'default' + assert len(data_asset_config['expectations']) == 2 + +def test_get_new_data_asset_config(data_context): + data_asset_config = data_context.get_expectation_suite('this_data_asset_config_does_not_exist') + assert data_asset_config['data_asset_name'] == 'mydatasource/mygenerator/this_data_asset_config_does_not_exist' + assert data_asset_config['expectation_suite_name'] == 'default' + assert len(data_asset_config['expectations']) == 0 + +def test_save_data_asset_config(data_context): + data_asset_config = data_context.get_expectation_suite('this_data_asset_config_does_not_exist') + assert data_asset_config['data_asset_name'] == 'mydatasource/mygenerator/this_data_asset_config_does_not_exist' + assert data_asset_config["expectation_suite_name"] == "default" + assert len(data_asset_config['expectations']) == 0 + data_asset_config['expectations'].append({ + "expectation_type": "expect_table_row_count_to_equal", + "kwargs": { + "value": 10 + } + }) + data_context.save_expectation_suite(data_asset_config) + data_asset_config_saved = data_context.get_expectation_suite('this_data_asset_config_does_not_exist') + assert data_asset_config['expectations'] == data_asset_config_saved['expectations'] + +def test_register_validation_results(data_context): + run_id = "460d61be-7266-11e9-8848-1681be663d3e" + source_patient_data_results = { + "meta": { + "data_asset_name": "source_patient_data", + "expectation_suite_name": "default" + }, + "results": [ + { + "expectation_config": { + "expectation_type": "expect_table_row_count_to_equal", + "kwargs": { + "value": 1024, + } + }, + "success": True, + "exception_info": {"exception_message": None, + "exception_traceback": None, + "raised_exception": False}, + "result": { + "observed_value": 1024, + "element_count": 1024, + "missing_percent": 0.0, + "missing_count": 0 + } + } + ], + "success": True + } + res = data_context.register_validation_results(run_id, 
source_patient_data_results) + assert res == source_patient_data_results # results should always be returned, and in this case not modified + bound_parameters = data_context._evaluation_parameter_store.get_run_parameters(run_id) + assert bound_parameters == { + 'urn:great_expectations:validations:source_patient_data:expectations:expect_table_row_count_to_equal:result:observed_value': 1024 + } + source_diabetes_data_results = { + "meta": { + "data_asset_name": "source_diabetes_data", + "expectation_suite_name": "default" + }, + "results": [ + { + "expectation_config": { + "expectation_type": "expect_column_unique_value_count_to_be_between", + "kwargs": { + "column": "patient_nbr", + "min": 2048, + "max": 2048 + } + }, + "success": True, + "exception_info": {"exception_message": None, + "exception_traceback": None, + "raised_exception": False}, + "result": { + "observed_value": 2048, + "element_count": 5000, + "missing_percent": 0.0, + "missing_count": 0 + } + } + ], + "success": True + } + data_context.register_validation_results(run_id, source_diabetes_data_results) + bound_parameters = data_context._evaluation_parameter_store.get_run_parameters(run_id) + assert bound_parameters == { + 'urn:great_expectations:validations:source_patient_data:expectations:expect_table_row_count_to_equal:result:observed_value': 1024, + 'urn:great_expectations:validations:source_diabetes_data:expectations:expect_column_unique_value_count_to_be_between:columns:patient_nbr:result:observed_value': 2048 + } + +def test_compile(data_context): + data_context._compile() + assert data_context._compiled_parameters == { + 'raw': { + 'urn:great_expectations:validations:source_diabetes_data:expectations:expect_column_unique_value_count_to_be_between:columns:patient_nbr:result:observed_value', + 'urn:great_expectations:validations:source_patient_data:expectations:expect_table_row_count_to_equal:result:observed_value' + }, + 'data_assets': { + 'source_diabetes_data': { + 'expect_column_unique_value_count_to_be_between': { + 'columns': { + 'patient_nbr': { + 'result': { + 'urn:great_expectations:validations:source_diabetes_data:expectations:expect_column_unique_value_count_to_be_between:columns:patient_nbr:result:observed_value' + } + } + } + } + }, + 'source_patient_data': { + 'expect_table_row_count_to_equal': { + 'result': { + 'urn:great_expectations:validations:source_patient_data:expectations:expect_table_row_count_to_equal:result:observed_value' + } + } + } + } + } + +def test_normalize_data_asset_names_error(data_context): + with pytest.raises(DataContextError) as exc: + data_context._normalize_data_asset_name("this/should/never/work/because/it/is/so/long") + assert "found too many components using delimiter '/'" in exc.message + +def test_normalize_data_asset_names_delimiters(empty_data_context, filesystem_csv): + empty_data_context.add_datasource( + "my_datasource", "pandas", base_directory=str(filesystem_csv)) + data_context = empty_data_context + + data_context.data_asset_name_delimiter = '.' 
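+    # the configured delimiter controls how a single-string name is split into datasource/generator/asset parts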
+    assert data_context._normalize_data_asset_name("my_datasource.default.f1") == \
+        NormalizedDataAssetName("my_datasource", "default", "f1")
+
+    data_context.data_asset_name_delimiter = '/'
+    assert data_context._normalize_data_asset_name("my_datasource/default/f1") == \
+        NormalizedDataAssetName("my_datasource", "default", "f1")
+
+    with pytest.raises(DataContextError) as exc:
+        data_context.data_asset_name_delimiter = "$"
+    assert "Invalid delimiter" in exc.message
+
+    with pytest.raises(DataContextError) as exc:
+        data_context.data_asset_name_delimiter = "//"
+    assert "Invalid delimiter" in exc.message
+
+def test_normalize_data_asset_names_conditions(empty_data_context, filesystem_csv, tmp_path_factory):
+    # If no datasource is configured, nothing should be allowed to normalize:
+    with pytest.raises(DataContextError) as exc:
+        empty_data_context._normalize_data_asset_name("f1")
+    assert "No datasource configured" in exc.message
+
+    with pytest.raises(DataContextError) as exc:
+        empty_data_context._normalize_data_asset_name("my_datasource/f1")
+    assert "No datasource configured" in exc.message
+
+    with pytest.raises(DataContextError) as exc:
+        empty_data_context._normalize_data_asset_name("my_datasource/default/f1")
+    assert "No datasource configured" in exc.message
+
+    ###
+    # Add a datasource
+    ###
+    empty_data_context.add_datasource(
+        "my_datasource", "pandas", base_directory=str(filesystem_csv))
+    data_context = empty_data_context
+
+    # We can now reference existing or available data asset namespaces using
+    # just the data_asset_name; the datasource name and data_asset_name; or all
+    # three components of the normalized data asset name
+    assert data_context._normalize_data_asset_name("f1") == \
+        NormalizedDataAssetName("my_datasource", "default", "f1")
+
+    assert data_context._normalize_data_asset_name("my_datasource/f1") == \
+        NormalizedDataAssetName("my_datasource", "default", "f1")
+
+    assert data_context._normalize_data_asset_name("my_datasource/default/f1") == \
+        NormalizedDataAssetName("my_datasource", "default", "f1")
+
+    # With only one datasource and generator configured, we
+    # can create new namespaces at the generator asset level easily:
+    assert data_context._normalize_data_asset_name("f5") == \
+        NormalizedDataAssetName("my_datasource", "default", "f5")
+
+    # We can also be more explicit in creating new namespaces at the generator asset level:
+    assert data_context._normalize_data_asset_name("my_datasource/f6") == \
+        NormalizedDataAssetName("my_datasource", "default", "f6")
+
+    assert data_context._normalize_data_asset_name("my_datasource/default/f7") == \
+        NormalizedDataAssetName("my_datasource", "default", "f7")
+
+    # However, we cannot create against nonexistent datasources or generators:
+    with pytest.raises(DataContextError) as exc:
+        data_context._normalize_data_asset_name("my_fake_datasource/default/f7")
+    assert "no configured datasource 'my_fake_datasource' with generator 'default'" in exc.message
+
+    with pytest.raises(DataContextError) as exc:
+        data_context._normalize_data_asset_name("my_datasource/my_fake_generator/f7")
+    assert "no configured datasource 'my_datasource' with generator 'my_fake_generator'" in exc.message
+
+    ###
+    # Add a second datasource
+    ###
+
+    second_datasource_basedir = str(tmp_path_factory.mktemp("test_normalize_data_asset_names_conditions_single_name"))
+    with open(os.path.join(second_datasource_basedir, "f3.tsv"), "w") as outfile:
+        outfile.write("\n\n\n")
+    with open(os.path.join(second_datasource_basedir, "f4.tsv"), "w") as
outfile: + outfile.write("\n\n\n") + data_context.add_datasource( + "my_second_datasource", "pandas", base_directory=second_datasource_basedir) + + # We can still reference *unambiguous* data_asset_names: + assert data_context._normalize_data_asset_name("f1") == \ + NormalizedDataAssetName("my_datasource", "default", "f1") + + assert data_context._normalize_data_asset_name("f4") == \ + NormalizedDataAssetName("my_second_datasource", "default", "f4") + + # However, single-name resolution will fail with ambiguous entries + with pytest.raises(DataContextError) as exc: + data_context._normalize_data_asset_name("f3") + assert "Ambiguous data_asset_name 'f3'. Multiple candidates found" in exc.message + + # Two-name resolution still works since generators are not ambiguous in that case + assert data_context._normalize_data_asset_name("my_datasource/f3") == \ + NormalizedDataAssetName("my_datasource", "default", "f3") + + # We can also create new namespaces using only two components since that is not ambiguous + assert data_context._normalize_data_asset_name("my_datasource/f9") == \ + NormalizedDataAssetName("my_datasource", "default", "f9") + + # However, we cannot create new names using only a single component + with pytest.raises(DataContextError) as exc: + data_context._normalize_data_asset_name("f10") + assert "Ambiguous data_asset_name: no existing data_asset has the provided name" in exc.message + + ### + # Add a second generator to one datasource + ### + my_datasource = data_context.get_datasource("my_datasource") + my_datasource.add_generator("in_memory_generator", "memory") + + # We've chosen an interesting case: in_memory_generator does not by default provide its own names + # so we can still get some names if there is no ambiguity about the namespace + assert data_context._normalize_data_asset_name("f1") == \ + NormalizedDataAssetName("my_datasource", "default", "f1") + + # However, if we add a data_asset that would cause that name to be ambiguous, it will then fail: + suite = data_context.get_expectation_suite("my_datasource/in_memory_generator/f1") + data_context.save_expectation_suite(suite) + + with pytest.raises(DataContextError) as exc: + name = data_context._normalize_data_asset_name("f1") + assert "Ambiguous data_asset_name 'f1'. Multiple candidates found" in exc.message + + # It will also fail with two components since there is still ambiguity: + with pytest.raises(DataContextError) as exc: + data_context._normalize_data_asset_name("my_datasource/f1") + assert "Ambiguous data_asset_name 'f1'. 
Multiple candidates found" in exc.message
+
+    # But we can get the asset using all three components
+    assert data_context._normalize_data_asset_name("my_datasource/default/f1") == \
+        NormalizedDataAssetName("my_datasource", "default", "f1")
+
+    assert data_context._normalize_data_asset_name("my_datasource/in_memory_generator/f1") == \
+        NormalizedDataAssetName("my_datasource", "in_memory_generator", "f1")
+
+
+def test_list_datasources(data_context):
+    datasources = data_context.list_datasources()
+
+    assert datasources == [
+        {
+            "name": "mydatasource",
+            "type": "pandas"
+        }
+    ]
+
+    data_context.add_datasource("second_pandas_source", "pandas")
+
+    datasources = data_context.list_datasources()
+
+    assert datasources == [
+        {
+            "name": "mydatasource",
+            "type": "pandas"
+        },
+        {
+            "name": "second_pandas_source",
+            "type": "pandas"
+        }
+    ]
+
+def test_data_context_result_store(titanic_data_context):
+    """
+    Test that validation results can be correctly fetched from the configured results store
+    """
+    profiling_results = titanic_data_context.profile_datasource("mydatasource")
+    for profiling_result in profiling_results:
+        data_asset_name = profiling_result[1]['meta']['data_asset_name']
+        validation_result = titanic_data_context.get_validation_result(data_asset_name, "BasicDatasetProfiler")
+        assert data_asset_name in validation_result["meta"]["data_asset_name"]
diff --git a/tests/datasource/__init__.py b/tests/datasource/__init__.py
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/tests/datasource/test_batch_generators.py b/tests/datasource/test_batch_generators.py
new file mode 100644
index 000000000000..cc7a05dc6db1
--- /dev/null
+++ b/tests/datasource/test_batch_generators.py
@@ -0,0 +1,140 @@
+import pytest
+
+import os
+
+from great_expectations.exceptions import DataContextError
+from great_expectations.datasource import SubdirReaderGenerator, GlobReaderGenerator
+
+def test_file_kwargs_generator(data_context, filesystem_csv):
+    base_dir = filesystem_csv
+
+    datasource = data_context.add_datasource("default", "pandas", base_directory=str(base_dir))
+    generator = datasource.get_generator("default")
+    known_data_asset_names = datasource.get_available_data_asset_names()
+
+    assert known_data_asset_names["default"] == set([
+        "f1", "f2", "f3"
+    ])
+
+    f1_batches = [batch_kwargs for batch_kwargs in generator.get_iterator("f1")]
+    assert len(f1_batches) == 1
+    assert "timestamp" in f1_batches[0]
+    del f1_batches[0]["timestamp"]
+    assert f1_batches[0] == {
+        "path": os.path.join(base_dir, "f1.csv")
+    }
+
+    f3_batches = [batch_kwargs["path"] for batch_kwargs in generator.get_iterator("f3")]
+    expected_batches = [
+        {
+            "path": os.path.join(base_dir, "f3", "f3_20190101.csv")
+        },
+        {
+            "path": os.path.join(base_dir, "f3", "f3_20190102.csv")
+        }
+    ]
+    for batch in expected_batches:
+        assert batch["path"] in f3_batches
+    assert len(f3_batches) == 2
+
+def test_file_kwargs_generator_error(data_context, filesystem_csv):
+    base_dir = filesystem_csv
+    data_context.add_datasource("default", "pandas", base_directory=str(base_dir))
+
+    with pytest.raises(DataContextError) as exc:
+        data_context.get_batch("f4")
+    assert "f4" in exc.message
+
+def test_glob_reader_generator(tmp_path_factory):
+    """Provides an example of how the glob generator works: we specify our own
+    names for data_assets, and an associated glob; the generator
+    will take care of providing batches consisting of one file per
+    batch corresponding to the glob."""
+
+    basedir =
str(tmp_path_factory.mktemp("test_glob_reader_generator")) + + with open(os.path.join(basedir, "f1.blarg"), "w") as outfile: + outfile.write("\n\n\n") + with open(os.path.join(basedir, "f2.csv"), "w") as outfile: + outfile.write("\n\n\n") + with open(os.path.join(basedir, "f3.blarg"), "w") as outfile: + outfile.write("\n\n\n") + with open(os.path.join(basedir, "f4.blarg"), "w") as outfile: + outfile.write("\n\n\n") + with open(os.path.join(basedir, "f5.blarg"), "w") as outfile: + outfile.write("\n\n\n") + with open(os.path.join(basedir, "f6.blarg"), "w") as outfile: + outfile.write("\n\n\n") + with open(os.path.join(basedir, "f7.xls"), "w") as outfile: + outfile.write("\n\n\n") + with open(os.path.join(basedir, "f8.parquet"), "w") as outfile: + outfile.write("\n\n\n") + with open(os.path.join(basedir, "f9.xls"), "w") as outfile: + outfile.write("\n\n\n") + with open(os.path.join(basedir, "f0.json"), "w") as outfile: + outfile.write("\n\n\n") + + g2 = GlobReaderGenerator(base_directory=basedir, asset_globs={ + "blargs": "*.blarg", + "fs": "f*" + }) + + g2_assets = g2.get_available_data_asset_names() + assert g2_assets == set(["blargs", "fs"]) + + blargs_kwargs = [x["path"] for x in g2.get_iterator("blargs")] + real_blargs = [ + os.path.join(basedir, "f1.blarg"), + os.path.join(basedir, "f3.blarg"), + os.path.join(basedir, "f4.blarg"), + os.path.join(basedir, "f5.blarg"), + os.path.join(basedir, "f6.blarg") + ] + for kwargs in real_blargs: + assert kwargs in blargs_kwargs + assert len(blargs_kwargs) == len (real_blargs) + +def test_file_kwargs_generator_extensions(tmp_path_factory): + """csv, xls, parquet, json should be recognized file extensions""" + basedir = str(tmp_path_factory.mktemp("test_file_kwargs_generator_extensions")) + + # Do not include: invalid extension + with open(os.path.join(basedir, "f1.blarg"), "w") as outfile: + outfile.write("\n\n\n") + # Include + with open(os.path.join(basedir, "f2.csv"), "w") as outfile: + outfile.write("\n\n\n") + # Do not include: valid subdir, but no valid files in it + os.mkdir(os.path.join(basedir, "f3")) + with open(os.path.join(basedir, "f3", "f3_1.blarg"), "w") as outfile: + outfile.write("\n\n\n") + with open(os.path.join(basedir, "f3", "f3_2.blarg"), "w") as outfile: + outfile.write("\n\n\n") + # Include: valid subdir with valid files + os.mkdir(os.path.join(basedir, "f4")) + with open(os.path.join(basedir, "f4", "f4_1.csv"), "w") as outfile: + outfile.write("\n\n\n") + with open(os.path.join(basedir, "f4", "f4_2.csv"), "w") as outfile: + outfile.write("\n\n\n") + # Do not include: valid extension, but dot prefix + with open(os.path.join(basedir, ".f5.csv"), "w") as outfile: + outfile.write("\n\n\n") + + #Include: valid extensions + with open(os.path.join(basedir, "f6.tsv"), "w") as outfile: + outfile.write("\n\n\n") + with open(os.path.join(basedir, "f7.xls"), "w") as outfile: + outfile.write("\n\n\n") + with open(os.path.join(basedir, "f8.parquet"), "w") as outfile: + outfile.write("\n\n\n") + with open(os.path.join(basedir, "f9.xls"), "w") as outfile: + outfile.write("\n\n\n") + with open(os.path.join(basedir, "f0.json"), "w") as outfile: + outfile.write("\n\n\n") + + g1 = SubdirReaderGenerator(base_directory=basedir) + + g1_assets = g1.get_available_data_asset_names() + assert g1_assets == set([ + "f2", "f4", "f6", "f7", "f8", "f9", "f0" + ]) diff --git a/tests/datasource/test_datasources.py b/tests/datasource/test_datasources.py new file mode 100644 index 000000000000..57664568c1cb --- /dev/null +++ 
b/tests/datasource/test_datasources.py @@ -0,0 +1,285 @@ +# -*- coding: utf-8 -*- + +import pytest +from six import PY3 + +from ruamel.yaml import YAML +yaml = YAML(typ='safe') +import os +import shutil + +import pandas as pd +import sqlalchemy as sa + +from great_expectations.exceptions import BatchKwargsError +from great_expectations.data_context import DataContext +from great_expectations.datasource import PandasDatasource, SqlAlchemyDatasource, SparkDFDatasource +from great_expectations.dataset import PandasDataset, SqlAlchemyDataset, SparkDFDataset + +@pytest.fixture(scope="module") +def test_folder_connection_path(tmp_path_factory): + df1 = pd.DataFrame( + {'col_1': [1, 2, 3, 4, 5], 'col_2': ['a', 'b', 'c', 'd', 'e']}) + path = str(tmp_path_factory.mktemp("test_folder_connection_path")) + df1.to_csv(os.path.join(path, "test.csv")) + + return str(path) + +@pytest.fixture(scope="module") +def test_db_connection_string(tmp_path_factory): + df1 = pd.DataFrame( + {'col_1': [1, 2, 3, 4, 5], 'col_2': ['a', 'b', 'c', 'd', 'e']}) + df2 = pd.DataFrame( + {'col_1': [0, 1, 2, 3, 4], 'col_2': ['b', 'c', 'd', 'e', 'f']}) + + basepath = str(tmp_path_factory.mktemp("db_context")) + path = os.path.join(basepath, "test.db") + engine = sa.create_engine('sqlite:///' + str(path)) + df1.to_sql('table_1', con=engine, index=True) + df2.to_sql('table_2', con=engine, index=True, schema='main') + + # Return a connection string to this newly-created db + return 'sqlite:///' + str(path) + + +@pytest.fixture(scope="module") +def test_parquet_folder_connection_path(tmp_path_factory): + df1 = pd.DataFrame( + {'col_1': [1, 2, 3, 4, 5], 'col_2': ['a', 'b', 'c', 'd', 'e']}) + basepath = str(tmp_path_factory.mktemp("parquet_context")) + df1.to_parquet(os.path.join(basepath, "test.parquet")) + + return basepath + +def test_create_pandas_datasource(data_context, tmp_path_factory): + basedir = tmp_path_factory.mktemp('test_create_pandas_datasource') + name = "test_pandas_datasource" + type_ = "pandas" + + data_context.add_datasource(name, type_, base_directory=str(basedir)) + data_context_config = data_context.get_config() + + assert name in data_context_config["datasources"] + assert data_context_config["datasources"][name]["type"] == type_ + + # We should now see updated configs + # Finally, we should be able to confirm that the folder structure is as expected + with open(os.path.join(data_context.root_directory, "great_expectations.yml"), "r") as data_context_config_file: + data_context_file_config = yaml.load(data_context_config_file) + + assert data_context_file_config["datasources"][name] == data_context_config["datasources"][name] + +def test_standalone_pandas_datasource(test_folder_connection_path): + datasource = PandasDatasource('PandasCSV', base_directory=test_folder_connection_path) + + assert datasource.get_available_data_asset_names() == {"default": {"test"}} + manual_batch_kwargs = datasource.build_batch_kwargs(os.path.join(str(test_folder_connection_path), "test.csv")) + + # Get the default (subdir_path) generator + generator = datasource.get_generator() + auto_batch_kwargs = generator.yield_batch_kwargs("test") + + assert manual_batch_kwargs["path"] == auto_batch_kwargs["path"] + + # Include some extra kwargs... 
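+    # reader options like sep, header, and index_col are passed through to pandas.read_csv when the batch is read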
+ dataset = datasource.get_batch("test", batch_kwargs=auto_batch_kwargs, sep=",", header=0, index_col=0) + assert isinstance(dataset, PandasDataset) + assert (dataset["col_1"] == [1, 2, 3, 4, 5]).all() + +def test_standalone_sqlalchemy_datasource(test_db_connection_string): + datasource = SqlAlchemyDatasource( + 'SqlAlchemy', connection_string=test_db_connection_string, echo=False) + + assert datasource.get_available_data_asset_names() == {"default": {"table_1", "table_2"}} + dataset1 = datasource.get_batch("table_1") + dataset2 = datasource.get_batch("table_2", schema='main') + assert isinstance(dataset1, SqlAlchemyDataset) + assert isinstance(dataset2, SqlAlchemyDataset) + +def test_create_sqlalchemy_datasource(data_context): + name = "test_sqlalchemy_datasource" + type_ = "sqlalchemy" + connection_kwargs = { + "drivername": "postgresql", + "username": "user", + "password": "pass", + "host": "host", + "port": 1234, + "database": "db", + } + + # It should be possible to create a sqlalchemy source using these params without + # saving a profile + data_context.add_datasource(name, type_, **connection_kwargs) + data_context_config = data_context.get_config() + assert name in data_context_config["datasources"] + assert data_context_config["datasources"][name]["type"] == type_ + + # We should be able to get it in this session even without saving the config + source = data_context.get_datasource(name) + assert isinstance(source, SqlAlchemyDatasource) + + profile_name = "test_sqlalchemy_datasource" + data_context.add_profile_credentials(profile_name, **connection_kwargs) + + # But we should be able to add a source using a profile + name = "second_source" + data_context.add_datasource(name, type_, profile="test_sqlalchemy_datasource") + + data_context_config = data_context.get_config() + assert name in data_context_config["datasources"] + assert data_context_config["datasources"][name]["type"] == type_ + assert data_context_config["datasources"][name]["profile"] == profile_name + + source = data_context.get_datasource(name) + assert isinstance(source, SqlAlchemyDatasource) + + # Finally, we should be able to confirm that the folder structure is as expected + with open(os.path.join(data_context.root_directory, "uncommitted/credentials/profiles.yml"), "r") as profiles_file: + profiles = yaml.load(profiles_file) + + assert profiles == { + profile_name: dict(**connection_kwargs) + } + +def test_create_sparkdf_datasource(data_context, tmp_path_factory): + base_dir = tmp_path_factory.mktemp('test_create_sparkdf_datasource') + name = "test_sparkdf_datasource" + type_ = "spark" + + data_context.add_datasource(name, type_, base_directory=str(base_dir)) + data_context_config = data_context.get_config() + + assert name in data_context_config["datasources"] + assert data_context_config["datasources"][name]["type"] == type_ + assert data_context_config["datasources"][name]["generators"]["default"]["base_directory"] == str(base_dir) + + base_dir = tmp_path_factory.mktemp('test_create_sparkdf_datasource-2') + name = "test_sparkdf_datasource" + type_ = "spark" + + data_context.add_datasource(name, type_, reader_options={"sep": "|", "header": False}) + data_context_config = data_context.get_config() + + assert name in data_context_config["datasources"] + assert data_context_config["datasources"][name]["type"] == type_ + assert data_context_config["datasources"][name]["generators"]["default"]["reader_options"]["sep"] == "|" + + # Note that pipe is special in yml, so let's also check to see that it was properly 
serialized
+    with open(os.path.join(data_context.root_directory, "great_expectations.yml"), "r") as configfile:
+        lines = configfile.readlines()
+        assert " sep: '|'\n" in lines
+        assert " header: false\n" in lines
+
+
+def test_sqlalchemysource_templating(sqlitedb_engine):
+    datasource = SqlAlchemyDatasource(engine=sqlitedb_engine)
+    generator = datasource.get_generator()
+    generator.add_query("test", "select 'cat' as ${col_name};")
+    df = datasource.get_batch("test", col_name="animal_name")
+    res = df.expect_column_to_exist("animal_name")
+    assert res["success"] == True
+
+
+def test_pandas_source_readcsv(data_context, tmp_path_factory):
+    if not PY3:
+        # We don't specifically test py2 unicode reading since this test is about our handling of kwargs *to* read_csv
+        pytest.skip()
+    basedir = tmp_path_factory.mktemp('test_create_pandas_datasource')
+    shutil.copy("./tests/test_sets/unicode.csv", basedir)
+    data_context.add_datasource(name="mysource", type_="pandas", reader_options={"encoding": "utf-8"}, base_directory=str(basedir))
+
+    batch = data_context.get_batch("mysource/unicode")
+    assert len(batch["Μ"]) == 1
+    assert "😁" in list(batch["Μ"])
+
+    data_context.add_datasource(name="mysource2", type_="pandas", base_directory=str(basedir))
+    batch = data_context.get_batch("mysource2/unicode")
+    assert "😁" in list(batch["Μ"])
+
+    data_context.add_datasource(name="mysource3", type_="pandas", reader_options={"encoding": "utf-16"}, base_directory=str(basedir))
+    with pytest.raises(UnicodeError, match="UTF-16 stream does not start with BOM"):
+        batch = data_context.get_batch("mysource3/unicode")
+
+    with pytest.raises(LookupError, match="unknown encoding: blarg"):
+        batch = data_context.get_batch("mysource/unicode", encoding='blarg')
+
+    batch = data_context.get_batch("mysource2/unicode", encoding='utf-8')
+    assert "😁" in list(batch["Μ"])
+
+
+def test_standalone_spark_parquet_datasource(test_parquet_folder_connection_path):
+    datasource = SparkDFDatasource('SparkParquet', base_directory=test_parquet_folder_connection_path)
+
+    assert datasource.get_available_data_asset_names() == {
+        "default": set(['test'])
+    }
+    dataset = datasource.get_batch('test')
+    assert isinstance(dataset, SparkDFDataset)
+    # NOTE: below is a great example of CSV vs. Parquet typing: pandas reads content as string, spark as int
+    assert dataset.spark_df.head()['col_1'] == 1
+
+def test_standalone_spark_csv_datasource(test_folder_connection_path):
+    datasource = SparkDFDatasource('SparkParquet', base_directory=test_folder_connection_path)
+    assert datasource.get_available_data_asset_names() == {
+        "default": set(['test'])
+    }
+    dataset = datasource.get_batch('test', header=True)
+    assert isinstance(dataset, SparkDFDataset)
+    # NOTE: below is a great example of CSV vs.
+
+def test_invalid_reader_sparkdf_datasource(tmp_path_factory):
+ basepath = str(tmp_path_factory.mktemp("test_invalid_reader_sparkdf_datasource"))
+ datasource = SparkDFDatasource('mysparksource', base_directory=basepath)
+
+ with open(os.path.join(basepath, "idonotlooklikeacsvbutiam.notrecognized"), "w") as newfile:
+ newfile.write("a,b\n1,2\n3,4\n")
+
+ with pytest.raises(BatchKwargsError) as exc:
+ datasource.get_batch("idonotlooklikeacsvbutiam.notrecognized", expectation_suite_name="default", batch_kwargs={
+ "path": os.path.join(basepath, "idonotlooklikeacsvbutiam.notrecognized")
+ })
+ assert "Unable to determine reader for path" in exc.message
+
+ with pytest.raises(BatchKwargsError) as exc:
+ datasource.get_batch("idonotlooklikeacsvbutiam.notrecognized", expectation_suite_name="default", batch_kwargs={
+ "path": os.path.join(basepath, "idonotlooklikeacsvbutiam.notrecognized")
+ }, reader_method="blarg")
+ assert "Unknown reader method: blarg" in exc.message
+
+ with pytest.raises(BatchKwargsError) as exc:
+ datasource.get_batch("idonotlooklikeacsvbutiam.notrecognized", expectation_suite_name="default", batch_kwargs={
+ "path": os.path.join(basepath, "idonotlooklikeacsvbutiam.notrecognized")
+ }, reader_method="excel")
+ assert "Unsupported reader: excel" in exc.message
+
+ dataset = datasource.get_batch("idonotlooklikeacsvbutiam.notrecognized", expectation_suite_name="default", batch_kwargs={
+ "path": os.path.join(basepath, "idonotlooklikeacsvbutiam.notrecognized")
+ },
+ reader_method="csv", header=True)
+ assert dataset.spark_df.head()["a"] == "1"
+
+def test_invalid_reader_pandas_datasource(tmp_path_factory):
+ basepath = str(tmp_path_factory.mktemp("test_invalid_reader_pandas_datasource"))
+ datasource = PandasDatasource('mypandassource', base_directory=basepath)
+
+ with open(os.path.join(basepath, "idonotlooklikeacsvbutiam.notrecognized"), "w") as newfile:
+ newfile.write("a,b\n1,2\n3,4\n")
+
+ with pytest.raises(BatchKwargsError) as exc:
+ datasource.get_batch("idonotlooklikeacsvbutiam.notrecognized", expectation_suite_name="default", batch_kwargs={
+ "path": os.path.join(basepath, "idonotlooklikeacsvbutiam.notrecognized")
+ })
+ assert "Unable to determine reader for path" in exc.message
+
+ with pytest.raises(BatchKwargsError) as exc:
+ datasource.get_batch("idonotlooklikeacsvbutiam.notrecognized", expectation_suite_name="default", batch_kwargs={
+ "path": os.path.join(basepath, "idonotlooklikeacsvbutiam.notrecognized")
+ }, reader_method="blarg")
+ assert "Unknown reader method: blarg" in exc.message
+
+ dataset = datasource.get_batch("idonotlooklikeacsvbutiam.notrecognized", expectation_suite_name="default", batch_kwargs={
+ "path": os.path.join(basepath, "idonotlooklikeacsvbutiam.notrecognized")
+ }, reader_method="csv", header=0)
+ assert dataset["a"][0] == 1
\ No newline at end of file
diff --git a/tests/sqlalchemy_dataset/test_sqlalchemydataset.py b/tests/sqlalchemy_dataset/test_sqlalchemydataset.py
index 96bf447788ee..e3cddc63949b 100644
--- a/tests/sqlalchemy_dataset/test_sqlalchemydataset.py
+++ b/tests/sqlalchemy_dataset/test_sqlalchemydataset.py
@@ -1,3 +1,7 @@
+try:
+ from unittest import mock
+except ImportError:
+ import mock
 import pytest
 from great_expectations.dataset import MetaSqlAlchemyDataset, SqlAlchemyDataset
@@ -109,6 +113,21 @@ def test_schema_custom_sql_error():
 assert "Cannot specify both schema and custom_sql." in str(err)
+def test_sqlalchemydataset_raises_error_on_missing_table_name():
+ with pytest.raises(ValueError) as ve:
+ SqlAlchemyDataset(table_name=None, engine="foo", connection_string='bar')
+ assert str(ve.value) == "No table_name provided."
+
+
+def test_sqlalchemydataset_builds_guid_for_table_name_on_custom_sql():
+ engine = sa.create_engine('sqlite://')
+ with mock.patch("uuid.uuid4") as mock_uuid:
+ mock_uuid.return_value = "a-guid-with-dashes-that-will-break-sql"
+
+ dataset = SqlAlchemyDataset(engine=engine, custom_sql="select 1")
+ assert dataset._table.name == "a_guid_with_dashes_that_will_break_sql"
+
+
 def test_sqlalchemydataset_with_custom_sql():
 engine = sa.create_engine('sqlite://')
@@ -164,7 +183,7 @@ def test_column_fallback():
 @pytest.fixture
 def unexpected_count_df():
- return get_dataset("SqlAlchemyDataset", {"a": [1, 2, 1, 2, 1, 2, 1, 2, 1, 2]})
+ return get_dataset("sqlite", {"a": [1, 2, 1, 2, 1, 2, 1, 2, 1, 2]})
 def test_sqlalchemy_dataset_unexpected_count_calculations(unexpected_count_df):
diff --git a/tests/test_autoinspect.py b/tests/test_autoinspect.py
index b9dfb81ba1b1..05df3561bdbf 100644
--- a/tests/test_autoinspect.py
+++ b/tests/test_autoinspect.py
@@ -4,52 +4,52 @@
 import pytest
 from .test_utils import get_dataset
+from .conftest import CONTEXTS
 import great_expectations as ge
-import great_expectations.dataset.autoinspect as autoinspect
 def test_no_autoinspection():
- df = ge.dataset.PandasDataset({"a": [1, 2, 3]}, autoinspect_func=None)
- config = df.get_expectations_config()
+ df = ge.dataset.PandasDataset({"a": [1, 2, 3]}, profiler=None)
+ suite = df.get_expectation_suite()
- assert len(config["expectations"]) == 0
+ assert len(suite["expectations"]) == 0
 def test_default_no_autoinspection():
 df = ge.dataset.PandasDataset({"a": [1, 2, 3]})
- config = df.get_expectations_config()
+ suite = df.get_expectation_suite()
- assert len(config["expectations"]) == 0
+ assert len(suite["expectations"]) == 0
-@pytest.mark.parametrize("dataset_type", ["PandasDataset", "SqlAlchemyDataset"])
+@pytest.mark.parametrize("dataset_type", CONTEXTS)
 def test_autoinspect_existing_dataset(dataset_type):
 # Get a basic dataset with no expectations
- df = get_dataset(dataset_type, {"a": [1, 2, 3]}, autoinspect_func=None)
- config = df.get_expectations_config()
- assert len(config["expectations"]) == 0
+ df = get_dataset(dataset_type, {"a": [1, 2, 3]}, profiler=None)
+ suite = df.get_expectation_suite()
+ assert len(suite["expectations"]) == 0
 # Run autoinspect
- df.autoinspect(autoinspect.columns_exist)
- config = df.get_expectations_config()
+ df.profile(ge.profile.ColumnsExistProfiler)
+ suite = df.get_expectation_suite()
 # Ensure that autoinspect worked
- assert config["expectations"] == \
+ assert suite["expectations"] == \
 [{'expectation_type': 'expect_column_to_exist',
 'kwargs': {'column': 'a'}}]
-@pytest.mark.parametrize("dataset_type", ["PandasDataset", "SqlAlchemyDataset"])
+@pytest.mark.parametrize("dataset_type", CONTEXTS)
 def test_autoinspect_columns_exist(dataset_type):
 df = get_dataset(
- dataset_type, {"a": [1, 2, 3]}, autoinspect_func=autoinspect.columns_exist)
- config = df.get_expectations_config()
+ dataset_type, {"a": [1, 2, 3]}, profiler=ge.profile.ColumnsExistProfiler)
+ suite = df.get_expectation_suite()
- assert len(config["expectations"]) == 1
- assert config["expectations"] == \
+ assert len(suite["expectations"]) == 1
+ assert suite["expectations"] == \
 [{'expectation_type': 'expect_column_to_exist',
 'kwargs': {'column': 'a'}}]
 def test_autoinspect_warning():
 with pytest.raises(NotImplementedError):
- ge.dataset.Dataset(autoinspect_func=autoinspect.columns_exist)
+ ge.dataset.Dataset(profiler=ge.profile.ColumnsExistProfiler)
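Taken together, the hunks above retire the old autoinspect module in favor of the profiler API. A minimal sketch of the new flow, restricted to calls that appear verbatim in this patch:

    import great_expectations as ge

    # Pass a profiler at construction time to have expectations added immediately:
    df = ge.dataset.PandasDataset({"a": [1, 2, 3]}, profiler=ge.profile.ColumnsExistProfiler)
    suite = df.get_expectation_suite()
    assert suite["expectations"] == [
        {"expectation_type": "expect_column_to_exist", "kwargs": {"column": "a"}}
    ]
    # An existing dataset can be profiled after the fact instead:
    # df.profile(ge.profile.ColumnsExistProfiler)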
diff --git a/tests/test_cli.py b/tests/test_cli.py
index 22109b62ded1..e12721b1beac 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -1,116 +1,151 @@
-import json
-import pytest
+# Our cli produces unicode output, so make the tests unicode-safe under python2 as well
+from __future__ import unicode_literals
-import great_expectations.cli
+from click.testing import CliRunner
 import great_expectations.version
+from great_expectations.cli import cli
+import tempfile
+import pytest
+import json
+import os
+import shutil
+import logging
+import sys
+from ruamel.yaml import YAML
+yaml = YAML()
+yaml.default_flow_style = False
+from datetime import datetime
+try:
+ from unittest import mock
+except ImportError:
+ import mock
-def test_cli_command_error(capsys):
- with pytest.raises(SystemExit) as pytest_wrapped_e:
- great_expectations.cli.dispatch([])
- out, err = capsys.readouterr()
- assert pytest_wrapped_e.type == SystemExit
- assert out == ''
- assert ('error: the following arguments are required: command' in err) or (
- 'error: too few arguments' in err)
+def test_cli_command_entrance():
+ runner = CliRunner()
+ result = runner.invoke(cli)
+ assert result.exit_code == 0
+ assert result.output == """Usage: cli [OPTIONS] COMMAND [ARGS]...
-def test_cli_validate_help(capsys):
- with pytest.raises(SystemExit) as pytest_wrapped_e:
- great_expectations.cli.dispatch((["validate", "--help"]))
+ great_expectations command-line interface
- assert pytest_wrapped_e.value.code == 0
- expected_help_message = """
- Validate expectations for your dataset.
-
- positional arguments:
- dataset Path to a file containing a CSV file to validate using
- the provided expectations_config_file.
- expectations_config_file
- Path to a file containing a valid great_expectations
- expectations config to use to validate the data.
-
- optional arguments:
- -h, --help show this help message and exit
- --evaluation_parameters EVALUATION_PARAMETERS, -p EVALUATION_PARAMETERS
- Path to a file containing JSON object used to evaluate
- parameters in expectations config.
- --result_format RESULT_FORMAT, -o RESULT_FORMAT
- Result format to use when building evaluation
- responses.
- --catch_exceptions CATCH_EXCEPTIONS, -e CATCH_EXCEPTIONS
- Specify whether to catch exceptions raised during
- evaluation of expectations (defaults to True).
- --only_return_failures ONLY_RETURN_FAILURES, -f ONLY_RETURN_FAILURES
- Specify whether to only return expectations that are
- not met during evaluation (defaults to False).
-
- custom_dataset:
- Arguments defining a custom dataset to use for validation.
-
- --custom_dataset_module CUSTOM_DATASET_MODULE, -m CUSTOM_DATASET_MODULE
- Path to a python module containing a custom dataset
- class.
- --custom_dataset_class CUSTOM_DATASET_CLASS, -c CUSTOM_DATASET_CLASS
- Name of the custom dataset class to use during
- evaluation.""".replace("\n ", "\n")
- out, err = capsys.readouterr()
- assert expected_help_message in out
+Options:
+ --version Show the version and exit.
+ --help Show this message and exit.
+Commands:
+ init Initialize a new Great Expectations project.
+ profile Profile a great expectations object.
+ render Render a great expectations object.
+ validate Validate a CSV file against an expectation suite.
+""" -def test_cli_validate_missing_positional_arguments(capsys): - with pytest.raises(SystemExit) as pytest_wrapped_e: - great_expectations.cli.dispatch(["validate"]) - out, err = capsys.readouterr() +def test_cli_command_bad_command(): + runner = CliRunner() - assert pytest_wrapped_e.type == SystemExit - assert out == '' - assert ('validate: error: the following arguments are required: dataset, expectations_config_file' in err) or \ - ('error: too few arguments' in err) - assert '[--evaluation_parameters EVALUATION_PARAMETERS]' in err - assert '[--result_format RESULT_FORMAT]' in err - assert '[--catch_exceptions CATCH_EXCEPTIONS]' in err - assert '[--only_return_failures ONLY_RETURN_FAILURES]' in err - assert '[--custom_dataset_module CUSTOM_DATASET_MODULE]' in err - assert '[--custom_dataset_class CUSTOM_DATASET_CLASS]' in err + result = runner.invoke(cli, ["blarg"]) + assert result.exit_code == 2 + assert result.output == """Usage: cli [OPTIONS] COMMAND [ARGS]... +Try "cli --help" for help. +Error: No such command "blarg". +""" -def test_cli_version(capsys): - great_expectations.cli.dispatch(["version"]) - out, err = capsys.readouterr() - assert out == great_expectations.version.__version__ + '\n' - assert err == '' +def test_cli_validate_help(): + runner = CliRunner() + result = runner.invoke(cli, ["validate", "--help"]) -def test_validate_basic_operation(capsys): - with pytest.warns(UserWarning, match="No great_expectations version found in configuration object."): - return_value = great_expectations.cli.dispatch(["validate", - "./tests/test_sets/Titanic.csv", - "./tests/test_sets/titanic_expectations.json"]) + assert result.exit_code == 0 + expected_help_message = """Usage: cli validate [OPTIONS] DATASET EXPECTATION_SUITE_FILE + + Validate a CSV file against an expectation suite. + + DATASET: Path to a file containing a CSV file to validate using the + provided expectation_suite_file. + + EXPECTATION_SUITE_FILE: Path to a file containing a valid + great_expectations expectations suite to use to validate the data. + +Options: + -p, --evaluation_parameters TEXT + Path to a file containing JSON object used + to evaluate parameters in expectations + config. + -o, --result_format TEXT Result format to use when building + evaluation responses. + -e, --catch_exceptions BOOLEAN Specify whether to catch exceptions raised + during evaluation of expectations (defaults + to True). + -f, --only_return_failures BOOLEAN + Specify whether to only return expectations + that are not met during evaluation + (defaults to False). + -m, --custom_dataset_module TEXT + Path to a python module containing a custom + dataset class. + -c, --custom_dataset_class TEXT + Name of the custom dataset class to use + during evaluation. + --help Show this message and exit. +""".replace(" ", "").replace("\t", "").replace("\n", "") + output = str(result.output).replace( + " ", "").replace("\t", "").replace("\n", "") + assert output == expected_help_message + + +def test_cli_validate_missing_positional_arguments(): + runner = CliRunner() + + result = runner.invoke(cli, ["validate"]) + + assert "Error: Missing argument \"DATASET\"." 
+
+
+def test_cli_version():
+ runner = CliRunner()
+
+ result = runner.invoke(cli, ["--version"])
+ assert great_expectations.version.__version__ in str(result.output)
+
+
+def test_validate_basic_operation():
+ with mock.patch("datetime.datetime") as mock_datetime:
+ mock_datetime.utcnow.return_value = datetime(1955, 11, 5)
+ runner = CliRunner()
+ with pytest.warns(UserWarning, match="No great_expectations version found in configuration object."):
+ result = runner.invoke(cli, ["validate", "./tests/test_sets/Titanic.csv",
+ "./tests/test_sets/titanic_expectations.json"])
- out, err = capsys.readouterr()
- json_result = json.loads(out)
+ assert result.exit_code == 1
+ json_result = json.loads(str(result.output))
+
+ del json_result["meta"]["great_expectations.__version__"]
 with open('./tests/test_sets/expected_cli_results_default.json', 'r') as f:
 expected_cli_results = json.load(f)
 assert json_result == expected_cli_results
- assert return_value == expected_cli_results['statistics']['unsuccessful_expectations']
-def test_validate_custom_dataset(capsys):
- with pytest.warns(UserWarning, match="No great_expectations version found in configuration object."):
- great_expectations.cli.dispatch(["validate",
+def test_validate_custom_dataset():
+ with mock.patch("datetime.datetime") as mock_datetime:
+ mock_datetime.utcnow.return_value = datetime(1955, 11, 5)
+ runner = CliRunner()
+ with pytest.warns(UserWarning, match="No great_expectations version found in configuration object."):
+ result = runner.invoke(cli, ["validate",
 "./tests/test_sets/Titanic.csv",
 "./tests/test_sets/titanic_custom_expectations.json",
 "-f", "True",
 "-m", "./tests/test_fixtures/custom_dataset.py",
 "-c", "CustomPandasDataset"])
- out, err = capsys.readouterr()
- json_result = json.loads(out)
+ json_result = json.loads(result.output)
+
+ del json_result["meta"]["great_expectations.__version__"]
 del json_result["results"][0]["result"]['partial_unexpected_counts']
 with open('./tests/test_sets/expected_cli_results_custom.json', 'r') as f:
 expected_cli_results = json.load(f)
@@ -120,16 +155,80 @@ def test_validate_custom_dataset(capsys):
 def test_cli_evaluation_parameters(capsys):
 with pytest.warns(UserWarning, match="No great_expectations version found in configuration object."):
- great_expectations.cli.dispatch(["validate",
- "./tests/test_sets/Titanic.csv",
- "./tests/test_sets/titanic_parameterized_expectations.json",
- "--evaluation_parameters",
- "./tests/test_sets/titanic_evaluation_parameters.json",
- "-f", "True"])
+ runner = CliRunner()
+ result = runner.invoke(cli, ["validate",
+ "./tests/test_sets/Titanic.csv",
+ "./tests/test_sets/titanic_parameterized_expectations.json",
+ "--evaluation_parameters",
+ "./tests/test_sets/titanic_evaluation_parameters.json",
+ "-f", "True"])
+ json_result = json.loads(result.output)
- out, err = capsys.readouterr()
 with open('./tests/test_sets/titanic_evaluation_parameters.json', 'r') as f:
 expected_evaluation_parameters = json.load(f)
- json_result = json.loads(out)
 assert json_result['evaluation_parameters'] == expected_evaluation_parameters
+
+
+def test_cli_init(tmp_path_factory):
+ basedir = tmp_path_factory.mktemp("test_cli_init_diff")
+ basedir = str(basedir)
+ os.makedirs(os.path.join(basedir, "data"))
+ curdir = os.path.abspath(os.getcwd())
+ os.chdir(basedir)
+
+ runner = CliRunner()
+ result = runner.invoke(cli, ["init"], input="Y\n1\n%s\n\n" % str(
+ os.path.join(basedir, "data")))
+
+ print(result.output)
+
+ assert """Always know what to 
expect from your data.""" in result.output + + assert os.path.isdir(os.path.join(basedir, "great_expectations")) + assert os.path.isfile(os.path.join( + basedir, "great_expectations/great_expectations.yml")) + config = yaml.load( + open(os.path.join(basedir, "great_expectations/great_expectations.yml"), "r")) + assert config["datasources"]["data__dir"]["type"] == "pandas" + + os.chdir(curdir) + + # assert False + + +# def test_cli_render(tmp_path_factory): +# runner = CliRunner() +# result = runner.invoke(cli, ["render"]) + +# print(result) +# print(result.output) +# assert False + + +def test_cli_profile(empty_data_context, filesystem_csv_2, capsys): + empty_data_context.add_datasource( + "my_datasource", "pandas", base_directory=str(filesystem_csv_2)) + not_so_empty_data_context = empty_data_context + + project_root_dir = not_so_empty_data_context.root_directory + # print(project_root_dir) + + # For some reason, even with this logging change (which is required and done in main of the cli) + # the click cli runner does not pick up output; capsys appears to intercept it first + logger = logging.getLogger("great_expectations") + handler = logging.StreamHandler(stream=sys.stdout) + formatter = logging.Formatter( + '%(levelname)s %(message)s') + handler.setFormatter(formatter) + logger.addHandler(handler) + logger.setLevel(logging.INFO) + runner = CliRunner() + result = runner.invoke( + cli, ["profile", "my_datasource", "-d", project_root_dir]) + + captured = capsys.readouterr() + + assert "Profiling 'my_datasource' with 'BasicDatasetProfiler'" in captured.out + assert "Note: You will need to review and revise Expectations before using them in production." in captured.out + logger.removeHandler(handler) \ No newline at end of file diff --git a/tests/test_data_asset.py b/tests/test_data_asset.py index 28282de9fd96..386b97ad3ca8 100644 --- a/tests/test_data_asset.py +++ b/tests/test_data_asset.py @@ -7,7 +7,6 @@ import pandas as pd import numpy as np import great_expectations as ge -import great_expectations.dataset.autoinspect as autoinspect import unittest @@ -25,13 +24,14 @@ def test_data_asset(self): 'z': ['hello', 'jello', 'mello'], }) - # print D._expectations_config.keys() - # print json.dumps(D._expectations_config, indent=2) + # print D._expectation_suite.keys() + # print json.dumps(D._expectation_suite, indent=2) self.assertEqual( - D._expectations_config, + D._expectation_suite, { "data_asset_name": None, + "expectation_suite_name": "default", "data_asset_type": "Dataset", "meta": { "great_expectations.__version__": ge.__version__ @@ -42,9 +42,10 @@ def test_data_asset(self): self.maxDiff = None self.assertEqual( - D.get_expectations_config(), + D.get_expectation_suite(), { "data_asset_name": None, + "expectation_suite_name": "default", "data_asset_type": "Dataset", "meta": { "great_expectations.__version__": ge.__version__ @@ -64,7 +65,7 @@ def test_expectation_meta(self): 'x', 2, 2, meta={"notes": "This expectation is for lolz."}) k = 0 self.assertEqual(result['success'], True) - config = df.get_expectations_config() + config = df.get_expectation_suite() for expectation_config in config['expectations']: if expectation_config['expectation_type'] == 'expect_column_median_to_be_between': k += 1 @@ -166,6 +167,7 @@ def test_get_and_save_expectation_config(self): } ], "data_asset_name": None, + "expectation_suite_name": "default", "data_asset_type": "Dataset", "meta": { "great_expectations.__version__": ge.__version__ @@ -173,11 +175,11 @@ def test_get_and_save_expectation_config(self): 
} self.assertEqual( - df.get_expectations_config(), + df.get_expectation_suite(), output_config, ) - df.save_expectations_config(directory_name+'/temp1.json') + df.save_expectation_suite(directory_name + '/temp1.json') temp_file = open(directory_name+'/temp1.json') self.assertEqual( json.load(temp_file), @@ -239,6 +241,7 @@ def test_get_and_save_expectation_config(self): } ], "data_asset_name": None, + "expectation_suite_name": "default", "data_asset_type": "Dataset", "meta": { "great_expectations.__version__": ge.__version__ @@ -246,13 +249,13 @@ def test_get_and_save_expectation_config(self): } self.assertEqual( - df.get_expectations_config( + df.get_expectation_suite( discard_failed_expectations=False ), output_config ) - df.save_expectations_config( + df.save_expectation_suite( directory_name+'/temp2.json', discard_failed_expectations=False ) @@ -312,13 +315,14 @@ def test_get_and_save_expectation_config(self): ], "data_asset_name": None, "data_asset_type": "Dataset", + "expectation_suite_name": "default", "meta": { "great_expectations.__version__": ge.__version__ } } self.assertEqual( - df.get_expectations_config( + df.get_expectation_suite( discard_result_format_kwargs=False, discard_include_configs_kwargs=False, discard_catch_exceptions_kwargs=False, @@ -327,7 +331,7 @@ def test_get_and_save_expectation_config(self): msg="Second Test Set" ) - df.save_expectations_config( + df.save_expectation_suite( directory_name+'/temp3.json', discard_result_format_kwargs=False, discard_include_configs_kwargs=False, @@ -738,7 +742,7 @@ def test_find_expectations(self): 'x': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 'y': [1, 2, None, 4, None, 6, 7, 8, 9, None], 'z': ['cello', 'hello', 'jello', 'bellow', 'fellow', 'mellow', 'wellow', 'xello', 'yellow', 'zello'], - }, autoinspect_func=autoinspect.columns_exist) + }, profiler=ge.profile.ColumnsExistProfiler) my_df.expect_column_values_to_be_of_type('x', 'int') my_df.expect_column_values_to_be_of_type('y', 'int') my_df.expect_column_values_to_be_of_type('z', 'int') @@ -827,7 +831,7 @@ def test_remove_expectation(self): 'x': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 'y': [1, 2, None, 4, None, 6, 7, 8, 9, None], 'z': ['cello', 'hello', 'jello', 'bellow', 'fellow', 'mellow', 'wellow', 'xello', 'yellow', 'zello'], - }, autoinspect_func=autoinspect.columns_exist) + }, profiler=ge.profile.ColumnsExistProfiler) my_df.expect_column_values_to_be_of_type('x', 'int') my_df.expect_column_values_to_be_of_type('y', 'int') my_df.expect_column_values_to_be_of_type( @@ -934,7 +938,7 @@ def test_remove_expectation(self): ) self.assertEqual( - len(my_df._expectations_config.expectations), + len(my_df._expectation_suite.expectations), 8 ) @@ -943,7 +947,7 @@ def test_remove_expectation(self): None ) self.assertEqual( - len(my_df._expectations_config.expectations), + len(my_df._expectation_suite.expectations), 7 ) @@ -952,18 +956,18 @@ def test_remove_expectation(self): None ) self.assertEqual( - len(my_df._expectations_config.expectations), + len(my_df._expectation_suite.expectations), 5 ) my_df.remove_expectation(column="z", remove_multiple_matches=True), self.assertEqual( - len(my_df._expectations_config.expectations), + len(my_df._expectation_suite.expectations), 2 ) self.assertEqual( - my_df.get_expectations_config(discard_failed_expectations=False), + my_df.get_expectation_suite(discard_failed_expectations=False), { 'expectations': [ { @@ -976,6 +980,7 @@ def test_remove_expectation(self): } ], 'data_asset_name': None, + "expectation_suite_name": "default", "data_asset_type": 
"Dataset", "meta": { "great_expectations.__version__": ge.__version__ @@ -989,7 +994,7 @@ def test_discard_failing_expectations(self): 'B': [5, 6, 7, 8], 'C': ['a', 'b', 'c', 'd'], 'D': ['e', 'f', 'g', 'h'] - }, autoinspect_func=autoinspect.columns_exist) + }, profiler=ge.profile.ColumnsExistProfiler) # Put some simple expectations on the data frame df.expect_column_values_to_be_in_set("A", [1, 2, 3, 4]) @@ -1186,13 +1191,13 @@ def test_meta_version_warning(self): with warnings.catch_warnings(record=True) as w: warnings.simplefilter("always") - out = D.validate(expectations_config={"expectations": []}) + out = D.validate(expectation_suite={"expectations": []}) self.assertEqual(str(w[0].message), "WARNING: No great_expectations version found in configuration object.") with warnings.catch_warnings(record=True) as w: warnings.simplefilter("always") - out = D.validate(expectations_config={ + out = D.validate(expectation_suite={ "meta": {"great_expectations.__version__": "0.0.0"}, "expectations": []}) self.assertEqual(str(w[0].message), "WARNING: This configuration object was built using version 0.0.0 of great_expectations, but is currently being valided by version %s." % ge.__version__) diff --git a/tests/test_data_asset_util.py b/tests/test_data_asset_util.py index a3f23d498bc7..6f769720a047 100644 --- a/tests/test_data_asset_util.py +++ b/tests/test_data_asset_util.py @@ -22,7 +22,7 @@ def test_recursively_convert_to_json_serializable(self): D.expect_column_kl_divergence_to_be_less_than("x", part, .6) # Dumping this JSON object verifies that everything is serializable - json.dumps(D.get_expectations_config(), indent=2) + json.dumps(D.get_expectation_suite(), indent=2) x = { 'w': [ diff --git a/tests/test_data_contexts/test_data_contexts.py b/tests/test_data_contexts/test_data_contexts.py deleted file mode 100644 index b4794ec0c399..000000000000 --- a/tests/test_data_contexts/test_data_contexts.py +++ /dev/null @@ -1,83 +0,0 @@ -import pytest - -import sqlalchemy as sa -import pandas as pd - -from great_expectations import get_data_context -from great_expectations.dataset import PandasDataset, SqlAlchemyDataset, SparkDFDataset - - -@pytest.fixture(scope="module") -def test_db_connection_string(tmpdir_factory): - df1 = pd.DataFrame( - {'col_1': [1, 2, 3, 4, 5], 'col_2': ['a', 'b', 'c', 'd', 'e']}) - df2 = pd.DataFrame( - {'col_1': [0, 1, 2, 3, 4], 'col_2': ['b', 'c', 'd', 'e', 'f']}) - - path = tmpdir_factory.mktemp("db_context").join("test.db") - engine = sa.create_engine('sqlite:///' + str(path)) - df1.to_sql('table_1', con=engine, index=True) - df2.to_sql('table_2', con=engine, index=True, schema='main') - - # Return a connection string to this newly-created db - return 'sqlite:///' + str(path) - - -@pytest.fixture(scope="module") -def test_folder_connection_path(tmpdir_factory): - df1 = pd.DataFrame( - {'col_1': [1, 2, 3, 4, 5], 'col_2': ['a', 'b', 'c', 'd', 'e']}) - path = tmpdir_factory.mktemp("csv_context") - df1.to_csv(path.join("test.csv")) - - return str(path) - - -@pytest.fixture(scope="module") -def test_parquet_folder_connection_path(tmpdir_factory): - df1 = pd.DataFrame( - {'col_1': [1, 2, 3, 4, 5], 'col_2': ['a', 'b', 'c', 'd', 'e']}) - path = tmpdir_factory.mktemp("parquet_context") - df1.to_parquet(path.join("test.parquet")) - - return str(path) - - -def test_invalid_data_context(): - # Test an unknown data context name - with pytest.raises(ValueError) as err: - get_data_context('what_a_ridiculous_name', None) - assert "Unknown data context." 
in str(err) - - -def test_sqlalchemy_data_context(test_db_connection_string): - context = get_data_context( - 'SqlAlchemy', test_db_connection_string, echo=False) - - assert context.list_datasets() == ['table_1', 'table_2'] - dataset1 = context.get_dataset('table_1') - dataset2 = context.get_dataset('table_2', schema='main') - assert isinstance(dataset1, SqlAlchemyDataset) - assert isinstance(dataset2, SqlAlchemyDataset) - - -def test_pandas_data_context(test_folder_connection_path): - context = get_data_context('PandasCSV', test_folder_connection_path) - - assert context.list_datasets() == ['test.csv'] - dataset = context.get_dataset('test.csv') - assert isinstance(dataset, PandasDataset) - -def test_spark_csv_data_context(test_folder_connection_path): - context = get_data_context('SparkCSV', test_folder_connection_path) - - assert context.list_datasets() == ['test.csv'] - dataset = context.get_dataset('test.csv') - assert isinstance(dataset, SparkDFDataset) - -def test_spark_parquet_data_context(test_parquet_folder_connection_path): - context = get_data_context('SparkParquet', test_parquet_folder_connection_path) - - assert context.list_datasets() == ['test.parquet'] - dataset = context.get_dataset('test.parquet') - assert isinstance(dataset, SparkDFDataset) diff --git a/tests/test_dataset.py b/tests/test_dataset.py index 90d28d546f39..c0895891f96d 100644 --- a/tests/test_dataset.py +++ b/tests/test_dataset.py @@ -1,14 +1,17 @@ import pytest -from .test_utils import CONTEXTS, get_dataset +from .conftest import CONTEXTS +from .test_utils import get_dataset +from collections import OrderedDict +from great_expectations.dataset import PandasDataset -data = { - "a": [2.0, 5.0], - "b": [5, 5], - "c": [0, 10], - "d": [0, None], -} +data = OrderedDict([ + ["a", [2.0, 5.0]], + ["b", [5, 5]], + ["c", [0, 10]], + ["d", [0, None]] +]) schemas = { "SparkDFDataset": { "a": "float", @@ -31,3 +34,15 @@ def test_caching(context): dataset = get_dataset(context, data, schemas=schemas.get(context)) with pytest.raises(AttributeError): dataset.get_column_max.cache_info() + +@pytest.mark.parametrize('context', CONTEXTS) +def test_head(context): + dataset = get_dataset(context, data, schemas=schemas.get(context), caching=True) + dataset.expect_column_mean_to_be_between("b", 5, 5) + head = dataset.head(1) + assert isinstance(head, PandasDataset) + assert len(head) == 1 + assert list(head.columns) == ["a", "b", "c", "d"] + assert head["a"][0] == 2.0 + suite = head.get_expectation_suite() + assert len(suite["expectations"]) == 5 diff --git a/tests/test_dataset_implementations/test_dataset_implementations.json b/tests/test_dataset_implementations/test_dataset_implementations.json index 258edb063ad8..da1b96e6dd78 100644 --- a/tests/test_dataset_implementations/test_dataset_implementations.json +++ b/tests/test_dataset_implementations/test_dataset_implementations.json @@ -10,11 +10,11 @@ }, "schemas": { "SparkDFDataset": { - "x": "float", - "y": "int", - "z": "int", - "n": "int", - "b": "bool" + "x": "FloatType", + "y": "IntegerType", + "z": "IntegerType", + "n": "IntegerType", + "b": "BooleanType" } } }, @@ -27,10 +27,10 @@ }, "schemas": { "SparkDFDataset": { - "a": "int", - "b": "string", - "c": "string", - "d": "string" + "a": "IntegerType", + "b": "StringType", + "c": "StringType", + "d": "StringType" } } }, @@ -42,9 +42,9 @@ }, "schemas": { "SparkDFDataset": { - "a": "int", - "b": "string", - "c": "int" + "a": "IntegerType", + "b": "StringType", + "c": "IntegerType" } } } @@ -151,30 +151,68 @@ "expected": 3 }, { 
+ "_note": "this test tests the value of this method *after serialization* see python test_get_column_value_counts for series test", "func": "get_column_value_counts", "dataset": "d2", "kwargs": { "column": "b" }, - "expected": { - "a": 1, - "b": 2, - "c": 3, - "d": 4 - } + "expected": [ + { + "value": "a", + "count": 1 + }, + { + "value": "b", + "count": 2 + }, + { + "value": "c", + "count": 3 + }, + { + "value": "d", + "count": 4 + } + ] }, { + "_note": "this test tests the value of this method *after serialization* see python test_get_column_value_counts for series test", "func": "get_column_value_counts", "dataset": "d2", "kwargs": { "column": "c" }, - "expected": { - "a": 1, - "b": 2, - "c": 3, - "d": 1 - } + "expected": [ + { + "value": "a", + "count": 1 + }, + { + "value": "b", + "count": 2 + }, + { + "value": "c", + "count": 3 + }, + { + "value": "d", + "count": 1 + } + ] + }, + { + "_note": "this test tests the value of this method *after serialization* see python test_get_column_value_counts for series test", + "func": "get_column_value_counts", + "dataset": "d1", + "kwargs": { + "column": "y" + }, + "expected": [{ + "value": 5, + "count": 2 + }] }, { "func": "get_column_max", @@ -373,6 +411,60 @@ "max_strictly": false }, "expected": 5 + }, + { + "func": "get_column_quantiles", + "dataset": "d1", + "kwargs": { + "column": "x", + "quantiles": [0.0,1.0] + }, + "expected": [2.0,5.0], + "suppress_test_for": ["sqlite"] + }, + { + "func": "get_column_quantiles", + "dataset": "d2", + "kwargs": { + "column": "a", + "quantiles": [0.0, 0.1111111111111111, 0.2222222222222222, 0.3333333333333333, 0.4444444444444444, 0.5555555555555556, 0.6666666666666666, 0.7777777777777777, 0.8888888888888888, 1.0] + }, + "expected": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], + "suppress_test_for": ["sqlite"] + }, + { + "func": "get_column_partition", + "dataset": "d2", + "kwargs": { + "column": "a" + }, + "expected": [ 1.0, 1.9, 2.8, 3.7, 4.6, 5.5, 6.4, 7.3, 8.2, 9.1, 10.0], + "suppress_test_for": ["sqlite"], + "tolerance": 0.0001, + "_note": "we use the default arguments here" + }, + { + "func": "get_column_partition", + "dataset": "d2", + "kwargs": { + "column": "a", + "bins": "uniform", + "n_bins": 9 + }, + "tolerance": 0.0001, + "expected": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0], + "suppress_test_for": ["sqlite"] + }, + { + "func": "get_column_partition", + "dataset": "d2", + "kwargs": { + "column": "a", + "bins": "ntile", + "n_bins": 2 + }, + "expected": [1.0, 5.0, 10.0], + "suppress_test_for": ["sqlite"] } ] } \ No newline at end of file diff --git a/tests/test_dataset_implementations/test_dataset_implementations.py b/tests/test_dataset_implementations/test_dataset_implementations.py index dcf29f73c216..d49e27ec7190 100644 --- a/tests/test_dataset_implementations/test_dataset_implementations.py +++ b/tests/test_dataset_implementations/test_dataset_implementations.py @@ -1,12 +1,17 @@ +import pytest + import json import os +import copy from collections import OrderedDict -from ..test_utils import CONTEXTS, get_dataset, candidate_getter_is_on_temporary_notimplemented_list - -import pytest import numpy as np +import pandas as pd + +from ..conftest import CONTEXTS +from ..test_utils import get_dataset, candidate_getter_is_on_temporary_notimplemented_list +from great_expectations.data_asset.util import recursively_convert_to_json_serializable dir_path = os.path.dirname(os.path.realpath(__file__)) test_config_path = os.path.join(dir_path, 'test_dataset_implementations.json') @@ -22,7 +27,7 @@ def 
test_implementations(context, test): should_skip = ( candidate_getter_is_on_temporary_notimplemented_list(context, test['func']) or - context in test.get('supress_test_for', []) + context in test.get('suppress_test_for', []) ) if should_skip: pytest.skip() @@ -31,13 +36,127 @@ def test_implementations(context, test): schema = test_datasets[test['dataset']]['schemas'].get(context) dataset = get_dataset(context, data, schemas=schema) func = getattr(dataset, test['func']) - result = func(**test.get('kwargs', {})) + run_kwargs = copy.deepcopy(test.get('kwargs', {})) + result = func(**run_kwargs) - # can't serialize pd.Series to json, so convert to dict and compare + # NOTE: we cannot serialize pd.Series to json directly, + # so we're going to test our preferred serialization. + # THIS TEST DOES NOT REPRESENT THE EXPECTED RETURN VALUE + # OF THE TESTED FUNCTION; THIS IS A JOINT TEST OF THE + # JSON SERIALIZATION AND THE TEST. + # See test_get_column_value_counts for a series-specific test if test['func'] == 'get_column_value_counts': - result = result.to_dict() + result = recursively_convert_to_json_serializable(result) if 'tolerance' in test: assert np.allclose(test['expected'], result, test['tolerance']) + elif isinstance(test['expected'], list): + if len(test['expected']) > 0 and isinstance(test['expected'][0], dict): + for item in test['expected']: + assert item in result + else: + assert test['expected'] == result else: assert test['expected'] == result + +@pytest.mark.parametrize('context', CONTEXTS) +def test_get_column_value_counts(context): + schemas = { + "SparkDFDataset": { + "x": "FloatType", + "y": "IntegerType", + "z": "IntegerType", + "n": "IntegerType", + "b": "BooleanType" + } + } + data = { + "x": [2.0, 5.0], + "y": [5, 5], + "z": [0, 10], + "n": [0, None], + "b": [True, False] + } + dataset = get_dataset(context, data, schemas=schemas) + + res = dataset.get_column_value_counts("x") + expected = pd.Series(data["x"]).value_counts() + expected.sort_index(inplace=True) + expected.index.name = "value" + expected.name = "count" + + assert res.equals(expected) + + res = dataset.get_column_value_counts("y") + expected = pd.Series(data["y"]).value_counts() + expected.sort_index(inplace=True) + expected.index.name = "value" + expected.name = "count" + assert res.equals(expected) + + res = dataset.get_column_value_counts("z") + expected = pd.Series(data["z"]).value_counts() + expected.sort_index(inplace=True) + expected.index.name = "value" + expected.name = "count" + assert res.equals(expected) + + res = dataset.get_column_value_counts("n") + expected = pd.Series(data["n"]).value_counts() + expected.sort_index(inplace=True) + expected.index.name = "value" + expected.name = "count" + assert res.equals(expected) + + + res = dataset.get_column_value_counts("b") + expected = pd.Series(data["b"]).value_counts() + expected.sort_index(inplace=True) + expected.index.name = "value" + expected.name = "count" + assert res.equals(expected) + + data = { + "a": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], + "b": ["a", "b", "b", "c", "c", "c", "d", "d", "d", "d"], + "c": ["a", "b", "b", "c", "c", "c", "d", None, None, None], + "d": ["a", "b", "c", "d", "e", "f", "g", None, None, None] + } + schemas = { + "SparkDFDataset": { + "a": "IntegerType", + "b": "StringType", + "c": "StringType", + "d": "StringType" + } + } + dataset = get_dataset(context, data, schemas=schemas) + + res = dataset.get_column_value_counts("a") + expected = pd.Series(data["a"]).value_counts() + expected.sort_index(inplace=True) + 
expected.index.name = "value" + expected.name = "count" + assert res.equals(expected) + + res = dataset.get_column_value_counts("b") + expected = pd.Series(data["b"]).value_counts() + expected.sort_index(inplace=True) + expected.index.name = "value" + expected.name = "count" + assert res.equals(expected) + + + res = dataset.get_column_value_counts("c") + expected = pd.Series(data["c"]).value_counts() + expected.sort_index(inplace=True) + expected.index.name = "value" + expected.name = "count" + assert res.equals(expected) + + res = dataset.get_column_value_counts("d") + expected = pd.Series(data["d"]).value_counts() + expected.sort_index(inplace=True) + expected.index.name = "value" + expected.name = "count" + assert res.equals(expected) diff --git a/tests/test_definitions/column_aggregate_expectations/expect_column_distinct_values_to_be_in_set.json b/tests/test_definitions/column_aggregate_expectations/expect_column_distinct_values_to_be_in_set.json new file mode 100644 index 000000000000..3128c489fc84 --- /dev/null +++ b/tests/test_definitions/column_aggregate_expectations/expect_column_distinct_values_to_be_in_set.json @@ -0,0 +1,227 @@ +{ + "expectation_type": "expect_column_distinct_values_to_be_in_set", + "datasets": [{ + "data": { + "dist1" : [1,2,3,4,5,6,7,8], + "dist2" : [1,2,3,4,5,null,null,null], + "dist3" : [2,2,2,2,5,6,7,8], + "dist4" : [1,1,1,1,2,null,null,null] + }, + "tests": [ + { + "title": "Basic positive test", + "exact_match_out": false, + "in": { + "column": "dist1", + "value_set": [1,2,3,4,5,6,7,8,9] + }, + "out": { + "success": true, + "observed_value": [1,2,3,4,5,6,7,8], + "value_counts": [ + {"value": 1, + "count": 1 + }, + {"value": 2, + "count": 1 + }, + {"value": 3, + "count": 1 + }, + {"value": 4, + "count": 1 + }, + {"value": 5, + "count": 1 + }, + {"value": 6, + "count": 1 + }, + {"value": 7, + "count": 1 + }, + {"value": 8, + "count": 1 + } + ] + } + }, + { + "title": "Vacuously true - universal set", + "exact_match_out": false, + "in": { + "column": "dist1", + "value_set": null + }, + "out": { + "success": true, + "observed_value": [1,2,3,4,5,6,7,8], + "value_counts": [ + {"value": 1, + "count": 1 + }, + {"value": 2, + "count": 1 + }, + {"value": 3, + "count": 1 + }, + {"value": 4, + "count": 1 + }, + {"value": 5, + "count": 1 + }, + {"value": 6, + "count": 1 + }, + {"value": 7, + "count": 1 + }, + {"value": 8, + "count": 1 + } + ] + } + }, + { + "title": "Positive test with null values in column", + "exact_match_out": false, + "in": { + "column": "dist2", + "value_set": [1,2,3,4,5] + }, + "out": { + "success": true, + "observed_value": [1,2,3,4,5], + "value_counts": [ + {"value": 1, + "count": 1 + }, + {"value": 2, + "count": 1 + }, + {"value": 3, + "count": 1 + }, + {"value": 4, + "count": 1 + }, + {"value": 5, + "count": 1 + } + ] + } + }, + { + "title": "Positive test with duplicate values in column", + "exact_match_out": false, + "in": { + "column": "dist3", + "value_set": [2,5,6,7,8,9] + }, + "out": { + "success": true, + "observed_value": [2,5,6,7,8], + "value_counts": [ + {"value": 2, + "count": 4 + }, + {"value": 5, + "count": 1 + }, + {"value": 6, + "count": 1 + }, + {"value": 7, + "count": 1 + }, + {"value": 8, + "count": 1 + } + ] + } + }, + { + "title": "Positive test; duplicate and null values", + "exact_match_out": false, + "in": { + "column": "dist4", + "value_set": [1,2,9] + }, + "out": { + "success": true, + "observed_value": [1, 2], + "value_counts": [ + {"value": 1, + "count": 4 + }, + {"value": 2, + "count": 1 + } + ] + } + }, + { + 
"title": "Basic negative test, no set intersection", + "exact_match_out": false, + "in": { + "column": "dist1", + "value_set": [9] + }, + "out": { + "success": false, + "observed_value": [1,2,3,4,5,6,7,8] + } + }, + { + "title": "Negative test, some set intersection and extra", + "exact_match_out": false, + "in": { + "column": "dist1", + "value_set": [2,3,4,5,6,7,8,9] + }, + "out": { + "success": false, + "observed_value": [1,2,3,4,5,6,7,8] + } + }, + { + "title": "Negative test with null values in column", + "exact_match_out": false, + "in": { + "column": "dist2", + "value_set": [1,2,3,4] + }, + "out": { + "success": false, + "observed_value": [1,2,3,4,5] + } + }, + { + "title": "Negative test with duplicate values in column", + "exact_match_out": false, + "in": { + "column": "dist3", + "value_set": [2,5,6,7] + }, + "out": { + "success": false, + "observed_value": [2,5,6,7,8] + } + }, + { + "title": "Negative test; duplicate and null values", + "exact_match_out": false, + "in": { + "column": "dist4", + "value_set": [1] + }, + "out": { + "success": false, + "observed_value": [1, 2] + } + } + ] + }] +} \ No newline at end of file diff --git a/tests/test_definitions/column_aggregate_expectations/expect_column_max_to_be_between.json b/tests/test_definitions/column_aggregate_expectations/expect_column_max_to_be_between.json index c3b00e1f0fb4..6e0fa33ea95f 100644 --- a/tests/test_definitions/column_aggregate_expectations/expect_column_max_to_be_between.json +++ b/tests/test_definitions/column_aggregate_expectations/expect_column_max_to_be_between.json @@ -13,24 +13,24 @@ }, "schemas": { "sqlite": { - "w" : "int", - "x" : "int", - "y" : "int", - "z" : "varchar", - "zz" : "datetime", - "zzz" : "varchar", - "a" : "int", - "b" : "int" + "w" : "INTEGER", + "x" : "INTEGER", + "y" : "INTEGER", + "z" : "VARCHAR", + "zz" : "DATETIME", + "zzz" : "VARCHAR", + "a" : "INTEGER", + "b" : "INTEGER" }, "postgresql": { - "w" : "int", - "x" : "int", - "y" : "int", - "z" : "text", - "zz" : "timestamp", - "zzz" : "text", - "a" : "int", - "b" : "int" + "w" : "INTEGER", + "x" : "INTEGER", + "y" : "INTEGER", + "z" : "TEXT", + "zz" : "TIMESTAMP", + "zzz" : "TEXT", + "a" : "INTEGER", + "b" : "INTEGER" } }, "tests" : [{ @@ -97,7 +97,7 @@ },{ "title": "Test on a series containing dates, with an output_strftime_format value", "exact_match_out" : false, - "suppress_test_for": ["SQLAlchemy"], + "suppress_test_for": ["sqlalchemy"], "in": { "column": "zz", "min_value": "2/1/2016", diff --git a/tests/test_definitions/column_aggregate_expectations/expect_column_mean_to_be_between.json b/tests/test_definitions/column_aggregate_expectations/expect_column_mean_to_be_between.json index af6516702430..6987d4020bdf 100644 --- a/tests/test_definitions/column_aggregate_expectations/expect_column_mean_to_be_between.json +++ b/tests/test_definitions/column_aggregate_expectations/expect_column_mean_to_be_between.json @@ -10,11 +10,11 @@ }, "schemas": { "spark": { - "x": "float", - "y": "int", - "z": "int", - "n": "int", - "b": "bool" + "x": "FloatType", + "y": "IntegerType", + "z": "IntegerType", + "n": "IntegerType", + "b": "BooleanType" } }, "tests": [{ @@ -109,6 +109,20 @@ "success": true, "observed_value": 0.0 } + }, + { + "title": "Vacuously true: missing min and max", + "exact_match_out": false, + "in": { + "column": "x", + "min_value": null, + "max_value": null, + "catch_exceptions": true + }, + "out": { + "success": true + }, + "_note": "vacuously true" } ] }, @@ -121,7 +135,7 @@ "tests": [{ "title": "type mismatch: null 
observed_value", "exact_match_out": false, - "suppress_test_for": ["SQLAlchemy"], + "suppress_test_for": ["sqlalchemy"], "in": { "column": "s", "min_value": 0, @@ -144,7 +158,7 @@ "success": true, "observed_value": 0.5 }, - "suppress_test_for": ["postgresql", "Spark"], + "suppress_test_for": ["postgresql", "spark"], "_comment": "postgresql will not allow coercing the boolean type to an average" }, { @@ -159,7 +173,7 @@ "success": true, "observed_value": 0.5 }, - "suppress_test_for": ["postgresql", "Spark"], + "suppress_test_for": ["postgresql", "spark"], "_comment": "postgresql will not allow coercing the boolean type to an average" }, { @@ -185,27 +199,14 @@ "out": { "traceback_substring": "ValueError" } - }, - { - "title": "TypeError: missing min and max", - "exact_match_out": false, - "in": { - "column": "x", - "min_value": null, - "max_value": null, - "catch_exceptions": true - }, - "out": { - "traceback_substring": "ValueError" - } - }]}, + }]}, { "data": { "empty_column": [] }, "schemas": { "spark": { - "empty_column": "int" + "empty_column": "IntegerType" } }, "tests": [{ diff --git a/tests/test_definitions/column_aggregate_expectations/expect_column_median_to_be_between.json b/tests/test_definitions/column_aggregate_expectations/expect_column_median_to_be_between.json index 2bd59e377c7c..001ae179095c 100644 --- a/tests/test_definitions/column_aggregate_expectations/expect_column_median_to_be_between.json +++ b/tests/test_definitions/column_aggregate_expectations/expect_column_median_to_be_between.json @@ -24,9 +24,9 @@ }, "schemas": { "spark": { - "a": "int", - "b": "int", - "c": "int" + "a": "IntegerType", + "b": "IntegerType", + "c": "IntegerType" } }, "tests": [ @@ -162,7 +162,7 @@ }, "schemas": { "spark": { - "empty_column": "int" + "empty_column": "IntegerType" } }, "tests": [{ diff --git a/tests/test_definitions/column_aggregate_expectations/expect_column_min_to_be_between.json b/tests/test_definitions/column_aggregate_expectations/expect_column_min_to_be_between.json index eb4bbf8a0d42..bbaa4d0b5dec 100644 --- a/tests/test_definitions/column_aggregate_expectations/expect_column_min_to_be_between.json +++ b/tests/test_definitions/column_aggregate_expectations/expect_column_min_to_be_between.json @@ -12,22 +12,22 @@ }, "schemas": { "sqlite": { - "w": "int", - "x": "int", - "y": "int", - "z": "varchar", - "zz": "datetime", - "a": "int", - "b": "int" + "w": "INTEGER", + "x": "INTEGER", + "y": "INTEGER", + "z": "VARCHAR", + "zz": "DATETIME", + "a": "INTEGER", + "b": "INTEGER" }, "postgresql": { - "w": "int", - "x": "int", - "y": "int", - "z": "text", - "zz": "timestamp", - "a": "int", - "b": "int" + "w": "INTEGER", + "x": "INTEGER", + "y": "INTEGER", + "z": "TEXT", + "zz": "TIMESTAMP", + "a": "INTEGER", + "b": "INTEGER" } }, "tests" : [{ @@ -106,7 +106,7 @@ },{ "title": "Test on a series containing dates", "exact_match_out" : false, - "suppress_test_for": ["SQLAlchemy"], + "suppress_test_for": ["sqlalchemy"], "in": { "column": "zz", "min_value": "2/1/2016", @@ -120,7 +120,7 @@ },{ "title": "Test on a series containing dates, with an output_strftime_format value", "exact_match_out" : false, - "suppress_test_for": ["SQLAlchemy"], + "suppress_test_for": ["sqlalchemy"], "in": { "column": "zz", "min_value": "2/1/2016", diff --git a/tests/test_definitions/column_aggregate_expectations/expect_column_quantile_values_to_be_between.json b/tests/test_definitions/column_aggregate_expectations/expect_column_quantile_values_to_be_between.json new file mode 100644 index 
000000000000..1fae84a3ac20 --- /dev/null +++ b/tests/test_definitions/column_aggregate_expectations/expect_column_quantile_values_to_be_between.json @@ -0,0 +1,100 @@ +{ + "expectation_type" : "expect_column_quantile_values_to_be_between", + "datasets" : [{ + "_notes": "continuous", + "data" : { + "norm_0_1": [0.7225866251125405, -0.5951819764073379, -0.2679313226299394, -0.22503289285616823, 0.1432092195399402, 1.1874676802669433, 1.2766412196640815, 0.15197071140718296, -0.08787273509474242, -0.14524643717509128, -1.236408169492396, -0.1595432263317598, 1.0856768114741797, 0.5082788229519655, 0.26419244684748955, -0.2532308428977167, -0.6362679196021943, -3.134120304969242, -1.8990888524318292, 0.15701781863102648, -0.775788419966582, -0.7400872167978756, -0.10578357492485335, 0.30287010067847436, -1.2127058770179304, -0.6750567678010801, 0.3341434318919877, 1.8336516507046157, 1.105410842250908, -0.7711783703442725, -0.20834347267477862, -0.06315849766945486, 0.003016997583954831, -1.0500016329150343, -0.9168020284223636, 0.306128397266698, 1.0980602112281863, -0.10465519493772572, 0.4557797534454941, -0.2524452955086468, -1.6176089110359837, 0.46251282530754667, 0.45751208998354903, 0.4222844954971609, 0.9651098606162691, -0.1364401431697167, -0.4988616288584964, -0.29549238375582904, 0.6950204582392359, 0.2975369992016046, -1.0159498719807218, 1.3704532401348395, 1.1210419577766673, 1.2051869452003332, 0.10749349867353084, -3.1876892257116562, 1.316240976262548, -1.3777452919511493, -1.0666211985935259, 1.605446695828751, -0.39682821266996865, -0.2828059717857655, 1.30488698803017, -2.116606225467923, -0.2026680301462151, -0.05504008273574069, -0.028520163428411835, 0.4424105678123449, -0.3427628263418371, 0.23805293411919937, -0.7515414823259695, -0.1272505897548366, 1.803348436304099, -2.0178252709022124, 0.4860300090112474, 1.2304054166426217, 0.7228668982068365, 1.7400607500575112, 0.3480274098246697, -0.3887978895385282, -1.6511926233909175, 0.14517929503564567, -1.1599010576123796, -0.016133552438119002, 0.47157644883706273, 0.27657785075518254, 1.4464286976282463, -1.2605489185634533, -1.2548765025615338, 0.0755319579826929, 1.0476733637516833, -0.7038690219524807, -0.9580696842862921, -0.18135657098008018, -0.18163993379314564, 0.4092798531146971, -2.049808182546896, -1.2447062617916826, -1.6681140306283337, 1.0709944517933483, -0.7059385234342846, -0.8033587669003331, -1.8152275905903312, 0.11729996097670137, 2.2994900038012376, -0.1291192451734159, -0.6731565869164164, -0.06690994571366346, -0.40330072968473235, -0.23927186025094221, 2.7756216937096676, 0.06441299443146056, -0.5095247173507204, -0.5228853558871007, 0.806629654091097, -2.110096084114651, -0.1233374136509439, -1.021178519845751, 0.058906278340351045, -0.26316852406211017, -1.2990807244026237, -0.1937986598084067, 0.3909222793445317, 0.578027315076297, -0.11837271520846208, -1.134297652720464, 0.496915417153268, -0.5315184110418045, 0.5284176849952198, -1.6810338988102331, 0.41220454054009154, 1.0554031136792, -1.4222775023918832, -1.1664353586956209, 0.018952180522661358, -0.04620616876577671, -0.8446292647938418, -0.6889432180332509, -0.16012081070647954, 0.5680940644754282, -1.9792941921407943, 0.35441842206114726, 0.12433268557499534, 0.25366905921805377, 0.6262297786892028, 1.327981424671081, 1.774834324890265, -0.9725604763128438, 0.42824027889428, 0.19725541390327114, 1.4640606982992412, 1.6484993842838995, 0.009848260786412894, -2.318740403198263, -0.4125245127403577, -0.15500831770388285, 
1.010740123094443, 0.7509498708766653, -0.021415407776108144, 0.6466776546788641, -1.421096837521404, 0.5632248951325018, -1.230539161899903, -0.26766333435961503, -1.7208241092827994, -1.068122926814994, -1.6339248620455546, 0.07225436117508208, -1.2018233250224348, -0.07213000691963527, -1.0080992229563746, -1.151378048476321, -0.2660104149809121, 1.6307779136408695, 0.8394822016824073, -0.23362802143120032, -0.36799502320054384, 0.35359852278856263, 0.5830948999779656, -0.730683771776052, 1.4715728371820667, -1.0668090648998136, -1.025762014881618, 0.21056106958224155, -0.5141254207774576, -0.1592942838690149, 0.7688711617969363, -2.464535892598544, -0.33306989349452987, 0.9457207224940593, 0.36108072442574435, -0.6490066877470516, -0.8714147266896871, 0.6567118414749348, -0.18543305444915045, 0.11156511615955596, 0.7299392157186994, -0.9902398239693843, -1.3231344439063761, -1.1402773433114928, 0.3696183719476138, -1.0512718152423168, -0.6093518314203102, 0.0010622538704462257, -0.17676306948277776, -0.6291120128576891, 1.6390197341434742, -0.8105788162716191, -2.0105672384392204, -0.7909143328024505, -0.10510684692203587, -0.013384480496840259, 0.37683659744804815, -0.15123337965442354, 1.8427651248902048, 1.0371006855495906, 0.29198928612503655, -1.7455852392709181, 1.0854545339796853, 1.8156620972829793, 1.2399563224061596, 1.1196530775769857, 0.4349954478175989, 0.11093680938321168, 0.9945934589378227, -0.5779739742428905, 1.0398502505219054, -0.09401160691650227, 0.22793239636661505, -1.8664992140331715, -0.16104499274010126, -0.8497511318264537, -0.005035074822415585, -1.7956896952184151, 1.8304783101189757, 0.19094408763231646, 1.3353023874309002, 0.5889134606052353, -0.48487660139277866, 0.4817014755127622, 1.5981632863770983, 2.1416849775567943, -0.5524061711669017, 0.3364804821524787, -0.8609687548167294, 0.24548635047971906, -0.1281468603588133, -0.03871410517044196, -0.2678174852638268, 0.41800607312114096, -0.2503930647517959, 0.8432391494945226, -0.5684563173706987, -0.6737077809046504, 2.0559579098493606, -0.29098826888414253, -0.08572747304559661, -0.301857666880195, -0.3446199959065524, 0.7391340848217359, -0.3087136212446006, 0.5245553707204758, -3.063281336805349, 0.47471623010413705, 0.3733427291759615, -0.26216851429591426, -0.5433523111756248, 0.3305385199964823, -1.4866150542941634, -0.4699911958560942, 0.7312367186673805, -0.22346998944216903, -0.4102860865811592, -0.3003478250288424, -0.3436168605845268, 0.9456524589400904, -0.03710285453384255, 0.10330609878001526, 0.6919858329179392, 0.8673477607085118, 0.380742577915601, 0.5785785515837437, -0.011421905830097267, 0.587187810965595, -1.172536467775141, -0.532086162097372, -0.34440413367820183, -1.404900386188497, -0.1916375229779241, 1.6910999461291834, -0.6070351182769795, -0.8371447893868493, 0.8853944070432224, 1.4062946075925473, -0.4575973141608374, 1.1458755768004445, 0.2619874618238163, 1.7105876844856704, -1.3938976454537522, -0.11403217166441704, -1.0354305240085717, -0.4285770475062154, 0.10326635421187867, 0.6911853442971228, 0.6293835213179542, -0.819693698713199, -0.7378190403744175, -1.495947672573938, -1.2406693914431872, -1.0486341638186725, -1.3715759883075953, 3.585407817418151, -0.8007079372574223, -1.527336776754733, -0.4716571043072485, -0.6967311271405545, 1.0003347462169225, -0.30569565002022697, 0.3646134876772732, 0.49083033603832493, 0.07754580794955847, -0.13467337850920083, 0.02134473458605164, 0.5025183900540823, -0.940929087894874, 1.441600637127558, -0.0857298131221344, 
-0.575175243519591, 0.42622029657630595, -0.3239674701415489, 0.22648849821602596, -0.6636465305318631, 0.30415000329164754, -0.6170241274574016, 0.07578674772163065, 0.2952841441615124, 0.8120317689468056, -0.46861353019671337, 0.04718559572470416, -0.3105660017232523, -0.28898463203535724, 0.9575298065734561, -0.1977556031830993, 0.009658232624257272, 1.1432743259603295, -1.8989396918936858, 0.20787070770386357, 1.4256750543782999, -0.03838329973778874, -0.9051229357470373, -1.2002277085489457, 2.405569956130733, 1.895817948326675, -0.8260858325924574, 0.5759061866255807, 2.7022875569683342, 1.0591327405967745, 0.21449833798124354, 0.19970388388081273, 0.018242139911433558, -0.630960146999549, -2.389646042147776, 0.5424304992480339, -1.2159551561948718, -1.6851632640204128, -0.4812221268109694, 0.6217652794219579, -0.380139431677482, -0.2643524783321051, 0.5106648694993016, -0.895602157034141, -0.20559568725141816, 1.5449271875734911, 1.544075783565114, 0.17877619857826843, 1.9729717339967108, 0.8302033109816261, -0.39118561199170965, -0.4428357598297098, -0.02550407946753186, -1.0202977138210447, 2.6604654314300835, 1.9163029269361842, 0.34697436596877657, -0.8078124769022497, -1.3876596649099957, 0.44707250163663864, -0.6752837232272447, -0.851291770954755, 0.7599767868730256, 0.8134109401706875, -1.6766750539980289, -0.06051832829232975, -0.4652931327216134, -0.9249124398287735, 1.9022739762222731, 1.7632300613807597, 1.675335012283785, 0.47529854476887495, -0.7892463423254658, 0.3910120652706098, 0.5812432547936405, 0.2693084649672777, -0.08138564925779349, 0.9150619269526952, -0.8637356349272142, -0.14137853834901817, -0.20192754829896423, 0.04718228147088756, -0.9743600144318, -0.9936290943927825, 0.3544612180477054, 0.6839546770735121, 1.5089070357620178, 1.301167565172228, -1.5396145667672985, 0.42854366341485456, -1.5876582617301032, -0.0316985879141714, 0.3144220016570915, -0.05054766725644431, 0.2934139006870167, 0.11396170275994542, -0.6472140129693643, 1.6556030742445431, 1.0319410208453506, 0.3292217603989991, -0.058758121958605435, -0.19917171648476298, -0.5192866115874029, 0.1997510689920335, -1.3675686656161756, -1.7761517497832053, -0.11260276070167097, 0.9717892642758689, 0.0840815981843948, -0.40211265381258554, 0.27384496844034517, -1.0403875081272367, 1.2884781173493884, -1.8066239592554476, 1.1136979156298865, -0.06223155785690416, 1.3930381289015936, 0.4586305673655182, 1.3159249757827194, -0.5369892835955705, 0.17827408233621184, 0.22693934439969682, 0.8216240002114816, -1.0422409752281838, 0.3329686606709231, -1.5128804353968217, 1.0323052869815534, 1.1640486934424354, 1.6450118078345612, -0.6717687395070293, -0.08135119186406627, 1.2746921873544188, -0.8255794145095643, 0.7123504776564864, 0.6953336934741682, 2.191382322698439, 1.4155790749261592, 2.4681081786912866, -2.2904357033803815, -0.8375155191566624, 1.1040106662196736, 0.7084133268872015, -3.401968681942055, 0.23237090512844757, 1.1199436238058174, 0.6333916486592628, -0.6012340913121055, -0.3693951838866523, -1.7742670566875682, -0.36431378282545124, -0.4042586409194551, -0.04648644034604476, 1.5138191613743486, -0.2053670782251071, 1.8679122383251414, 0.8355881018692999, -0.5369705129279005, -0.7909355080370954, 2.1080036780007987, 0.019537331188020687, -1.4672982688640615, -1.486842866467901, -1.1036839537574874, 1.0800858540685894, -0.2313974176207594, 0.47763272078271807, -1.9196070490691473, -0.8193535127855751, -0.6853651905832031, -0.18272370464882973, -0.33413577684633056, 2.2261342671906106, 
1.6853726343573683, 0.8563421109235769, 1.0468799885096596, 0.12189082561416206, -1.3596466927672854, -0.7607432068282968, 0.7061728288620306, -0.4384478018639071, 0.8620104661898899, 1.04258758121448, -1.1464159128515612, 0.9617945424413628, 0.04987102831355013, -0.8472878887606543, 0.32986774370339184, 1.278319839581162, -0.4040926804592034, -0.6691567800662129, 0.9415431940597389, 0.3974846022291844, -0.8425204662387112, -1.506166868030291, -0.04248497940038203, 0.26434168799067986, -1.5698380163561454, -0.6651727917714935, 1.2400220571204048, -0.1251830593977037, 0.6156254221302833, 0.43585628657139575, -1.6014619037611209, 1.9152323656075512, -0.8847911114213622, 1.359854519784993, -0.5554989575409871, 0.25064804193232354, 0.7976616257678464, 0.37834567410982123, -0.6300374359617635, -1.0613465068052854, -0.866474302027355, 1.2458556977164312, 0.577814049080149, 2.069400463823993, 0.9068690176961165, -0.5031387968484738, -0.3640749863516844, -1.041502465417534, 0.6732994659644133, -0.006355018868252906, -0.3650517541386253, 1.0975063446734974, -2.203726812834859, 1.060685913143899, -0.4618706570892267, 0.06475263817517128, -0.19326357638969882, -0.01812119454736379, 0.1337618009668529, 1.1838276997792907, 0.4273677345455913, -0.4912341608307858, 0.2349993979417651, 0.9566260826411601, -0.7948243131958422, -0.6168334352331588, 0.3369425926447926, 0.8547756445246633, 0.2666330662219728, 2.431868771129661, 1.0089732701876513, -0.1162341515974066, -1.1746306816795218, -0.08227639025627424, 0.794676385688044, 0.15005011094018297, -0.8763821573601055, -1.0811684990769739, 0.6311588092267179, 0.026124278982220386, 0.8306502001533514, 1.0856487813261877, -0.018702855899823106, -0.07338137135247896, -0.8435746484744243, -0.18091216366556986, 0.2295807891528797, -1.0689295774443397, -1.5621175533013612, 1.3314045672598216, 0.6211561903553582, 1.0479302317100871, -1.1509436982013124, 0.447985084931758, 0.19917261474342404, 0.3582887259341301, 0.9953552868908098, 0.8948165434511316, 0.4949033431999123, -0.23004847985703908, 0.6411581535557106, -1.1589671573242186, -0.13691519182560624, -0.8849560872785238, 0.6629182075027006, 2.2608150731789696, 2.2823614453180294, -1.2291376923498247, -0.9267975556981378, 0.2597417839242135, -0.7667310491821938, 0.10503294084132372, 2.960320355577672, -1.0645098483081497, -1.2888339889815872, -0.6564570556444346, 0.4742489396354781, 0.8879606773334898, -0.6477585196839569, -0.7309497810668936, 1.7025953934976548, 0.1789174966941155, -0.4839093362740933, -0.8917713440107442, 1.4521776747175792, -0.1676974219641624, -0.500672037099228, -0.2947747621553442, 0.929636971325952, -0.7614935150071248, 1.6886298813725842, -0.8136217834373227, 1.2030997228178093, 1.382267485738376, 2.594387458306705, -0.7703668776292266, -0.7642584795112598, 1.3356598324609947, -0.5745269784148925, -2.212092904499444, -1.727975556661197, -0.18543087256023608, -0.10167435635752538, 1.3480966068787303, 0.0142803272337873, -0.480077631815393, -0.32270216749876185, -1.7884435311074431, -0.5695640948971382, -0.22859087912027687, -0.08783386938029487, -0.18151955278624396, 0.2031493507095467, 0.06444304447669409, -0.4339138073294572, 0.236563959074551, -0.2937958719187449, 0.1611232843821199, -0.6574871644742827, 1.3141902865107886, 0.6093649138398077, 0.056674985715912514, -1.828714441504608, -0.46768482587669535, 0.6489735384886999, 0.5035677725398181, -0.887590772676158, -0.3222316759913631, -0.35172770495027483, -0.4329205472963193, -0.8449916868048998, 0.38282765028957993, 
1.3171924061732359, 0.2956667124648384, 0.5390909497681301, -0.7591989862253667, -1.1520792974885883, -0.39344757869384944, 0.6192677330177175, -0.05578834574542242, 0.593015990282657, 0.9374465229256678, 0.647772562443425, 1.1071167572595217, -1.3015016617832518, 1.267300472456379, -0.5807673178649629, 0.9343468385348384, -0.28554893036513673, 0.4487573993840033, 0.6749018890520516, -1.20482985206765, 0.17291806504654686, -0.4124576407610529, -0.9203236505429044, -0.7461342369802754, -0.19694162321688435, 0.46556512963300906, 0.5198366004764268, -1.7222561645076129, -0.7078891617994071, -1.1653209054214695, 1.5560964971092122, 0.3335520152642012, 0.008390825910327906, 0.11336719644324977, 0.3158913817073965, 0.4704483453862008, -0.5700583482495889, -1.276634964816531, -1.7880560933777756, -0.26514994709973827, 0.6194447367446946, -0.654762456435761, 1.0621929196158544, 0.4454719444987052, -0.9323145612076791, 1.3197357985874438, -0.8792938558447049, -0.2470423905508279, 0.5128954444799875, -0.09202044992462606, -1.3082892596744382, -0.34428948138804927, 0.012422196356164879, 1.4626152292162142, 0.34678216997159833, 0.409462409138861, 0.32838364873801185, 1.8776849459782967, 1.6816627852133539, -0.24894138693568296, 0.7150105850753732, 0.22929306929129853, -0.21434910504054566, 1.3339497173912471, -1.2497042452057836, -0.04487255356399775, -0.6486304639082145, -0.8048044333264733, -1.8090170501469942, 1.481689285694336, -1.4772553200884717, -0.36792462539303805, -1.103508260812736, -0.2135236993720317, 0.40889179796540165, 1.993585196733386, 0.43879096427562897, -0.44512875171982147, -1.1780830020629518, -1.666001035275436, -0.2977294957665528, 1.7299614542270356, 0.9882265798853356, 2.2412430815464597, 0.5801434875813244, -0.739190619909163, -1.2663490594895201, 0.5735521649879137, 1.2105709455012765, 1.9112159951415644, -2.259218931706201, -0.563310876529377, -2.4119185903750493, 0.9662624485722368, -0.22788851242764951, 0.9198283887420099, 0.7855927065251492, -0.7459868094792474, 0.10543289218409971, 0.6401750224618271, -0.0077375118689326705, -0.11647036625911977, -0.4722391874001602, -0.2718425102733572, -0.8796746964457087, 0.6112903638894259, 0.5347851929096421, -0.4749419210717794, 1.0633720764557604, -0.2590556665572949, 2.590182301241823, 1.4524061372706638, -0.8503733047335056, 0.5609357391481067, -1.5661825434426477, 0.8019667474525984, 1.2716795425969496, 0.20011166646917924, -0.7105405282282679, -0.5593129072748189, -1.2401371010520867, -0.7002520937780202, -2.236596391787529, -1.8130090502823886, -0.23990633860801777, 1.7428780878151378, 1.4661206538178901, -0.8678567353744017, 0.2957423562639015, 0.13935419069962593, 1.399598845123674, 0.059729544605779575, -0.9607778026198247, 0.18474907798482051, 1.0117193651915666, -0.9173540069396245, 0.8934765521365161, -0.665655291396948, -0.32955768273493324, 0.3062873812209283, 0.177342106982554, 0.3595522704599547, -1.5964209653110262, 0.6705899137346863, -1.1034642863469553, -1.0029562484065524, 0.10622956543479244, 0.4261871936541378, 0.7777501694354336, -0.806235923997437, -0.8272801398172428, -1.2783440745845536, 0.5982979227669168, -0.28214494859284556, 1.101560367699546, -0.14008021262664466, -0.38717961692054237, 0.9962925044431369, -0.7391490127960976, -0.06294945881724459, 0.7283671247384875, -0.8458895297768138, 0.22808829204347086, 0.43685668023014523, 0.9204095286935638, -0.028241645704951284, 0.15951784765135396, 0.8068984900818966, -0.34387965576978663, 0.573828962760762, -0.13374515460012618, -0.5552788325377814, 
0.5644705833909952, -0.7500532220469983, 0.33436674493862256, -0.8595435026628129, -0.38943898244735853, 0.6401502590131951, -1.2968645995363652, 0.5861622311675501, 0.2311759458689689, 0.10962292708600496, -0.26025023584932205, -0.5398478003611565, -1.0514168636922954, 1.2689172189127857, 1.7029909647408918, -0.02325431623491577, -0.3064675950620902, -1.5816446841009473, 0.6874254059433739, 0.7755967316475798, 1.4119333324396597, 0.14198739135512406, 0.2927714469848192, -0.7239793888399496, 0.3506448783535265, -0.7568480706640158, -1.2158508387501554, 0.22197589131086445, -0.5621415304506887, -1.2381112050191665, -1.917208333033256, -0.3321665793941188, -0.5916951886991071, -1.244826507645294, -0.29767661008214463, 0.8590635852032509, -1.8579290298421591, -1.0470546224962876, -2.540080936704841, 0.5458326769958273, 0.042222128206941614, 0.6080450228346708, 0.6542717901662132, -1.7292955132690793, -0.4793123354077725, 0.7341767020417185, -1.3322222208234826, -0.5076389542432337, 0.684399163420284, 0.3948487980667425, -1.7919279627150193, 1.582925890933478, 0.8341846456063038, 0.11776890377042544, 1.7471239793853526, 1.2269451783893597, 0.4235463733287474, 1.5908284320029056, -1.635191535538596, 0.04419903330064594, -1.264385360373252, 0.5370192519783876, 1.2368603501240771, -0.9241079150337286, -0.3428051342915208, 0.0882286441353256, -2.210824604513402, -1.9000343283757128, 0.4633735273417207, -0.32534396967175094, 0.026187836765356437, 0.18253601230609245, 0.8519745761039671, -0.028225375482784816, -0.5114197447067229, -1.2428743809444227, 0.2879711400745508, 1.2857130031108321, 0.5296743558975853, -0.8440551904275335, -1.3776032491368861, 1.8164028526343798, -1.1422045767986222, -1.8675179752970443, 0.6969635320800454, 0.9444010906414336, -1.28197913481747, -0.06259132322304235, -0.4518754825442558, 0.9183188639099813, -0.2916931407869574, -1.1464007469977915, -0.4475136941593681, 0.44385573868752803, 2.1606711638680762, -1.4813603018181851, -0.5647618024870872, -1.474746204557383, -2.9067748098220485, 0.06132111635940877, -0.09663310829361334, -1.087053744976143, -1.774855117659402, 0.8130120568830074, -0.5179279676199186, -0.32549430825787784, -1.1995838271705979, 0.8587480835176114, -0.02095126282663596, 0.6677898019388228, -1.1891003375304232, -2.1125937754631305, -0.047765192715672734, 0.09812525010300294, -1.034992359189106, 1.0213451864081846, 1.0788796513160641, -1.444469239557739, 0.28341828947950637, -2.4556013891966737, 1.7126080715698266, -0.5943068899412715, 1.0897594994215383, -0.16345461884651272, 0.7027032523865234, 2.2851158088542562, 0.5038100496225458, -0.16724173993999966, -0.6747457076421414, 0.42254684460738184, 1.277203836895222, -0.34438446183574595, 0.38956738377878264, -0.26884968654334923, -0.02148772950361766, 0.02044885235644607, -1.3873669828232345, 0.19995968746809226, -1.5826859815811556, -0.20385119370067947, 0.5724329589281247, -1.330307658319185, 0.7756101314358208, -0.4989071461473931, 0.5388161769427321, -0.9811085284266614, 2.335331094403556, -0.5588657325211347, -1.2850853695283377, 0.40092993245913744, -1.9675685522110529, 0.9378938542456674, -0.18645815013912917, -0.6828273180353106, -1.840122530632185, -1.2581798109361761, 0.2867275394896832], + "norm_1_1": [0.9439289476815376, 3.146273559377128, 0.3470436819018874, 0.23500522318660688, 1.5027625578175983, 0.12358380842880556, 0.9671450868036405, 1.3118520859491714, 0.9113520437498089, -0.04440973143224536, 1.9877759491898486, 1.9996204803906474, 1.799720353118032, 1.886565148506723, 
0.9382468450761778, 2.491734181515102, -0.053054263465645235, 1.469798649726717, 1.777215368280996, 0.07577664342650925, 1.0045542277192225, 2.7412452131185274, -0.5595060202216326, -0.16867353279754704, 1.3420283448178654, 1.3020050118053694, 3.6811150098500702, 0.7359089372703684, 0.6367721397985389, 1.3831527045233183, 1.0696792111618083, -0.07548532392589946, 1.2373767221817324, 0.6905871672839029, 2.2320305105193055, -0.4085586007880089, 1.1742955084304787, 0.37825903040676423, -0.7596956886965056, 1.367021420077372, 0.9873948483421149, -0.13640744994769483, 1.7793561751070432, 1.0988991394293735, 1.9362675575433446, 0.7085550756037359, 2.925063696905222, 0.8806692291072468, -0.06210876846415636, 1.2592710604342234, 1.1280389902743833, 0.6263462157700377, 2.3173101370701263, 1.3539506924905655, -0.30386146266351677, -0.5503979529701677, 1.0307187125176738, 2.1573750449158933, 1.5383264265809764, 0.9368254885391881, 2.126841068074052, 0.29818526645195, 1.160834919769933, 2.2942018955183805, 0.4548303184715433, 0.8488585159126694, 0.37586075519339346, -0.2040239935920174, 1.114423444719651, 1.0570343498416548, 0.2102942523815694, 0.8493605454014514, 1.709763971201042, 1.640323100930742, 2.0062139004978956, 1.1305513184490183, 1.8196260011799201, 0.7920641792537695, 0.823743602729911, -0.011498625586275013, 0.7893809065866656, 0.5896482937405487, 1.0321897868627052, 1.8466459784932274, 0.23697829566842477, 0.43256836066958515, 1.6143924276326302, 0.0232684950662001, 1.7423652670490082, 0.7274438009497037, -0.032251521065615574, -0.12221028450785076, 1.7405261809821284, 1.3364351998902086, -0.41361744643764187, 0.6913891854595424, 0.14917804501223164, 1.2586902514856342, 1.4031173924589315, 1.1972825119315498, 1.1870103540181003, 0.7727228302408662, 1.0056952259487772, 0.2648149369694939, 1.647736918240041, 0.7651229528484964, 1.025323423320409, 2.6048955526644377, -0.03362100672021118, 1.5243826554270674, 0.7746580241941559, 0.5130663401849572, 1.6083260769618282, 1.20797919805295, 0.6395487329822588, 2.1646863283008093, 0.506953119718831, 0.5309071129869507, 0.9009013450642555, 2.590851468579764, 2.334299765282492, 0.6620467937822381, 1.254212932005745, 0.737281747778163, 1.3477877074295308, 2.5942389252062212, 1.3198707181717433, 2.6745813251704753, 0.5929545614450875, 1.7597528399257998, 1.5151571014603125, -0.36775009072734766, -0.03174005132227853, 1.3919889023907663, 0.742106577595993, 3.3653509358794413, 0.3275580518339237, 1.727530675309728, 0.9098224646710376, 1.4925096155347481, 2.1377956041962185, 0.861941701699401, 0.7897278469199567, 0.9659592160540946, 2.3375418372868593, 0.546934531823859, -0.08252337960485412, 1.8347416924412419, 0.5778717081430498, 1.397178022933769, 0.6615868659351432, 0.3452305768275019, 0.8856592779260553, 2.444895468105796, 1.4308934576311123, -0.33690799576168295, 2.1809503609926195, 2.3824475979116078, 1.0180190814958385, 0.25620975695807, 1.4530546725056812, 1.4308579256373428, 1.9878188261645287, 2.1761184954224326, 1.0816199274453537, 0.8032935668782257, 1.5166647616719828, 0.9082894355274158, 1.4295731866088321, 0.5511589553727413, 1.1412352715021117, 1.182676209240281, 0.4730671117688814, 1.9292905851991766, 2.491328699078411, 0.7690227402296599, -0.27023323842841473, 0.7728894804843585, -0.4834275220858071, -0.41643498491586106, -2.122439387161643, 0.9941752931172687, 1.7746827912734586, -2.587011880629264, 1.6457828914966384, 2.4860091804367466, 1.5349985688203196, 1.675164577843621, 1.5152364147530948, 2.115526071988099, 0.8923598199946604, 
2.2311772355248114, 0.907562684765672, 0.014439419723853097, 0.15720667157367596, 1.2020485966017733, 3.67136637159918, 4.128134164522259, 0.8839646831379141, -0.21072028643411445, 0.23852081308432949, 0.3081875105647792, 0.9275295124102192, 1.790617191613886, 0.802257853497117, -0.018368955144921983, 1.9768569063734427, 0.35423941343725207, 3.3297304253605287, 1.3633919324029762, 1.0963043937825416, 1.2500289871469439, 1.465715517196066, -0.886407290306187, 0.20672489280963224, 0.8062090751150639, 0.7951780938249524, 1.2071672664160726, 0.150494356434832, 2.8588637283704847, 2.258274058104767, 1.0568991910402896, 1.3076307922983377, 1.0316638928935984, 2.019791294651073, 3.16756955440869, 2.325009363691107, 1.8373756150905354, 0.3324118022875894, 0.9805493160721037, -0.43402229476934306, 1.6370833038606905, -0.47516218639156715, 1.4914335913103254, 0.5513597239016521, 0.8787109081684878, 1.1733491617506075, 1.3533045050995003, 0.7313834914156329, 0.7043810911858954, -0.5005843871741573, 0.5150588939186933, 0.3038721658247664, 0.7425653089732847, 1.4481471434369673, 0.2994617403858175, 0.31803589894434114, -0.9782040195759514, 2.60497725796373, 0.5979079295183307, 1.5405561187116241, 0.05554600545837263, 2.9644563050220003, 1.752872728906579, 1.068928656433089, 1.4849705370851225, 1.1438810160544752, -0.38956714597562536, -0.10544114515485159, -1.0427762813503514, -0.20515562160984935, -0.8175451868484904, 2.7943917846530253, -0.1370290862943817, 0.8836487521687477, 1.2950106344112828, 2.6151582642423676, 2.6285060010186028, 2.0957244460699207, -0.9801283322109986, 1.5223219534626153, 0.8330851618025956, -0.16389301984851534, 0.6250402901576504, 2.044850457294202, 1.1009470898949627, -0.6113103424299924, 1.6228914751067367, 1.4745626368457836, 1.926027938978669, 2.6352917921501273, 2.984553904170974, 1.9287728585745287, 1.6427009950921814, 0.6617914331073831, 0.19576043280767308, 0.5296596372650431, 1.8708411458340244, 0.7346145124119067, 0.196240475554057, 1.1750263096115463, -0.9925973610573318, 1.2742862721013015, 0.38401542473937256, -0.026791173285904968, 0.6365795377253844, 0.8322860782931374, -0.5336375854964073, 1.735716140975975, 1.5147273154011112, 1.6118180856958826, 0.11077716191228493, 0.7432628973661435, 1.1820531628750746, 1.3999883361753365, 1.9710372575987662, -1.1174334182309722, 1.6631324997974481, 1.7196290499958842, 0.014494293505393996, 1.7884320111762118, 1.3902644823502788, 0.45393098036620194, -1.0290859271112498, 0.4116237587848671, 1.8797857212712796, 1.044533513858495, 1.8532670981501669, 2.092091442823382, 0.6134777913193156, 0.7875775797353411, 1.0988573074566785, 1.1696109174337816, 0.8973181768908489, 1.5342296259311934, 0.8514207096887789, 1.0891894421391655, 0.7512072129955728, 3.566748363223852, 0.41364190312295535, -0.8147213090730079, -0.07311361108269776, 0.14372071325380897, 1.1969503284906922, 2.15109270169691, 0.7116516815173162, -0.18047984598070865, 1.2715755272697002, 2.765143685176671, 1.5590363444369433, 0.7048847170862482, -0.06666270509296512, 0.9754228221535766, 1.5884172853434833, 0.034237386513273904, 1.8205014231076158, 0.21793793235506076, 1.5280279268099146, -1.6696504848310219, 1.1998909064046384, -2.6116416088836107, 1.8575694173770998, 1.1564602537867101, -0.041714779508320365, 0.7073119114382507, 0.1906791534683464, -0.4209736403010287, 2.1035976226554642, 1.1156866582588036, 1.2197545495147504, 0.28263329158114747, 1.0667781570461576, -0.5322745864805178, 0.40889549709539275, 1.0036750959873584, 0.5229340210998343, 
2.2958695033088814, 0.5490670572001375, 1.3459055906918467, 0.5392032636631754, 0.6527511371920389, 0.34413651888472074, 0.40468028975524317, 1.9287896037293017, 1.5856328351750286, 0.9902029529116818, -0.6564141748330123, 1.2777490085079088, 1.3267458442703348, 0.9483736808097628, -0.19510207428369708, -0.1257790541142203, -0.18181081702483826, 1.4874823496173897, 1.7005898524075924, 0.6333895462301791, 1.461220913330714, 1.826631086716752, 0.34430843328269145, 1.1815644385539301, -2.135099591787039, 0.9174255794012871, 1.2572927036657906, 1.2438312587880156, 1.5023740770935279, 1.2582409905346517, 1.3386884965705503, 1.309251487235587, 0.3625667377579803, 1.6873933159613181, 2.291911109947078, -0.26702970657839575, 1.023357067569446, 1.8818842353997274, 0.6678970149195208, 0.5041772247443688, 1.0170886902033467, 0.797739875734258, 0.4737481471721765, 0.7115264688561244, 0.3723794763553355, 1.986270540812178, 3.9111955381463526, 1.50957421456837, 0.3798017453283048, 1.511884873157785, 2.674377887543595, -0.1493443880552492, 0.29607506581356946, 0.18827742335873932, 0.47384656682532234, 3.1244965614119273, 1.767378834079111, 2.3684334403183622, 1.5037444344067201, 2.0414345601859294, 0.48212164653721024, -0.21909118164650598, 2.323222169720534, 3.784313240513907, 1.5190646190883976, 1.5678647963362313, 1.3726343597628488, 1.3962845500210206, 0.15010566719413265, 1.2898788018694924, 0.7252733436717675, 1.6283508013566235, 0.601144209254887, 1.127896501279436, 1.2997828294973954, 2.561526850417782, 1.0074194479678356, -0.7207743072854409, 1.6299037878027893, 2.4860086507453025, 2.626483125037315, 0.33541820383309584, 0.30337548357091537, 1.905946288259434, 1.1952593546577415, 1.1284512188908145, 0.9455804288630874, -0.31923887232703496, 3.1769566187746876, 0.6809609416497249, 0.2357045262654025, -0.6460710927408215, 0.34915729759882175, 1.130366562046605, 2.2093095615081593, 0.7540165477804306, 0.603649039589716, -0.045979881805138234, 1.8196484317300616, 1.9010288037715144, 0.798573669358241, 2.2698422604706177, 1.0116490924036916, 0.17257842789939215, 2.622491994614012, 0.21193726448020433, 1.1661019305895055, -0.4641227794466354, 0.7271167066139251, 2.0806994671885213, 0.9150039885089255, 2.7490097923976675, 1.1959090328662527, 0.7115866190917428, 2.0936283085610574, -0.7692791068553213, 1.9244699272044112, 2.5259222388344478, -0.20905405839753466, -0.31526108127443586, 1.8877642674386825, 1.9547125049413463, 0.08979902184136501, 1.377112215599062, 1.9725334922193614, 2.4393748156251425, 0.29033648623853037, -0.6870792818547982, 0.6605321967479459, 0.688455770902062, -0.22855014344588764, 2.5147373720083426, 0.9349169208755647, 0.18367445154300543, 0.18856168893623693, 0.828116067430801, 1.3342275382925777, 1.2049095228250444, 2.0972500301662027, 1.257856755677393, 0.8078133018684028, 2.142308840079754, 1.2482284960092027, 2.246885570674106, 0.09758076340723953, 1.6076644006609229, 0.34736614976229596, 2.9021764923187403, 1.0970217580169936, 1.647688998881664, 0.014515276960043177, 0.6438079957399008, 1.108628966629711, 0.1410982014773452, -0.1969768641572256, -0.5496814593656323, 0.7296185962737656, 0.21671356212570714, 1.0105168747555233, 0.22115390061645146, 0.8456257867685341, 0.005808252596950836, 1.806703708188445, 1.2085741563584702, 1.727705667366251, -0.11488723965161562, 0.3517268043991435, 1.7015786863941074, 1.2862047406116934, 2.387839736595148, 0.7155016553586135, 1.5716557064168093, 1.3345776990190608, 2.685028372310539, 1.0574053644044779, 1.5697610108623063, 
1.0230757584969044, 2.021229525603941, -0.5158604681879382, 2.0976533084160387, 0.9665362512068408, 1.6301453709295308, -0.3648180749463823, 1.2021547491538926, 0.37058055831029535, 1.688650155012013, 2.4686807366907266, 1.1048759769348848, 1.6114822022368533, 0.08602473308390357, 1.6259685980317826, -0.48874897321035005, 0.7461058676296025, 1.4032719157852425, 1.3197724566538338, 0.5547448414052866, 1.2487303784109454, 2.116843253985413, 1.1508927073086002, -0.6295682790325003, 0.8523077410870681, 1.8017160706086557, 0.16810074465997338, 2.090463413209452, 1.833141998866295, 2.292690082978666, 1.5752533199270333, -0.24795509395358883, -0.8300558867657644, -0.48371857720948896, 1.2653641669222004, 0.02906014336165752, 1.2474439009857687, 1.3393327899579819, -0.7079088031219478, -0.05266365567311149, 2.025171759935059, 2.1656699620939266, 0.2104448764500919, 0.10943003045607524, 1.5460400624535748, 0.2910166619530504, 1.6849945930683687, 0.9935083639136372, 0.1294438778793171, 0.9065153351782655, 1.151702277860739, 0.43040465108544224, -0.4213070391657807, -0.48007503044153466, 0.8875122716748995, 0.38504345175101284, 1.0076824626545358, 0.2397911015246369, 1.1151082649713246, 2.304597789364188, 0.46023220794989106, -0.43974911356060975, 1.262572020594177, 1.5033227027723506, 0.23414143406304022, 1.3022570914100906, 0.8625677397200946, 0.27400466448547145, 0.16932647765185427, 1.082045771123449, 0.3437969338440605, 0.6803755668090572, 1.1659774090891943, 0.3109513596053801, 1.3091799473307848, 2.6520073955133405, 0.2259607625932898, 1.30981634956103, 1.4076626702936115, 2.7203166052996766, 1.623085868257788, 1.1892299967579258, 0.7210020732688899, 1.6155373535161444, -0.8395206267974036, 0.41754083228884475, 1.0133048365222876, 0.846169183744136, 0.9330204168799799, 2.6367911335283956, 0.03191052250133575, 2.933676718014959, 1.61264368050961, 0.8129354850727784, 2.3734632433172593, 0.508525676410383, 4.052455914072482, 1.5395813652069834, 0.47488244596104134, 0.8849084058218769, 1.9011429239004611, 0.0705669158690998, 0.09199800862052931, -0.15539788523987075, 0.09962979720626497, 1.7833999837904817, 2.4037789078212883, -0.1894830387098927, 1.0597429794665072, 0.8272109562097675, 0.9791988398706389, 1.0001597205854535, 2.1396503765801684, 0.22124571535216464, 1.4261960381079728, 0.7509179467564698, 2.1013663504102973, -0.025835660273390237, -0.5642401128668728, 2.3893539112896485, 0.9530997723658488, 0.8699103446382038, 2.111649370974012, 2.2246744364464153, 0.8844008136008745, 2.794501695126276, -0.5144534181695086, 0.39885772964041655, 1.9972891806666926, 0.9379521334980345, 1.54979513809331, -0.9379526819899526, 1.2654419496446425, -0.3040328346686123, 1.1555817464359828, 0.2889627291511876, 0.3458295622633967, 0.5576970781961343, 1.9261092279371925, 0.7178001479578799, -0.858770800203619, 1.5293405598989036, 0.2752683235937007, 1.1979385602695278, 1.3019482183901345, 2.0322720881171277, 1.544447104581539, 1.8082816041459522, 2.5671647770400496, 0.2871221089526387, 0.8441250347096808, -0.04275140741818162, 0.477624975658388, -0.6172755513485251, 1.314647872755486, 1.075071579233564, -1.3335706073136158, -0.014024016775799897, 1.6152452568698337, 1.7994862496872877, 0.24967618279928305, -0.7874711159132706, 1.5557284568831777, 1.427755912238499, 1.7928915987187435, 1.2222423902183472, 1.9933798062514603, 2.5562653898964696, -0.4056771817850462, 1.563088525949157, 0.7685658093611922, 0.1479959518762407, -0.2394277940212437, 0.1630251373124869, 0.6677355011296486, 1.3315401156260855, 
2.4164685005643953, 1.1796996834088016, 0.22178163324147726, 0.16278906488080036, 0.08353212726729298, 1.5920552157361825, 3.1946374565150424, 0.5036595701050207, 2.750672604054599, 0.7313956027524133, 1.8779724444597101, 0.5744295270625093, 0.6232319832814676, 0.9983565923982267, 3.123945982401918, 0.4968707729145434, 1.707646929326181, 3.396555213244133, 1.8074418436463937, 0.3786467250980573, 1.9057620463543237, -0.06784202574026921, 2.5272870561308984, 1.5040023482353844, 0.06906531252355541, 2.8748225295643888, 1.2099395627762703, 1.4275692469936208, 0.8174396380468221, 0.08928214704308846, 2.857056320713837, 1.479130346343359, 1.5076990170829743, 2.8205285246754297, -0.13422772835185115, 1.0384307607779835, -0.4891455835273326, 1.3658759546354762, 0.7317387042919388, 2.5765750740128945, 0.5701231257282764, 3.7123545310472714, -0.19662933488729117, 0.5311953188797843, 0.8964245243742438, 1.8621005639090766, 1.1605521970391404, 1.1323321623434122, 3.085803518113754, 1.4492266526063322, 1.7849701010040997, 0.16888025999554357, 1.4964467864159234, 2.044641768457976, 0.6032675054185457, 1.0496474870273729, 2.2186167896925397, -0.1920737599855129, 1.1576544199409122, 1.9728890397669385, 0.6521093966834546, 1.203676186386656, 1.8755530779691534, 0.10653842578723027, 0.6138997456232376, -0.004717024361769351, 2.092393498656741, 1.1022663527680836, 1.8309516833250283, 1.6193500804365277, 0.7615134401129977, 1.5394513969460784, 1.0351617277952492, 2.770377730382555, 2.6132259073806607, 0.32653334442174764, 1.139395696959093, 0.34670204320595144, 0.1803113702709871, 2.208474278629656, 1.7312088921213826, -0.23697779346163372, 0.2685655520301713, 1.2570384865393267, 0.6495582287637965, -0.10903963405628803, 1.4684563732329439, 1.265850326771272, -0.10496175689370868, 0.6096307061952748, 1.2708657158472203, 0.9727546406670835, 0.958790026848646, 1.7999965723180185, 1.0945104124355196, 0.6682419228177416, 2.6224377891348283, 0.9138638782463208, 1.0580423251997932, 0.5491918375902436, 0.2414534073141532, 0.10728161713314477, 1.6933183753577725, 1.4474615936915165, 0.58194749114223, 1.1505157139822952, 0.6315922458323653, 0.531977605977884, 2.3823729497668182, 0.17193094426509015, 0.6596902450970048, -0.9434227963637014, 0.641486484780706, 1.23294787828323, 2.7897112167865057, 0.7807574007377986, 1.879471024665305, 1.8390116134436396, -0.5961967644569612, 0.817489188759139, 0.9172328929720406, 1.0834538117908896, 1.9046196927907082, 1.2948139888306938, 0.326097845082705, 0.5103203832259658, 0.8276351380210181, 0.5287492181300184, 1.2089658132164072, 0.27404057896347445, 0.10198446964481833, 2.364693493758857, 1.2612384182256644, 1.8324077036313196, 0.9733328068104737, 1.462277600111109, 0.9815637422332565, 0.9108864932670093, 1.1834526661341054, 1.5352215283372659, 0.5540815553236613, 1.0861002796631918, 0.13718062653447172, 1.9617025240295556, 1.318321848981391, 1.237513456886778, -0.9049082655979523, -1.397566456059911, 2.5235886019002063, 2.0888643925087638, 1.759632887728867, 0.4434159183189643, 1.2796255812414041, 0.4626938065222319, 1.8736048048760354, -0.08945988848203656, 0.8579642555131665, -0.19352126292672045, 2.674071166195404, -0.5625760598468019, 1.14933176671998, 2.8465289405354213, -0.4550340755659106, 3.390662897319301, 1.7152314069647032, 1.4864943077847514, 3.215888262375325, 1.7400268782293788, 0.8145819011321423, 1.0821742177554605, 1.9349838626539824, 1.6511534310811307, 1.3775327200579475, 0.786966888348791, 1.9898226647248498, 0.19289432021996922, 0.2595748192826055, 
1.6896086001124653, 1.5406021056649224, 0.4094561414163007, -0.7271811065069895, 3.9437967897118815, 2.0824537999163124, 1.5481326712303636, 1.4012762691419716, 4.084768179647356, 1.7494140485274996, 1.094752205662429, -0.5475681675420381, 2.006935505771426, 1.4406874354798131, 0.22644029952274125, 0.31913965563968294, 0.8734974659048046, 0.33258848128713403, 2.221595123469765, 1.6655578798147537, 1.1997999630295264, 0.9769532085540074, 1.3265209304756513, -0.41639806037964977, 1.5753299424021547, 1.117883963331852, 2.07173437675882, -0.3162631159488294, 1.0212570074728546, 0.6318419350330682, 1.945104438418119, 0.48233819394601385, 0.6828228991347791, 0.8504771540443077, 1.9928066431783034, 0.26830349314705604, 2.2160818691455306, 0.08077106158976233, 0.8528392181671496, 1.6718508878125142, -0.5680534895597382, -0.4786563992036996, -0.34578308105394506, 0.0011981759465338193, 1.6466728056257076, 2.1141813915518357, 1.6208978374805012, 1.200916530385835, 1.8376478354111114, 0.35200088550407727, 0.2339088245769677, 0.03195907525894559, 1.4623074667277995, 1.9395222855287244, 4.11139915204436, 3.0581550199106737, 0.9288657073341118, -0.9640606358695012, 1.3458228793795088, 1.947831156016957, 1.871456839336148, 0.8785747179018829, 1.5683269998678426, 1.401162670646302, 0.8041003259207168, 2.571940881517926, 1.6970261631402968, 1.2191100051048722, 2.5244653128833536, 1.1135675091004409, -0.3644894023036609, 0.26904341195983217, 1.1341278119314595, 2.1222397206073267, 0.8916980516501747, 1.6443137953462412, 0.6287990000398109, -0.36608861293950645, 0.4915158402519816, 0.39746381481782744, 2.7404052832856056, 2.4655219268262485, 0.9859480285889792, -0.018144649984449535, 2.1020308951537428, 1.1531457610927214, 1.1383158125832764, 2.1513400124321933, -0.20836032722560627, 0.5951010547000509, 2.053873984558643, 1.3130473583120879, 1.1797978816207997, 1.3829342072289306, 1.0510393522131602, 1.2218846807322914, 2.399365279769126, 1.68735788120717, -1.3283579154932852, 0.9225145970181905, -0.8629425419130379, 0.26052383053502604, 1.0382814191197032, -0.3587399222943657, -0.1816327804133553], + "norm_10_1": [9.040841267802735, 11.106799708991128, 9.663724250201376, 11.037829347437087, 10.453701807733168, 9.81819325760899, 10.402232950179965, 7.6209955191672805, 7.931075493499095, 10.181029664959565, 9.72527296765712, 10.565358861268006, 9.485694063727857, 8.296311764473822, 10.596810540377472, 8.34984883596158, 11.370769344347934, 10.828432548029152, 10.060552665982748, 9.501379711793113, 8.858973911699067, 10.432154595850877, 10.528295936984472, 10.374675294487536, 10.315701821789522, 9.361626955059483, 12.04204606342353, 10.090320649718375, 7.2591141991692645, 9.803184123286712, 9.114918857072373, 7.9360168816756484, 9.107612007477753, 10.925670692193044, 8.79907239052001, 8.926545596128651, 10.276709213302308, 9.32573717045887, 11.065526168484991, 9.089388952636105, 9.980586319597185, 8.470722705158492, 10.026355385648255, 8.813221981822263, 9.534533748411253, 9.853556816388183, 9.653955537320385, 8.058301044144724, 10.088520802063835, 9.416877108345943, 10.580857340431407, 9.76235022121817, 10.038049981661361, 10.454223704043159, 9.842358424775055, 11.25186810905153, 9.13615429523526, 9.46213131604413, 9.845659909137684, 9.187735913484952, 9.768016293176514, 9.777047723115519, 10.72841393574283, 10.487569589860344, 8.630418039123745, 9.526765006402815, 10.226235089014514, 8.396706165266703, 11.82435178919067, 10.858255334801372, 10.102342356524124, 11.828402337847134, 11.832677454740637, 
10.352589993050978, 9.24154762763915, 9.308297116784038, 11.183556254163525, 9.750201439225636, 8.597757855326217, 9.640408093451954, 9.097513453864847, 9.700525156323769, 9.62207080881433, 8.440223841256604, 9.938330588064739, 9.179799142850076, 9.050357663111521, 9.065405604048241, 10.168417121414272, 10.348565757131812, 8.645136731133274, 10.501444013086793, 9.348277708559989, 9.753456137894073, 9.462528343306014, 9.559266753968616, 9.934871563637163, 12.053370747330588, 10.798586409641374, 10.965092180807627, 9.699118165191361, 8.224081350529353, 9.102277776571718, 9.916371547117267, 9.29098431302276, 8.430960753850089, 11.68327287533711, 10.165267787438632, 10.156122804185268, 10.904311176570529, 11.227093470821643, 10.950387163519093, 9.87094549642286, 10.00000340471139, 11.040758192588166, 10.34534629939728, 8.664514454321372, 10.130436829771597, 10.691323223154017, 10.462092708716861, 8.80960709185836, 10.882386165602743, 8.171600699499654, 12.223883906463207, 11.112032864719275, 9.374770507378699, 12.233304197728286, 10.64879511593974, 10.075591150240367, 9.274204795200959, 8.591007383081767, 10.25560242596336, 10.54458768033494, 9.121338458537421, 9.66278154842079, 9.556201176101844, 9.502225338668355, 9.148981449669872, 8.862782732891917, 9.944832964799936, 10.70601549769572, 9.938023145503793, 11.365908866565256, 9.4138177738982, 10.911539339166223, 9.681913796486272, 11.22893728037362, 9.6370235121311, 9.004177281353757, 9.985480171035649, 10.702541955450075, 10.363532809780295, 9.992701363602352, 9.816794719669828, 9.283056138867037, 9.97675160736348, 10.07685078091665, 9.327041194264197, 9.38772031650513, 10.717241254607949, 9.695518074677139, 11.743848674915498, 9.379765437465057, 12.267049116248199, 9.71495991810392, 10.000804524056601, 10.535501836482425, 11.759155667575573, 10.79104448603048, 10.792624031877756, 10.869483000785268, 8.649800392290585, 10.937358753570678, 11.220144875494743, 8.49762401261176, 9.338765129869438, 9.921765585721351, 9.018945302710693, 8.526343409007016, 10.80920413081, 9.240385432531408, 9.718337794427672, 8.953564744675072, 11.32705466035367, 10.09772740736297, 10.31136155074962, 8.814341165709624, 9.998981727233003, 9.490681378102947, 8.76203976165035, 9.760112414965125, 11.657816328510556, 7.963863908290998, 8.613304335601288, 8.058164095374181, 9.918532930238227, 9.023708154360069, 8.521684110237969, 10.743755381138948, 8.887759550687214, 8.913125422768616, 9.210081809996327, 9.370574803676769, 9.465351085809132, 8.880777790625075, 11.062781991352594, 11.56865207952785, 10.761105196199352, 11.109198606837491, 9.141447612573597, 8.765408843026197, 10.4527206785699, 9.418807373607205, 11.31172455341977, 11.402409604679649, 10.6712388504367, 10.94445298810075, 8.692410953345753, 10.8660507892721, 11.147152623668681, 10.445789235709325, 10.514452457277885, 9.514811856404874, 11.851850157163899, 8.512301036891904, 9.622377035108213, 8.922593726874851, 12.124974763922957, 8.996481256095567, 11.207366968312035, 9.69442219497466, 7.908015520261029, 9.911499020780967, 11.387716394635625, 9.750557420904547, 11.82618502310435, 11.687072554523944, 9.505687048977547, 9.520601458097335, 10.682739211367526, 10.868210501934175, 11.575858169687478, 8.777870203463653, 9.172329981151965, 8.770160015223242, 10.641265393522257, 12.139721638795276, 11.931864814146426, 9.240429484274959, 9.39299910039342, 8.960105904056222, 12.759669469418988, 10.771913297982202, 11.524985170369934, 10.556258371866917, 10.612198434462178, 10.581113788378698, 10.74322733653057, 
8.672419183167225, 10.105088603335213, 9.480102278247541, 10.055060732534711, 10.20970563844142, 11.494803701722484, 10.201627742653471, 10.527839731761297, 10.467141517288006, 8.621646032542728, 8.266602306868412, 9.683820634969289, 9.538995787004586, 9.343986508249825, 9.090361962151873, 10.485819149711638, 9.86008849027794, 9.768437284588824, 9.937147885274415, 10.67183839616365, 11.095070728105357, 10.279718469230222, 9.755447228708563, 8.66482656045748, 8.511617723659727, 10.655169177181772, 10.518130494258521, 9.487118174619296, 10.070014764554173, 10.794417612388235, 10.141417616950426, 10.88763382156933, 11.621854607092247, 9.49060069971835, 9.11299470989561, 10.183018965916007, 10.774135520409665, 9.203753251119965, 9.853853050571603, 10.082588288337725, 10.825719959317063, 9.447827077686076, 10.289013454463358, 10.334538687870577, 10.653591063921528, 8.78336659506064, 9.84911470014561, 9.8730684390772, 10.77370552633287, 11.756326868695648, 8.94814978551935, 8.663582346399958, 9.894704763918906, 9.940185085092073, 10.570572052003707, 10.016905946420303, 8.280082288029002, 10.296472198050632, 9.235260399307254, 9.55834124131889, 10.543296468391693, 12.02364989293243, 9.724851439722281, 12.594820030451944, 11.008207991876292, 8.906857350176534, 9.637775428332034, 7.785500070082081, 8.984821463487588, 10.527768173628367, 9.519502977989147, 10.275845048376652, 10.238880243655094, 12.009246313905011, 9.256084089719536, 10.113739220870405, 9.916314418387218, 10.4614211912953, 10.050316151950948, 8.936596815574937, 9.58654048959267, 9.710739924738128, 8.593372392837768, 9.872353473491145, 10.412229566702795, 9.11537460171611, 10.611056378343491, 10.529236279465557, 9.720448734320154, 9.649035207680171, 9.298388837527225, 9.33181631026098, 8.738003944413874, 10.195505888236244, 8.803100543940458, 9.960388515796396, 9.997213187655078, 10.179520500051193, 8.921357003621576, 9.754464628551814, 10.160384291305139, 9.025457818155886, 9.730479675581256, 8.553161384213503, 9.111862568346748, 10.685888645649046, 9.70871214245464, 11.318621988664038, 10.57399094257553, 9.136608548607592, 10.489433900325247, 8.679959194281624, 10.29336848904415, 11.109376095890992, 10.913330339233001, 8.751445912303339, 9.186015850494767, 11.643179022565166, 9.838974608547348, 11.257220675118983, 9.33527176261113, 11.54829026075894, 10.479507270270702, 9.296131392119385, 9.752274951178366, 10.015535636445698, 10.474973056281952, 10.009189896294508, 8.35804674935555, 10.670439417116217, 10.493877497089892, 8.789400319824225, 11.510778890820518, 11.04616275296382, 9.694539342068957, 12.366451224790298, 11.06230566051001, 10.348644426174978, 10.199084321894599, 9.921587730191911, 8.709546320610231, 9.105296995317431, 10.899310052623566, 11.928736546667107, 9.490181263055787, 11.936991974771937, 9.885122866979824, 9.882224967806383, 10.196952695590427, 9.290118586040261, 10.941666554199484, 11.211249048346218, 10.68244899686549, 11.756849804277872, 9.501664077436228, 11.62305004962866, 9.02712650980373, 9.451024343092836, 8.395201058340948, 10.96175022996361, 9.734781751896422, 11.407078056518891, 10.199866986175257, 11.758689270920728, 11.157416662711057, 9.444185124163424, 9.993231677749685, 8.738121636514952, 9.091827871335186, 9.610340312309503, 10.120577110133098, 10.906995163228679, 9.815080372443054, 8.694536985791206, 10.442513944530015, 11.357014584727253, 11.025066412873317, 11.17237212539217, 9.382527225552503, 9.2056085577839, 9.736768381228739, 10.074025603148325, 9.891723128365053, 11.524164747716528, 
10.226456889695616, 10.703111935372295, 10.38870174582839, 7.530740534083932, 10.652657926832322, 11.701654742833513, 9.495338630200052, 10.123614073123365, 11.033477751060236, 10.378854418966794, 11.368969924598243, 10.787937168975386, 10.451590107922206, 11.419076327539889, 9.415861384123827, 8.79815774472782, 10.581355180506094, 11.32451064078806, 11.508991301755412, 8.62181623066325, 9.828717035827921, 11.411528527046935, 8.841861575014812, 9.931505869422846, 10.413929231376594, 10.626019174729501, 10.948472117749388, 9.010617981202213, 10.659301293492955, 11.065175426755571, 7.967492800478297, 9.271178369999635, 8.022301246290878, 10.989195782159449, 8.857613851013445, 10.750691490751326, 10.238851746517595, 11.050315264847132, 10.285900695278702, 8.754947850343198, 9.50089055542205, 12.037483481004026, 9.27802649081427, 9.152002245319819, 9.323222567521594, 10.588916907113274, 10.104144589932885, 10.64283956005658, 8.889049243584997, 11.02278761915662, 7.569104319329256, 8.359910760831559, 9.828167300442168, 9.94123502199896, 9.230112759885559, 11.67224197628526, 10.131885618677108, 9.432637205671583, 9.815173154691152, 9.195721936851381, 9.454764722402919, 12.39028070853414, 11.077109184383136, 11.11018274263151, 8.253573613752238, 9.249224389601387, 10.661017897574766, 10.636424837770212, 11.098766960743015, 10.849052705101403, 10.312653805479268, 12.512271418658774, 11.515927866402878, 10.805646900536944, 9.615772020628349, 8.135249052972544, 11.223128141846045, 10.99720813452923, 10.74657586919516, 10.12948400807092, 9.375281731983863, 9.24839885126103, 8.372084289437371, 8.93659570161449, 10.610084747373708, 10.551196629616179, 8.408813929020738, 10.52449654468341, 9.565964176571338, 10.19516240210464, 8.978311891652176, 10.338322015779333, 11.750898927951285, 10.011317551924526, 8.695758705558697, 9.339763109322439, 10.001058719022168, 9.942625377081312, 11.304785028918761, 9.987200879300868, 11.679002903338883, 8.998885072444704, 8.819784995357129, 10.568184132145879, 11.659125713002139, 10.762336914469303, 9.32320260988687, 9.725203078628804, 9.964074050288879, 8.227648949871316, 8.379280528982454, 9.043592955512342, 10.673959852055166, 11.04178950765339, 9.687649733091025, 9.41133684548883, 10.142143695235507, 9.885787857653991, 10.77036436129569, 10.199859160532041, 11.906921437391155, 9.092145565795091, 9.599859593366261, 10.006151121719428, 10.251489353814739, 11.665149283823462, 10.36216926512228, 9.233047308296833, 9.766323940883256, 9.934689941809895, 8.920980497968037, 10.700167568037603, 11.972059548230819, 11.807975183063977, 9.780838822594752, 9.082996827746783, 8.66457095646258, 8.235356999576195, 10.07717712804332, 9.41169774916071, 9.73444471105237, 8.510626864453481, 11.0483629493406, 10.075424506768783, 9.71618118767198, 9.855777633540233, 9.321269926037239, 10.31344862832386, 10.44712322592716, 9.483081235060903, 8.840619905612026, 12.312860683751307, 11.387457811126849, 10.91254103125626, 10.26528523590661, 9.36564044255442, 9.682702879610524, 10.633009560329048, 8.953172548723792, 10.31081304976398, 11.046888690584755, 11.0782772826774, 9.915055567590793, 9.280605798698062, 10.982861748765211, 9.784771773336795, 9.03417297376102, 9.692965865033948, 10.040871553746763, 10.385450302697754, 10.651371663079813, 7.759842177973297, 10.175662480896953, 9.01272511209246, 9.71991735582558, 11.55424986087538, 10.89862781328883, 10.230811407645387, 9.527776922664856, 9.655463047580092, 10.244845872335564, 7.667143349060906, 9.560890691888703, 8.173301714205879, 
10.491515746056718, 9.567419698329202, 10.105088711303909, 9.469230551349817, 11.179940410352575, 9.006175708698736, 10.468487876224614, 10.268229766953757, 9.462441000556659, 10.390072406239947, 10.808028783574198, 9.749151257518486, 8.100460700253574, 9.625207212566387, 10.88671697913831, 11.134035258727796, 8.379469292119362, 10.24713724308472, 11.246175642752887, 9.413082181899219, 10.130347960228239, 10.307506705752774, 11.880659432945912, 10.806518362233641, 10.713009745795166, 11.64271936471556, 9.356463282602544, 10.018025051295322, 10.457373698061431, 9.544828164290337, 9.351138661160832, 12.72723771325665, 7.978379090659482, 10.086368596519465, 8.580341410491465, 9.998579668011693, 8.231605180020328, 8.618973281314338, 9.743123683317958, 8.882795710103597, 9.485391146197971, 9.227038234014854, 10.515811406550903, 9.502496859717816, 9.528866006439157, 9.652387835502346, 9.999719149199045, 8.234784023145568, 12.08218578427089, 9.404501226829378, 9.42705519837542, 10.92362016447942, 10.706137678621008, 10.384611195372534, 12.530037518195305, 7.9494995344271615, 11.560130002283893, 10.627242934381766, 10.238870914060263, 10.657001995795557, 11.168922611975656, 9.22454213363792, 8.84397491328157, 11.617739774704365, 8.66047963884963, 10.61029901352552, 10.770462026406745, 10.605468371519626, 11.372244538196599, 8.759537062234754, 9.850870841567952, 7.842285754040711, 9.698709052920634, 9.55956852768227, 9.14106926825522, 9.999822044052786, 10.46135170093459, 8.863880719140143, 10.034124291986638, 8.914170721204115, 10.472801157669885, 9.2881744697782, 10.147253456233594, 9.688734072641253, 9.042159207572507, 8.663446134759319, 9.644212732730034, 9.904167948132605, 9.700218589249761, 9.649530986132676, 10.267968065680032, 10.919737084198182, 10.415773703978761, 9.877753734712984, 8.25295618678857, 10.29999197369132, 9.704337844118152, 8.39851110688631, 9.928231225460367, 10.232315733705695, 11.278256661495051, 9.815138668351244, 10.82871741707962, 8.95016435178245, 9.872651108417015, 9.768952893928173, 10.173307192272842, 11.14381101128269, 10.167175257846964, 10.526715578223584, 9.556894557709755, 10.21885981029533, 9.103575263757648, 8.772585681910156, 10.471055468091143, 10.05593480280753, 10.75052849007012, 10.250381003510102, 9.807623655277606, 11.043392344935185, 11.443834708152007, 10.9999163199769, 11.04711997647443, 11.176424454500305, 11.323629234245239, 10.580251384393724, 10.724985843947682, 9.328074707008229, 10.083614309843068, 9.169605432073089, 10.212716761005078, 11.24071512686644, 8.066997819347282, 9.745320801891525, 11.071591706113662, 11.809083709320621, 10.19321274166031, 9.681712323692512, 9.464672050137128, 8.083831600837833, 11.152848084201906, 8.543324531243814, 9.880075223924324, 12.992811005499586, 9.627721400411446, 10.333661384541301, 7.948411839344616, 9.938657120236162, 8.84976324045843, 9.465678121139167, 11.051733592206105, 10.258274684097461, 9.654637992252383, 9.049221740360393, 8.18579716737424, 9.893519354155917, 11.256748698358216, 10.123073816686242, 11.038734509785028, 9.431189829466998, 10.663852469551184, 10.815013866498624, 9.962899902035808, 9.035699088240944, 9.155739043928861, 10.423958709401013, 9.696934935093667, 10.246394208977696, 11.234891486570675, 8.903150850641111, 11.951616294982772, 9.823011347830194, 9.664970074852166, 9.894109849164023, 10.148186854957597, 10.006061095603062, 8.734935349183761, 10.826828977763665, 10.105621687186614, 11.081922723492788, 10.340608752103632, 9.774210988574826, 10.169161501480882, 
10.213886080366082, 10.381600063015902, 9.461052205583645, 10.24785032571721, 10.683940615678047, 10.180496906660423, 9.43585641409653, 9.927764386148102, 8.747719322075184, 9.45091735909254, 10.975697344166349, 9.509308110038658, 9.678081022558306, 9.502277267983184, 10.259016336448857, 9.539699891424476, 9.404789093559481, 10.364295304224317, 10.688744903208313, 8.026676712037638, 12.036084051468308, 9.478703462476828, 9.68565551154149, 11.145124298747552, 10.931969274443409, 9.244918816151326, 9.544216459784334, 10.458945297057214, 9.092599451770562, 9.671033018084339, 10.607448835715248, 9.876467797378703, 10.738338975888386, 8.967770686569413, 9.080652887697477, 10.42526145421582, 8.869898760291694, 10.77912016692631, 9.569665325198468, 10.37410823980801, 10.001050628550791, 10.175200919020194, 9.375252223422686, 11.366118924633822, 11.542670136116437, 9.29441564739444, 10.366851903745204, 9.973814713650711, 9.730709472459985, 10.343920051962277, 10.068826867293733, 9.885834307652912, 10.282366822342045, 10.37562451057554, 10.43505589156704, 9.878269430878657, 11.557755284537198, 9.838254847423627, 9.62859113635893, 9.134138520500688, 8.962372735636661, 10.500040414057844, 10.277077264779084, 9.916017753054408, 10.992690891245179, 9.868125742567045, 9.894041706876369, 10.888698205866424, 10.21938512053016, 9.240215198536411, 9.402715526573372, 9.425784416867447, 10.077637711413354, 9.257496434456577, 8.803094224556933, 7.658770888768531, 10.280718283626205, 11.089111401858185, 10.739967696312384, 7.198882148966959, 9.924155386764681, 8.871399207899634, 10.25369946460737, 9.331568795660454, 10.007775152126294, 8.95033094644488, 9.400145602314105, 10.259496974567055, 10.793045643430844, 9.30655770341641, 11.285730341633359, 11.056860604326372, 9.505412619305067, 11.951535597779507, 10.99779212279173, 10.828945719064526, 9.011790733812278, 9.593475874473091, 9.218712441104243, 9.243302018868718, 10.00422685060251, 9.332054304813328, 9.465127473802482, 10.25565148642919, 8.663016942478883, 11.08129306826306, 10.804456475017028, 11.403740323561363, 10.855511157859507, 10.461917726961362, 9.001573075483222, 11.224144270420922, 8.689259116609671, 10.150221898412491, 9.007285700636182, 11.60867352019891, 9.457374672382427, 8.285565448166821, 11.221347137028255, 9.313644825170684, 10.002615273391601, 9.821889041799993, 10.902591223085455, 9.561263864722937, 10.646885520332864, 8.28954736489342, 9.87514987011174, 7.484577763630988, 11.750531617452692, 11.104696954648517, 9.2402705220721, 11.686375934874148, 10.003557627098806, 11.965653748348506, 10.423929821286558, 11.314345706920706, 12.248276908924456, 10.068285342526544, 8.528361303693776, 8.890515832028504, 8.851718035159678, 11.909181169855241, 12.4077439825466, 9.904043852696201, 10.166459285632175, 9.97914891045837, 9.778381917677333, 11.175540214292313, 10.1029031223738, 8.737307945662081, 9.988537333449353, 9.782111563112956, 8.500850012052682, 8.63788567318244, 10.427013514982507, 9.87365842199117, 9.796612994083167, 9.239550595343976, 11.02980932669381, 10.021260206998564, 10.111139313455114, 9.310503660057506, 10.259275858453702, 10.62398445966753, 8.853509607259221, 9.447185719307885, 9.930970246278669, 9.010205140039004, 10.053997126140763, 9.272370934554909, 9.80001532867991, 9.414154803358368, 8.876179874194799, 10.998540199447916, 10.845407438261633, 10.408355727772314, 11.262041191487352, 9.359724741813332, 9.35598632109662, 10.806049363218438, 9.535612484309597, 11.437118841946974, 9.583721412629474, 9.900483687258218, 
10.645583179998752, 9.88964472992708, 11.969775976684911, 10.364880520288668, 9.187869705602285, 10.52212059567159, 10.457434646988345], + "bimodal": [0.7225866251125405, -0.5951819764073379, -0.2679313226299394, -0.22503289285616823, 0.1432092195399402, 1.1874676802669433, 1.2766412196640815, 0.15197071140718296, -0.08787273509474242, -0.14524643717509128, -1.236408169492396, -0.1595432263317598, 1.0856768114741797, 0.5082788229519655, 0.26419244684748955, -0.2532308428977167, -0.6362679196021943, -3.134120304969242, -1.8990888524318292, 0.15701781863102648, -0.775788419966582, -0.7400872167978756, -0.10578357492485335, 0.30287010067847436, -1.2127058770179304, -0.6750567678010801, 0.3341434318919877, 1.8336516507046157, 1.105410842250908, -0.7711783703442725, -0.20834347267477862, -0.06315849766945486, 0.003016997583954831, -1.0500016329150343, -0.9168020284223636, 0.306128397266698, 1.0980602112281863, -0.10465519493772572, 0.4557797534454941, -0.2524452955086468, -1.6176089110359837, 0.46251282530754667, 0.45751208998354903, 0.4222844954971609, 0.9651098606162691, -0.1364401431697167, -0.4988616288584964, -0.29549238375582904, 0.6950204582392359, 0.2975369992016046, -1.0159498719807218, 1.3704532401348395, 1.1210419577766673, 1.2051869452003332, 0.10749349867353084, -3.1876892257116562, 1.316240976262548, -1.3777452919511493, -1.0666211985935259, 1.605446695828751, -0.39682821266996865, -0.2828059717857655, 1.30488698803017, -2.116606225467923, -0.2026680301462151, -0.05504008273574069, -0.028520163428411835, 0.4424105678123449, -0.3427628263418371, 0.23805293411919937, -0.7515414823259695, -0.1272505897548366, 1.803348436304099, -2.0178252709022124, 0.4860300090112474, 1.2304054166426217, 0.7228668982068365, 1.7400607500575112, 0.3480274098246697, -0.3887978895385282, -1.6511926233909175, 0.14517929503564567, -1.1599010576123796, -0.016133552438119002, 0.47157644883706273, 0.27657785075518254, 1.4464286976282463, -1.2605489185634533, -1.2548765025615338, 0.0755319579826929, 1.0476733637516833, -0.7038690219524807, -0.9580696842862921, -0.18135657098008018, -0.18163993379314564, 0.4092798531146971, -2.049808182546896, -1.2447062617916826, -1.6681140306283337, 1.0709944517933483, -0.7059385234342846, -0.8033587669003331, -1.8152275905903312, 0.11729996097670137, 2.2994900038012376, -0.1291192451734159, -0.6731565869164164, -0.06690994571366346, -0.40330072968473235, -0.23927186025094221, 2.7756216937096676, 0.06441299443146056, -0.5095247173507204, -0.5228853558871007, 0.806629654091097, -2.110096084114651, -0.1233374136509439, -1.021178519845751, 0.058906278340351045, -0.26316852406211017, -1.2990807244026237, -0.1937986598084067, 0.3909222793445317, 0.578027315076297, -0.11837271520846208, -1.134297652720464, 0.496915417153268, -0.5315184110418045, 0.5284176849952198, -1.6810338988102331, 0.41220454054009154, 1.0554031136792, -1.4222775023918832, -1.1664353586956209, 0.018952180522661358, -0.04620616876577671, -0.8446292647938418, -0.6889432180332509, -0.16012081070647954, 0.5680940644754282, -1.9792941921407943, 0.35441842206114726, 0.12433268557499534, 0.25366905921805377, 0.6262297786892028, 1.327981424671081, 1.774834324890265, -0.9725604763128438, 0.42824027889428, 0.19725541390327114, 1.4640606982992412, 1.6484993842838995, 0.009848260786412894, -2.318740403198263, -0.4125245127403577, -0.15500831770388285, 1.010740123094443, 0.7509498708766653, -0.021415407776108144, 0.6466776546788641, -1.421096837521404, 0.5632248951325018, -1.230539161899903, -0.26766333435961503, 
-1.7208241092827994, -1.068122926814994, -1.6339248620455546, 0.07225436117508208, -1.2018233250224348, -0.07213000691963527, -1.0080992229563746, -1.151378048476321, -0.2660104149809121, 1.6307779136408695, 0.8394822016824073, -0.23362802143120032, -0.36799502320054384, 0.35359852278856263, 0.5830948999779656, -0.730683771776052, 1.4715728371820667, -1.0668090648998136, -1.025762014881618, 0.21056106958224155, -0.5141254207774576, -0.1592942838690149, 0.7688711617969363, -2.464535892598544, -0.33306989349452987, 0.9457207224940593, 0.36108072442574435, -0.6490066877470516, -0.8714147266896871, 0.6567118414749348, -0.18543305444915045, 0.11156511615955596, 0.7299392157186994, -0.9902398239693843, -1.3231344439063761, -1.1402773433114928, 0.3696183719476138, -1.0512718152423168, -0.6093518314203102, 0.0010622538704462257, -0.17676306948277776, -0.6291120128576891, 1.6390197341434742, -0.8105788162716191, -2.0105672384392204, -0.7909143328024505, -0.10510684692203587, -0.013384480496840259, 0.37683659744804815, -0.15123337965442354, 1.8427651248902048, 1.0371006855495906, 0.29198928612503655, -1.7455852392709181, 1.0854545339796853, 1.8156620972829793, 1.2399563224061596, 1.1196530775769857, 0.4349954478175989, 0.11093680938321168, 0.9945934589378227, -0.5779739742428905, 1.0398502505219054, -0.09401160691650227, 0.22793239636661505, -1.8664992140331715, -0.16104499274010126, -0.8497511318264537, -0.005035074822415585, -1.7956896952184151, 1.8304783101189757, 0.19094408763231646, 1.3353023874309002, 0.5889134606052353, -0.48487660139277866, 0.4817014755127622, 1.5981632863770983, 2.1416849775567943, -0.5524061711669017, 0.3364804821524787, -0.8609687548167294, 0.24548635047971906, -0.1281468603588133, -0.03871410517044196, -0.2678174852638268, 0.41800607312114096, -0.2503930647517959, 0.8432391494945226, -0.5684563173706987, -0.6737077809046504, 2.0559579098493606, -0.29098826888414253, -0.08572747304559661, -0.301857666880195, -0.3446199959065524, 0.7391340848217359, -0.3087136212446006, 0.5245553707204758, -3.063281336805349, 0.47471623010413705, 0.3733427291759615, -0.26216851429591426, -0.5433523111756248, 0.3305385199964823, -1.4866150542941634, -0.4699911958560942, 0.7312367186673805, -0.22346998944216903, -0.4102860865811592, -0.3003478250288424, -0.3436168605845268, 0.9456524589400904, -0.03710285453384255, 0.10330609878001526, 0.6919858329179392, 0.8673477607085118, 0.380742577915601, 0.5785785515837437, -0.011421905830097267, 0.587187810965595, -1.172536467775141, -0.532086162097372, -0.34440413367820183, -1.404900386188497, -0.1916375229779241, 1.6910999461291834, -0.6070351182769795, -0.8371447893868493, 0.8853944070432224, 1.4062946075925473, -0.4575973141608374, 1.1458755768004445, 0.2619874618238163, 1.7105876844856704, -1.3938976454537522, -0.11403217166441704, -1.0354305240085717, -0.4285770475062154, 0.10326635421187867, 0.6911853442971228, 0.6293835213179542, -0.819693698713199, -0.7378190403744175, -1.495947672573938, -1.2406693914431872, -1.0486341638186725, -1.3715759883075953, 3.585407817418151, -0.8007079372574223, -1.527336776754733, -0.4716571043072485, -0.6967311271405545, 1.0003347462169225, -0.30569565002022697, 0.3646134876772732, 0.49083033603832493, 0.07754580794955847, -0.13467337850920083, 0.02134473458605164, 0.5025183900540823, -0.940929087894874, 1.441600637127558, -0.0857298131221344, -0.575175243519591, 0.42622029657630595, -0.3239674701415489, 0.22648849821602596, -0.6636465305318631, 0.30415000329164754, -0.6170241274574016, 0.07578674772163065, 
0.2952841441615124, 0.8120317689468056, -0.46861353019671337, 0.04718559572470416, -0.3105660017232523, -0.28898463203535724, 0.9575298065734561, -0.1977556031830993, 0.009658232624257272, 1.1432743259603295, -1.8989396918936858, 0.20787070770386357, 1.4256750543782999, -0.03838329973778874, -0.9051229357470373, -1.2002277085489457, 2.405569956130733, 1.895817948326675, -0.8260858325924574, 0.5759061866255807, 2.7022875569683342, 1.0591327405967745, 0.21449833798124354, 0.19970388388081273, 0.018242139911433558, -0.630960146999549, -2.389646042147776, 0.5424304992480339, -1.2159551561948718, -1.6851632640204128, -0.4812221268109694, 0.6217652794219579, -0.380139431677482, -0.2643524783321051, 0.5106648694993016, -0.895602157034141, -0.20559568725141816, 1.5449271875734911, 1.544075783565114, 0.17877619857826843, 1.9729717339967108, 0.8302033109816261, -0.39118561199170965, -0.4428357598297098, -0.02550407946753186, -1.0202977138210447, 2.6604654314300835, 1.9163029269361842, 0.34697436596877657, -0.8078124769022497, -1.3876596649099957, 0.44707250163663864, -0.6752837232272447, -0.851291770954755, 0.7599767868730256, 0.8134109401706875, -1.6766750539980289, -0.06051832829232975, -0.4652931327216134, -0.9249124398287735, 1.9022739762222731, 1.7632300613807597, 1.675335012283785, 0.47529854476887495, -0.7892463423254658, 0.3910120652706098, 0.5812432547936405, 0.2693084649672777, -0.08138564925779349, 0.9150619269526952, -0.8637356349272142, -0.14137853834901817, -0.20192754829896423, 0.04718228147088756, -0.9743600144318, -0.9936290943927825, 0.3544612180477054, 0.6839546770735121, 1.5089070357620178, 1.301167565172228, -1.5396145667672985, 0.42854366341485456, -1.5876582617301032, -0.0316985879141714, 0.3144220016570915, -0.05054766725644431, 0.2934139006870167, 0.11396170275994542, -0.6472140129693643, 1.6556030742445431, 1.0319410208453506, 0.3292217603989991, -0.058758121958605435, -0.19917171648476298, -0.5192866115874029, 0.1997510689920335, -1.3675686656161756, -1.7761517497832053, -0.11260276070167097, 0.9717892642758689, 0.0840815981843948, -0.40211265381258554, 0.27384496844034517, -1.0403875081272367, 1.2884781173493884, -1.8066239592554476, 1.1136979156298865, -0.06223155785690416, 1.3930381289015936, 0.4586305673655182, 1.3159249757827194, -0.5369892835955705, 0.17827408233621184, 0.22693934439969682, 0.8216240002114816, -1.0422409752281838, 0.3329686606709231, -1.5128804353968217, 1.0323052869815534, 1.1640486934424354, 1.6450118078345612, -0.6717687395070293, -0.08135119186406627, 1.2746921873544188, -0.8255794145095643, 0.7123504776564864, 0.6953336934741682, 2.191382322698439, 1.4155790749261592, 2.4681081786912866, -2.2904357033803815, -0.8375155191566624, 1.1040106662196736, 0.7084133268872015, -3.401968681942055, 0.23237090512844757, 1.1199436238058174, 0.6333916486592628, -0.6012340913121055, -0.3693951838866523, -1.7742670566875682, -0.36431378282545124, -0.4042586409194551, -0.04648644034604476, 1.5138191613743486, -0.2053670782251071, 1.8679122383251414, 0.8355881018692999, -0.5369705129279005, -0.7909355080370954, 2.1080036780007987, 0.019537331188020687, -1.4672982688640615, -1.486842866467901, -1.1036839537574874, 1.0800858540685894, -0.2313974176207594, 0.47763272078271807, -1.9196070490691473, -0.8193535127855751, -0.6853651905832031, -0.18272370464882973, -0.33413577684633056, 2.2261342671906106, 1.6853726343573683, 9.815173154691152, 9.195721936851381, 9.454764722402919, 12.39028070853414, 11.077109184383136, 11.11018274263151, 8.253573613752238, 
9.249224389601387, 10.661017897574766, 10.636424837770212, 11.098766960743015, 10.849052705101403, 10.312653805479268, 12.512271418658774, 11.515927866402878, 10.805646900536944, 9.615772020628349, 8.135249052972544, 11.223128141846045, 10.99720813452923, 10.74657586919516, 10.12948400807092, 9.375281731983863, 9.24839885126103, 8.372084289437371, 8.93659570161449, 10.610084747373708, 10.551196629616179, 8.408813929020738, 10.52449654468341, 9.565964176571338, 10.19516240210464, 8.978311891652176, 10.338322015779333, 11.750898927951285, 10.011317551924526, 8.695758705558697, 9.339763109322439, 10.001058719022168, 9.942625377081312, 11.304785028918761, 9.987200879300868, 11.679002903338883, 8.998885072444704, 8.819784995357129, 10.568184132145879, 11.659125713002139, 10.762336914469303, 9.32320260988687, 9.725203078628804, 9.964074050288879, 8.227648949871316, 8.379280528982454, 9.043592955512342, 10.673959852055166, 11.04178950765339, 9.687649733091025, 9.41133684548883, 10.142143695235507, 9.885787857653991, 10.77036436129569, 10.199859160532041, 11.906921437391155, 9.092145565795091, 9.599859593366261, 10.006151121719428, 10.251489353814739, 11.665149283823462, 10.36216926512228, 9.233047308296833, 9.766323940883256, 9.934689941809895, 8.920980497968037, 10.700167568037603, 11.972059548230819, 11.807975183063977, 9.780838822594752, 9.082996827746783, 8.66457095646258, 8.235356999576195, 10.07717712804332, 9.41169774916071, 9.73444471105237, 8.510626864453481, 11.0483629493406, 10.075424506768783, 9.71618118767198, 9.855777633540233, 9.321269926037239, 10.31344862832386, 10.44712322592716, 9.483081235060903, 8.840619905612026, 12.312860683751307, 11.387457811126849, 10.91254103125626, 10.26528523590661, 9.36564044255442, 9.682702879610524, 10.633009560329048, 8.953172548723792, 10.31081304976398, 11.046888690584755, 11.0782772826774, 9.915055567590793, 9.280605798698062, 10.982861748765211, 9.784771773336795, 9.03417297376102, 9.692965865033948, 10.040871553746763, 10.385450302697754, 10.651371663079813, 7.759842177973297, 10.175662480896953, 9.01272511209246, 9.71991735582558, 11.55424986087538, 10.89862781328883, 10.230811407645387, 9.527776922664856, 9.655463047580092, 10.244845872335564, 7.667143349060906, 9.560890691888703, 8.173301714205879, 10.491515746056718, 9.567419698329202, 10.105088711303909, 9.469230551349817, 11.179940410352575, 9.006175708698736, 10.468487876224614, 10.268229766953757, 9.462441000556659, 10.390072406239947, 10.808028783574198, 9.749151257518486, 8.100460700253574, 9.625207212566387, 10.88671697913831, 11.134035258727796, 8.379469292119362, 10.24713724308472, 11.246175642752887, 9.413082181899219, 10.130347960228239, 10.307506705752774, 11.880659432945912, 10.806518362233641, 10.713009745795166, 11.64271936471556, 9.356463282602544, 10.018025051295322, 10.457373698061431, 9.544828164290337, 9.351138661160832, 12.72723771325665, 7.978379090659482, 10.086368596519465, 8.580341410491465, 9.998579668011693, 8.231605180020328, 8.618973281314338, 9.743123683317958, 8.882795710103597, 9.485391146197971, 9.227038234014854, 10.515811406550903, 9.502496859717816, 9.528866006439157, 9.652387835502346, 9.999719149199045, 8.234784023145568, 12.08218578427089, 9.404501226829378, 9.42705519837542, 10.92362016447942, 10.706137678621008, 10.384611195372534, 12.530037518195305, 7.9494995344271615, 11.560130002283893, 10.627242934381766, 10.238870914060263, 10.657001995795557, 11.168922611975656, 9.22454213363792, 8.84397491328157, 11.617739774704365, 8.66047963884963, 
10.61029901352552, 10.770462026406745, 10.605468371519626, 11.372244538196599, 8.759537062234754, 9.850870841567952, 7.842285754040711, 9.698709052920634, 9.55956852768227, 9.14106926825522, 9.999822044052786, 10.46135170093459, 8.863880719140143, 10.034124291986638, 8.914170721204115, 10.472801157669885, 9.2881744697782, 10.147253456233594, 9.688734072641253, 9.042159207572507, 8.663446134759319, 9.644212732730034, 9.904167948132605, 9.700218589249761, 9.649530986132676, 10.267968065680032, 10.919737084198182, 10.415773703978761, 9.877753734712984, 8.25295618678857, 10.29999197369132, 9.704337844118152, 8.39851110688631, 9.928231225460367, 10.232315733705695, 11.278256661495051, 9.815138668351244, 10.82871741707962, 8.95016435178245, 9.872651108417015, 9.768952893928173, 10.173307192272842, 11.14381101128269, 10.167175257846964, 10.526715578223584, 9.556894557709755, 10.21885981029533, 9.103575263757648, 8.772585681910156, 10.471055468091143, 10.05593480280753, 10.75052849007012, 10.250381003510102, 9.807623655277606, 11.043392344935185, 11.443834708152007, 10.9999163199769, 11.04711997647443, 11.176424454500305, 11.323629234245239, 10.580251384393724, 10.724985843947682, 9.328074707008229, 10.083614309843068, 9.169605432073089, 10.212716761005078, 11.24071512686644, 8.066997819347282, 9.745320801891525, 11.071591706113662, 11.809083709320621, 10.19321274166031, 9.681712323692512, 9.464672050137128, 8.083831600837833, 11.152848084201906, 8.543324531243814, 9.880075223924324, 12.992811005499586, 9.627721400411446, 10.333661384541301, 7.948411839344616, 9.938657120236162, 8.84976324045843, 9.465678121139167, 11.051733592206105, 10.258274684097461, 9.654637992252383, 9.049221740360393, 8.18579716737424, 9.893519354155917, 11.256748698358216, 10.123073816686242, 11.038734509785028, 9.431189829466998, 10.663852469551184, 10.815013866498624, 9.962899902035808, 9.035699088240944, 9.155739043928861, 10.423958709401013, 9.696934935093667, 10.246394208977696, 11.234891486570675, 8.903150850641111, 11.951616294982772, 9.823011347830194, 9.664970074852166, 9.894109849164023, 10.148186854957597, 10.006061095603062, 8.734935349183761, 10.826828977763665, 10.105621687186614, 11.081922723492788, 10.340608752103632, 9.774210988574826, 10.169161501480882, 10.213886080366082, 10.381600063015902, 9.461052205583645, 10.24785032571721, 10.683940615678047, 10.180496906660423, 9.43585641409653, 9.927764386148102, 8.747719322075184, 9.45091735909254, 10.975697344166349, 9.509308110038658, 9.678081022558306, 9.502277267983184, 10.259016336448857, 9.539699891424476, 9.404789093559481, 10.364295304224317, 10.688744903208313, 8.026676712037638, 12.036084051468308, 9.478703462476828, 9.68565551154149, 11.145124298747552, 10.931969274443409, 9.244918816151326, 9.544216459784334, 10.458945297057214, 9.092599451770562, 9.671033018084339, 10.607448835715248, 9.876467797378703, 10.738338975888386, 8.967770686569413, 9.080652887697477, 10.42526145421582, 8.869898760291694, 10.77912016692631, 9.569665325198468, 10.37410823980801, 10.001050628550791, 10.175200919020194, 9.375252223422686, 11.366118924633822, 11.542670136116437, 9.29441564739444, 10.366851903745204, 9.973814713650711, 9.730709472459985, 10.343920051962277, 10.068826867293733, 9.885834307652912, 10.282366822342045, 10.37562451057554, 10.43505589156704, 9.878269430878657, 11.557755284537198, 9.838254847423627, 9.62859113635893, 9.134138520500688, 8.962372735636661, 10.500040414057844, 10.277077264779084, 9.916017753054408, 10.992690891245179, 9.868125742567045, 
9.894041706876369, 10.888698205866424, 10.21938512053016, 9.240215198536411, 9.402715526573372, 9.425784416867447, 10.077637711413354, 9.257496434456577, 8.803094224556933, 7.658770888768531, 10.280718283626205, 11.089111401858185, 10.739967696312384, 7.198882148966959, 9.924155386764681, 8.871399207899634, 10.25369946460737, 9.331568795660454, 10.007775152126294, 8.95033094644488, 9.400145602314105, 10.259496974567055, 10.793045643430844, 9.30655770341641, 11.285730341633359, 11.056860604326372, 9.505412619305067, 11.951535597779507, 10.99779212279173, 10.828945719064526, 9.011790733812278, 9.593475874473091, 9.218712441104243, 9.243302018868718, 10.00422685060251, 9.332054304813328, 9.465127473802482, 10.25565148642919, 8.663016942478883, 11.08129306826306, 10.804456475017028, 11.403740323561363, 10.855511157859507, 10.461917726961362, 9.001573075483222, 11.224144270420922, 8.689259116609671, 10.150221898412491, 9.007285700636182, 11.60867352019891, 9.457374672382427, 8.285565448166821, 11.221347137028255, 9.313644825170684, 10.002615273391601, 9.821889041799993, 10.902591223085455, 9.561263864722937, 10.646885520332864, 8.28954736489342, 9.87514987011174, 7.484577763630988, 11.750531617452692, 11.104696954648517, 9.2402705220721, 11.686375934874148, 10.003557627098806, 11.965653748348506, 10.423929821286558, 11.314345706920706, 12.248276908924456, 10.068285342526544, 8.528361303693776, 8.890515832028504, 8.851718035159678, 11.909181169855241, 12.4077439825466, 9.904043852696201, 10.166459285632175, 9.97914891045837, 9.778381917677333, 11.175540214292313, 10.1029031223738, 8.737307945662081, 9.988537333449353, 9.782111563112956, 8.500850012052682, 8.63788567318244, 10.427013514982507, 9.87365842199117, 9.796612994083167, 9.239550595343976, 11.02980932669381, 10.021260206998564, 10.111139313455114, 9.310503660057506, 10.259275858453702, 10.62398445966753, 8.853509607259221, 9.447185719307885, 9.930970246278669, 9.010205140039004, 10.053997126140763, 9.272370934554909, 9.80001532867991, 9.414154803358368, 8.876179874194799, 10.998540199447916, 10.845407438261633, 10.408355727772314, 11.262041191487352, 9.359724741813332, 9.35598632109662, 10.806049363218438, 9.535612484309597, 11.437118841946974, 9.583721412629474, 9.900483687258218, 10.645583179998752, 9.88964472992708, 11.969775976684911, 10.364880520288668, 9.187869705602285, 10.52212059567159, 10.457434646988345] + }, + "schemas": { + "spark": { + "norm_0_1": "FloatType", + "norm_1_1": "FloatType", + "norm_10_1": "FloatType", + "bimodal": "FloatType" + } + }, + "tests": [ + { + "title": "Basic positive test: extremes", + "exact_match_out": false, + "in": { + "column": "norm_0_1", + "quantile_ranges": { + "quantiles": [0.0, 1.0], + "value_ranges": [[null, -3], [3, null]] + } + }, + "out": { + "success": true, + "observed_value": { + "quantiles": [0.0, 1.0], + "values": [-3.40197, 3.58541] + } + }, + "tolerance": 0.1, + "_note": "The large tolerance here documents implementation differences between pandas, sql, and spark wrt interpolation behavior / specific ntile calculation", + "suppress_test_for": ["sqlite"] + }, + { + "title": "Basic positive test: normal quartiles", + "exact_match_out": false, + "in": { + "column": "norm_0_1", + "quantile_ranges": { + "quantiles": [0.0, 0.25, 0.5, 0.75, 1.0], + "value_ranges": [[null, -3], [-0.8, -0.6], [-0.1, 0.1], [0.6, 0.8], [3, null]] + } + }, + "out": { + "success": true, + "observed_value": { + "quantiles": [0.0, 0.25, 0.5, 0.75, 1.0], + "values": [-3.40197, -0.70025, -0.03871, 0.62116, 
3.58541] + } + }, + "tolerance": 0.1, + "_note": "The large tolerance here documents implementation differences between pandas, sql, and spark wrt interpolation behavior / specific ntile calculation", + "suppress_test_for": ["sqlite"] + }, + { + "title": "Basic positive test: uneven spacing", + "exact_match_out": false, + "in": { + "column": "norm_0_1", + "quantile_ranges": { + "quantiles": [0.0, 0.05, 0.25, 0.5, 0.75, 0.95, 1.0], + "value_ranges": [[null, -3], [-2, -1.5], [-0.8, -0.6], [-0.1, 0.1], [0.6, 0.8], [1.5, 2], [3, null]] + } + }, + "out": { + "success": true, + "observed_value": { + "quantiles": [0.0, 0.05, 0.25, 0.5, 0.75, 0.95, 1.0], + "values": [-3.40196868, -1.72089571, -0.70115633, -0.04059954, 0.62130846, 1.6855355 , 3.58540782] + } + }, + "suppress_test_for": ["sqlite"], + "tolerance": 0.1, + "_note": "The large tolerance here documents implementation differences between pandas, sql, and spark wrt interpolation behavior / specific ntile calculation" + }, + { + "title": "Basic negative test: normal quartiles, wrong distribution", + "exact_match_out": false, + "in": { + "column": "norm_1_1", + "quantile_ranges": { + "quantiles": [0.0, 0.25, 0.5, 0.75, 1.0], + "value_ranges": [[null, -3], [-0.8, -0.6], [-0.1, 0.1], [0.6, 0.8], [3, null]] + } + }, + "out": { + "success": false + }, + "suppress_test_for": ["sqlite"] + } + ] + }] +} \ No newline at end of file diff --git a/tests/test_definitions/column_aggregate_expectations/expect_column_stdev_to_be_between.json b/tests/test_definitions/column_aggregate_expectations/expect_column_stdev_to_be_between.json index e00db54f4985..119ce69a7d8e 100644 --- a/tests/test_definitions/column_aggregate_expectations/expect_column_stdev_to_be_between.json +++ b/tests/test_definitions/column_aggregate_expectations/expect_column_stdev_to_be_between.json @@ -21,7 +21,20 @@ "success": true, "observed_value": 1.1547005383792517 } - },{ + }, + { + "title": "Vacuously true", + "exact_match_out": false, + "tolerance": 0.000000000001, + "in": { + "column": "dist1" + }, + "out": { + "success": true, + "observed_value": 1.1547005383792517 + } + }, + { "title" : "Positive test, exact min and max", "exact_match_out" : false, "in": { @@ -33,7 +46,7 @@ "success": true, "observed_value": 1.1547005383792517 }, - "suppress_test_for": ["Spark"] + "suppress_test_for": ["spark"] }, { "title": "Basic Negative test", @@ -126,19 +139,6 @@ "missing_percent": 0.3333333333333333 } } - },{ - "title": "Test exception, both min and max null", - "exact_match_out": false, - "in": { - "column": "missing", - "min_value": null, - "max_value": null, - "catch_exceptions": true - }, - "out": {}, - "error": { - "traceback_substring": "min_value and max_value cannot both be None" - } }] }] } \ No newline at end of file diff --git a/tests/test_definitions/column_aggregate_expectations/expect_column_sum_to_be_between.json b/tests/test_definitions/column_aggregate_expectations/expect_column_sum_to_be_between.json index 5dfdffa404e8..041dab5436b5 100644 --- a/tests/test_definitions/column_aggregate_expectations/expect_column_sum_to_be_between.json +++ b/tests/test_definitions/column_aggregate_expectations/expect_column_sum_to_be_between.json @@ -12,31 +12,31 @@ }, "schemas": { "sqlite": { - "w": "int", - "x": "int", - "y": "int", - "z": "varchar", - "zz": "datetime", - "a": "int", - "b": "int" + "w": "INTEGER", + "x": "INTEGER", + "y": "INTEGER", + "z": "VARCHAR", + "zz": "DATETIME", + "a": "INTEGER", + "b": "INTEGER" }, "postgresql": { - "w": "int", - "x": "int", - "y": "int", - "z": 
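The 0.1 tolerance attached to the quantile tests above is deliberate: as the accompanying _note records, pandas, SQL, and Spark estimate quantiles with different interpolation and ntile rules, so their observed values legitimately disagree by small margins. A minimal sketch of the effect, assuming only numpy (whose newer releases spell the keyword method= instead of interpolation=):

    import numpy as np

    rng = np.random.RandomState(0)
    sample = rng.normal(loc=0.0, scale=1.0, size=1000)

    # The same data yields slightly different quantile estimates under each
    # interpolation rule; the fixture tolerance absorbs exactly this spread.
    for rule in ("linear", "lower", "higher", "nearest"):
        print(rule, np.round(np.percentile(sample, [25, 50, 75], interpolation=rule), 5))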
"text", - "zz": "timestamp", - "a": "int", - "b": "int" + "w": "INTEGER", + "x": "INTEGER", + "y": "INTEGER", + "z": "TEXT", + "zz": "TIMESTAMP", + "a": "INTEGER", + "b": "INTEGER" }, "spark": { - "w": "int", - "x": "int", - "y": "int", - "z": "string", - "zz": "string", - "a": "int", - "b": "int" + "w": "IntegerType", + "x": "IntegerType", + "y": "IntegerType", + "z": "StringType", + "zz": "StringType", + "a": "IntegerType", + "b": "IntegerType" } }, "tests" : [{ diff --git a/tests/test_definitions/column_distributional_expectations/expect_column_kl_divergence_to_be_less_than.json b/tests/test_definitions/column_distributional_expectations/expect_column_kl_divergence_to_be_less_than.json index f677f2fc9822..89387fa6d11d 100644 --- a/tests/test_definitions/column_distributional_expectations/expect_column_kl_divergence_to_be_less_than.json +++ b/tests/test_definitions/column_distributional_expectations/expect_column_kl_divergence_to_be_less_than.json @@ -113,11 +113,18 @@ }, "schemas": { "spark": { - "norm_0_1": "float", - "norm_1_1": "float", - "norm_10_1": "float", - "bimodal": "float", - "categorical_fixed": "string" + "norm_0_1": "FloatType", + "norm_1_1": "FloatType", + "norm_10_1": "FloatType", + "bimodal": "FloatType", + "categorical_fixed": "StringType" + }, + "postgresql": { + "norm_0_1": "NUMERIC", + "norm_1_1": "NUMERIC", + "norm_10_1": "NUMERIC", + "bimodal": "NUMERIC", + "categorical_fixed": "TEXT" } }, "tests": [ @@ -169,6 +176,35 @@ "success": false } }, + { + "title": "norm_0_1_auto_inf partition - null threshold should always succeed", + "exact_match_out": false, + "in": { + "column": "norm_0_1", + "partition_object": { + "bins": [ -Infinity, -3.2412673400690726, -2.987910238971794, -2.734553137874516, -2.481196036777238, -2.2278389356799595, -1.974481834582681, -1.7211247334854027, -1.4677676323881244, -1.214410531290846, -0.9610534301935676, -0.7076963290962892, -0.45433922799901083, -0.2009821269017329, 0.05237497419554549, 0.30573207529282387, 0.5590891763901022, 0.8124462774873806, 1.065803378584659, 1.3191604796819374, 1.5725175807792158, 1.8258746818764942, 2.0792317829737725, 2.332588884071051, 2.5859459851683293, 2.839303086265607, 3.092660187362885, 3.3460172884601636, 3.599374389557442, 3.852731490654721, Infinity], + "weights": [ 0.005, 0.00099, 0.0, 0.00297, 0.00297, 0.010889999999999999, 0.01287, 0.028710000000000003, 0.04554, 0.052469999999999996, 0.0693, 0.10196999999999999, 0.08613, 0.09207, 0.1089, 0.08811, 0.08514, 0.06336, 0.0396, 0.03762, 0.02277, 0.01683, 0.010889999999999999, 0.00594, 0.00198, 0.00099, 0.0, 0.0, 0.00099, 0.005 ] + }, + "threshold": null + }, + "out": { + "success": true + } + }, + { + "title": "norm_0_1_auto_inf partition - null threshold *and* partition object supports profiling", + "exact_match_out": false, + "in": { + "column": "norm_0_1", + "partition_object": null, + "threshold": null + }, + "out": { + "success": true + }, + "suppress_test_for": ["sqlite"], + "_notes": "this test is suppressed for sqlite since it generates partitions, which may not be possible when sqlite doesn't support the ntile windowing function" + }, { "title": "norm_0_1_auto_inf partition - tail weights - should fail with no internal holdout", "exact_match_out": false, @@ -383,8 +419,8 @@ }, "schemas": { "spark": { - "x": "float", - "y": "float" + "x": "FloatType", + "y": "FloatType" } }, "tests": [ @@ -498,7 +534,7 @@ "z": [-0.5, 0.5, 0.5, 1.5, 1.5, 1.5, 1.5, 2.5, 2.5, 3.5] }, "schemas": { - "z": "float" + "z": "FloatType" }, "tests": [ { @@ -601,22 +637,6 
@@ "traceback_substring": "ValueError" } }, - { - "title": "missing threshold", - "exact_match_out": false, - "in": { - "column": "z", - "partition_object": { - "bins": [0, 1, 2, 3], - "weights": [0.2, 0.4, 0.2], - "tail_weights": [0.1, 0.1] - }, - "catch_exceptions": true - }, - "out": { - "traceback_substring": "ValueError" - } - }, { "title": "too-big tail_weight", "exact_match_out": false, @@ -686,8 +706,8 @@ }, "schemas": { "spark": { - "x": "float", - "x2": "float" + "x": "FloatType", + "x2": "FloatType" } }, "tests": [ diff --git a/tests/test_definitions/column_map_expectations/expect_column_value_lengths_to_be_between.json b/tests/test_definitions/column_map_expectations/expect_column_value_lengths_to_be_between.json index 5e1964587f14..376314687567 100644 --- a/tests/test_definitions/column_map_expectations/expect_column_value_lengths_to_be_between.json +++ b/tests/test_definitions/column_map_expectations/expect_column_value_lengths_to_be_between.json @@ -7,6 +7,14 @@ "s3":["cool","calm","collected","casual", null], "s4":[1,2,3,4,5] }, + "schemas": { + "spark": { + "s1": "StringType", + "s2": "StringType", + "s3": "StringType", + "s4": "IntegerType" + } + }, "tests": [{ "title": "Positive test, exact min and max", "exact_match_out" : false, diff --git a/tests/test_definitions/column_map_expectations/expect_column_values_to_be_between.json b/tests/test_definitions/column_map_expectations/expect_column_values_to_be_between.json index 6824b22a34c9..65280f4fcb8a 100644 --- a/tests/test_definitions/column_map_expectations/expect_column_values_to_be_between.json +++ b/tests/test_definitions/column_map_expectations/expect_column_values_to_be_between.json @@ -21,21 +21,29 @@ "numeric": ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10"] }, "schemas": { + "spark": { + "x": "IntegerType", + "y": "StringType", + "z": "IntegerType", + "ts": "TimestampType", + "alpha": "StringType", + "numeric": "IntegerType" + }, "sqlite": { - "x": "int", - "y": "varchar", - "z": "int", - "ts": "datetime", - "alpha": "varchar", - "numeric": "int" + "x": "INTEGER", + "y": "VARCHAR", + "z": "INTEGER", + "ts": "DATETIME", + "alpha": "VARCHAR", + "numeric": "INTEGER" }, "postgresql": { - "x": "int", - "y": "text", - "z": "int", - "ts": "timestamp", - "alpha": "text", - "numeric": "int" + "x": "INTEGER", + "y": "TEXT", + "z": "INTEGER", + "ts": "TIMESTAMP", + "alpha": "TEXT", + "numeric": "INTEGER" } }, "tests": [ @@ -356,9 +364,9 @@ "exact_match_out": false, "out": { "unexpected_list": [ - "Jan 01 1970 12:00:01", - "Dec 31 1999 12:00:01", - "Jan 01 2001 12:00:01" + "Jan 01 1970 12:00", + "Dec 31 1999 12:00", + "Jan 01 2001 12:00" ], "unexpected_index_list": [ 0, 1, 9 @@ -370,7 +378,7 @@ "max_value": "Dec 31 2000", "min_value": "Jan 01 2000", "parse_strings_as_datetimes": true, - "output_strftime_format" : "%b %d %Y %H:%M:%S" + "output_strftime_format" : "%b %d %Y %H:%M" } }, { @@ -418,7 +426,8 @@ 9 ], "success": false - } + }, + "suppress_test_for": ["spark"] }, { "title": "Test allow_cross_type_comparisons again", @@ -435,7 +444,8 @@ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 ], "success": false - } + }, + "suppress_test_for": ["spark"] }, { "title": "Verify that min_value=max_value=None raises an error", diff --git a/tests/test_definitions/column_map_expectations/expect_column_values_to_be_in_set.json b/tests/test_definitions/column_map_expectations/expect_column_values_to_be_in_set.json index 4cc22910e2c7..0196d9f65614 100644 --- a/tests/test_definitions/column_map_expectations/expect_column_values_to_be_in_set.json +++ 
b/tests/test_definitions/column_map_expectations/expect_column_values_to_be_in_set.json @@ -18,6 +18,17 @@ "success": true } }, + { + "title": "Vacuously true - empty value_set", + "exact_match_out": false, + "in": { + "column": "x", + "value_set": null + }, + "out": { + "success": true + } + }, { "title": "Negative test case, exclude existing column value", "exact_match_out": false, @@ -125,7 +136,7 @@ }, "schemas": { "postgresql": { - "empty_column": "text" + "empty_column": "TEXT" } }, "tests": [{ @@ -147,10 +158,10 @@ }, "schemas": { "sqlite": { - "dates": "datetime" + "dates": "DATETIME" }, "postgresql": { - "dates": "timestamp" + "dates": "TIMESTAMP" }, "pandas": { "dates": "datetime64" diff --git a/tests/test_definitions/column_map_expectations/expect_column_values_to_be_in_type_list.json b/tests/test_definitions/column_map_expectations/expect_column_values_to_be_in_type_list.json index 7a6a4e248ec3..d72e38fcea85 100644 --- a/tests/test_definitions/column_map_expectations/expect_column_values_to_be_in_type_list.json +++ b/tests/test_definitions/column_map_expectations/expect_column_values_to_be_in_type_list.json @@ -20,6 +20,42 @@ "b": "bool", "s": "str", "s1": "str" + }, + "postgresql": { + "x": "INTEGER", + "y": "DOUBLE_PRECISION", + "z": "TEXT", + "n": "TEXT", + "b": "BOOLEAN", + "s": "TEXT", + "s1": "TEXT" + }, + "spark": { + "x": "IntegerType", + "y": "FloatType", + "z": "StringType", + "n": "StringType", + "b": "BooleanType", + "s": "StringType", + "s1": "StringType" + }, + "sqlite": { + "x": "INTEGER", + "y": "FLOAT", + "z": "VARCHAR", + "n": "VARCHAR", + "b": "BOOLEAN", + "s": "VARCHAR", + "s1": "VARCHAR" + }, + "mysql": { + "x": "INTEGER", + "y": "FLOAT", + "z": "TEXT", + "n": "TEXT", + "b": "BOOLEAN", + "s": "TEXT", + "s1": "TEXT" } }, "tests": [ @@ -33,7 +69,22 @@ "out":{ "success":true, "unexpected_list": [], - "unexpected_index_list": []}}, + "unexpected_index_list": [] + }, + "only_for": ["pandas"] + }, + { + "title": "Positive, integer column", + "exact_match_out": false, + "in":{ + "column": "x", + "type_list": ["INTEGER"] + }, + "out":{ + "success":true + }, + "only_for": ["sqlalchemy"] + }, { "title": "Negative, Float values are not strings", "exact_match_out": false, @@ -45,7 +96,20 @@ "success": false, "unexpected_list": [1,2,4], "unexpected_index_list": [0,1,2] - } + }, + "only_for": ["pandas"] + }, + { + "title": "Negative, Float values are not TEXT", + "exact_match_out": false, + "in":{ + "column": "x", + "type_list": ["TEXT"] + }, + "out":{ + "success": false + }, + "only_for": ["sqlalchemy"] }, { "title": "Positive, Float values", @@ -58,7 +122,20 @@ "success":true, "unexpected_list":[], "unexpected_index_list":[] - } + }, + "only_for": ["pandas"] + }, + { + "title": "Positive, Float values", + "exact_match_out": false, + "in":{ + "column": "y", + "type_list": ["DOUBLE_PRECISION", "FLOAT"] + }, + "out":{ + "success":true + }, + "only_for": ["sqlalchemy"] }, { "title": "Positive, String values", @@ -71,7 +148,20 @@ "success": true, "unexpected_list": [], "unexpected_index_list": [] - } + }, + "only_for": ["pandas"] + }, + { + "title": "Positive, TEXT values", + "exact_match_out": false, + "in":{ + "column": "z", + "type_list": ["TEXT", "VARCHAR"] + }, + "out":{ + "success": true + }, + "only_for": ["sqlalchemy"] }, { "title": "Positive, boolean values", @@ -84,7 +174,20 @@ "success": true, "unexpected_list": [], "unexpected_index_list": [] - } + }, + "only_for": ["pandas"] + }, + { + "title": "Positive, boolean values", + "exact_match_out": false, + "in":{ + 
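The "Vacuously true - empty value_set" case above belongs to a convention these fixtures now encode in several places (the stdev and row-count files gain analogous cases): an expectation invoked without its constraint succeeds and simply reports what it observed, although some expectations, such as expect_column_values_to_be_between, still insist on at least one bound. A sketch of the behavior, assuming the 0.7-era PandasDataset API used elsewhere in this patch:

    import great_expectations as ge

    df = ge.dataset.PandasDataset({"x": [1, 2, 3, 4]})

    # No min_value or max_value supplied: vacuously true, yet the result
    # still carries the observed row count for reporting.
    result = df.expect_table_row_count_to_be_between()
    assert result["success"] is True
    assert result["result"]["observed_value"] == 4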
"column": "b", + "type_list": ["BOOLEAN"] + }, + "out":{ + "success": true + }, + "only_for": ["sqlalchemy"] }, { "title": "Positive, string and int values", @@ -97,7 +200,21 @@ "success": true, "unexpected_list": [], "unexpected_index_list": [] - } - }] + }, + "only_for": ["pandas"] + }, + { + "title": "Positive, TEXT and INTEGER values", + "exact_match_out": false, + "in":{ + "column": "s", + "type_list": ["TEXT", "VARCHAR", "INTEGER"] + }, + "out":{ + "success": true + }, + "only_for": ["sqlalchemy"] + } + ] }] } \ No newline at end of file diff --git a/tests/test_definitions/column_map_expectations/expect_column_values_to_be_null.json b/tests/test_definitions/column_map_expectations/expect_column_values_to_be_null.json index df9ad101cbcd..99ff5506fc40 100644 --- a/tests/test_definitions/column_map_expectations/expect_column_values_to_be_null.json +++ b/tests/test_definitions/column_map_expectations/expect_column_values_to_be_null.json @@ -11,11 +11,11 @@ }, "schemas": { "spark": { - "one_null": "int", - "two_null": "int", - "three_null": "int", - "no_null": "string", - "all_null": "string" + "one_null": "IntegerType", + "two_null": "IntegerType", + "three_null": "IntegerType", + "no_null": "StringType", + "all_null": "StringType" } }, "tests": [ diff --git a/tests/test_definitions/column_map_expectations/expect_column_values_to_be_of_type.json b/tests/test_definitions/column_map_expectations/expect_column_values_to_be_of_type.json index f039081f3d09..b24b093891dd 100644 --- a/tests/test_definitions/column_map_expectations/expect_column_values_to_be_of_type.json +++ b/tests/test_definitions/column_map_expectations/expect_column_values_to_be_of_type.json @@ -20,6 +20,42 @@ "b": "bool", "s": "float64", "s1": "int64" + }, + "postgresql": { + "x": "INTEGER", + "y": "DOUBLE_PRECISION", + "z": "TEXT", + "n": "TEXT", + "b": "BOOLEAN", + "s": "DOUBLE_PRECISION", + "s1": "BIGINT" + }, + "spark": { + "x": "IntegerType", + "y": "FloatType", + "z": "StringType", + "n": "StringType", + "b": "BooleanType", + "s": "FloatType", + "s1": "LongType" + }, + "sqlite": { + "x": "INTEGER", + "y": "FLOAT", + "z": "VARCHAR", + "n": "VARCHAR", + "b": "BOOLEAN", + "s": "FLOAT", + "s1": "INTEGER" + }, + "mysql": { + "x": "INTEGER", + "y": "FLOAT", + "z": "TEXT", + "n": "TEXT", + "b": "BOOLEAN", + "s": "FLOAT", + "s1": "BIGINT" } }, "tests": [ @@ -34,10 +70,24 @@ "success": true, "unexpected_list": [], "unexpected_index_list": [] - } + }, + "only_for": ["pandas"] }, { - "title": "Negative, Python ints are not ", + "_note": "type name here works for tested sqlalchemy types; for sqlalchemy note we do not expect unexpected_list or unexpected_index_list since there are no column_map semantics for type", + "title": "Basic non-python integer Positive Test", + "exact_match_out": false, + "in": { + "column": "x", + "type_": "INTEGER" + }, + "out": { + "success": true + }, + "only_for": ["sqlalchemy"] + }, + { + "title": "Negative, Python ints are not string", "exact_match_out": false, "in": { @@ -49,7 +99,38 @@ "success":false, "unexpected_list":[1,2,4], "unexpected_index_list":[0,1,2] - } + }, + "only_for": ["pandas"] + }, + { + "_note": "type name here works for tested sqlalchemy types; for sqlalchemy note we do not expect unexpected_list or unexpected_index_list since there are no column_map semantics for type", + "title": "Negative, sql ints are not text", + "exact_match_out": false, + "in": + { + "column":"x", + "type_":"TEXT" + }, + "out": + { + "success":false + }, + "only_for": ["postgresql", "mysql"] + }, + { + 
"_note": "type name here works for tested sqlalchemy types; for sqlalchemy note we do not expect unexpected_list or unexpected_index_list since there are no column_map semantics for type", + "title": "Negative, sqlite integer is not VARCHAR", + "exact_match_out": false, + "in": + { + "column":"x", + "type_":"VARCHAR" + }, + "out": + { + "success":false + }, + "only_for": ["sqlite"] }, { "title": "Positive, floats", @@ -64,17 +145,68 @@ "success":true, "unexpected_list":[], "unexpected_index_list":[] - } + }, + "only_for": ["pandas"] + }, + { + "_note": "type name here works for tested sqlalchemy types; for sqlalchemy note we do not expect unexpected_list or unexpected_index_list since there are no column_map semantics for type", + "title": "Positive, floats", + "exact_match_out": false, + "in": + { + "column":"y", + "type_":"DOUBLE_PRECISION" + }, + "out": + { + "success":true + }, + "only_for": ["postgres"] }, { "title": "Positive, strings", "exact_match_out": false, "in":{"column":"z","type_":"string"}, - "out":{"success":true, "unexpected_list":[], "unexpected_index_list":[]}}, + "out":{"success":true, "unexpected_list":[], "unexpected_index_list":[]}, + "only_for": ["pandas"] + }, + { + "_note": "type name here works for tested sqlalchemy types; for sqlalchemy note we do not expect unexpected_list or unexpected_index_list since there are no column_map semantics for type", + "title": "Positive, TEXT", + "exact_match_out": false, + "in":{"column":"z","type_":"TEXT"}, + "out":{"success":true}, + "only_for": ["postgresql", "mysql"] + }, + { + "_note": "type name here works for tested sqlalchemy types; for sqlalchemy note we do not expect unexpected_list or unexpected_index_list since there are no column_map semantics for type", + "title": "Positive, VARCHAR", + "exact_match_out": false, + "in":{"column":"z","type_":"VARCHAR"}, + "out":{"success":true}, + "only_for": ["sqlite"] + }, { "title": "Negative, python floats are not python bools", "exact_match_out": false, "in":{"column":"b","type_":"boolean"}, - "out":{"success":true, "unexpected_list":[], "unexpected_index_list":[]}}] - }] + "out":{ + "success":true, + "unexpected_list":[], + "unexpected_index_list":[] + }, + "only_for": ["pandas"] + }, + { + "_note": "type name here works for tested sqlalchemy types; for sqlalchemy note we do not expect unexpected_list or unexpected_index_list since there are no column_map semantics for type", + "title": "Negative, sqlalchemy floats are not BOOLEAN", + "exact_match_out": false, + "in":{"column":"b","type_":"BOOLEAN"}, + "out":{ + "success":true + }, + "only_for": ["sqlalchemy"] + } + ] + }] } \ No newline at end of file diff --git a/tests/test_definitions/column_map_expectations/expect_column_values_to_be_unique.json b/tests/test_definitions/column_map_expectations/expect_column_values_to_be_unique.json index b87ca9c66c05..0f5f6b5a1893 100644 --- a/tests/test_definitions/column_map_expectations/expect_column_values_to_be_unique.json +++ b/tests/test_definitions/column_map_expectations/expect_column_values_to_be_unique.json @@ -13,14 +13,14 @@ }, "schemas": { "spark": { - "a": "string", - "b": "string", - "c": "int", - "d": "string", - "n": "int", - "unique": "int", - "null": "null", - "mult_dup": "string" + "a": "StringType", + "b": "StringType", + "c": "IntegerType", + "d": "StringType", + "n": "IntegerType", + "unique": "IntegerType", + "null": "NullType", + "mult_dup": "StringType" } }, "tests" : [ @@ -49,7 +49,7 @@ },{ "title": "Multi-type column, contains 2 and '2'. 
Suppressed for SQLAlchemy", "exact_match_out": false, - "suppress_test_for": ["SQLAlchemy"], + "suppress_test_for": ["sqlalchemy"], "in": { "column": "c", "mostly": 0.3 diff --git a/tests/test_definitions/column_map_expectations/expect_column_values_to_match_regex.json b/tests/test_definitions/column_map_expectations/expect_column_values_to_match_regex.json index aaf0e5a03ce3..4e39c20e9a73 100644 --- a/tests/test_definitions/column_map_expectations/expect_column_values_to_match_regex.json +++ b/tests/test_definitions/column_map_expectations/expect_column_values_to_match_regex.json @@ -8,9 +8,9 @@ }, "schemas": { "spark": { - "a": "string", - "b": "string", - "c": "string" + "a": "StringType", + "b": "StringType", + "c": "StringType" } }, "tests": [{ @@ -25,7 +25,8 @@ "success":false, "unexpected_index_list": [4], "unexpected_list": ["bee"] - } + }, + "suppress_test_for": ["sqlite"] },{ "title": "Positive test, exact mostly w/ one non-matching value", "exact_match_out" : false, @@ -38,8 +39,9 @@ "success":true, "unexpected_index_list": [4], "unexpected_list": ["bee"] - } - },{ + }, + "suppress_test_for": ["sqlite"] + },{ "title": "Positive test, sufficient mostly w/ one non-matching value", "exact_match_out" : false, "in":{ @@ -51,7 +53,8 @@ "success":true, "unexpected_index_list": [4], "unexpected_list": ["bee"] - } + }, + "suppress_test_for": ["sqlite"] },{ "title": "Negative test, one missing value and insufficient mostly", "exact_match_out" : false, @@ -64,7 +67,8 @@ "success":false, "unexpected_index_list": [3], "unexpected_list": ["bdd"] - } + }, + "suppress_test_for": ["sqlite"] },{ "title": "Positive test, one missing value and exact mostly", "exact_match_out" : false, @@ -77,7 +81,8 @@ "success":true, "unexpected_index_list": [3], "unexpected_list": ["bdd"] - } + }, + "suppress_test_for": ["sqlite"] },{ "title": "Positive test, one missing value and sufficient mostly", "exact_match_out" : false, @@ -90,7 +95,8 @@ "success":true, "unexpected_index_list": [3], "unexpected_list": ["bdd"] - } + }, + "suppress_test_for": ["sqlite"] },{ "title": "Positive test. all missing values", "exact_match_out" : false, @@ -102,7 +108,8 @@ "success": true, "unexpected_index_list": [], "unexpected_list": [] - } + }, + "suppress_test_for": ["sqlite"] },{ "title": "Positive test. 
all missing values, mostly", "exact_match_out" : false, @@ -115,7 +122,8 @@ "success": true, "unexpected_index_list": [], "unexpected_list": [] - } + }, + "suppress_test_for": ["sqlite"] },{ "title": "Positive test, Empty regex", "exact_match_out" : false, @@ -127,7 +135,8 @@ "success": true, "unexpected_index_list": [], "unexpected_list": [] - } + }, + "suppress_test_for": ["sqlite"] },{ "title": "Positive test, more complicated regex pattern", "exact_match_out" : false, @@ -139,7 +148,8 @@ "success": true, "unexpected_index_list": [], "unexpected_list": [] - } + }, + "suppress_test_for": ["sqlite"] },{ "title": "Positive test, match characters not at the beginning of string", "exact_match_out" : false, @@ -152,7 +162,8 @@ "success": true, "unexpected_index_list": [0, 2, 3], "unexpected_list": ["aaa", "acc", "add"] - } + }, + "suppress_test_for": ["sqlite"] }] }] } \ No newline at end of file diff --git a/tests/test_definitions/column_map_expectations/expect_column_values_to_match_regex_list.json b/tests/test_definitions/column_map_expectations/expect_column_values_to_match_regex_list.json index c9dbc7271ada..34e52f4d7105 100644 --- a/tests/test_definitions/column_map_expectations/expect_column_values_to_match_regex_list.json +++ b/tests/test_definitions/column_map_expectations/expect_column_values_to_match_regex_list.json @@ -16,7 +16,8 @@ "unexpected_list": [], "unexpected_index_list": [], "success": true - } + }, + "suppress_test_for": ["sqlite"] },{ "title" : "Positive test with multiple regexes", "exact_match_out" : false, @@ -29,7 +30,8 @@ "unexpected_list": [], "unexpected_index_list": [], "success": true - } + }, + "suppress_test_for": ["sqlite"] },{ "title" : "Basic negative test", "exact_match_out" : false, @@ -42,7 +44,8 @@ "unexpected_list": ["111", "222", "333", "123", "321", "444", "456", "654", "555"], "unexpected_index_list": [0,1,2,3,4,5,6,7,8], "success": false - } + }, + "suppress_test_for": ["sqlite"] },{ "title" : "Negative test with more string-ish strings", "exact_match_out" : false, @@ -54,7 +57,8 @@ "unexpected_list": ["bit", "bot", "but", "bet"], "unexpected_index_list": [6,7,8,9], "success": false - } + }, + "suppress_test_for": ["sqlite"] },{ "title" : "Positive test with match_on=any", "exact_match_out" : false, @@ -67,7 +71,8 @@ "unexpected_list": [], "unexpected_index_list": [], "success": true - } + }, + "suppress_test_for": ["sqlite"] }] }] } \ No newline at end of file diff --git a/tests/test_definitions/column_map_expectations/expect_column_values_to_match_strftime_format.json b/tests/test_definitions/column_map_expectations/expect_column_values_to_match_strftime_format.json index 0541e8f70fc5..65a1616a19bb 100644 --- a/tests/test_definitions/column_map_expectations/expect_column_values_to_match_strftime_format.json +++ b/tests/test_definitions/column_map_expectations/expect_column_values_to_match_strftime_format.json @@ -11,12 +11,12 @@ }, "schemas": { "spark": { - "a": "string", - "b": "string", - "c": "string", - "d": "int", - "e": "timestamp", - "f": "string" + "a": "StringType", + "b": "StringType", + "c": "StringType", + "d": "IntegerType", + "e": "TimestampType", + "f": "StringType" } }, "tests": [{ @@ -131,7 +131,7 @@ { "title": "Negative test - input already datetimes", "exact_match_out" : false, - "suppress_test_for": ["Pandas"], + "suppress_test_for": ["pandas"], "in":{ "column": "e", "strftime_format": "%Y-%m-%d", diff --git a/tests/test_definitions/column_map_expectations/expect_column_values_to_not_be_null.json 
b/tests/test_definitions/column_map_expectations/expect_column_values_to_not_be_null.json index 34f1b705c7ea..e9e20f597afd 100644 --- a/tests/test_definitions/column_map_expectations/expect_column_values_to_not_be_null.json +++ b/tests/test_definitions/column_map_expectations/expect_column_values_to_not_be_null.json @@ -11,11 +11,11 @@ }, "schemas": { "spark": { - "one_null": "int", - "two_null": "int", - "three_null": "int", - "no_null": "string", - "all_null": "string" + "one_null": "IntegerType", + "two_null": "IntegerType", + "three_null": "IntegerType", + "no_null": "StringType", + "all_null": "StringType" } }, "tests": [ diff --git a/tests/test_definitions/column_map_expectations/expect_column_values_to_not_match_regex.json b/tests/test_definitions/column_map_expectations/expect_column_values_to_not_match_regex.json index c302811928ea..ce1336b91a7f 100644 --- a/tests/test_definitions/column_map_expectations/expect_column_values_to_not_match_regex.json +++ b/tests/test_definitions/column_map_expectations/expect_column_values_to_not_match_regex.json @@ -8,9 +8,9 @@ }, "schemas": { "spark": { - "a": "string", - "b": "string", - "c": "string" + "a": "StringType", + "b": "StringType", + "c": "StringType" } }, "tests": [{ @@ -25,7 +25,8 @@ "success":false, "unexpected_index_list": [0, 1, 2, 3], "unexpected_list": ["aaa", "abb", "acc", "add"] - } + }, + "suppress_test_for": ["sqlite"] },{ "title": "Positive test, exact mostly w/ one non-matching value", "exact_match_out" : false, @@ -38,7 +39,8 @@ "success":true, "unexpected_index_list": [0, 1, 2, 3], "unexpected_list": ["aaa", "abb", "acc", "add"] - } + }, + "suppress_test_for": ["sqlite"] },{ "title": "Positive test, sufficient mostly w/ one non-matching value", "exact_match_out" : false, @@ -51,7 +53,8 @@ "success":true, "unexpected_index_list": [0, 1, 2, 3], "unexpected_list": ["aaa", "abb", "acc", "add"] - } + }, + "suppress_test_for": ["sqlite"] },{ "title": "Negative test, one missing value and insufficient mostly", "exact_match_out" : false, @@ -64,7 +67,8 @@ "success":false, "unexpected_index_list": [0, 1, 2], "unexpected_list": ["aaa", "abb", "acc"] - } + }, + "suppress_test_for": ["sqlite"] },{ "title": "Positive test, one missing value, no exceptions", "exact_match_out" : false, @@ -76,7 +80,8 @@ "success":true, "unexpected_index_list": [], "unexpected_list": [] - } + }, + "suppress_test_for": ["sqlite"] },{ "title": "Positive test. all missing values", "exact_match_out" : false, @@ -88,7 +93,8 @@ "success": true, "unexpected_index_list": [], "unexpected_list": [] - } + }, + "suppress_test_for": ["sqlite"] },{ "title": "Positive test. 
all missing values, mostly", "exact_match_out" : false, @@ -101,7 +107,8 @@ "success": true, "unexpected_index_list": [], "unexpected_list": [] - } + }, + "suppress_test_for": ["sqlite"] },{ "title": "Negative test, Empty regex", "exact_match_out" : false, @@ -113,7 +120,8 @@ "success": false, "unexpected_index_list": [0, 1, 2, 3], "unexpected_list": ["aaa", "abb", "acc", "bdd"] - } + }, + "suppress_test_for": ["sqlite"] },{ "title": "Negative test, match characters not at the beginning of string, exact mostly", "exact_match_out" : false, @@ -126,7 +134,8 @@ "success": true, "unexpected_index_list": [1, 4], "unexpected_list": ["abb", "bee"] - } + }, + "suppress_test_for": ["sqlite"] }] }] } \ No newline at end of file diff --git a/tests/test_definitions/column_map_expectations/expect_column_values_to_not_match_regex_list.json b/tests/test_definitions/column_map_expectations/expect_column_values_to_not_match_regex_list.json index 19c4b8cc749d..3cd22b68a6d0 100644 --- a/tests/test_definitions/column_map_expectations/expect_column_values_to_not_match_regex_list.json +++ b/tests/test_definitions/column_map_expectations/expect_column_values_to_not_match_regex_list.json @@ -16,7 +16,8 @@ "unexpected_list": [], "unexpected_index_list": [], "success": true - } + }, + "suppress_test_for": ["sqlite"] },{ "title" : "Positive test with multiple regexes", "exact_match_out" : false, @@ -28,7 +29,8 @@ "unexpected_list": [], "unexpected_index_list": [], "success": true - } + }, + "suppress_test_for": ["sqlite"] },{ "title" : "Basic negative test", "exact_match_out" : false, @@ -40,7 +42,8 @@ "unexpected_list": ["111", "222", "123", "321", "444", "456", "654", "555"], "unexpected_index_list": [0,1,3,4,5,6,7,8], "success": false - } + }, + "suppress_test_for": ["sqlite"] },{ "title" : "Negative test with more string-ish strings", "exact_match_out" : false, @@ -52,7 +55,8 @@ "unexpected_list": ["hat"], "unexpected_index_list": [4], "success": false - } + }, + "suppress_test_for": ["sqlite"] }] }] } \ No newline at end of file diff --git a/tests/test_definitions/other_expectations/expect_column_to_exist.json b/tests/test_definitions/other_expectations/expect_column_to_exist.json index b30618d4b93d..e56be869946a 100644 --- a/tests/test_definitions/other_expectations/expect_column_to_exist.json +++ b/tests/test_definitions/other_expectations/expect_column_to_exist.json @@ -8,9 +8,9 @@ }, "schemas": { "spark": { - "c1": "int", - "c2": "string", - "c3": "string" + "c1": "IntegerType", + "c2": "StringType", + "c3": "StringType" } }, "tests": [{ diff --git a/tests/test_definitions/other_expectations/expect_table_columns_to_match_ordered_list_test_set.json b/tests/test_definitions/other_expectations/expect_table_columns_to_match_ordered_list_test_set.json index 831cf2fdd174..26d9c28b547c 100644 --- a/tests/test_definitions/other_expectations/expect_table_columns_to_match_ordered_list_test_set.json +++ b/tests/test_definitions/other_expectations/expect_table_columns_to_match_ordered_list_test_set.json @@ -9,9 +9,9 @@ }, "schemas": { "spark": { - "c1": "int", - "c2": "string", - "c3": "string" + "c1": "IntegerType", + "c2": "StringType", + "c3": "StringType" } }, "tests": [{ @@ -59,6 +59,17 @@ "out":{ "success": false } - }] + },{ + "title": "Null list provides vacuously true expectation", + "exact_match_out": false, + "in":{ + "column_list": null + }, + "out":{ + "success": true, + "observed_value": ["c1", "c2", "c3"] + } + } + ] }] } \ No newline at end of file diff --git 
a/tests/test_definitions/other_expectations/expect_table_row_count_to_be_between.json b/tests/test_definitions/other_expectations/expect_table_row_count_to_be_between.json index eb15f6f74d0b..b5ecb569415d 100644 --- a/tests/test_definitions/other_expectations/expect_table_row_count_to_be_between.json +++ b/tests/test_definitions/other_expectations/expect_table_row_count_to_be_between.json @@ -8,9 +8,9 @@ }, "schemas": { "spark": { - "c1": "int", - "c2": "string", - "c3": "string" + "c1": "IntegerType", + "c2": "StringType", + "c3": "StringType" } }, "tests": [{ @@ -24,7 +24,18 @@ "success":true, "observed_value": 4 } - },{ + }, + { + "title": "Vacuously true", + "exact_match_out" : false, + "in":{ + }, + "out":{ + "success":true, + "observed_value": 4 + } + }, + { "title": "Basic negative test", "exact_match_out" : false, "in":{ diff --git a/tests/test_definitions/other_expectations/expect_table_row_count_to_equal.json b/tests/test_definitions/other_expectations/expect_table_row_count_to_equal.json index d8fd45eaa243..71f0c1153f3a 100644 --- a/tests/test_definitions/other_expectations/expect_table_row_count_to_equal.json +++ b/tests/test_definitions/other_expectations/expect_table_row_count_to_equal.json @@ -24,9 +24,9 @@ }, "schemas": { "spark": { - "c1": "int", - "c2": "string", - "c3": "string" + "c1": "IntegerType", + "c2": "StringType", + "c3": "StringType" } }, "tests": [ diff --git a/tests/test_definitions/other_expectations/expect_this_test_to_be_suppressed.json b/tests/test_definitions/other_expectations/expect_this_test_to_be_suppressed.json index ba69f87cc90e..cfaae8261d01 100644 --- a/tests/test_definitions/other_expectations/expect_this_test_to_be_suppressed.json +++ b/tests/test_definitions/other_expectations/expect_this_test_to_be_suppressed.json @@ -7,7 +7,7 @@ [{ "title": "This test should never run!", "exact_match_out": false, - "suppress_test_for": ["Pandas", "SQLAlchemy", "Spark"], + "suppress_test_for": ["pandas", "sqlalchemy", "spark"], "in": ["a"], "kwargs": {}, "out": { diff --git a/tests/test_definitions/test_expectations.py b/tests/test_definitions/test_expectations.py index 63296f717503..2c8fdcde0c84 100644 --- a/tests/test_definitions/test_expectations.py +++ b/tests/test_definitions/test_expectations.py @@ -8,12 +8,15 @@ from sqlalchemy.dialects.sqlite import dialect as sqliteDialect from sqlalchemy.dialects.postgresql import dialect as postgresqlDialect +from sqlalchemy.dialects.mysql import dialect as mysqlDialect from great_expectations.dataset import SqlAlchemyDataset, PandasDataset, SparkDFDataset -from ..test_utils import CONTEXTS, get_dataset, candidate_test_is_on_temporary_notimplemented_list, evaluate_json_test +from ..conftest import CONTEXTS +from ..test_utils import get_dataset, candidate_test_is_on_temporary_notimplemented_list, evaluate_json_test logger = logging.getLogger(__name__) + def pytest_generate_tests(metafunc): # Load all the JSON files in the directory @@ -33,40 +36,81 @@ def pytest_generate_tests(metafunc): test_configuration = json.load(file, object_pairs_hook=OrderedDict) for d in test_configuration['datasets']: - skip = False + skip_expectation = False # Pass the test if we are in a test condition that is a known exception if candidate_test_is_on_temporary_notimplemented_list(c, test_configuration["expectation_type"]): - skip = True + skip_expectation = True - if skip: + if skip_expectation: schemas = data_asset = None else: schemas = d["schemas"] if "schemas" in d else None data_asset = get_dataset(c, d["data"], schemas=schemas) 
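The pytest_generate_tests hunk that follows is the machinery behind every only_for and suppress_test_for key in these fixtures: only_for decides whether a case is generated at all, suppress_test_for skips a generated case, and both accept backend names (pandas, spark, sqlalchemy) as well as SQL dialect names (sqlite, postgresql, mysql). Stripped of the SQLAlchemy dialect introspection, the decision rule reduces to the following simplified sketch; the function name and arguments are illustrative, not the module's API:

    def should_run(test, backend, dialect=None):
        # backend is "pandas", "spark", or "sqlalchemy"; dialect e.g. "sqlite".
        markers = {backend, dialect} - {None}
        only_for = test.get("only_for")
        if only_for is not None and not markers & set(only_for):
            return False  # never even generated for unlisted backends
        # Suppression is checked after generation and overrides only_for.
        return not markers & set(test.get("suppress_test_for", []))

    assert should_run({"only_for": ["sqlalchemy"]}, "sqlalchemy", "postgresql")
    assert not should_run({"only_for": ["pandas"]}, "sqlalchemy", "sqlite")
    assert not should_run({"suppress_test_for": ["sqlite"]}, "sqlalchemy", "sqlite")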
for test in d["tests"]: + generate_test = True + skip_test = False + if 'only_for' in test: + # if we're not on the "only_for" list, then never even generate the test + generate_test = False + if not isinstance(test["only_for"], list): + raise ValueError("Invalid test specification.") + + if isinstance(data_asset, SqlAlchemyDataset): + # Call out supported dialects + if "sqlalchemy" in test["only_for"]: + generate_test = True + elif ("sqlite" in test["only_for"] and + isinstance(data_asset.engine.dialect, sqliteDialect)): + generate_test = True + elif ("postgresql" in test["only_for"] and + isinstance(data_asset.engine.dialect, postgresqlDialect)): + generate_test = True + elif ("mysql" in test["only_for"] and + isinstance(data_asset.engine.dialect, mysqlDialect)): + generate_test = True + elif isinstance(data_asset, PandasDataset): + if "pandas" in test["only_for"]: + generate_test = True + elif isinstance(data_asset, SparkDFDataset): + if "spark" in test["only_for"]: + generate_test = True + + if not generate_test: + continue + if 'suppress_test_for' in test and ( - 'SQLAlchemy' in test['suppress_test_for'] and isinstance(data_asset, SqlAlchemyDataset) - or 'sqlite' in test['suppress_test_for'] and isinstance(data_asset, SqlAlchemyDataset) and isinstance(data_asset.engine.dialect, sqliteDialect) - or 'postgresql' in test['suppress_test_for'] and isinstance(data_asset, SqlAlchemyDataset) and isinstance(data_asset.engine.dialect, postgresqlDialect) - or 'Pandas' in test['suppress_test_for'] and isinstance(data_asset, PandasDataset) - or 'Spark' in test['suppress_test_for'] and isinstance(data_asset, SparkDFDataset) + ('sqlalchemy' in test['suppress_test_for'] and + isinstance(data_asset, SqlAlchemyDataset)) or + ('sqlite' in test['suppress_test_for'] and + isinstance(data_asset, SqlAlchemyDataset) and + isinstance(data_asset.engine.dialect, sqliteDialect)) or + ('postgresql' in test['suppress_test_for'] and + isinstance(data_asset, SqlAlchemyDataset) and + isinstance(data_asset.engine.dialect, postgresqlDialect)) or + ('mysql' in test['suppress_test_for'] and + isinstance(data_asset, SqlAlchemyDataset) and + isinstance(data_asset.engine.dialect, mysqlDialect)) or + ('pandas' in test['suppress_test_for'] and + isinstance(data_asset, PandasDataset)) or + ('spark' in test['suppress_test_for'] and + isinstance(data_asset, SparkDFDataset)) ): - skip = True + skip_test = True # Known condition: SqlAlchemy does not support allow_cross_type_comparisons if 'allow_cross_type_comparisons' in test['in'] and isinstance(data_asset, SqlAlchemyDataset): - skip = True + skip_test = True parametrized_tests.append({ "expectation_type": test_configuration["expectation_type"], "dataset": data_asset, "test": test, - "skip": skip, + "skip": skip_expectation or skip_test, }) - ids.append(expectation_category + "/" + - c+":"+test_configuration["expectation_type"]+":"+test["title"]) - + ids.append(c + "/" + expectation_category + "/" + + test_configuration["expectation_type"] + ":" + test["title"]) + metafunc.parametrize( "test_case", parametrized_tests, diff --git a/tests/test_expectation_decorators.py b/tests/test_expectation_decorators.py index 5055fd7b996b..190627990e15 100644 --- a/tests/test_expectation_decorators.py +++ b/tests/test_expectation_decorators.py @@ -29,7 +29,7 @@ def test_expectation_decorator_build_config(self): eds.no_op_expectation() eds.no_op_value_expectation('a') - config = eds.get_expectations_config() + config = eds.get_expectation_suite() self.assertEqual({'expectation_type': 
'no_op_expectation', 'kwargs': {}}, config['expectations'][0]) @@ -49,7 +49,7 @@ def test_expectation_decorator_meta(self): metadata = {'meta_key': 'meta_value'} eds = ExpectationOnlyDataAsset() out = eds.no_op_value_expectation('a', meta=metadata) - config = eds.get_expectations_config() + config = eds.get_expectation_suite() self.assertEqual({'success': True, 'meta': metadata}, diff --git a/tests/test_filedata_asset.py b/tests/test_filedata_asset.py index 5f315823901a..71945e0e982a 100644 --- a/tests/test_filedata_asset.py +++ b/tests/test_filedata_asset.py @@ -3,26 +3,28 @@ import warnings import pytest import great_expectations as ge -import great_expectations.dataset.autoinspect as autoinspect from .test_utils import assertDeepAlmostEqual def test_autoinspect_filedata_asset(): - #Expect a warning to be raised since a file object doesn't have a columns attribute + #Expect an error to be raised since a file object doesn't have a columns attribute warnings.simplefilter('always', UserWarning) file_path = './tests/test_sets/toy_data_complete.csv' my_file_data = ge.data_asset.FileDataAsset(file_path) - with pytest.raises(UserWarning): - with warnings.catch_warnings(record=True): - warnings.simplefilter("error") - try: - my_file_data.autoinspect(autoinspect.columns_exist) - except: - raise + with pytest.raises(ge.exceptions.GreatExpectationsError) as exc: + my_file_data.profile(ge.profile.ColumnsExistProfiler) + assert "Invalid data_asset for profiler; aborting" in exc.message + # with warnings.catch_warnings(record=True): + # warnings.simplefilter("error") + # try: + # my_file_data.profile(ge.profile.ColumnsExistProfiler) + # except: + # raise -def test_expectation_config_filedata_asset(): + +def test_expectation_suite_filedata_asset(): # Load in data files file_path = './tests/test_sets/toy_data_complete.csv' @@ -41,7 +43,7 @@ def test_expectation_config_filedata_asset(): include_config=True) # Test basic config output - complete_config = f_dat.get_expectations_config() + complete_config = f_dat.get_expectation_suite() expected_config_expectations = [{'expectation_type':'expect_file_line_regex_match_count_to_equal', 'kwargs': {'expected_count': 3, 'regex': ',\\S', @@ -49,8 +51,8 @@ def test_expectation_config_filedata_asset(): assertDeepAlmostEqual(complete_config["expectations"], expected_config_expectations) # Include result format kwargs - complete_config2 = f_dat.get_expectations_config(discard_result_format_kwargs=False, - discard_failed_expectations=False) + complete_config2 = f_dat.get_expectation_suite(discard_result_format_kwargs=False, + discard_failed_expectations=False) expected_config_expectations2 = [{'expectation_type': 'expect_file_line_regex_match_count_to_equal', 'kwargs': {'expected_count': 3, 'regex': ',\\S', @@ -65,8 +67,8 @@ def test_expectation_config_filedata_asset(): assertDeepAlmostEqual(complete_config2["expectations"], expected_config_expectations2) # Discard Failing Expectations - complete_config3 = f_dat.get_expectations_config(discard_result_format_kwargs=False, - discard_failed_expectations=True) + complete_config3 = f_dat.get_expectation_suite(discard_result_format_kwargs=False, + discard_failed_expectations=True) expected_config_expectations3 = [{'expectation_type': 'expect_file_line_regex_match_count_to_equal', 'kwargs': {'expected_count': 3, diff --git a/tests/test_fixtures/expectation_suites/parameterized_expectation_suite_fixture.json b/tests/test_fixtures/expectation_suites/parameterized_expectation_suite_fixture.json new file mode 100644 index 
000000000000..d53dfd6e3ebd --- /dev/null +++ b/tests/test_fixtures/expectation_suites/parameterized_expectation_suite_fixture.json @@ -0,0 +1,25 @@ +{ + "data_asset_name": "mydataset/mygenerator/parameterized_expectation_suite_fixture/default", + "expectation_suite_name": "default", + "data_asset_type": "Dataset", + "meta": { + }, + "expectations": [ + { + "expectation_type": "expect_table_row_count_to_equal", + "kwargs": { + "value": { + "$PARAMETER": "urn:great_expectations:validations:source_diabetes_data:expectations:expect_column_unique_value_count_to_be_between:columns:patient_nbr:result:observed_value" + } + } + }, + { + "expectation_type": "expect_column_unique_value_count_to_be_between", + "kwargs": { + "value": { + "$PARAMETER": "urn:great_expectations:validations:source_patient_data:expectations:expect_table_row_count_to_equal:result:observed_value" + } + } + } + ] +} \ No newline at end of file diff --git a/tests/test_fixtures/great_expectations_basic.yml b/tests/test_fixtures/great_expectations_basic.yml new file mode 100644 index 000000000000..35fa3baa62f5 --- /dev/null +++ b/tests/test_fixtures/great_expectations_basic.yml @@ -0,0 +1,11 @@ +# This is a basic configuration for testing. +# It has comments that should be preserved. +datasources: + # For example, this one. + mydatasource: + type: pandas + generators: + # The name default is read if no datasource or generator is specified + mygenerator: + type: subdir_reader + base_directory: ../data diff --git a/tests/test_fixtures/great_expectations_titanic.yml b/tests/test_fixtures/great_expectations_titanic.yml new file mode 100644 index 000000000000..03b38dcc279c --- /dev/null +++ b/tests/test_fixtures/great_expectations_titanic.yml @@ -0,0 +1,15 @@ +# This is a basic configuration for testing. +# It has comments that should be preserved. +datasources: + # For example, this one. 
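The parameterized_expectation_suite_fixture.json added above exercises the new $PARAMETER mechanism: instead of a literal kwarg, an expectation carries a URN naming a metric from another suite's validation (suite name, expectation type, optional column, and result field, joined by colons). Once the upstream validation has produced that metric, resolution can be a plain lookup keyed by the URN. The helper below is purely illustrative, not the library's API, and the observed count is a made-up stand-in:

    def resolve_parameters(kwargs, known_metrics):
        # Illustrative only: swap {"$PARAMETER": urn} placeholders for
        # previously observed metric values keyed by their URN.
        return {
            key: known_metrics[value["$PARAMETER"]]
            if isinstance(value, dict) and "$PARAMETER" in value
            else value
            for key, value in kwargs.items()
        }

    urn = ("urn:great_expectations:validations:source_patient_data:"
           "expectations:expect_table_row_count_to_equal:result:observed_value")
    print(resolve_parameters({"value": {"$PARAMETER": urn}}, {urn: 12345}))
    # {'value': 12345}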
+ mydatasource: + type: pandas + generators: + # The name default is read if no datasource or generator is specified + mygenerator: + type: subdir_reader + base_directory: ../data + +result_store: + filesystem: + base_directory: uncommitted/validations diff --git a/tests/test_fixtures/rendering_fixtures/evr_suite_1.json b/tests/test_fixtures/rendering_fixtures/evr_suite_1.json new file mode 100644 index 000000000000..5ad4d6c7cbbc --- /dev/null +++ b/tests/test_fixtures/rendering_fixtures/evr_suite_1.json @@ -0,0 +1,103 @@ +{ + "results": [ + { + "success": true, + "exception_info": { + "raised_exception": false, + "exception_message": null, + "exception_traceback": null + }, + "expectation_config": { + "expectation_type": "expect_column_to_exist", + "kwargs": { + "column": "x_var" + } + } + }, + { + "success": true, + "exception_info": { + "raised_exception": false, + "exception_message": null, + "exception_traceback": null + }, + "expectation_config": { + "expectation_type": "expect_column_to_exist", + "kwargs": { + "column": "y_var" + } + } + }, + { + "success": true, + "result": { + "element_count": 5, + "missing_count": 0, + "missing_percent": 0.0, + "unexpected_count": 0, + "unexpected_percent": 0.0, + "unexpected_percent_nonmissing": 0.0, + "partial_unexpected_list": [] + }, + "exception_info": { + "raised_exception": false, + "exception_message": null, + "exception_traceback": null + }, + "expectation_config": { + "expectation_type": "expect_column_values_to_not_be_null", + "kwargs": { + "column": "x_var" + } + } + }, + { + "success": true, + "result": { + "element_count": 5, + "missing_count": 0, + "missing_percent": 0.0, + "unexpected_count": 0, + "unexpected_percent": 0.0, + "unexpected_percent_nonmissing": 0.0, + "partial_unexpected_list": [] + }, + "exception_info": { + "raised_exception": false, + "exception_message": null, + "exception_traceback": null + }, + "expectation_config": { + "expectation_type": "expect_column_values_to_be_of_type", + "kwargs": { + "column": "x_var", + "type_": "int", + "target_datasource": "python" + } + } + }, + { + "success": true, + "result": { + "element_count": 5, + "missing_count": 0, + "missing_percent": 0.0, + "unexpected_count": 0, + "unexpected_percent": 0.0, + "unexpected_percent_nonmissing": 0.0, + "partial_unexpected_list": [] + }, + "exception_info": { + "raised_exception": false, + "exception_message": null, + "exception_traceback": null + }, + "expectation_config": { + "expectation_type": "expect_column_values_to_not_be_null", + "kwargs": { + "column": "y_var" + } + } + } + ] +} \ No newline at end of file diff --git a/tests/test_fixtures/rendering_fixtures/evr_suite_3.json b/tests/test_fixtures/rendering_fixtures/evr_suite_3.json new file mode 100644 index 000000000000..7bb2c9798382 --- /dev/null +++ b/tests/test_fixtures/rendering_fixtures/evr_suite_3.json @@ -0,0 +1,1883 @@ +{ + "results": [ + { + "success": true, + "exception_info": { + "raised_exception": false, + "exception_message": null, + "exception_traceback": null + }, + "expectation_config": { + "expectation_type": "expect_column_to_exist", + "kwargs": { + "column": "policyID", + "result_format": "BASIC" + } + } + }, + { + "success": true, + "exception_info": { + "raised_exception": false, + "exception_message": null, + "exception_traceback": null + }, + "expectation_config": { + "expectation_type": "expect_column_to_exist", + "kwargs": { + "column": "statecode", + "result_format": "BASIC" + } + } + }, + { + "success": true, + "exception_info": { + "raised_exception": 
false, + "exception_message": null, + "exception_traceback": null + }, + "expectation_config": { + "expectation_type": "expect_column_to_exist", + "kwargs": { + "column": "county", + "result_format": "BASIC" + } + } + }, + { + "success": true, + "exception_info": { + "raised_exception": false, + "exception_message": null, + "exception_traceback": null + }, + "expectation_config": { + "expectation_type": "expect_column_to_exist", + "kwargs": { + "column": "eq_site_limit", + "result_format": "BASIC" + } + } + }, + { + "success": true, + "exception_info": { + "raised_exception": false, + "exception_message": null, + "exception_traceback": null + }, + "expectation_config": { + "expectation_type": "expect_column_to_exist", + "kwargs": { + "column": "hu_site_limit", + "result_format": "BASIC" + } + } + }, + { + "success": true, + "exception_info": { + "raised_exception": false, + "exception_message": null, + "exception_traceback": null + }, + "expectation_config": { + "expectation_type": "expect_column_to_exist", + "kwargs": { + "column": "fl_site_limit", + "result_format": "BASIC" + } + } + }, + { + "success": true, + "exception_info": { + "raised_exception": false, + "exception_message": null, + "exception_traceback": null + }, + "expectation_config": { + "expectation_type": "expect_column_to_exist", + "kwargs": { + "column": "fr_site_limit", + "result_format": "BASIC" + } + } + }, + { + "success": true, + "exception_info": { + "raised_exception": false, + "exception_message": null, + "exception_traceback": null + }, + "expectation_config": { + "expectation_type": "expect_column_to_exist", + "kwargs": { + "column": "tiv_2011", + "result_format": "BASIC" + } + } + }, + { + "success": true, + "exception_info": { + "raised_exception": false, + "exception_message": null, + "exception_traceback": null + }, + "expectation_config": { + "expectation_type": "expect_column_to_exist", + "kwargs": { + "column": "tiv_2012", + "result_format": "BASIC" + } + } + }, + { + "success": true, + "exception_info": { + "raised_exception": false, + "exception_message": null, + "exception_traceback": null + }, + "expectation_config": { + "expectation_type": "expect_column_to_exist", + "kwargs": { + "column": "eq_site_deductible", + "result_format": "BASIC" + } + } + }, + { + "success": true, + "exception_info": { + "raised_exception": false, + "exception_message": null, + "exception_traceback": null + }, + "expectation_config": { + "expectation_type": "expect_column_to_exist", + "kwargs": { + "column": "hu_site_deductible", + "result_format": "BASIC" + } + } + }, + { + "success": true, + "exception_info": { + "raised_exception": false, + "exception_message": null, + "exception_traceback": null + }, + "expectation_config": { + "expectation_type": "expect_column_to_exist", + "kwargs": { + "column": "fl_site_deductible", + "result_format": "BASIC" + } + } + }, + { + "success": true, + "exception_info": { + "raised_exception": false, + "exception_message": null, + "exception_traceback": null + }, + "expectation_config": { + "expectation_type": "expect_column_to_exist", + "kwargs": { + "column": "fr_site_deductible", + "result_format": "BASIC" + } + } + }, + { + "success": true, + "exception_info": { + "raised_exception": false, + "exception_message": null, + "exception_traceback": null + }, + "expectation_config": { + "expectation_type": "expect_column_to_exist", + "kwargs": { + "column": "point_latitude", + "result_format": "BASIC" + } + } + }, + { + "success": true, + "exception_info": { + "raised_exception": false, + 
"exception_message": null, + "exception_traceback": null + }, + "expectation_config": { + "expectation_type": "expect_column_to_exist", + "kwargs": { + "column": "point_longitude", + "result_format": "BASIC" + } + } + }, + { + "success": true, + "exception_info": { + "raised_exception": false, + "exception_message": null, + "exception_traceback": null + }, + "expectation_config": { + "expectation_type": "expect_column_to_exist", + "kwargs": { + "column": "line", + "result_format": "BASIC" + } + } + }, + { + "success": true, + "exception_info": { + "raised_exception": false, + "exception_message": null, + "exception_traceback": null + }, + "expectation_config": { + "expectation_type": "expect_column_to_exist", + "kwargs": { + "column": "construction", + "result_format": "BASIC" + } + } + }, + { + "success": true, + "exception_info": { + "raised_exception": false, + "exception_message": null, + "exception_traceback": null + }, + "expectation_config": { + "expectation_type": "expect_column_to_exist", + "kwargs": { + "column": "point_granularity", + "result_format": "BASIC" + } + } + }, + { + "success": false, + "result": { + "observed_value": 2160000000.0, + "element_count": 36634, + "missing_count": 0, + "missing_percent": 0.0 + }, + "exception_info": { + "raised_exception": false, + "exception_message": null, + "exception_traceback": null + }, + "expectation_config": { + "expectation_type": "expect_column_max_to_be_between", + "kwargs": { + "column": "hu_site_limit", + "min_value": 0, + "max_value": 0, + "result_format": "BASIC" + } + } + }, + { + "success": true, + "result": { + "element_count": 36634, + "missing_count": 0, + "missing_percent": 0.0, + "unexpected_count": 554, + "unexpected_percent": 0.015122563738603483, + "unexpected_percent_nonmissing": 0.015122563738603483, + "partial_unexpected_list": [ + 0.0, + 302400000.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + "exception_info": { + "raised_exception": false, + "exception_message": null, + "exception_traceback": null + }, + "expectation_config": { + "expectation_type": "expect_column_values_to_be_between", + "kwargs": { + "column": "hu_site_limit", + "min_value": 100, + "max_value": 216000000.0, + "mostly": 0.9, + "result_format": "BASIC" + } + } + }, + { + "success": true, + "result": { + "element_count": 36634, + "missing_count": 0, + "missing_percent": 0.0, + "unexpected_count": 0, + "unexpected_percent": 0.0, + "unexpected_percent_nonmissing": 0.0, + "partial_unexpected_list": [] + }, + "exception_info": { + "raised_exception": false, + "exception_message": null, + "exception_traceback": null + }, + "expectation_config": { + "expectation_type": "expect_column_values_to_be_of_type", + "kwargs": { + "column": "policyID", + "type_": "int", + "result_format": "BASIC" + } + } + }, + { + "success": true, + "result": { + "observed_value": 36634, + "element_count": 36634, + "missing_count": 0, + "missing_percent": 0.0 + }, + "exception_info": { + "raised_exception": false, + "exception_message": null, + "exception_traceback": null + }, + "expectation_config": { + "expectation_type": "expect_column_unique_value_count_to_be_between", + "kwargs": { + "column": "policyID", + "min_value": 0, + "max_value": null, + "result_format": "BASIC" + } + } + }, + { + "success": true, + "result": { + "observed_value": 1.0, + "element_count": 36634, + "missing_count": 0, + "missing_percent": 0.0 + }, + "exception_info": { + "raised_exception": false, + 
"exception_message": null, + "exception_traceback": null + }, + "expectation_config": { + "expectation_type": "expect_column_proportion_of_unique_values_to_be_between", + "kwargs": { + "column": "policyID", + "min_value": 0, + "max_value": null, + "result_format": "BASIC" + } + } + }, + { + "success": true, + "result": { + "element_count": 36634, + "missing_count": 0, + "missing_percent": 0.0, + "unexpected_count": 0, + "unexpected_percent": 0.0, + "unexpected_percent_nonmissing": 0.0, + "partial_unexpected_list": [] + }, + "exception_info": { + "raised_exception": false, + "exception_message": null, + "exception_traceback": null + }, + "expectation_config": { + "expectation_type": "expect_column_values_to_be_unique", + "kwargs": { + "column": "policyID", + "result_format": "BASIC" + } + } + }, + { + "success": false, + "result": { + "element_count": 36634, + "missing_count": 0, + "missing_percent": 0.0, + "unexpected_count": 18228, + "unexpected_percent": 0.497570562865098, + "unexpected_percent_nonmissing": 0.497570562865098, + "partial_unexpected_list": [ + 206893, + 172534, + 223488, + 142071, + 422834, + 580146, + 456149, + 353022, + 934215, + 385951, + 633663, + 105851, + 703001, + 352792, + 294022, + 491831, + 737515, + 222653, + 691681, + 368807 + ] + }, + "exception_info": { + "raised_exception": false, + "exception_message": null, + "exception_traceback": null + }, + "expectation_config": { + "expectation_type": "expect_column_values_to_be_increasing", + "kwargs": { + "column": "policyID", + "result_format": "BASIC" + } + } + }, + { + "success": true, + "result": { + "element_count": 36634, + "missing_count": 0, + "missing_percent": 0.0, + "unexpected_count": 0, + "unexpected_percent": 0.0, + "unexpected_percent_nonmissing": 0.0, + "partial_unexpected_list": [] + }, + "exception_info": { + "raised_exception": false, + "exception_message": null, + "exception_traceback": null + }, + "expectation_config": { + "expectation_type": "expect_column_values_to_be_of_type", + "kwargs": { + "column": "statecode", + "type_": "string", + "result_format": "BASIC" + } + } + }, + { + "success": true, + "result": { + "observed_value": 1, + "element_count": 36634, + "missing_count": 0, + "missing_percent": 0.0 + }, + "exception_info": { + "raised_exception": false, + "exception_message": null, + "exception_traceback": null + }, + "expectation_config": { + "expectation_type": "expect_column_unique_value_count_to_be_between", + "kwargs": { + "column": "statecode", + "min_value": 0, + "max_value": null, + "result_format": "BASIC" + } + } + }, + { + "success": true, + "result": { + "observed_value": 2.7297046459573073e-05, + "element_count": 36634, + "missing_count": 0, + "missing_percent": 0.0 + }, + "exception_info": { + "raised_exception": false, + "exception_message": null, + "exception_traceback": null + }, + "expectation_config": { + "expectation_type": "expect_column_proportion_of_unique_values_to_be_between", + "kwargs": { + "column": "statecode", + "min_value": 0, + "max_value": null, + "result_format": "BASIC" + } + } + }, + { + "success": false, + "result": { + "element_count": 36634, + "missing_count": 0, + "missing_percent": 0.0, + "unexpected_count": 36634, + "unexpected_percent": 1.0, + "unexpected_percent_nonmissing": 1.0, + "partial_unexpected_list": [ + "FL", + "FL", + "FL", + "FL", + "FL", + "FL", + "FL", + "FL", + "FL", + "FL", + "FL", + "FL", + "FL", + "FL", + "FL", + "FL", + "FL", + "FL", + "FL", + "FL" + ] + }, + "exception_info": { + "raised_exception": false, + 
"exception_message": null, + "exception_traceback": null + }, + "expectation_config": { + "expectation_type": "expect_column_values_to_be_in_set", + "kwargs": { + "column": "statecode", + "values_set": [], + "result_format": "BASIC" + } + } + }, + { + "success": true, + "result": { + "element_count": 36634, + "missing_count": 0, + "missing_percent": 0.0, + "unexpected_count": 0, + "unexpected_percent": 0.0, + "unexpected_percent_nonmissing": 0.0, + "partial_unexpected_list": [] + }, + "exception_info": { + "raised_exception": false, + "exception_message": null, + "exception_traceback": null + }, + "expectation_config": { + "expectation_type": "expect_column_values_to_be_of_type", + "kwargs": { + "column": "county", + "type_": "string", + "result_format": "BASIC" + } + } + }, + { + "success": true, + "result": { + "observed_value": 67, + "element_count": 36634, + "missing_count": 0, + "missing_percent": 0.0 + }, + "exception_info": { + "raised_exception": false, + "exception_message": null, + "exception_traceback": null + }, + "expectation_config": { + "expectation_type": "expect_column_unique_value_count_to_be_between", + "kwargs": { + "column": "county", + "min_value": 0, + "max_value": null, + "result_format": "BASIC" + } + } + }, + { + "success": true, + "result": { + "observed_value": 0.0018289021127913959, + "element_count": 36634, + "missing_count": 0, + "missing_percent": 0.0 + }, + "exception_info": { + "raised_exception": false, + "exception_message": null, + "exception_traceback": null + }, + "expectation_config": { + "expectation_type": "expect_column_proportion_of_unique_values_to_be_between", + "kwargs": { + "column": "county", + "min_value": 0, + "max_value": null, + "result_format": "BASIC" + } + } + }, + { + "success": false, + "result": { + "element_count": 36634, + "missing_count": 0, + "missing_percent": 0.0, + "unexpected_count": 36634, + "unexpected_percent": 1.0, + "unexpected_percent_nonmissing": 1.0, + "partial_unexpected_list": [ + "CLAY COUNTY", + "CLAY COUNTY", + "CLAY COUNTY", + "CLAY COUNTY", + "CLAY COUNTY", + "CLAY COUNTY", + "CLAY COUNTY", + "CLAY COUNTY", + "CLAY COUNTY", + "CLAY COUNTY", + "CLAY COUNTY", + "CLAY COUNTY", + "CLAY COUNTY", + "CLAY COUNTY", + "CLAY COUNTY", + "CLAY COUNTY", + "CLAY COUNTY", + "CLAY COUNTY", + "CLAY COUNTY", + "CLAY COUNTY" + ] + }, + "exception_info": { + "raised_exception": false, + "exception_message": null, + "exception_traceback": null + }, + "expectation_config": { + "expectation_type": "expect_column_values_to_be_in_set", + "kwargs": { + "column": "county", + "values_set": [], + "result_format": "BASIC" + } + } + }, + { + "success": true, + "result": { + "element_count": 36634, + "missing_count": 0, + "missing_percent": 0.0, + "unexpected_count": 0, + "unexpected_percent": 0.0, + "unexpected_percent_nonmissing": 0.0, + "partial_unexpected_list": [] + }, + "exception_info": { + "raised_exception": false, + "exception_message": null, + "exception_traceback": null + }, + "expectation_config": { + "expectation_type": "expect_column_values_to_be_of_type", + "kwargs": { + "column": "eq_site_limit", + "type_": "float", + "result_format": "BASIC" + } + } + }, + { + "success": true, + "result": { + "observed_value": 5961, + "element_count": 36634, + "missing_count": 0, + "missing_percent": 0.0 + }, + "exception_info": { + "raised_exception": false, + "exception_message": null, + "exception_traceback": null + }, + "expectation_config": { + "expectation_type": "expect_column_unique_value_count_to_be_between", + "kwargs": { + 
"column": "eq_site_limit", + "min_value": 0, + "max_value": null, + "result_format": "BASIC" + } + } + }, + { + "success": true, + "result": { + "observed_value": 0.1627176939455151, + "element_count": 36634, + "missing_count": 0, + "missing_percent": 0.0 + }, + "exception_info": { + "raised_exception": false, + "exception_message": null, + "exception_traceback": null + }, + "expectation_config": { + "expectation_type": "expect_column_proportion_of_unique_values_to_be_between", + "kwargs": { + "column": "eq_site_limit", + "min_value": 0, + "max_value": null, + "result_format": "BASIC" + } + } + }, + { + "success": true, + "result": { + "element_count": 36634, + "missing_count": 0, + "missing_percent": 0.0, + "unexpected_count": 0, + "unexpected_percent": 0.0, + "unexpected_percent_nonmissing": 0.0, + "partial_unexpected_list": [] + }, + "exception_info": { + "raised_exception": false, + "exception_message": null, + "exception_traceback": null + }, + "expectation_config": { + "expectation_type": "expect_column_values_to_be_of_type", + "kwargs": { + "column": "hu_site_limit", + "type_": "float", + "result_format": "BASIC" + } + } + }, + { + "success": true, + "result": { + "observed_value": 24531, + "element_count": 36634, + "missing_count": 0, + "missing_percent": 0.0 + }, + "exception_info": { + "raised_exception": false, + "exception_message": null, + "exception_traceback": null + }, + "expectation_config": { + "expectation_type": "expect_column_unique_value_count_to_be_between", + "kwargs": { + "column": "hu_site_limit", + "min_value": 0, + "max_value": null, + "result_format": "BASIC" + } + } + }, + { + "success": true, + "result": { + "observed_value": 0.669623846699787, + "element_count": 36634, + "missing_count": 0, + "missing_percent": 0.0 + }, + "exception_info": { + "raised_exception": false, + "exception_message": null, + "exception_traceback": null + }, + "expectation_config": { + "expectation_type": "expect_column_proportion_of_unique_values_to_be_between", + "kwargs": { + "column": "hu_site_limit", + "min_value": 0, + "max_value": null, + "result_format": "BASIC" + } + } + }, + { + "success": true, + "result": { + "element_count": 36634, + "missing_count": 0, + "missing_percent": 0.0, + "unexpected_count": 0, + "unexpected_percent": 0.0, + "unexpected_percent_nonmissing": 0.0, + "partial_unexpected_list": [] + }, + "exception_info": { + "raised_exception": false, + "exception_message": null, + "exception_traceback": null + }, + "expectation_config": { + "expectation_type": "expect_column_values_to_be_of_type", + "kwargs": { + "column": "fl_site_limit", + "type_": "float", + "result_format": "BASIC" + } + } + }, + { + "success": true, + "result": { + "observed_value": 4581, + "element_count": 36634, + "missing_count": 0, + "missing_percent": 0.0 + }, + "exception_info": { + "raised_exception": false, + "exception_message": null, + "exception_traceback": null + }, + "expectation_config": { + "expectation_type": "expect_column_unique_value_count_to_be_between", + "kwargs": { + "column": "fl_site_limit", + "min_value": 0, + "max_value": null, + "result_format": "BASIC" + } + } + }, + { + "success": true, + "result": { + "observed_value": 0.12504776983130425, + "element_count": 36634, + "missing_count": 0, + "missing_percent": 0.0 + }, + "exception_info": { + "raised_exception": false, + "exception_message": null, + "exception_traceback": null + }, + "expectation_config": { + "expectation_type": "expect_column_proportion_of_unique_values_to_be_between", + "kwargs": { + "column": 
"fl_site_limit", + "min_value": 0, + "max_value": null, + "result_format": "BASIC" + } + } + }, + { + "success": true, + "result": { + "element_count": 36634, + "missing_count": 0, + "missing_percent": 0.0, + "unexpected_count": 0, + "unexpected_percent": 0.0, + "unexpected_percent_nonmissing": 0.0, + "partial_unexpected_list": [] + }, + "exception_info": { + "raised_exception": false, + "exception_message": null, + "exception_traceback": null + }, + "expectation_config": { + "expectation_type": "expect_column_values_to_be_of_type", + "kwargs": { + "column": "fr_site_limit", + "type_": "float", + "result_format": "BASIC" + } + } + }, + { + "success": true, + "result": { + "observed_value": 7536, + "element_count": 36634, + "missing_count": 0, + "missing_percent": 0.0 + }, + "exception_info": { + "raised_exception": false, + "exception_message": null, + "exception_traceback": null + }, + "expectation_config": { + "expectation_type": "expect_column_unique_value_count_to_be_between", + "kwargs": { + "column": "fr_site_limit", + "min_value": 0, + "max_value": null, + "result_format": "BASIC" + } + } + }, + { + "success": true, + "result": { + "observed_value": 0.2057105421193427, + "element_count": 36634, + "missing_count": 0, + "missing_percent": 0.0 + }, + "exception_info": { + "raised_exception": false, + "exception_message": null, + "exception_traceback": null + }, + "expectation_config": { + "expectation_type": "expect_column_proportion_of_unique_values_to_be_between", + "kwargs": { + "column": "fr_site_limit", + "min_value": 0, + "max_value": null, + "result_format": "BASIC" + } + } + }, + { + "success": true, + "result": { + "element_count": 36634, + "missing_count": 0, + "missing_percent": 0.0, + "unexpected_count": 0, + "unexpected_percent": 0.0, + "unexpected_percent_nonmissing": 0.0, + "partial_unexpected_list": [] + }, + "exception_info": { + "raised_exception": false, + "exception_message": null, + "exception_traceback": null + }, + "expectation_config": { + "expectation_type": "expect_column_values_to_be_of_type", + "kwargs": { + "column": "tiv_2011", + "type_": "float", + "result_format": "BASIC" + } + } + }, + { + "success": true, + "result": { + "observed_value": 24804, + "element_count": 36634, + "missing_count": 0, + "missing_percent": 0.0 + }, + "exception_info": { + "raised_exception": false, + "exception_message": null, + "exception_traceback": null + }, + "expectation_config": { + "expectation_type": "expect_column_unique_value_count_to_be_between", + "kwargs": { + "column": "tiv_2011", + "min_value": 0, + "max_value": null, + "result_format": "BASIC" + } + } + }, + { + "success": true, + "result": { + "observed_value": 0.6770759403832506, + "element_count": 36634, + "missing_count": 0, + "missing_percent": 0.0 + }, + "exception_info": { + "raised_exception": false, + "exception_message": null, + "exception_traceback": null + }, + "expectation_config": { + "expectation_type": "expect_column_proportion_of_unique_values_to_be_between", + "kwargs": { + "column": "tiv_2011", + "min_value": 0, + "max_value": null, + "result_format": "BASIC" + } + } + }, + { + "success": true, + "result": { + "element_count": 36634, + "missing_count": 0, + "missing_percent": 0.0, + "unexpected_count": 0, + "unexpected_percent": 0.0, + "unexpected_percent_nonmissing": 0.0, + "partial_unexpected_list": [] + }, + "exception_info": { + "raised_exception": false, + "exception_message": null, + "exception_traceback": null + }, + "expectation_config": { + "expectation_type": 
"expect_column_values_to_be_of_type", + "kwargs": { + "column": "tiv_2012", + "type_": "float", + "result_format": "BASIC" + } + } + }, + { + "success": true, + "result": { + "observed_value": 35295, + "element_count": 36634, + "missing_count": 0, + "missing_percent": 0.0 + }, + "exception_info": { + "raised_exception": false, + "exception_message": null, + "exception_traceback": null + }, + "expectation_config": { + "expectation_type": "expect_column_unique_value_count_to_be_between", + "kwargs": { + "column": "tiv_2012", + "min_value": 0, + "max_value": null, + "result_format": "BASIC" + } + } + }, + { + "success": true, + "result": { + "observed_value": 0.9634492547906317, + "element_count": 36634, + "missing_count": 0, + "missing_percent": 0.0 + }, + "exception_info": { + "raised_exception": false, + "exception_message": null, + "exception_traceback": null + }, + "expectation_config": { + "expectation_type": "expect_column_proportion_of_unique_values_to_be_between", + "kwargs": { + "column": "tiv_2012", + "min_value": 0, + "max_value": null, + "result_format": "BASIC" + } + } + }, + { + "success": true, + "result": { + "element_count": 36634, + "missing_count": 0, + "missing_percent": 0.0, + "unexpected_count": 0, + "unexpected_percent": 0.0, + "unexpected_percent_nonmissing": 0.0, + "partial_unexpected_list": [] + }, + "exception_info": { + "raised_exception": false, + "exception_message": null, + "exception_traceback": null + }, + "expectation_config": { + "expectation_type": "expect_column_values_to_be_of_type", + "kwargs": { + "column": "eq_site_deductible", + "type_": "float", + "result_format": "BASIC" + } + } + }, + { + "success": true, + "result": { + "observed_value": 155, + "element_count": 36634, + "missing_count": 0, + "missing_percent": 0.0 + }, + "exception_info": { + "raised_exception": false, + "exception_message": null, + "exception_traceback": null + }, + "expectation_config": { + "expectation_type": "expect_column_unique_value_count_to_be_between", + "kwargs": { + "column": "eq_site_deductible", + "min_value": 0, + "max_value": null, + "result_format": "BASIC" + } + } + }, + { + "success": true, + "result": { + "observed_value": 0.0042310422012338264, + "element_count": 36634, + "missing_count": 0, + "missing_percent": 0.0 + }, + "exception_info": { + "raised_exception": false, + "exception_message": null, + "exception_traceback": null + }, + "expectation_config": { + "expectation_type": "expect_column_proportion_of_unique_values_to_be_between", + "kwargs": { + "column": "eq_site_deductible", + "min_value": 0, + "max_value": null, + "result_format": "BASIC" + } + } + }, + { + "success": true, + "result": { + "element_count": 36634, + "missing_count": 0, + "missing_percent": 0.0, + "unexpected_count": 0, + "unexpected_percent": 0.0, + "unexpected_percent_nonmissing": 0.0, + "partial_unexpected_list": [] + }, + "exception_info": { + "raised_exception": false, + "exception_message": null, + "exception_traceback": null + }, + "expectation_config": { + "expectation_type": "expect_column_values_to_be_of_type", + "kwargs": { + "column": "hu_site_deductible", + "type_": "float", + "result_format": "BASIC" + } + } + }, + { + "success": true, + "result": { + "observed_value": 2153, + "element_count": 36634, + "missing_count": 0, + "missing_percent": 0.0 + }, + "exception_info": { + "raised_exception": false, + "exception_message": null, + "exception_traceback": null + }, + "expectation_config": { + "expectation_type": "expect_column_unique_value_count_to_be_between", + 
"kwargs": { + "column": "hu_site_deductible", + "min_value": 0, + "max_value": null, + "result_format": "BASIC" + } + } + }, + { + "success": true, + "result": { + "observed_value": 0.05877054102746083, + "element_count": 36634, + "missing_count": 0, + "missing_percent": 0.0 + }, + "exception_info": { + "raised_exception": false, + "exception_message": null, + "exception_traceback": null + }, + "expectation_config": { + "expectation_type": "expect_column_proportion_of_unique_values_to_be_between", + "kwargs": { + "column": "hu_site_deductible", + "min_value": 0, + "max_value": null, + "result_format": "BASIC" + } + } + }, + { + "success": true, + "result": { + "element_count": 36634, + "missing_count": 0, + "missing_percent": 0.0, + "unexpected_count": 0, + "unexpected_percent": 0.0, + "unexpected_percent_nonmissing": 0.0, + "partial_unexpected_list": [] + }, + "exception_info": { + "raised_exception": false, + "exception_message": null, + "exception_traceback": null + }, + "expectation_config": { + "expectation_type": "expect_column_values_to_be_of_type", + "kwargs": { + "column": "fl_site_deductible", + "type_": "float", + "result_format": "BASIC" + } + } + }, + { + "success": true, + "result": { + "observed_value": 43, + "element_count": 36634, + "missing_count": 0, + "missing_percent": 0.0 + }, + "exception_info": { + "raised_exception": false, + "exception_message": null, + "exception_traceback": null + }, + "expectation_config": { + "expectation_type": "expect_column_unique_value_count_to_be_between", + "kwargs": { + "column": "fl_site_deductible", + "min_value": 0, + "max_value": null, + "result_format": "BASIC" + } + } + }, + { + "success": true, + "result": { + "observed_value": 0.0011737729977616422, + "element_count": 36634, + "missing_count": 0, + "missing_percent": 0.0 + }, + "exception_info": { + "raised_exception": false, + "exception_message": null, + "exception_traceback": null + }, + "expectation_config": { + "expectation_type": "expect_column_proportion_of_unique_values_to_be_between", + "kwargs": { + "column": "fl_site_deductible", + "min_value": 0, + "max_value": null, + "result_format": "BASIC" + } + } + }, + { + "success": true, + "result": { + "element_count": 36634, + "missing_count": 0, + "missing_percent": 0.0, + "unexpected_count": 0, + "unexpected_percent": 0.0, + "unexpected_percent_nonmissing": 0.0, + "partial_unexpected_list": [] + }, + "exception_info": { + "raised_exception": false, + "exception_message": null, + "exception_traceback": null + }, + "expectation_config": { + "expectation_type": "expect_column_values_to_be_of_type", + "kwargs": { + "column": "fr_site_deductible", + "type_": "int", + "result_format": "BASIC" + } + } + }, + { + "success": true, + "result": { + "observed_value": 6, + "element_count": 36634, + "missing_count": 0, + "missing_percent": 0.0 + }, + "exception_info": { + "raised_exception": false, + "exception_message": null, + "exception_traceback": null + }, + "expectation_config": { + "expectation_type": "expect_column_unique_value_count_to_be_between", + "kwargs": { + "column": "fr_site_deductible", + "min_value": 0, + "max_value": null, + "result_format": "BASIC" + } + } + }, + { + "success": true, + "result": { + "observed_value": 0.00016378227875743845, + "element_count": 36634, + "missing_count": 0, + "missing_percent": 0.0 + }, + "exception_info": { + "raised_exception": false, + "exception_message": null, + "exception_traceback": null + }, + "expectation_config": { + "expectation_type": 
"expect_column_proportion_of_unique_values_to_be_between", + "kwargs": { + "column": "fr_site_deductible", + "min_value": 0, + "max_value": null, + "result_format": "BASIC" + } + } + }, + { + "success": true, + "result": { + "element_count": 36634, + "missing_count": 0, + "missing_percent": 0.0, + "unexpected_count": 0, + "unexpected_percent": 0.0, + "unexpected_percent_nonmissing": 0.0, + "partial_unexpected_list": [] + }, + "exception_info": { + "raised_exception": false, + "exception_message": null, + "exception_traceback": null + }, + "expectation_config": { + "expectation_type": "expect_column_values_to_be_of_type", + "kwargs": { + "column": "point_latitude", + "type_": "float", + "result_format": "BASIC" + } + } + }, + { + "success": true, + "result": { + "observed_value": 17900, + "element_count": 36634, + "missing_count": 0, + "missing_percent": 0.0 + }, + "exception_info": { + "raised_exception": false, + "exception_message": null, + "exception_traceback": null + }, + "expectation_config": { + "expectation_type": "expect_column_unique_value_count_to_be_between", + "kwargs": { + "column": "point_latitude", + "min_value": 0, + "max_value": null, + "result_format": "BASIC" + } + } + }, + { + "success": true, + "result": { + "observed_value": 0.488617131626358, + "element_count": 36634, + "missing_count": 0, + "missing_percent": 0.0 + }, + "exception_info": { + "raised_exception": false, + "exception_message": null, + "exception_traceback": null + }, + "expectation_config": { + "expectation_type": "expect_column_proportion_of_unique_values_to_be_between", + "kwargs": { + "column": "point_latitude", + "min_value": 0, + "max_value": null, + "result_format": "BASIC" + } + } + }, + { + "success": true, + "result": { + "element_count": 36634, + "missing_count": 0, + "missing_percent": 0.0, + "unexpected_count": 0, + "unexpected_percent": 0.0, + "unexpected_percent_nonmissing": 0.0, + "partial_unexpected_list": [] + }, + "exception_info": { + "raised_exception": false, + "exception_message": null, + "exception_traceback": null + }, + "expectation_config": { + "expectation_type": "expect_column_values_to_be_of_type", + "kwargs": { + "column": "point_longitude", + "type_": "float", + "result_format": "BASIC" + } + } + }, + { + "success": true, + "result": { + "observed_value": 17731, + "element_count": 36634, + "missing_count": 0, + "missing_percent": 0.0 + }, + "exception_info": { + "raised_exception": false, + "exception_message": null, + "exception_traceback": null + }, + "expectation_config": { + "expectation_type": "expect_column_unique_value_count_to_be_between", + "kwargs": { + "column": "point_longitude", + "min_value": 0, + "max_value": null, + "result_format": "BASIC" + } + } + }, + { + "success": true, + "result": { + "observed_value": 0.48400393077469017, + "element_count": 36634, + "missing_count": 0, + "missing_percent": 0.0 + }, + "exception_info": { + "raised_exception": false, + "exception_message": null, + "exception_traceback": null + }, + "expectation_config": { + "expectation_type": "expect_column_proportion_of_unique_values_to_be_between", + "kwargs": { + "column": "point_longitude", + "min_value": 0, + "max_value": null, + "result_format": "BASIC" + } + } + }, + { + "success": true, + "result": { + "element_count": 36634, + "missing_count": 0, + "missing_percent": 0.0, + "unexpected_count": 0, + "unexpected_percent": 0.0, + "unexpected_percent_nonmissing": 0.0, + "partial_unexpected_list": [] + }, + "exception_info": { + "raised_exception": false, + "exception_message": 
null, + "exception_traceback": null + }, + "expectation_config": { + "expectation_type": "expect_column_values_to_be_of_type", + "kwargs": { + "column": "line", + "type_": "string", + "result_format": "BASIC" + } + } + }, + { + "success": true, + "result": { + "observed_value": 2, + "element_count": 36634, + "missing_count": 0, + "missing_percent": 0.0 + }, + "exception_info": { + "raised_exception": false, + "exception_message": null, + "exception_traceback": null + }, + "expectation_config": { + "expectation_type": "expect_column_unique_value_count_to_be_between", + "kwargs": { + "column": "line", + "min_value": 0, + "max_value": null, + "result_format": "BASIC" + } + } + }, + { + "success": true, + "result": { + "observed_value": 5.4594092919146145e-05, + "element_count": 36634, + "missing_count": 0, + "missing_percent": 0.0 + }, + "exception_info": { + "raised_exception": false, + "exception_message": null, + "exception_traceback": null + }, + "expectation_config": { + "expectation_type": "expect_column_proportion_of_unique_values_to_be_between", + "kwargs": { + "column": "line", + "min_value": 0, + "max_value": null, + "result_format": "BASIC" + } + } + }, + { + "success": false, + "result": { + "element_count": 36634, + "missing_count": 0, + "missing_percent": 0.0, + "unexpected_count": 36634, + "unexpected_percent": 1.0, + "unexpected_percent_nonmissing": 1.0, + "partial_unexpected_list": [ + "Residential", + "Residential", + "Residential", + "Residential", + "Residential", + "Residential", + "Commercial", + "Residential", + "Residential", + "Residential", + "Residential", + "Residential", + "Residential", + "Residential", + "Residential", + "Residential", + "Residential", + "Residential", + "Residential", + "Residential" + ] + }, + "exception_info": { + "raised_exception": false, + "exception_message": null, + "exception_traceback": null + }, + "expectation_config": { + "expectation_type": "expect_column_values_to_be_in_set", + "kwargs": { + "column": "line", + "values_set": [], + "result_format": "BASIC" + } + } + }, + { + "success": true, + "result": { + "element_count": 36634, + "missing_count": 0, + "missing_percent": 0.0, + "unexpected_count": 0, + "unexpected_percent": 0.0, + "unexpected_percent_nonmissing": 0.0, + "partial_unexpected_list": [] + }, + "exception_info": { + "raised_exception": false, + "exception_message": null, + "exception_traceback": null + }, + "expectation_config": { + "expectation_type": "expect_column_values_to_be_of_type", + "kwargs": { + "column": "construction", + "type_": "string", + "result_format": "BASIC" + } + } + }, + { + "success": true, + "result": { + "observed_value": 5, + "element_count": 36634, + "missing_count": 0, + "missing_percent": 0.0 + }, + "exception_info": { + "raised_exception": false, + "exception_message": null, + "exception_traceback": null + }, + "expectation_config": { + "expectation_type": "expect_column_unique_value_count_to_be_between", + "kwargs": { + "column": "construction", + "min_value": 0, + "max_value": null, + "result_format": "BASIC" + } + } + }, + { + "success": true, + "result": { + "observed_value": 0.00013648523229786537, + "element_count": 36634, + "missing_count": 0, + "missing_percent": 0.0 + }, + "exception_info": { + "raised_exception": false, + "exception_message": null, + "exception_traceback": null + }, + "expectation_config": { + "expectation_type": "expect_column_proportion_of_unique_values_to_be_between", + "kwargs": { + "column": "construction", + "min_value": 0, + "max_value": null, + 
"result_format": "BASIC" + } + } + }, + { + "success": false, + "result": { + "element_count": 36634, + "missing_count": 0, + "missing_percent": 0.0, + "unexpected_count": 36634, + "unexpected_percent": 1.0, + "unexpected_percent_nonmissing": 1.0, + "partial_unexpected_list": [ + "Masonry", + "Masonry", + "Wood", + "Wood", + "Wood", + "Masonry", + "Reinforced Concrete", + "Wood", + "Wood", + "Masonry", + "Masonry", + "Wood", + "Wood", + "Wood", + "Wood", + "Wood", + "Wood", + "Wood", + "Wood", + "Wood" + ] + }, + "exception_info": { + "raised_exception": false, + "exception_message": null, + "exception_traceback": null + }, + "expectation_config": { + "expectation_type": "expect_column_values_to_be_in_set", + "kwargs": { + "column": "construction", + "values_set": [], + "result_format": "BASIC" + } + } + }, + { + "success": true, + "result": { + "element_count": 36634, + "missing_count": 0, + "missing_percent": 0.0, + "unexpected_count": 0, + "unexpected_percent": 0.0, + "unexpected_percent_nonmissing": 0.0, + "partial_unexpected_list": [] + }, + "exception_info": { + "raised_exception": false, + "exception_message": null, + "exception_traceback": null + }, + "expectation_config": { + "expectation_type": "expect_column_values_to_be_of_type", + "kwargs": { + "column": "point_granularity", + "type_": "int", + "result_format": "BASIC" + } + } + }, + { + "success": true, + "result": { + "observed_value": 5, + "element_count": 36634, + "missing_count": 0, + "missing_percent": 0.0 + }, + "exception_info": { + "raised_exception": false, + "exception_message": null, + "exception_traceback": null + }, + "expectation_config": { + "expectation_type": "expect_column_unique_value_count_to_be_between", + "kwargs": { + "column": "point_granularity", + "min_value": 0, + "max_value": null, + "result_format": "BASIC" + } + } + }, + { + "success": true, + "result": { + "observed_value": 0.00013648523229786537, + "element_count": 36634, + "missing_count": 0, + "missing_percent": 0.0 + }, + "exception_info": { + "raised_exception": false, + "exception_message": null, + "exception_traceback": null + }, + "expectation_config": { + "expectation_type": "expect_column_proportion_of_unique_values_to_be_between", + "kwargs": { + "column": "point_granularity", + "min_value": 0, + "max_value": null, + "result_format": "BASIC" + } + } + } + ], + "success": false, + "statistics": { + "evaluated_expectations": 80, + "successful_expectations": 74, + "unsuccessful_expectations": 6, + "success_percent": 92.5 + } +} \ No newline at end of file diff --git a/tests/test_fixtures/rendering_fixtures/expectation_suite_3.json b/tests/test_fixtures/rendering_fixtures/expectation_suite_3.json new file mode 100644 index 000000000000..a142345344b3 --- /dev/null +++ b/tests/test_fixtures/rendering_fixtures/expectation_suite_3.json @@ -0,0 +1,588 @@ +{ + "data_asset_name": null, + "expectation_suite_name": "default", + "meta": { + "great_expectations.__version__": "0.4.4" + }, + "expectations": [ + { + "expectation_type": "expect_column_to_exist", + "kwargs": { + "column": "policyID" + } + }, + { + "expectation_type": "expect_column_to_exist", + "kwargs": { + "column": "statecode" + } + }, + { + "expectation_type": "expect_column_to_exist", + "kwargs": { + "column": "county" + } + }, + { + "expectation_type": "expect_column_to_exist", + "kwargs": { + "column": "eq_site_limit" + } + }, + { + "expectation_type": "expect_column_to_exist", + "kwargs": { + "column": "hu_site_limit" + } + }, + { + "expectation_type": "expect_column_to_exist", + 
"kwargs": { + "column": "fl_site_limit" + } + }, + { + "expectation_type": "expect_column_to_exist", + "kwargs": { + "column": "fr_site_limit" + } + }, + { + "expectation_type": "expect_column_to_exist", + "kwargs": { + "column": "tiv_2011" + } + }, + { + "expectation_type": "expect_column_to_exist", + "kwargs": { + "column": "tiv_2012" + } + }, + { + "expectation_type": "expect_column_to_exist", + "kwargs": { + "column": "eq_site_deductible" + } + }, + { + "expectation_type": "expect_column_to_exist", + "kwargs": { + "column": "hu_site_deductible" + } + }, + { + "expectation_type": "expect_column_to_exist", + "kwargs": { + "column": "fl_site_deductible" + } + }, + { + "expectation_type": "expect_column_to_exist", + "kwargs": { + "column": "fr_site_deductible" + } + }, + { + "expectation_type": "expect_column_to_exist", + "kwargs": { + "column": "point_latitude" + } + }, + { + "expectation_type": "expect_column_to_exist", + "kwargs": { + "column": "point_longitude" + } + }, + { + "expectation_type": "expect_column_to_exist", + "kwargs": { + "column": "line" + } + }, + { + "expectation_type": "expect_column_to_exist", + "kwargs": { + "column": "construction" + } + }, + { + "expectation_type": "expect_column_to_exist", + "kwargs": { + "column": "point_granularity" + } + }, + { + "expectation_type": "expect_column_max_to_be_between", + "kwargs": { + "column": "hu_site_limit", + "min_value": 0, + "max_value": 0 + } + }, + { + "expectation_type": "expect_column_values_to_be_between", + "kwargs": { + "column": "hu_site_limit", + "min_value": 100, + "max_value": 216000000.0, + "mostly": 0.9 + } + }, + { + "expectation_type": "expect_column_values_to_be_of_type", + "kwargs": { + "column": "policyID", + "type_": "int" + } + }, + { + "expectation_type": "expect_column_unique_value_count_to_be_between", + "kwargs": { + "column": "policyID", + "min_value": 0, + "max_value": null + } + }, + { + "expectation_type": "expect_column_proportion_of_unique_values_to_be_between", + "kwargs": { + "column": "policyID", + "min_value": 0, + "max_value": null + } + }, + { + "expectation_type": "expect_column_values_to_be_unique", + "kwargs": { + "column": "policyID" + } + }, + { + "expectation_type": "expect_column_values_to_be_increasing", + "kwargs": { + "column": "policyID" + } + }, + { + "expectation_type": "expect_column_values_to_be_of_type", + "kwargs": { + "column": "statecode", + "type_": "string" + } + }, + { + "expectation_type": "expect_column_unique_value_count_to_be_between", + "kwargs": { + "column": "statecode", + "min_value": 0, + "max_value": null + } + }, + { + "expectation_type": "expect_column_proportion_of_unique_values_to_be_between", + "kwargs": { + "column": "statecode", + "min_value": 0, + "max_value": null + } + }, + { + "expectation_type": "expect_column_values_to_be_in_set", + "kwargs": { + "column": "statecode", + "values_set": [] + } + }, + { + "expectation_type": "expect_column_values_to_be_of_type", + "kwargs": { + "column": "county", + "type_": "string" + } + }, + { + "expectation_type": "expect_column_unique_value_count_to_be_between", + "kwargs": { + "column": "county", + "min_value": 0, + "max_value": null + } + }, + { + "expectation_type": "expect_column_proportion_of_unique_values_to_be_between", + "kwargs": { + "column": "county", + "min_value": 0, + "max_value": null + } + }, + { + "expectation_type": "expect_column_values_to_be_in_set", + "kwargs": { + "column": "county", + "values_set": [] + } + }, + { + "expectation_type": "expect_column_values_to_be_of_type", + "kwargs": 
{ + "column": "eq_site_limit", + "type_": "float" + } + }, + { + "expectation_type": "expect_column_unique_value_count_to_be_between", + "kwargs": { + "column": "eq_site_limit", + "min_value": 0, + "max_value": null + } + }, + { + "expectation_type": "expect_column_proportion_of_unique_values_to_be_between", + "kwargs": { + "column": "eq_site_limit", + "min_value": 0, + "max_value": null + } + }, + { + "expectation_type": "expect_column_values_to_be_of_type", + "kwargs": { + "column": "hu_site_limit", + "type_": "float" + } + }, + { + "expectation_type": "expect_column_unique_value_count_to_be_between", + "kwargs": { + "column": "hu_site_limit", + "min_value": 0, + "max_value": null + } + }, + { + "expectation_type": "expect_column_proportion_of_unique_values_to_be_between", + "kwargs": { + "column": "hu_site_limit", + "min_value": 0, + "max_value": null + } + }, + { + "expectation_type": "expect_column_values_to_be_of_type", + "kwargs": { + "column": "fl_site_limit", + "type_": "float" + } + }, + { + "expectation_type": "expect_column_unique_value_count_to_be_between", + "kwargs": { + "column": "fl_site_limit", + "min_value": 0, + "max_value": null + } + }, + { + "expectation_type": "expect_column_proportion_of_unique_values_to_be_between", + "kwargs": { + "column": "fl_site_limit", + "min_value": 0, + "max_value": null + } + }, + { + "expectation_type": "expect_column_values_to_be_of_type", + "kwargs": { + "column": "fr_site_limit", + "type_": "float" + } + }, + { + "expectation_type": "expect_column_unique_value_count_to_be_between", + "kwargs": { + "column": "fr_site_limit", + "min_value": 0, + "max_value": null + } + }, + { + "expectation_type": "expect_column_proportion_of_unique_values_to_be_between", + "kwargs": { + "column": "fr_site_limit", + "min_value": 0, + "max_value": null + } + }, + { + "expectation_type": "expect_column_values_to_be_of_type", + "kwargs": { + "column": "tiv_2011", + "type_": "float" + } + }, + { + "expectation_type": "expect_column_unique_value_count_to_be_between", + "kwargs": { + "column": "tiv_2011", + "min_value": 0, + "max_value": null + } + }, + { + "expectation_type": "expect_column_proportion_of_unique_values_to_be_between", + "kwargs": { + "column": "tiv_2011", + "min_value": 0, + "max_value": null + } + }, + { + "expectation_type": "expect_column_values_to_be_of_type", + "kwargs": { + "column": "tiv_2012", + "type_": "float" + } + }, + { + "expectation_type": "expect_column_unique_value_count_to_be_between", + "kwargs": { + "column": "tiv_2012", + "min_value": 0, + "max_value": null + } + }, + { + "expectation_type": "expect_column_proportion_of_unique_values_to_be_between", + "kwargs": { + "column": "tiv_2012", + "min_value": 0, + "max_value": null + } + }, + { + "expectation_type": "expect_column_values_to_be_of_type", + "kwargs": { + "column": "eq_site_deductible", + "type_": "float" + } + }, + { + "expectation_type": "expect_column_unique_value_count_to_be_between", + "kwargs": { + "column": "eq_site_deductible", + "min_value": 0, + "max_value": null + } + }, + { + "expectation_type": "expect_column_proportion_of_unique_values_to_be_between", + "kwargs": { + "column": "eq_site_deductible", + "min_value": 0, + "max_value": null + } + }, + { + "expectation_type": "expect_column_values_to_be_of_type", + "kwargs": { + "column": "hu_site_deductible", + "type_": "float" + } + }, + { + "expectation_type": "expect_column_unique_value_count_to_be_between", + "kwargs": { + "column": "hu_site_deductible", + "min_value": 0, + "max_value": null + } + }, + 
{ + "expectation_type": "expect_column_proportion_of_unique_values_to_be_between", + "kwargs": { + "column": "hu_site_deductible", + "min_value": 0, + "max_value": null + } + }, + { + "expectation_type": "expect_column_values_to_be_of_type", + "kwargs": { + "column": "fl_site_deductible", + "type_": "float" + } + }, + { + "expectation_type": "expect_column_unique_value_count_to_be_between", + "kwargs": { + "column": "fl_site_deductible", + "min_value": 0, + "max_value": null + } + }, + { + "expectation_type": "expect_column_proportion_of_unique_values_to_be_between", + "kwargs": { + "column": "fl_site_deductible", + "min_value": 0, + "max_value": null + } + }, + { + "expectation_type": "expect_column_values_to_be_of_type", + "kwargs": { + "column": "fr_site_deductible", + "type_": "int" + } + }, + { + "expectation_type": "expect_column_unique_value_count_to_be_between", + "kwargs": { + "column": "fr_site_deductible", + "min_value": 0, + "max_value": null + } + }, + { + "expectation_type": "expect_column_proportion_of_unique_values_to_be_between", + "kwargs": { + "column": "fr_site_deductible", + "min_value": 0, + "max_value": null + } + }, + { + "expectation_type": "expect_column_values_to_be_of_type", + "kwargs": { + "column": "point_latitude", + "type_": "float" + } + }, + { + "expectation_type": "expect_column_unique_value_count_to_be_between", + "kwargs": { + "column": "point_latitude", + "min_value": 0, + "max_value": null + } + }, + { + "expectation_type": "expect_column_proportion_of_unique_values_to_be_between", + "kwargs": { + "column": "point_latitude", + "min_value": 0, + "max_value": null + } + }, + { + "expectation_type": "expect_column_values_to_be_of_type", + "kwargs": { + "column": "point_longitude", + "type_": "float" + } + }, + { + "expectation_type": "expect_column_unique_value_count_to_be_between", + "kwargs": { + "column": "point_longitude", + "min_value": 0, + "max_value": null + } + }, + { + "expectation_type": "expect_column_proportion_of_unique_values_to_be_between", + "kwargs": { + "column": "point_longitude", + "min_value": 0, + "max_value": null + } + }, + { + "expectation_type": "expect_column_values_to_be_of_type", + "kwargs": { + "column": "line", + "type_": "string" + } + }, + { + "expectation_type": "expect_column_unique_value_count_to_be_between", + "kwargs": { + "column": "line", + "min_value": 0, + "max_value": null + } + }, + { + "expectation_type": "expect_column_proportion_of_unique_values_to_be_between", + "kwargs": { + "column": "line", + "min_value": 0, + "max_value": null + } + }, + { + "expectation_type": "expect_column_values_to_be_in_set", + "kwargs": { + "column": "line", + "values_set": [] + } + }, + { + "expectation_type": "expect_column_values_to_be_of_type", + "kwargs": { + "column": "construction", + "type_": "string" + } + }, + { + "expectation_type": "expect_column_unique_value_count_to_be_between", + "kwargs": { + "column": "construction", + "min_value": 0, + "max_value": null + } + }, + { + "expectation_type": "expect_column_proportion_of_unique_values_to_be_between", + "kwargs": { + "column": "construction", + "min_value": 0, + "max_value": null + } + }, + { + "expectation_type": "expect_column_values_to_be_in_set", + "kwargs": { + "column": "construction", + "values_set": [] + } + }, + { + "expectation_type": "expect_column_values_to_be_of_type", + "kwargs": { + "column": "point_granularity", + "type_": "int" + } + }, + { + "expectation_type": "expect_column_unique_value_count_to_be_between", + "kwargs": { + "column": 
"point_granularity", + "min_value": 0, + "max_value": null + } + }, + { + "expectation_type": "expect_column_proportion_of_unique_values_to_be_between", + "kwargs": { + "column": "point_granularity", + "min_value": 0, + "max_value": null + } + } + ] +} \ No newline at end of file diff --git a/tests/test_fixtures/rendering_fixtures/expectations_suite_1.json b/tests/test_fixtures/rendering_fixtures/expectations_suite_1.json new file mode 100644 index 000000000000..6992652f6d72 --- /dev/null +++ b/tests/test_fixtures/rendering_fixtures/expectations_suite_1.json @@ -0,0 +1,35 @@ +{ + "data_asset_name": null, + "expectation_suite_name": "default", + "meta": { + "great_expectations.__version__": "0.4.5" + }, + "expectations": [ + { + "expectation_type": "expect_column_to_exist", + "kwargs": {"column": "x_var"} + }, + { + "expectation_type": "expect_column_to_exist", + "kwargs": {"column": "y_var"} + }, + { + "expectation_type": "expect_column_values_to_not_be_null", + "kwargs": {"column": "x_var"} + }, + { + "expectation_type": "expect_column_values_to_be_of_type", + "kwargs": { + "column": "x_var", + "type_": "int", + "target_datasource": "python" + } + }, + { + "expectation_type": "expect_column_values_to_not_be_null", + "kwargs": { + "column": "y_var" + } + } + ] +} \ No newline at end of file diff --git a/tests/test_fixtures/rendering_fixtures/expectations_suite_2.json b/tests/test_fixtures/rendering_fixtures/expectations_suite_2.json new file mode 100644 index 000000000000..02df3acbeafe --- /dev/null +++ b/tests/test_fixtures/rendering_fixtures/expectations_suite_2.json @@ -0,0 +1,2592 @@ +[ + { + "expectation_type": "expect_column_value_lengths_to_equal", + "kwargs": { + "column": "equal_length_string", + "value": 1 + } + }, + { + "expectation_type": "expect_column_value_lengths_to_equal", + "kwargs": { + "column": "equal_length_string", + "value": 2 + } + }, + { + "expectation_type": "expect_column_value_lengths_to_equal", + "kwargs": { + "column": "s3", + "value": 4, + "mostly": 0.5 + } + }, + { + "expectation_type": "expect_column_value_lengths_to_equal", + "kwargs": { + "column": "s1", + "value": 5 + } + }, + { + "expectation_type": "expect_column_value_lengths_to_equal", + "kwargs": { + "column": "s1", + "value": 5, + "mostly": 0.8 + } + }, + { + "expectation_type": "expect_column_value_lengths_to_equal", + "kwargs": { + "column": "equal_length_integer", + "value": 1, + "catch_exceptions": true + } + }, + { + "expectation_type": "expect_column_values_to_not_be_null", + "kwargs": { + "column": "no_null", + "result_format": "BASIC" + } + }, + { + "expectation_type": "expect_column_values_to_not_be_null", + "kwargs": { + "column": "all_null", + "result_format": "COMPLETE" + } + }, + { + "expectation_type": "expect_column_values_to_not_be_null", + "kwargs": { + "column": "one_null", + "mostly": 0.7, + "result_format": "COMPLETE" + } + }, + { + "expectation_type": "expect_column_values_to_not_be_null", + "kwargs": { + "column": "one_null", + "result_format": "COMPLETE" + } + }, + { + "expectation_type": "expect_column_values_to_not_be_null", + "kwargs": { + "column": "one_null", + "mostly": 0.8 + } + }, + { + "expectation_type": "expect_column_values_to_not_be_null", + "kwargs": { + "column": "no_null", + "mostly": 0.9 + } + }, + { + "expectation_type": "expect_column_values_to_not_be_null", + "kwargs": { + "column": "three_null" + } + }, + { + "expectation_type": "expect_column_values_to_match_regex_list", + "kwargs": { + "column": "w", + "regex_list": [ + "\\d+" + ] + } + }, + { + 
"expectation_type": "expect_column_values_to_match_regex_list", + "kwargs": { + "column": "w", + "regex_list": [ + "[123]+", + "[456]+" + ], + "match_on": "any" + } + }, + { + "expectation_type": "expect_column_values_to_match_regex_list", + "kwargs": { + "column": "w", + "regex_list": [ + "[123]+", + "[456]+" + ], + "match_on": "all" + } + }, + { + "expectation_type": "expect_column_values_to_match_regex_list", + "kwargs": { + "column": "x", + "regex_list": [ + "^.*a.*$" + ] + } + }, + { + "expectation_type": "expect_column_values_to_match_regex_list", + "kwargs": { + "column": "x", + "regex_list": [ + "^.*a.*$", + "b.t" + ], + "match_on": "any" + } + }, + { + "expectation_type": "expect_column_values_to_be_between", + "kwargs": { + "column": "x", + "max_value": 10, + "min_value": 1 + } + }, + { + "expectation_type": "expect_column_values_to_be_between", + "kwargs": { + "column": "x", + "max_value": 20, + "min_value": 0 + } + }, + { + "expectation_type": "expect_column_values_to_be_between", + "kwargs": { + "column": "x", + "max_value": 20 + } + }, + { + "expectation_type": "expect_column_values_to_be_between", + "kwargs": { + "column": "x", + "min_value": null, + "max_value": 20 + } + }, + { + "expectation_type": "expect_column_values_to_be_between", + "kwargs": { + "column": "x", + "min_value": 0 + } + }, + { + "expectation_type": "expect_column_values_to_be_between", + "kwargs": { + "column": "x", + "min_value": 0, + "max_value": null + } + }, + { + "expectation_type": "expect_column_values_to_be_between", + "kwargs": { + "column": "x", + "max_value": 9, + "min_value": 1 + } + }, + { + "expectation_type": "expect_column_values_to_be_between", + "kwargs": { + "column": "x", + "max_value": 10, + "min_value": 3 + } + }, + { + "expectation_type": "expect_column_values_to_be_between", + "kwargs": { + "column": "x", + "max_value": 10, + "min_value": 1, + "result_format": "BOOLEAN_ONLY" + } + }, + { + "expectation_type": "expect_column_values_to_be_between", + "kwargs": { + "column": "x", + "max_value": 20, + "min_value": 0, + "result_format": "BOOLEAN_ONLY" + } + }, + { + "expectation_type": "expect_column_values_to_be_between", + "kwargs": { + "column": "x", + "max_value": 9, + "min_value": 1, + "result_format": "BOOLEAN_ONLY" + } + }, + { + "expectation_type": "expect_column_values_to_be_between", + "kwargs": { + "column": "x", + "max_value": 10, + "min_value": 3, + "result_format": "BOOLEAN_ONLY" + } + }, + { + "expectation_type": "expect_column_values_to_be_between", + "kwargs": { + "column": "x", + "max_value": 10, + "min_value": 1, + "mostly": 0.9 + } + }, + { + "expectation_type": "expect_column_values_to_be_between", + "kwargs": { + "column": "x", + "max_value": 20, + "min_value": 0, + "mostly": 0.9 + } + }, + { + "expectation_type": "expect_column_values_to_be_between", + "kwargs": { + "column": "x", + "max_value": 9, + "min_value": 1, + "mostly": 0.9 + } + }, + { + "expectation_type": "expect_column_values_to_be_between", + "kwargs": { + "column": "x", + "max_value": 10, + "min_value": 3, + "mostly": 0.9 + } + }, + { + "expectation_type": "expect_column_values_to_be_between", + "kwargs": { + "column": "y", + "max_value": 10, + "min_value": 1, + "mostly": 0.95, + "catch_exceptions": true + } + }, + { + "expectation_type": "expect_column_values_to_be_between", + "kwargs": { + "column": "y", + "max_value": 10, + "min_value": 1, + "mostly": 0.9, + "catch_exceptions": true + } + }, + { + "expectation_type": "expect_column_values_to_be_between", + "kwargs": { + "column": "y", + 
"max_value": 10, + "min_value": 1, + "mostly": 0.8, + "catch_exceptions": true + } + }, + { + "expectation_type": "expect_column_values_to_be_between", + "kwargs": { + "column": "z", + "max_value": 4, + "min_value": 1, + "mostly": 0.9 + } + }, + { + "expectation_type": "expect_column_values_to_be_between", + "kwargs": { + "column": "z", + "max_value": 4, + "min_value": 1, + "mostly": 0.8 + } + }, + { + "expectation_type": "expect_column_values_to_be_between", + "kwargs": { + "column": "ts", + "max_value": "Dec 31 2000", + "min_value": "Jan 01 2000", + "parse_strings_as_datetimes": true, + "output_strftime_format": "%b %d %Y %H:%M:%S" + } + }, + { + "expectation_type": "expect_column_values_to_be_between", + "kwargs": { + "column": "numeric", + "max_value": 10, + "min_value": 0, + "catch_exceptions": true + } + }, + { + "expectation_type": "expect_column_values_to_be_between", + "kwargs": { + "column": "x", + "min_value": 10, + "max_value": 0, + "catch_exceptions": true + } + }, + { + "expectation_type": "expect_column_values_to_be_between", + "kwargs": { + "column": "y", + "min_value": 0, + "max_value": 10, + "allow_cross_type_comparisons": true + } + }, + { + "expectation_type": "expect_column_values_to_be_between", + "kwargs": { + "column": "numeric", + "min_value": 0, + "max_value": 10, + "allow_cross_type_comparisons": true + } + }, + { + "expectation_type": "expect_column_values_to_be_between", + "kwargs": { + "column": "y", + "max_value": null, + "min_value": null, + "catch_exceptions": true + } + }, + { + "expectation_type": "expect_column_values_to_match_regex", + "kwargs": { + "column": "a", + "regex": "^a", + "mostly": 0.9 + } + }, + { + "expectation_type": "expect_column_values_to_match_regex", + "kwargs": { + "column": "a", + "regex": "^a", + "mostly": 0.8 + } + }, + { + "expectation_type": "expect_column_values_to_match_regex", + "kwargs": { + "column": "a", + "regex": "^a", + "mostly": 0.7 + } + }, + { + "expectation_type": "expect_column_values_to_match_regex", + "kwargs": { + "column": "b", + "regex": "^a", + "mostly": 0.8 + } + }, + { + "expectation_type": "expect_column_values_to_match_regex", + "kwargs": { + "column": "b", + "regex": "^a", + "mostly": 0.75 + } + }, + { + "expectation_type": "expect_column_values_to_match_regex", + "kwargs": { + "column": "b", + "regex": "^a", + "mostly": 0.7 + } + }, + { + "expectation_type": "expect_column_values_to_match_regex", + "kwargs": { + "column": "c", + "regex": "^a" + } + }, + { + "expectation_type": "expect_column_values_to_match_regex", + "kwargs": { + "column": "c", + "regex": "^a", + "mostly": 0.2 + } + }, + { + "expectation_type": "expect_column_values_to_match_regex", + "kwargs": { + "column": "b", + "regex": "" + } + }, + { + "expectation_type": "expect_column_values_to_match_regex", + "kwargs": { + "column": "b", + "regex": "[ab]{1}[a-d]{2}" + } + }, + { + "expectation_type": "expect_column_values_to_match_regex", + "kwargs": { + "column": "a", + "regex": "b", + "mostly": 0.4 + } + }, + { + "expectation_type": "expect_column_values_to_not_be_in_set", + "kwargs": { + "column": "x", + "value_set": [ + 0 + ] + } + }, + { + "expectation_type": "expect_column_values_to_not_be_in_set", + "kwargs": { + "column": "x", + "value_set": [ + 1 + ] + } + }, + { + "expectation_type": "expect_column_values_to_not_be_in_set", + "kwargs": { + "column": "x", + "value_set": [] + } + }, + { + "expectation_type": "expect_column_values_to_not_be_in_set", + "kwargs": { + "column": "z", + "value_set": [ + "hello", + "jello", + "mello" + ] + } 
+ }, + { + "expectation_type": "expect_column_values_to_not_be_in_set", + "kwargs": { + "column": "z", + "value_set": [ + "ello" + ] + } + }, + { + "expectation_type": "expect_column_values_to_not_be_in_set", + "kwargs": { + "column": "y", + "value_set": [ + 1.1, + 2.2 + ], + "mostly": 0.65 + } + }, + { + "expectation_type": "expect_column_values_to_not_be_in_set", + "kwargs": { + "column": "y", + "value_set": [ + 1.1, + 2.2 + ], + "mostly": 0.7 + } + }, + { + "expectation_type": "expect_column_values_to_not_be_in_set", + "kwargs": { + "column": "n", + "value_set": [ + null + ] + } + }, + { + "expectation_type": "expect_column_values_to_not_be_in_set", + "kwargs": { + "column": "z", + "value_set": null, + "catch_exceptions": true + } + }, + { + "expectation_type": "expect_column_values_to_be_of_type", + "kwargs": { + "column": "x", + "type_": "int" + } + }, + { + "expectation_type": "expect_column_values_to_be_of_type", + "kwargs": { + "column": "x", + "type_": "string" + } + }, + { + "expectation_type": "expect_column_values_to_be_of_type", + "kwargs": { + "column": "y", + "type_": "float" + } + }, + { + "expectation_type": "expect_column_values_to_be_of_type", + "kwargs": { + "column": "z", + "type_": "string" + } + }, + { + "expectation_type": "expect_column_values_to_be_of_type", + "kwargs": { + "column": "b", + "type_": "boolean" + } + }, + { + "expectation_type": "expect_column_values_to_be_in_set", + "kwargs": { + "column": "x", + "value_set": [ + 1, + 2, + 4 + ] + } + }, + { + "expectation_type": "expect_column_values_to_be_in_set", + "kwargs": { + "column": "x", + "value_set": [ + 2, + 4 + ] + } + }, + { + "expectation_type": "expect_column_values_to_be_in_set", + "kwargs": { + "column": "x", + "value_set": [] + } + }, + { + "expectation_type": "expect_column_values_to_be_in_set", + "kwargs": { + "column": "z", + "value_set": [ + "hello", + "jello", + "mello" + ] + } + }, + { + "expectation_type": "expect_column_values_to_be_in_set", + "kwargs": { + "column": "z", + "value_set": [ + "hello", + "jello" + ] + } + }, + { + "expectation_type": "expect_column_values_to_be_in_set", + "kwargs": { + "column": "x", + "value_set": [ + 3 + ] + } + }, + { + "expectation_type": "expect_column_values_to_be_in_set", + "kwargs": { + "column": "y", + "value_set": [ + 1.1, + 2.2, + 5.5 + ] + } + }, + { + "expectation_type": "expect_column_values_to_be_in_set", + "kwargs": { + "column": "y", + "value_set": [ + 1.11, + 2.22, + 5.51 + ] + } + }, + { + "expectation_type": "expect_column_values_to_be_in_set", + "kwargs": { + "column": "z", + "value_set": null, + "catch_exceptions": true + } + }, + { + "expectation_type": "expect_column_values_to_be_in_set", + "kwargs": { + "column": "empty_column", + "value_set": [ + "cat", + "dog" + ], + "catch_exceptions": false + } + }, + { + "expectation_type": "expect_column_values_to_be_in_set", + "kwargs": { + "column": "dates", + "value_set": [ + "2018-01-01", + "2018-01-02", + "2018-01-02 00:34:01" + ], + "parse_strings_as_datetimes": true + } + }, + { + "expectation_type": "expect_column_values_to_be_in_set", + "kwargs": { + "column": "dates", + "value_set": [ + "2018-01-02", + "2018-01-02 00:34:01" + ], + "parse_strings_as_datetimes": true + } + }, + { + "expectation_type": "expect_column_values_to_be_decreasing", + "kwargs": { + "column": "w", + "mostly": 0.6 + } + }, + { + "expectation_type": "expect_column_values_to_be_decreasing", + "kwargs": { + "column": "y" + } + }, + { + "expectation_type": "expect_column_values_to_be_decreasing", + "kwargs": { + 
"column": "y", + "strictly": true + } + }, + { + "expectation_type": "expect_column_values_to_be_decreasing", + "kwargs": { + "column": "x" + } + }, + { + "expectation_type": "expect_column_values_to_be_decreasing", + "kwargs": { + "column": "z", + "parse_strings_as_datetimes": true + } + }, + { + "expectation_type": "expect_column_values_to_be_decreasing", + "kwargs": { + "column": "z", + "parse_strings_as_datetimes": true, + "strictly": true + } + }, + { + "expectation_type": "expect_column_values_to_be_decreasing", + "kwargs": { + "column": "empty_column", + "catch_exceptions": false + } + }, + { + "expectation_type": "expect_column_values_to_not_match_regex", + "kwargs": { + "column": "a", + "regex": "^a", + "mostly": 0.3 + } + }, + { + "expectation_type": "expect_column_values_to_not_match_regex", + "kwargs": { + "column": "a", + "regex": "^a", + "mostly": 0.2 + } + }, + { + "expectation_type": "expect_column_values_to_not_match_regex", + "kwargs": { + "column": "a", + "regex": "^a", + "mostly": 0.1 + } + }, + { + "expectation_type": "expect_column_values_to_not_match_regex", + "kwargs": { + "column": "b", + "regex": "^a", + "mostly": 0.5 + } + }, + { + "expectation_type": "expect_column_values_to_not_match_regex", + "kwargs": { + "column": "b", + "regex": "^c" + } + }, + { + "expectation_type": "expect_column_values_to_not_match_regex", + "kwargs": { + "column": "c", + "regex": "^a" + } + }, + { + "expectation_type": "expect_column_values_to_not_match_regex", + "kwargs": { + "column": "c", + "regex": "^a", + "mostly": 0.2 + } + }, + { + "expectation_type": "expect_column_values_to_not_match_regex", + "kwargs": { + "column": "b", + "regex": "" + } + }, + { + "expectation_type": "expect_column_values_to_not_match_regex", + "kwargs": { + "column": "a", + "regex": "b", + "mostly": 0.6 + } + }, + { + "expectation_type": "expect_column_values_to_not_match_regex_list", + "kwargs": { + "column": "w", + "regex_list": [ + "\\s+" + ] + } + }, + { + "expectation_type": "expect_column_values_to_not_match_regex_list", + "kwargs": { + "column": "w", + "regex_list": [ + "\\s+", + "[a-zA-Z]" + ] + } + }, + { + "expectation_type": "expect_column_values_to_not_match_regex_list", + "kwargs": { + "column": "w", + "regex_list": [ + "[12]+", + "[45]+" + ] + } + }, + { + "expectation_type": "expect_column_values_to_not_match_regex_list", + "kwargs": { + "column": "x", + "regex_list": [ + "opatomus", + "ovat", + "h.*t" + ] + } + }, + { + "expectation_type": "expect_column_values_to_be_increasing", + "kwargs": { + "column": "x" + } + }, + { + "expectation_type": "expect_column_values_to_be_increasing", + "kwargs": { + "column": "y" + } + }, + { + "expectation_type": "expect_column_values_to_be_increasing", + "kwargs": { + "column": "y", + "strictly": true + } + }, + { + "expectation_type": "expect_column_values_to_be_increasing", + "kwargs": { + "column": "w" + } + }, + { + "expectation_type": "expect_column_values_to_be_increasing", + "kwargs": { + "column": "zz", + "parse_strings_as_datetimes": true + } + }, + { + "expectation_type": "expect_column_values_to_be_increasing", + "kwargs": { + "column": "a" + } + }, + { + "expectation_type": "expect_column_values_to_be_increasing", + "kwargs": { + "column": "b" + } + }, + { + "expectation_type": "expect_column_value_lengths_to_be_between", + "kwargs": { + "column": "s1", + "min_value": 4, + "max_value": 5 + } + }, + { + "expectation_type": "expect_column_value_lengths_to_be_between", + "kwargs": { + "column": "s2", + "min_value": 4, + "max_value": 8 + } + }, + 
{ + "expectation_type": "expect_column_value_lengths_to_be_between", + "kwargs": { + "column": "s2", + "min_value": 5, + "max_value": 9 + } + }, + { + "expectation_type": "expect_column_value_lengths_to_be_between", + "kwargs": { + "column": "s1", + "min_value": null, + "max_value": 5 + } + }, + { + "expectation_type": "expect_column_value_lengths_to_be_between", + "kwargs": { + "column": "s1", + "min_value": 4, + "max_value": null + } + }, + { + "expectation_type": "expect_column_value_lengths_to_be_between", + "kwargs": { + "column": "s1", + "min_value": 1, + "max_value": 0 + } + }, + { + "expectation_type": "expect_column_value_lengths_to_be_between", + "kwargs": { + "column": "s3", + "min_value": 4, + "max_value": 9 + } + }, + { + "expectation_type": "expect_column_value_lengths_to_be_between", + "kwargs": { + "min_value": "quack", + "max_value": 0, + "catch_exceptions": true + } + }, + { + "expectation_type": "expect_column_value_lengths_to_be_between", + "kwargs": { + "min_value": 0, + "max_value": "quack", + "catch_exceptions": true + } + }, + { + "expectation_type": "expect_column_value_lengths_to_be_between", + "kwargs": { + "column": "s1", + "min_value": null, + "max_value": null, + "catch_exceptions": true + } + }, + { + "expectation_type": "expect_column_value_lengths_to_be_between", + "kwargs": { + "column": "s4", + "min_value": 0, + "max_value": 1, + "catch_exceptions": true + } + }, + { + "expectation_type": "expect_column_values_to_be_unique", + "kwargs": { + "column": "unique" + } + }, + { + "expectation_type": "expect_column_values_to_be_unique", + "kwargs": { + "column": "a" + } + }, + { + "expectation_type": "expect_column_values_to_be_unique", + "kwargs": { + "column": "c", + "mostly": 0.3 + } + }, + { + "expectation_type": "expect_column_values_to_be_unique", + "kwargs": { + "column": "c", + "mostly": 0.3 + } + }, + { + "expectation_type": "expect_column_values_to_be_unique", + "kwargs": { + "column": "c", + "mostly": 0.4 + } + }, + { + "expectation_type": "expect_column_values_to_be_unique", + "kwargs": { + "column": "n" + } + }, + { + "expectation_type": "expect_column_values_to_be_unique", + "kwargs": { + "column": "c" + } + }, + { + "expectation_type": "expect_column_values_to_be_unique", + "kwargs": { + "column": "null" + } + }, + { + "expectation_type": "expect_column_values_to_be_unique", + "kwargs": { + "column": "mult_dup" + } + }, + { + "expectation_type": "expect_column_values_to_match_json_schema", + "kwargs": { + "column": "x", + "json_schema": {} + } + }, + { + "expectation_type": "expect_column_values_to_match_json_schema", + "kwargs": { + "column": "x", + "json_schema": { + "properties": { + "a": { + "type": "integer" + } + }, + "required": [ + "a" + ] + } + } + }, + { + "expectation_type": "expect_column_values_to_match_json_schema", + "kwargs": { + "column": "x", + "json_schema": { + "properties": { + "a": { + "type": "integer" + } + }, + "required": [ + "b" + ] + } + } + }, + { + "expectation_type": "expect_column_values_to_be_null", + "kwargs": { + "column": "all_null", + "result_format": "BASIC" + } + }, + { + "expectation_type": "expect_column_values_to_be_null", + "kwargs": { + "column": "no_null" + } + }, + { + "expectation_type": "expect_column_values_to_be_null", + "kwargs": { + "column": "three_null", + "mostly": 0.75 + } + }, + { + "expectation_type": "expect_column_values_to_be_null", + "kwargs": { + "column": "three_null", + "mostly": 0.8 + } + }, + { + "expectation_type": "expect_column_values_to_be_null", + "kwargs": { + "column": 
"all_null", + "mostly": 0.9 + } + }, + { + "expectation_type": "expect_column_values_to_be_null", + "kwargs": { + "column": "one_null" + } + }, + { + "expectation_type": "expect_column_values_to_be_in_type_list", + "kwargs": { + "column": "x", + "type_list": [ + "int" + ] + } + }, + { + "expectation_type": "expect_column_values_to_be_in_type_list", + "kwargs": { + "column": "x", + "type_list": [ + "string" + ] + } + }, + { + "expectation_type": "expect_column_values_to_be_in_type_list", + "kwargs": { + "column": "y", + "type_list": [ + "float" + ] + } + }, + { + "expectation_type": "expect_column_values_to_be_in_type_list", + "kwargs": { + "column": "z", + "type_list": [ + "string" + ] + } + }, + { + "expectation_type": "expect_column_values_to_be_in_type_list", + "kwargs": { + "column": "b", + "type_list": [ + "boolean" + ] + } + }, + { + "expectation_type": "expect_column_values_to_be_in_type_list", + "kwargs": { + "column": "s", + "type_list": [ + "string", + "int" + ] + } + }, + { + "expectation_type": "expect_column_stdev_to_be_between", + "kwargs": { + "column": "dist1", + "min_value": 0.5, + "max_value": 1.5 + } + }, + { + "expectation_type": "expect_column_stdev_to_be_between", + "kwargs": { + "column": "dist1", + "min_value": 1.1547005383792517, + "max_value": 1.1547005383792517 + } + }, + { + "expectation_type": "expect_column_stdev_to_be_between", + "kwargs": { + "column": "dist1", + "min_value": 0, + "max_value": 1 + } + }, + { + "expectation_type": "expect_column_stdev_to_be_between", + "kwargs": { + "column": "zero", + "min_value": 0, + "max_value": 0 + } + }, + { + "expectation_type": "expect_column_stdev_to_be_between", + "kwargs": { + "column": "dist2", + "min_value": 1, + "max_value": null + } + }, + { + "expectation_type": "expect_column_stdev_to_be_between", + "kwargs": { + "column": "dist2", + "min_value": null, + "max_value": 1 + } + }, + { + "expectation_type": "expect_column_stdev_to_be_between", + "kwargs": { + "column": "dist2", + "min_value": 1.5, + "max_value": null + } + }, + { + "expectation_type": "expect_column_stdev_to_be_between", + "kwargs": { + "column": "dist2", + "min_value": null, + "max_value": 0.5 + } + }, + { + "expectation_type": "expect_column_stdev_to_be_between", + "kwargs": { + "column": "missing", + "min_value": 1, + "max_value": 1, + "result_format": "COMPLETE" + } + }, + { + "expectation_type": "expect_column_stdev_to_be_between", + "kwargs": { + "column": "missing", + "min_value": null, + "max_value": null, + "catch_exceptions": true + } + }, + { + "expectation_type": "expect_column_most_common_value_to_be_in_set", + "kwargs": { + "column": "x", + "value_set": [ + 1 + ] + } + }, + { + "expectation_type": "expect_column_most_common_value_to_be_in_set", + "kwargs": { + "column": "x", + "value_set": [ + 1, + 2 + ] + } + }, + { + "expectation_type": "expect_column_most_common_value_to_be_in_set", + "kwargs": { + "column": "x", + "value_set": [ + 1 + ], + "ties_okay": true + } + }, + { + "expectation_type": "expect_column_most_common_value_to_be_in_set", + "kwargs": { + "column": "x", + "value_set": [ + 2 + ], + "ties_okay": true + } + }, + { + "expectation_type": "expect_column_most_common_value_to_be_in_set", + "kwargs": { + "column": "x", + "value_set": [ + 3 + ] + } + }, + { + "expectation_type": "expect_column_most_common_value_to_be_in_set", + "kwargs": { + "column": "x", + "value_set": [ + 3 + ], + "ties_okay": true + } + }, + { + "expectation_type": "expect_column_most_common_value_to_be_in_set", + "kwargs": { + "column": "y", + 
"value_set": [ + "jello", + "hello" + ] + } + }, + { + "expectation_type": "expect_column_most_common_value_to_be_in_set", + "kwargs": { + "column": "y", + "value_set": [ + "mello", + "hello" + ] + } + }, + { + "expectation_type": "expect_column_max_to_be_between", + "kwargs": { + "column": "w", + "result_format": "BASIC", + "min_value": 4, + "max_value": 6 + } + }, + { + "expectation_type": "expect_column_max_to_be_between", + "kwargs": { + "column": "w", + "result_format": "BASIC", + "min_value": null, + "max_value": 4 + } + }, + { + "expectation_type": "expect_column_max_to_be_between", + "kwargs": { + "column": "w", + "result_format": "SUMMARY", + "min_value": 0, + "max_value": 5 + } + }, + { + "expectation_type": "expect_column_max_to_be_between", + "kwargs": { + "column": "x", + "min_value": 3 + } + }, + { + "expectation_type": "expect_column_max_to_be_between", + "kwargs": { + "column": "w", + "min_value": 50 + } + }, + { + "expectation_type": "expect_column_max_to_be_between", + "kwargs": { + "column": "zz", + "min_value": "2/1/2016", + "max_value": "3/1/2016", + "parse_strings_as_datetimes": true, + "output_strftime_format": "%m/%d/%Y" + } + }, + { + "expectation_type": "expect_column_max_to_be_between", + "kwargs": { + "column": "zzz", + "min_value": "2/1/2016", + "max_value": "3/1/2016", + "parse_strings_as_datetimes": false + } + }, + { + "expectation_type": "expect_column_max_to_be_between", + "kwargs": { + "column": "z", + "min_value": "d", + "max_value": "f" + } + }, + { + "expectation_type": "expect_column_max_to_be_between", + "kwargs": { + "column": "empty_column", + "min_value": 0, + "max_value": 0, + "catch_exceptions": false + } + }, + { + "expectation_type": "expect_column_parameterized_distribution_ks_test_p_value_to_be_greater_than", + "kwargs": { + "column": "norm_std", + "result_format": "BASIC", + "params": { + "mean": 0, + "std_dev": 1 + }, + "distribution": "norm", + "p_value": 0.05 + } + }, + { + "expectation_type": "expect_column_parameterized_distribution_ks_test_p_value_to_be_greater_than", + "kwargs": { + "column": "norm_std", + "result_format": "BASIC", + "params": { + "mean": 1, + "std_dev": 1 + }, + "distribution": "norm", + "p_value": 0.05 + } + }, + { + "expectation_type": "expect_column_parameterized_distribution_ks_test_p_value_to_be_greater_than", + "kwargs": { + "column": "beta", + "result_format": "BASIC", + "params": { + "alpha": 0.5, + "beta": 10, + "loc": 5, + "scale": 11 + }, + "distribution": "beta", + "p_value": 0.05 + } + }, + { + "expectation_type": "expect_column_parameterized_distribution_ks_test_p_value_to_be_greater_than", + "kwargs": { + "column": "beta", + "result_format": "BASIC", + "params": [ + 0.5, + 10, + 5, + 11 + ], + "distribution": "beta", + "p_value": 0.05 + } + }, + { + "expectation_type": "expect_column_parameterized_distribution_ks_test_p_value_to_be_greater_than", + "kwargs": { + "column": "beta", + "result_format": "BASIC", + "params": { + "alpha": 1, + "beta": 11, + "loc": 5, + "scale": 11 + }, + "distribution": "beta", + "p_value": 0.05 + } + }, + { + "expectation_type": "expect_column_parameterized_distribution_ks_test_p_value_to_be_greater_than", + "kwargs": { + "column": "beta", + "result_format": "BASIC", + "params": [ + 1, + 11, + 5, + 11 + ], + "distribution": "beta", + "p_value": 0.05 + } + }, + { + "expectation_type": "expect_column_parameterized_distribution_ks_test_p_value_to_be_greater_than", + "kwargs": { + "column": "gamma", + "result_format": "BASIC", + "params": { + "alpha": 2, + "loc": 20, + "scale": 
3 + }, + "distribution": "gamma", + "p_value": 0.05 + } + }, + { + "expectation_type": "expect_column_parameterized_distribution_ks_test_p_value_to_be_greater_than", + "kwargs": { + "column": "gamma", + "result_format": "BASIC", + "params": [ + 2, + 20, + 3 + ], + "distribution": "gamma", + "p_value": 0.05 + } + }, + { + "expectation_type": "expect_column_parameterized_distribution_ks_test_p_value_to_be_greater_than", + "kwargs": { + "column": "gamma", + "result_format": "BASIC", + "params": { + "alpha": 3, + "loc": 20, + "scale": 3 + }, + "distribution": "gamma", + "p_value": 0.05 + } + }, + { + "expectation_type": "expect_column_parameterized_distribution_ks_test_p_value_to_be_greater_than", + "kwargs": { + "column": "gamma", + "result_format": "BASIC", + "params": [ + 3, + 20, + 3 + ], + "distribution": "gamma", + "p_value": 0.05 + } + }, + { + "expectation_type": "expect_column_parameterized_distribution_ks_test_p_value_to_be_greater_than", + "kwargs": { + "column": "uniform", + "result_format": "BASIC", + "params": { + "min": -5, + "max": 11 + }, + "distribution": "uniform", + "p_value": 0.05 + } + }, + { + "expectation_type": "expect_column_parameterized_distribution_ks_test_p_value_to_be_greater_than", + "kwargs": { + "column": "uniform", + "result_format": "BASIC", + "params": [ + -5, + 11 + ], + "distribution": "uniform", + "p_value": 0.05 + } + }, + { + "expectation_type": "expect_column_parameterized_distribution_ks_test_p_value_to_be_greater_than", + "kwargs": { + "column": "uniform", + "result_format": "BASIC", + "params": { + "min": -4, + "max": 12 + }, + "distribution": "uniform", + "p_value": 0.05 + } + }, + { + "expectation_type": "expect_column_parameterized_distribution_ks_test_p_value_to_be_greater_than", + "kwargs": { + "column": "uniform", + "result_format": "BASIC", + "params": [ + -4, + 12 + ], + "distribution": "uniform", + "p_value": 0.05 + } + }, + { + "expectation_type": "expect_column_parameterized_distribution_ks_test_p_value_to_be_greater_than", + "kwargs": { + "column": "chi2", + "result_format": "BASIC", + "params": { + "df": 30, + "loc": 3, + "scale": 5 + }, + "distribution": "chi2", + "p_value": 0.05 + } + }, + { + "expectation_type": "expect_column_parameterized_distribution_ks_test_p_value_to_be_greater_than", + "kwargs": { + "column": "chi2", + "result_format": "COMPLETE", + "params": [ + 30, + 3, + 5 + ], + "distribution": "chi2", + "p_value": 0.05 + } + }, + { + "expectation_type": "expect_column_parameterized_distribution_ks_test_p_value_to_be_greater_than", + "kwargs": { + "column": "chi2", + "result_format": "BASIC", + "params": { + "df": 33, + "loc": 3, + "scale": 5 + }, + "distribution": "chi2", + "p_value": 0.05 + } + }, + { + "expectation_type": "expect_column_parameterized_distribution_ks_test_p_value_to_be_greater_than", + "kwargs": { + "column": "chi2", + "result_format": "BASIC", + "params": [ + 33, + 3, + 5 + ], + "distribution": "chi2", + "p_value": 0.05 + } + }, + { + "expectation_type": "expect_column_parameterized_distribution_ks_test_p_value_to_be_greater_than", + "kwargs": { + "column": "exponential", + "result_format": "BASIC", + "params": { + "loc": 4.2, + "scale": 10 + }, + "distribution": "expon", + "p_value": 0.05 + } + }, + { + "expectation_type": "expect_column_parameterized_distribution_ks_test_p_value_to_be_greater_than", + "kwargs": { + "column": "exponential", + "result_format": "BASIC", + "params": [ + 4.2, + 10 + ], + "distribution": "expon", + "p_value": 0.05 + } + }, + { + "expectation_type": 
"expect_column_parameterized_distribution_ks_test_p_value_to_be_greater_than", + "kwargs": { + "column": "exponential", + "result_format": "BASIC", + "params": { + "loc": 5, + "scale": 10 + }, + "distribution": "expon", + "p_value": 0.05 + } + }, + { + "expectation_type": "expect_column_parameterized_distribution_ks_test_p_value_to_be_greater_than", + "kwargs": { + "column": "exponential", + "result_format": "BASIC", + "params": [ + 5, + 10 + ], + "distribution": "expon", + "p_value": 0.05 + } + }, + { + "expectation_type": "expect_column_mean_to_be_between", + "kwargs": { + "column": "x", + "min_value": 2, + "max_value": 5 + } + }, + { + "expectation_type": "expect_column_mean_to_be_between", + "kwargs": { + "column": "x", + "min_value": 1, + "max_value": 2 + } + }, + { + "expectation_type": "expect_column_mean_to_be_between", + "kwargs": { + "column": "y", + "min_value": 5, + "max_value": 5 + } + }, + { + "expectation_type": "expect_column_mean_to_be_between", + "kwargs": { + "column": "y", + "min_value": 4, + "max_value": 4 + } + }, + { + "expectation_type": "expect_column_mean_to_be_between", + "kwargs": { + "column": "z", + "min_value": 5, + "max_value": 5 + } + }, + { + "expectation_type": "expect_column_mean_to_be_between", + "kwargs": { + "column": "z", + "min_value": 13, + "max_value": 14 + } + }, + { + "expectation_type": "expect_column_mean_to_be_between", + "kwargs": { + "column": "n", + "min_value": 0, + "max_value": 0 + } + }, + { + "expectation_type": "expect_column_mean_to_be_between", + "kwargs": { + "column": "s", + "min_value": 0, + "max_value": 0, + "catch_exceptions": true + } + }, + { + "expectation_type": "expect_column_mean_to_be_between", + "kwargs": { + "column": "b", + "min_value": 0, + "max_value": 1 + } + }, + { + "expectation_type": "expect_column_mean_to_be_between", + "kwargs": { + "column": "x", + "min_value": 0, + "max_value": 1 + } + }, + { + "expectation_type": "expect_column_mean_to_be_between", + "kwargs": { + "column": "x", + "min_value": "s", + "catch_exceptions": true + } + }, + { + "expectation_type": "expect_column_mean_to_be_between", + "kwargs": { + "column": "x", + "max_value": "s", + "catch_exceptions": true + } + }, + { + "expectation_type": "expect_column_mean_to_be_between", + "kwargs": { + "column": "x", + "min_value": null, + "max_value": null, + "catch_exceptions": true + } + }, + { + "expectation_type": "expect_column_mean_to_be_between", + "kwargs": { + "column": "empty_column", + "min_value": 0, + "max_value": 1, + "catch_exceptions": false + } + }, + { + "expectation_type": "expect_column_sum_to_be_between", + "kwargs": { + "column": "w", + "result_format": "BASIC", + "min_value": 30, + "max_value": 30 + } + }, + { + "expectation_type": "expect_column_sum_to_be_between", + "kwargs": { + "column": "w", + "result_format": "BASIC", + "min_value": 40, + "max_value": 50 + } + }, + { + "expectation_type": "expect_column_sum_to_be_between", + "kwargs": { + "column": "w", + "result_format": "SUMMARY", + "min_value": 20, + "max_value": 40 + } + }, + { + "expectation_type": "expect_column_sum_to_be_between", + "kwargs": { + "column": "x", + "min_value": 30 + } + }, + { + "expectation_type": "expect_column_sum_to_be_between", + "kwargs": { + "column": "w", + "min_value": 50 + } + }, + { + "expectation_type": "expect_column_sum_to_be_between", + "kwargs": { + "column": "y", + "max_value": 20 + } + }, + { + "expectation_type": "expect_column_sum_to_be_between", + "kwargs": { + "column": "y", + "catch_exceptions": true + } + }, + { + 
"expectation_type": "expect_column_unique_value_count_to_be_between", + "kwargs": { + "column": "dist1", + "min_value": 0, + "max_value": 10 + } + }, + { + "expectation_type": "expect_column_unique_value_count_to_be_between", + "kwargs": { + "column": "dist2", + "min_value": 0, + "max_value": 10 + } + }, + { + "expectation_type": "expect_column_unique_value_count_to_be_between", + "kwargs": { + "column": "dist3", + "min_value": null, + "max_value": 10 + } + }, + { + "expectation_type": "expect_column_unique_value_count_to_be_between", + "kwargs": { + "column": "dist4", + "min_value": 2, + "max_value": null + } + }, + { + "expectation_type": "expect_column_unique_value_count_to_be_between", + "kwargs": { + "column": "dist1", + "min_value": null, + "max_value": null, + "catch_exceptions": true + } + }, + { + "expectation_type": "expect_column_median_to_be_between", + "kwargs": { + "column": "a", + "min_value": 2.5, + "max_value": 2.5 + } + }, + { + "expectation_type": "expect_column_median_to_be_between", + "kwargs": { + "column": "a", + "min_value": null, + "max_value": 3 + } + }, + { + "expectation_type": "expect_column_median_to_be_between", + "kwargs": { + "column": "a", + "min_value": 2, + "max_value": null + } + }, + { + "expectation_type": "expect_column_median_to_be_between", + "kwargs": { + "column": "b", + "min_value": 5, + "max_value": 5 + } + }, + { + "expectation_type": "expect_column_median_to_be_between", + "kwargs": { + "column": "b", + "min_value": null, + "max_value": 1 + } + }, + { + "expectation_type": "expect_column_median_to_be_between", + "kwargs": { + "column": "b", + "min_value": 2.5, + "max_value": null + } + }, + { + "expectation_type": "expect_column_median_to_be_between", + "kwargs": { + "column": "c", + "min_value": 6, + "max_value": 6, + "catch_exceptions": true + } + }, + { + "expectation_type": "expect_column_median_to_be_between", + "kwargs": { + "column": "c", + "min_value": 7, + "max_value": 7, + "result_format": "COMPLETE" + } + }, + { + "expectation_type": "expect_column_median_to_be_between", + "kwargs": { + "column": "c", + "min_value": null, + "max_value": null, + "catch_exceptions": true + } + }, + { + "expectation_type": "expect_column_median_to_be_between", + "kwargs": { + "column": "empty_column", + "min_value": 0, + "max_value": 0, + "catch_exceptions": false + } + }, + { + "expectation_type": "expect_column_min_to_be_between", + "kwargs": { + "column": "w", + "result_format": "BASIC", + "min_value": -10, + "max_value": 5 + } + }, + { + "expectation_type": "expect_column_min_to_be_between", + "kwargs": { + "column": "w", + "result_format": "BASIC", + "min_value": 4, + "max_value": null + } + }, + { + "expectation_type": "expect_column_min_to_be_between", + "kwargs": { + "column": "w", + "result_format": "SUMMARY", + "min_value": 0, + "max_value": 1 + } + }, + { + "expectation_type": "expect_column_min_to_be_between", + "kwargs": { + "column": "x", + "min_value": 1 + } + }, + { + "expectation_type": "expect_column_min_to_be_between", + "kwargs": { + "column": "w", + "min_value": 50 + } + }, + { + "expectation_type": "expect_column_min_to_be_between", + "kwargs": { + "column": "a", + "min_value": 1, + "max_value": 2 + } + }, + { + "expectation_type": "expect_column_min_to_be_between", + "kwargs": { + "column": "zz", + "min_value": "2/1/2016", + "max_value": "3/1/2016", + "parse_strings_as_datetimes": true + } + }, + { + "expectation_type": "expect_column_min_to_be_between", + "kwargs": { + "column": "zz", + "min_value": "2/1/2016", + "max_value": 
"3/1/2016", + "parse_strings_as_datetimes": true, + "output_strftime_format": "%m/%d/%Y" + } + }, + { + "expectation_type": "expect_column_min_to_be_between", + "kwargs": { + "column": "y", + "max_value": 0 + } + }, + { + "expectation_type": "expect_column_min_to_be_between", + "kwargs": { + "column": "y", + "catch_exceptions": true + } + }, + { + "expectation_type": "expect_column_proportion_of_unique_values_to_be_between", + "kwargs": { + "column": "dist1", + "min_value": 0.5, + "max_value": 1 + } + }, + { + "expectation_type": "expect_column_proportion_of_unique_values_to_be_between", + "kwargs": { + "column": "dist2", + "min_value": 0.5, + "max_value": 1 + } + }, + { + "expectation_type": "expect_column_proportion_of_unique_values_to_be_between", + "kwargs": { + "column": "dist3", + "min_value": 0.6, + "max_value": 0.7 + } + }, + { + "expectation_type": "expect_column_proportion_of_unique_values_to_be_between", + "kwargs": { + "column": "dist4", + "min_value": 0.3, + "max_value": null + } + }, + { + "expectation_type": "expect_column_proportion_of_unique_values_to_be_between", + "kwargs": { + "column": "dist1", + "min_value": null, + "max_value": null, + "catch_exceptions": true + } + }, + { + "expectation_type": "expect_table_row_count_to_equal", + "kwargs": [ + 4 + ] + }, + { + "expectation_type": "expect_table_row_count_to_equal", + "kwargs": [ + 5 + ] + }, + { + "expectation_type": "expect_table_row_count_to_equal", + "kwargs": [ + 0 + ] + }, + { + "expectation_type": "expect_table_row_count_to_equal", + "kwargs": { + "value": 3 + } + }, + { + "expectation_type": "expect_table_row_count_to_equal", + "kwargs": { + "value": "hello", + "catch_exceptions": true + } + }, + { + "expectation_type": "expect_column_to_exist", + "kwargs": { + "column": "c1" + } + }, + { + "expectation_type": "expect_column_to_exist", + "kwargs": { + "column": "covfefe" + } + }, + { + "expectation_type": "expect_column_to_exist", + "kwargs": { + "column": "c2", + "column_index": 1 + } + }, + { + "expectation_type": "expect_column_to_exist", + "kwargs": { + "column": "c3", + "column_index": 4 + } + }, + { + "expectation_type": "expect_column_to_exist", + "kwargs": [ + "a" + ] + }, + { + "expectation_type": "expect_table_row_count_to_be_between", + "kwargs": { + "min_value": 3, + "max_value": 5 + } + }, + { + "expectation_type": "expect_table_row_count_to_be_between", + "kwargs": { + "min_value": 0, + "max_value": 1 + } + }, + { + "expectation_type": "expect_table_row_count_to_be_between", + "kwargs": { + "min_value": null, + "max_value": 4 + } + }, + { + "expectation_type": "expect_table_row_count_to_be_between", + "kwargs": { + "min_value": 1, + "max_value": 0 + } + }, + { + "expectation_type": "expect_table_row_count_to_be_between", + "kwargs": { + "min_value": null, + "max_value": 10 + } + }, + { + "expectation_type": "expect_table_row_count_to_be_between", + "kwargs": { + "min_value": "quack", + "max_value": 0, + "catch_exceptions": true + } + }, + { + "expectation_type": "expect_table_row_count_to_be_between", + "kwargs": { + "max_value": "quack", + "catch_exceptions": true + } + }, + { + "expectation_type": "expect_table_columns_to_match_ordered_list", + "kwargs": { + "column_list": [ + "c1", + "c2", + "c3" + ] + } + }, + { + "expectation_type": "expect_table_columns_to_match_ordered_list", + "kwargs": { + "column_list": [ + "c1", + "c2" + ] + } + }, + { + "expectation_type": "expect_table_columns_to_match_ordered_list", + "kwargs": { + "column_list": [ + "b1", + "c2", + "c3" + ] + } + }, + { + 
"expectation_type": "expect_table_columns_to_match_ordered_list", + "kwargs": { + "column_list": [ + "c3", + "c2", + "c1" + ] + } + }, + { + "expectation_type": "expect_table_columns_to_match_ordered_list", + "kwargs": { + "column_list": [ + "c1", + "c2", + "c3", + "c4" + ] + } + }, + { + "expectation_type": "expect_multicolumn_values_to_be_unique", + "kwargs": { + "column_list": [ + "w", + "x" + ] + } + }, + { + "expectation_type": "expect_multicolumn_values_to_be_unique", + "kwargs": { + "column_list": [ + "w", + "x" + ], + "ignore_row_if": "any_value_is_missing" + } + }, + { + "expectation_type": "expect_multicolumn_values_to_be_unique", + "kwargs": { + "column_list": [ + "a", + "b" + ] + } + }, + { + "expectation_type": "expect_multicolumn_values_to_be_unique", + "kwargs": { + "column_list": [ + "a", + "b" + ] + } + }, + { + "expectation_type": "expect_column_pair_values_to_be_equal", + "kwargs": { + "column_A": "x", + "column_B": "x" + } + }, + { + "expectation_type": "expect_column_pair_values_to_be_equal", + "kwargs": { + "column_A": "x", + "column_B": "y" + } + }, + { + "expectation_type": "expect_column_pair_values_to_be_equal", + "kwargs": { + "column_A": "x", + "column_B": "y", + "mostly": 0.9 + } + }, + { + "expectation_type": "expect_column_pair_values_to_be_equal", + "kwargs": { + "column_A": "y", + "column_B": "z", + "ignore_row_if": "either_value_is_missing" + } + }, + { + "expectation_type": "expect_column_pair_values_to_be_equal", + "kwargs": { + "column_A": "w", + "column_B": "z", + "ignore_row_if": "both_values_are_missing" + } + }, + { + "expectation_type": "expect_column_pair_values_to_be_equal", + "kwargs": { + "column_A": "w", + "column_B": "z" + } + }, + { + "expectation_type": "expect_column_pair_values_to_be_equal", + "kwargs": { + "column_A": "w", + "column_B": "z" + } + }, + { + "expectation_type": "expect_column_pair_values_to_be_equal", + "kwargs": { + "column_A": "w", + "column_B": "z", + "mostly": 0.5 + } + }, + { + "expectation_type": "expect_column_pair_values_to_be_equal", + "kwargs": { + "column_A": "w", + "column_B": "z", + "ignore_row_if": "either_value_is_missing" + } + }, + { + "expectation_type": "expect_column_pair_values_to_be_in_set", + "kwargs": { + "column_A": "x", + "column_B": "x", + "value_pairs_set": [ + [ + 1, + 1 + ], + [ + 2, + 2 + ], + [ + 3, + 3 + ], + [ + 4, + 4 + ], + [ + 5, + 5 + ], + [ + 6, + 6 + ], + [ + 7, + 7 + ], + [ + 8, + 8 + ], + [ + 9, + 9 + ], + [ + 10, + 10 + ] + ] + } + }, + { + "expectation_type": "expect_column_pair_values_to_be_in_set", + "kwargs": { + "column_A": "x", + "column_B": "z", + "ignore_row_if": "either_value_is_missing", + "value_pairs_set": [ + [ + 1, + 1 + ], + [ + 2, + 2 + ], + [ + 3, + 3 + ], + [ + 4, + 4 + ], + [ + 5, + 5 + ] + ] + } + }, + { + "expectation_type": "expect_column_pair_values_to_be_in_set", + "kwargs": { + "column_A": "x", + "column_B": "z", + "value_pairs_set": [ + [ + 1, + 1 + ], + [ + 2, + 2 + ], + [ + 3, + 3 + ], + [ + 4, + 4 + ], + [ + 5, + 5 + ] + ] + } + }, + { + "expectation_type": "expect_column_pair_values_to_be_in_set", + "kwargs": { + "column_A": "a", + "column_B": "b", + "ignore_row_if": "both_values_are_missing", + "value_pairs_set": [ + [ + 1, + 1 + ], + [ + 2, + 2 + ], + [ + 2, + 1 + ], + [ + 1, + 2 + ] + ] + } + }, + { + "expectation_type": "expect_column_pair_values_A_to_be_greater_than_B", + "kwargs": { + "column_A": "x", + "column_B": "w" + } + }, + { + "expectation_type": "expect_column_pair_values_A_to_be_greater_than_B", + "kwargs": { + "column_A": "x", + 
"column_B": "z", + "ignore_row_if": "either_value_is_missing" + } + }, + { + "expectation_type": "expect_column_pair_values_A_to_be_greater_than_B", + "kwargs": { + "column_A": "w", + "column_B": "z", + "ignore_row_if": "either_value_is_missing" + } + }, + { + "expectation_type": "expect_column_pair_values_A_to_be_greater_than_B", + "kwargs": { + "column_A": "w", + "column_B": "z", + "or_equal": true, + "ignore_row_if": "either_value_is_missing" + } + }, + { + "expectation_type": "expect_column_pair_values_A_to_be_greater_than_B", + "kwargs": { + "column_A": "a", + "column_B": "b", + "parse_strings_as_datetimes": true, + "mostly": 0.6 + } + } +] \ No newline at end of file diff --git a/tests/test_ge_utils.py b/tests/test_ge_utils.py new file mode 100644 index 000000000000..8acb7dd50c97 --- /dev/null +++ b/tests/test_ge_utils.py @@ -0,0 +1,61 @@ +import pytest + +import great_expectations as ge + + +def test_validate_non_dataset(file_data_asset, empty_expectation_suite): + with pytest.raises(ValueError, match=r"The validate util method only supports dataset validations"): + ge.validate(file_data_asset, empty_expectation_suite, data_asset_type=ge.data_asset.FileDataAsset) + + +def test_validate_dataset(dataset, basic_expectation_suite): + res = ge.validate(dataset, basic_expectation_suite) + assert res["success"] == True + assert res["statistics"]["evaluated_expectations"] == 4 + if isinstance(dataset, ge.dataset.PandasDataset): + res = ge.validate(dataset, basic_expectation_suite, data_asset_type=ge.dataset.PandasDataset) + assert res["success"] == True + assert res["statistics"]["evaluated_expectations"] == 4 + with pytest.raises(ValueError, match=r"The validate util method only supports validation for subtypes of the provided data_asset_type"): + ge.validate(dataset, basic_expectation_suite, data_asset_type=ge.dataset.SqlAlchemyDataset) + + elif isinstance(dataset, ge.dataset.SqlAlchemyDataset): + res = ge.validate(dataset, basic_expectation_suite, data_asset_type=ge.dataset.SqlAlchemyDataset) + assert res["success"] == True + assert res["statistics"]["evaluated_expectations"] == 4 + with pytest.raises(ValueError, match=r"The validate util method only supports validation for subtypes of the provided data_asset_type"): + ge.validate(dataset, basic_expectation_suite, data_asset_type=ge.dataset.PandasDataset) + + elif isinstance(dataset, ge.dataset.SparkDFDataset): + res = ge.validate(dataset, basic_expectation_suite, data_asset_type=ge.dataset.SparkDFDataset) + assert res["success"] == True + assert res["statistics"]["evaluated_expectations"] == 4 + with pytest.raises(ValueError, match=r"The validate util method only supports validation for subtypes of the provided data_asset_type"): + ge.validate(dataset, basic_expectation_suite, data_asset_type=ge.dataset.PandasDataset) + +def test_validate_using_data_context(dataset, data_context): + # Before running, the data context should not have compiled parameters + assert data_context._compiled == False + res = ge.validate(dataset, data_asset_name="parameterized_expectation_suite_fixture", data_context=data_context) + + # After handling a validation result registration, it should be + assert data_context._compiled == True + + # And, we should have validated the right number of expectations from the context-provided config + assert res["success"] == False + assert res["statistics"]["evaluated_expectations"] == 2 + + +def test_validate_using_data_context_path(dataset, data_context): + data_context_path = data_context.root_directory + res = 
ge.validate(dataset, data_asset_name="parameterized_expectation_suite_fixture", data_context=data_context_path) + + # We should have now found the right suite with expectations to evaluate + assert res["success"] == False + assert res["statistics"]["evaluated_expectations"] == 2 + + + +def test_validate_invalid_parameters(dataset, basic_expectation_suite, data_context): + with pytest.raises(ValueError, match="Either an expectation suite or a DataContext is required for validation."): + ge.validate(dataset) \ No newline at end of file diff --git a/tests/test_great_expectations.py b/tests/test_great_expectations.py index eabde9984308..17f47776e994 100644 --- a/tests/test_great_expectations.py +++ b/tests/test_great_expectations.py @@ -2,15 +2,19 @@ import os import random import unittest +from datetime import datetime +try: + from unittest import mock +except ImportError: + import mock import math import pandas as pd import re import great_expectations as ge -from great_expectations.dataset.autoinspect import columns_exist from great_expectations.dataset import PandasDataset, MetaPandasDataset -from great_expectations.data_asset.base import ( +from great_expectations.data_asset.data_asset import ( _calc_validation_statistics, ValidationStatistics, ) @@ -190,21 +194,23 @@ class TestValidation(unittest.TestCase): def test_validate(self): with open("./tests/test_sets/titanic_expectations.json") as f: - my_expectations_config = json.load(f) + my_expectation_suite = json.load(f) my_df = ge.read_csv( "./tests/test_sets/Titanic.csv", - expectations_config=my_expectations_config + expectation_suite=my_expectation_suite ) my_df.set_default_expectation_argument("result_format", "COMPLETE") - results = my_df.validate(catch_exceptions=False) - # print json.dumps(results, indent=2) + with mock.patch("datetime.datetime") as mock_datetime: + mock_datetime.utcnow.return_value = datetime(1955, 11, 5) + results = my_df.validate(catch_exceptions=False) + # with open('./tests/test_sets/expected_cli_results_default.json') as f: with open('./tests/test_sets/expected_results_20180303.json') as f: expected_results = json.load(f) - # print json.dumps(expected_results, indent=2) + del results["meta"]["great_expectations.__version__"] self.maxDiff = None assertDeepAlmostEqual( results, @@ -219,11 +225,19 @@ def test_validate(self): # Finally, confirm that only_return_failures works # and does not affect the "statistics" field. 
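+ # Pin datetime.datetime.utcnow() so the run_id recorded in validation meta is the deterministic "1955-11-05T00:00:00".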
- validation_results = my_df.validate(only_return_failures=True) - # print json.dumps(validation_results) + with mock.patch("datetime.datetime") as mock_datetime: + mock_datetime.utcnow.return_value = datetime(1955, 11, 5) + validation_results = my_df.validate(only_return_failures=True) + del validation_results["meta"]["great_expectations.__version__"] assertDeepAlmostEqual( validation_results, - {"results": [ + { + "meta": { + "data_asset_name": None, + "expectation_suite_name": "default", + "run_id": "1955-11-05T00:00:00" + }, + "results": [ {"expectation_config": { "expectation_type": "expect_column_values_to_be_in_set", "kwargs": {"column": "PClass", "value_set": ["1st", "2nd", "3rd"], "result_format": "COMPLETE"} @@ -250,7 +264,8 @@ def test_validate_catch_non_existent_expectation(self): }) validation_config_non_existent_expectation = { - "dataset_name": None, + "data_asset_name": None, + "expectation_suite_name": "default", "meta": { "great_expectations.__version__": ge.__version__ }, @@ -262,7 +277,7 @@ def test_validate_catch_non_existent_expectation(self): }] } results = df.validate( - expectations_config=validation_config_non_existent_expectation)['results'] + expectation_suite=validation_config_non_existent_expectation)['results'] self.assertIn( "object has no attribute 'non_existent_expectation'", @@ -275,7 +290,8 @@ def test_validate_catch_invalid_parameter(self): }) validation_config_invalid_parameter = { - "dataset_name": None, + "data_asset_name": None, + "expectation_suite_name": "default", "meta": { "great_expectations.__version__": ge.__version__ }, @@ -289,7 +305,7 @@ def test_validate_catch_invalid_parameter(self): }] } - results = df.validate(expectations_config=validation_config_invalid_parameter)[ + results = df.validate(expectation_suite=validation_config_invalid_parameter)[ 'results'] print(results[0]['exception_info']) self.assertIn( @@ -297,89 +313,13 @@ def test_validate_catch_invalid_parameter(self): results[0]['exception_info']['exception_message'] ) - def test_top_level_validate(self): - my_df = pd.DataFrame({ - "x": [1, 2, 3, 4, 5] - }) - validation_result = ge.validate(my_df, { - "dataset_name": None, - "meta": { - "great_expectations.__version__": ge.__version__ - }, - "expectations": [{ - "expectation_type": "expect_column_to_exist", - "kwargs": { - "column": "x" - } - }, { - "expectation_type": "expect_column_values_to_be_between", - "kwargs": { - "column": "x", - "min_value": 3, - "max_value": 5 - } - }] - }) - self.assertEqual( - validation_result, - { - "results": [ - { - "expectation_config": { - "kwargs": { - "column": "x" - }, - "expectation_type": "expect_column_to_exist", - }, - "exception_info": {"exception_message": None, - "exception_traceback": None, - "raised_exception": False}, - "success": True - }, - { - "expectation_config": { - "expectation_type": "expect_column_values_to_be_between", - "kwargs": { - "column": "x", - "max_value": 5, - "min_value": 3 - } - }, - "exception_info": {"exception_message": None, - "exception_traceback": None, - "raised_exception": False}, - "success": False, - "result": {'element_count': 5, - 'missing_count': 0, - 'missing_percent': 0.0, - "unexpected_percent": 0.4, - "partial_unexpected_list": [ - 1, - 2 - ], - "unexpected_percent_nonmissing": 0.4, - "unexpected_count": 2 - } - } - ], - "success": False, - "statistics": { - "evaluated_expectations": 2, - "successful_expectations": 1, - "unsuccessful_expectations": 1, - "success_percent": 50, - } - } - ) - - class 
TestValidationStatisticsCalculation(unittest.TestCase): def test_no_expectations(self): expectation_results = [] actual = _calc_validation_statistics(expectation_results) # pay attention to these two - self.assertTrue(math.isnan(actual.success_percent)) + self.assertEqual(actual.success_percent, None) self.assertEqual(actual.success, True) # the rest is boring self.assertEqual(actual.successful_expectations, 0) @@ -434,20 +374,20 @@ class TestRepeatedAppendExpectation(unittest.TestCase): def test_validate(self): with open("./tests/test_sets/titanic_expectations.json") as f: - my_expectations_config = json.load(f) + my_expectation_suite = json.load(f) my_df = ge.read_csv("./tests/test_sets/Titanic.csv", - autoinspect_func=columns_exist) + profiler=ge.profile.ColumnsExistProfiler) self.assertEqual( - len(my_df.get_expectations_config()['expectations']), + len(my_df.get_expectation_suite()['expectations']), 7 ) # For column_expectations, _append_expectation should only replace expectations where the expectation_type AND the column match my_df.expect_column_to_exist("PClass") self.assertEqual( - len(my_df.get_expectations_config()['expectations']), + len(my_df.get_expectation_suite()['expectations']), 7 ) diff --git a/tests/test_pandas_dataset.py b/tests/test_pandas_dataset.py index f2b051b61bd9..d88b57145961 100644 --- a/tests/test_pandas_dataset.py +++ b/tests/test_pandas_dataset.py @@ -5,7 +5,7 @@ import datetime import pandas as pd import great_expectations as ge -import great_expectations.dataset.autoinspect as autoinspect +from great_expectations.profile import ColumnsExistProfiler from .test_utils import assertDeepAlmostEqual @@ -270,28 +270,6 @@ class CustomPandasDataset(ge.dataset.PandasDataset): assert list(ge_df_custom['z'])==list(pd_df['z']) -def test_from_pandas_expectations_config(): - # Logic mostly copied from TestValidation.test_validate - def load_ge_config(file): - with open(file) as f: - return json.load(f) - - my_expectations_config = load_ge_config( - "./tests/test_sets/titanic_expectations.json") - - pd_df = pd.read_csv("./tests/test_sets/Titanic.csv") - my_df = ge.from_pandas(pd_df, expectations_config=my_expectations_config) - - my_df.set_default_expectation_argument("result_format", "COMPLETE") - - results = my_df.validate(catch_exceptions=False) - - expected_results = load_ge_config( - "./tests/test_sets/expected_results_20180303.json") - - assertDeepAlmostEqual(results, expected_results) - - def test_ge_pandas_concatenating_no_autoinspect(): df1 = ge.dataset.PandasDataset({ 'A': ['A0', 'A1', 'A2'], @@ -412,7 +390,7 @@ def test_ge_pandas_sampling(): }) # Put some simple expectations on the data frame - df.autoinspect(autoinspect_func=autoinspect.columns_exist) + df.profile(profiler=ColumnsExistProfiler) df.expect_column_values_to_be_in_set("A", [1, 2, 3, 4]) df.expect_column_values_to_be_in_set("B", [5, 6, 7, 8]) df.expect_column_values_to_be_in_set("C", ['a', 'b', 'c', 'd']) @@ -522,7 +500,7 @@ def test_ge_pandas_automatic_failure_removal(): }) # Put some simple expectations on the data frame - df.autoinspect(autoinspect.columns_exist) + df.profile(ge.profile.ColumnsExistProfiler) df.expect_column_values_to_be_in_set("A", [1, 2, 3, 4]) df.expect_column_values_to_be_in_set("B", [5, 6, 7, 8]) df.expect_column_values_to_be_in_set("C", ['w', 'x', 'y', 'z']) diff --git a/tests/test_parameter_substitution.py b/tests/test_parameter_substitution.py index 7a88eba10540..23fb9cc984f2 100644 --- a/tests/test_parameter_substitution.py +++ b/tests/test_parameter_substitution.py @@
-1,6 +1,6 @@ """ Test the expectation decorator's ability to substitute parameters -at evaluation time, and store parameters in expectations_config +at evaluation time, and store parameters in expectation_suite """ import pytest @@ -55,13 +55,13 @@ def test_parameter_substitution(single_expectation_custom_data_asset): # Establish our expectation using that parameter result = single_expectation_custom_data_asset.expect_nothing( expectation_argument={"$PARAMETER": "upstream_dag_key"}) - config = single_expectation_custom_data_asset.get_expectations_config() + suite = single_expectation_custom_data_asset.get_expectation_suite() - # Ensure our value has been substituted during evaluation, and set properly in the config + # Ensure our value has been substituted during evaluation, and set properly in the suite assert result["result"]["details"]["expectation_argument"] == "upstream_dag_value" - assert config["evaluation_parameters"] == { + assert suite["evaluation_parameters"] == { "upstream_dag_key": "upstream_dag_value"} - assert config["expectations"][0]["kwargs"] == { + assert suite["expectations"][0]["kwargs"] == { "expectation_argument": {"$PARAMETER": "upstream_dag_key"}} @@ -71,12 +71,12 @@ def test_exploratory_parameter_substitution(single_expectation_custom_data_asset result = single_expectation_custom_data_asset.expect_nothing( expectation_argument={"$PARAMETER": "upstream_dag_key", "$PARAMETER.upstream_dag_key": "temporary_value"}) - config = single_expectation_custom_data_asset.get_expectations_config() - # Ensure our value has been substituted during evaluation, and NOT stored in the config + suite = single_expectation_custom_data_asset.get_expectation_suite() + # Ensure our value has been substituted during evaluation, and NOT stored in the suite assert result["result"]["details"]["expectation_argument"] == "temporary_value" - assert "evaluation_parameters" not in config or config["evaluation_parameters"] == { + assert "evaluation_parameters" not in suite or suite["evaluation_parameters"] == { } - assert config["expectations"][0]["kwargs"] == { + assert suite["expectations"][0]["kwargs"] == { "expectation_argument": {"$PARAMETER": "upstream_dag_key"}} # Evaluating the expectation without the parameter should now fail, because no parameters were set diff --git a/tests/test_profile.py b/tests/test_profile.py new file mode 100644 index 000000000000..171032528f5e --- /dev/null +++ b/tests/test_profile.py @@ -0,0 +1,164 @@ +import pytest + +import json +import os +import shutil + +from great_expectations.profile.base import DatasetProfiler +from great_expectations.profile.basic_dataset_profiler import BasicDatasetProfiler +from great_expectations.profile.columns_exist import ColumnsExistProfiler +from great_expectations.dataset.pandas_dataset import PandasDataset +from great_expectations.data_context import DataContext +from great_expectations.data_context.util import safe_mmkdir + +# Tests to write: +# test_cli_method_works -> test_cli +# test context-based profile methods +# test class-based profile methods + + +def test_DataSetProfiler_methods(): + toy_dataset = PandasDataset({"x": [1, 2, 3]}) + + assert DatasetProfiler.validate(1) == False + assert DatasetProfiler.validate(toy_dataset) + + with pytest.raises(NotImplementedError) as e_info: + DatasetProfiler.profile(toy_dataset) + + +def test_ColumnsExistProfiler(): + toy_dataset = PandasDataset({"x": [1, 2, 3]}) + expectations_config, evr_config = ColumnsExistProfiler.profile(toy_dataset) + + print(json.dumps(expectations_config, 
indent=2)) + # FIXME: REVISIT THIS TEST FOR CONTENT + # TODO: REVISIT THIS TEST FOR CONTENT + + # assert expectations_config == { + # "data_asset_name": None, + # "data_asset_type": "Dataset", + # "meta": { + # "great_expectations.__version__": "0.7.0-beta", + # "ColumnsExistProfiler": { + # "created_by": "BasicDatasetProfiler", + # "created_at": 0, + # }, + # }, + # "expectations": [ + # { + # "expectation_type": "expect_column_to_exist", + # "kwargs": { + # "column": "x" + # } + # } + # ] + # } + + +def test_BasicDatasetProfiler(): + toy_dataset = PandasDataset({"x": [1, 2, 3]}) + assert len(toy_dataset.get_expectation_suite( + suppress_warnings=True)["expectations"]) == 0 + + expectations_config, evr_config = BasicDatasetProfiler.profile(toy_dataset) + + # print(json.dumps(expectations_config, indent=2)) + + assert len(toy_dataset.get_expectation_suite( + suppress_warnings=True)["expectations"]) > 0 + + # We should add an additional test that instantiates the batch via context, so the data_asset_name will be populated. + assert expectations_config["data_asset_name"] == None + assert "BasicDatasetProfiler" in expectations_config["meta"] + # We should add an additional test that instantiates the batch via context, so that batch_kwargs will be populated. + assert set(expectations_config["meta"]["BasicDatasetProfiler"].keys()) == { + "created_by", "created_at" + } + for exp in expectations_config["expectations"]: + assert "BasicDatasetProfiler" in exp["meta"] + assert exp["meta"]["BasicDatasetProfiler"] == { + "confidence": "very low" + } + + # FIXME: REVISIT THIS TEST FOR CONTENT + # TODO: REVISIT THIS TEST FOR CONTENT + + + # Example: + # { + # "data_asset_name": "notable_works_by_charles_dickens", + # "meta": { + # "great_expectations.__version__": "0.7.0-beta", + # "BasicDatasetProfiler": { + # "created_by": "BasicDatasetProfiler", + # "created_at": 0, + # "batch_kwargs": {}, + # }, + # }, + # "expectations": [ + # { + # "expectation_type": "expect_column_to_exist", + # "meta": { + # "BasicDatasetProfiler": { + # "confidence": "very low" + # } + # } + # }] + # } + + +def test_BasicDatasetProfiler_with_context(empty_data_context, filesystem_csv_2): + empty_data_context.add_datasource( + "my_datasource", "pandas", base_directory=str(filesystem_csv_2)) + not_so_empty_data_context = empty_data_context + + batch = not_so_empty_data_context.get_batch("my_datasource/f1") + expectations_config, validation_results = BasicDatasetProfiler.profile( + batch) + + # print(batch.get_batch_kwargs()) + # print(json.dumps(expectations_config, indent=2)) + + assert expectations_config["data_asset_name"] == "my_datasource/default/f1" + assert expectations_config["expectation_suite_name"] == "default" + assert "BasicDatasetProfiler" in expectations_config["meta"] + assert set(expectations_config["meta"]["BasicDatasetProfiler"].keys()) == { + "created_by", "created_at", "batch_kwargs" + } + + for exp in expectations_config["expectations"]: + assert "BasicDatasetProfiler" in exp["meta"] + assert exp["meta"]["BasicDatasetProfiler"] == { + "confidence": "very low" + } + + assert validation_results["meta"]["data_asset_name"] == "my_datasource/default/f1" + assert set(validation_results["meta"].keys()) == { + "great_expectations.__version__", "data_asset_name", "expectation_suite_name", "run_id", "batch_kwargs" + } + + +def test_context_profiler(empty_data_context, filesystem_csv_2): + empty_data_context.add_datasource( + "my_datasource", "pandas", base_directory=str(filesystem_csv_2)) + not_so_empty_data_context 
= empty_data_context + + assert not_so_empty_data_context.list_expectation_suites() == {} + not_so_empty_data_context.profile_datasource("my_datasource") + + assert "my_datasource" in not_so_empty_data_context.list_expectation_suites() + + profiled_expectations = not_so_empty_data_context.get_expectation_suite('f1', "BasicDatasetProfiler") + + # FIXME: REVISIT THIS TEST FOR CONTENT + # TODO: REVISIT THIS TEST FOR CONTENT + assert len(profiled_expectations["expectations"]) > 0 + + # print(json.dumps(validation_results, indent=2)) + + # # Note: deliberately not testing context file storage in this test. + # context_expectations_config = not_so_empty_data_context.get_expectation_suite( + # "my_datasource", "f1") + + # assert context_expectations_config == profiled_expectations diff --git a/tests/test_render.py b/tests/test_render.py new file mode 100644 index 000000000000..bdf04e46ffe6 --- /dev/null +++ b/tests/test_render.py @@ -0,0 +1,250 @@ +# See test_render_new.py + +# import pytest +# import unittest +# import json + +# import great_expectations as ge +# from great_expectations import render +# from .test_utils import assertDeepAlmostEqual + +# @pytest.fixture(scope="module") +# def simple_pandas_dataframe(): +# return ge.dataset.PandasDataset({"a": [1,2,3,4]}) + +# def test_render_single_evr(simple_pandas_dataframe): +# res = simple_pandas_dataframe.expect_column_values_to_be_in_set("a", [1,2,3], include_config=True, result_format="SUMMARY") +# html = render.view_models.DescriptiveEvrColumnSectionRenderer.render([res], res["expectation_config"]["expectation_type"], +# mode='widget') +# print(html) +# with open('./test.html', 'w') as f: +# f.write(html) + +# assert True + +# class TestPageRenderers(unittest.TestCase): + +# def test_import(self): +# from great_expectations import render + +# def test_prescriptive_expectation_renderer(self): +# expectation_suite = json.load( +# open('tests/test_fixtures/rendering_fixtures/expectation_suite_3.json') +# ) +# results = render.view_models.PrescriptiveExpectationPageRenderer().render( +# expectation_suite, +# ) +# assert results != None +# assert "
<li>is a required field.</li>" in results +# assert '<li>must have at least 0 unique values.</li>' in results + +# with open('./test.html', 'w') as f: +# f.write(results) + +# def test_descriptive_evr_renderer(self): +# rendered_page = render.view_models.DescriptiveEvrPageRenderer().render( +# json.load( +# open('tests/test_fixtures/rendering_fixtures/evr_suite_3.json') +# )["results"], +# ) +# assert rendered_page != None + +# with open('./test.html', 'w') as f: +# f.write(rendered_page) + +# def test_full_oobe_flow(self): +# df = ge.read_csv("examples/data/Titanic.csv") +# # df = ge.read_csv("examples/data/Meteorite_Landings.csv") +# df.autoinspect(ge.dataset.autoinspect.pseudo_pandas_profiling) +# # df.autoinspect(ge.dataset.autoinspect.columns_exist) +# evrs = df.validate()["results"] +# # print(json.dumps(evrs, indent=2)) + +# rendered_page = render.compile_to_documentation( +# evrs, +# render.view_models.DescriptiveEvrPageRenderer, +# ) +# assert rendered_page != None + +# # with open('./test.html', 'w') as f: +# # f.write(rendered_page) + + +# class TestSectionRenderers(unittest.TestCase): + +# def test_render_modes(self): +# # df = ge.read_csv("examples/data/Meteorite_Landings.csv") +# # df.autoinspect(ge.dataset.autoinspect.pseudo_pandas_profiling) +# # expectations_list = df.get_expectation_suite()["expectations"] + +# expectations_list = json.load( +# open('tests/test_fixtures/rendering_fixtures/expectation_suite_3.json') +# )["expectations"] + +# # print( json.dumps(expectations_list, indent=2) ) + +# # evrs = df.validate()["results"] +# # print( json.dumps(evrs, indent=2) ) + +# R = render.view_models.default.section.prescriptive.PrescriptiveExpectationColumnSectionRenderer +# rendered_section = R.render( +# expectations_list +# ) +# assert rendered_section != None +# assert json.dumps(rendered_section) +# # print( json.dumps(rendered_section, indent=2) ) + +# rendered_section = R.render( +# expectations_list, +# 'html' +# ) +# # print(rendered_section) + +# assert "<li>is a required field.</li>" in rendered_section +# # assert False + + +# class TestSnippetRenderers(unittest.TestCase): + +# def test_util_render_parameter(self): +# #!!! More tests needed here, eventually. +# assert render.snippets.util.render_parameter( +# 100, "d") == '100' + +# def test_basics(self): +# #!!! Many more tests needed here, eventually. + +# result = render.snippets.expectation_bullet_point.ExpectationBulletPointSnippetRenderer.render({ +# "expectation_type": "expect_column_to_exist", +# "kwargs": {"column": "x_var"} +# }, include_column_name=True) +# print(result) +# assert result == "x_var is a required field." + +# result = render.snippets.expectation_bullet_point.ExpectationBulletPointSnippetRenderer.render( +# { +# "expectation_type": "expect_column_value_lengths_to_be_between", +# "kwargs": { +# "column": "last_name", +# "min_value": 3, +# "max_value": 20, +# "mostly": .95 +# } +# }, include_column_name=False) +# print(result) +# assert result == ' must be between 3 and 20 characters long at least 0.9% of the time.' + + +# class TestContentBlockRenderers(unittest.TestCase): +# result = render.snippets.evr_content_block.EvrContentBlockSnippetRenderer.render( +# { +# 'success': False, +# 'result': { +# 'element_count': 45716, +# 'missing_count': 0, +# 'missing_percent': 0.0, +# 'unexpected_count': 45716, +# 'unexpected_percent': 1.0, +# 'unexpected_percent_nonmissing': 1.0, +# 'partial_unexpected_list': [ +# 'Valid', +# 'Valid', +# 'Valid', +# 'Valid', +# 'Valid', +# 'Valid', +# 'Valid', +# 'Valid', +# 'Valid', +# 'Valid', +# 'Valid', +# 'Valid', +# 'Valid', +# 'Valid', +# 'Valid', +# 'Valid', +# 'Valid', +# 'Valid', +# 'Valid', +# 'Valid' +# ], +# 'partial_unexpected_index_list': [ +# 0, +# 1, +# 2, +# 3, +# 4, +# 5, +# 6, +# 7, +# 8, +# 9, +# 10, +# 11, +# 12, +# 13, +# 14, +# 15, +# 16, +# 17, +# 18, +# 19 +# ], +# 'partial_unexpected_counts': [{'value': 'Valid', 'count': 45641}, +# {'value': 'Relict', 'count': 75}] +# }, +# 'exception_info': { +# 'raised_exception': False, +# 'exception_message': None, +# 'exception_traceback': None +# }, +# 'expectation_config': { +# 'expectation_type': 'expect_column_values_to_be_in_set', +# 'kwargs': { +# 'column': 'nametype', +# 'value_set': [], +# 'result_format': 'SUMMARY' +# } +# } +# }, +# "partial_unexpected_counts" +# ) +# print(json.dumps(result)) + +# # assert json.dumps(result) == """{"content_block_type": "graph", "content": [{"$schema": "https://vega.github.io/schema/vega-lite/v2.6.0.json", "config": {"view": {"height": 300, "width": 400}}, "datasets": {"data-cfff8a6fe8134dace707fd67405d0857": [{"count": 45641, "value": "Valid"}, {"count": 75, "value": "Relict"}]}, "height": 900, "layer": [{"data": {"name": "data-cfff8a6fe8134dace707fd67405d0857"}, "encoding": {"x": {"field": "count", "type": "quantitative"}, "y": {"field": "value", "type": "ordinal"}}, "height": 80, "mark": "bar", "width": 240}, {"data": {"name": "data-cfff8a6fe8134dace707fd67405d0857"}, "encoding": {"text": {"field": "count", "type": "quantitative"}, "x": {"field": "count", "type": "quantitative"}, "y": {"field": "value", "type": "ordinal"}}, "height": 80, "mark": {"align": "left", "baseline": "middle", "dx": 3, "type": "text"}, "width": 240}]}]}""" +# # assertDeepAlmostEqual( +# # result, +# # { +# # "content_block_type": "graph", +# # "content": [{ +# # "$schema": "https://vega.github.io/schema/vega-lite/v2.6.0.json", +# # "config": {"view": {"height": 300, "width": 400}}, +# # "datasets": { +# # "data-cfff8a6fe8134dace707fd67405d0857": [ +# # {"count": 45641, "value": 
"Valid"}, { +# # "count": 75, "value": "Relict"} +# # ]}, +# # "height": 900, +# # "layer": [{ +# # "data": {"name": "data-cfff8a6fe8134dace707fd67405d0857"}, +# # "encoding": { +# # "x": {"field": "count", "type": "quantitative"}, +# # "y": {"field": "value", "type": "ordinal"} +# # }, +# # "height": 80, +# # "mark": "bar", +# # "width": 240 +# # }, { +# # "data": {"name": "data-cfff8a6fe8134dace707fd67405d0857"}, +# # "encoding": { +# # "text": {"field": "count", "type": "quantitative"}, +# # "x": {"field": "count", "type": "quantitative"}, +# # "y": {"field": "value", "type": "ordinal"} +# # }, +# # "height": 80, +# # "mark": {"align": "left", "baseline": "middle", "dx": 3, "type": "text"}, +# # "width": 240 +# # }] +# # }] +# # } +# # ) diff --git a/tests/test_render_new.py b/tests/test_render_new.py new file mode 100644 index 000000000000..c6ee42bf21aa --- /dev/null +++ b/tests/test_render_new.py @@ -0,0 +1,95 @@ +import pytest + +import json + +import great_expectations as ge +from great_expectations.render.renderer import DescriptivePageRenderer, DescriptiveColumnSectionRenderer, PrescriptiveColumnSectionRenderer +from great_expectations.render.view import DescriptivePageView +from great_expectations.render.renderer.content_block import ValueListContentBlock +from great_expectations.profile.basic_dataset_profiler import BasicDatasetProfiler + + +@pytest.fixture() +def validation_results(): + with open("./tests/test_sets/expected_cli_results_default.json", "r") as infile: + return json.load(infile) + + +@pytest.fixture() +def expectations(): + with open("./tests/test_sets/titanic_expectations.json", "r") as infile: + return json.load(infile) + + +def test_render_descriptive_page_renderer(validation_results): + print(json.dumps(DescriptivePageRenderer.render(validation_results), indent=2)) + # TODO: Use above print to set up snapshot test once we like the result + assert True + + +def test_render_descriptive_page_view(validation_results): + renderer = DescriptivePageRenderer.render(validation_results) + print(DescriptivePageView.render(renderer)) + # TODO: Use above print to set up snapshot test once we like the result + assert True + + +def test_render_descriptive_column_section_renderer(validation_results): + # Group EVRs by column + evrs = {} + for evr in validation_results["results"]: + try: + column = evr["expectation_config"]["kwargs"]["column"] + if column not in evrs: + evrs[column] = [] + evrs[column].append(evr) + except KeyError: + pass + + for column in evrs.keys(): + print(json.dumps(DescriptiveColumnSectionRenderer.render( + evrs[column]), indent=2)) + # TODO: Use above print to set up snapshot test once we like the result + assert True + + +def test_render_prescriptive_column_section_renderer(expectations): + # Group expectations by column + exp_groups = {} + # print(json.dumps(expectations, indent=2)) + for exp in expectations["expectations"]: + try: + column = exp["kwargs"]["column"] + if column not in exp_groups: + exp_groups[column] = [] + exp_groups[column].append(exp) + except KeyError: + pass + + for column in exp_groups.keys(): + print(column) + print(json.dumps(PrescriptiveColumnSectionRenderer.render( + exp_groups[column]), indent=2)) + # TODO: Use above print to set up snapshot test once we like the result + assert True + + +def test_content_block_list_available_expectations(expectations): + available_expectations = ValueListContentBlock.list_available_expectations() + assert available_expectations == ['expect_column_values_to_be_in_set'] + + +def 
test_full_oobe_flow(): + df = ge.read_csv("examples/data/Titanic.csv") + # df = ge.read_csv("examples/data/Meteorite_Landings.csv") + df.profile(BasicDatasetProfiler) + # df.autoinspect(ge.dataset.autoinspect.columns_exist) + evrs = df.validate() # ["results"] + # print(json.dumps(evrs, indent=2)) + + rendered_json = DescriptivePageRenderer.render(evrs) + rendered_page = DescriptivePageView.render(rendered_json) + assert rendered_page != None + + with open('./test.html', 'w') as f: + f.write(rendered_page) diff --git a/tests/test_sets/expected_cli_results_custom.json b/tests/test_sets/expected_cli_results_custom.json index ba2daa34c203..523fedd38bf8 100644 --- a/tests/test_sets/expected_cli_results_custom.json +++ b/tests/test_sets/expected_cli_results_custom.json @@ -1,4 +1,10 @@ -{ "results": [ +{ + "meta": { + "data_asset_name": null, + "expectation_suite_name": "default", + "run_id": "1955-11-05T00:00:00" + }, + "results": [ { "expectation_config": { "expectation_type": "expect_column_values_to_have_odd_lengths", diff --git a/tests/test_sets/expected_cli_results_default.json b/tests/test_sets/expected_cli_results_default.json index 98ff9236aff7..4e9c381de5e2 100644 --- a/tests/test_sets/expected_cli_results_default.json +++ b/tests/test_sets/expected_cli_results_default.json @@ -1,4 +1,9 @@ { + "meta": { + "data_asset_name": null, + "expectation_suite_name": "default", + "run_id": "1955-11-05T00:00:00" + }, "results": [ { "expectation_config": { diff --git a/tests/test_sets/expected_results_20180303.json b/tests/test_sets/expected_results_20180303.json index 68ae145d3d85..b7ac6fe1a728 100644 --- a/tests/test_sets/expected_results_20180303.json +++ b/tests/test_sets/expected_results_20180303.json @@ -1,4 +1,9 @@ { + "meta": { + "data_asset_name": null, + "expectation_suite_name": "default", + "run_id": "1955-11-05T00:00:00" + }, "results": [ { "expectation_config": { diff --git a/tests/test_sets/titanic_custom_expectations.json b/tests/test_sets/titanic_custom_expectations.json index 1293b4afd47c..88012abdb6bc 100644 --- a/tests/test_sets/titanic_custom_expectations.json +++ b/tests/test_sets/titanic_custom_expectations.json @@ -7,5 +7,6 @@ } } ], - "dataset_name": null + "data_asset_name": null, + "expectation_suite_name": "default" } \ No newline at end of file diff --git a/tests/test_sets/titanic_expectations.json b/tests/test_sets/titanic_expectations.json index 660c63af1425..bfacf46d8840 100644 --- a/tests/test_sets/titanic_expectations.json +++ b/tests/test_sets/titanic_expectations.json @@ -72,5 +72,6 @@ } } ], - "dataset_name": null + "data_asset_name": null, + "expectation_suite_name": "default" } diff --git a/tests/test_sets/titanic_parameterized_expectations.json b/tests/test_sets/titanic_parameterized_expectations.json index a85c2a2428e8..a540d34a3baa 100644 --- a/tests/test_sets/titanic_parameterized_expectations.json +++ b/tests/test_sets/titanic_parameterized_expectations.json @@ -72,5 +72,6 @@ } } ], - "dataset_name": null + "data_asset_name": null, + "expectation_suite_name": "default" } diff --git a/tests/test_sets/unicode.csv b/tests/test_sets/unicode.csv new file mode 100644 index 000000000000..00657aa53f82 --- /dev/null +++ b/tests/test_sets/unicode.csv @@ -0,0 +1,2 @@ +Μ,Α,Θ +😁,😅,😉 \ No newline at end of file diff --git a/tests/test_slack.py b/tests/test_slack.py new file mode 100644 index 000000000000..e49ee178bfd5 --- /dev/null +++ b/tests/test_slack.py @@ -0,0 +1,215 @@ +import pytest +try: + from unittest import mock +except ImportError: + import mock + +from 
great_expectations.data_context.util import get_slack_callback, build_slack_notification_request + + +@pytest.fixture +def validation_json(): + return { + "meta": { + "data_asset_name": "diabetes_raw_csv", + "run_id": 7, + "result_reference": "s3://my_bucket/blah.json", + "dataset_reference": "s3://my_bucket/blah.csv", + }, + "statistics": {"successful_expectations": 33, "evaluated_expectations": 44}, + "success": True, + } + + +def test_get_slack_callback_returns_callable(): + obs = get_slack_callback("foo") + assert callable(obs) + + +def test_build_slack_notification_request_with_no_validation_json(): + with mock.patch("datetime.datetime") as mock_datetime: + mock_datetime.strftime.return_value = "05/05/19 12:12:12" + obs = build_slack_notification_request(None) + + assert isinstance(obs, dict) + assert obs == { + "blocks": [ + { + "type": "section", + "text": { + "type": "mrkdwn", + "text": "No validation occurred. Please ensure you passed a validation_json.", + }, + }, + { + "type": "context", + "elements": [ + { + "type": "mrkdwn", + "text": "Great Expectations run id None ran at 05/05/19 12:12:12", + } + ], + }, + ] + } + + +def test_build_slack_notification_request_with_successful_validation(validation_json): + with mock.patch("datetime.datetime") as mock_datetime: + mock_datetime.strftime.return_value = "05/05/19 12:12:12" + obs = build_slack_notification_request(validation_json) + + assert isinstance(obs, dict) + assert obs == { + "blocks": [ + { + "type": "section", + "text": { + "type": "mrkdwn", + "text": "*Validated batch from data asset:* `diabetes_raw_csv`\n*Status: Success :tada:*\n33 of 44 expectations were met\n\n", + }, + }, + { + "type": "section", + "text": { + "type": "mrkdwn", + "text": "- *Validation Report*: s3://my_bucket/blah.json", + }, + }, + { + "type": "section", + "text": { + "type": "mrkdwn", + "text": "- *Validation data asset*: s3://my_bucket/blah.csv", + }, + }, + { + "type": "context", + "elements": [ + { + "type": "mrkdwn", + "text": "Great Expectations run id 7 ran at 05/05/19 12:12:12", + } + ], + }, + ] + } + + +def test_build_slack_notification_request_with_failed_validation(validation_json): + validation_json["success"] = False + with mock.patch("datetime.datetime") as mock_datetime: + mock_datetime.strftime.return_value = "05/05/19 12:12:12" + obs = build_slack_notification_request(validation_json) + + assert isinstance(obs, dict) + assert obs == { + "blocks": [ + { + "type": "section", + "text": { + "type": "mrkdwn", + "text": "*Validated batch from data asset:* `diabetes_raw_csv`\n*Status: Failed :x:*\n33 of 44 expectations were met\n\n", + }, + }, + { + "type": "section", + "text": { + "type": "mrkdwn", + "text": "- *Validation Report*: s3://my_bucket/blah.json", + }, + }, + { + "type": "section", + "text": { + "type": "mrkdwn", + "text": "- *Validation data asset*: s3://my_bucket/blah.csv", + }, + }, + { + "type": "context", + "elements": [ + { + "type": "mrkdwn", + "text": "Great Expectations run id 7 ran at 05/05/19 12:12:12", + } + ], + }, + ] + } + + +def test_build_slack_notification_request_with_successful_validation_and_no_result_report( + validation_json +): + validation_json["meta"].pop("result_reference") + with mock.patch("datetime.datetime") as mock_datetime: + mock_datetime.strftime.return_value = "05/05/19 12:12:12" + obs = build_slack_notification_request(validation_json) + + assert isinstance(obs, dict) + assert obs == { + "blocks": [ + { + "type": "section", + "text": { + "type": "mrkdwn", + "text": "*Validated batch from 
data asset:* `diabetes_raw_csv`\n*Status: Success :tada:*\n33 of 44 expectations were met\n\n", + }, + }, + { + "type": "section", + "text": { + "type": "mrkdwn", + "text": "- *Validation data asset*: s3://my_bucket/blah.csv", + }, + }, + { + "type": "context", + "elements": [ + { + "type": "mrkdwn", + "text": "Great Expectations run id 7 ran at 05/05/19 12:12:12", + } + ], + }, + ] + } + + +def test_build_slack_notification_request_with_successful_validation_and_no_dataset( + validation_json +): + validation_json["meta"].pop("dataset_reference") + with mock.patch("datetime.datetime") as mock_datetime: + mock_datetime.strftime.return_value = "05/05/19 12:12:12" + obs = build_slack_notification_request(validation_json) + + assert isinstance(obs, dict) + assert obs == { + "blocks": [ + { + "type": "section", + "text": { + "type": "mrkdwn", + "text": "*Validated batch from data asset:* `diabetes_raw_csv`\n*Status: Success :tada:*\n33 of 44 expectations were met\n\n", + }, + }, + { + "type": "section", + "text": { + "type": "mrkdwn", + "text": "- *Validation Report*: s3://my_bucket/blah.json", + }, + }, + { + "type": "context", + "elements": [ + { + "type": "mrkdwn", + "text": "Great Expectations run id 7 ran at 05/05/19 12:12:12", + } + ], + }, + ] + } diff --git a/tests/test_utils.py b/tests/test_utils.py index bc93fe295a94..73785801e21e 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -3,7 +3,9 @@ import random import string import warnings +import copy +from dateutil.parser import parse import pandas as pd import numpy as np import pytest @@ -11,47 +13,63 @@ from sqlalchemy.exc import SQLAlchemyError import sqlalchemy.dialects.sqlite as sqlitetypes import sqlalchemy.dialects.postgresql as postgresqltypes +import sqlalchemy.dialects.mysql as mysqltypes from pyspark.sql import SparkSession import pyspark.sql.types as sparktypes from great_expectations.dataset import PandasDataset, SqlAlchemyDataset, SparkDFDataset -import great_expectations.dataset.autoinspect as autoinspect - -CONTEXTS = ['PandasDataset', 'SqlAlchemyDataset', 'SparkDFDataset'] +from great_expectations.profile import ColumnsExistProfiler SQLITE_TYPES = { - "varchar": sqlitetypes.VARCHAR, - "char": sqlitetypes.CHAR, - "int": sqlitetypes.INTEGER, - "smallint": sqlitetypes.SMALLINT, - "datetime": sqlitetypes.DATETIME(truncate_microseconds=True), - "date": sqlitetypes.DATE, - "float": sqlitetypes.FLOAT, - "bool": sqlitetypes.BOOLEAN + "VARCHAR": sqlitetypes.VARCHAR, + "CHAR": sqlitetypes.CHAR, + "INTEGER": sqlitetypes.INTEGER, + "SMALLINT": sqlitetypes.SMALLINT, + "DATETIME": sqlitetypes.DATETIME(truncate_microseconds=True), + "DATE": sqlitetypes.DATE, + "FLOAT": sqlitetypes.FLOAT, + "BOOLEAN": sqlitetypes.BOOLEAN } POSTGRESQL_TYPES = { - "text": postgresqltypes.TEXT, - "char": postgresqltypes.CHAR, - "int": postgresqltypes.INTEGER, - "smallint": postgresqltypes.SMALLINT, - "timestamp": postgresqltypes.TIMESTAMP, - "date": postgresqltypes.DATE, - "float": postgresqltypes.FLOAT, - "bool": postgresqltypes.BOOLEAN + "TEXT": postgresqltypes.TEXT, + "CHAR": postgresqltypes.CHAR, + "INTEGER": postgresqltypes.INTEGER, + "SMALLINT": postgresqltypes.SMALLINT, + "BIGINT": postgresqltypes.BIGINT, + "TIMESTAMP": postgresqltypes.TIMESTAMP, + "DATE": postgresqltypes.DATE, + "DOUBLE_PRECISION": postgresqltypes.DOUBLE_PRECISION, + "BOOLEAN": postgresqltypes.BOOLEAN, + "NUMERIC": postgresqltypes.NUMERIC +} + +MYSQL_TYPES = { + "TEXT": mysqltypes.TEXT, + "CHAR": mysqltypes.CHAR, + "INTEGER": mysqltypes.INTEGER, + "SMALLINT": 
mysqltypes.SMALLINT, + "BIGINT": mysqltypes.BIGINT, + "TIMESTAMP": mysqltypes.TIMESTAMP, + "DATE": mysqltypes.DATE, + "FLOAT": mysqltypes.FLOAT, + "BOOLEAN": mysqltypes.BOOLEAN } SPARK_TYPES = { - "string": sparktypes.StringType, - "int": sparktypes.IntegerType, - "date": sparktypes.DateType, - "timestamp": sparktypes.TimestampType, - "float": sparktypes.DoubleType, - "bool": sparktypes.BooleanType, - "object": sparktypes.DataType, - "null": sparktypes.NullType + "StringType": sparktypes.StringType, + "IntegerType": sparktypes.IntegerType, + "LongType": sparktypes.LongType, + "DateType": sparktypes.DateType, + "TimestampType": sparktypes.TimestampType, + "FloatType": sparktypes.FloatType, + "DoubleType": sparktypes.DoubleType, + "BooleanType": sparktypes.BooleanType, + "DataType": sparktypes.DataType, + "NullType": sparktypes.NullType } + # Taken from the following stackoverflow: # https://stackoverflow.com/questions/23549419/assert-that-two-dictionaries-are-almost-equal def assertDeepAlmostEqual(expected, actual, *args, **kwargs): @@ -92,80 +110,137 @@ def assertDeepAlmostEqual(expected, actual, *args, **kwargs): raise exc -def get_dataset(dataset_type, data, schemas=None, autoinspect_func=autoinspect.columns_exist, caching=False): - """For Pandas, data should be either a DataFrame or a dictionary that can - be instantiated as a DataFrame. - For SQL, data should have the following shape: - { - 'table': - 'table': SqlAlchemy Table object - named_column: [list of values] - } - +def get_dataset(dataset_type, data, schemas=None, profiler=ColumnsExistProfiler, caching=False): + """Utility to create datasets for json-formatted tests. """ + df = pd.DataFrame(data) if dataset_type == 'PandasDataset': - df = pd.DataFrame(data) if schemas and "pandas" in schemas: pandas_schema = {key:np.dtype(value) for (key, value) in schemas["pandas"].items()} df = df.astype(pandas_schema) - return PandasDataset(df, autoinspect_func=autoinspect_func, caching=caching) - elif dataset_type == 'SqlAlchemyDataset': - # Create a new database - - # Try to use a local postgres instance (e.g. 
on Travis); this will allow more testing than sqlite - try: - engine = create_engine('postgresql://postgres@localhost/test_ci') - conn = engine.connect() - except SQLAlchemyError: - warnings.warn("Falling back to sqlite database.") - engine = create_engine('sqlite://') - conn = engine.connect() + return PandasDataset(df, profiler=profiler, caching=caching) + elif dataset_type == "sqlite": + engine = create_engine('sqlite://') + conn = engine.connect() # Add the data to the database as a new table - df = pd.DataFrame(data) sql_dtypes = {} if schemas and "sqlite" in schemas and isinstance(engine.dialect, sqlitetypes.dialect): schema = schemas["sqlite"] sql_dtypes = {col : SQLITE_TYPES[dtype] for (col,dtype) in schema.items()} for col in schema: - type = schema[col] - if type == "int": - df[col] = pd.to_numeric(df[col],downcast='signed') - elif type == "float": - df[col] = pd.to_numeric(df[col],downcast='float') - elif type == "datetime": + type_ = schema[col] + if type_ in ["INTEGER", "SMALLINT", "BIGINT"]: + df[col] = pd.to_numeric(df[col], downcast='signed') + elif type_ in ["FLOAT", "DOUBLE", "DOUBLE_PRECISION"]: + df[col] = pd.to_numeric(df[col]) + elif type_ in ["DATETIME", "TIMESTAMP"]: df[col] = pd.to_datetime(df[col]) - elif schemas and "postgresql" in schemas and isinstance(engine.dialect, postgresqltypes.dialect): + + tablename = "test_data_" + ''.join([random.choice(string.ascii_letters + string.digits) for n in range(8)]) + df.to_sql(name=tablename, con=conn, index=False, dtype=sql_dtypes) + + # Build a SqlAlchemyDataset using that database + return SqlAlchemyDataset(tablename, engine=conn, profiler=profiler, caching=caching) + + elif dataset_type == 'postgresql': + # Create a new database + engine = create_engine('postgresql://postgres@localhost/test_ci') + conn = engine.connect() + + sql_dtypes = {} + if schemas and "postgresql" in schemas and isinstance(engine.dialect, postgresqltypes.dialect): schema = schemas["postgresql"] sql_dtypes = {col : POSTGRESQL_TYPES[dtype] for (col, dtype) in schema.items()} for col in schema: - type = schema[col] - if type == "int": - df[col] = pd.to_numeric(df[col],downcast='signed') - elif type == "float": - df[col] = pd.to_numeric(df[col],downcast='float') - elif type == "timestamp": + type_ = schema[col] + if type_ in ["INTEGER", "SMALLINT", "BIGINT"]: + df[col] = pd.to_numeric(df[col], downcast='signed') + elif type_ in ["FLOAT", "DOUBLE", "DOUBLE_PRECISION"]: + df[col] = pd.to_numeric(df[col]) + elif type_ in ["DATETIME", "TIMESTAMP"]: + df[col] = pd.to_datetime(df[col]) + + tablename = "test_data_" + ''.join([random.choice(string.ascii_letters + string.digits) for n in range(8)]) + df.to_sql(name=tablename, con=conn, index=False, dtype=sql_dtypes) + + # Build a SqlAlchemyDataset using that database + return SqlAlchemyDataset(tablename, engine=conn, profiler=profiler, caching=caching) + + elif dataset_type == 'mysql': + engine = create_engine('mysql://root@localhost/test_ci') + conn = engine.connect() + + sql_dtypes = {} + if schemas and "mysql" in schemas and isinstance(engine.dialect, mysqltypes.dialect): + schema = schemas["mysql"] + sql_dtypes = {col : MYSQL_TYPES[dtype] for (col, dtype) in schema.items()} + for col in schema: + type_ = schema[col] + if type_ in ["INTEGER", "SMALLINT", "BIGINT"]: + df[col] = pd.to_numeric(df[col], downcast='signed') + elif type_ in ["FLOAT", "DOUBLE", "DOUBLE_PRECISION"]: + df[col] = pd.to_numeric(df[col]) + elif type_ in ["DATETIME", "TIMESTAMP"]: df[col] = pd.to_datetime(df[col]) tablename = 
"test_data_" + ''.join([random.choice(string.ascii_letters + string.digits) for n in range(8)]) df.to_sql(name=tablename, con=conn, index=False, dtype=sql_dtypes) # Build a SqlAlchemyDataset using that database - return SqlAlchemyDataset(tablename, engine=conn, autoinspect_func=autoinspect_func, caching=caching) + return SqlAlchemyDataset(tablename, engine=conn, profiler=profiler, caching=caching) elif dataset_type == 'SparkDFDataset': spark = SparkSession.builder.getOrCreate() - data_reshaped = list(zip(*[v for _, v in data.items()])) + # We need to allow null values in some column types that do not support them natively, so we skip + # use of df in this case. + data_reshaped = list(zip(*[v for _, v in data.items()])) # create a list of rows if schemas and 'spark' in schemas: schema = schemas['spark'] # sometimes first method causes Spark to throw a TypeError try: spark_schema = sparktypes.StructType([ - sparktypes.StructField(column, SPARK_TYPES[schema[column]]()) + sparktypes.StructField(column, SPARK_TYPES[schema[column]](), True) for column in schema ]) - spark_df = spark.createDataFrame(data_reshaped, spark_schema) + # We create these every time, which is painful for testing + # However nuance around null treatment as well as the desire + # for real datetime support in tests makes this necessary + data = copy.deepcopy(data) + if "ts" in data: + print(data) + print(schema) + for col in schema: + type_ = schema[col] + if type_ in ["IntegerType", "LongType"]: + # Ints cannot be None...but None can be valid in Spark (as Null) + vals = [] + for val in data[col]: + if val is None: + vals.append(val) + else: + vals.append(int(val)) + data[col] = vals + elif type_ in ["FloatType", "DoubleType"]: + vals = [] + for val in data[col]: + if val is None: + vals.append(val) + else: + vals.append(float(val)) + data[col] = vals + elif type_ in ["DateType", "TimestampType"]: + vals = [] + for val in data[col]: + if val is None: + vals.append(val) + else: + vals.append(parse(val)) + data[col] = vals + # Do this again, now that we have done type conversion using the provided schema + data_reshaped = list(zip(*[v for _, v in data.items()])) # create a list of rows + spark_df = spark.createDataFrame(data_reshaped, schema=spark_schema) except TypeError: string_schema = sparktypes.StructType([ sparktypes.StructField(column, sparktypes.StringType()) @@ -186,24 +261,27 @@ def get_dataset(dataset_type, data, schemas=None, autoinspect_func=autoinspect.c # if no schema provided, uses Spark's schema inference columns = list(data.keys()) spark_df = spark.createDataFrame(data_reshaped, columns) - return SparkDFDataset(spark_df, caching=caching) + return SparkDFDataset(spark_df, profiler=profiler, caching=caching) else: raise ValueError("Unknown dataset_type " + str(dataset_type)) + def candidate_getter_is_on_temporary_notimplemented_list(context, getter): - if context == 'SqlAlchemyDataset': + if context in ["sqlite"]: return getter in [ 'get_column_modes', - 'get_column_stdev', + 'get_column_stdev' ] - if context == 'SparkDFDataset': + if context in ["postgresql", "mysql"]: return getter in [ - 'get_column_median', + 'get_column_modes' ] + if context == 'SparkDFDataset': + return getter in [] def candidate_test_is_on_temporary_notimplemented_list(context, expectation_type): - if context == "SqlAlchemyDataset": + if context in ["sqlite", "postgresql", "mysql"]: return expectation_type in [ # "expect_column_to_exist", # "expect_table_row_count_to_be_between", @@ -212,10 +290,11 @@ def 
candidate_test_is_on_temporary_notimplemented_list(context, expectation_type # "expect_column_values_to_be_unique", # "expect_column_values_to_not_be_null", # "expect_column_values_to_be_null", - "expect_column_values_to_be_of_type", - "expect_column_values_to_be_in_type_list", + # "expect_column_values_to_be_of_type", + # "expect_column_values_to_be_in_type_list", # "expect_column_values_to_be_in_set", # "expect_column_values_to_not_be_in_set", + # "expect_column_distinct_values_to_be_in_set", # "expect_column_distinct_values_to_equal_set", # "expect_column_distinct_values_to_contain_set", # "expect_column_values_to_be_between", @@ -231,8 +310,9 @@ def candidate_test_is_on_temporary_notimplemented_list(context, expectation_type "expect_column_values_to_be_dateutil_parseable", "expect_column_values_to_be_json_parseable", "expect_column_values_to_match_json_schema", - #"expect_column_mean_to_be_between", - #"expect_column_median_to_be_between", + # "expect_column_mean_to_be_between", + # "expect_column_median_to_be_between", + # "expect_column_quantile_values_to_be_between", "expect_column_stdev_to_be_between", #"expect_column_unique_value_count_to_be_between", #"expect_column_proportion_of_unique_values_to_be_between", @@ -262,12 +342,13 @@ def candidate_test_is_on_temporary_notimplemented_list(context, expectation_type "expect_column_values_to_be_in_type_list", # "expect_column_values_to_be_in_set", # "expect_column_values_to_not_be_in_set", + # "expect_column_distinct_values_to_be_in_set", # "expect_column_distinct_values_to_equal_set", # "expect_column_distinct_values_to_contain_set", - "expect_column_values_to_be_between", + # "expect_column_values_to_be_between", "expect_column_values_to_be_increasing", "expect_column_values_to_be_decreasing", - "expect_column_value_lengths_to_be_between", + # "expect_column_value_lengths_to_be_between", # "expect_column_value_lengths_to_equal", # "expect_column_values_to_match_regex", # "expect_column_values_to_not_match_regex", @@ -278,7 +359,8 @@ def candidate_test_is_on_temporary_notimplemented_list(context, expectation_type "expect_column_values_to_be_json_parseable", "expect_column_values_to_match_json_schema", # "expect_column_mean_to_be_between", - "expect_column_median_to_be_between", + # "expect_column_median_to_be_between", + # "expect_column_quantile_values_to_be_between", # "expect_column_stdev_to_be_between", # "expect_column_unique_value_count_to_be_between", # "expect_column_proportion_of_unique_values_to_be_between", @@ -359,7 +441,12 @@ def evaluate_json_test(data_asset, expectation_type, test): elif key == 'observed_value': if 'tolerance' in test: - assert np.allclose(result['result']['observed_value'], value, rtol=test['tolerance']) + if isinstance(value, dict): + assert set(value.keys()) == set(result["result"]["observed_value"].keys()) + for k,v in value.items(): + assert np.allclose(result["result"]["observed_value"][k], v, rtol=test["tolerance"]) + else: + assert np.allclose(result['result']['observed_value'], value, rtol=test['tolerance']) else: assert value == result['result']['observed_value'] @@ -377,6 +464,10 @@ def evaluate_json_test(data_asset, expectation_type, test): elif key == 'details': assert result['result']['details'] == value + elif key == "value_counts": + for val_count in value: + assert val_count in result["result"]["details"]["value_counts"] + elif key.startswith("observed_cdf"): if "x_-1" in key: if key.endswith("gt"): diff --git a/travis-java.sh b/travis-java.sh index bc2a71fb8dc3..8771c7780ac4 100755 --- 
a/travis-java.sh +++ b/travis-java.sh @@ -27,4 +27,9 @@ export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64 export PATH=$JAVA_HOME/bin:$PATH echo "Current JAVA_HOME: $JAVA_HOME" echo "Current java -version:" -java -version \ No newline at end of file +java -version + + +find / | grep java +ls -al /usr/lib/jvm +ls -al /usr/lib/jvm/java-8-openjdk-amd64
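
The new tests in tests/test_slack.py above pin down the exact Block Kit payload that build_slack_notification_request emits for successful, failed, and partially populated validation results. For orientation, a minimal usage sketch of how those helpers could be wired to a real webhook follows. It assumes only what the diff itself shows (the import path, that build_slack_notification_request returns a dict, and that get_slack_callback returns a callable) plus a standard requests.post call; the webhook URL is a placeholder, and invoking the callback with the validation result is an assumption about its signature.

    import requests

    from great_expectations.data_context.util import (
        build_slack_notification_request,
        get_slack_callback,
    )

    # A validation result shaped like the fixture in tests/test_slack.py.
    validation_json = {
        "meta": {"data_asset_name": "diabetes_raw_csv", "run_id": 7},
        "statistics": {"successful_expectations": 33, "evaluated_expectations": 44},
        "success": True,
    }

    # Build the Slack Block Kit payload and post it to a webhook
    # (placeholder URL, not a real endpoint).
    payload = build_slack_notification_request(validation_json)
    requests.post("https://hooks.slack.com/services/...", json=payload)

    # Alternatively, obtain a ready-made callback. The tests only guarantee
    # that get_slack_callback returns a callable; passing it the validation
    # result directly is an assumption.
    notify = get_slack_callback("https://hooks.slack.com/services/...")
    notify(validation_json)

Keeping payload construction separate from transport is what lets these tests assert on the message format without any network access.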