Merge branch 'develop' into m/dx-47/drop-tables-from-validator
* develop:
  Consolidate Cloud tutorials (#7395)
  [DOCS] DOC-473 Adds guide "How to set up GX to work with SQL databases" (#7409)
  [CONTRIB] Limit results for two expectations (#7403)
  [MAINTENANCE]: split up map_metric_provider.py (#7402)
  [DOCS] DOC-473: Adds shared components for fluent and state management updates (#7404)
  [FEATURE] add optional `id` to Fluent Datasources and DataAsset schemas (#7334)
  [Contrib] Adding support for date for the row condition parser (#7359)
  [MAINTENANCE] Test against minimum SQLAlchemy versions (#7396)
  [BUGFIX] Fixing typographical errors and argument omissions (#7398)
Will Shin committed Mar 20, 2023
2 parents 548244f + 45aaf19 commit 231e6c1
Showing 206 changed files with 6,192 additions and 4,063 deletions.
167 changes: 167 additions & 0 deletions ci/azure-pipelines-sqlalchemy-compatibility.yml
@@ -0,0 +1,167 @@
# This file is responsible for configuring the `sqlalchemy-compatibility` pipeline (https://dev.azure.com/great-expectations/great_expectations/_build)
#
# The pipeline is run under the following conditions:
#   - On the develop branch whenever a commit is made to an open PR
#
# In this pipeline we run tests against several databases using the latest patch
# version of each currently supported sqlalchemy minor version, e.g. 1.3.x, 1.4.x,
# and 2.0.x, where x is the latest patch. This helps ensure we remain compatible
# with all previously supported versions as we make changes to support later versions.

trigger:
  branches:
    include:
      - develop

resources:
  containers:
    - container: postgres
      image: postgres:11
      ports:
        - 5432:5432
      env:
        POSTGRES_DB: "test_ci"
        POSTGRES_HOST_AUTH_METHOD: "trust"

variables:
  GE_USAGE_STATISTICS_URL: "https://qa.stats.greatexpectations.io/great_expectations/v1/usage_statistics"


stages:
  - stage: scope_check
    pool:
      vmImage: 'ubuntu-latest'
    jobs:
      - job: changes
        steps:
          - task: ChangedFiles@1
            name: CheckChanges
            inputs:
              verbose: true
              rules: |
                [GXChanged]
                great_expectations/**/*.py
                pyproject.toml
                setup.cfg
                setup.py
                MANIFEST.in
                tests/**
                /*.txt
                /*.yml
                requirements*
                reqs/*.txt
                ci/**/*.yml
                assets/scripts/**
                scripts/*.py
                scripts/*.sh
  - stage: lint
    dependsOn: scope_check
    pool:
      vmImage: 'ubuntu-latest'

    jobs:
      - job: lint
        condition: eq(stageDependencies.scope_check.changes.outputs['CheckChanges.GXChanged'], true)
        steps:
          - task: UsePythonVersion@0
            inputs:
              versionSpec: 3.7
            displayName: 'Use Python 3.7'

          - script: |
              pip install $(grep -E '^(black|invoke|ruff)' reqs/requirements-dev-contrib.txt)
              EXIT_STATUS=0
              invoke fmt --check || EXIT_STATUS=$?
              invoke lint || EXIT_STATUS=$?
              exit $EXIT_STATUS
  - stage: import_ge
    dependsOn: scope_check
    pool:
      vmImage: 'ubuntu-latest'

    jobs:
      - job: import_ge

        steps:
          - task: UsePythonVersion@0
            inputs:
              versionSpec: '3.7'
            displayName: 'Use Python 3.7'

          - script: |
              pip install .
            displayName: 'Install GX and required dependencies (i.e. not sqlalchemy)'

          - script: |
              python -c "import great_expectations as gx; print('Successfully imported GX Version:', gx.__version__)"
            displayName: 'Import Great Expectations'
  - stage: sqlalchemy_compatibility
    dependsOn: [scope_check, lint, import_ge]
    pool:
      vmImage: 'ubuntu-latest'

    jobs:
      - job: sqlalchemy_compatibility_postgres
        timeoutInMinutes: 90
        condition: eq(stageDependencies.scope_check.changes.outputs['CheckChanges.GXChanged'], true)
        strategy:
          # This matrix is intended to run against the latest patch versions of
          # the sqlalchemy minor versions that we support.
          # (versions as semver major.minor.patch)
          matrix:
            # Uncomment if we need 1.3.x verification
            # sqlalchemy_1_3_x:
            #   sqlalchemy_base_version: '1.3.0'
            sqlalchemy_1_4_x:
              sqlalchemy_base_version: '1.4.0'
            # Uncomment when we are compatible with 2.0.x.
            # sqlalchemy_2_0_x:
            #   sqlalchemy_base_version: '2.0.0'

        services:
          postgres: postgres

        steps:
          - task: UsePythonVersion@0
            inputs:
              versionSpec: '3.7'
            displayName: 'Use Python 3.7'

          - script: |
              cp constraints-dev.txt constraints-dev-temp.txt
              echo "SQLAlchemy~=$(sqlalchemy_base_version)" >> constraints-dev-temp.txt
              pip install --constraint constraints-dev-temp.txt ".[dev]" pytest-azurepipelines
            displayName: 'Install dependencies using SQLAlchemy base version $(sqlalchemy_base_version)'
          - script: |
              # Run pytest
              pytest \
                --postgresql \
                --ignore 'tests/cli' \
                --ignore 'tests/integration/usage_statistics' \
                --napoleon-docstrings \
                --junitxml=junit/test-results.xml \
                --cov=. \
                --cov-report=xml \
                --cov-report=html \
                -m 'not unit and not e2e'
            displayName: 'pytest'
            env:
              GE_USAGE_STATISTICS_URL: ${{ variables.GE_USAGE_STATISTICS_URL }}
              SQLALCHEMY_WARN_20: true

          - task: PublishTestResults@2
            condition: succeededOrFailed()
            inputs:
              testResultsFiles: '**/test-*.xml'
              testRunTitle: 'Publish test results for Python $(python.version)'

          - task: PublishCodeCoverageResults@1
            inputs:
              codeCoverageTool: Cobertura
              summaryFileLocation: '$(System.DefaultWorkingDirectory)/**/coverage.xml'
              reportDirectory: '$(System.DefaultWorkingDirectory)/**/htmlcov'
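
For reference: the install step above pins SQLAlchemy with a compatible-release specifier, so pip resolves the newest matching patch release. A minimal sketch of what `SQLAlchemy~=1.4.0` permits, using the `packaging` library (an assumption for illustration; it is not part of this pipeline):

```python
from packaging.specifiers import SpecifierSet

# `~=1.4.0` is a compatible-release pin, equivalent to `>=1.4.0, <1.5.0`,
# so pip installs the newest available 1.4.x patch release.
spec = SpecifierSet("~=1.4.0")

for version in ["1.3.24", "1.4.0", "1.4.46", "2.0.7"]:
    print(version, version in spec)
# 1.3.24 False
# 1.4.0 True
# 1.4.46 True
# 2.0.7 False
```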
@@ -81,7 +81,7 @@ def _validate(
                 "success": False,
                 "result": {
                     "info": "The column values are not unique, under the condition",
-                    "observed_value": query_result,
+                    "observed_value": query_result[:10],
                 },
             }

@@ -84,7 +84,7 @@ def _validate(
                 "success": False,
                 "result": {
                     "info": f"Expected {expected_num_of_distinct_values} but found {actual_num_of_distinct_values} distinct values",
-                    "observed_value": query_result,
+                    "observed_value": query_result[:10],
                 },
             }

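The change in both hunks above is the same: the full query result is truncated before it is embedded in the validation payload. A minimal sketch of the effect (variable names are illustrative, not the contrib expectations' actual code):

```python
# Hypothetical query output: a long list of offending values.
query_result = [f"duplicate_{i}" for i in range(1000)]

# Slicing keeps at most 10 examples in the "observed_value" field,
# which bounds the size of the validation result and rendered Data Docs.
observed_value = query_result[:10]
print(len(observed_value))  # 10
```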
6 changes: 3 additions & 3 deletions docs/docusaurus/docs/components/_data.jsx
@@ -1,5 +1,5 @@
 export default {
-  release_version: 'great_expectations, version 0.16.1',
-  min_python: 'Python 3.7',
-  max_python: 'Python 3.10'
+  release_version: 'great_expectations, version 0.15.50',
+  min_python: '3.7',
+  max_python: '3.10'
 }
15 changes: 9 additions & 6 deletions docs/docusaurus/docs/components/_prerequisites.jsx
@@ -1,5 +1,6 @@
import React from 'react'
import Admonition from '@theme/Admonition'
import GxData from '/docs/components/_data.jsx'

/**
* A flexible Prerequisites admonition block.
@@ -25,6 +26,7 @@ import Admonition from '@theme/Admonition'
* </Prerequisites>
*
* Available default entries from props:
* requirePython: Valid values are {true} or {false}
* requireInstallation: Valid values are {true} or {false}
* requireDataContext: Valid values are {true} or {false}
* requireSourceData: Valid values are 'filesystem' or 'SQL'
@@ -50,10 +52,11 @@ export default class Prerequisites extends React.Component {
}

defaultPrerequisiteItems () {
-    const returnItems = [
-      <li key={0.1}>
-        Completed the <a href='/docs/tutorials/getting_started/tutorial_overview'>Getting Started Tutorial</a>
-      </li>]
+    const returnItems = []
+    if (this.props.requirePython === true) {
+      returnItems.push(<li>A supported version of Python (versions {GxData.min_python} to {GxData.max_python})</li>)
+      returnItems.push(<ul><li>For details on how to download and install Python on your platform, please see <a href='https://www.python.org/doc/'>Python's documentation</a> and <a href='https://www.python.org/downloads/'>download sites</a></li></ul>)
+    }
if (this.props.requireInstallation === true) {
returnItems.push(<li>Set up an <a href='/docs/guides/setup/installation/local'>installation of Great Expectations</a></li>)
}
@@ -84,8 +87,7 @@ export default class Prerequisites extends React.Component {
render () {
return (
<div>
-      <Admonition type='caution' title='Prerequisites'>
-        <h5>This guide assumes you have:</h5>
+      <Admonition type='caution' title='This guide assumes you have:'>
<ul>
{this.defaultPrerequisiteItems()}
{this.extractMarkdownListItems().map((prereq, i) => (<li key={i}>{prereq}</li>))}
@@ -97,6 +99,7 @@ export default class Prerequisites extends React.Component {
}

Prerequisites.defaultProps = {
requirePython: false,
requireInstallation: false,
requireDataContext: false,
requireSourceData: null,
@@ -0,0 +1,94 @@
---
[//]: # (TODO: title: How to set up GX to work with general $TODO$)
tag: [how-to, setup]
[//]: # (TODO: keywords: [Great Expectations, SQL, $TODO$])
---

[//]: # (TODO: # How to set up Great Expectations to work with general $TODO$)

import TechnicalTag from '/docs/term_tags/_tag.mdx';
import Prerequisites from '/docs/components/_prerequisites.jsx'

<!-- ## Introduction -->
import IntroInstallPythonGxAndDependencies from '/docs/components/setup/installation/_intro_python_environment_with_dependencies.mdx'

<!-- ## Prerequisites -->

<!-- ### 1. Check your Python version -->
import PythonCheckVersion from '/docs/components/setup/python_environment/_python_check_version.mdx'

<!-- ### 2. Create a Python virtual environment -->
import PythonCreateVenv from '/docs/components/setup/python_environment/_python_create_venv.md'
import TipPythonOrPython3Executable from '/docs/components/setup/python_environment/_tip_python_or_python3_executable.md'

<!-- ### 3. Install GX with optional dependencies for ??? -->
import InstallDependencies from '/docs/components/setup/dependencies/_sql_install_dependencies.mdx'

<!-- ### 4. Verify that GX has been installed correctly -->
import GxVerifyInstallation from '/docs/components/setup/_gx_verify_installation.md'

<!-- ### 5. Initialize a Data Context to store your credentials -->
import InitializeDataContextFromCli from '/docs/components/setup/data_context/_filesystem_data_context_initialize_with_cli.md'
import VerifyDataContextInitializedFromCli from '/docs/components/setup/data_context/_filesystem_data_context_verify_initialization_from_cli.md'

<!-- ### 6. Configure the `config_variables.yml` file with your credentials -->
[//]: # (TODO: import ConfigureCredentialsInDataContext from '/docs/components/setup/dependencies/_postgresql_configure_credentials_in_config_variables_yml.md')


<!-- ## Next steps -->
[//]: # (TODO: import FurtherConfiguration from '/docs/components/setup/next_steps/_links_after_installing_gx.md')


## Introduction

[//]: # (TODO:<IntroInstallPythonGxAndDependencies dependencies="$TODO$" />)

## Prerequisites

<Prerequisites requirePython = {true} requireInstallation = {false} requireDataContext = {false} requireSourceData = {null} requireDatasource = {false} requireExpectationSuite = {false}>

- The ability to install Python modules with pip
-
- A passion for data quality

</Prerequisites>

## Steps

### 1. Check your Python version

<PythonCheckVersion />

<TipPythonOrPython3Executable />

### 2. Create a Python virtual environment

<PythonCreateVenv />

[//]: # (TODO: ### 3. Install GX with optional dependencies for $TODO$)

[//]: # (TODO: <InstallDependencies install_key="sqlalchemy" database_name="SQL"/>)

### 4. Verify that GX has been installed correctly

<GxVerifyInstallation />

[//]: # (TODO: ### 5. Initialize a Data Context to store your PostgreSQL credentials)

<InitializeDataContextFromCli />

:::info Verifying the Data Context initialized successfully

<VerifyDataContextInitializedFromCli />

:::

[//]: # (TODO: ### 6. Configure the `config_variables.yml` file with your PostgreSQL credentials)

<ConfigureCredentialsInDataContext />

## Next steps

<FurtherConfiguration />


@@ -0,0 +1,12 @@
Your Data Asset will connect to all files that match the regex that you provide. Each matched file will become a Batch inside your Data Asset.

For example:

Let's say that your Azure Blob Storage container has the following files:
- "yellow_tripdata_sample_2021-11.csv"
- "yellow_tripdata_sample_2021-12.csv"
- "yellow_tripdata_sample_2023-01.csv"

If you define a Data Asset using the full file name with no regex groups, such as `"yellow_tripdata_sample_2023-01\.csv"`, your Data Asset will contain only one Batch, which will correspond to that file.

However, if you define a partial file name with a regex group, such as `"yellow_tripdata_sample_(?P<year>\d{4})-(?P<month>\d{2})\.csv"`, your Data Asset will contain 3 Batches, one corresponding to each matched file. You can then use the keys `year` and `month` to indicate exactly which file you want to request from the available Batches.
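
A minimal sketch of how a batching regex with named groups maps file names to Batch identifiers, using plain Python `re` outside of GX (the filenames and pattern mirror the example above):

```python
import re

batching_regex = re.compile(
    r"yellow_tripdata_sample_(?P<year>\d{4})-(?P<month>\d{2})\.csv"
)

files = [
    "yellow_tripdata_sample_2021-11.csv",
    "yellow_tripdata_sample_2021-12.csv",
    "yellow_tripdata_sample_2023-01.csv",
]

# Each matching file becomes one Batch; the named groups become the
# keys you can use to request a specific Batch.
for f in files:
    match = batching_regex.match(f)
    if match:
        print(f, "->", match.groupdict())
# yellow_tripdata_sample_2021-11.csv -> {'year': '2021', 'month': '11'}
# yellow_tripdata_sample_2021-12.csv -> {'year': '2021', 'month': '12'}
# yellow_tripdata_sample_2023-01.csv -> {'year': '2023', 'month': '01'}
```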
@@ -0,0 +1,14 @@
import CodeBlock from '@theme/CodeBlock';

To specify the data to connect to, you will need the following elements:
- `name`: A name by which you can reference the Data Asset in the future.
- `batching_regex`: A regular expression that indicates which files to treat as Batches in your Data Asset and how to identify them.
- `container`: The name of your Azure Blob Storage container.
- `name_starts_with`: A string indicating the portion of the matched file paths to truncate from the final Batch names.

```python title="Python code"
asset_name = "MyTaxiDataAsset"
batching_regex = r"data/taxi_yellow_tripdata_samples/yellow_tripdata_sample_(?P<year>\d{4})-(?P<month>\d{2})\.csv"
container = "superconductive-public"
name_starts_with = "data/taxi_yellow_tripdata_samples/"
```
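
For illustration only, a sketch in plain Python (not the GX API) of how a `name_starts_with` prefix could be truncated from a matched file path to produce a shorter Batch name; the path is hypothetical:

```python
# Hypothetical matched blob path built from the values above.
path = "data/taxi_yellow_tripdata_samples/yellow_tripdata_sample_2021-11.csv"
name_starts_with = "data/taxi_yellow_tripdata_samples/"

# Truncate the common prefix so Batch names stay short and readable.
batch_name = path[len(name_starts_with):] if path.startswith(name_starts_with) else path
print(batch_name)  # yellow_tripdata_sample_2021-11.csv
```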
@@ -0,0 +1,13 @@
Your Data Asset will connect to all files that match the regex that you provide. Each matched file will become a Batch inside your Data Asset.

For example:

<p>Let's say that your {props.storage_location_type} has the following files:</p>

- "yellow_tripdata_sample_2021-11.csv"
- "yellow_tripdata_sample_2021-12.csv"
- "yellow_tripdata_sample_2023-01.csv"

If you define a Data Asset using the full file name with no regex groups, such as `"yellow_tripdata_sample_2023-01\.csv"`, your Data Asset will contain only one Batch, which will correspond to that file.

However, if you define a partial file name with a regex group, such as `"yellow_tripdata_sample_(?P<year>\d{4})-(?P<month>\d{2})\.csv"`, your Data Asset will contain 3 Batches, one corresponding to each matched file. You can then use the keys `year` and `month` to indicate exactly which file you want to request from the available Batches.
@@ -0,0 +1 @@
Your Datasource can contain multiple Data Assets. If you have additional files to connect to, you can provide different `name` and `batching_regex` parameters to create additional Data Assets for those files in your Datasource. You can even include the same files in multiple Data Assets, if a given file matches the `batching_regex` of more than one Data Asset.
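
A minimal sketch (plain Python `re`, with two hypothetical overlapping patterns) of one file matching the `batching_regex` of more than one Data Asset:

```python
import re

file_name = "yellow_tripdata_sample_2021-11.csv"

# Two hypothetical Data Assets with overlapping batching regexes.
all_months = r"yellow_tripdata_sample_(?P<year>\d{4})-(?P<month>\d{2})\.csv"
only_2021 = r"yellow_tripdata_sample_2021-(?P<month>\d{2})\.csv"

# The same file belongs to both Data Assets.
print(bool(re.match(all_months, file_name)))  # True
print(bool(re.match(only_2021, file_name)))   # True
```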
@@ -0,0 +1,7 @@
:::info Using relative paths as the `base_path` of a Filesystem Datasource

If you are using a Filesystem Data Context, you can provide a path for `base_path` that is relative to the folder containing your Data Context.

However, an in-memory Ephemeral Data Context doesn't exist in a folder, so when you use an Ephemeral Data Context, relative paths are instead resolved from the folder in which your Python code is being executed.

:::
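
A sketch of the difference, with hypothetical paths (GX performs this resolution internally):

```python
from pathlib import Path

base_path = Path("data/")  # a relative base_path

# Filesystem Data Context: resolved against the folder containing the context.
context_root = Path("/home/user/my_project/gx")  # hypothetical context location
print(context_root / base_path)  # /home/user/my_project/gx/data

# Ephemeral Data Context: resolved against the current working directory.
print(Path.cwd() / base_path)
```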
