From 8d24fe963ecca49e608f07cace6989a083aa0ffc Mon Sep 17 00:00:00 2001 From: Rachel-Reverie <94694058+Rachel-Reverie@users.noreply.github.com> Date: Fri, 27 Jan 2023 07:53:20 -0600 Subject: [PATCH] [DOCS] DOC-280: How to use GX with AWS S3 and Spark (#6782) Co-authored-by: William Shin --- .../_congratulations_aws_s3_spark.md | 1 + .../components/_spark_s3_dependencies.md | 26 ++ ...w_to_use_gx_with_aws_using_s3_and_spark.md | 331 ++++++++++++++++++ .../_configure_your_datasource.md | 42 +++ .../_instantiate_your_projects_datacontext.md | 10 + ...ource_configuration_to_your_datacontext.md | 28 ++ .../_test_your_new_datasource.md | 46 +++ .../connecting_to_your_data/cloud/s3/spark.md | 126 +------ sidebars.js | 1 + 9 files changed, 497 insertions(+), 114 deletions(-) create mode 100644 docs/deployment_patterns/how_to_use_gx_with_aws/components/_congratulations_aws_s3_spark.md create mode 100644 docs/deployment_patterns/how_to_use_gx_with_aws/components/_spark_s3_dependencies.md create mode 100644 docs/deployment_patterns/how_to_use_gx_with_aws/how_to_use_gx_with_aws_using_s3_and_spark.md create mode 100644 docs/guides/connecting_to_your_data/cloud/s3/components_spark/_configure_your_datasource.md create mode 100644 docs/guides/connecting_to_your_data/cloud/s3/components_spark/_instantiate_your_projects_datacontext.md create mode 100644 docs/guides/connecting_to_your_data/cloud/s3/components_spark/_save_the_datasource_configuration_to_your_datacontext.md create mode 100644 docs/guides/connecting_to_your_data/cloud/s3/components_spark/_test_your_new_datasource.md diff --git a/docs/deployment_patterns/how_to_use_gx_with_aws/components/_congratulations_aws_s3_spark.md b/docs/deployment_patterns/how_to_use_gx_with_aws/components/_congratulations_aws_s3_spark.md new file mode 100644 index 000000000000..60b27009aa97 --- /dev/null +++ b/docs/deployment_patterns/how_to_use_gx_with_aws/components/_congratulations_aws_s3_spark.md @@ -0,0 +1 @@ +🚀🚀 Congratulations! 🚀🚀 You have successfully navigated the entire workflow for using Great Expectations with Amazon Web Services S3 and Spark, from installing Great Expectations through Validating your Data. \ No newline at end of file diff --git a/docs/deployment_patterns/how_to_use_gx_with_aws/components/_spark_s3_dependencies.md b/docs/deployment_patterns/how_to_use_gx_with_aws/components/_spark_s3_dependencies.md new file mode 100644 index 000000000000..bdddb9b558eb --- /dev/null +++ b/docs/deployment_patterns/how_to_use_gx_with_aws/components/_spark_s3_dependencies.md @@ -0,0 +1,26 @@ +Spark possesses a few dependencies that need to be installed before it can be used with AWS. You will need to install the `aws-java-sdk-bundle` and `hadoop-aws` files corresponding to your version of pySpark, and update your Spark configuration accordingly. You can find the `.jar` files you need to install in the following MVN repositories: + +- [hadoop-aws jar that matches your Spark version](https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-aws) +- [aws-java-sdk-bundle jar that matches your Spark version](https://mvnrepository.com/artifact/com.amazonaws/aws-java-sdk-bundle) + +Once the dependencies are installed, you will need to update your Spark configuration from within Python. First, import these necessary modules: + +```python +import pyspark as pyspark +from pyspark import SparkContext +``` + +Next, update the `pyspark.SparkConf` to match the dependency packages you downloaded. 
In this example, we are using the 3.3.1 version of `hadoop-aws`, but you will want to enter the version that corresponds to your installed dependency. + +```python +conf = pyspark.SparkConf() +conf.set('spark.jars.packages', 'org.apache.hadoop:hadoop-aws:3.3.1') +``` + +Finally, you will need to add your AWS credentials to the `SparkContext`. + +```python +sc = SparkContext(conf=conf) +sc._jsc.hadoopConfiguration().set('fs.s3a.access.key', [AWS ACCESS KEY]) +sc._jsc.hadoopConfiguration().set('fs.s3a.secret.key', [AWS SECRET KEY]) +``` \ No newline at end of file diff --git a/docs/deployment_patterns/how_to_use_gx_with_aws/how_to_use_gx_with_aws_using_s3_and_spark.md b/docs/deployment_patterns/how_to_use_gx_with_aws/how_to_use_gx_with_aws_using_s3_and_spark.md new file mode 100644 index 000000000000..8101ce4685ca --- /dev/null +++ b/docs/deployment_patterns/how_to_use_gx_with_aws/how_to_use_gx_with_aws_using_s3_and_spark.md @@ -0,0 +1,331 @@ +--- +title: How to use Great Expectations with Amazon Web Services using S3 and Spark +--- +import Prerequisites from '@site/docs/components/_prerequisites.jsx' +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; +import Congratulations from './components/_congratulations_aws_s3_spark.md' +import TechnicalTag from '@site/docs/term_tags/_tag.mdx'; + + + + + + +import VerifyAwsInstalled from './components/_aws_cli_verify_installation.md' + + +import VerifyAwsCredentials from '@site/docs/guides/setup/configuring_metadata_stores/components/_verify_aws_credentials_are_configured_properly.mdx' + + + + + +import VerifyPythonVersion from '@site/docs/guides/setup/installation/components_local/_check_python_version.mdx' +import WhereToGetPython from './components/_python_where_to_get.md' + + + +import CreateVirtualEnvironment from '@site/docs/guides/setup/installation/components_local/_create_an_venv_with_pip.mdx' + + + +import GetLatestPip from '@site/docs/guides/setup/installation/components_local/_ensure_latest_pip.mdx' + + + +import InstallBoto3WithPip from '@site/docs/guides/setup/configuring_metadata_stores/components/_install_boto3_with_pip.mdx' + + +import InstallSparkS3Dependencies from './components/_spark_s3_dependencies.md' + + + +import InstallGxWithPip from '@site/docs/guides/setup/installation/components_local/_install_ge_with_pip.mdx' + + + +import VerifySuccessfulGxInstallation from '@site/docs/guides/setup/installation/components_local/_verify_ge_install_succeeded.mdx' + + + +import CreateDataContextWithCli from '@site/docs/guides/setup/configuring_data_contexts/components_how_to_configure_a_new_data_context_with_the_cli/_initialize_data_context_with_the_cli.mdx' + + + + + +import IdentifyDataContextExpectationsStore from '@site/docs/guides/setup/configuring_metadata_stores/components_how_to_configure_an_expectation_store_in_amazon_s3/_identify_your_data_context_expectations_store.mdx' + + + +import AddS3ExpectationsStoreConfiguration from '@site/docs/guides/setup/configuring_metadata_stores/components_how_to_configure_an_expectation_store_in_amazon_s3/_update_your_configuration_file_to_include_a_new_store_for_expectations_on_s.mdx' + + + +import VerifyS3ExpectationsStoreExists from '@site/docs/guides/setup/configuring_metadata_stores/components_how_to_configure_an_expectation_store_in_amazon_s3/_confirm_that_the_new_expectations_store_has_been_added_by_running_great_expectations_store_list.mdx' + + + +import OptionalCopyExistingExpectationsToS3 from 
'@site/docs/guides/setup/configuring_metadata_stores/components_how_to_configure_an_expectation_store_in_amazon_s3/_copy_existing_expectation_json_files_to_the_s_bucket_this_step_is_optional.mdx' + + + +import OptionalVerifyCopiedExpectationsAreAccessible from '@site/docs/guides/setup/configuring_metadata_stores/components_how_to_configure_an_expectation_store_in_amazon_s3/_confirm_that_expectations_can_be_accessed_from_amazon_s_by_running_great_expectations_suite_list.mdx' + + + + + +import IdentifyDataContextValidationResultsStore from '@site/docs/guides/setup/configuring_metadata_stores/components_how_to_configure_a_validation_result_store_in_amazon_s3/_identify_your_data_context_validation_results_store.mdx' + + + +import AddS3ValidationResultsStoreConfiguration from '@site/docs/guides/setup/configuring_metadata_stores/components_how_to_configure_a_validation_result_store_in_amazon_s3/_update_your_configuration_file_to_include_a_new_store_for_validation_results_on_s.mdx' + + + +import VerifyS3ValidationResultsStoreExists from '@site/docs/guides/setup/configuring_metadata_stores/components_how_to_configure_a_validation_result_store_in_amazon_s3/_confirm_that_the_new_validation_results_store_has_been_added_by_running_great_expectations_store_list.mdx' + + + +import OptionalCopyExistingValidationResultsToS3 from '@site/docs/guides/setup/configuring_metadata_stores/components_how_to_configure_a_validation_result_store_in_amazon_s3/_copy_existing_validation_results_to_the_s_bucket_this_step_is_optional.mdx' + + + + +import CreateAnS3BucketForDataDocs from '@site/docs/guides/setup/configuring_data_docs/components_how_to_host_and_share_data_docs_on_amazon_s3/_create_an_s3_bucket.mdx' + + +import ConfigureYourBucketPolicyToEnableAppropriateAccess from '@site/docs/guides/setup/configuring_data_docs/components_how_to_host_and_share_data_docs_on_amazon_s3/_configure_your_bucket_policy_to_enable_appropriate_access.mdx' + + +import ApplyTheDataDocsAccessPolicy from '@site/docs/guides/setup/configuring_data_docs/components_how_to_host_and_share_data_docs_on_amazon_s3/_apply_the_policy.mdx' + + +import AddANewS3SiteToTheDataDocsSitesSectionOfYourGreatExpectationsYml from '@site/docs/guides/setup/configuring_data_docs/components_how_to_host_and_share_data_docs_on_amazon_s3/_add_a_new_s3_site_to_the_data_docs_sites_section_of_your_great_expectationsyml.mdx' + + +import TestThatYourConfigurationIsCorrectByBuildingTheSite from '@site/docs/guides/setup/configuring_data_docs/components_how_to_host_and_share_data_docs_on_amazon_s3/_test_that_your_configuration_is_correct_by_building_the_site.mdx' + + +import AdditionalDataDocsNotes from '@site/docs/guides/setup/configuring_data_docs/components_how_to_host_and_share_data_docs_on_amazon_s3/_additional_notes.mdx' + + + + + +import HowToRunDatasourceCode from '@site/docs/deployment_patterns/how_to_use_gx_with_aws/components/_datasource_code_environment.md' + + + +import InstantiateDataContext from '@site/docs/guides/connecting_to_your_data/cloud/s3/components_spark/_instantiate_your_projects_datacontext.md' + + + +import ConfigureYourDatasource from '@site/docs/guides/connecting_to_your_data/cloud/s3/components_spark/_configure_your_datasource.md' + + + +import SaveDatasourceConfigurationToDataContext from '@site/docs/guides/connecting_to_your_data/cloud/s3/components_pandas/_save_the_datasource_configuration_to_your_datacontext.mdx' + + + +import TestS3Datasource from 
'@site/docs/guides/connecting_to_your_data/cloud/s3/components_spark/_test_your_new_datasource.md'
+
+
+
+
+
+import PrepareABatchRequestAndValidatorForCreatingExpectations from '@site/docs/deployment_patterns/how_to_use_gx_with_aws/components/_expectation_suite_batch_request_validator_prepare_or_reuse.md'
+
+
+
+import CreateExpectationsInteractively from '@site/docs/deployment_patterns/how_to_use_gx_with_aws/components/_expectation_suite_add_expectations_with_validator.md'
+
+
+
+import SaveTheExpectationSuite from '@site/docs/deployment_patterns/how_to_use_gx_with_aws/components/_expectation_suite_save.md'
+
+
+
+
+
+import CheckpointCreateAndRun from '@site/docs/deployment_patterns/how_to_use_gx_with_aws/components/_checkpoint_create_and_run.md'
+
+
+
+import CreateCheckpoint from '@site/docs/deployment_patterns/how_to_use_gx_with_aws/components/_checkpoint_create_tabs.md'
+
+
+
+import SaveCheckpoint from '@site/docs/deployment_patterns/how_to_use_gx_with_aws/components/_checkpoint_save.md'
+
+
+
+import RunCheckpoint from '@site/docs/deployment_patterns/how_to_use_gx_with_aws/components/_checkpoint_run.md'
+
+
+import BuildAndViewDataDocs from '@site/docs/deployment_patterns/how_to_use_gx_with_aws/components/_data_docs_build_and_view.md'
+
+Great Expectations can work within many frameworks. In this guide you will be shown a workflow for using Great Expectations with AWS and cloud storage. You will configure a local Great Expectations project to store Expectations, Validation Results, and Data Docs in Amazon S3 buckets. You will further configure Great Expectations to use Spark and access data stored in another Amazon S3 bucket.
+
+This guide will demonstrate each of the steps necessary to go from installing a new instance of Great Expectations to Validating your data for the first time and viewing your Validation Results as Data Docs.
+
+
+
+- Installed Python 3. (Great Expectations requires Python 3. For details on how to download and install Python on your platform, see [python.org](https://www.python.org/downloads/)).
+- Installed the AWS CLI. (For guidance on how to install this, please see [Amazon's documentation on how to install the AWS CLI](https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html)).
+- Configured your AWS credentials. (For guidance on doing this, please see [Amazon's documentation on configuring the AWS CLI](https://docs.aws.amazon.com/cli/latest/userguide/cli-chap-configure.html)).
+- The ability to install Python packages ([`boto3`](https://github.com/boto/boto3) and `great_expectations`) with pip.
+- Identified the S3 bucket and prefix where Expectations and Validation Results will be stored. (An optional sanity check for confirming bucket access is sketched below.)
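If you would like to confirm that your AWS credentials and the bucket you identified are reachable from Python before you begin, the following optional sketch can help. It is illustrative only: `<YOUR_S3_BUCKET>` and `<YOUR_S3_PREFIX>` are placeholders for your own values, and the snippet is not part of the official workflow.

```python
import boto3

# Placeholders -- substitute the bucket and prefix you identified above.
bucket = "<YOUR_S3_BUCKET>"
prefix = "<YOUR_S3_PREFIX>"

# List a few objects to confirm that your credentials and permissions allow access.
s3 = boto3.client("s3")
response = s3.list_objects_v2(Bucket=bucket, Prefix=prefix, MaxKeys=5)
for obj in response.get("Contents", []):
    print(obj["Key"])
```

If this raises an authentication or access error, revisit the AWS CLI configuration steps above before continuing.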
+ + + +## Steps + +## Part 1: Setup + +### 1.1 Ensure that the AWS CLI is ready for use + +#### 1.1.1 Verify that the AWS CLI is installed + + +#### 1.1.2 Verify that your AWS credentials are properly configured + + +### 1.2 Prepare a local installation of Great Expectations and necessary dependencies + +#### 1.2.1 Verify that your Python version meets requirements + + + + +#### 1.2.2 Create a virtual environment for your Great Expectations project + + +#### 1.2.3 Ensure you have the latest version of pip + + +#### 1.2.4 Install boto3 + + +#### 1.2.5 Install Spark dependencies for S3 + + +#### 1.2.6 Install Great Expectations + + +#### 1.2.7 Verify that Great Expectations installed successfully + + +### 1.3 Create your Data Context + + +### 1.4 Configure your Expectations Store on Amazon S3 + +#### 1.4.1 Identify your Data Context Expectations Store + + +#### 1.4.2 Update your configuration file to include a new Store for Expectations on Amazon S3 + + +#### 1.4.3 Verify that the new Amazon S3 Expectations Store has been added successfully + + +#### 1.4.4 (Optional) Copy existing Expectation JSON files to the Amazon S3 bucket + + +#### 1.4.5 (Optional) Verify that copied Expectations can be accessed from Amazon S3 + + +### 1.5 Configure your Validation Results Store on Amazon S3 + +#### 1.5.1 Identify your Data Context's Validation Results Store + + +#### 1.5.2 Update your configuration file to include a new Store for Validation Results on Amazon S3 + + +#### 1.5.3 Verify that the new Amazon S3 Validation Results Store has been added successfully + + +#### 1.5.4 (Optional) Copy existing Validation results to the Amazon S3 bucket + + +### 1.6 Configure Data Docs for hosting and sharing from Amazon S3 + +#### 1.6.1 Create an Amazon S3 bucket for your Data Docs + + +#### 1.6.2 Configure your bucket policy to enable appropriate access + + +#### 1.6.3 Apply the access policy to your Data Docs' Amazon S3 bucket + + +#### 1.6.4 Add a new Amazon S3 site to the `data_docs_sites` section of your `great_expectations.yml` + + +#### 1.6.5 Test that your Data Docs configuration is correct by building the site + + +#### Additional notes on hosting Data Docs from an Amazon S3 bucket + + +## Part 2: Connect to data + +### 2.1 Choose how to run the code for creating a new Datasource + + +### 2.2 Instantiate your project's DataContext + + +### 2.3 Configure your Datasource + + +### 2.4 Save the Datasource configuration to your DataContext + + +### 2.5 Test your new Datasource + + +## Part 3: Create Expectations + +### 3.1: Prepare a Batch Request, empty Expectation Suite, and Validator + + + +### 3.2: Use a Validator to add Expectations to the Expectation Suite + + + +### 3.3: Save the Expectation Suite + + + +## Part 4: Validate Data + +### 4.1: Create and run a Checkpoint + + + +#### 4.1.1 Create a Checkpoint + + + +#### 4.1.2 Save the Checkpoint + + + +#### 4.1.3 Run the Checkpoint + + + +### 4.2: Build and view Data Docs + + + +## Congratulations! 
+ + \ No newline at end of file diff --git a/docs/guides/connecting_to_your_data/cloud/s3/components_spark/_configure_your_datasource.md b/docs/guides/connecting_to_your_data/cloud/s3/components_spark/_configure_your_datasource.md new file mode 100644 index 000000000000..569b491b1ea3 --- /dev/null +++ b/docs/guides/connecting_to_your_data/cloud/s3/components_spark/_configure_your_datasource.md @@ -0,0 +1,42 @@ +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +Using this example configuration, add in your S3 bucket and path to a directory that contains some of your data: + + + + + +```python file=../../../../../../tests/integration/docusaurus/connecting_to_your_data/cloud/s3/spark/inferred_and_runtime_yaml_example.py#L23-L42 +``` + +Run this code to test your configuration. + +```python file=../../../../../../tests/integration/docusaurus/connecting_to_your_data/cloud/s3/spark/inferred_and_runtime_yaml_example.py#L52 +``` + + + + + +```python file=../../../../../../tests/integration/docusaurus/connecting_to_your_data/cloud/s3/spark/inferred_and_runtime_python_example.py#L21-L42 +``` + +Run this code to test your configuration. + +```python file=../../../../../../tests/integration/docusaurus/connecting_to_your_data/cloud/s3/spark/inferred_and_runtime_python_example.py#L53 +``` + + + + + +If you specified an S3 path containing CSV files you will see them listed as `Available data_asset_names` in the output of `test_yaml_config()`. + +Feel free to adjust your configuration and re-run `test_yaml_config()` as needed. \ No newline at end of file diff --git a/docs/guides/connecting_to_your_data/cloud/s3/components_spark/_instantiate_your_projects_datacontext.md b/docs/guides/connecting_to_your_data/cloud/s3/components_spark/_instantiate_your_projects_datacontext.md new file mode 100644 index 000000000000..f4b634c59723 --- /dev/null +++ b/docs/guides/connecting_to_your_data/cloud/s3/components_spark/_instantiate_your_projects_datacontext.md @@ -0,0 +1,10 @@ +import SparkDataContextNote from '../../../components/spark_data_context_note.md' + +Import these necessary packages and modules. + +```python file=../../../../../../tests/integration/docusaurus/connecting_to_your_data/cloud/s3/spark/inferred_and_runtime_yaml_example.py#L3-L6 +``` + + + +Please proceed only after you have instantiated your `DataContext`. \ No newline at end of file diff --git a/docs/guides/connecting_to_your_data/cloud/s3/components_spark/_save_the_datasource_configuration_to_your_datacontext.md b/docs/guides/connecting_to_your_data/cloud/s3/components_spark/_save_the_datasource_configuration_to_your_datacontext.md new file mode 100644 index 000000000000..be8e5305b525 --- /dev/null +++ b/docs/guides/connecting_to_your_data/cloud/s3/components_spark/_save_the_datasource_configuration_to_your_datacontext.md @@ -0,0 +1,28 @@ +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +Save the configuration into your `DataContext` by using the `add_datasource()` function. 
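Whichever tab you follow below, the call boils down to unpacking your Datasource configuration into `add_datasource()`. The following is a minimal, hypothetical sketch, not the exact snippet referenced in the tabs: `datasource_config` stands in for the configuration you built in the previous step (on the YAML tab, the configuration string is parsed into a dictionary first), and the bucket, prefix, and regex values are placeholders.

```python
import great_expectations as gx

context = gx.get_context()

# Stand-in for the configuration built in the previous step -- adjust the bucket,
# prefix, and regex to match your own data.
datasource_config = {
    "name": "my_s3_datasource",
    "class_name": "Datasource",
    "execution_engine": {"class_name": "SparkDFExecutionEngine"},
    "data_connectors": {
        "default_inferred_data_connector_name": {
            "class_name": "InferredAssetS3DataConnector",
            "bucket": "<YOUR_S3_BUCKET>",
            "prefix": "<YOUR_S3_PREFIX>",
            "default_regex": {"pattern": "(.*)\\.csv", "group_names": ["data_asset_name"]},
        }
    },
}

# add_datasource() accepts the configuration as keyword arguments and persists it
# to your project's great_expectations.yml.
context.add_datasource(**datasource_config)
```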
+ + + + + +```python file=../../../../../../tests/integration/docusaurus/connecting_to_your_data/cloud/s3/spark/inferred_and_runtime_yaml_example.py#L54 +``` + + + + + +```python file=../../../../../../tests/integration/docusaurus/connecting_to_your_data/cloud/s3/spark/inferred_and_runtime_python_example.py#L55 +``` + + + + \ No newline at end of file diff --git a/docs/guides/connecting_to_your_data/cloud/s3/components_spark/_test_your_new_datasource.md b/docs/guides/connecting_to_your_data/cloud/s3/components_spark/_test_your_new_datasource.md new file mode 100644 index 000000000000..2fcdaed14521 --- /dev/null +++ b/docs/guides/connecting_to_your_data/cloud/s3/components_spark/_test_your_new_datasource.md @@ -0,0 +1,46 @@ +import TabItem from '@theme/TabItem'; +import Tabs from '@theme/Tabs'; +import TechnicalTag from '@site/docs/term_tags/_tag.mdx'; + +Verify your new by loading data from it into a using a . + + + + + +Add the S3 path to your CSV in the `path` key under `runtime_parameters` in your `RuntimeBatchRequest`. + +:::tip +The path you will want to use is your S3 URI, not the URL. +::: + +```python file=../../../../../../tests/integration/docusaurus/connecting_to_your_data/cloud/s3/spark/inferred_and_runtime_yaml_example.py#L57-L63 +``` + +Then load data into the `Validator`. + +```python file=../../../../../../tests/integration/docusaurus/connecting_to_your_data/cloud/s3/spark/inferred_and_runtime_yaml_example.py#L71-L77 +``` + + + + + +Add the name of the to the `data_asset_name` in your `BatchRequest`. + +```python file=../../../../../../tests/integration/docusaurus/connecting_to_your_data/cloud/s3/spark/inferred_and_runtime_yaml_example.py#L83-L88 +``` + +Then load data into the `Validator`. + +```python file=../../../../../../tests/integration/docusaurus/connecting_to_your_data/cloud/s3/spark/inferred_and_runtime_yaml_example.py#L96-L102 +``` + + + + \ No newline at end of file diff --git a/docs/guides/connecting_to_your_data/cloud/s3/spark.md b/docs/guides/connecting_to_your_data/cloud/s3/spark.md index 05ed4d2cd4e0..62d62fd4fef6 100644 --- a/docs/guides/connecting_to_your_data/cloud/s3/spark.md +++ b/docs/guides/connecting_to_your_data/cloud/s3/spark.md @@ -4,9 +4,13 @@ title: How to connect to data on S3 using Spark import NextSteps from '../../components/next_steps.md' import Congratulations from '../../components/congratulations.md' -import Prerequisites from '../../components/prerequisites.jsx' +import Prerequisites from '@site/docs/components/_prerequisites.jsx' import WhereToRunCode from '../../components/where_to_run_code.md' -import SparkDataContextNote from '../../components/spark_data_context_note.md' +import InstantiateYourProjectSDatacontext from './components_spark/_instantiate_your_projects_datacontext.md' +import ConfigureYourDatasource from './components_spark/_configure_your_datasource.md' +import SaveTheDatasourceConfigurationToYourDatacontext from './components_spark/_save_the_datasource_configuration_to_your_datacontext.md' +import TestYourNewDatasource from './components_spark/_test_your_new_datasource.md' + import Tabs from '@theme/Tabs'; import TabItem from '@theme/TabItem'; import TechnicalTag from '@site/docs/term_tags/_tag.mdx'; @@ -16,8 +20,8 @@ This will allow you to and exp -- Have access to data on an AWS S3 bucket -- Have access to a working Spark installation +- Access to data on an AWS S3 bucket +- Access to a working Spark installation @@ -29,125 +33,19 @@ This will allow you to and exp ### 2. 
Instantiate your project's DataContext -Import these necessary packages and modules. - -```python file=../../../../../tests/integration/docusaurus/connecting_to_your_data/cloud/s3/spark/inferred_and_runtime_yaml_example.py#L3-L6 -``` - - - -Please proceed only after you have instantiated your `DataContext`. + ### 3. Configure your Datasource -Using this example configuration, add in your S3 bucket and path to a directory that contains some of your data: - - - - - -```python file=../../../../../tests/integration/docusaurus/connecting_to_your_data/cloud/s3/spark/inferred_and_runtime_yaml_example.py#L23-L42 -``` - -Run this code to test your configuration. - -```python file=../../../../../tests/integration/docusaurus/connecting_to_your_data/cloud/s3/spark/inferred_and_runtime_yaml_example.py#L52 -``` - - - - - -```python file=../../../../../tests/integration/docusaurus/connecting_to_your_data/cloud/s3/spark/inferred_and_runtime_python_example.py#L21-L42 -``` - -Run this code to test your configuration. - -```python file=../../../../../tests/integration/docusaurus/connecting_to_your_data/cloud/s3/spark/inferred_and_runtime_python_example.py#L53 -``` - - - - - -If you specified an S3 path containing CSV files you will see them listed as `Available data_asset_names` in the output of `test_yaml_config()`. - -Feel free to adjust your configuration and re-run `test_yaml_config()` as needed. + ### 4. Save the Datasource configuration to your DataContext -Save the configuration into your `DataContext` by using the `add_datasource()` function. - - - - - -```python file=../../../../../tests/integration/docusaurus/connecting_to_your_data/cloud/s3/spark/inferred_and_runtime_yaml_example.py#L54 -``` - - - - - -```python file=../../../../../tests/integration/docusaurus/connecting_to_your_data/cloud/s3/spark/inferred_and_runtime_python_example.py#L55 -``` - - - - + ### 5. Test your new Datasource -Verify your new by loading data from it into a using a . - - - - - -Add the S3 path to your CSV in the `path` key under `runtime_parameters` in your `RuntimeBatchRequest`. - -```python file=../../../../../tests/integration/docusaurus/connecting_to_your_data/cloud/s3/spark/inferred_and_runtime_yaml_example.py#L57-L63 -``` - -Then load data into the `Validator`. - -```python file=../../../../../tests/integration/docusaurus/connecting_to_your_data/cloud/s3/spark/inferred_and_runtime_yaml_example.py#L71-L77 -``` - - - - - -Add the name of the to the `data_asset_name` in your `BatchRequest`. - -```python file=../../../../../tests/integration/docusaurus/connecting_to_your_data/cloud/s3/spark/inferred_and_runtime_yaml_example.py#L83-L88 -``` - -Then load data into the `Validator`. 
- -```python file=../../../../../tests/integration/docusaurus/connecting_to_your_data/cloud/s3/spark/inferred_and_runtime_yaml_example.py#L96-L102 -``` - - - - + diff --git a/sidebars.js b/sidebars.js index 6f507485c531..cd4f3d27c08a 100644 --- a/sidebars.js +++ b/sidebars.js @@ -282,6 +282,7 @@ module.exports = { items: [ 'deployment_patterns/how_to_use_great_expectations_in_aws_glue', { label: 'How to use Great Expectations with AWS using S3 and Pandas', type: 'doc', id: 'deployment_patterns/how_to_use_gx_with_aws/how_to_use_gx_with_aws_using_cloud_storage_and_pandas' }, + { label: 'How to use Great Expectations with AWS using S3 and Spark', type: 'doc', id: 'deployment_patterns/how_to_use_gx_with_aws/how_to_use_gx_with_aws_using_s3_and_spark' }, { label: 'How to use Great Expectations with AWS using Athena', type: 'doc', id: 'deployment_patterns/how_to_use_gx_with_aws/how_to_use_gx_with_aws_using_athena' } ] },