[FEATURE] Implementing Python code snippets under test for "https://docs.greatexpectations.io/docs/guides/connecting_to_your_data/fluent/filesystem/how_to_connect_to_one_or_more_files_using_spark" #7927

Merged
Changes from all commits (76 commits)
04541c5
Implementing Python code snippets under test for "https://docs.greate…
May 16, 2023
62837fa
Merge branch 'develop' into feature/DX-469/DX-441/alexsherstinsky/lin…
May 16, 2023
5720118
Implementing Python code snippets under test for "https://docs.greate…
May 16, 2023
26b5b1c
Update docs/docusaurus/docs/guides/connecting_to_your_data/fluent/dat…
alexsherstinsky May 16, 2023
18dfd99
Update docs/docusaurus/docs/components/connect_to_data/filesystem/_ti…
alexsherstinsky May 16, 2023
e80621f
Update docs/docusaurus/docs/components/connect_to_data/filesystem/_ti…
alexsherstinsky May 16, 2023
32bdf0e
simplify get_context() call
May 16, 2023
27cc017
Merge remote-tracking branch 'upstream/feature/DX-469/DX-441/alexsher…
May 16, 2023
7c912b7
Merge develop into feature/DX-469/DX-441/alexsherstinsky/link/docusau…
github-actions[bot] May 16, 2023
dbd7f0d
Implementing Python code snippets under test for "https://docs.greate…
May 16, 2023
5640ba2
Merge branch 'develop' into feature/DX-469/DX-441/alexsherstinsky/lin…
May 16, 2023
3f4fabf
Merge branch 'develop' into feature/DX-469/DX-441/alexsherstinsky/lin…
May 16, 2023
a677d4e
cleanup
May 16, 2023
06f3483
Merge branch 'feature/DX-469/DX-441/alexsherstinsky/link/docusaurus_d…
May 16, 2023
e529c14
clean up
May 16, 2023
b604534
Merge branch 'feature/DX-469/DX-441/alexsherstinsky/link/docusaurus_d…
May 16, 2023
cb6c1b8
clean up
May 16, 2023
17eb28d
clean up
May 16, 2023
6f88cca
merge
May 17, 2023
252e55a
Make SQL splitter API public.
May 17, 2023
498ca8d
Merge branch 'develop' into feature/DX-469/DX-441/alexsherstinsky/lin…
May 17, 2023
7f96cbe
Make all Fluent Datasource/DataAsset splitter API public.
May 17, 2023
b72b9ad
Update docs/docusaurus/docs/guides/connecting_to_your_data/fluent/dat…
alexsherstinsky May 17, 2023
6a0b9d8
Update great_expectations/datasource/fluent/sqlite_datasource.py
alexsherstinsky May 17, 2023
775859d
Update great_expectations/datasource/fluent/sqlite_datasource.py
alexsherstinsky May 17, 2023
fb3f360
Merge develop into feature/DX-469/DX-441/alexsherstinsky/link/docusau…
github-actions[bot] May 17, 2023
f2a4bbb
Merge develop into feature/DX-469/DX-441/alexsherstinsky/link/docusau…
github-actions[bot] May 17, 2023
94c7298
Merge develop into feature/DX-469/DX-441/alexsherstinsky/link/docusau…
github-actions[bot] May 17, 2023
19e0c13
Merge develop into feature/DX-469/DX-441/alexsherstinsky/link/docusau…
github-actions[bot] May 17, 2023
9ddc7fd
Merge develop into feature/DX-469/DX-441/alexsherstinsky/link/docusau…
github-actions[bot] May 17, 2023
973e208
Merge develop into feature/DX-469/DX-441/alexsherstinsky/link/docusau…
github-actions[bot] May 17, 2023
57cea5e
Merge develop into feature/DX-469/DX-441/alexsherstinsky/link/docusau…
github-actions[bot] May 17, 2023
935671d
Merge branch 'develop' into feature/DX-469/DX-441/alexsherstinsky/lin…
May 17, 2023
526bd2b
Merge remote-tracking branch 'upstream/feature/DX-469/DX-441/alexsher…
May 17, 2023
169d348
lint
May 17, 2023
0859241
Implementing Python code snippets under test for "https://docs.greate…
May 17, 2023
1e8722f
Merge branch 'develop' into feature/DX-469/DX-441/alexsherstinsky/lin…
May 17, 2023
1a1e599
Implementing Python code snippets under test for "https://docs.greate…
May 17, 2023
290fbdf
Implementing Python code snippets under test for "https://docs.greate…
May 17, 2023
c1a9c4a
Implementing Python code snippets under test for "https://docs.greate…
May 17, 2023
f99c707
Implementing Python code snippets under test for "https://docs.greate…
May 17, 2023
f368ce5
Implementing Python code snippets under test for "https://docs.greate…
May 17, 2023
41159e2
cleanup
May 18, 2023
9e59dfe
Merge branch 'develop' into feature/DX-469/DX-441/alexsherstinsky/lin…
May 18, 2023
7161960
Merge branch 'develop' into feature/DX-469/DX-441/alexsherstinsky/lin…
May 18, 2023
d72371b
Implementing Python code snippets under test for "https://docs.greate…
May 18, 2023
31641ab
Merge branch 'develop' into feature/DX-469/DX-441/alexsherstinsky/lin…
May 18, 2023
673930c
Merge branch 'develop' into feature/DX-469/DX-441/alexsherstinsky/lin…
May 18, 2023
a29800e
Add "batch.columns()" convenience method to Fluent DataAsset implemen…
May 18, 2023
281acaf
Merge branch 'feature/DX-469/DX-441/alexsherstinsky/link/docusaurus_d…
May 18, 2023
56123a2
Merge branch 'develop' into feature/DX-469/DX-441/alexsherstinsky/lin…
May 18, 2023
f1b962f
Merge branch 'develop' into feature/DX-469/DX-441/alexsherstinsky/lin…
May 18, 2023
8489bdd
public_api
May 18, 2023
da7a0cc
Merge branch 'feature/DX-469/DX-441/alexsherstinsky/link/docusaurus_d…
May 18, 2023
7d54063
Merge branch 'develop' into feature/DX-469/DX-441/alexsherstinsky/lin…
May 18, 2023
061b9f9
merge
May 18, 2023
37e497e
remove PANDAS dependency -- it is a given
May 18, 2023
27768d5
Merge remote-tracking branch 'upstream/develop' into feature/DX-469/D…
May 18, 2023
28d9255
Merge branch 'develop' into feature/DX-469/DX-441/alexsherstinsky/lin…
May 18, 2023
19ee08b
Merge branch 'feature/DX-469/DX-441/alexsherstinsky/link/docusaurus_d…
May 18, 2023
421985d
public API
May 18, 2023
8e9ebc2
public API
May 18, 2023
72b6ab8
public API
May 18, 2023
4ef1bb0
get_context correction
May 18, 2023
33ace8a
Merge branch 'develop' into feature/DX-469/DX-441/alexsherstinsky/lin…
May 18, 2023
2e375f5
get_context correction
May 18, 2023
e5d0da1
Merge branch 'develop' into feature/DX-469/DX-441/alexsherstinsky/lin…
May 18, 2023
cf5d1bb
docstrings
May 18, 2023
6d79f2d
Merge branch 'develop' into feature/DX-469/DX-441/alexsherstinsky/lin…
May 18, 2023
a398c7c
public API exclusions
May 18, 2023
d8763e5
Merge branch 'develop' into feature/DX-469/DX-441/alexsherstinsky/lin…
May 18, 2023
2a52e95
Update docs/docusaurus/docs/guides/connecting_to_your_data/fluent/fil…
alexsherstinsky May 18, 2023
c3c7b4b
docstrings
May 18, 2023
1d1bc4f
Merge remote-tracking branch 'upstream/feature/DX-469/DX-441/alexsher…
May 18, 2023
23f110f
typo
May 18, 2023
8f6415f
Merge develop into feature/DX-469/DX-441/alexsherstinsky/link/docusau…
github-actions[bot] May 18, 2023
@@ -50,17 +50,14 @@ A Filesystem Datasource can be created with two pieces of information:

In our example, we will define these in advance by storing them in the Python variables `datasource_name` and `path_to_folder_containing_csv_files`:

-```python title="Python code"
-datasource_name = "MyNewDatasource"
-path_to_folder_containing_csv_files = "../taxi_data"
+```python name="tests/integration/docusaurus/connecting_to_your_data/fluent_datasources/how_to_connect_to_one_or_more_files_using_spark.py define_add_spark_filesystem_args"
```

<InfoFilesystemDatasourceRelativeBasePaths />

Once we have determined our `name` and `base_directory`, we pass them in as parameters when we create our Datasource:

-```python title = "Python code"
-datasource = context.sources.add_spark_filesystem(name=datasource_name, base_path=path_to_folder_containing_csv_files)
+```python name="tests/integration/docusaurus/connecting_to_your_data/fluent_datasources/how_to_connect_to_one_or_more_files_using_spark.py create_datasource"
```

<TipFilesystemDatasourceNestedSourceDataFolders />
@@ -75,19 +72,15 @@ A Data Asset requires two pieces of information to be defined:

For this example, we will define these two values in advance by storing them in the Python variables `asset_name` and (since we are connecting to NYC taxi data in this example) `batching_regex`:

-```python title="Python code"
-name = "my_taxi_data_asset"
-batching_regex = "yellow_tripdata_sample_2023_01\.csv"
+```python name="tests/integration/docusaurus/connecting_to_your_data/fluent_datasources/how_to_connect_to_one_or_more_files_using_spark.py define_add_csv_asset_args"
```

-Once we have determined those two values, we will pass them in as parameters when we create our Data Asset:
+In addition, the argument `header` informs the Spark `DataFrame` reader that the files contain a header column, while the argument `infer_schema` instructs the Spark `DataFrame` reader to make a best effort to determine the schema of the columns automatically.

-```python title="Python code"
-data_asset = datasource.add_csv_asset(
-    name=name, batching_regex=batching_regex
-)
-```
+Once we have determined those two values as well as the optional `header` and `infer_schema` arguments, we will pass them in as parameters when we create our Data Asset:
+
+```python name="tests/integration/docusaurus/connecting_to_your_data/fluent_datasources/how_to_connect_to_one_or_more_files_using_spark.py add_asset"
+```

### 4. Repeat step 3 as needed to add additional files as Data Assets

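For reference, the named snippet fences above are populated at docs-build time from the test script added later in this PR. Assembled into one runnable sketch (the folder path is the docs placeholder, not a real path):

```python
import great_expectations as gx

context = gx.get_context()

# Values from the "define_add_spark_filesystem_args" snippet; the path is a placeholder.
datasource_name = "my_new_datasource"
path_to_folder_containing_csv_files = "<INSERT_PATH_TO_FILES_HERE>"

# "create_datasource" snippet.
datasource = context.sources.add_spark_filesystem(
    name=datasource_name, base_directory=path_to_folder_containing_csv_files
)

# "define_add_csv_asset_args" and "add_asset" snippets.
asset_name = "my_taxi_data_asset"
batching_regex = r"yellow_tripdata_sample_(?P<year>\d{4})-(?P<month>\d{2}).csv"
datasource.add_csv_asset(
    name=asset_name, batching_regex=batching_regex, header=True, infer_schema=True
)
```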
12 changes: 12 additions & 0 deletions docs/sphinx_api_docs_source/public_api_excludes.py
@@ -687,4 +687,16 @@
"great_expectations/datasource/fluent/serializable_types/pyspark.py"
),
),
IncludeExcludeDefinition(
reason='The "columns()" property in this module is not included in the public API',
name="columns",
alexsherstinsky marked this conversation as resolved.
Show resolved Hide resolved
filepath=pathlib.Path("great_expectations/datasource/fluent/sql_datasource.py"),
),
IncludeExcludeDefinition(
reason='The "columns()" property in this module is not included in the public API',
name="columns",
filepath=pathlib.Path(
"great_expectations/datasource/fluent/spark_generic_splitters.py"
),
),
]
20 changes: 18 additions & 2 deletions great_expectations/validator/metrics_calculator.py
@@ -5,6 +5,7 @@

import pandas as pd

+from great_expectations.core._docs_decorators import public_api
from great_expectations.validator.computed_metric import MetricValue # noqa: TCH001
from great_expectations.validator.exception_info import ExceptionInfo # noqa: TCH001
from great_expectations.validator.metric_configuration import MetricConfiguration
@@ -41,9 +42,16 @@ def show_progress_bars(self) -> bool:
    def show_progress_bars(self, enable: bool) -> None:
        self._show_progress_bars = enable

+    @public_api
    def columns(self, domain_kwargs: Optional[Dict[str, Any]] = None) -> List[str]:
        """
        Convenience method to run "table.columns" metric.
+
+        Arguments:
+            domain_kwargs: Optional dictionary of domain kwargs (e.g., containing "batch_id").
+
+        Returns:
+            The list of Batch columns.
        """
        if domain_kwargs is None:
            domain_kwargs = {}
@@ -62,14 +70,22 @@ def columns(self, domain_kwargs: Optional[Dict[str, Any]] = None) -> List[str]:

        return columns

+    @public_api
    def head(
        self,
        n_rows: int = 5,
        domain_kwargs: Optional[Dict[str, Any]] = None,
        fetch_all: bool = False,
    ) -> pd.DataFrame:
-        """
-        Convenience method to run "table.head" metric.
+        """Convenience method to return the first several rows or records from a Batch of data.
+
+        Args:
+            n_rows: The number of rows to return.
+            domain_kwargs: If provided, the domain for which to return records.
+            fetch_all: If True, ignore n_rows and return the entire batch.
+
+        Returns:
+            A Pandas DataFrame containing the records' data.
        """
        if domain_kwargs is None:
            domain_kwargs = {}
18 changes: 8 additions & 10 deletions great_expectations/validator/validator.py
@@ -328,8 +328,6 @@ def get_metric(
    ) -> Any:
        """Convenience method, return the value of the requested metric.

-        (To be deprecated in favor of using methods in "MetricsCalculator" class.)
-
        Args:
            metric: MetricConfiguration
@@ -345,8 +343,6 @@ def get_metrics(
        """
        Convenience method that resolves requested metrics (specified as dictionary, keyed by MetricConfiguration ID).

-        (To be deprecated in favor of using methods in "MetricsCalculator" class.)
-
        Args:
            metrics: Dictionary of desired metrics to be resolved; metric_name is key and MetricConfiguration is value.

@@ -365,8 +361,6 @@ def compute_metrics(
        """
        Convenience method that computes requested metrics (specified as elements of "MetricConfiguration" list).

-        (To be deprecated in favor of using methods in "MetricsCalculator" class.)
-
        Args:
            metric_configurations: List of desired MetricConfiguration objects to be resolved.
            runtime_configuration: Additional run-time settings (see "Validator.DEFAULT_RUNTIME_CONFIGURATION").
@@ -381,11 +375,15 @@
            min_graph_edges_pbar_enable=min_graph_edges_pbar_enable,
        )

+    @public_api
    def columns(self, domain_kwargs: Optional[Dict[str, Any]] = None) -> List[str]:
-        """
-        Convenience method to obtain Batch columns.
+        """Convenience method to obtain Batch columns.

+        Arguments:
+            domain_kwargs: Optional dictionary of domain kwargs (e.g., containing "batch_id").
+
-        (To be deprecated in favor of using methods in "MetricsCalculator" class.)
+        Returns:
+            The list of Batch columns.
        """
        return self._metrics_calculator.columns(domain_kwargs=domain_kwargs)

@@ -396,7 +394,7 @@ def head(
        domain_kwargs: Optional[Dict[str, Any]] = None,
        fetch_all: bool = False,
    ) -> pd.DataFrame:
-        """Return the first several rows or records from a Batch of data.
+        """Convenience method to return the first several rows or records from a Batch of data.

        Args:
            n_rows: The number of rows to return.
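Since `columns()` and `head()` are now decorated with `@public_api`, both can be called directly on a `Validator`. A minimal usage sketch follows; the helper function below is hypothetical and not part of this diff, and only the two method calls are taken from it:

```python
from great_expectations.validator.validator import Validator


def inspect_batch(validator: Validator) -> None:
    """Hypothetical helper: print a Batch's column names and a small preview."""
    print(validator.columns())  # resolves the "table.columns" metric
    print(validator.head(n_rows=3))  # Pandas DataFrame with the first 3 records
```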
@@ -52,4 +52,30 @@
# </snippet>

assert datasource.get_asset_names() == {"my_taxi_data_asset"}
-assert datasource.get_asset(asset_name).name == "my_taxi_data_asset"
+
+my_asset = datasource.get_asset(asset_name)
+assert my_asset
+
+my_batch_request = my_asset.build_batch_request({"year": "2019", "month": "03"})
+batches = my_asset.get_batch_list_from_batch_request(my_batch_request)
+assert len(batches) == 1
+assert set(batches[0].columns()) == {
+    "vendor_id",
+    "pickup_datetime",
+    "dropoff_datetime",
+    "passenger_count",
+    "trip_distance",
+    "rate_code_id",
+    "store_and_fwd_flag",
+    "pickup_location_id",
+    "dropoff_location_id",
+    "payment_type",
+    "fare_amount",
+    "extra",
+    "mta_tax",
+    "tip_amount",
+    "tolls_amount",
+    "improvement_surcharge",
+    "total_amount",
+    "congestion_surcharge",
+}
83 changes: 83 additions & 0 deletions tests/integration/docusaurus/connecting_to_your_data/fluent_datasources/how_to_connect_to_one_or_more_files_using_spark.py
@@ -0,0 +1,83 @@
"""
To run this code as a local test, use the following console command:
```
pytest -v --docs-tests -m integration -k "how_to_connect_to_one_or_more_files_using_spark" tests/integration/test_script_runner.py
```
"""
import pathlib


# Python
# <snippet name="tests/integration/docusaurus/connecting_to_your_data/fluent_datasources/how_to_connect_to_one_or_more_files_using_spark.py get_context">
import great_expectations as gx

context = gx.get_context()
# </snippet>

# Python
# <snippet name="tests/integration/docusaurus/connecting_to_your_data/fluent_datasources/how_to_connect_to_one_or_more_files_using_spark.py define_add_spark_filesystem_args">
datasource_name = "my_new_datasource"
path_to_folder_containing_csv_files = "<INSERT_PATH_TO_FILES_HERE>"
# </snippet>

path_to_folder_containing_csv_files = str(
    pathlib.Path(
        gx.__file__,
        "..",
        "..",
        "tests",
        "test_sets",
        "taxi_yellow_tripdata_samples",
    ).resolve(strict=True)
)

# Python
# <snippet name="tests/integration/docusaurus/connecting_to_your_data/fluent_datasources/how_to_connect_to_one_or_more_files_using_spark.py create_datasource">
datasource = context.sources.add_spark_filesystem(
    name=datasource_name, base_directory=path_to_folder_containing_csv_files
)
# </snippet>

assert datasource_name in context.datasources

# Python
# <snippet name="tests/integration/docusaurus/connecting_to_your_data/fluent_datasources/how_to_connect_to_one_or_more_files_using_spark.py define_add_csv_asset_args">
asset_name = "my_taxi_data_asset"
batching_regex = r"yellow_tripdata_sample_(?P<year>\d{4})-(?P<month>\d{2}).csv"
# </snippet>

# Python
# <snippet name="tests/integration/docusaurus/connecting_to_your_data/fluent_datasources/how_to_connect_to_one_or_more_files_using_spark.py add_asset">
datasource.add_csv_asset(
    name=asset_name, batching_regex=batching_regex, header=True, infer_schema=True
)
# </snippet>

assert datasource.get_asset_names() == {"my_taxi_data_asset"}

my_asset = datasource.get_asset(asset_name)
assert my_asset

my_batch_request = my_asset.build_batch_request({"year": "2019", "month": "03"})
batches = my_asset.get_batch_list_from_batch_request(my_batch_request)
assert len(batches) == 1
assert set(batches[0].columns()) == {
    "vendor_id",
    "pickup_datetime",
    "dropoff_datetime",
    "passenger_count",
    "trip_distance",
    "rate_code_id",
    "store_and_fwd_flag",
    "pickup_location_id",
    "dropoff_location_id",
    "payment_type",
    "fare_amount",
    "extra",
    "mta_tax",
    "tip_amount",
    "tolls_amount",
    "improvement_surcharge",
    "total_amount",
    "congestion_surcharge",
}
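Because `batching_regex` captures `year` and `month` as named groups, other files in the sample folder can be requested by passing different option values. A sketch assuming a 2019-02 sample file also exists in the test data set:

```python
# Hypothetical follow-on to the script above: request a different month.
batch_request_feb = my_asset.build_batch_request({"year": "2019", "month": "02"})
batches_feb = my_asset.get_batch_list_from_batch_request(batch_request_feb)
print(len(batches_feb), [batch.columns() for batch in batches_feb])
```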
@@ -16,7 +16,7 @@
# <snippet name="tests/integration/docusaurus/reference/glossary/batch_request batch_request">
import great_expectations as gx

-context = gx.data_context.FileDataContext.create(full_path_to_project_directory)
+context = gx.get_context()

# data_directory is the full path to a directory containing csv files
datasource = context.sources.add_pandas_filesystem(
10 changes: 7 additions & 3 deletions tests/integration/test_script_runner.py
@@ -323,7 +323,6 @@
        user_flow_script="tests/integration/docusaurus/connecting_to_your_data/fluent_datasources/get_existing_data_asset_from_existing_datasource_pandas_filesystem_example.py",
        data_context_dir="tests/integration/fixtures/no_datasources/great_expectations",
        data_dir="tests/test_sets/taxi_yellow_tripdata_samples/first_3_files",
-        backend_dependencies=[BackendDependencies.PANDAS],
    ),
    IntegrationTestFixture(
        name="checkpoints_glossary",
@@ -336,7 +335,6 @@
        user_flow_script="tests/integration/docusaurus/connecting_to_your_data/fluent_datasources/organize_batches_in_pandas_filesystem_datasource.py",
        data_context_dir="tests/integration/fixtures/no_datasources/great_expectations",
        data_dir="tests/test_sets/taxi_yellow_tripdata_samples/first_3_files",
-        backend_dependencies=[BackendDependencies.PANDAS],
    ),
    IntegrationTestFixture(
        name="how_to_organize_batches_in_a_sql_based_data_asset",
@@ -348,7 +346,13 @@
        user_flow_script="tests/integration/docusaurus/connecting_to_your_data/fluent_datasources/how_to_connect_to_one_or_more_files_using_pandas.py",
        data_context_dir="tests/integration/fixtures/no_datasources/great_expectations",
        data_dir="tests/test_sets/taxi_yellow_tripdata_samples/first_3_files",
-        backend_dependencies=[BackendDependencies.PANDAS],
    ),
+    IntegrationTestFixture(
+        name="how_to_connect_to_one_or_more_files_using_spark",
+        user_flow_script="tests/integration/docusaurus/connecting_to_your_data/fluent_datasources/how_to_connect_to_one_or_more_files_using_spark.py",
+        data_context_dir="tests/integration/fixtures/no_datasources/great_expectations",
+        data_dir="tests/test_sets/taxi_yellow_tripdata_samples/first_3_files",
+        backend_dependencies=[BackendDependencies.SPARK],
+    ),
]
