[FEATURE] Implementing Python code snippets under test for "https://d…
alexsherstinsky committed May 19, 2023
1 parent ff3e257 commit 421441c
Showing 5 changed files with 82 additions and 23 deletions.
@@ -41,10 +41,7 @@ We can define an S3 Datasource by providing three pieces of information:
- `bucket_name`: The name of our S3 bucket
- `boto3_options`: We can provide various additional options here, but in this example we will leave this empty and use the default values.

-```python title="Python code"
-datasource_name = "my_s3_datasource"
-bucket_name = "my_bucket"
-boto3_options = {}
+```python name="tests/integration/docusaurus/connecting_to_your_data/fluent_datasources/how_to_connect_to_data_on_s3_using_pandas.py define_add_pandas_s3_args"
```
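
For readers of this diff: the named snippet now lives in the test script added later in this commit, and it defines essentially the same values as the removed inline block (the test later swaps `bucket_name` for a real test bucket):

```python
datasource_name = "my_s3_datasource"
bucket_name = "my_bucket"
boto3_options = {}
```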

:::tip What can `boto3_options` specify?
@@ -57,20 +54,12 @@ The parameter `boto3_options` will allow you to pass such things as:

Once we have those three elements, we can define our Datasource like so:

-```python title="Python code"
-datasource = context.sources.add_pandas_s3(
-    name=datasource_name, bucket=bucket_name, boto3_options=boto3_options
-)
+```python name="tests/integration/docusaurus/connecting_to_your_data/fluent_datasources/how_to_connect_to_data_on_s3_using_pandas.py create_datasource"
```
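
Likewise, the `create_datasource` snippet referenced above resolves, in the test script added by this commit, to:

```python
datasource = context.sources.add_pandas_s3(
    name=datasource_name, bucket=bucket_name, boto3_options=boto3_options
)
```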

### 3. Add S3 data to the Datasource as a Data Asset

-```python title="Python code"
-batching_regex = r"data/taxi_yellow_tripdata_samples/yellow_tripdata_sample_(?P<year>\d{4})-(?P<month>\d{2})\.csv"
-s3_prefix = "data/taxi_yellow_tripdata_samples/"
-data_asset = datasource.add_csv_asset(
-    name="my_taxi_data_asset", batching_regex=batching_regex, s3_prefix=s3_prefix
-)
+```python name="tests/integration/docusaurus/connecting_to_your_data/fluent_datasources/how_to_connect_to_data_on_s3_using_pandas.py add_asset"
```
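
The `add_asset` snippet differs slightly from the removed inline example: the bucket prefix is now supplied through `s3_prefix`, so `batching_regex` no longer repeats the `data/taxi_yellow_tripdata_samples/` path. From the test script added by this commit:

```python
asset_name = "my_taxi_data_asset"
s3_prefix = "data/taxi_yellow_tripdata_samples/"
batching_regex = r"yellow_tripdata_sample_(?P<year>\d{4})-(?P<month>\d{2}).csv"
data_asset = datasource.add_csv_asset(
    name=asset_name, batching_regex=batching_regex, s3_prefix=s3_prefix
)
```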

<BatchingRegexExplaination storage_location_type="S3 bucket" />
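
As a usage sketch, mirroring the assertions in the new test script, the named groups in `batching_regex` become batch request options:

```python
my_batch_request = data_asset.build_batch_request({"year": "2019", "month": "03"})
batches = data_asset.get_batch_list_from_batch_request(my_batch_request)
assert len(batches) == 1  # exactly one CSV matches year=2019, month=03
```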
2 changes: 1 addition & 1 deletion tests/datasource/fluent/test_pandas_s3_datasource.py
@@ -13,6 +13,7 @@
from pytest import param

import great_expectations.exceptions as ge_exceptions
+from great_expectations.compatibility import aws
from great_expectations.core.util import S3Url
from great_expectations.datasource.fluent import PandasS3Datasource
from great_expectations.datasource.fluent.data_asset.data_connector import (
@@ -33,7 +34,6 @@

logger = logging.getLogger(__file__)

-from great_expectations.compatibility import aws

# apply markers to entire test module
pytestmark = [
11 changes: 3 additions & 8 deletions tests/datasource/fluent/test_spark_s3_datasource.py
@@ -10,6 +10,7 @@
from moto import mock_s3

import great_expectations.exceptions as ge_exceptions
+from great_expectations.compatibility import aws
from great_expectations.core.util import S3Url
from great_expectations.datasource.fluent import SparkS3Datasource
from great_expectations.datasource.fluent.data_asset.data_connector import (
@@ -28,13 +29,6 @@
logger = logging.getLogger(__file__)


-try:
-    import boto3
-except ImportError:
-    logger.debug("Unable to load boto3; install optional boto3 dependency for support.")
-    boto3 = None
-
-
@pytest.fixture()
def aws_region_name() -> str:
return "us-east-1"
@@ -54,10 +48,11 @@ def aws_credentials() -> None:
os.environ["AWS_SESSION_TOKEN"] = "testing"


+@pytest.mark.skipif(not aws.boto3)
@pytest.fixture
def s3_mock(aws_credentials, aws_region_name: str) -> BaseClient:
    with mock_s3():
-        client = boto3.client("s3", region_name=aws_region_name)
+        client = aws.boto3.client("s3", region_name=aws_region_name)
        yield client


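The deleted `try`/`except` import above is what `great_expectations.compatibility.aws` now centralizes, so tests can check `aws.boto3` for availability (as the new `skipif` marker does) instead of re-importing `boto3` in every module. A minimal sketch of that guarded-import pattern, not the module's actual source:

```python
# Sketch of a guarded optional import, equivalent in spirit to the removed block;
# the real implementation lives in great_expectations.compatibility.aws.
import logging

logger = logging.getLogger(__name__)

try:
    import boto3
except ImportError:
    logger.debug("Unable to load boto3; install optional boto3 dependency for support.")
    boto3 = None
```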
69 changes: 69 additions & 0 deletions tests/integration/docusaurus/connecting_to_your_data/fluent_datasources/how_to_connect_to_data_on_s3_using_pandas.py
@@ -0,0 +1,69 @@
"""
To run this code as a local test, use the following console command:
```
pytest -v --docs-tests -m integration -k "how_to_connect_to_data_on_s3_using_pandas" tests/integration/test_script_runner.py
```
"""

# Python
# <snippet name="tests/integration/docusaurus/connecting_to_your_data/fluent_datasources/how_to_connect_to_data_on_s3_using_pandas.py get_context">
import great_expectations as gx

context = gx.get_context()
# </snippet>

# Python
# <snippet name="tests/integration/docusaurus/connecting_to_your_data/fluent_datasources/how_to_connect_to_data_on_s3_using_pandas.py define_add_pandas_s3_args">
datasource_name = "my_s3_datasource"
bucket_name = "my_bucket"
boto3_options = {}
# </snippet>

bucket_name = "superconductive-docs-test"

# Python
# <snippet name="tests/integration/docusaurus/connecting_to_your_data/fluent_datasources/how_to_connect_to_data_on_s3_using_pandas.py create_datasource">
datasource = context.sources.add_pandas_s3(
    name=datasource_name, bucket=bucket_name, boto3_options=boto3_options
)
# </snippet>

assert datasource_name in context.datasources

# Python
# <snippet name="tests/integration/docusaurus/connecting_to_your_data/fluent_datasources/how_to_connect_to_data_on_s3_using_pandas.py add_asset">
asset_name = "my_taxi_data_asset"
s3_prefix = "data/taxi_yellow_tripdata_samples/"
batching_regex = r"yellow_tripdata_sample_(?P<year>\d{4})-(?P<month>\d{2}).csv"
data_asset = datasource.add_csv_asset(
    name=asset_name, batching_regex=batching_regex, s3_prefix=s3_prefix
)
# </snippet>

assert data_asset

assert datasource.get_asset_names() == {"my_taxi_data_asset"}

my_batch_request = data_asset.build_batch_request({"year": "2019", "month": "03"})
batches = data_asset.get_batch_list_from_batch_request(my_batch_request)
assert len(batches) == 1
assert set(batches[0].columns()) == {
"vendor_id",
"pickup_datetime",
"dropoff_datetime",
"passenger_count",
"trip_distance",
"rate_code_id",
"store_and_fwd_flag",
"pickup_location_id",
"dropoff_location_id",
"payment_type",
"fare_amount",
"extra",
"mta_tax",
"tip_amount",
"tolls_amount",
"improvement_surcharge",
"total_amount",
"congestion_surcharge",
}
6 changes: 6 additions & 0 deletions tests/integration/test_script_runner.py
@@ -359,6 +359,12 @@
        user_flow_script="tests/integration/docusaurus/connecting_to_your_data/fluent_datasources/how_to_quickly_connect_to_a_single_file_with_pandas.py",
        data_context_dir="tests/integration/fixtures/no_datasources/great_expectations",
    ),
+    IntegrationTestFixture(
+        name="how_to_connect_to_data_on_s3_using_pandas",
+        user_flow_script="tests/integration/docusaurus/connecting_to_your_data/fluent_datasources/how_to_connect_to_data_on_s3_using_pandas.py",
+        data_context_dir="tests/integration/fixtures/no_datasources/great_expectations",
+        backend_dependencies=[BackendDependencies.AWS],
+    ),
]
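
The `name` of the registered `IntegrationTestFixture` is what the `-k` expression in the new script's docstring matches against, so the snippet test can be run locally with the same command the script documents:

```
pytest -v --docs-tests -m integration -k "how_to_connect_to_data_on_s3_using_pandas" tests/integration/test_script_runner.py
```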

