In [None]:
! pip install great_expectations

In [None]:
! great_expectations --version

In [None]:
import os

In [None]:
# <ALEX_TEST_NEEDED_FOR_SPARK>

In [None]:
import pandas as pd

In [None]:
import findspark

# findspark.init() has to run BEFORE anything is imported from pyspark: it is
# the call that makes the pyspark package importable when Spark is not
# installed into the active Python environment.
findspark.init()

from pyspark.sql import SparkSession  # noqa: E402  (must follow findspark.init())

In [None]:
# Spark session used by the Spark examples in this notebook.
# getOrCreate() reuses an already-running session if there is one, so
# re-running this cell does not start a second session.
spark: SparkSession = SparkSession.builder.appName(
    "Python Spark SQL ZEP SparkS3Datasource Example"
).getOrCreate()

In [None]:
# </ALEX_TEST_NEEDED_FOR_SPARK>




# Filesystem data


## How to connect to one or more files using Pandas

### Dependencies
- An installation of GX
- Source data (csv, excel, etc) in a local filesystem

In [None]:
### Import GX and instantiate a Data Context
import great_expectations as gx

context = gx.get_context()

In [None]:
### Connect to the folder containing your Data
# path_to_folder_containing_csv_files="https://raw.githubusercontent.com/great_expectations/"
path_to_folder_containing_csv_files = "../taxi_data"
datasource_name = "MyNewDatasource"
datasource = context.data_sources.add_pandas_filesystem(
    name=datasource_name, base_directory=path_to_folder_containing_csv_files
)

In [None]:
### Add specific files to the Datasource as individual Data Assets
# The named groups `year` and `month` become the options that can be passed to
# build_batch_request(). The dot before "csv" is escaped so that it matches a
# literal "." rather than any character.
batching_regex = r"yellow_tripdata_sample_(?P<year>\d{4})-(?P<month>\d{2})\.csv"
data_asset = datasource.add_csv_asset(
    name="MyTaxiDataAsset", batching_regex=batching_regex
)

In [None]:
# <ALEX_TEST_DEBUG>

In [None]:
# data_asset._data_connector.get_data_references()

In [None]:
data_asset._data_connector.get_data_reference_count()

In [None]:
# data_asset._data_connector.get_matched_data_references()

In [None]:
data_asset._data_connector.get_matched_data_reference_count()

In [None]:
# data_asset._data_connector.get_unmatched_data_references()

In [None]:
data_asset._data_connector.get_unmatched_data_reference_count()

In [None]:
batch_request = data_asset.build_batch_request(
    options={
        "year": "2018",
    }
)
batch_request

In [None]:
batch_list = data_asset.get_batch_list_from_batch_request(batch_request=batch_request)

In [None]:
len(batch_list)

In [None]:
with pd.option_context(
    "display.max_rows",
    10,
    "display.max_columns",
    None,
    "display.precision",
    3,
):
    display(batch_list[0].data.dataframe)

In [None]:
# </ALEX_TEST_DEBUG>


## How to connect to one or more files using Spark

### Dependencies
- An installation of GX
- Source data (csv, excel, etc) in a local filesystem

In [None]:
### Import GX and instantiate a Data Context
import great_expectations as gx

context = gx.get_context()

In [None]:
### Connect to the folder containing your Data
# path_to_folder_containing_csv_files="https://raw.githubusercontent.com/great_expectations/"
path_to_folder_containing_csv_files = "../taxi_data"
datasource_name = "MyNewDatasource"
datasource = context.data_sources.add_spark_filesystem(
    name=datasource_name, base_directory=path_to_folder_containing_csv_files
)

In [None]:
### Add specific files as individual Data Assets
# The named groups `year` and `month` become the options that can be passed to
# build_batch_request(). The dot before "csv" is escaped so that it matches a
# literal "." rather than any character.
batching_regex = r"yellow_tripdata_sample_(?P<year>\d{4})-(?P<month>\d{2})\.csv"
data_asset = datasource.add_csv_asset(
    name="MyTaxiDataAsset",
    batching_regex=batching_regex,
    header=True,
    infer_schema=True,
)

In [None]:
# <ALEX_TEST_DEBUG>

In [None]:
# data_asset._data_connector.get_data_references()

In [None]:
data_asset._data_connector.get_data_reference_count()

In [None]:
# data_asset._data_connector.get_matched_data_references()

In [None]:
data_asset._data_connector.get_matched_data_reference_count()

In [None]:
# data_asset._data_connector.get_unmatched_data_references()

In [None]:
data_asset._data_connector.get_unmatched_data_reference_count()

In [None]:
batch_request = data_asset.build_batch_request(
    options={
        "year": "2018",
    }
)
batch_request

In [None]:
batch_list = data_asset.get_batch_list_from_batch_request(batch_request=batch_request)

In [None]:
len(batch_list)

In [None]:
with pd.option_context(
    "display.max_rows",
    10,
    "display.max_columns",
    None,
    "display.precision",
    3,
):
    display(batch_list[0].data.dataframe.toPandas())

In [None]:
# </ALEX_TEST_DEBUG>




# In-memory data


## How to connect to in-memory data using Pandas

### Dependencies
- An installation of GX

In [None]:
### Import GX and instantiate a Data Context
import great_expectations as gx

context = gx.get_context()

In [None]:
### Sample data for our example:
# The module name is lowercase: `import Pandas` raises ModuleNotFoundError.
import pandas as pd

# Small two-column frame used as the in-memory source data below.
df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})

In [None]:
### Read your dataframe into a Datasource
# Datasources are registered through `context.data_sources`, the same factory
# used by every other example in this notebook.
# NOTE(review): confirm that `add_pandas_dataframe` is available in the
# installed GX version.
datasource_name = "MyNewDatasource"
datasource = context.data_sources.add_pandas_dataframe(name=datasource_name)

In [None]:
### Read the Dataframe as a Data Asset
data_asset = datasource.read_dataframe(
    dataframe=df, batch_identifiers=["default_identifier_name"]
)


## How to connect to in-memory data using Spark

### Dependencies
- An installation of GX

In [None]:
### Import GX and instantiate a Data Context
import great_expectations as gx

context = gx.get_context()

In [None]:
### Sample data for our example:
# Three records with the keys "a", "b" and "c".
# NOTE(review): `df` is a plain Python list of dicts, not a Spark DataFrame.
# Confirm whether it should first be converted (e.g. with
# spark.createDataFrame(...)) before being handed to the Spark datasource.
df = [
    {"a": 1, "b": 2, "c": 3},
    {"a": 4, "b": 5, "c": 6},
    {"a": 7, "b": 8, "c": 9},
]

In [None]:
### Read your dataframe into a Datasource
# Datasources are registered through `context.data_sources`, the same factory
# used by every other example in this notebook.
# NOTE(review): confirm that `add_spark_dataframe` is available in the
# installed GX version.
datasource_name = "MyNewDatasource"
datasource = context.data_sources.add_spark_dataframe(name=datasource_name)

In [None]:
### Read the Dataframe as a Data Asset
data_asset = datasource.read_dataframe(
    dataframe=df, batch_identifiers=["default_identifier_name"]
)




# Cloud data


## How to connect to data on Azure Blob Storage using Pandas

### Prerequisites
- An installation of GX with ABS dependencies
- Previously initialized a Data Context
- ABS credentials configured
- Source data (csv, excel, etc) in ABS

In [None]:
# Are there any dependencies for this?

In [None]:
### Import GX and instantiate a Data Context
import great_expectations as gx

context = gx.get_context()

In [None]:
### Create Datasource
datasource_name = "MyDatasource"
datasource = context.data_sources.add_pandas_abs(
    name=datasource_name,
    azure_options={
        "account_url": "superconductivetesting.blob.core.windows.net",
        "credential": os.environ["AZURE_CREDENTIAL"],
    },
)

In [None]:
### Add specific files to the Datasource as individual Data Assets
batching_regex = r"data/taxi_yellow_tripdata_samples/yellow_tripdata_sample_(?P<year>\d{4})-(?P<month>\d{2})\.csv"
container = "superconductive-public"
name_starts_with = "data/taxi_yellow_tripdata_samples/"
data_asset = datasource.add_csv_asset(
    name="MyTaxiDataAsset",
    batching_regex=batching_regex,
    container=container,
    name_starts_with=name_starts_with,
)

In [None]:
# <ALEX_TEST_DEBUG>

In [None]:
# data_asset._data_connector.get_data_references()

In [None]:
data_asset._data_connector.get_data_reference_count()

In [None]:
# data_asset._data_connector.get_matched_data_references()

In [None]:
data_asset._data_connector.get_matched_data_reference_count()

In [None]:
# data_asset._data_connector.get_unmatched_data_references()

In [None]:
data_asset._data_connector.get_unmatched_data_reference_count()

In [None]:
batch_request = data_asset.build_batch_request(
    options={
        "month": "03",
    }
)
batch_request

In [None]:
batch_list = data_asset.get_batch_list_from_batch_request(batch_request=batch_request)

In [None]:
len(batch_list)

In [None]:
with pd.option_context(
    "display.max_rows",
    10,
    "display.max_columns",
    None,
    "display.precision",
    3,
):
    display(batch_list[0].data.dataframe)

In [None]:
# </ALEX_TEST_DEBUG>


## How to connect to data on Azure Blob Storage using Spark

### Prerequisites
- An installation of GX with ABS dependencies
- Previously initialized a Data Context
- ABS credentials configured
- Source data (csv, excel, etc) in ABS

In [None]:
# Are there any dependencies for this?

In [None]:
### Import GX and instantiate a Data Context
import great_expectations as gx

context = gx.get_context()

In [None]:
### Create Datasource
datasource_name = "MyDatasource"
datasource = context.data_sources.add_spark_abs(
    name=datasource_name,
    azure_options={
        "account_url": "superconductivetesting.blob.core.windows.net",
        # "credential": os.environ['AZURE_CREDENTIAL']
    },
)

In [None]:
### Add specific files to the Datasource as individual Data Assets
batching_regex = r"data/taxi_yellow_tripdata_samples/yellow_tripdata_sample_(?P<year>\d{4})-(?P<month>\d{2})\.csv"
container = "superconductive-public"
name_starts_with = "data/taxi_yellow_tripdata_samples/"
data_asset = datasource.add_csv_asset(
    name="MyTaxiDataAsset",
    batching_regex=batching_regex,
    container=container,
    header=True,
    infer_schema=True,
    name_starts_with=name_starts_with,
)

In [None]:
# <ALEX_TEST_DEBUG>

In [None]:
# data_asset._data_connector.get_data_references()

In [None]:
data_asset._data_connector.get_data_reference_count()

In [None]:
# data_asset._data_connector.get_matched_data_references()

In [None]:
data_asset._data_connector.get_matched_data_reference_count()

In [None]:
# data_asset._data_connector.get_unmatched_data_references()

In [None]:
data_asset._data_connector.get_unmatched_data_reference_count()

In [None]:
batch_request = data_asset.build_batch_request(
    options={
        "month": "03",
    }
)
batch_request

In [None]:
batch_list = data_asset.get_batch_list_from_batch_request(batch_request=batch_request)

In [None]:
len(batch_list)

In [None]:
with pd.option_context(
    "display.max_rows",
    10,
    "display.max_columns",
    None,
    "display.precision",
    3,
):
    display(batch_list[0].data.dataframe.toPandas())

In [None]:
# </ALEX_TEST_DEBUG>


## How to connect to data on S3 using Pandas

### Prerequisites
- An installation of GX with AWS S3 dependencies
- Previously initialized a Data Context
- AWS credentials configured
- Source data (csv, excel, etc) in an S3 bucket

In [None]:
# Install GX with S3 dependencies
! pip install great_expectations[s3] # Installs Boto3 dependency
! great_expectations --version

# Configure credentials
## Prerequisite: Install the AWS CLI
### Confirm AWS CLI is installed; this will be used to configure AWS credentials.
! aws --version
### Confirm AWS credentials are configured properly
! aws sts get-caller-identity

In [None]:
### Import GX and instantiate a Data Context
import great_expectations as gx

context = gx.get_context()

In [None]:
### Create Datasource
datasource_name = "MyS3Datasource"
# Name of the S3 bucket to read from; replace with your own bucket.
bucket_name = "alex-test-0"
# Optional boto3 settings. Left empty here; uncomment and fill in as needed.
boto3_options = {
    # "endpoint_url": "${S3_ENDPOINT}", # Uses the S3_ENDPOINT environment variable to determine which endpoint to use.
    # "region_name": "<your_aws_region_name>"
}
datasource = context.data_sources.add_pandas_s3(
    name=datasource_name, bucket=bucket_name, boto3_options=boto3_options
)

In [None]:
### Add specific files to the Datasource as individual Data Assets
batching_regex = r"(?P<name>.+)_(?P<timestamp>.+)_(?P<price>\d{4})\.csv"
prefix = "test_ci"
data_asset = datasource.add_csv_asset(
    name="MyTaxiDataAsset", batching_regex=batching_regex, prefix=prefix
)

In [None]:
# <ALEX_TEST_DEBUG>

In [None]:
# data_asset._data_connector.get_data_references()

In [None]:
data_asset._data_connector.get_data_reference_count()

In [None]:
# data_asset._data_connector.get_matched_data_references()

In [None]:
data_asset._data_connector.get_matched_data_reference_count()

In [None]:
# data_asset._data_connector.get_unmatched_data_references()

In [None]:
data_asset._data_connector.get_unmatched_data_reference_count()

In [None]:
batch_request = data_asset.build_batch_request(
    options={
        "price": "1313",
    }
)
batch_request

In [None]:
batch_list = data_asset.get_batch_list_from_batch_request(batch_request=batch_request)

In [None]:
len(batch_list)

In [None]:
with pd.option_context(
    "display.max_rows",
    10,
    "display.max_columns",
    None,
    "display.precision",
    3,
):
    display(batch_list[0].data.dataframe)

In [None]:
# </ALEX_TEST_DEBUG>


## How to connect to data on S3 using Spark

### Prerequisites
- An installation of GX with AWS S3 dependencies
- Previously initialized a Data Context
- AWS credentials configured
- Source data (csv, excel, etc) in an S3 bucket

In [None]:
# Install GX with S3 dependencies
! pip install great_expectations[s3] # Installs Boto3 dependency
! great_expectations --version

# Configure credentials
## Prerequisite: Install the AWS CLI
### Confirm AWS CLI is installed; this will be used to configure AWS credentials.
! aws --version
### Confirm AWS credentials are configured properly
! aws sts get-caller-identity

In [None]:
### Import GX and instantiate a Data Context
import great_expectations as gx

context = gx.get_context()

In [None]:
### Create Datasource
datasource_name = "MyS3Datasource"
bucket_name = "alex-test-0"
boto3_options = {
    # "endpoint_url": "${S3_ENDPOINT}", # Uses the S3_ENDPOINT environment variable to determine which endpoint to use.
    # "region_name": "<your_aws_region_name>"
}
datasource = context.data_sources.add_spark_s3(
    name=datasource_name, bucket=bucket_name, boto3_options=boto3_options
)

In [None]:
### Add specific files to the Datasource as individual Data Assets
batching_regex = r"(?P<name>.+)_(?P<timestamp>.+)_(?P<price>\d{4})\.csv"
prefix = "test_ci"
data_asset = datasource.add_csv_asset(
    name="MyTaxiDataAsset",
    batching_regex=batching_regex,
    header=True,
    infer_schema=True,
    prefix=prefix,
)

In [None]:
# <ALEX_TEST_DEBUG>

In [None]:
# data_asset._data_connector.get_data_references()

In [None]:
data_asset._data_connector.get_data_reference_count()

In [None]:
# data_asset._data_connector.get_matched_data_references()

In [None]:
data_asset._data_connector.get_matched_data_reference_count()

In [None]:
# data_asset._data_connector.get_unmatched_data_references()

In [None]:
data_asset._data_connector.get_unmatched_data_reference_count()

In [None]:
batch_request = data_asset.build_batch_request(
    options={
        "price": "1313",
    }
)
batch_request

In [None]:
batch_list = data_asset.get_batch_list_from_batch_request(batch_request=batch_request)

In [None]:
len(batch_list)

In [None]:
with pd.option_context(
    "display.max_rows",
    10,
    "display.max_columns",
    None,
    "display.precision",
    3,
):
    display(batch_list[0].data.dataframe.toPandas())

In [None]:
# </ALEX_TEST_DEBUG>


## How to connect to data on GCS using Pandas

### Prerequisites
- An installation of GX with GCS dependencies
- Previously initialized a Data Context
- GCS credentials configured
- Source data (csv, excel, etc) in a GCP bucket

### Configure credentials
Great Expectations provides three options for configuring your GCS credentials:
- Use the `gcloud` command line tool and the `GOOGLE_APPLICATION_CREDENTIALS` environment variable
  - This is the default option and the one used throughout this guide
- Pass a filepath as the value of the optional `credentials` parameter when you create your GCS Datasource
  - This argument should contain the specific filepath that leads to your credentials `.json` file
  - This method utilizes `google.oauth2.service_account.Credentials.from_service_account_file` under the hood
- Pass a JSON string as the value of the optional `credentials` parameter when you create your GCS Datasource
  - This string should contain the actual JSON data from your credentials file
  - This method utilizes `google.oauth2.service_account.Credentials.from_service_account_info` under the hood

In [None]:
### Import GX and instantiate a Data Context
import great_expectations as gx

context = gx.get_context()

In [None]:
### Create Datasource
# Name of the GCS bucket that holds the sample files.
bucket_name = "test_docs_data"
# Empty options: no explicit credentials or client settings are passed here.
gcs_options = {}
datasource = context.data_sources.add_pandas_gcs(
    name="MyGcsDatasource", bucket_or_name=bucket_name, gcs_options=gcs_options
)

In [None]:
### Add GCS data to the Datasource as a Data Asset
batching_regex = r"data/taxi_yellow_tripdata_samples/yellow_tripdata_sample_(?P<year>\d{4})-(?P<month>\d{2})\.csv"
prefix = "data/taxi_yellow_tripdata_samples/"
data_asset = datasource.add_csv_asset(
    name="MyTaxiDataAsset", batching_regex=batching_regex, prefix=prefix
)

In [None]:
# <ALEX_TEST_DEBUG>

In [None]:
# data_asset._data_connector.get_data_references()

In [None]:
data_asset._data_connector.get_data_reference_count()

In [None]:
# data_asset._data_connector.get_matched_data_references()

In [None]:
data_asset._data_connector.get_matched_data_reference_count()

In [None]:
# data_asset._data_connector.get_unmatched_data_references()

In [None]:
data_asset._data_connector.get_unmatched_data_reference_count()

In [None]:
batch_request = data_asset.build_batch_request(
    options={
        "month": "03",
    }
)
batch_request

In [None]:
batch_list = data_asset.get_batch_list_from_batch_request(batch_request=batch_request)

In [None]:
len(batch_list)

In [None]:
with pd.option_context(
    "display.max_rows",
    10,
    "display.max_columns",
    None,
    "display.precision",
    3,
):
    display(batch_list[0].data.dataframe)

In [None]:
# </ALEX_TEST_DEBUG>


## How to connect to data on GCS using Spark

### Prerequisites
- An installation of GX with GCS dependencies
- Previously initialized a Data Context
- GCS credentials configured
- Source data (csv, excel, etc) in a GCP bucket

### Configure credentials
Great Expectations provides three options for configuring your GCS credentials:
- Use the `gcloud` command line tool and the `GOOGLE_APPLICATION_CREDENTIALS` environment variable
  - This is the default option and the one used throughout this guide
- Pass a filepath as the value of the optional `credentials` parameter when you create your GCS Datasource
  - This argument should contain the specific filepath that leads to your credentials `.json` file
  - This method utilizes `google.oauth2.service_account.Credentials.from_service_account_file` under the hood
- Pass a JSON string as the value of the optional `credentials` parameter when you create your GCS Datasource
  - This string should contain the actual JSON data from your credentials file
  - This method utilizes `google.oauth2.service_account.Credentials.from_service_account_info` under the hood

In [None]:
# </ALEX_TEST_INSTRUCTIONS_UPDATED>

In [None]:
### Import GX and instantiate a Data Context
import great_expectations as gx

context = gx.get_context()

In [None]:
### Create Datasource
bucket_name = "test_docs_data"
gcs_options = {}
datasource = context.data_sources.add_spark_gcs(
    name="MyGcsDatasource", bucket_or_name=bucket_name, gcs_options=gcs_options
)

In [None]:
### Add GCS data to the Datasource as a Data Asset
batching_regex = r"data/taxi_yellow_tripdata_samples/yellow_tripdata_sample_(?P<year>\d{4})-(?P<month>\d{2})\.csv"
prefix = "data/taxi_yellow_tripdata_samples/"
data_asset = datasource.add_csv_asset(
    name="MyTaxiDataAsset",
    batching_regex=batching_regex,
    header=True,
    infer_schema=True,
    prefix=prefix,
)

In [None]:
# <ALEX_TEST_DEBUG>

In [None]:
# data_asset._data_connector.get_data_references()

In [None]:
data_asset._data_connector.get_data_reference_count()

In [None]:
# data_asset._data_connector.get_matched_data_references()

In [None]:
data_asset._data_connector.get_matched_data_reference_count()

In [None]:
# data_asset._data_connector.get_unmatched_data_references()

In [None]:
data_asset._data_connector.get_unmatched_data_reference_count()

In [None]:
batch_request = data_asset.build_batch_request(
    options={
        "month": "03",
    }
)
batch_request

In [None]:
batch_list = data_asset.get_batch_list_from_batch_request(batch_request=batch_request)

In [None]:
len(batch_list)

In [None]:
with pd.option_context(
    "display.max_rows",
    10,
    "display.max_columns",
    None,
    "display.precision",
    3,
):
    display(batch_list[0].data.dataframe.toPandas())

In [None]:
# </ALEX_TEST_DEBUG>




# SQL Data


## How to connect to a SQL table

### Prerequisites
- GX installed with SQL Dependencies
- Source data in a SQL Database
- Previously initialized Data Context
- SQL Credentials configured


In [None]:
### Import GX and instantiate a Data Context
import great_expectations as gx

context = gx.get_context()

You can use either environment variables or a key in `config_variables.yml` to safely store any passwords needed by your connection string.  After defining your password in one of those ways, you can reference it in your connection string like this:

In [None]:
### Define your connection string
connection_string = "postgresql+psycopg2://username:${MY_PASSWORD}@localhost/test"

In the above example `MY_PASSWORD` would be the name of the environment variable or the key to the value in `config_variables.yml` that corresponds to your credentials.

If you include a password as plain text in your connection string when you define your Datasource, GX will automatically strip it out, add it to `config_variables.yml` and substitute it with a variable as was shown above.

For purposes of this guide's examples, we will store our connection string in the variable `sql_connection_string` with plain text credentials:

In [None]:
### Define your connection string, example
sql_connection_string = "postgresql+psycopg2://username:my_password@localhost/test"

In [None]:
### Connect to the SQL database
datasource = context.data_sources.add_sql(
    name="my_datasource", connection_string=sql_connection_string
)

In [None]:
### Add a table to the Datasource as a Data Asset
table_asset = datasource.add_table_asset(
    name="my_asset", table_name="yellow_tripdata_sample"
)


## How to connect to SQL data using a query

### Prerequisites
- GX installed with SQL Dependencies (Reference below for specific Database requirements)
- Source data in a SQL Database
- Previously initialized Data Context
- Credentials configured

In [None]:
### Import GX and instantiate a Data Context
import great_expectations as gx

context = gx.get_context()

You can use either environment variables or a key in `config_variables.yml` to safely store any passwords needed by your connection string.  After defining your password in one of those ways, you can reference it in your connection string like this:

In [None]:
### Define your connection string
connection_string = "postgresql+psycopg2://username:${MY_PASSWORD}@localhost/test"

In the above example `MY_PASSWORD` would be the name of the environment variable or the key to the value in `config_variables.yml` that corresponds to your credentials.

If you include a password as plain text in your connection string when you define your Datasource, GX will automatically strip it out, add it to `config_variables.yml` and substitute it with a variable as was shown above.

For purposes of this guide's examples, we will store our connection string in the variable `sql_connection_string` with plain text credentials:

In [None]:
### Define your connection string, example
sql_connection_string = "postgresql+psycopg2://username:my_password@localhost/test"

In [None]:
### Connect to the SQL database
datasource = context.data_sources.add_sql(
    name="my_datasource", connection_string=sql_connection_string
)

In [None]:
### Add an SQL query to the Datasource as a Data Asset
query_asset = datasource.add_query_asset(
    name="my_asset", query="SELECT * from yellow_tripdata_sample"
)




# SQL Data


## How to connect to a SQL table

### Prerequisites
- GX installed with SQL Dependencies
- Source data in a SQL Database
- Previously initialized Data Context
- SQL Credentials configured


In [None]:
### Import GX and instantiate a Data Context
import great_expectations as gx

context = gx.get_context()

You can use either environment variables or a key in `config_variables.yml` to safely store any passwords needed by your connection string.  After defining your password in one of those ways, you can reference it in your connection string like this:

In [None]:
### Define your connection string
connection_string = "postgresql+psycopg2://username:${MY_PASSWORD}@localhost/test"

In the above example `MY_PASSWORD` would be the name of the environment variable or the key to the value in `config_variables.yml` that corresponds to your credentials.

If you include a password as plain text in your connection string when you define your Datasource, GX will automatically strip it out, add it to `config_variables.yml` and substitute it with a variable as was shown above.

For purposes of this guide's examples, we will store our connection string in the variable `sql_connection_string` with plain text credentials:

In [None]:
### Define your connection string, example
sql_connection_string = "postgresql+psycopg2://username:my_password@localhost/test"

In [None]:
### Connect to any SQL database
# Generic SQL Datasource built from the connection string. The
# PostgreSQL-specific variant (`add_postgres`) is shown in the next cell.
datasource = context.data_sources.add_sql(
    name="my_datasource", connection_string=sql_connection_string
)

In [None]:
### Alternatively: Connect specifically to a PostgreSQL database
# Use this cell instead of (not in addition to) the previous one: both register
# a Datasource under the same name, "my_datasource".
datasource = context.data_sources.add_postgres(
    name="my_datasource", connection_string=sql_connection_string
)

In [None]:
### Add a table to the Datasource as a Data Asset
table_asset = datasource.add_table_asset(
    name="my_asset", table_name="yellow_tripdata_sample", schema_name="public"
)

In [None]:
# <ALEX_TEST_DEBUG>

In [None]:
batch_request = table_asset.build_batch_request(options={})
batch_request

In [None]:
batch_list = table_asset.get_batch_list_from_batch_request(batch_request=batch_request)

In [None]:
len(batch_list)

In [None]:
batch_list[0]

In [None]:
batch_list[0].batch_spec

In [None]:
batch_list[0].data

In [None]:
batch_list[0].data.selectable

In [None]:
batch_list[0].head()


## How to connect to SQL data using a query

### Prerequisites
- GX installed with SQL Dependencies (Reference below for specific Database requirements)
- Source data in a SQL Database
- Previously initialized Data Context
- Credentials configured

In [None]:
### Import GX and instantiate a Data Context
import great_expectations as gx

context = gx.get_context()

You can use either environment variables or a key in `config_variables.yml` to safely store any passwords needed by your connection string.  After defining your password in one of those ways, you can reference it in your connection string like this:

In [None]:
### Define your connection string
connection_string = "postgresql+psycopg2://username:${MY_PASSWORD}@localhost/test"

In the above example `MY_PASSWORD` would be the name of the environment variable or the key to the value in `config_variables.yml` that corresponds to your credentials.

If you include a password as plain text in your connection string when you define your Datasource, GX will automatically strip it out, add it to `config_variables.yml` and substitute it with a variable as was shown above.

For purposes of this guide's examples, we will store our connection string in the variable `sql_connection_string` with plain text credentials:

In [None]:
### Define your connection string, example
sql_connection_string = "postgresql+psycopg2://username:my_password@localhost/test"

In [None]:
### Connect to any SQL database
# Generic SQL Datasource built from the connection string. The
# PostgreSQL-specific variant (`add_postgres`) is shown in the next cell.
datasource = context.data_sources.add_sql(
    name="my_datasource", connection_string=sql_connection_string
)

In [None]:
### Alternatively: Connect specifically to a PostgreSQL database
# Use this cell instead of (not in addition to) the previous one: both register
# a Datasource under the same name, "my_datasource".
datasource = context.data_sources.add_postgres(
    name="my_datasource", connection_string=sql_connection_string
)

In [None]:
### Add an SQL query to the Datasource as a Data Asset
query_asset = datasource.add_query_asset(
    name="my_asset", query="SELECT * FROM yellow_tripdata_sample"
)

In [None]:
# <ALEX_TEST_DEBUG>

In [None]:
batch_request = query_asset.build_batch_request(options={})
batch_request

In [None]:
batch_list = query_asset.get_batch_list_from_batch_request(batch_request=batch_request)

In [None]:
len(batch_list)

In [None]:
batch_list[0]

In [None]:
batch_list[0].batch_spec

In [None]:
batch_list[0].data

In [None]:
batch_list[0].data.selectable

In [None]:
batch_list[0].head()

In [None]:
# </ALEX_TEST_DEBUG>

### PostgreSQL

In [None]:
## Install GX with psycopg2 and sqlalchemy dependencies
!pip install great_expectations[postgresql]

### Sqlite

In [None]:
!pip install great_expectations[sqlalchemy]

### Athena

In [None]:
!pip install great_expectations[athena]

### Redshift

In [None]:
!pip install great_expectations[redshift]

### BigQuery

In [None]:
!pip install great_expectations[bigquery]

### MSSQL

In [None]:
!pip install great_expectations[mssql]

### Snowflake

In [None]:
!pip install great_expectations[snowflake]

### Trino

In [None]:
!pip install great_expectations[trino]

# Data Assets

## How to organize multiple files as Batches in a non-SQL Data Asset

### Prerequisites
- An installation of GX
- Previously initialized a Data Context
- Configured a Datasource for filesystem style source data (csv, excel, etc)

In [None]:
### Import GX and instantiate a Data Context
import great_expectations as gx

context = gx.get_context()

In [None]:
### get your filesystem datasource
name_of_my_existing_datasource = "MyConfiguredDatasource"
datasource = context.data_sources.get(name_of_my_existing_datasource)

In [None]:
### Add all files matching a regex to a Datasource as a single Data Asset grouped by year and month
# The named groups `year` and `month` define how the matched files are split
# into Batches. The dot before "csv" is escaped so that it matches a literal
# "." rather than any character.
all_csv_files_as_month_year_regex = (
    r"yellow_tripdata_sample_(?P<year>\d{4})-(?P<month>\d{2})\.csv"
)
# The asset name is passed as `name=`, matching every other add_csv_asset()
# call in this notebook.
data_asset = datasource.add_csv_asset(
    name="MyTaxiDataAsset", batching_regex=all_csv_files_as_month_year_regex
)

## How to organize a SQL Data Asset into multiple Batches

### Prerequisites
- An installation of GX with necessary dependencies
- Credentials set up for the SQL source data
- Previously initialized a Data Context
- Configured a Datasource for a SQL Database

In [None]:
### Add a splitter to the table Data Asset
# `table_asset` is not created in this section; it is the table Data Asset
# built earlier in this notebook with `datasource.add_table_asset(...)`.
# NOTE(review): presumably this makes Batches addressable by the year and month
# of `pickup_datetime` — confirm against the GX splitter documentation.
table_asset.add_year_and_month_splitter(column_name="pickup_datetime")

:::tip Splitters and Batch Identifiers

When requesting data from a table Data Asset you can use the command `table_asset.batch_parameters_template()` to see how to specify your Batch Request.  This will include the Batch Identifier keys that your splitter has added to your table Data Asset.

::: 

## How to request a Batch of Data from a configured Data Asset

## How to request multiple Batches of Data from a configured Data Asset

## How to configure a non-SQL Data Asset to provide a sampling of its full data

## How to configure a SQL Data Asset to provide a sampling of its full data