In [11]:
from great_expectations.data_context.types.base import DataContextConfig, DatasourceConfig, FilesystemStoreBackendDefaults

# Build the Data Context configuration in code: one datasource with a
# configured-asset filesystem data connector, plus filesystem-backed stores.
data_context_config = DataContextConfig(
    datasources={
        # NOTE(review): this datasource is keyed "pandas" although its
        # execution engine is SparkDFExecutionEngine — confirm the name is intended.
        "pandas": DatasourceConfig(
            class_name="Datasource",
            execution_engine={
                "class_name": "SparkDFExecutionEngine"
            },
            data_connectors={
                "tripdata_monthly_configured": {
                    "class_name": "ConfiguredAssetFilesystemDataConnector",
                    # NOTE(review): this path starts at /work/..., while the
                    # store root_directory below starts at /home/jovyan/work/...
                    # — confirm both locations exist in the runtime environment.
                    "base_directory": "/work/greatexpectations/data_context_config/gx_tutorials/data",
                    "assets": {
                        "yellow": {
                            # The pattern has two capture groups and group_names has
                            # two entries (presumably matched positionally — confirm).
                            "pattern": r"yellow_tripdata_(\d{4})-(\d{2})\.csv$",
                            "group_names": ["year", "month"],
                        }
                    },
                }
            },
        )
    },
    # Store defaults are rooted at the "armazem" directory.
    store_backend_defaults=FilesystemStoreBackendDefaults(root_directory="/home/jovyan/work/greatexpectations/data_context_config/armazem"),
)

In [12]:
import great_expectations as gx

# Obtain a Data Context, passing the in-code `data_context_config` object
# as the project configuration.
context = gx.get_context(project_config=data_context_config)

In [13]:
from ruamel import yaml

import great_expectations as gx
from great_expectations.core.batch import RuntimeBatchRequest
from great_expectations.data_context.types.base import (
    DataContextConfig,
    InMemoryStoreBackendDefaults,
)
from great_expectations.util import get_context

In [14]:
# Datasource definition for validating in-memory Spark DataFrames: a Spark
# execution engine plus a runtime data connector keyed by a single
# "batch_id" batch identifier.
datasource_config = dict(
    name="my_spark_dataframe",
    class_name="Datasource",
    execution_engine=dict(class_name="SparkDFExecutionEngine"),
    data_connectors=dict(
        default_runtime_data_connector_name=dict(
            class_name="RuntimeDataConnector",
            batch_identifiers=["batch_id"],
        ),
    ),
)

In [15]:
# Serialize the datasource dict to YAML and ask the context to try
# instantiating it (a dry run; the datasource is registered in a later cell).
# NOTE(review): `yaml` is the `ruamel.yaml` module imported above; its
# module-level `dump` is deprecated in newer ruamel.yaml releases — confirm
# against the installed version.
context.test_yaml_config(yaml.dump(datasource_config))


Attempting to instantiate class from config...
	Instantiating as a Datasource, since class_name is Datasource
	Successfully instantiated Datasource


ExecutionEngine class name: SparkDFExecutionEngine
Data Connectors:
	default_runtime_data_connector_name:RuntimeDataConnector

	Available data_asset_names (0 of 0):
		Note : RuntimeDataConnector will not have data_asset_names until they are passed in through RuntimeBatchRequest

	Unmatched data_references (0 of 0): []



<great_expectations.datasource.new_datasource.Datasource at 0x7fdcd08d1e70>

In [16]:
# Register the datasource on the context, expanding the config dict into
# keyword arguments.
context.add_datasource(**datasource_config)

<great_expectations.datasource.new_datasource.Datasource at 0x7fdcd08d2bf0>

In [19]:
batch_request = RuntimeBatchRequest(
    datasource_name="my_spark_dataframe",
    data_connector_name="default_runtime_data_connector_name",
    data_asset_name="<YOUR_MEANGINGFUL_NAME>",  # This can be anything that identifies this data_asset for you
    batch_identifiers={"batch_id": "default_identifier"},
    runtime_parameters={"batch_data": df},  # Your dataframe goes here
)

In [18]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()
data = [
    {"a": 1, "b": 2, "c": 3},
    {"a": 4, "b": 5, "c": 6},
    {"a": 7, "b": 8, "c": 9},
]
df = spark.createDataFrame(data)

In [20]:
# Create the "test_suite" expectation suite, overwriting any existing suite
# of the same name so this cell can be re-run.
# NOTE(review): newer Great Expectations releases provide
# `add_or_update_expectation_suite` for this — confirm against the installed
# version before upgrading.
context.create_expectation_suite(
    expectation_suite_name="test_suite", overwrite_existing=True
)
# Get a validator for the runtime batch request, attached to that suite.
validator = context.get_validator(
    batch_request=batch_request, expectation_suite_name="test_suite"
)
# Print the head of the batch as a quick check that the data was picked up.
print(validator.head())

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

   a  b  c
0  1  2  3
1  4  5  6
2  7  8  9


In [21]:
# Display the context; the output below shows its configuration.
context

{
  "anonymous_usage_statistics": {
    "explicit_id": true,
    "usage_statistics_url": "https://stats.greatexpectations.io/great_expectations/v1/usage_statistics",
    "enabled": true,
    "explicit_url": false,
    "data_context_id": "f6ebf043-e9b5-4ce7-b4b0-cfbf584ae7b6"
  },
  "checkpoint_store_name": "checkpoint_store",
  "config_version": 3,
  "data_docs_sites": {
    "local_site": {
      "class_name": "SiteBuilder",
      "show_how_to_buttons": true,
      "store_backend": {
        "class_name": "TupleFilesystemStoreBackend",
        "base_directory": "uncommitted/data_docs/local_site/",
        "root_directory": "/home/jovyan/work/greatexpectations/data_context_config/armazem"
      },
      "site_index_builder": {
        "class_name": "DefaultSiteIndexBuilder"
      }
    }
  },
  "datasources": {
    "pandas": {
      "data_connectors": {
        "tripdata_monthly_configured": {
          "class_name": "ConfiguredAssetFilesystemDataConnector",
          "base_directory": 