In [1]:
import json

import pydeequ
from pyspark.sql import SparkSession
from pydeequ.analyzers import *
from pydeequ.checks import *
from pydeequ.verification import *
from pydeequ.profiles import *
from pydeequ.suggestions import *

from config import (
    MINIO_ACCESS_KEY,
    MINIO_SECRET_KEY,
    MINIO_SERVER_HOST,
)
from data_config import TABLE_MAPPINGS
from delta.tables import *
from pyspark.sql import SparkSession
from util.logger import logger

Please set env variable SPARK_VERSION


In [2]:
# Maven coordinates resolved when the session starts.  "spark.jars.packages"
# is a single configuration key, so it is set exactly once here: calling
# .config() for the same key a second time replaces the earlier value rather
# than appending to it.  The Deequ build is pinned explicitly in this list.
spark_packages = ",".join(
    [
        "org.apache.spark:spark-catalyst_2.12:3.2.0",
        "com.amazon.deequ:deequ:2.0.1-spark-3.2",
        "org.apache.spark:spark-core_2.12:3.2.1",
        "org.apache.spark:spark-sql_2.12:3.2.1",
        "io.delta:delta-core_2.12:2.0.0",
        "org.apache.spark:spark-avro_2.12:3.2.0",
        "org.apache.hadoop:hadoop-aws:3.2.3",
        "com.amazonaws:aws-java-sdk:1.11.375",
        "org.apache.spark:spark-tags_2.12:3.2.0",
    ]
)

# Build (or reuse) the Spark session used by every cell below.
spark = (
    SparkSession.builder
    .appName("Test PyDeequ")
    .config("spark.jars.packages", spark_packages)
    .config("spark.jars.excludes", pydeequ.f2j_maven_coord)
    # Delta Lake SQL extension and catalog.
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config(
        "spark.sql.catalog.spark_catalog",
        "org.apache.spark.sql.delta.catalog.DeltaCatalog",
    )
    .config("spark.sql.codegen.wholeStage", "false")
    # S3A connector settings; endpoint and credentials come from the MinIO
    # values imported from the project's config module.
    .config(
        "spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem"
    )
    .config("spark.hadoop.fs.s3a.access.key", MINIO_ACCESS_KEY)
    .config("spark.hadoop.fs.s3a.secret.key", MINIO_SECRET_KEY)
    .config("spark.hadoop.fs.s3a.endpoint", MINIO_SERVER_HOST)
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .config("spark.hadoop.fs.s3a.connection.ssl.enabled", "false")
    .config(
        "spark.hadoop.fs.s3a.aws.credentials.provider",
        "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider",
    )
    .config("spark.databricks.delta.optimize.repartition.enabled", "true")
    .getOrCreate()
)

23/05/17 23:02:59 WARN Utils: Your hostname, ducdn-G3-3579 resolves to a loopback address: 127.0.1.1; using 192.168.2.106 instead (on interface wlo1)
23/05/17 23:02:59 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


:: loading settings :: url = jar:file:/home/ducdn/Documents/spark/jars/ivy-2.5.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/ducdn/.ivy2/cache
The jars for the packages stored in: /home/ducdn/.ivy2/jars
org.apache.spark#spark-catalyst_2.12 added as a dependency
com.amazon.deequ#deequ added as a dependency
org.apache.spark#spark-core_2.12 added as a dependency
org.apache.spark#spark-sql_2.12 added as a dependency
io.delta#delta-core_2.12 added as a dependency
org.apache.spark#spark-avro_2.12 added as a dependency
org.apache.hadoop#hadoop-aws added as a dependency
com.amazonaws#aws-java-sdk added as a dependency
org.apache.spark#spark-tags_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-2a194f11-fdc3-4c3e-a517-749a8f1ed3bb;1.0
	confs: [default]
	found com.amazon.deequ#deequ;2.0.1-spark-3.2 in central
	found org.scala-lang#scala-reflect;2.12.10 in central
	found org.scalanlp#breeze_2.12;0.13.2 in central
	found org.scalanlp#breeze-macros_2.12;0.13.2 in central
	found com.github.fommil.netlib#core;1.1.2 in central
	found net.sf.opencsv#

In [5]:
# Read the order-detail CDC table from the Delta lake, then preview the first
# five rows (long cell values truncated) and print the column schema.
order_detail_path = "s3a://datalake/sliver/cdc.myshop.order_detail"
delta_reader = spark.read.format("delta")
df = delta_reader.load(order_detail_path)
df.show(n=5, truncate=True)
df.printSchema()

+---+-------------+--------+----------+--------+----------+----------+---------------+-----------------+----+-----+---+
| op|        ts_ms|order_id|product_id|quantity|item_price|created_at|before_order_id|before_product_id|year|month|day|
+---+-------------+--------+----------+--------+----------+----------+---------------+-----------------+----+-----+---+
|  r|1684339130238|      11|       385|       1|     325.0|1682414544|             11|              385|2023|    4| 25|
|  r|1684339130240|      18|       578|       1|     624.0|1682414544|             18|              578|2023|    4| 25|
|  r|1684339130242|      34|       194|       1|    1117.0|1682414544|             34|              194|2023|    4| 25|
|  r|1684339130242|      34|       232|       4|    7840.0|1682414544|             34|              232|2023|    4| 25|
|  r|1684339130243|      45|       254|       2|    3746.0|1682414544|             45|              254|2023|    4| 25|
+---+-------------+--------+----------+-

In [6]:
# Compute basic quality metrics on the order-detail frame: the row count,
# the completeness of each listed column, and the mean item price.
completeness_columns = [
    "order_id",
    "product_id",
    "quantity",
    "item_price",
    "created_at",
]

analysis_builder = AnalysisRunner(spark).onData(df).addAnalyzer(Size())
for column_name in completeness_columns:
    analysis_builder = analysis_builder.addAnalyzer(Completeness(column_name))
analysisResult = analysis_builder.addAnalyzer(Mean("item_price")).run()

# Turn the successfully computed metrics into a DataFrame and print it
# without truncating the values.
analysisResult_df = AnalyzerContext.successMetricsAsDataFrame(
    spark, analysisResult
)
analysisResult_df.show(truncate=False)


[Stage 22:>                                                         (0 + 8) / 8]

+-------+----------+------------+------------------+
|entity |instance  |name        |value             |
+-------+----------+------------+------------------+
|Column |order_id  |Completeness|1.0               |
|Column |item_price|Completeness|1.0               |
|Column |item_price|Mean        |2547.2772277227723|
|Column |quantity  |Completeness|1.0               |
|Dataset|*         |Size        |606.0             |
|Column |created_at|Completeness|1.0               |
|Column |product_id|Completeness|1.0               |
+-------+----------+------------+------------------+



                                                                                

#### Constraint suggestion

In [7]:
# Ask Deequ to propose constraints for the order-detail frame, using an
# explicit set of suggestion rules.
suggestionResult = (
    ConstraintSuggestionRunner(spark)
    .onData(df)
    .addConstraintRule(CompleteIfCompleteRule())
    .addConstraintRule(NonNegativeNumbersRule())
    .addConstraintRule(RetainCompletenessRule())
    .addConstraintRule(RetainTypeRule())
    .addConstraintRule(UniqueIfApproximatelyUniqueRule())
    .run()
)

# Pretty-print the suggestions as indented JSON.  `json` is imported
# explicitly in the import cell rather than picked up through a wildcard import.
print(json.dumps(suggestionResult, indent=2))

23/05/17 23:06:46 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
[Stage 37:>                                                         (0 + 8) / 8]

{
  "constraint_suggestions": [
    {
      "constraint_name": "CompletenessConstraint(Completeness(quantity,None))",
      "column_name": "quantity",
      "current_value": "Completeness: 1.0",
      "description": "'quantity' is not null",
      "suggesting_rule": "CompleteIfCompleteRule()",
      "rule_description": "If a column is complete in the sample, we suggest a NOT NULL constraint",
      "code_for_constraint": ".isComplete(\"quantity\")"
    },
    {
      "constraint_name": "ComplianceConstraint(Compliance('quantity' has no negative values,quantity >= 0,None))",
      "column_name": "quantity",
      "current_value": "Minimum: 1.0",
      "description": "'quantity' has no negative values",
      "suggesting_rule": "NonNegativeNumbersRule()",
      "rule_description": "If we see only non-negative numbers in a column, we suggest a corresponding constraint",
      "code_for_constraint": ".isNonNegative(\"quantity\")"
    },
    {
      "constraint_name": "CompletenessConstrain

                                                                                

In [11]:
# Declare a warning-level check: order_id must be complete and item_price
# must be positive.
check = Check(spark, CheckLevel.Warning, "Review Check")

# Run the check against the order-detail frame.
checkResult = (
    VerificationSuite(spark)
    .onData(df)
    .addCheck(check.isComplete("order_id").isPositive("item_price"))
    .run()
)

# Show one row per evaluated constraint, without truncating the text.
# NOTE(review): the output captured with this cell is an empty table —
# confirm that the constraints are actually being evaluated.
checkResult_df = VerificationResult.checkResultsAsDataFrame(
    spark, checkResult
)
checkResult_df.show(truncate=False)

[Stage 48:>                                                         (0 + 8) / 8]

+------------+-----------+------------+-------------------------------------------------------------------------------------------------------------------+-----------------+------------------+
|check       |check_level|check_status|constraint                                                                                                         |constraint_status|constraint_message|
+------------+-----------+------------+-------------------------------------------------------------------------------------------------------------------+-----------------+------------------+
+------------+-----------+------------+-------------------------------------------------------------------------------------------------------------------+-----------------+------------------+



                                                                                

#### Check profile of data

In [8]:
# Profile every column of the order-detail frame with Deequ's column profiler.
result = (
    ColumnProfilerRunner(spark)
    .onData(df)
    .run()
)

# Print each column's profile.  Only the profile objects are needed, so the
# loop iterates the mapping's values and does not bind an unused `col` name
# in the notebook namespace.
for profile in result.profiles.values():
    print(profile)

                                                                                

NumericProfiles for column: quantity: {
    "completeness": 1.0,
    "approximateNumDistinctValues": 4,
    "dataType": "Integral",
    "isDataTypeInferred": false,
    "typeCounts": {},
    "histogram": [
        [
            "1",
            158,
            0.2607260726072607
        ],
        [
            "2",
            145,
            0.23927392739273928
        ],
        [
            "3",
            147,
            0.24257425742574257
        ],
        [
            "4",
            156,
            0.25742574257425743
        ]
    ],
    "kll": "None",
    "mean": 2.4966996699669965,
    "maximum": 4.0,
    "minimum": 1.0,
    "sum": 1513.0,
    "stdDev": 1.1341484639079267,
    "approxPercentiles": []
}
NumericProfiles for column: order_id: {
    "completeness": 1.0,
    "approximateNumDistinctValues": 208,
    "dataType": "Integral",
    "isDataTypeInferred": false,
    "typeCounts": {},
    "histogram": null,
    "kll": "None",
    "mean": 104.21452145214522,
    