In [1]:
pip install pydeequ==1.0.1

Collecting pydeequ==1.0.1
  Downloading pydeequ-1.0.1-py3-none-any.whl (36 kB)
Installing collected packages: pydeequ
Successfully installed pydeequ-1.0.1
Note: you may need to restart the kernel to use updated packages.


In [2]:
import os

# Export the Spark version before pydeequ is imported in the next cell.
# NOTE(review): the recorded output of the import cell says Deequ does not
# support Spark 3.1 — confirm this value matches the installed Spark.
os.environ.update(SPARK_VERSION='3.1')

In [3]:
import pydeequ
from pydeequ.profiles import *
from pydeequ.checks import *
from pydeequ.verification import *
from pyspark.sql import functions as F
from pyspark.sql import SparkSession

Deequ is still not supported in spark version: 3.1


In [4]:
# Create (or reuse) the Spark session used by every cell below.
# `spark.jars.packages` asks Spark to resolve the Deequ jar from Maven, and
# `spark.jars.excludes` leaves out the f2j artifact named by pydeequ.
spark = (
    SparkSession.builder
    .config("spark.jars.packages", pydeequ.deequ_maven_coord)
    .config("spark.jars.excludes", pydeequ.f2j_maven_coord)
    .getOrCreate()
)

:: loading settings :: url = jar:file:/Users/Elena_Sidorova/opt/anaconda3/lib/python3.9/site-packages/pyspark/jars/ivy-2.5.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /Users/Elena_Sidorova/.ivy2/cache
The jars for the packages stored in: /Users/Elena_Sidorova/.ivy2/jars
com.amazon.deequ#deequ added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-863346a3-218f-4c6a-bebf-f5ace4e765a7;1.0
	confs: [default]
	found com.amazon.deequ#deequ;1.2.2-spark-3.0 in central
	found org.scalanlp#breeze_2.12;0.13.2 in central
	found org.scalanlp#breeze-macros_2.12;0.13.2 in central
	found org.scala-lang#scala-reflect;2.12.1 in central
	found com.github.fommil.netlib#core;1.1.2 in central
	found net.sf.opencsv#opencsv;2.3 in central
	found com.github.rwl#jtransforms;2.4.0 in central
	found junit#junit;4.8.2 in central
	found org.apache.commons#commons-math3;3.2 in central
	found org.spire-math#spire_2.12;0.13.0 in central
	found org.spire-math#spire-macros_2.12;0.13.0 in central
	found org.typelevel#machinist_2.12;0.6.1 in central
	found com.chuusai#shapeless_2.12;2.3.2 in central
	found org.typelevel#macro-c

23/11/13 13:57:45 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [5]:
# Load the shop-clients dataset; the path is relative to the notebook's
# working directory.
df = spark.read.format("parquet").load("shop-clients.parquet")

                                                                                

In [6]:
# Preview the first 20 rows with long values truncated (the defaults of show()).
# NOTE(review): the preview prints name and passport_num columns — confirm the
# data is synthetic before sharing the rendered notebook.
df.show(n=20, truncate=True)

                                                                                

+--------------------+------+---------+----------+-------------+------------+---+------------+---------------+
|                  id|   sex|     name|   surname|  second_name|passport_num|age|total_orders|           city|
+--------------------+------+---------+----------+-------------+------------+---+------------+---------------+
|0000f59d342448a6a...|  male|    Артем|    Коваль|     Петрович|  8634393217| 64|          20|Санкт-Петербург|
|000389762aa34cf99...|female|    Ольга|    Попова|    Денисович|  1468464122| 64|          14|         Липецк|
|000494226d7149699...|  male|    Денис|     Тупин|     Олегович|  4111709475| 36|          55|           Омск|
|000b516589bf4024a...|female|    Елена|  Ермолова|     Павлович|  7119323175| 64|          76|       Новгород|
|004bb7d7b03a4715a...|  male|Александр|    Иванов|    Денисович|  5511145370| 74|          36|         Москва|
|004bc5df7dfe47a29...|  male|  Дмитрий|    Коваль|     Иванович|  4296589650| 20|          17|         Москва|
|

In [7]:
# Print the column names, types and nullability of the loaded DataFrame.
df.printSchema()

root
 |-- id: string (nullable = true)
 |-- sex: string (nullable = true)
 |-- name: string (nullable = true)
 |-- surname: string (nullable = true)
 |-- second_name: string (nullable = true)
 |-- passport_num: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- total_orders: integer (nullable = true)
 |-- city: string (nullable = true)



# Analyzer

In [8]:
from pydeequ.analyzers import *

# Columns for which the Completeness metric is computed.
completeness_columns = ["name", "surname", "second_name", "passport_num", "city"]

# (metric label, SQL predicate) pairs for the Compliance metric; each value is
# the share of rows matching the predicate, so these measure *invalid* rows.
compliance_rules = [
    ("age less than 0", 'age<0'),
    ("age great than 100", 'age>100'),
    ("orders less than 0", 'total_orders<0'),
]

# Register the analyzers in the same order as before: dataset size first,
# then the completeness metrics, then the compliance metrics.
analysis_builder = AnalysisRunner(spark).onData(df).addAnalyzer(Size())
for column_name in completeness_columns:
    analysis_builder = analysis_builder.addAnalyzer(Completeness(column_name))
for rule_label, rule_predicate in compliance_rules:
    analysis_builder = analysis_builder.addAnalyzer(Compliance(rule_label, rule_predicate))
analyzer = analysis_builder.run()

# Collect the computed metrics into a Spark DataFrame and display them.
analysisResult_df = AnalyzerContext.successMetricsAsDataFrame(spark, analyzer)
analysisResult_df.show()

+-------+------------------+------------+-------+
| entity|          instance|        name|  value|
+-------+------------------+------------+-------+
| Column|              city|Completeness|    1.0|
| Column|      passport_num|Completeness|    1.0|
| Column|orders less than 0|  Compliance| 7.5E-4|
| Column|   age less than 0|  Compliance|    0.0|
| Column|       second_name|Completeness|    1.0|
|Dataset|                 *|        Size|20000.0|
| Column|              name|Completeness|    1.0|
| Column|           surname|Completeness|    1.0|
| Column|age great than 100|  Compliance|  0.001|
+-------+------------------+------------+-------+





# Constraint Verification

In [9]:
# Declarative data-quality checks, evaluated later by the VerificationSuite.
# The lambda assertions are called back from the JVM; the cell output shows a
# Python callback server being started for that purpose.
# NOTE(review): the PyDeequ README recommends shutting that callback server
# down and stopping Spark once the notebook is finished — confirm this
# happens at the end of the notebook.

# Age: maximum not above 100, never negative, never null.
check_age = (
    Check(spark, CheckLevel.Warning, "Users Age Check")
    .hasMax("age", lambda x: x <= 100.0)
    .isNonNegative("age")
    .isComplete("age")
)

# Full name (name / surname / patronymic): no nulls allowed.
check_fio = (
    Check(spark, CheckLevel.Warning, "Users FIO Check")
    .isComplete("name")
    .isComplete("surname")
    .isComplete("second_name")
)

# Sex: only the two listed values are accepted.
check_sex = (
    Check(spark, CheckLevel.Warning, "Users Sex Check")
    .isContainedIn("sex", ["male", "female"])
)

# Dataset-level guarantees, reported at Error level: at least 20000 rows and
# a unique id per row.
check_dataset = (
    Check(spark, CheckLevel.Error, "Users Dataset Check")
    .hasSize(lambda x: x >= 20000)
    .isUnique("id")
)

Python Callback server started!


In [10]:
# Evaluate all four checks against the DataFrame in a single verification run.
checkResult = (
    VerificationSuite(spark)
    .onData(df)
    .addCheck(check_age)
    .addCheck(check_fio)
    .addCheck(check_sex)
    .addCheck(check_dataset)
    .run()
)

# One row per constraint: owning check, its level, and the constraint status.
checkResult_df = VerificationResult.checkResultsAsDataFrame(spark, checkResult)
checkResult_df.toPandas().head(20)

                                                                                

Unnamed: 0,check,check_level,check_status,constraint,constraint_status,constraint_message
0,Users Age Check,Warning,Warning,"MaximumConstraint(Maximum(age,None))",Failure,Value: 19399.0 does not meet the constraint re...
1,Users Age Check,Warning,Warning,ComplianceConstraint(Compliance(age is non-neg...,Success,
2,Users Age Check,Warning,Warning,"CompletenessConstraint(Completeness(age,None))",Success,
3,Users FIO Check,Warning,Success,"CompletenessConstraint(Completeness(name,None))",Success,
4,Users FIO Check,Warning,Success,"CompletenessConstraint(Completeness(surname,No...",Success,
5,Users FIO Check,Warning,Success,CompletenessConstraint(Completeness(second_nam...,Success,
6,Users Sex Check,Warning,Success,ComplianceConstraint(Compliance(sex contained ...,Success,
7,Users Dataset Check,Error,Success,SizeConstraint(Size(None)),Success,
8,Users Dataset Check,Error,Success,"UniquenessConstraint(Uniqueness(List(id),None))",Success,


In [11]:
# Metrics computed while evaluating the checks, previewed as a pandas frame.
verification_metrics_df = VerificationResult.successMetricsAsDataFrame(spark, checkResult)
verification_metrics_df.toPandas().head(20)



Unnamed: 0,entity,instance,name,value
0,Column,age is non-negative,Compliance,1.0
1,Column,second_name,Completeness,1.0
2,Column,"sex contained in male,female",Compliance,1.0
3,Dataset,*,Size,20000.0
4,Column,id,Uniqueness,1.0
5,Column,name,Completeness,1.0
6,Column,surname,Completeness,1.0
7,Column,age,Maximum,19399.0
8,Column,age,Completeness,1.0
