In [1]:
# Instalar a vesão 3.0.3 do PySpark
!pip install pyspark==3.0.3

# Instalar o NGROK
!wget -qnc https://bin.equinox.io/c/4VmDzA7iaHb/ngrok-stable-linux-amd64.zip
!unzip -n -q ngrok-stable-linux-amd64.zip

# Autenticar a sessão do SparkUI com NGROK
!./ngrok authtoken 2KBeQEmmd1YNlQ86GGKf3KFOkb3_6sQH7JEnvEhDxwn9A7WnT
get_ipython().system_raw('./ngrok http 4050 &')
!sleep 10
!curl -s http://localhost:4040/api/tunnels | grep -Po 'public_url":"(?=https)\K[^"]*'

from google.colab import drive
drive.mount('/content/drive')

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark==3.0.3
  Downloading pyspark-3.0.3.tar.gz (209.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m209.1/209.1 MB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting py4j==0.10.9
  Downloading py4j-0.10.9-py2.py3-none-any.whl (198 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m198.6/198.6 KB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.0.3-py2.py3-none-any.whl size=209435969 sha256=9bcd177f13646178e4b1b9a28fbb5be1f64cfe16a651f5ca3d60076247666d5d
  Stored in directory: /root/.cache/pip/wheels/7f/28/02/0373b4f55c817bebc0dfe467728642269180aecfc771cebd06
Successfully built pyspark
Installing collected packages: py4j, py

In [2]:
!pip install pydeequ

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pydeequ
  Downloading pydeequ-1.0.1-py3-none-any.whl (36 kB)
Installing collected packages: pydeequ
Successfully installed pydeequ-1.0.1


In [3]:
from pyspark.sql import SparkSession

import pydeequ

spark = (
    SparkSession.builder                  
      .config('spark.ui.port', '4050')
      .config("spark.jars.packages", pydeequ.deequ_maven_coord)
      .config("spark.jars.excludes", pydeequ.f2j_maven_coord)
      .appName("SparkSQL")
      .getOrCreate()
)


ERROR:logger:Please set env variable SPARK_VERSION


In [4]:
from pyspark.sql.types import StructType, StructField, IntegerType, DoubleType, StringType, TimestampType
from pyspark.sql.functions import col

schema_remetente_destinatario = StructType([
    StructField('nome', StringType()),
    StructField('banco', StringType()),
    StructField('tipo', StringType())
])

schema_base_pix = StructType([
    StructField('id_transacao', IntegerType()),
    StructField('valor', DoubleType()),
    StructField('remetente', schema_remetente_destinatario),
    StructField('destinatario', schema_remetente_destinatario),
    StructField('chave_pix', StringType()),
    StructField('categoria', StringType()),
    StructField('transaction_date', StringType()),
    StructField('fraude', IntegerType())
])

caminho_json = 'drive/MyDrive/Colab Notebooks/case_final.json'

df = spark.read.json(
    caminho_json,
    schema=schema_base_pix,
    timestampFormat="yyyy-MM-dd HH:mm:ss"
)

df = df.withColumn(
      'destinatario_nome', col('destinatario').getField('nome')
    ).withColumn(
      'destinatario_banco', col('destinatario').getField('banco')
    ).withColumn(
      'destinatario_tipo', col('destinatario').getField('tipo')
    ).withColumn(
      'remetente_nome', col('remetente').getField('nome')
    ).withColumn(
      'remetente_banco', col('remetente').getField('banco')
    ).withColumn(
      'remetente_tipo', col('remetente').getField('tipo')
).drop('remetente', 'destinatario')

In [6]:
from pydeequ.analyzers import AnalysisRunner, AnalyzerContext, ApproxCountDistinct, Completeness, Compliance, Mean, Size


analysisResult = (
    AnalysisRunner(spark).onData(df)
    .addAnalyzer(Size())
    .addAnalyzer(Completeness('id_transacao'))
    .addAnalyzer(Compliance("valor", "valor > 0"))
    .run()
)


In [7]:
analysisResult

JavaObject id=o89

In [8]:
analysisResult_df = AnalyzerContext.successMetricsAsDataFrame(spark, analysisResult)


In [9]:
analysisResult_df.show()

+-------+------------+------------+--------+
| entity|    instance|        name|   value|
+-------+------------+------------+--------+
|Dataset|           *|        Size|100000.0|
| Column|id_transacao|Completeness|     1.0|
| Column|       valor|  Compliance| 0.99972|
+-------+------------+------------+--------+



In [10]:
from pydeequ.suggestions import ConstraintSuggestionRunner, DEFAULT

suggestionResult = ConstraintSuggestionRunner(spark).onData(df).addConstraintRule(DEFAULT()).run()

In [11]:
for sugg in suggestionResult['constraint_suggestions']:
  print(f"Sugestao de Constraint: \'{sugg['column_name']}\': {sugg['description']}")
  print(f"PySpark Code: {sugg['code_for_constraint']}\n")

Sugestao de Constraint: 'destinatario_nome': 'destinatario_nome' is not null
PySpark Code: .isComplete("destinatario_nome")

Sugestao de Constraint: 'remetente_nome': 'remetente_nome' has value range 'Jonathan Gonsalves'
PySpark Code: .isContainedIn("remetente_nome", ["Jonathan Gonsalves"])

Sugestao de Constraint: 'remetente_nome': 'remetente_nome' is not null
PySpark Code: .isComplete("remetente_nome")

Sugestao de Constraint: 'id_transacao': 'id_transacao' is not null
PySpark Code: .isComplete("id_transacao")

Sugestao de Constraint: 'id_transacao': 'id_transacao' has no negative values
PySpark Code: .isNonNegative("id_transacao")

Sugestao de Constraint: 'id_transacao': 'id_transacao' is unique
PySpark Code: .isUnique("id_transacao")

Sugestao de Constraint: 'remetente_banco': 'remetente_banco' has value range 'BTG'
PySpark Code: .isContainedIn("remetente_banco", ["BTG"])

Sugestao de Constraint: 'remetente_banco': 'remetente_banco' is not null
PySpark Code: .isComplete("remetente_

In [12]:
from pydeequ.checks import Check, CheckLevel, ConstrainableDataTypes
from pydeequ.verification import VerificationResult, VerificationSuite

check = Check(spark, CheckLevel.Warning, "Review Check")
error = Check(spark, CheckLevel.Error, "Error")

In [13]:
checkResult = (
    VerificationSuite(spark)
      .onData(df)
      .addCheck(
        check.hasDataType("id_transacao",ConstrainableDataTypes.Integral)
        .isNonNegative("id_transacao")
        .isComplete("id_transacao") 
        isUnique('id_transcao')
      )
  .run()
)

In [14]:
checkResult_df = VerificationResult.checkResultsAsDataFrame(spark, checkResult)
checkResult_df.show(truncate=False)

+------------+-----------+------------+----------------------------------------------------------------------------------------------------------------------------+-----------------+------------------+
|check       |check_level|check_status|constraint                                                                                                                  |constraint_status|constraint_message|
+------------+-----------+------------+----------------------------------------------------------------------------------------------------------------------------+-----------------+------------------+
+------------+-----------+------------+----------------------------------------------------------------------------------------------------------------------------+-----------------+------------------+



In [15]:
checkResult = (
    VerificationSuite(spark)
      .onData(df)
      .addCheck(
        error
          .isContainedIn("remetente_tipo", ["CNPJ"])
      )
  .run()
)

In [16]:
checkResult_df = VerificationResult.checkResultsAsDataFrame(spark, checkResult)
checkResult_df.show(truncate=False)

+-----+-----------+------------+--------------------------------------------------------------------------------------------------------------------------------+-----------------+----------------------------------------------------+
|check|check_level|check_status|constraint                                                                                                                      |constraint_status|constraint_message                                  |
+-----+-----------+------------+--------------------------------------------------------------------------------------------------------------------------------+-----------------+----------------------------------------------------+
|Error|Error      |Error       |ComplianceConstraint(Compliance(remetente_tipo contained in CNPJ,`remetente_tipo` IS NULL OR `remetente_tipo` IN ('CNPJ'),None))|Failure          |Value: 0.0 does not meet the constraint requirement!|
+-----+-----------+------------+------------------------------------