In [None]:
pip install --upgrade pip

In [None]:
pip install databricks-labs-dqx

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit, when
import random
import json

from databricks.labs.dqx.profiler.profiler import DQProfiler
from databricks.labs.dqx.profiler.generator import DQGenerator
from databricks.labs.dqx.engine import DQEngine
from databricks.sdk import WorkspaceClient
from databricks.labs.dqx.col_functions import *

In [None]:
# Fixed seed so the synthetic dataset (and every profile computed from it)
# is identical on each Restart & Run All.
SEED = 42
random.seed(SEED)

names = ['Alice', 'Bob', 'Esther']
genders = ['F', 'M', None]

# 90 base rows: sequential id, name left empty with probability 0.1, age left
# empty with probability 0.5, gender drawn uniformly from `genders` (which
# itself contains None).
data = []
for i in range(1, 91):
    data.append((
        i,
        random.choice(names) if random.random() > 0.1 else None,
        random.randint(18, 60) if random.random() > 0.5 else None,
        random.choice(genders),
    ))

# Append 10 rows drawn from the rows collected so far, so the dataset
# contains exact duplicates.
for _ in range(10):
    data.append(random.choice(data))

spark = SparkSession.builder.appName('StartingDQX').getOrCreate()

ws_client = WorkspaceClient()

# Explicit schema: the column types no longer depend on type inference over
# randomly-null values.
df_raw = spark.createDataFrame(data, 'id long, name string, age long, gender string')

# Null out every id divisible by 15 to inject missing keys. The result is
# derived from df_raw (not from df itself), so re-running this cell always
# yields the same frame.
df = df_raw.withColumn('id', when(col('id') % 15 == 0, lit(None)).otherwise(col('id')))

In [None]:
# Render the generated DataFrame for a visual check of the injected nulls and duplicate rows.
# NOTE(review): DataFrame.display() is presumably the Databricks notebook helper — it is not part of open-source PySpark.
df.display()

In [None]:
def data_profile(data_input):
  """Profile a dataset with the DQX profiler.

  Builds a ``DQProfiler`` from the notebook-level ``ws_client`` and runs
  ``profile`` on ``data_input``.

  Args:
    data_input: The dataset handed to ``DQProfiler.profile`` (the notebook
      passes a Spark DataFrame).

  Returns:
    A ``(summary_stats, profiles)`` tuple, exactly as produced by
    ``DQProfiler.profile``.

  Raises:
    Any exception raised by the profiler propagates unchanged to the caller.
  """
  # No try/except here: the previous handler only re-raised, so letting the
  # exception propagate directly is equivalent and leaves no unused binding.
  profiler = DQProfiler(ws_client)
  summary_stats, profiles = profiler.profile(data_input)
  return summary_stats, profiles

In [None]:
# Profile the synthetic DataFrame and keep both returned values for the inspection cells below.
summary_stats, profiles = data_profile(df)

In [None]:
# Dump the summary statistics as a JSON string.
# default=str keeps the cell from raising TypeError should the profiler return
# values json cannot encode natively (e.g. dates or decimals); natively
# encodable values are serialized exactly as before.
print('stats', json.dumps(summary_stats, default=str))

In [None]:
# Print the second value returned by data_profile (`profiles`) using its default string representation.
print('profile', profiles)

In [None]:
# NOTE(review): the definitions below use the Delta Live Tables (DLT) Python
# API, which is only usable when the notebook runs as part of a DLT pipeline —
# confirm this cell is meant to live next to the interactive DQX cells above.
#
# The module is imported as a whole because the table/expectation decorators
# and dlt.read are all accessed through it (@dlt.table, @dlt.expect,
# @dlt.expect_or_drop, dlt.read).
import dlt
from pyspark.sql.functions import col, to_date, sum as sum_

# Landing location of the raw order files ingested by the bronze table.
RAW_ORDERS_PATH = "/mnt/raw/orders"


# Bronze layer: raw data ingestion
@dlt.table(name="bronze_orders")
def bronze_orders():
    """Incrementally ingest the raw JSON order files with Auto Loader."""
    return (
        spark.readStream.format("cloudFiles")
        .option("cloudFiles.format", "json")
        # Without this option Auto Loader infers every JSON column as a
        # string; the downstream amount check and sum expect a numeric column.
        .option("cloudFiles.inferColumnTypes", "true")
        .load(RAW_ORDERS_PATH)
    )


# Silver layer: cleaning and validation (the rules are DLT expectations)
@dlt.table(name="silver_orders")
@dlt.expect("valid_order_id", "order_id IS NOT NULL")
@dlt.expect_or_drop("valid_total_amount", "total_amount >= 0")
@dlt.expect("valid_order_date", "order_date IS NOT NULL")
def silver_orders():
    """Read the bronze orders and parse ``order_date`` as a ``yyyy-MM-dd`` date.

    Only ``valid_total_amount`` is declared with ``expect_or_drop``; the other
    two rules use plain ``expect``.
    """
    bronze = dlt.read("bronze_orders")
    return bronze.withColumn("order_date", to_date(col("order_date"), "yyyy-MM-dd"))


# Gold layer: aggregation
@dlt.table(name="gold_sales_summary")
def gold_sales_summary():
    """Sum ``total_amount`` per ``order_date`` into a ``daily_sales`` column."""
    silver = dlt.read("silver_orders")
    # Alias the aggregate directly instead of renaming the generated
    # "sum(total_amount)" column afterwards.
    return silver.groupBy("order_date").agg(sum_("total_amount").alias("daily_sales"))
