In [None]:
pip install --upgrade pip

In [None]:
pip install databricks-labs-dqx

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit, when
import random
import json

from databricks.labs.dqx.profiler.profiler import DQProfiler
from databricks.labs.dqx.profiler.generator import DQGenerator
from databricks.labs.dqx.engine import DQEngine
from databricks.sdk import WorkspaceClient
from databricks.labs.dqx.col_functions import *

In [None]:
# Build a small synthetic DataFrame (id, name, age, gender) with deliberately
# injected nulls and duplicate rows, to give the profiler something to find.

# Fixed seed so that re-running the notebook from a fresh kernel reproduces the
# exact same rows (and therefore the same profiling output).
SEED = 42
random.seed(SEED)

names = ['Alice', 'Bob', 'Esther']
genders = ['F', 'M', None]  # None is a regular choice, so roughly a third of genders are null

data = []

# 90 base rows with ids 1..90.
for i in range(1, 91):
  data.append((
    i,
    random.choice(names) if random.random() > 0.1 else None,    # name is None when the draw is <= 0.1
    random.randint(18, 60) if random.random() > 0.5 else None,  # age is None when the draw is <= 0.5
    random.choice(genders)
    ))

# 10 extra rows picked from the rows collected so far, so the data contains duplicates.
for _ in range(10):
  data.append(random.choice(data))

spark = SparkSession.builder.appName('StartingDQX').getOrCreate()

# Workspace client used later to construct the DQX profiler.
ws_client = WorkspaceClient()

df = spark.createDataFrame(data, ['id', 'name', 'age', 'gender'])

# Null out every id that is a multiple of 15 to simulate missing keys.
df = df.withColumn('id', when(col('id') % 15 == 0, None).otherwise(col('id')))

In [None]:
# Render the synthetic DataFrame for a visual check of the injected nulls and duplicates.
# NOTE(review): `.display()` looks like the Databricks notebook helper rather than an
# open-source PySpark DataFrame method — confirm the target runtime.
df.display()

In [None]:
def data_profile(data_input):
  """Profile ``data_input`` with the DQX profiler.

  Builds a ``DQProfiler`` from the notebook-level ``ws_client`` and runs its
  ``profile`` method on the given input.

  Args:
    data_input: The object passed to ``DQProfiler.profile`` (a Spark DataFrame
      in this notebook).

  Returns:
    The ``(summary_stats, profiles)`` pair returned by ``DQProfiler.profile``.

  Raises:
    Any exception raised while constructing or running the profiler propagates
    to the caller unchanged.
  """
  # No try/except wrapper: the previous handler only re-raised, so letting the
  # exception propagate directly is equivalent and keeps the traceback clean.
  profiler = DQProfiler(ws_client)
  summary_stats, profiles = profiler.profile(data_input)
  return summary_stats, profiles

In [None]:
# Profile the synthetic DataFrame and keep both returned values for the
# print cells that follow.
summary_stats, profiles = data_profile(df)

In [None]:
# Dump the summary statistics as JSON. `default=str` stringifies any value the
# json module cannot encode natively instead of raising TypeError.
print('stats', json.dumps(summary_stats, default=str))

In [None]:
# Print the profiles returned by the profiler, as-is.
print('profile', profiles)

In [None]:
import dlt
from pyspark.sql.functions import col

# 1. Table holding the raw data
@dlt.table(comment="Dados brutos de usuários extraídos de um sistema externo")
def raw_users():
    """Load the raw user records stored as JSON under ``/mnt/raw_data/users/``."""
    source_path = "/mnt/raw_data/users/"
    json_reader = spark.read.format("json")
    return json_reader.load(source_path)

# 2. Table with data-quality rules applied
@dlt.table(comment="Dados de usuários limpos com validações de qualidade")
@dlt.expect("id_not_null", "id IS NOT NULL")
@dlt.expect_or_drop("age_positive", "age > 0")
@dlt.expect_or_fail("email_format_valid", "email LIKE '%@%.%'")
def cleaned_users():
    """Return the ``raw_users`` table with three expectations attached.

    Expectations declared by the decorators (name: SQL constraint):
      * ``id_not_null``: ``id IS NOT NULL`` (plain ``expect``)
      * ``age_positive``: ``age > 0`` (``expect_or_drop``)
      * ``email_format_valid``: ``email LIKE '%@%.%'`` (``expect_or_fail``)
    """
    # NOTE(review): per the DLT expectations docs, `expect` only records violations,
    # `expect_or_drop` removes violating rows and `expect_or_fail` aborts the
    # update — confirm against the runtime in use.
    raw_frame = dlt.read("raw_users")
    return raw_frame

# 3. Active users table
@dlt.table(comment="Usuários ativos com idade entre 18 e 60")
def active_users():
    """Keep the rows of ``cleaned_users`` whose ``age`` is between 18 and 60, inclusive."""
    age_in_range = col("age").between(18, 60)  # inclusive on both bounds
    return dlt.read("cleaned_users").where(age_in_range)