In [None]:
# Configuration: declare the job-parameter widgets and read their current values.
import re

dbutils.widgets.text('catalog', 'manufatura_lakehouse')
dbutils.widgets.text('schema_bronze', 'bronze')
dbutils.widgets.text('schema_silver', 'silver')

catalog = dbutils.widgets.get('catalog')
schema_bronze = dbutils.widgets.get('schema_bronze')
schema_silver = dbutils.widgets.get('schema_silver')

# The %sql cells of this notebook splice these widget values into statements as
# raw text (some of them unquoted), so a value that is not a plain identifier
# would either break the SQL or let a job parameter inject extra statements.
# Fail fast here, before any DDL/DML runs.
_PLAIN_IDENTIFIER = re.compile(r'[A-Za-z0-9_]+')
for _widget_name, _widget_value in (
    ('catalog', catalog),
    ('schema_bronze', schema_bronze),
    ('schema_silver', schema_silver),
):
    if not _PLAIN_IDENTIFIER.fullmatch(_widget_value):
        raise ValueError(
            f"Widget '{_widget_name}' must contain only letters, digits and "
            f"underscores, got {_widget_value!r}"
        )

# Echo the effective configuration so it is visible in the job run output.
print(f'Catalog: {catalog}')
print(f'Schema Bronze: {schema_bronze}')
print(f'Schema Silver: {schema_silver}')

In [None]:
%sql
-- Session setup: switch to the configured catalog, create the silver schema
-- there if it does not exist yet, then make it the current schema.
-- The catalog/schema placeholders carry the same names as the widgets declared
-- in the configuration cell; they are backtick-quoted so each value is treated
-- as a single identifier.
USE CATALOG `${catalog}`;
CREATE SCHEMA IF NOT EXISTS `${catalog}`.`${schema_silver}`;
USE SCHEMA `${schema_silver}`;

In [None]:
%sql
-- =========================================================
-- SILVER: equipment_clean (dedup + idempotent load via row hash)
-- =========================================================
-- Target Delta table, created only if it does not exist yet. It is populated
-- by the MERGE at the end of this cell, keyed on equipment_id.
-- The catalog and schema placeholders are backtick-quoted, matching the
-- session-setup cell, so each value is always parsed as a single identifier.
CREATE TABLE IF NOT EXISTS `${catalog}`.`${schema_silver}`.equipment_clean (
  equipment_id      STRING,     -- merge key
  equipment_name    STRING,
  equipment_type    STRING,
  location          STRING,
  installation_date DATE,       -- parsed from the bronze column
  manufacturer      STRING,
  model             STRING,
  status            STRING,     -- upper-cased and trimmed
  last_update_date  TIMESTAMP,  -- parsed from the bronze column
  row_hash          STRING      -- SHA-256 of the descriptive columns, used for change detection
) USING DELTA;

-- 1) Stage: parse installation_date / last_update_date and normalize status.
-- Each input is tried against a list of known layouts with try_to_timestamp,
-- which yields NULL instead of failing on a mismatch; COALESCE keeps the first
-- layout that parses. A value matching no layout ends up NULL.
CREATE OR REPLACE TEMP VIEW stage_equipment AS
SELECT
  equipment_id,
  equipment_name,
  equipment_type,
  location,
  -- installation_date may or may not carry a time part, so the date-time
  -- layouts are tried first and the date-only layouts afterwards. The same
  -- date orders (y-M-d, y/M/d, d/M/y, d-M-y) are accepted with and without a
  -- time, mirroring the layouts accepted for last_update_date below.
  -- Date-only layouts also go through try_to_timestamp (parsed as midnight);
  -- the final CAST drops the time part, so no separate date parser is needed.
  CAST(COALESCE(
    try_to_timestamp(installation_date, 'yyyy-MM-dd HH:mm:ss'),
    -- double-quoted literal because the pattern itself contains single quotes
    try_to_timestamp(installation_date, "yyyy-MM-dd'T'HH:mm:ss"),
    try_to_timestamp(installation_date, 'yyyy/MM/dd HH:mm:ss'),
    try_to_timestamp(installation_date, 'dd/MM/yyyy HH:mm:ss'),
    try_to_timestamp(installation_date, 'dd-MM-yyyy HH:mm:ss'),
    try_to_timestamp(installation_date, 'yyyy-MM-dd'),
    try_to_timestamp(installation_date, 'yyyy/MM/dd'),
    try_to_timestamp(installation_date, 'dd/MM/yyyy'),
    try_to_timestamp(installation_date, 'dd-MM-yyyy')
  ) AS DATE) AS installation_date_parsed,
  manufacturer,
  model,
  UPPER(TRIM(status)) AS status_norm,
  -- last_update_date is kept as a full timestamp; it drives the incremental
  -- window and the dedup ordering in the following steps.
  COALESCE(
    try_to_timestamp(last_update_date, 'yyyy-MM-dd HH:mm:ss'),
    try_to_timestamp(last_update_date, "yyyy-MM-dd'T'HH:mm:ss"),
    try_to_timestamp(last_update_date, 'yyyy/MM/dd HH:mm:ss'),
    try_to_timestamp(last_update_date, 'dd/MM/yyyy HH:mm:ss'),
    try_to_timestamp(last_update_date, 'dd-MM-yyyy HH:mm:ss'),
    try_to_timestamp(last_update_date, 'yyyy-MM-dd'),
    try_to_timestamp(last_update_date, 'yyyy/MM/dd'),
    try_to_timestamp(last_update_date, 'dd/MM/yyyy'),
    try_to_timestamp(last_update_date, 'dd-MM-yyyy')
  ) AS last_update_ts
FROM `${catalog}`.`${schema_bronze}`.equipment_master
-- rows without a key cannot be merged into the target
WHERE equipment_id IS NOT NULL;

-- 2) Incremental window: 90-day watermark on last_update_ts.
-- Rows whose timestamp could not be parsed (NULL) are always let through so
-- that they are never silently dropped from the load.
CREATE OR REPLACE TEMP VIEW stage_equipment_win AS
SELECT staged.*
FROM stage_equipment AS staged
WHERE staged.last_update_ts >= date_sub(current_timestamp(), 90)
   OR staged.last_update_ts IS NULL;

-- 3) Dedup: keep exactly one row per equipment_id, preferring the most recent
-- last_update_ts (rows without a timestamp rank last).
-- The parsed/normalized staging columns are renamed here to the target
-- table's column names.
CREATE OR REPLACE TEMP VIEW stage_equipment_dedup AS
SELECT
  equipment_id,
  equipment_name,
  equipment_type,
  location,
  installation_date_parsed AS installation_date,
  manufacturer,
  model,
  status_norm AS status,
  last_update_ts AS last_update_date
FROM (
  SELECT
    s.*,
    ROW_NUMBER() OVER (
      PARTITION BY equipment_id
      -- Every output column takes part in the ordering: rows that still tie
      -- after all of them are identical in everything this view emits, so the
      -- surviving row (and the hash computed from it) is the same on every
      -- run. Ordering by timestamp and name alone would leave the winner
      -- among tied rows up to the engine.
      ORDER BY last_update_ts DESC NULLS LAST,
               equipment_name DESC,
               equipment_type DESC,
               location DESC,
               installation_date_parsed DESC,
               manufacturer DESC,
               model DESC,
               status_norm DESC
    ) AS rn
  FROM stage_equipment_win s
) z
WHERE rn = 1;

-- 4) Change-detection hash.
-- row_hash is the SHA-256 of the descriptive columns joined with '||', with
-- NULLs replaced by empty strings. equipment_id (the merge key) and
-- last_update_date are not part of the hash.
CREATE OR REPLACE TEMP VIEW stage_equipment_final AS
SELECT
  d.equipment_id,
  d.equipment_name,
  d.equipment_type,
  d.location,
  d.installation_date,
  d.manufacturer,
  d.model,
  d.status,
  d.last_update_date,
  sha2(
    concat_ws(
      '||',
      nvl(d.equipment_name, ''),
      nvl(d.equipment_type, ''),
      nvl(d.location, ''),
      -- dates are rendered in a fixed layout before hashing
      nvl(date_format(d.installation_date, 'yyyy-MM-dd'), ''),
      nvl(d.manufacturer, ''),
      nvl(d.model, ''),
      nvl(d.status, '')
    ),
    256
  ) AS row_hash
FROM stage_equipment_dedup AS d;

-- 5) Idempotent MERGE into the silver table, keyed on equipment_id.
--   * matched rows are rewritten only when the stored hash is missing or
--     differs from the incoming one, so re-running with unchanged input
--     updates nothing;
--   * unmatched source rows are inserted.
-- The catalog and schema placeholders are backtick-quoted, matching the
-- session-setup cell, so each value is always parsed as a single identifier.
-- NOTE(review): the update condition does not compare last_update_date, so an
-- incoming row with an older or NULL timestamp can replace a newer stored row
-- when its hash differs. Confirm against the bronze load semantics whether a
-- recency guard is wanted here.
MERGE INTO `${catalog}`.`${schema_silver}`.equipment_clean AS t
USING stage_equipment_final AS s
ON t.equipment_id = s.equipment_id
WHEN MATCHED AND (t.row_hash IS NULL OR t.row_hash <> s.row_hash) THEN UPDATE SET
  t.equipment_name    = s.equipment_name,
  t.equipment_type    = s.equipment_type,
  t.location          = s.location,
  t.installation_date = s.installation_date,
  t.manufacturer      = s.manufacturer,
  t.model             = s.model,
  t.status            = s.status,
  t.last_update_date  = s.last_update_date,
  t.row_hash          = s.row_hash
WHEN NOT MATCHED THEN INSERT (
  equipment_id, equipment_name, equipment_type, location, installation_date,
  manufacturer, model, status, last_update_date, row_hash
) VALUES (
  s.equipment_id, s.equipment_name, s.equipment_type, s.location, s.installation_date,
  s.manufacturer, s.model, s.status, s.last_update_date, s.row_hash
);
