# Bronze layer


## 0) Dependências


In [None]:
# Databricks: instalar Faker (persistente no cluster enquanto ativo)
# Se seu cluster já tem Faker, pode ignorar esta célula.
%pip install Faker


## 1) Parâmetros e helpers


In [None]:
import random
import string
from datetime import datetime, timedelta, timezone

from faker import Faker
from pyspark.sql import Row
from pyspark.sql import functions as F, types as T

# ===== Parameters =====
# Where the Bronze tables are written, the RNG seed, the number of rows
# generated per table, and the probabilities of each injected data-quality issue.
CATALOG = "manufatura_lakehouse"         # example: "manufatura_lakehouse", or None when Unity Catalog is not used
SCHEMA  = "bronze"                        # schema/database in which the tables are created
SEED    = 42

# Base row counts per table (intentional duplicates are appended on top of these).
N_EQUIPMENTS        = 50
N_IOT_READINGS      = 50000
N_PRODUCTION_ORDERS = 2000
N_MAINTENANCE_ORDERS = 500
N_QUALITY_INSPECTIONS = 1500

# Probabilities of the injected "problems"
P_DUP_EQUIPMENT            = 0.03
P_STATUS_CASE_VARIATION    = 0.25
P_STRING_DATE_IN_FIELDS    = 0.50
P_STRING_NUMERIC_IN_FIELDS  = 0.15
P_NULL_LOCATION            = 0.05
P_EQUIPMENT_DUP_DIFF_UPDATE = 0.10

P_DUP_IOT_READINGS         = 0.05
P_IOT_OUT_OF_RANGE         = 0.02
P_SENSOR_TYPE_VARIATION    = 0.20

P_DUP_PRODUCTION_ORDERS    = 0.025
P_NULL_EQUIPMENT_IN_ORDERS = 0.05
P_STATUS_INCONSISTENT      = 0.25

P_DUP_MAINTENANCE_ORDERS   = 0.03
P_MAINTENANCE_TYPE_VARIATION = 0.20
P_PRIORITY_VARIATION       = 0.25

P_DUP_INSPECTIONS          = 0.03
P_BOOL_AS_STRING           = 0.30
P_DEFECT_CODE_VARIATION    = 0.25

# Seed the stdlib `random` module and Faker with the same SEED.
random.seed(SEED)
fake = Faker("pt_BR")
Faker.seed(SEED)

# ===== Fully qualified table name =====
def fqtn(table):
    """Return the backtick-quoted, fully qualified name of *table*.

    When CATALOG is truthy the result is `CATALOG`.`SCHEMA`.`table`;
    otherwise it is `SCHEMA`.`table`.
    """
    parts = [CATALOG, SCHEMA, table] if CATALOG else [SCHEMA, table]
    return ".".join(f"`{part}`" for part in parts)

# ===== Create schema/database =====
# Without a catalog, a plain database named after SCHEMA is enough; with one,
# make sure the catalog exists first and then the schema inside it.
if not CATALOG:
    spark.sql(f"CREATE DATABASE IF NOT EXISTS `{SCHEMA}`")
else:
    spark.sql(f"CREATE CATALOG IF NOT EXISTS `{CATALOG}`")
    spark.sql(f"CREATE SCHEMA  IF NOT EXISTS `{CATALOG}`.`{SCHEMA}`")

# ===== Utilities =====
# Canonical value sets the generators draw from. Casing/separator noise is
# applied at generation time, not stored here.
EQUIPMENT_STATUSES = ["operational", "maintenance", "idle", "broken", "retired"]
PRODUCTION_STATUSES = ["planned", "in_progress", "completed", "cancelled", "on_hold"]
MAINTENANCE_STATUSES = ["scheduled", "in_progress", "completed", "cancelled"]
MAINTENANCE_TYPES = ["preventive", "corrective", "predictive", "emergency"]
MAINTENANCE_PRIORITIES = ["low", "medium", "high", "critical"]
SENSOR_TYPES = ["temperature", "vibration", "pressure", "humidity", "current"]
DEFECT_CODES = ["D001", "D002", "D003", "D004", "D005", "D006"]
DEFECT_CATEGORIES = ["dimensional", "surface", "material", "assembly"]

def random_status_inconsistent(statuses):
    """Pick a random status and, sometimes, distort its casing or separators.

    With probability P_STATUS_CASE_VARIATION the chosen value is replaced by
    one of: upper-case, capitalized, lower-case, underscores turned into
    spaces, or underscores turned into hyphens. Otherwise it is returned as is.
    """
    picked = random.choice(statuses)
    if random.random() >= P_STATUS_CASE_VARIATION:
        return picked

    variants = [
        picked.upper(),
        picked.capitalize(),
        picked.lower(),
        picked.replace("_", " "),
        picked.replace("_", "-"),
    ]
    return random.choice(variants)

def random_date_between(days_back=365):
    """Return a random naive UTC datetime within the last *days_back* days.

    The offset subtracted from the current UTC time is a whole number of days
    in [0, days_back] plus a number of seconds in [0, 86399].

    Args:
        days_back: Maximum number of whole days to go back from now.

    Returns:
        A naive ``datetime`` (no tzinfo) expressed in UTC.
    """
    # datetime.utcnow() is deprecated since Python 3.12. Take an aware UTC "now"
    # and drop tzinfo so callers keep receiving a naive datetime, as before.
    now_utc = datetime.now(timezone.utc).replace(tzinfo=None)
    offset = timedelta(days=random.randint(0, days_back), seconds=random.randint(0, 86399))
    return now_utc - offset

def random_date_mixed_formats(dt):
    """Render *dt* as text using one randomly chosen date/time layout.

    The candidate layouts mix ISO-like, slash-separated and day-first forms,
    some with a time component and some without.
    """
    layouts = (
        "%Y-%m-%d",
        "%Y/%m/%d",
        "%Y-%m-%d %H:%M:%S",
        "%d/%m/%Y",
        "%d-%m-%Y %H:%M:%S",
        "%Y-%m-%dT%H:%M:%S",
    )
    chosen = random.choice(layouts)
    return dt.strftime(chosen)

def maybe_stringify_number(x):
    """Return *x* formatted as text with probability P_STRING_NUMERIC_IN_FIELDS, else *x* unchanged."""
    as_text = random.random() < P_STRING_NUMERIC_IN_FIELDS
    return f"{x}" if as_text else x

def maybe_null(val, p=0.1):
    """Return None with probability *p*; otherwise return *val* unchanged."""
    if random.random() < p:
        return None
    return val

def random_bool_inconsistent():
    """Return a random boolean, sometimes as one of several textual spellings.

    With probability P_BOOL_AS_STRING the result is the string form of a
    value drawn from a mix of truthy/falsy spellings (including the real
    booleans, which become "True"/"False"). Otherwise a real bool is returned.
    """
    if random.random() >= P_BOOL_AS_STRING:
        return random.choice([True, False])

    spellings = ["true", "1", "yes", "Y", "false", "0", "no", "N", True, False]
    return str(random.choice(spellings))

def alnum(n=8):
    """Return a random string of *n* upper-case ASCII letters and digits."""
    alphabet = string.ascii_uppercase + string.digits
    picked = random.choices(alphabet, k=n)
    return "".join(picked)


## 2) Geração da Bronze – equipment_master


In [None]:
# Builds and writes the Bronze `equipment_master` table.
# Every column is STRING to simulate a heterogeneous raw source.
equipment_schema = T.StructType([
    T.StructField("equipment_id",      T.StringType(), True),
    T.StructField("equipment_name",      T.StringType(), True),
    T.StructField("equipment_type",       T.StringType(), True),
    T.StructField("location",             T.StringType(), True),
    T.StructField("installation_date",   T.StringType(), True),
    T.StructField("manufacturer",         T.StringType(), True),
    T.StructField("model",               T.StringType(), True),
    T.StructField("status",              T.StringType(), True),  # inconsistent casing/separators
    T.StructField("last_update_date",    T.StringType(), True),
])

equipment_types = ["CNC", "Press", "Welder", "Assembly", "Packaging", "Quality Control"]
manufacturers = ["Siemens", "ABB", "Fanuc", "Bosch", "Schneider", "Rockwell"]
locations = ["Linha 1", "Linha 2", "Linha 3", "Almoxarifado", "Manutenção", "Qualidade"]

# One row per equipment, with sequential IDs starting at EQ10000.
rows = []
for i in range(N_EQUIPMENTS):
    eid = f"EQ{10000+i:05d}"
    eq_type = random.choice(equipment_types)
    name = f"{eq_type}-{random.choice(['Alpha','Beta','Gamma','Delta'])}-{alnum(4)}"
    # Location is nulled with probability P_NULL_LOCATION.
    location = None if random.random() < P_NULL_LOCATION else random.choice(locations)
    install_date = random_date_between(2000)
    
    row = Row(
        equipment_id      = eid,
        equipment_name    = name,
        equipment_type    = eq_type,
        location          = location,
        # Random layout with probability P_STRING_DATE_IN_FIELDS, otherwise plain YYYY-MM-DD.
        installation_date = random_date_mixed_formats(install_date) if random.random() < P_STRING_DATE_IN_FIELDS else install_date.strftime("%Y-%m-%d"),
        manufacturer      = random.choice(manufacturers),
        model             = f"MOD-{random.randint(100,999)}",
        status            = random_status_inconsistent(EQUIPMENT_STATUSES),
        # Always rendered in a random layout.
        last_update_date  = random_date_mixed_formats(random_date_between(500))
    )
    rows.append(row)

# Intentional equipment_id duplicates: copy an existing row, keep or null its
# location (about 50/50), and re-draw status and last_update_date.
dup_count = int(N_EQUIPMENTS * P_DUP_EQUIPMENT)
for _ in range(dup_count):
    base = random.choice(rows).asDict()
    base["location"] = base["location"] if random.random() > 0.5 else None
    base["status"] = random_status_inconsistent(EQUIPMENT_STATUSES)
    base["last_update_date"] = random_date_mixed_formats(random_date_between(200))
    rows.append(Row(**base))

# Overwrite the Delta table and preview a few rows.
df_equipment = spark.createDataFrame(rows, schema=equipment_schema)
df_equipment.write.mode("overwrite").format("delta").saveAsTable(fqtn("equipment_master"))
display(df_equipment.limit(5))


## 3) Geração da Bronze – iot_sensor_readings


In [None]:
# Builds and writes the Bronze `iot_sensor_readings` table.
# All columns are STRING so the table mimics a heterogeneous raw source.
iot_schema = T.StructType([
    T.StructField("reading_id",        T.StringType(), True),
    T.StructField("equipment_id",      T.StringType(), True),
    T.StructField("sensor_id",         T.StringType(), True),
    T.StructField("sensor_type",       T.StringType(), True),
    T.StructField("reading_value",     T.StringType(), True),  # numeric value stored as text
    T.StructField("reading_timestamp", T.StringType(), True),  # inconsistent timestamp layouts
    T.StructField("unit",              T.StringType(), True),
])

# Sorted on purpose: the row order returned by distinct().collect() is not
# guaranteed, and random.choice() over an arbitrarily ordered list would make
# the generated data differ between runs despite the fixed SEED.
equipment_ids = sorted(
    r["equipment_id"] for r in df_equipment.select("equipment_id").distinct().collect()
)

# Nominal (min, max) range per sensor type
sensor_ranges = {
    "temperature": (20, 100),  # Celsius
    "vibration": (0, 50),      # mm/s
    "pressure": (0, 10),        # bar
    "humidity": (30, 90),      # %
    "current": (0, 100)        # Amperes
}

units_map = {
    "temperature": "°C",
    "vibration": "mm/s",
    "pressure": "bar",
    "humidity": "%",
    "current": "A"
}

rows = []
for i in range(N_IOT_READINGS):
    eq_id = random.choice(equipment_ids)
    sensor_type = random.choice(SENSOR_TYPES)
    # Variation in how the type name is written (casing). The canonical
    # `sensor_type` is still used for the sensor id, range and unit lookups.
    if random.random() < P_SENSOR_TYPE_VARIATION:
        sensor_type_variants = [sensor_type.upper(), sensor_type.capitalize(), sensor_type.replace("_", " ")]
        sensor_type_display = random.choice(sensor_type_variants)
    else:
        sensor_type_display = sensor_type

    sensor_id = f"SENS-{eq_id}-{sensor_type[:3].upper()}-{random.randint(1,5)}"

    min_val, max_val = sensor_ranges[sensor_type]
    base_value = random.uniform(min_val, max_val)

    # Out-of-range values (anomalies). They are anchored to the range bounds
    # rather than to base_value: scaling an in-range base value (e.g. 25 * 1.5)
    # frequently lands back inside the nominal range.
    if random.random() < P_IOT_OUT_OF_RANGE:
        span = max_val - min_val
        if random.random() < 0.5:
            value = max_val + span * random.uniform(0.5, 2.0)   # strictly above the range
        else:
            value = min_val - span * random.uniform(0.05, 0.5)  # strictly below the range (may be negative)
    else:
        value = base_value

    timestamp = random_date_between(90)  # last 90 days
    # 30% of the timestamps use a random layout; the rest use "YYYY-MM-DD HH:MM:SS".
    reading_ts = random_date_mixed_formats(timestamp) if random.random() < 0.3 else timestamp.strftime("%Y-%m-%d %H:%M:%S")

    row = Row(
        reading_id        = f"IOT{1000000+i:08d}",
        equipment_id      = eq_id,
        sensor_id         = sensor_id,
        sensor_type       = sensor_type_display,
        # NOTE(review): both branches yield the same text; kept so the sequence
        # of RNG draws (and therefore the rest of the data) is unchanged.
        reading_value     = str(round(value, 2)) if random.random() < P_STRING_NUMERIC_IN_FIELDS else f"{round(value, 2)}",
        reading_timestamp = reading_ts,
        unit              = units_map[sensor_type]
    )
    rows.append(row)

# Duplicates: copies of existing readings (same reading_id, equipment_id,
# sensor_id and timestamp) whose reading_value is nudged by at most +/-0.1.
dup_count = int(N_IOT_READINGS * P_DUP_IOT_READINGS)
for _ in range(dup_count):
    base = random.choice(rows).asDict()
    base["reading_value"] = str(round(float(base["reading_value"]) + random.uniform(-0.1, 0.1), 2))
    rows.append(Row(**base))

# Overwrite the Delta table and preview a few rows.
df_iot = spark.createDataFrame(rows, schema=iot_schema)
df_iot.write.mode("overwrite").format("delta").saveAsTable(fqtn("iot_sensor_readings"))
display(df_iot.limit(5))


## 4) Geração da Bronze – production_orders


In [None]:
# Builds and writes the Bronze `production_orders` table (all columns STRING).
# Relies on `equipment_ids` defined in the IoT readings cell.
production_schema = T.StructType([
    T.StructField("production_order_id", T.StringType(), True),
    T.StructField("equipment_id",        T.StringType(), True),
    T.StructField("product_id",          T.StringType(), True),
    T.StructField("planned_start",       T.StringType(), True),
    T.StructField("planned_end",         T.StringType(), True),
    T.StructField("actual_start",        T.StringType(), True),
    T.StructField("actual_end",         T.StringType(), True),
    T.StructField("planned_quantity",    T.StringType(), True),
    T.StructField("actual_quantity",    T.StringType(), True),
    T.StructField("status",              T.StringType(), True),
    T.StructField("last_update",         T.StringType(), True),
])

product_ids = [f"PROD{1000+i:05d}" for i in range(100)]  # 100 distinct products

rows = []
for i in range(N_PRODUCTION_ORDERS):
    po_id = f"PO{100000+i:06d}"
    # Equipment reference is nulled with probability P_NULL_EQUIPMENT_IN_ORDERS.
    eq_id = None if random.random() < P_NULL_EQUIPMENT_IN_ORDERS else random.choice(equipment_ids)
    prod_id = random.choice(product_ids)
    
    planned_start = random_date_between(180)
    planned_duration_hours = random.randint(2, 24)
    planned_end = planned_start + timedelta(hours=planned_duration_hours)
    
    # Actual times may differ from the planned ones
    actual_start_offset = random.randint(-2, 4)  # hours early (negative) or late (positive)
    actual_start = planned_start + timedelta(hours=actual_start_offset)
    actual_duration_hours = planned_duration_hours + random.randint(-2, 3)
    # Clamp so the actual duration is at least one hour.
    actual_end = actual_start + timedelta(hours=max(1, actual_duration_hours))
    
    planned_qty = random.randint(100, 5000)
    efficiency = random.uniform(0.85, 1.05)  # may exceed 1.0 (overproduction)
    actual_qty = int(planned_qty * efficiency)
    
    status = random_status_inconsistent(PRODUCTION_STATUSES)
    
    # Each date field independently uses a random layout with probability
    # P_STRING_DATE_IN_FIELDS, otherwise "YYYY-MM-DD HH:MM:SS".
    # NOTE(review): for the quantity fields both branches of the conditional
    # produce the same text for an int, so no variation is actually introduced.
    row = Row(
        production_order_id = po_id,
        equipment_id       = eq_id,
        product_id         = prod_id,
        planned_start      = random_date_mixed_formats(planned_start) if random.random() < P_STRING_DATE_IN_FIELDS else planned_start.strftime("%Y-%m-%d %H:%M:%S"),
        planned_end        = random_date_mixed_formats(planned_end) if random.random() < P_STRING_DATE_IN_FIELDS else planned_end.strftime("%Y-%m-%d %H:%M:%S"),
        actual_start       = random_date_mixed_formats(actual_start) if random.random() < P_STRING_DATE_IN_FIELDS else actual_start.strftime("%Y-%m-%d %H:%M:%S"),
        actual_end         = random_date_mixed_formats(actual_end) if random.random() < P_STRING_DATE_IN_FIELDS else actual_end.strftime("%Y-%m-%d %H:%M:%S"),
        planned_quantity   = str(planned_qty) if random.random() < P_STRING_NUMERIC_IN_FIELDS else f"{planned_qty}",
        actual_quantity    = str(actual_qty) if random.random() < P_STRING_NUMERIC_IN_FIELDS else f"{actual_qty}",
        status             = status,
        last_update        = random_date_mixed_formats(random_date_between(365))
    )
    rows.append(row)

# Duplicates: copies of existing orders with a freshly drawn status
dup_count = int(N_PRODUCTION_ORDERS * P_DUP_PRODUCTION_ORDERS)
for _ in range(dup_count):
    base = random.choice(rows).asDict()
    base["status"] = random_status_inconsistent(PRODUCTION_STATUSES)
    rows.append(Row(**base))

# Overwrite the Delta table and preview a few rows.
df_production = spark.createDataFrame(rows, schema=production_schema)
df_production.write.mode("overwrite").format("delta").saveAsTable(fqtn("production_orders"))
display(df_production.limit(5))


## 5) Geração da Bronze – maintenance_orders


In [None]:
# Builds and writes the Bronze `maintenance_orders` table (all columns STRING).
# Relies on `equipment_ids` defined in the IoT readings cell.
maintenance_schema = T.StructType([
    T.StructField("maintenance_order_id", T.StringType(), True),
    T.StructField("equipment_id",          T.StringType(), True),
    T.StructField("maintenance_type",     T.StringType(), True),
    T.StructField("scheduled_start",       T.StringType(), True),
    T.StructField("scheduled_end",         T.StringType(), True),
    T.StructField("actual_start",          T.StringType(), True),
    T.StructField("actual_end",            T.StringType(), True),
    T.StructField("technician_id",         T.StringType(), True),
    T.StructField("status",                T.StringType(), True),
    T.StructField("priority",              T.StringType(), True),
    T.StructField("description",           T.StringType(), True),
    T.StructField("last_update",           T.StringType(), True),
])

technician_ids = [f"TECH{100+i:03d}" for i in range(20)]  # 20 technicians
descriptions = [
    "Troca de filtros",
    "Lubrificação",
    "Calibração",
    "Substituição de peças",
    "Inspeção visual",
    "Teste de funcionamento",
    "Limpeza geral",
    "Ajuste de parâmetros"
]

rows = []
for i in range(N_MAINTENANCE_ORDERS):
    mo_id = f"MO{10000+i:05d}"
    eq_id = random.choice(equipment_ids)
    
    # With probability P_MAINTENANCE_TYPE_VARIATION the type is rewritten in a
    # different casing. The values in MAINTENANCE_TYPES contain no underscore,
    # so the replace("_", " ") variant leaves the value unchanged.
    maint_type = random.choice(MAINTENANCE_TYPES)
    if random.random() < P_MAINTENANCE_TYPE_VARIATION:
        maint_type_variants = [maint_type.upper(), maint_type.capitalize(), maint_type.replace("_", " ")]
        maint_type = random.choice(maint_type_variants)
    
    scheduled_start = random_date_between(180)
    scheduled_duration_hours = random.randint(1, 8)
    scheduled_end = scheduled_start + timedelta(hours=scheduled_duration_hours)
    
    # Actual times may differ from the scheduled ones
    actual_start_offset = random.randint(-1, 3)
    actual_start = scheduled_start + timedelta(hours=actual_start_offset)
    actual_duration_hours = scheduled_duration_hours + random.randint(-1, 2)
    # Clamp so the actual duration is at least one hour.
    actual_end = actual_start + timedelta(hours=max(1, actual_duration_hours))
    
    # Same casing-variation scheme as the maintenance type, driven by P_PRIORITY_VARIATION.
    priority = random.choice(MAINTENANCE_PRIORITIES)
    if random.random() < P_PRIORITY_VARIATION:
        priority_variants = [priority.upper(), priority.capitalize(), priority.replace("_", " ")]
        priority = random.choice(priority_variants)
    
    status = random_status_inconsistent(MAINTENANCE_STATUSES)
    
    # Each date field independently uses a random layout with probability
    # P_STRING_DATE_IN_FIELDS, otherwise "YYYY-MM-DD HH:MM:SS".
    row = Row(
        maintenance_order_id = mo_id,
        equipment_id         = eq_id,
        maintenance_type     = maint_type,
        scheduled_start      = random_date_mixed_formats(scheduled_start) if random.random() < P_STRING_DATE_IN_FIELDS else scheduled_start.strftime("%Y-%m-%d %H:%M:%S"),
        scheduled_end        = random_date_mixed_formats(scheduled_end) if random.random() < P_STRING_DATE_IN_FIELDS else scheduled_end.strftime("%Y-%m-%d %H:%M:%S"),
        actual_start         = random_date_mixed_formats(actual_start) if random.random() < P_STRING_DATE_IN_FIELDS else actual_start.strftime("%Y-%m-%d %H:%M:%S"),
        actual_end           = random_date_mixed_formats(actual_end) if random.random() < P_STRING_DATE_IN_FIELDS else actual_end.strftime("%Y-%m-%d %H:%M:%S"),
        technician_id        = random.choice(technician_ids),
        status               = status,
        priority             = priority,
        description          = random.choice(descriptions),
        last_update          = random_date_mixed_formats(random_date_between(365))
    )
    rows.append(row)

# Duplicates: copies of existing orders with a freshly drawn status
dup_count = int(N_MAINTENANCE_ORDERS * P_DUP_MAINTENANCE_ORDERS)
for _ in range(dup_count):
    base = random.choice(rows).asDict()
    base["status"] = random_status_inconsistent(MAINTENANCE_STATUSES)
    rows.append(Row(**base))

# Overwrite the Delta table and preview a few rows.
df_maintenance = spark.createDataFrame(rows, schema=maintenance_schema)
df_maintenance.write.mode("overwrite").format("delta").saveAsTable(fqtn("maintenance_orders"))
display(df_maintenance.limit(5))


## 6) Geração da Bronze – quality_inspections


In [None]:
# Builds and writes the Bronze `quality_inspections` table (all columns STRING).
# Relies on `equipment_ids` defined in the IoT readings cell and on
# `df_production` from the production orders cell.
quality_schema = T.StructType([
    T.StructField("inspection_id",      T.StringType(), True),
    T.StructField("production_order_id", T.StringType(), True),
    T.StructField("equipment_id",        T.StringType(), True),
    T.StructField("inspection_type",    T.StringType(), True),
    T.StructField("inspection_date",    T.StringType(), True),
    T.StructField("inspector_id",       T.StringType(), True),
    T.StructField("passed",             T.StringType(), True),  # boolean stored as text
    T.StructField("failed_quantity",    T.StringType(), True),
    T.StructField("total_quantity",     T.StringType(), True),
    T.StructField("defect_codes",       T.StringType(), True),
    T.StructField("notes",              T.StringType(), True),
    T.StructField("last_update",        T.StringType(), True),
])

# Sorted on purpose: the row order returned by distinct().collect() is not
# guaranteed, and random.choice() over an arbitrarily ordered list would make
# the generated data differ between runs despite the fixed SEED.
production_order_ids = sorted(
    r["production_order_id"] for r in df_production.select("production_order_id").distinct().collect()
)
inspector_ids = [f"INSP{10+i:03d}" for i in range(10)]  # 10 inspectors
inspection_types = ["visual", "dimensional", "functional", "material"]

rows = []
for i in range(N_QUALITY_INSPECTIONS):
    insp_id = f"INS{10000+i:05d}"
    # Falls back to a synthetic order id only when no production orders exist.
    po_id = random.choice(production_order_ids) if production_order_ids else f"PO{random.randint(100000, 200000):06d}"
    eq_id = random.choice(equipment_ids)

    inspection_date = random_date_between(180)
    total_qty = random.randint(50, 1000)
    pass_rate = random.uniform(0.92, 0.99)  # 92-99% approval
    passed_qty = int(total_qty * pass_rate)
    failed_qty = total_qty - passed_qty

    # NOTE(review): with pass_rate <= 0.99 and total_qty >= 50, int() truncation
    # always leaves failed_qty >= 1, so `passed` is always False here and at
    # least one defect code is always drawn. Confirm whether fully passing
    # inspections were intended before changing the distribution.
    passed = failed_qty == 0
    defect_codes_list = []
    if failed_qty > 0:
        num_defects = random.randint(1, min(3, len(DEFECT_CODES)))
        defect_codes_list = random.sample(DEFECT_CODES, num_defects)
        # Casing variation in the codes: each one is lower-cased about half the time
        if random.random() < P_DEFECT_CODE_VARIATION:
            defect_codes_list = [d.lower() if random.random() < 0.5 else d for d in defect_codes_list]

    defect_codes_str = ",".join(defect_codes_list) if defect_codes_list else None

    row = Row(
        inspection_id       = insp_id,
        production_order_id = po_id,
        equipment_id        = eq_id,
        inspection_type     = random.choice(inspection_types),
        inspection_date     = random_date_mixed_formats(inspection_date) if random.random() < P_STRING_DATE_IN_FIELDS else inspection_date.strftime("%Y-%m-%d %H:%M:%S"),
        inspector_id        = random.choice(inspector_ids),
        # Either a random (possibly textual) boolean unrelated to `passed`, or str(passed).
        passed              = str(random_bool_inconsistent()) if random.random() < P_BOOL_AS_STRING else str(passed),
        failed_quantity     = str(failed_qty) if random.random() < P_STRING_NUMERIC_IN_FIELDS else f"{failed_qty}",
        total_quantity      = str(total_qty) if random.random() < P_STRING_NUMERIC_IN_FIELDS else f"{total_qty}",
        defect_codes        = defect_codes_str,
        # Nulled with probability 0.3. The emitted text is kept exactly as authored.
        notes               = maybe_null(f"Inspação {random.choice(['OK','Aprovado','Reprovado','Parcial'])}", 0.3),
        last_update         = random_date_mixed_formats(random_date_between(365))
    )
    rows.append(row)

# Duplicates: copies of existing inspections with a freshly drawn inspection_date
dup_count = int(N_QUALITY_INSPECTIONS * P_DUP_INSPECTIONS)
for _ in range(dup_count):
    base = random.choice(rows).asDict()
    base["inspection_date"] = random_date_mixed_formats(random_date_between(180))
    rows.append(Row(**base))

# Overwrite the Delta table and preview a few rows.
df_quality = spark.createDataFrame(rows, schema=quality_schema)
df_quality.write.mode("overwrite").format("delta").saveAsTable(fqtn("quality_inspections"))
display(df_quality.limit(5))


## 7) Checks rápidos (compatibilidade com suas consultas Silver/Gold)


In [None]:
# Row counts for every Bronze table
for _table_name in (
    "equipment_master",
    "iot_sensor_readings",
    "production_orders",
    "maintenance_orders",
    "quality_inspections",
):
    print(f"{_table_name}:", spark.table(fqtn(_table_name)).count())

# A few checks the Silver-layer material relies on, as (query, rows to show):
# status distribution, duplicated equipment ids, sensor_type distribution and
# duplicated production order ids.
_silver_checks = [
    (f"""
SELECT status, COUNT(*) c
FROM {fqtn("equipment_master")}
GROUP BY status
ORDER BY c DESC
""", 10),
    (f"""
SELECT equipment_id, COUNT(*) c
FROM {fqtn("equipment_master")}
GROUP BY equipment_id
HAVING COUNT(*) > 1
ORDER BY c DESC
""", 5),
    (f"""
SELECT sensor_type, COUNT(*) c
FROM {fqtn("iot_sensor_readings")}
GROUP BY sensor_type
ORDER BY c DESC
""", 10),
    (f"""
SELECT production_order_id, COUNT(*) c
FROM {fqtn("production_orders")}
GROUP BY production_order_id
HAVING COUNT(*) > 1
ORDER BY c DESC
""", 5),
]

for _query, _n_rows in _silver_checks:
    spark.sql(_query).show(_n_rows, False)
