# Bronze layer

## 0) Dependencies

In [0]:
# Databricks: install Faker as a notebook-scoped library (available to this notebook session only).
# If your cluster already has Faker installed, you can skip this cell.
%pip install Faker


## 1) Parâmetros e helpers

In [0]:
from pyspark.sql import functions as F, types as T
from pyspark.sql import Row
from datetime import datetime, timedelta
from faker import Faker
import random
import string

# ===== Parameters =====
# Unity Catalog name (example: "workshop_catalog"); use None if not using Unity Catalog.
CATALOG = "workshop_modelagem_aovivo"
# Schema/database where the tables will be created.
SCHEMA = "bronze"
# Seed applied to both `random` and Faker further down.
SEED = 42

# Volume knobs (presumably the number of rows generated per entity — confirm
# against the cells that build each table).
N_CUSTOMERS = 1_000
N_PRODUCTS = 500
N_ORDERS = 5_000
N_ORDER_ITEMS = 12_000

# ----- Share of deliberately "dirty" records -----

# Orders
# ~2.5% duplicate order_id
P_DUP_ORDERS = 0.025
# ~5% null customer_id
P_NULL_CUSTOMER_IN_ORDERS = 0.05
# Probability that random_status_inconsistent() re-cases the status it picked.
P_STATUS_CASE_VARIATION = 0.25
# Half of the dates as strings, half as coherent dates.
P_STRING_DATE_IN_ORDERS = 0.5
# Probability that maybe_stringify_number() returns the number as a string.
P_STRING_NUMERIC_IN_FIELDS = 0.15

# Products
# Presumably the share of duplicated product_id values — confirm against the product generator.
P_DUP_PRODUCT_ID = 0.03
# Probability that random_bool_inconsistent() returns a mixed-representation flag.
P_INCONSISTENT_IS_ACTIVE = 0.4
# Presumably the share of null brand/subcategory values — confirm against the product generator.
P_NULL_BRAND_SUBCATEGORY = 0.1

# Customers
# Probability that dirty_state() swaps in a variant such as "SP", "sp", "São Paulo".
P_CUSTOMER_INCONSISTENCY = 0.2
# Duplicate customer_id with a different last_update_date.
P_CUSTOMER_DUP_DIFF_UPDATE = 0.1
# Presumably the share of blanked customer fields — confirm against the customer generator.
P_EMPTY_FIELDS_CUSTOMER = 0.05

# Order items
# Duplicate (order_id, product_id) with a different updated_at.
P_DUP_ORDERITEM_SAME_OP = 0.05
# Presumably the share of null discount/promo values — confirm against the order-item generator.
P_NULLS_DISCOUNT_PROMO = 0.15

# Seed Python's global RNG: every helper below draws from the `random` module,
# so this makes their output repeatable for a given SEED.
random.seed(SEED)
# Faker instance using the Brazilian-Portuguese ("pt_BR") locale.
fake = Faker("pt_BR")
# Seed Faker with the same value. This is the class-level seeding call from
# Faker's API — NOTE(review): confirm against the installed Faker version
# that it also covers the `fake` instance created above.
Faker.seed(SEED)

# ===== Fully qualified table name =====
def fqtn(table):
    """Return the backtick-quoted, dot-separated name for ``table``.

    The result is ``catalog.schema.table`` when ``CATALOG`` is truthy and
    ``schema.table`` otherwise; each part is wrapped in backticks.
    """
    parts = [SCHEMA, table]
    if CATALOG:
        # The catalog, when configured, is the outermost namespace.
        parts.insert(0, CATALOG)
    return ".".join(f"`{part}`" for part in parts)

# ===== Create schema/database =====
# Without a catalog a plain database is all that is needed; with one, the
# catalog is created first and the schema is then created inside it.
if not CATALOG:
    spark.sql(f"CREATE DATABASE IF NOT EXISTS `{SCHEMA}`")
else:
    spark.sql(f"CREATE CATALOG IF NOT EXISTS `{CATALOG}`")
    spark.sql(f"CREATE SCHEMA  IF NOT EXISTS `{CATALOG}`.`{SCHEMA}`")

# ===== Utilities =====
STATUSES = ["delivered", "shipped", "processing", "cancelled", "returned"]

def random_status_inconsistent():
    """Pick a random order status, occasionally re-cased.

    A status is drawn from ``STATUSES``. With probability
    ``P_STATUS_CASE_VARIATION`` it is then passed through one of
    ``upper``, ``capitalize`` or ``lower`` (chosen at random); otherwise it
    is returned exactly as listed.
    """
    status = random.choice(STATUSES)
    if random.random() >= P_STATUS_CASE_VARIATION:
        return status
    # Capitalization variations.
    recase = random.choice((str.upper, str.capitalize, str.lower))
    return recase(status)

def random_date_between(days_back=365):
    """Return a random naive UTC datetime from roughly the last ``days_back`` days.

    The offset subtracted from the current UTC time is a whole number of days
    in ``[0, days_back]`` plus a number of seconds in ``[0, 86399]``, so the
    result can reach back almost ``days_back + 1`` days.

    Args:
        days_back: Maximum number of whole days to go back; must be >= 0.

    Returns:
        A timezone-naive ``datetime`` holding a UTC wall-clock value.

    Raises:
        ValueError: If ``days_back`` is negative (from ``random.randint``).
    """
    # Imported locally so this function does not depend on the file-level
    # import line being extended.
    from datetime import timezone

    # datetime.utcnow() is deprecated since Python 3.12. Read the aware UTC
    # clock and drop tzinfo so callers keep receiving a naive datetime.
    now_utc = datetime.now(timezone.utc).replace(tzinfo=None)
    # Days are drawn before seconds, matching the original RNG call order.
    offset = timedelta(
        days=random.randint(0, days_back),
        seconds=random.randint(0, 86399),
    )
    return now_utc - offset

def random_date_mixed_formats(dt):
    """Render ``dt`` as a string in one randomly chosen layout.

    The layout is drawn from five ``strftime`` patterns that mix "/" and "-"
    separators, year-first and day-first ordering, and with/without a time
    part, e.g. "2025-10-25", "2025/10/25", "2025-10-25 14:33:20",
    "25/10/2025", "25-10-2025 14:33:20".
    """
    pattern = random.choice((
        "%Y-%m-%d",
        "%Y/%m/%d",
        "%Y-%m-%d %H:%M:%S",
        "%d/%m/%Y",
        "%d-%m-%Y %H:%M:%S",
    ))
    return dt.strftime(pattern)

def maybe_stringify_number(x):
    """Return ``x`` unchanged, or its string form in some cases.

    With probability ``P_STRING_NUMERIC_IN_FIELDS`` the value is formatted
    into a string; otherwise the original object is returned as-is.
    """
    if random.random() >= P_STRING_NUMERIC_IN_FIELDS:
        return x
    return f"{x}"

def maybe_null(val, p=0.1):
    """Return ``None`` with probability ``p``, otherwise return ``val``."""
    if random.random() < p:
        return None
    return val

def dirty_state(uf):
    """Return ``uf``, occasionally replaced by an inconsistent spelling.

    With probability ``P_CUSTOMER_INCONSISTENCY`` one of three equally likely
    variants is returned: the value unchanged, its lowercase form, or — only
    when ``uf`` is "SP" in any casing — the full name "São Paulo". For any
    other value the third slot is the unchanged input again.

    Args:
        uf: State abbreviation string, e.g. "SP".

    Returns:
        ``uf`` itself or one of the variants described above.
    """
    if random.random() >= P_CUSTOMER_INCONSISTENCY:
        return uf
    # This text ends up in the generated data, so it must be the correctly
    # encoded "São Paulo" (with U+00E3), not a mis-decoded byte sequence.
    full_name = "São Paulo" if uf.upper() == "SP" else uf
    return random.choice([uf, uf.lower(), full_name])

def random_bool_inconsistent():
    """Return a flag that is sometimes encoded inconsistently.

    With probability ``P_INCONSISTENT_IS_ACTIVE`` the result is drawn at
    random from a mix of string and boolean spellings ("true", "1", "yes",
    "false", "0", "no", True, False); otherwise it is the boolean ``True``.
    """
    if random.random() >= P_INCONSISTENT_IS_ACTIVE:
        return True
    return random.choice(("true", "1", "yes", "false", "0", "no", True, False))

def alnum(n=8):
    """Return a random string of ``n`` uppercase ASCII letters and digits."""
    alphabet = string.ascii_uppercase + string.digits
    picked = random.choices(alphabet, k=n)
    return "".join(picked)
