# DQX Anomaly Detection Demo

This notebook demonstrates `has_no_anomalies` using the new Spark ML IsolationForest pipeline. It starts with three core scenarios:

1) **Simple data, defaults** – quick start with auto-derived names/registry.
2) **Structured data, tuned parameters** – demonstrate `AnomalyParams` and `IsolationForestConfig`.
3) **Large data (~10M rows)** – train with sampling caps; notes on runtime/perf.

Sections 4–7 then cover model metrics and recommended thresholds, feature contributions for explainability, drift detection, and a quarantine workflow.

> Requires Spark ≥ 3.5 / DBR ≥ 15.4 and `databricks-labs-dqx[anomaly]` installed.


In [None]:
# If running in a clean cluster, install the anomaly extra
# %pip install 'databricks-labs-dqx[anomaly]'

from databricks.labs.dqx import anomaly



In [None]:
import mlflow
import pyspark.sql.functions as F



## 1) Simple data, defaults
- Auto-derives `model_name` and `registry_table`
- Uses default sampling (0.3), max_rows (1M), train_ratio (0.8)
- Default score threshold: 0.5


In [None]:
# Scenario 1: train with defaults on a tiny dataset, build the check, then
# reload the registered model and score the same rows for illustration.

# NOTE: `_derive_model_name` / `_derive_registry_table` are private trainer
# helpers. They are imported only so this demo can resolve the auto-derived
# names (the original notebook states they match the training auto-derivation).
from databricks.labs.dqx.anomaly.model_registry import AnomalyModelRegistry
from databricks.labs.dqx.anomaly.trainer import _derive_model_name, _derive_registry_table

# Single source of truth for the feature columns: training, the check, and the
# name derivation below must all be given the same columns.
small_columns = ["amount", "quantity"]

# Simple synthetic data
small_df = spark.createDataFrame(
    [
        ("A", 10.0, 1.0),
        ("A", 11.0, 1.2),
        ("A", 9.5, 0.9),
        ("B", 50.0, 5.0),
        ("B", 49.0, 4.8),
        ("B", 52.0, 5.3),
    ],
    "category string, amount double, quantity double",
)

# Train with defaults (no model_name / registry_table passed, so both are auto-derived)
anomaly.train(
    df=small_df,
    columns=small_columns,
)

# Apply check
checks = [
    anomaly.has_no_anomalies(
        columns=small_columns,
        score_threshold=0.5,
    )
]

# Just show scored rows for illustration.
# (Normally you would use DQEngine.apply_checks)
# No explicit names were supplied at training time, so derive them directly
# instead of routing through always-None placeholder variables.
active_model_name = _derive_model_name(small_df, small_columns)
active_registry = _derive_registry_table(small_df)

# Look up the active model record and score the training rows with it.
active = AnomalyModelRegistry(small_df.sparkSession).get_active_model(active_registry, active_model_name)
scored = mlflow.spark.load_model(active.model_uri).transform(small_df)
scored.select("category", "amount", "quantity", "anomaly_score").show()



## 2) Structured data with AnomalyParams
- Show tuned sampling and algorithm config
- Explicit model/registry names for repeatability


In [None]:
from databricks.labs.dqx import anomaly

# Scenario 2: explicit model/registry names plus tuned sampling and algorithm settings.
# The names are defined once so training and the check cannot drift apart.
orders_model_name = "orders_anomaly_custom"
orders_registry_table = "catalog.schema.dqx_anomaly_models"
orders_columns = ["amount", "quantity"]

# Small structured sample: date string, region, and the two numeric features.
structured_df = spark.createDataFrame(
    [
        ("2024-01-01", "us", 100.0, 2.0),
        ("2024-01-02", "us", 105.0, 2.2),
        ("2024-01-01", "eu", 90.0, 1.8),
    ],
    "ds string, region string, amount double, quantity double",
)

# Algorithm settings are built first, then attached to the training parameters.
forest_config = anomaly.IsolationForestConfig(
    contamination=0.08,
    num_trees=300,
)
adv_params = anomaly.AnomalyParams(
    sample_fraction=0.4,
    max_rows=500_000,
    train_ratio=0.8,
    algorithm_config=forest_config,
)

# Train and register under the explicit names.
anomaly.train(
    df=structured_df,
    columns=orders_columns,
    model_name=orders_model_name,
    registry_table=orders_registry_table,
    params=adv_params,
)

# The check references the same model/registry, with a custom score threshold.
checks = [
    anomaly.has_no_anomalies(
        columns=orders_columns,
        model=orders_model_name,
        registry_table=orders_registry_table,
        score_threshold=0.6,
    )
]



## 3) Large data (~10M rows)
- Uses sampling caps to keep training tractable
- Expect longer runtime; adjust `sample_fraction`/`max_rows` if needed
- Demonstrates simple train + check
- The code cell for this scenario is the last cell of this notebook


## 4) Model Metrics and Recommended Thresholds
- Query registry to view validation metrics
- Use recommended threshold (F1-optimal)
- View feature importance


In [None]:
# Scenario 4: train a model, read its validation metrics back from the registry
# table, and feed the recommended threshold into the check.

# Threshold used when the registry row carries no recommended value; 0.5 is the
# threshold the other demo cells in this notebook use.
fallback_threshold = 0.5

# Train a model to examine metrics
metrics_df = spark.createDataFrame(
    [
        (100.0, 2.0), (105.0, 2.2), (95.0, 1.8), (102.0, 2.1), (98.0, 1.9),
        (500.0, 1.0),  # anomaly: high amount, low quantity
    ],
    "amount double, quantity double",
)

anomaly.train(
    df=metrics_df,
    columns=["amount", "quantity"],
    model_name="metrics_demo",
    registry_table="catalog.schema.dqx_anomaly_models",
)

# Query metrics from registry
metrics_query = spark.sql("""
    SELECT
        model_name,
        metrics['recommended_threshold'] as recommended_threshold,
        metrics['threshold_50_precision'] as precision_at_50,
        metrics['threshold_50_recall'] as recall_at_50,
        metrics['threshold_50_f1'] as f1_at_50,
        metrics['estimated_contamination'] as est_contamination,
        feature_importance,
        training_time
    FROM catalog.schema.dqx_anomaly_models
    WHERE model_name = 'metrics_demo' AND status = 'active'
""")

display(metrics_query)

# Use recommended threshold in checks.
# DataFrame.first() returns None when the query matches no rows; fail with a
# clear message instead of "'NoneType' object is not subscriptable".
metrics_row = metrics_query.first()
if metrics_row is None:
    raise RuntimeError(
        "No active 'metrics_demo' model found in catalog.schema.dqx_anomaly_models; "
        "check that training completed and registered the model."
    )

recommended_threshold = metrics_row['recommended_threshold']
if recommended_threshold is None:
    # The row exists but holds no recommended threshold: do not pass None into the check.
    recommended_threshold = fallback_threshold
    print(f"No recommended threshold recorded; falling back to {recommended_threshold}")
else:
    print(f"Using recommended threshold: {recommended_threshold}")

checks = [
    anomaly.has_no_anomalies(
        columns=["amount", "quantity"],
        model="metrics_demo",
        registry_table="catalog.schema.dqx_anomaly_models",
        score_threshold=recommended_threshold,
    )
]


## 5) Feature Contributions for Explainability
- Enable `include_contributions=True`
- See which columns drove each anomaly score
- Use with DQEngine for complete workflow


In [None]:
# Scenario 5: train on three features, apply the check with per-row feature
# contributions enabled, and inspect the rows scoring above the threshold.
from databricks.labs.dqx.engine import DQEngine
from databricks.sdk import WorkspaceClient

explain_model_name = "explainability_demo"
explain_registry_table = "catalog.schema.dqx_anomaly_models"
explain_columns = ["amount", "quantity", "discount"]
# One threshold value shared by the check and the filter at the bottom of the cell.
explain_threshold = 0.5
# Columns shown in both result displays.
explain_output_columns = ["amount", "quantity", "discount", "anomaly_score", "anomaly_contributions"]

# Three ordinary rows and one obvious outlier.
explainability_df = spark.createDataFrame(
    [
        (100.0, 2.0, 0.10),  # normal
        (105.0, 2.2, 0.12),  # normal
        (95.0, 1.8, 0.08),   # normal
        (9999.0, 1.0, 0.95), # anomaly: very high amount, high discount
    ],
    "amount double, quantity double, discount double",
)

# Train and register the model.
anomaly.train(
    df=explainability_df,
    columns=explain_columns,
    model_name=explain_model_name,
    registry_table=explain_registry_table,
)

# Engine instance; later cells in this notebook reuse `dq_engine`.
dq_engine = DQEngine(WorkspaceClient())

checks = [
    anomaly.has_no_anomalies(
        columns=explain_columns,
        model=explain_model_name,
        registry_table=explain_registry_table,
        score_threshold=explain_threshold,
        include_contributions=True,  # Enable feature contributions
    )
]

# NOTE(review): `apply_checks_by_metadata` receives the objects returned by
# `has_no_anomalies(...)` here - confirm this is the intended entry point for
# these checks (as opposed to `apply_checks`).
result_df = dq_engine.apply_checks_by_metadata(explainability_df, checks)

# All rows, with score and contributions.
display(result_df.select(*explain_output_columns))

# Only the rows whose score exceeds the threshold.
anomalous_rows = result_df.filter(F.col("anomaly_score") > explain_threshold)
display(anomalous_rows.select(*explain_output_columns))


## 6) Drift Detection and Retraining
- Train on "old" distribution
- Check on "shifted" distribution
- Observe drift warnings with retrain recommendations


In [None]:
import warnings

# Scenario 6: train on one distribution, check a shifted one with drift
# detection enabled, and print any warnings raised while applying the checks.
# `dq_engine` is the DQEngine instance created in the explainability cell.
drift_model_name = "drift_demo"
drift_registry_table = "catalog.schema.dqx_anomaly_models"
drift_columns = ["amount", "quantity"]
drift_schema = "amount double, quantity double"

# "Old" distribution: 50 rows, amount 100.0-198.0 in steps of 2, quantity fixed at 2.0.
old_rows = [(100.0 + i * 2.0, 2.0) for i in range(50)]
old_distribution_df = spark.createDataFrame(old_rows, drift_schema)

anomaly.train(
    df=old_distribution_df,
    columns=drift_columns,
    model_name=drift_model_name,
    registry_table=drift_registry_table,
)

# "New" distribution: 20 rows, amount 500.0-538.0 in steps of 2, quantity fixed at 5.0.
new_rows = [(500.0 + i * 2.0, 5.0) for i in range(20)]
new_distribution_df = spark.createDataFrame(new_rows, drift_schema)

# Check with drift detection switched on.
checks = [
    anomaly.has_no_anomalies(
        columns=drift_columns,
        model=drift_model_name,
        registry_table=drift_registry_table,
        score_threshold=0.5,
        drift_threshold=3.0,  # Enable drift detection (default threshold)
    )
]

# Record every warning emitted while the checks are applied ("always" disables
# the once-per-location suppression), then print them after the block exits.
with warnings.catch_warnings(record=True) as caught_warnings:
    warnings.simplefilter("always")
    result_df = dq_engine.apply_checks_by_metadata(new_distribution_df, checks)

for caught in caught_warnings:
    print(f"WARNING: {caught.message}")

# Scores for the shifted data.
display(result_df.select("amount", "quantity", "anomaly_score"))


## 7) Quarantine Workflow
- Split valid and invalid rows
- Save to separate tables (valid and quarantine)
- Quarantine includes anomaly scores and contributions


In [None]:
from databricks.labs.dqx.config import OutputConfig

# Scenario 7: split checked rows into valid and quarantined sets.
# `dq_engine` is the DQEngine instance created in the explainability cell.
quarantine_model_name = "quarantine_demo"
quarantine_registry_table = "catalog.schema.dqx_anomaly_models"
quarantine_columns = ["amount", "quantity", "discount"]

# Six rows keyed by id: four ordinary ones and two deliberate outliers.
quarantine_df = spark.createDataFrame(
    [
        (1, 100.0, 2.0, 0.10),  # normal
        (2, 105.0, 2.2, 0.12),  # normal
        (3, 95.0, 1.8, 0.08),   # normal
        (4, 9999.0, 1.0, 0.95), # anomaly
        (5, 102.0, 2.1, 0.11),  # normal
        (6, 5000.0, 0.5, 0.90), # anomaly
    ],
    "id int, amount double, quantity double, discount double",
)

# Fit only on ids 1-3, which are all labelled normal above.
training_rows = quarantine_df.filter("id <= 3")
anomaly.train(
    df=training_rows,
    columns=quarantine_columns,
    model_name=quarantine_model_name,
    registry_table=quarantine_registry_table,
)

# Check with contributions enabled so quarantined rows carry their explanation.
checks = [
    anomaly.has_no_anomalies(
        columns=quarantine_columns,
        model=quarantine_model_name,
        registry_table=quarantine_registry_table,
        score_threshold=0.5,
        include_contributions=True,
    )
]

# The engine returns the (valid, invalid) pair in one call.
valid_df, invalid_df = dq_engine.apply_checks_by_metadata_and_split(quarantine_df, checks)

print(f"Valid rows: {valid_df.count()}")
print(f"Invalid (anomalous) rows: {invalid_df.count()}")

# Quarantined rows with their anomaly details and check metadata.
quarantine_output_columns = [
    "id", "amount", "quantity", "discount",
    "anomaly_score", "anomaly_contributions",
    "dqx_check_name", "dqx_error_message",
]
display(invalid_df.select(*quarantine_output_columns))

# Save to tables (commented out to avoid creating tables in demo)
# dq_engine.save_results_in_table(
#     output_df=valid_df,
#     quarantine_df=invalid_df,
#     output_config=OutputConfig(location="catalog.schema.valid_orders"),
#     quarantine_config=OutputConfig(location="catalog.schema.quarantine_orders"),
# )


In [None]:
# Scenario 3 (large data): generate ~10M rows, train, build the check, and
# score a small slice with the registered model.
from databricks.labs.dqx.anomaly.model_registry import AnomalyModelRegistry

large_model_name = "large_anomaly_demo"
large_registry_table = "catalog.schema.dqx_anomaly_models"
large_columns = ["amount", "quantity"]

# 10,000,000 ids; the features are the id modulo 1000 and modulo 50, cast to double.
id_col = F.col("id")
large_df = spark.range(0, 10_000_000).select(
    id_col.alias("row_id"),
    (id_col % 1000).cast("double").alias("amount"),
    (id_col % 50).cast("double").alias("quantity"),
)

# No params are passed, so training runs with its defaults
# (the notebook relies on default sampling and max_rows to cap the workload).
anomaly.train(
    df=large_df,
    columns=large_columns,
    model_name=large_model_name,
    registry_table=large_registry_table,
)

checks = [
    anomaly.has_no_anomalies(
        columns=large_columns,
        model=large_model_name,
        registry_table=large_registry_table,
        score_threshold=0.5,
    )
]

# Example: run transform to materialize scores (normally use DQEngine).
# Only the first 1000 rows are scored, and five of them are printed.
registry = AnomalyModelRegistry(spark)
active = registry.get_active_model(large_registry_table, large_model_name)
scoring_model = mlflow.spark.load_model(active.model_uri)
scored_large = scoring_model.transform(large_df.limit(1000))
scored_large.select("row_id", "anomaly_score").show(5)

