In [0]:
from pyspark.sql.functions import col

# Silver-layer inputs, addressed by their fully qualified table names.
SILVER_BASE_TABLE = "angad_kumar91.fraud_detection_silverlayer.silver_transactions_base"
SILVER_FEATURES_TABLE = "angad_kumar91.fraud_detection_silverlayer.silver_txn_features_5min"

# Transactions (silver base).
base_df = spark.read.table(SILVER_BASE_TABLE)

# 5-minute window features (silver features).
features_df = spark.read.table(SILVER_FEATURES_TABLE)


In [0]:
# Match each transaction to the feature row of the same card whose half-open
# interval [window_start, window_end) contains the transaction's timestamp.
same_card = base_df.card1 == features_df.card1
at_or_after_window_start = base_df.event_timestamp >= features_df.window_start
before_window_end = base_df.event_timestamp < features_df.window_end

# Left join: every row of base_df is kept; feature columns are null when no
# window matches.
# NOTE(review): if a card can have overlapping windows, a transaction would
# match several feature rows and be duplicated here — confirm the windows in
# silver_txn_features_5min do not overlap.
training_df = base_df.join(
    features_df,
    same_card & at_or_after_window_start & before_window_end,
    "left",
)


In [0]:
from pyspark.sql.functions import coalesce, lit

# Fallback expression used for each window feature when its value is null
# (presumably transactions that matched no feature window in the left join).
# Insertion order is the order in which the columns are rewritten.
null_fallbacks = {
    "txn_count_5min": lit(1),
    "avg_amount_5min": col("TransactionAmt"),  # fall back to the row's own amount
    "stddev_amount_5min": lit(0.0),
    "product_diversity_5min": lit(1),
}

for feature_name, fallback in null_fallbacks.items():
    training_df = training_df.withColumn(
        feature_name,
        coalesce(col(feature_name), fallback),
    )


In [0]:
# Columns that do not carry the 5-minute window suffix.
# NOTE(review): presumably these come from the silver base table — confirm.
per_txn_features = [
    "TransactionAmt",
    "log_transaction_amount",
    "is_high_value_txn",
    "is_international_txn",
]

# 5-minute window aggregates (the columns that receive null fallbacks).
window_5min_features = [
    "txn_count_5min",
    "avg_amount_5min",
    "stddev_amount_5min",
    "product_diversity_5min",
]

# Model inputs, in the order they are selected and fed to the model.
feature_cols = per_txn_features + window_5min_features

# Ground-truth label column; selected alongside the features.
label_col = "isFraud"


In [0]:
# Fraction of the (null-free) training rows to bring to the driver, and the
# seed used for the sampling.
SAMPLE_FRACTION = 0.2
SAMPLE_SEED = 42

# Sample inside Spark *before* collecting, so that only the sampled rows are
# transferred to the driver. Collecting the full dataset with toPandas() and
# discarding 80% of it afterwards risks exhausting driver memory.
# Note: Spark's sample() is a per-row Bernoulli draw, so the result holds
# approximately (not exactly) SAMPLE_FRACTION of the rows.
pdf = (
    training_df
    .select(feature_cols + [label_col])
    .dropna()  # drop rows with a null in any selected feature or the label
    .sample(withReplacement=False, fraction=SAMPLE_FRACTION, seed=SAMPLE_SEED)
    .toPandas()
)


In [0]:
import mlflow
import mlflow.sklearn
from mlflow.models.signature import infer_signature
from sklearn.ensemble import IsolationForest

# Feature matrix and label vector taken from the sampled pandas frame.
# The label is used for evaluation only; the model is fitted on X alone.
X = pdf[feature_cols]
y = pdf[label_col]

# Single source of truth for the hyperparameters: the same dict builds the
# model and is logged to MLflow, so the logged values cannot drift from the
# values actually used.
iforest_params = {
    "n_estimators": 200,
    "contamination": 0.02,
    "random_state": 42,
}

with mlflow.start_run(run_name="isolation_forest_v1"):

    # =========================
    # Train model
    # =========================
    model = IsolationForest(**iforest_params)
    model.fit(X)

    # =========================
    # Predictions (for metrics)
    # =========================
    # Score the data once and reuse the arrays below (metric + signature)
    # instead of calling predict() a second time.
    anomaly_scores = model.decision_function(X)
    anomaly_labels = model.predict(X)  # IsolationForest: -1 = outlier, 1 = inlier

    pdf["anomaly_score"] = anomaly_scores
    pdf["anomaly"] = anomaly_labels

    # Share of labelled-fraud rows that the model flagged as anomalies (-1).
    # Falls back to 0 when no fraud row was flagged or there are no fraud rows.
    # Measured on the same rows the model was fitted on.
    fraud_anomaly_rate = (
        pdf.loc[y == 1, "anomaly"]
        .value_counts(normalize=True)
        .get(-1, 0)
    )

    # =========================
    # Log params & metrics
    # =========================
    mlflow.log_params(iforest_params)
    mlflow.log_metric("fraud_anomaly_rate", fraud_anomaly_rate)

    # =========================
    # Infer signature + input example
    # =========================
    input_example = X.head(5)
    signature = infer_signature(X, anomaly_labels)

    # =========================
    # Log model with signature and input example
    # =========================
    mlflow.sklearn.log_model(
        model,
        artifact_path="isolation_forest_model",
        input_example=input_example,
        signature=signature
    )


In [0]:
# Make sure the gold-layer schema exists before writing into it.
spark.sql("CREATE SCHEMA IF NOT EXISTS angad_kumar91.fraud_detection_goldlayer")

# Convert the pandas frame back into a Spark DataFrame and replace the
# contents of the gold predictions table with it.
predictions_sdf = spark.createDataFrame(pdf)
predictions_writer = predictions_sdf.write.mode("overwrite")
predictions_writer.saveAsTable(
    "angad_kumar91.fraud_detection_goldlayer.isolation_forest_predictions"
)


In [0]:
%sql
-- Preview the rows of the gold-layer Isolation Forest predictions table.
SELECT *
FROM angad_kumar91.fraud_detection_goldlayer.isolation_forest_predictions;