In [0]:
# MLflow + ML imports
from mlflow.models.signature import infer_signature

import mlflow
import mlflow.sklearn

import pandas as pd
import numpy as np

from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score, precision_recall_curve, auc

from pyspark.sql.functions import col


In [0]:
# Load the Silver base transactions table as a static (batch) DataFrame.

# Spark SQL helpers used by the window-aggregation cell further down.
from pyspark.sql.functions import window, count, avg, stddev, approx_count_distinct

silver_table_name = "angad_kumar91.fraud_detection_silverlayer.silver_transactions_base"
silver_base_batch = spark.read.table(silver_table_name)

# Sanity check: triggers a full count of the table.
silver_row_count = silver_base_batch.count()
print("Silver base rows:", silver_row_count)


In [0]:
# Per-card aggregates over 5-minute event-time windows (batch mode).

# Grouping expression: buckets each row by `event_timestamp` into 5-minute windows.
five_minute_window = window("event_timestamp", "5 minutes")

# One output column per aggregate, named after what it measures.
window_aggregates = [
    count("*").alias("txn_count_5min"),
    avg("TransactionAmt").alias("avg_amount_5min"),
    stddev("TransactionAmt").alias("stddev_amount_5min"),
    approx_count_distinct("ProductCD").alias("product_diversity_5min"),
]

grouped_by_card_window = silver_base_batch.groupBy("card1", five_minute_window)

# Flatten the `window` struct into explicit start/end columns so the
# result can be range-joined back to individual transactions.
batch_features_df = grouped_by_card_window.agg(*window_aggregates).select(
    "card1",
    col("window.start").alias("window_start"),
    col("window.end").alias("window_end"),
    "txn_count_5min",
    "avg_amount_5min",
    "stddev_amount_5min",
    "product_diversity_5min",
)

batch_feature_row_count = batch_features_df.count()
print("Batch feature rows:", batch_feature_row_count)
batch_features_df.show(5)


In [0]:
# Join for ML training (BATCH)
#
# Attach each transaction to the window-level aggregates of the
# (card1, 5-minute window) bucket that contains its event_timestamp.

# Rename the feature-side key before joining. Without this the joined frame
# carries two columns called `card1`, which makes any later reference to
# "card1" ambiguous and is rejected when writing the result to a table.
window_features = batch_features_df.withColumnRenamed("card1", "feature_card1")

# An inner join keeps exactly the rows the previous "left join + filter on
# txn_count_5min IS NOT NULL" kept: txn_count_5min comes from count("*"),
# so it is only ever null for transactions that matched no window.
training_df = (
    silver_base_batch.join(
        window_features,
        (silver_base_batch.card1 == window_features.feature_card1) &
        (silver_base_batch.event_timestamp >= window_features.window_start) &
        (silver_base_batch.event_timestamp < window_features.window_end),
        "inner"
    )
    # The renamed key duplicates silver_base_batch.card1; drop it.
    .drop("feature_card1")
)

print("Training rows:", training_df.count())
training_df.show(5)


In [0]:
# Feature columns for ML

feature_cols = [
    "TransactionAmt",
    "log_transaction_amount",
    "is_high_value_txn",
    "is_international_txn",
    "txn_count_5min",
    "avg_amount_5min",
    "stddev_amount_5min",
    "product_diversity_5min"
]

# Convert to Pandas (OK for POC size)
pdf = training_df.select(
    feature_cols + ["isFraud"]
).toPandas()

# Spark's sample standard deviation is NULL for a window that holds a single
# transaction, and that NULL arrives here as NaN. IsolationForest rejects NaN
# input, so treat "only one transaction in the window" as zero spread.
X = pdf[feature_cols].fillna({"stddev_amount_5min": 0.0})

# Labels are kept separately; the model itself is fit on X only.
y = pdf["isFraud"]

print("Feature shape:", X.shape)
print("Fraud rate:", y.mean())


In [0]:
# Build ML pipeline: standardise the features, then fit an Isolation Forest.

scaler_step = ("scaler", StandardScaler())
iforest_step = (
    "iforest",
    IsolationForest(
        n_estimators=200,
        contamination=0.035,   # ~3.5% fraud
        random_state=42,
        n_jobs=-1,
    ),
)
pipeline = Pipeline(steps=[scaler_step, iforest_step])

# Train model (unsupervised: only X is passed in)
pipeline.fit(X)

# Generate anomaly scores by running the two fitted steps explicitly.
fitted_scaler = pipeline.named_steps["scaler"]
fitted_iforest = pipeline.named_steps["iforest"]

scaled_X = fitted_scaler.transform(X)
raw_scores = fitted_iforest.decision_function(scaled_X)

# Flip the sign so that higher = more suspicious.
fraud_scores = -raw_scores

print("Sample fraud scores:", fraud_scores[:10])


In [0]:
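# NOTE: superseded draft, kept for reference only. The model is logged (with
# signature and input example) inside the `mlflow.start_run` block further down.
# from mlflow.models.signature import infer_signature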

# signature = infer_signature(X, pipeline.predict(X))

# mlflow.sklearn.log_model(
#     pipeline,
#     artifact_path="model",
#     registered_model_name="fraud_isolation_forest",
#     signature=signature
# )

In [0]:

# Prepare the MLflow model metadata: a small input example and a signature.

# First five feature rows, stored alongside the model as an example payload.
input_example = X.head(5)

# Signature inferred from the training features and the pipeline's predictions on them.
signature = infer_signature(
    model_input=X,
    model_output=pipeline.predict(X),
)


In [0]:
# Set MLflow experiment

mlflow.set_experiment("/Users/kumarangad91289@gmail.com/isolation_forest")

with mlflow.start_run(run_name="isolation_forest_v1"):

    # -------------------------
    # Log parameters
    # -------------------------
    # Read the values from the fitted estimator instead of repeating the
    # literals used when the pipeline was built, so the logged parameters
    # cannot drift from the model that is actually registered.
    fitted_iforest = pipeline.named_steps["iforest"]
    iforest_params = fitted_iforest.get_params()

    mlflow.log_param("model_type", type(fitted_iforest).__name__)
    mlflow.log_param("n_estimators", iforest_params["n_estimators"])
    mlflow.log_param("contamination", iforest_params["contamination"])

    # -------------------------
    # Evaluation (labels used ONLY here)
    # -------------------------
    # fraud_scores is oriented so that higher = more suspicious, which is
    # the orientation both ranking metrics expect for the positive class.
    roc_auc = roc_auc_score(y, fraud_scores)

    # PR-AUC as the area under the precision-recall curve.
    precision, recall, _ = precision_recall_curve(y, fraud_scores)
    pr_auc = auc(recall, precision)

    mlflow.log_metric("roc_auc", roc_auc)
    mlflow.log_metric("pr_auc", pr_auc)

    # -------------------------
    # Log model WITH signature
    # -------------------------
    # Logs the whole pipeline (scaler + forest) and registers it under
    # the given model name in the same call.
    mlflow.sklearn.log_model(
        sk_model=pipeline,
        artifact_path="model",
        registered_model_name="fraud_isolation_forest",
        signature=signature,
        input_example=input_example
    )

    print("ROC-AUC:", roc_auc)
    print("PR-AUC:", pr_auc)

