In [0]:
# Cell 1: Load ML features and select the columns used by the fraud detection model
import mlflow
import mlflow.spark
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.functions import vector_to_array
from pyspark.sql.functions import col

# ---- Banner -----------------------------------------------------------------
print("ü§ñ FRAUD DETECTION ML MODEL")
print("=" * 80)

# ---- Load -------------------------------------------------------------------
# Gold-layer feature table; the training cell reads it through `ml_features`.
ml_features = spark.table("smart_claims_dev.gold.ml_features")

# Report the table size before doing anything else with it.
loaded_rows = ml_features.count()
loaded_cols = len(ml_features.columns)
print(f"‚úÖ Loaded ML features: {loaded_rows:,} rows")
print(f"üìä Columns: {loaded_cols}")

# ---- Peek -------------------------------------------------------------------
# Three untruncated rows are enough to eyeball the schema and value formats.
print("\nüîç Data preview:")
ml_features.show(3, truncate=False)

# ---- Model columns ----------------------------------------------------------
# Per-claim attribute columns.
claim_attribute_cols = [
    "age",
    "claim_amount",
    "months_as_customer",
    "number_of_vehicles_involved",
    "number_of_witnesses",
]

# Flag / indicator columns.
# NOTE: "fraud_indicator" is the training label, not a predictor. The training
# cell uses it as labelCol and filters it out before assembling the feature
# vector; it stays in this list because that cell selects its columns via
# *feature_cols.
indicator_cols = [
    "suspicious_flag",
    "fraud_indicator",
    "no_witnesses_flag",
    "new_customer_flag",
    "total_loss_flag",
    "major_damage_flag",
    "multi_vehicle_flag",
]

# Same contents and order as a single flat list; consumed by the training cell.
feature_cols = claim_attribute_cols + indicator_cols

print(f"\nüìã Feature columns: {len(feature_cols)}")
print(feature_cols)

In [0]:
# Cell 2: Train fraud detection model (SIMPLIFIED - no UC volume)
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
import mlflow
import mlflow.spark

# -----------------------------------------------------------------------------
# Prepare data, train a logistic-regression fraud model, evaluate it on a
# hold-out split, and persist the hold-out scores to the Gold layer.
#
# Reads from earlier cells: `ml_features` (feature DataFrame) and
#                           `feature_cols` (model columns; also contains the label).
# Leaves in the namespace:  `ml_data`, `train_data`, `test_data`, `model`,
#                           `predictions`, `auc`.
# Side effect:              overwrites the Delta table named by SCORES_TABLE.
# -----------------------------------------------------------------------------

# Names that were previously repeated as inline literals.
LABEL_COL = "fraud_indicator"
SCORES_TABLE = "smart_claims_dev.gold.fraud_detection_scores"
SPLIT_SEED = 42

print("üîß PREPARING DATA FOR TRAINING")
print("=" * 80)

# `feature_cols` also lists the label, so strip it here once; this list is the
# only thing that feeds the feature vector.
model_feature_cols = [c for c in feature_cols if c != LABEL_COL]

# Select the label explicitly rather than relying on it being part of
# `feature_cols`, then drop every row that has a null in any selected column.
ml_data = ml_features.select(
    col("claim_id"),
    *model_feature_cols,
    col(LABEL_COL),
).na.drop()

clean_rows = ml_data.count()
print(f"‚úÖ Clean data: {clean_rows:,} rows (after removing nulls)")

# Fail fast with a readable message instead of an opaque Spark ML error later.
if clean_rows == 0:
    raise ValueError(
        "No rows left after dropping nulls; cannot train the fraud detection model."
    )

# Create feature vector
assembler = VectorAssembler(
    inputCols=model_feature_cols,
    outputCol="features"
)

ml_data_vectorized = assembler.transform(ml_data)

print("\nüìä Feature vector created")
ml_data_vectorized.select("claim_id", "features", LABEL_COL).show(3, truncate=False)

# Split train/test (80/20, seeded so the split is repeatable on the same input)
train_data, test_data = ml_data_vectorized.randomSplit([0.8, 0.2], seed=SPLIT_SEED)

print(f"\n‚úÖ Train set: {train_data.count():,} rows")
print(f"‚úÖ Test set: {test_data.count():,} rows")
print("=" * 80)

# No explicit MLflow run or model logging in this cell (kept simple for the demo).
print("\nü§ñ TRAINING FRAUD DETECTION MODEL")
print("=" * 80)

# Train logistic regression
lr = LogisticRegression(
    featuresCol="features",
    labelCol=LABEL_COL,
    maxIter=100,
    regParam=0.01
)

model = lr.fit(train_data)

# Score the hold-out split only.
predictions = model.transform(test_data)

# Evaluate with area under the ROC curve, computed from the raw prediction column.
evaluator = BinaryClassificationEvaluator(
    labelCol=LABEL_COL,
    rawPredictionCol="rawPrediction",
    metricName="areaUnderROC"
)

auc = evaluator.evaluate(predictions)

print("‚úÖ Model trained successfully!")
print(f"üìä AUC-ROC Score: {auc:.4f}")
print("=" * 80)

# Show sample predictions
print("\nüîç SAMPLE FRAUD PREDICTIONS:")
pred_display = predictions.select(
    "claim_id",
    col(LABEL_COL).alias("actual_fraud"),
    col("prediction").alias("predicted_fraud"),
    col("probability").alias("fraud_probability")
)

pred_display.show(10, truncate=False)

# Save predictions to Gold layer for dashboard.
# NOTE(review): `predictions` comes from `test_data`, so only the 20% hold-out
# claims receive a score in this table -- confirm that is what the dashboard expects.
print("\nüíæ SAVING FRAUD SCORES TO GOLD LAYER")
predictions.select(
    "claim_id",
    col(LABEL_COL).alias("actual_fraud_label"),
    col("prediction").alias("fraud_prediction"),
    col("probability").alias("fraud_probability_scores"),
    # `probability` is an ML vector [P(label=0), P(label=1)]; also expose the
    # positive-class probability as a plain double so SQL/BI consumers can use
    # it directly. Existing columns are kept unchanged.
    vector_to_array(col("probability"))[1].alias("fraud_probability"),
).write \
    .format("delta") \
    .mode("overwrite") \
    .option("overwriteSchema", "true") \
    .saveAsTable(SCORES_TABLE)

print(f"‚úÖ Fraud scores saved to: {SCORES_TABLE}")
print("=" * 80)