In [0]:
# 02_fraud_ML_anomaly_detection notebook
# ML anomaly detection using Spark ML
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.clustering import KMeans
from pyspark.ml import Pipeline
from pyspark.sql.functions import col, current_date, date_format

def _write_empty_anomalies():
    """Overwrite medisure_jen.temp.ml_anomalies with an empty placeholder table.

    Used when there is nothing to score or when detection fails, so that
    downstream readers of the table still find it.
    """
    spark.sql("CREATE SCHEMA IF NOT EXISTS medisure_jen.temp")
    # An explicit schema is required: Spark cannot infer column types from an
    # empty list of rows.
    empty_df = spark.createDataFrame(
        [],
        "claim_id string, claim_amount double, fraud_risk_score double, "
        "prediction int, features array<double>",
    )
    # overwriteSchema lets the placeholder replace a table previously written
    # with the full results schema (which differs from the placeholder's).
    empty_df.write.mode("overwrite").option("overwriteSchema", "true") \
        .saveAsTable("medisure_jen.temp.ml_anomalies")


def main():
    """Run KMeans-based anomaly detection over the gold claims table.

    Reads medisure_jen.gold.gold_claims_analytics, makes sure the audit log
    table exists, clusters claims on (claim_amount, fraud_risk_score) with
    KMeans (k=3), treats the least-populated cluster as anomalous, and writes
    those rows to medisure_jen.temp.ml_anomalies. A six-month fraud trend is
    displayed on a best-effort basis.

    Relies on the notebook-provided globals ``spark`` and ``display``.

    Returns:
        int: Number of claims flagged as anomalous. Returns 0 when the source
        table is empty or when any step fails; in both cases an empty
        placeholder table is written instead of results.
    """
    try:
        # =====================================================
        # 1. Read and validate data
        # =====================================================
        print("Reading claims data for ML anomaly detection...")
        claims_data = spark.table("medisure_jen.gold.gold_claims_analytics")
        spark.sql("""
        CREATE TABLE IF NOT EXISTS medisure_jen.audit.fraud_monitoring_log (
            check_timestamp TIMESTAMP,
            critical_alerts INT,
            ml_anomalies INT,
            providers_needing_review INT,
            email_sent BOOLEAN,
            data_freshness_hours DOUBLE,
            volume_status STRING
        )
        """)

        # count() launches a Spark job each time it is called, so compute the
        # total once and reuse it for the checks and the summary below.
        total_claims = claims_data.count()

        # Check if data is available
        if total_claims == 0:
            print("No claims data available for analysis")
            _write_empty_anomalies()
            return 0

        print(f"Processing {total_claims} claims for anomaly detection")

        # Display sample data for monitoring
        print("Sample claims data for ML analysis:")
        display(claims_data.select("claim_id", "claim_amount", "fraud_risk_score", "diagnosis_code", "provider_id").limit(10))

        # =====================================================
        # 2. ML Anomaly Detection
        # =====================================================
        print("Running KMeans clustering for anomaly detection...")

        assembler = VectorAssembler(inputCols=["claim_amount", "fraud_risk_score"], outputCol="features")
        kmeans = KMeans(k=3, seed=42)
        pipeline = Pipeline(stages=[assembler, kmeans])
        # The model is fitted on at most 10000 rows, then applied to all claims.
        model = pipeline.fit(claims_data.limit(10000))

        results = model.transform(claims_data)

        # KMeans cluster indices are arbitrary labels, so a fixed index cannot
        # be assumed to be the "anomalous" one. Use the least-populated cluster
        # instead (ties broken by the lower index for determinism).
        cluster_sizes = (
            results.groupBy("prediction")
            .count()
            .orderBy("count", "prediction")
            .collect()
        )
        if len(cluster_sizes) < 2:
            # Every claim landed in one cluster: there is no minority group
            # to single out, so flag nothing rather than everything.
            anomalies = results.limit(0)
        else:
            anomaly_cluster = cluster_sizes[0]["prediction"]
            print(f"Treating smallest cluster ({anomaly_cluster}) as anomalous")
            anomalies = results.filter(col("prediction") == anomaly_cluster)

        # =====================================================
        # 3. Save and display results
        # =====================================================
        anomaly_count = anomalies.count()
        print(f"Detected {anomaly_count} anomalous claims using ML")

        if anomaly_count > 0:
            print("Top anomalous claims:")
            display(anomalies.select("claim_id", "claim_amount", "fraud_risk_score", "prediction", "diagnosis_code", "provider_id").limit(10))

        # Save for potential alerts. overwriteSchema allows replacing a table
        # that currently holds the empty placeholder schema.
        spark.sql("CREATE SCHEMA IF NOT EXISTS medisure_jen.temp")
        anomalies.write.mode("overwrite").option("overwriteSchema", "true") \
            .saveAsTable("medisure_jen.temp.ml_anomalies")

        # =====================================================
        # 4. Additional monitoring: Recent anomalies trend
        # =====================================================
        # Best-effort context only: a failure here must not discard the
        # anomaly results that were already saved above.
        try:
            recent_anomalies_trend = spark.sql("""
            SELECT 
                date_format(processing_date, 'yyyy-MM') as month,
                COUNT(*) as total_claims,
                SUM(CASE WHEN fraud_risk_score > 0.7 THEN 1 ELSE 0 END) as high_risk_claims,
                ROUND(AVG(fraud_risk_score), 3) as avg_fraud_score
            FROM medisure_jen.gold.gold_claims_analytics
            WHERE processing_date >= add_months(current_date(), -6)
            GROUP BY date_format(processing_date, 'yyyy-MM')
            ORDER BY month
            """)

            print("Recent fraud trends for context:")
            display(recent_anomalies_trend)

        except Exception as trend_error:
            print(f"Trend analysis skipped: {trend_error}")

        # =====================================================
        # 5. Summary output
        # =====================================================
        # total_claims is known to be > 0 here (the empty case returned early).
        anomaly_rate = anomaly_count / total_claims * 100
        print("="*60)
        print("ML ANOMALY DETECTION SUMMARY")
        print("="*60)
        print(f"Total Claims Processed: {total_claims}")
        print(f"Anomalous Claims Detected: {anomaly_count}")
        print(f"Anomaly Rate: {anomaly_rate:.2f}%")
        print("="*60)

        return anomaly_count

    except Exception as e:
        # Top-level boundary: report the failure and fall back to an empty
        # table so downstream dependencies still find ml_anomalies.
        print(f"ML anomaly detection failed: {e}")
        _write_empty_anomalies()
        return 0

# Execute main function when this file is run as the entry script (skipped
# when the module is imported). The returned anomaly count is not used here.
if __name__ == "__main__":
    main()