In [0]:
# 06_audit_logging.py
# Audit trail for compliance purposes and final cleanup of temp tables
from pyspark.sql.functions import current_timestamp, col
from pyspark.sql.types import StructType, StructField, TimestampType, IntegerType, BooleanType, DoubleType, StringType
import datetime

def monitor_pipeline_health():
    """Collect freshness and volume health metrics for the fraud-alert table.

    Returns:
        A 4-tuple ``(freshness, today_count, historical_avg, volume_status)``:

        * ``freshness`` -- single-row DataFrame with ``latest_alert``,
          ``current_time`` and ``hours_diff`` (hours between now and the
          newest ``alert_timestamp``; NULL when the table holds no alerts).
        * ``today_count`` -- int, number of alerts whose ``alert_timestamp``
          falls on the current date.
        * ``historical_avg`` -- float, mean of the per-day alert volume
          recorded in ``fraud_monitoring_log`` on previous days; ``0.0`` when
          there is no history.
        * ``volume_status`` -- ``"HIGH_VOLUME_ALERT"`` when today's count is
          more than twice a non-zero baseline, otherwise ``"NORMAL"``.

        If anything raises, the error is printed and the defaults
        ``(one-row DataFrame with hours_diff=0.0, 0, 0.0, "UNKNOWN")`` are
        returned instead.
    """
    try:
        freshness = spark.sql("""
        SELECT 
          MAX(alert_timestamp) as latest_alert,
          current_timestamp() as current_time,
          ROUND((unix_timestamp(current_timestamp()) - unix_timestamp(MAX(alert_timestamp))) / 3600, 2) as hours_diff
        FROM medisure_jen.gold.gold_realtime_fraud_alerts
        """)

        # A global aggregate always yields exactly one row, so a separate
        # .count() job before .first() is unnecessary.
        today_row = spark.sql("""
        SELECT COUNT(*) as current_volume
        FROM medisure_jen.gold.gold_realtime_fraud_alerts
        WHERE date(alert_timestamp) = current_date()
        """).first()
        today_count = int(today_row.current_volume) if today_row is not None else 0

        # Baseline = average of the daily alert volume recorded by earlier
        # audit runs. Counting log rows per day would measure how often the
        # audit ran, not how many alerts there were. MAX() takes the last
        # (largest) volume logged on each day; today's partial day is excluded.
        # NOTE(review): if the log table predates the current_volume column
        # this query fails and the function falls back to "UNKNOWN" until the
        # first append adds the column.
        baseline_row = spark.sql("""
        SELECT COALESCE(AVG(daily_count), 0) as historical_avg
        FROM (
          SELECT date(check_timestamp) as check_date, MAX(current_volume) as daily_count
          FROM medisure_jen.audit.fraud_monitoring_log
          WHERE date(check_timestamp) < current_date()
          GROUP BY date(check_timestamp)
        )
        """).first()
        historical_avg = (
            float(baseline_row.historical_avg)
            if baseline_row is not None and baseline_row.historical_avg is not None
            else 0.0
        )

        # Without a baseline (historical_avg == 0) a single alert would
        # otherwise be reported as a volume spike.
        is_spike = historical_avg > 0 and today_count > historical_avg * 2
        volume_status = "HIGH_VOLUME_ALERT" if is_spike else "NORMAL"

        return freshness, today_count, historical_avg, volume_status

    except Exception as e:
        print(f"Pipeline health monitoring failed: {e}")
        # Return default values; historical_avg is a float so it matches the
        # DoubleType column it is eventually written to.
        return spark.createDataFrame([(0.0,)], ["hours_diff"]), 0, 0.0, "UNKNOWN"

def _read_or_default(label, fetch, default):
    """Return ``fetch()``, or ``default`` (after printing a warning) if it raises.

    Args:
        label: Short table name used in the warning message.
        fetch: Zero-argument callable performing the Spark read.
        default: Value to use when the read fails for any reason.
    """
    try:
        return fetch()
    except Exception as e:
        # Report the real cause: the failure is not necessarily a missing table.
        print(f"Warning: {label} table could not be read ({e}); using {default!r}")
        return default


def main():
    """Write one audit row for this pipeline run and drop the temp tables.

    Steps:
        1. Gather freshness/volume metrics via ``monitor_pipeline_health()``.
        2. Read counts and the e-mail flag left by earlier tasks (best effort;
           unreadable tables fall back to 0 / False).
        3. Append a single row to ``medisure_jen.audit.fraud_monitoring_log``.
        4. Drop the listed ``medisure_jen.temp`` tables.
        5. Print a summary.

    Raises:
        Whatever the Delta append in step 3 raises; that write is not guarded.
    """
    print("="*60)
    print("AUDIT LOGGING STARTED")
    print("="*60)

    # 1. Get monitoring metrics with error handling
    try:
        freshness_check, current_volume, historical_avg, volume_status = monitor_pipeline_health()
        # .first() returns None for an empty DataFrame, so no .count() job is needed.
        freshness_row = freshness_check.select("hours_diff").first()
        freshness_hours = freshness_row[0] if freshness_row is not None else None
    except Exception as e:
        print(f"Error getting monitoring metrics: {e}")
        freshness_hours, current_volume, historical_avg, volume_status = 0.0, 0, 0.0, "ERROR"

    # Coerce to the Python types the schema below requires. DoubleType rejects
    # a plain int (e.g. a fallback 0) during schema verification, which would
    # abort the audit write precisely when monitoring had already failed.
    freshness_hours = float(freshness_hours) if freshness_hours is not None else 0.0
    historical_avg = float(historical_avg) if historical_avg is not None else 0.0
    current_volume = int(current_volume) if current_volume is not None else 0

    # 2. Get counts from previous tasks with error handling
    critical_count = _read_or_default(
        "critical_alerts",
        lambda: spark.table("medisure_jen.temp.critical_alerts").count(),
        0,
    )

    ml_count = _read_or_default(
        "ml_anomalies",
        lambda: spark.table("medisure_jen.temp.ml_anomalies").count(),
        0,
    )

    def _email_flag():
        # An empty status table means no e-mail was recorded as sent.
        status_row = spark.table("medisure_jen.temp.email_status").first()
        return status_row.email_sent if status_row is not None else False

    email_sent = _read_or_default("email_status", _email_flag, False)

    providers_review = _read_or_default(
        "provider_compliance_daily",
        lambda: spark.table("medisure_jen.audit.provider_compliance_daily")
        .filter(col("compliance_status") != "COMPLIANT")
        .count(),
        0,
    )

    # 3. Create audit log
    monitoring_schema = StructType([
        StructField("check_timestamp", TimestampType(), True),
        StructField("critical_alerts", IntegerType(), True),
        StructField("ml_anomalies", IntegerType(), True),
        StructField("providers_needing_review", IntegerType(), True),
        StructField("email_sent", BooleanType(), True),
        StructField("data_freshness_hours", DoubleType(), True),
        StructField("volume_status", StringType(), True),
        StructField("current_volume", IntegerType(), True),
        StructField("historical_avg", DoubleType(), True)
    ])

    monitoring_log = spark.createDataFrame([(
        datetime.datetime.now(),
        critical_count,
        ml_count,
        providers_review,
        email_sent,
        freshness_hours,
        volume_status,
        current_volume,
        historical_avg
    )], monitoring_schema)

    monitoring_log.write.format("delta").mode("append")\
        .option("mergeSchema", "true")\
        .saveAsTable("medisure_jen.audit.fraud_monitoring_log")

    print("✅ Audit log saved successfully")

    # 4. Cleanup temp tables with error handling
    temp_tables = [
        "medisure_jen.temp.critical_alerts",
        "medisure_jen.temp.ml_anomalies", 
        "medisure_jen.temp.fraud_decision",
        "medisure_jen.temp.email_status",
        "medisure_jen.temp.alert_true_fraud_detected",
        "medisure_jen.temp.alert_false_no_fraud"
    ]

    cleaned_count = 0
    for table in temp_tables:
        try:
            spark.sql(f"DROP TABLE IF EXISTS {table}")
            cleaned_count += 1
        except Exception as e:
            # One failed drop must not stop the remaining tables being cleaned.
            print(f"Warning: Could not drop table {table}: {e}")

    # 5. Final summary
    print("="*60)
    print("AUDIT LOGGING COMPLETE")
    print("="*60)
    print(f"📊 Critical Alerts: {critical_count}")
    print(f"🤖 ML Anomalies: {ml_count}")
    print(f"📧 Email Sent: {email_sent}")
    print(f"⏰ Data Freshness: {freshness_hours:.2f} hours")
    print(f"🏥 Providers Needing Review: {providers_review}")
    print(f"🗑️  Temp Tables Cleaned: {cleaned_count}/{len(temp_tables)}")
    print("="*60)
    print("🎯 Pipeline execution completed successfully!")
    print("="*60)

# Run main() only when this file is executed directly, not when it is imported.
if __name__ == "__main__":
    main()