In [0]:
# 05_compliance_reporting.py
# Generates the compliance reports: provider compliance status, member risk categories, and fraud trends for recent months.
from pyspark.sql.functions import col, date_format, current_date, add_months
from datetime import datetime

def _overwrite_delta_table(report_df, table_name):
    """Overwrite the table ``table_name`` with ``report_df`` in Delta format.

    The ``mergeSchema`` option is set to "true" on every write, exactly as
    the report writes have always done.
    """
    (
        report_df.write.format("delta")
        .mode("overwrite")
        .option("mergeSchema", "true")
        .saveAsTable(table_name)
    )


def _count_by_value(report_df, column_name):
    """Return ``{value: row_count}`` for ``column_name`` using a single Spark job."""
    grouped_rows = report_df.groupBy(column_name).count().collect()
    # Index the Row by name: ``row.count`` would resolve to tuple.count().
    return {row[column_name]: row["count"] for row in grouped_rows}


def main():
    """Generate the provider, member and fraud-trend compliance reports.

    Each of the three sections queries a gold table, prints headline numbers,
    overwrites its audit table and displays a small preview. Sections are
    best-effort: an exception in one section is printed and the remaining
    sections still run. The function returns ``None`` and does not raise for
    section failures; failed sections are listed in the final summary.

    Relies on the names ``spark`` and ``display`` being available at run time
    (they are not defined in this file; presumably Databricks notebook
    globals -- confirm before running elsewhere).
    """
    banner = "=" * 60
    print(banner)
    print("COMPLIANCE REPORTING STARTED")
    print(banner)

    # Summary values stay None unless the corresponding report was actually
    # written, so the summary never advertises a report that was not saved.
    provider_count = None
    total_members = None
    trend_months = None
    failed_reports = []

    # =====================================================
    # 1. Provider Compliance Report
    # =====================================================
    print("Generating provider compliance report...")
    try:
        # NULLIF guards the ratio: with ANSI mode enabled a zero total_claims
        # would otherwise raise a divide-by-zero error and abort the report.
        # A NULL ratio falls through to 'COMPLIANT', which matches what the
        # unguarded division produced with ANSI mode disabled.
        provider_compliance_report = spark.sql("""
        SELECT 
          provider_id, provider_name, tin, total_claims, total_amount,
          avg_claim_amount, avg_fraud_score, high_risk_claims,
          ROUND((high_risk_claims / NULLIF(total_claims, 0)) * 100, 2) as high_risk_percentage,
          CASE 
            WHEN (high_risk_claims / NULLIF(total_claims, 0)) > 0.3 THEN 'REVIEW REQUIRED'
            WHEN (high_risk_claims / NULLIF(total_claims, 0)) > 0.1 THEN 'MONITOR'
            ELSE 'COMPLIANT'
          END as compliance_status
        FROM medisure_jen.gold.gold_provider_performance
        WHERE reporting_period = date_format(current_date(), 'yyyy-MM')
        """)

        # One aggregation yields both the total and the review-required count.
        status_counts = _count_by_value(provider_compliance_report, "compliance_status")
        providers_found = sum(status_counts.values())
        review_required = status_counts.get("REVIEW REQUIRED", 0)

        print(f"Provider compliance: {providers_found} providers, {review_required} need review")

        _overwrite_delta_table(
            provider_compliance_report, "medisure_jen.audit.provider_compliance_daily"
        )
        provider_count = providers_found

        # Display top providers needing review
        if review_required > 0:
            print("Top providers requiring review:")
            display(provider_compliance_report.filter(col("compliance_status") == "REVIEW REQUIRED")
                .select("provider_id", "provider_name", "high_risk_percentage", "total_claims")
                .orderBy(col("high_risk_percentage").desc())
                .limit(5))

    except Exception as e:
        print(f"Error generating provider compliance report: {e}")
        failed_reports.append("Provider Compliance")

    # =====================================================
    # 2. Member Risk Profiling (ALWAYS generated)
    # =====================================================
    print("Generating member risk profiles...")
    try:
        # NOTE(review): this filter matches a daily 'yyyy-MM-dd' value while
        # the provider report matches a monthly 'yyyy-MM' value -- confirm
        # that summary_period really is stored per day.
        member_risk_profiles = spark.sql("""
        SELECT 
          member_id, first_name, last_name, claims_count,
          total_claimed, member_risk_score,
          CASE 
            WHEN member_risk_score > 0.8 THEN 'HIGH RISK'
            WHEN member_risk_score > 0.5 THEN 'MEDIUM RISK'
            ELSE 'LOW RISK'
          END as risk_category
        FROM medisure_jen.gold.gold_member_claims_summary
        WHERE summary_period = date_format(current_date(), 'yyyy-MM-dd')
        """)

        category_counts = _count_by_value(member_risk_profiles, "risk_category")
        members_found = sum(category_counts.values())
        high_risk_members = category_counts.get("HIGH RISK", 0)

        print(f"Member risk: {members_found} members, {high_risk_members} high risk")

        _overwrite_delta_table(member_risk_profiles, "medisure_jen.audit.member_risk_daily")
        total_members = members_found

        # Display high risk members
        if high_risk_members > 0:
            print("High risk members:")
            display(member_risk_profiles.filter(col("risk_category") == "HIGH RISK")
                .select("member_id", "first_name", "last_name", "member_risk_score", "claims_count")
                .orderBy(col("member_risk_score").desc())
                .limit(5))

    except Exception as e:
        print(f"Error generating member risk profiles: {e}")
        failed_reports.append("Member Risk Profiles")

    # =====================================================
    # 3. Fraud Trends Analysis 
    # =====================================================
    print("Generating fraud trends analysis...")
    try:
        # NOTE(review): the lower bound is the month six months back and the
        # current month is not excluded, so up to seven distinct months can
        # be returned if processing_month is a 'yyyy-MM' string -- confirm.
        fraud_trends = spark.sql("""
        SELECT 
          processing_month, COUNT(*) as total_claims,
          SUM(CASE WHEN fraud_risk_score > 0.7 THEN 1 ELSE 0 END) as high_risk_claims,
          ROUND(AVG(fraud_risk_score), 3) as avg_fraud_score,
          ROUND(SUM(CASE WHEN fraud_risk_score > 0.7 THEN claim_amount ELSE 0 END), 2) as high_risk_amount
        FROM medisure_jen.gold.gold_claims_analytics
        WHERE processing_month >= date_format(add_months(current_date(), -6), 'yyyy-MM')
        GROUP BY processing_month
        ORDER BY processing_month
        """)

        print("Fraud trends analysis:")
        display(fraud_trends)

        _overwrite_delta_table(fraud_trends, "medisure_jen.audit.fraud_trends_daily")

        # Counted inside the try block so a Spark failure here is handled
        # like any other section error instead of crashing the summary.
        trend_months = fraud_trends.count()

    except Exception as e:
        print(f"Error generating fraud trends: {e}")
        failed_reports.append("Fraud Trends")

    # =====================================================
    # 4. Final Summary
    # =====================================================
    print(banner)
    print("COMPLIANCE REPORTING COMPLETED")
    print(banner)
    print("Reports Generated:")
    print(f"   • Provider Compliance: {provider_count if provider_count is not None else 'N/A'} providers")
    print(f"   • Member Risk Profiles: {total_members if total_members is not None else 'N/A'} members")
    print(f"   • Fraud Trends: {trend_months if trend_months is not None else 'N/A'} months analyzed")
    if failed_reports:
        # NOTE(review): failures are only reported, not raised, to keep the
        # existing best-effort behaviour; consider raising here if the
        # scheduler should mark the run as failed.
        print(f"Sections with errors (see messages above): {', '.join(failed_reports)}")
    print(banner)

# Entry point: run the reporting pipeline only when this file is executed
# directly (``__name__`` equals "__main__"), not when it is imported.
if __name__ == "__main__":
    main()