In [1]:
# Install Java 11
!apt-get update
!apt-get install -y openjdk-11-jdk-headless -qq > /dev/null

# Download Spark 3.5.4 with Hadoop 3.3
!wget https://downloads.apache.org/spark/spark-3.5.4/spark-3.5.4-bin-hadoop3.tgz
!tar xf spark-3.5.4-bin-hadoop3.tgz

# Set environment variables for Java and Spark
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.5.4-bin-hadoop3"
os.environ["SPARK_VERSION"] = "3.5"

# Install PySpark 3.5.0, PyDeequ, Pandas and SQLite
!pip install pyspark==3.5.0 pydeequ pandas faker
!apt-get install sqlite3

# Download SQLite JDBC driver
!wget https://repo1.maven.org/maven2/org/xerial/sqlite-jdbc/3.42.0.0/sqlite-jdbc-3.42.0.0.jar -O /content/sqlite-jdbc-3.42.0.0.jar

# Download Deequ 2.0.7 JAR for Spark 3.5
!wget https://repo1.maven.org/maven2/com/amazon/deequ/deequ/2.0.7-spark-3.5/deequ-2.0.7-spark-3.5.jar -O /content/deequ-2.0.7-spark-3.5.jar


0% [Working]            Hit:1 http://archive.ubuntu.com/ubuntu jammy InRelease
0% [Connecting to security.ubuntu.com (185.125.190.81)] [Connected to cloud.r-project.org (3.161.136                                                                                                    Get:2 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Get:3 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]
Get:4 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Get:5 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
Get:6 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [127 kB]
Get:7 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]
Hit:8 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:9 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:10 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Ge

In [2]:
# Install tabulate, which the validation cell imports to render its summary tables
!pip install tabulate



In [10]:
import logging
from pyspark.sql import SparkSession
from pyspark.sql.functions import mean, min, max, col, count
from pydeequ.analyzers import *
from pydeequ.checks import *
from pydeequ.verification import *
import pandas as pd
from datetime import datetime
import json
from tabulate import tabulate
import sqlite3

# Send INFO-and-above records both to a log file in the working directory
# and to the notebook's stream output, using a timestamped line format.
logging.basicConfig(
    handlers=[
        logging.FileHandler('data_quality_validation.log'),
        logging.StreamHandler(),
    ],
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Module-level logger shared by every function in this cell
logger = logging.getLogger(__name__)

def create_spark_session():
    """Create (or reuse) the Spark session used by the validation run.

    The SQLite JDBC driver and the Deequ JAR stored under /content are
    registered through the ``spark.jars`` setting.

    Returns:
        The session object returned by ``SparkSession.builder.getOrCreate()``.

    Raises:
        Exception: any failure while building the session is logged and re-raised.
    """
    logger.info("Initializing Spark session...")
    jar_list = "/content/sqlite-jdbc-3.42.0.0.jar,/content/deequ-2.0.7-spark-3.5.jar"
    try:
        session_builder = SparkSession.builder.config("spark.jars", jar_list)
        session = session_builder.getOrCreate()
        logger.info("Spark session created successfully")
        return session
    except Exception as e:
        logger.error(f"Failed to create Spark session: {str(e)}")
        raise

def load_dataframes(spark):
    """Read the patients, encounters and procedures CSV files into Spark DataFrames.

    The files are read from the current working directory with a header row and
    schema inference enabled; the record count of each is logged after loading.

    Args:
        spark: Spark session used to read the files.

    Returns:
        A ``(patients_df, encounters_df, procedures_df)`` tuple.

    Raises:
        Exception: any read or count failure is logged and re-raised.
    """
    logger.info("Loading data from CSV files...")
    try:
        csv_options = {"header": True, "inferSchema": True}
        # The generator is consumed in order, so the files are read one after another.
        patients_df, encounters_df, procedures_df = (
            spark.read.csv(csv_name, **csv_options)
            for csv_name in ('patients.csv', 'encounters.csv', 'procedures.csv')
        )

        # Log basic statistics (each count is computed right before its message)
        patient_rows = patients_df.count()
        logger.info(f"Patients dataset loaded: {patient_rows} records")
        encounter_rows = encounters_df.count()
        logger.info(f"Encounters dataset loaded: {encounter_rows} records")
        procedure_rows = procedures_df.count()
        logger.info(f"Procedures dataset loaded: {procedure_rows} records")

        return patients_df, encounters_df, procedures_df
    except Exception as e:
        logger.error(f"Error loading data: {str(e)}")
        raise

def _cost_statistics(df, cost_column, label):
    """Return mean/min/max analyzer entries for ``cost_column``.

    ``mean``, ``min`` and ``max`` are the pyspark.sql.functions aggregates
    imported at the top of the cell, not the Python builtins. An aggregate that
    comes back as ``None`` (no non-null values in the column) is skipped with a
    warning instead of being passed to ``float()``, which would raise TypeError.
    """
    stats = df.select(
        mean(cost_column).alias("mean"),
        min(cost_column).alias("min"),
        max(cost_column).alias("max"),
    ).collect()[0]

    entries = []
    for stat_key, stat_title in (("mean", "Mean"), ("min", "Min"), ("max", "Max")):
        stat_value = stats[stat_key]
        if stat_value is None:
            logger.warning(
                f"{stat_title} of {cost_column} is undefined (no non-null values); skipping"
            )
            continue
        entries.append({
            "analyzer": f"{stat_title} {label} Cost",
            "value": float(stat_value)
        })
    return entries


def analyze_dataset(spark, df, dataset_name):
    """Compute descriptive quality metrics for a single dataset.

    Every dataset gets a record count and a per-column completeness ratio
    (share of non-null values). In addition:

    * ``"patients"``   - share of records per ``GENDER`` value
    * ``"encounters"`` - mean/min/max of ``BASE_ENCOUNTER_COST``
    * ``"procedures"`` - mean/min/max of ``BASE_COST``

    Args:
        spark: Spark session (unused here; kept for interface compatibility).
        df: Spark DataFrame to analyze.
        dataset_name: Name selecting the dataset-specific analyzers.

    Returns:
        A list of ``{"analyzer": str, "value": number}`` dicts. For an empty
        dataset only the record count is returned, because the ratio and cost
        metrics are undefined without rows.

    Raises:
        Exception: any failure is logged and re-raised.
    """
    logger.info(f"Analyzing {dataset_name} dataset...")
    try:
        # Basic dataset metrics
        row_count = df.count()
        analysis_results = [{
            "analyzer": "Record Count",
            "value": row_count
        }]

        # Guard against division by zero: ratios are undefined without rows.
        if row_count == 0:
            logger.warning(
                f"{dataset_name} dataset is empty; skipping ratio and cost analyzers"
            )
            return analysis_results

        # Column completeness analysis: count(column) counts non-null values,
        # so one aggregation covers every column instead of one job per column.
        column_names = df.columns
        if column_names:
            non_null_counts = df.select(
                [count(df[name]).alias(f"c{position}")
                 for position, name in enumerate(column_names)]
            ).collect()[0]
            for position, name in enumerate(column_names):
                analysis_results.append({
                    "analyzer": f"Completeness ({name})",
                    "value": non_null_counts[position] / float(row_count)
                })

        # Dataset-specific analysis
        if dataset_name == "patients":
            # Gender distribution
            for row in df.groupBy("GENDER").count().collect():
                analysis_results.append({
                    "analyzer": f"Gender Distribution ({row['GENDER']})",
                    "value": row['count'] / float(row_count)
                })
        elif dataset_name == "encounters":
            analysis_results.extend(
                _cost_statistics(df, "BASE_ENCOUNTER_COST", "Encounter")
            )
        elif dataset_name == "procedures":
            analysis_results.extend(
                _cost_statistics(df, "BASE_COST", "Procedure")
            )

        return analysis_results

    except Exception as e:
        logger.error(f"Error analyzing {dataset_name} dataset: {str(e)}")
        raise

def _fraction(df, condition, total_rows):
    """Return the share of rows in ``df`` that satisfy ``condition`` (0.0 if ``df`` is empty)."""
    if total_rows == 0:
        return 0.0
    return df.filter(condition).count() / float(total_rows)


def _check_result(description, label, ratio, passed):
    """Build one verification entry in the shape the results summary expects."""
    return {
        "check_description": description,
        "status": "Success" if passed else "Error",
        "details": f"{label}: {ratio:.2%}"
    }


def verify_dataset(spark, df, dataset_name, completeness_threshold=0.9):
    """Run pass/fail verification checks on a dataset.

    Checks performed:

    * all datasets - completeness of the ID field (``Id`` for patients,
      ``PATIENT`` for encounters and procedures)
    * ``"patients"``   - ``GENDER`` is "M" or "F" for every row; ``BIRTHDATE`` completeness
    * ``"encounters"`` - ``BASE_ENCOUNTER_COST`` >= 0 for every row;
      ``START``/``STOP`` both present
    * ``"procedures"`` - ``BASE_COST`` >= 0 for every row; ``DESCRIPTION``
      completeness; ``START``/``STOP`` both present

    Completeness checks pass at ``completeness_threshold`` or above; format and
    cost checks require every row to be valid. Null values never satisfy a
    format or cost condition, so they count as invalid. On an empty dataset
    every ratio is reported as 0% and the checks fail.

    Args:
        spark: Spark session (unused here; kept for interface compatibility).
        df: Spark DataFrame to verify.
        dataset_name: One of "patients", "encounters" or "procedures".
        completeness_threshold: Minimum non-null share for completeness checks.

    Returns:
        A list of dicts with keys ``check_description``, ``status``
        ("Success" or "Error") and ``details``.

    Raises:
        ValueError: if ``dataset_name`` is not a known dataset.
        Exception: any other failure is logged and re-raised.
    """
    logger.info(f"Verifying {dataset_name} dataset...")
    try:
        # Dataset-specific ID field; reject unknown names explicitly instead of
        # failing later on an unbound variable.
        id_fields = {"patients": "Id", "encounters": "PATIENT", "procedures": "PATIENT"}
        if dataset_name not in id_fields:
            raise ValueError(f"Unknown dataset name: {dataset_name!r}")
        id_field = id_fields[dataset_name]

        # Count once and reuse: every df.count() call is a separate Spark job.
        total_rows = df.count()
        if total_rows == 0:
            logger.warning(f"{dataset_name} dataset is empty; all checks will fail")

        def completeness_check(description, label, condition):
            ratio = _fraction(df, condition, total_rows)
            return _check_result(description, label, ratio, ratio >= completeness_threshold)

        def all_rows_check(description, label, condition):
            ratio = _fraction(df, condition, total_rows)
            return _check_result(description, label, ratio, ratio == 1.0)

        # Common check using the appropriate ID field
        verification_results = [
            completeness_check(
                f"{id_field} Completeness Check",
                f"{id_field} completeness",
                df[id_field].isNotNull(),
            )
        ]

        # Dataset-specific checks
        if dataset_name == "patients":
            verification_results.append(all_rows_check(
                "Gender Format Check",
                "Valid gender formats",
                df["GENDER"].isin(["M", "F"]),
            ))
            verification_results.append(completeness_check(
                "Birthdate Completeness Check",
                "Birthdate completeness",
                df["BIRTHDATE"].isNotNull(),
            ))

        elif dataset_name == "encounters":
            verification_results.append(all_rows_check(
                "Encounter Cost Validation",
                "Valid costs (non-negative)",
                df["BASE_ENCOUNTER_COST"] >= 0,
            ))
            verification_results.append(completeness_check(
                "Date Completeness Check",
                "Date completeness",
                df["START"].isNotNull() & df["STOP"].isNotNull(),
            ))

        elif dataset_name == "procedures":
            verification_results.append(all_rows_check(
                "Procedure Cost Validation",
                "Valid costs (non-negative)",
                df["BASE_COST"] >= 0,
            ))
            verification_results.append(completeness_check(
                "Description Completeness Check",
                "Description completeness",
                df["DESCRIPTION"].isNotNull(),
            ))
            verification_results.append(completeness_check(
                "Date Completeness Check",
                "Date completeness",
                df["START"].isNotNull() & df["STOP"].isNotNull(),
            ))

        return verification_results

    except Exception as e:
        logger.error(f"Error verifying {dataset_name} dataset: {str(e)}")
        raise

def save_to_sqlite(df, table_name, db_path):
    """Write a Spark DataFrame to a table in a SQLite database file.

    The DataFrame is converted with ``toPandas()`` and written through
    ``DataFrame.to_sql``; an existing table with the same name is replaced.

    Args:
        df: Spark DataFrame to persist.
        table_name: Name of the destination table.
        db_path: Path of the SQLite database file.

    Raises:
        Exception: any conversion or write failure is logged and re-raised.
    """
    try:
        pandas_df = df.toPandas()
        conn = sqlite3.connect(db_path)
        try:
            pandas_df.to_sql(table_name, conn, if_exists='replace', index=False)
        finally:
            # Always release the connection, even when the write fails;
            # otherwise the handle (and its file lock) would leak.
            conn.close()
        logger.info(f"Successfully saved {table_name} to SQLite")
    except Exception as e:
        logger.error(f"Error saving {table_name} to SQLite: {str(e)}")
        raise

def _format_analysis_value(analyzer, value):
    """Format one analyzer value for the summary table.

    Ratio analyzers ("Completeness (...)" and "Gender Distribution (...)") are
    shown as percentages. The choice is made from the analyzer name rather than
    from the magnitude of the value, so a cost of 1.0 or less (e.g. a minimum
    cost of 0.0) is not mistaken for a ratio. Integers such as record counts are
    shown without decimals; everything else uses two decimals.
    """
    if analyzer.startswith(("Completeness", "Gender Distribution")):
        return f"{value:.2%}"
    if isinstance(value, int):
        return f"{value:,}"
    return f"{value:,.2f}"


def print_results_summary(results):
    """Print the validation results as three grid-formatted tables.

    Args:
        results: Dict with the keys ``"dataset_metrics"`` (dataset ->
            {metric name: value}), ``"analysis_results"`` (dataset -> list of
            ``{"analyzer", "value"}`` dicts) and ``"verification_results"``
            (dataset -> list of ``{"check_description", "status", "details"}``
            dicts).
    """
    print("\n=== DATA QUALITY VALIDATION SUMMARY ===\n")

    # Print dataset metrics
    print("Dataset Metrics:")
    metrics_table = [
        [dataset, metric_name, value]
        for dataset, metrics in results["dataset_metrics"].items()
        for metric_name, value in metrics.items()
    ]
    print(tabulate(metrics_table,
                  headers=["Dataset", "Metric", "Value"],
                  tablefmt="grid"))

    # Print analysis results
    print("\nAnalysis Results:")
    analysis_table = [
        [
            dataset,
            analysis["analyzer"],
            _format_analysis_value(analysis["analyzer"], analysis["value"]),
        ]
        for dataset, analyses in results["analysis_results"].items()
        for analysis in analyses
    ]
    print(tabulate(analysis_table,
                  headers=["Dataset", "Analyzer", "Value"],
                  tablefmt="grid"))

    # Print verification results
    print("\nVerification Results:")
    verification_table = [
        [
            dataset,
            verification["check_description"],
            verification["status"],
            verification["details"],
        ]
        for dataset, verifications in results["verification_results"].items()
        for verification in verifications
    ]
    print(tabulate(verification_table,
                  headers=["Dataset", "Check Description", "Status", "Details"],
                  tablefmt="grid"))

def main():
    """Run the whole validation pipeline end to end.

    Steps: start Spark, load the three CSV datasets, collect metrics, analysis
    and verification results per dataset, write them to
    ``data_quality_results.json``, copy each dataset into
    ``healthcare_data.db`` and print the summary tables. The Spark session is
    stopped on the way out whenever it was created.

    Raises:
        Exception: any failure is logged and re-raised.
    """
    # Sentinel so the cleanup step can tell whether a session was ever created
    spark = None
    try:
        logger.info("Starting data quality validation process")

        # Initialize Spark and load the input data
        spark = create_spark_session()
        patients_df, encounters_df, procedures_df = load_dataframes(spark)

        # Container for everything that ends up in the JSON report
        results = {
            "timestamp": datetime.now().isoformat(),
            "dataset_metrics": {},
            "analysis_results": {},
            "verification_results": {}
        }

        # Datasets to process, keyed by the name used in reports and table names
        datasets = {
            "patients": patients_df,
            "encounters": encounters_df,
            "procedures": procedures_df
        }

        for name, frame in datasets.items():
            # Basic metrics first, then analysis, then verification
            results["dataset_metrics"][name] = {
                "record_count": frame.count(),
                "column_count": len(frame.columns)
            }
            results["analysis_results"][name] = analyze_dataset(spark, frame, name)
            results["verification_results"][name] = verify_dataset(spark, frame, name)

        # Save results to JSON
        with open('data_quality_results.json', 'w') as results_file:
            json.dump(results, results_file, indent=4)

        # Save to SQLite
        database_path = 'healthcare_data.db'
        for name, frame in datasets.items():
            save_to_sqlite(frame, name, database_path)

        # Print results summary
        print_results_summary(results)

        logger.info("Data quality validation process completed successfully")

    except Exception as e:
        logger.error(f"Error in main execution: {str(e)}")
        raise
    finally:
        if spark is not None:
            spark.stop()
            logger.info("Spark session stopped")

# Entry point: run the full validation pipeline when this code is executed
# directly (as it is when the notebook cell runs) rather than imported.
if __name__ == "__main__":
    main()


=== DATA QUALITY VALIDATION SUMMARY ===

Dataset Metrics:
+------------+--------------+---------+
| Dataset    | Metric       |   Value |
| patients   | record_count |     974 |
+------------+--------------+---------+
| patients   | column_count |      20 |
+------------+--------------+---------+
| encounters | record_count |   27891 |
+------------+--------------+---------+
| encounters | column_count |      14 |
+------------+--------------+---------+
| procedures | record_count |   47701 |
+------------+--------------+---------+
| procedures | column_count |       9 |
+------------+--------------+---------+

Analysis Results:
+------------+------------------------------------+------------+
| Dataset    | Analyzer                           | Value      |
| patients   | Record Count                       | 974.00     |
+------------+------------------------------------+------------+
| patients   | Completeness (Id)                  | 100.00%    |
+------------+-----------------------