In [0]:
# utils/data_quality.py
class DataQualityChecker:
    """Run simple data-quality checks against a DataFrame.

    Every check runs one or more ``count()`` actions on the DataFrame it is
    given and returns a plain dict summarising the outcome, including a
    boolean ``'passed'`` flag.

    NOTE(review): ``check_null_percentage`` and ``check_value_range`` use a
    module-level name ``col`` (presumably ``pyspark.sql.functions.col``);
    confirm that the enclosing module imports it.
    """

    def __init__(self, spark):
        """Store the session handle.

        Args:
            spark: Session object kept on ``self.spark``. None of the checks
                defined in this class read it.
        """
        self.spark = spark

    def check_null_percentage(self, df, column, threshold=0.05):
        """Measure the fraction of rows in which ``column`` is null.

        Args:
            df: DataFrame to inspect.
            column: Name of the column to test for nulls.
            threshold: Exclusive upper bound on the null fraction for the
                check to pass (``0.05`` means 5% of rows).

        Returns:
            dict with keys ``'column'``, ``'null_percentage'`` (a fraction
            of the row count, not scaled by 100), ``'passed'`` and
            ``'threshold'``. An empty DataFrame reports a fraction of
            ``0.0``.
        """
        total = df.count()
        if total == 0:
            # No rows means no nulls; report 0.0 instead of dividing by
            # zero, and skip the second count job entirely.
            null_pct = 0.0
        else:
            nulls = df.filter(col(column).isNull()).count()
            null_pct = nulls / total

        return {
            'column': column,
            'null_percentage': null_pct,
            'passed': null_pct < threshold,
            'threshold': threshold
        }

    def check_duplicate_keys(self, df, key_columns):
        """Compare the row count with the number of distinct key tuples.

        Args:
            df: DataFrame to inspect.
            key_columns: Column selection passed straight to
                ``df.select(...)``; presumably a list of column names that
                together form the key -- verify against callers.

        Returns:
            dict with keys ``'total_records'``, ``'unique_keys'``,
            ``'duplicates'`` (``total_records - unique_keys``) and
            ``'passed'`` (true when the two counts are equal).
        """
        total = df.count()
        unique = df.select(key_columns).distinct().count()

        return {
            'total_records': total,
            'unique_keys': unique,
            'duplicates': total - unique,
            'passed': total == unique
        }

    def check_value_range(self, df, column, min_val, max_val):
        """Count rows whose ``column`` value lies outside ``[min_val, max_val]``.

        A row is counted when its value is strictly below ``min_val`` or
        strictly above ``max_val``, so both bounds are inclusive.

        Args:
            df: DataFrame to inspect.
            column: Name of the column to test.
            min_val: Smallest accepted value.
            max_val: Largest accepted value.

        Returns:
            dict with keys ``'column'``, ``'out_of_range_count'`` and
            ``'passed'`` (true when no row is out of range).
        """
        out_of_range = df.filter(
            (col(column) < min_val) | (col(column) > max_val)
        ).count()

        return {
            'column': column,
            'out_of_range_count': out_of_range,
            'passed': out_of_range == 0
        }
