## Pre-Ingestion Table Profiler
####  This notebook analyzes source tables to:
####  1. Identify existing indexes, which help identify candidate columns that can be leveraged as a partition column and also speed up MIN/MAX queries at runtime
####  2. Profile columns for skew and recommend optimal partition columns
####  3. Estimate ingestion complexity

In [0]:
# Declare the notebook's input widgets as (name, default value, display label).
for _widget_name, _widget_default, _widget_label in (
    ("src_catalog", "sqlserver_edwia_catalog", "Source Catalog"),
    ("src_schema", "dbo", "Source Schema"),
    ("src_table", "store_sales_1tb", "Source Table"),
):
    dbutils.widgets.text(_widget_name, _widget_default, _widget_label)


# 1. Check Existing Indexes on Source Table

# COMMAND ----------

# Read the current widget values back, in the same order the widgets were declared.
src_catalog, src_schema, src_table = (
    dbutils.widgets.get(_widget_name)
    for _widget_name in ("src_catalog", "src_schema", "src_table")
)

#### 1. Identify existing indexes

In [0]:
# Query the source catalog views (sys.indexes / sys.index_columns / sys.columns)
# for integer-typed key columns of indexes on the selected table. Clustered
# indexes sort first (priority 1), then nonclustered (2), then anything else (3).
# NOTE(review): any key column of an index is accepted, not only the leading
# one; confirm whether a filter on ic.key_ordinal = 1 is wanted, since a
# non-leading key column may not give the same MIN/MAX speed-up.
index_discovery_query = f"""
SELECT DISTINCT
    c.name as column_name,
    i.name as index_name,
    i.type_desc as index_type,
    CASE 
        WHEN i.type_desc = 'CLUSTERED' THEN 1
        WHEN i.type_desc = 'NONCLUSTERED' THEN 2
        ELSE 3
    END as index_priority
FROM {src_catalog}.sys.indexes i
INNER JOIN {src_catalog}.sys.index_columns ic 
    ON i.object_id = ic.object_id AND i.index_id = ic.index_id
INNER JOIN {src_catalog}.sys.columns c 
    ON ic.object_id = c.object_id AND ic.column_id = c.column_id
INNER JOIN {src_catalog}.sys.tables t 
    ON i.object_id = t.object_id
INNER JOIN {src_catalog}.sys.schemas s 
    ON t.schema_id = s.schema_id
WHERE s.name = '{src_schema}' 
    AND t.name = '{src_table}'
    AND i.type > 0  -- Exclude heap
    AND ic.is_included_column = 0  -- Only key columns, not included columns
    AND c.system_type_id IN (48, 52, 56, 127)  -- tinyint, smallint, int, bigint only
ORDER BY index_priority, column_name
"""

print(f"Discovering indexed columns on {src_schema}.{src_table}...")
indexed_columns_df = spark.sql(index_discovery_query)

# The result has one row per (column, index) pair, so a column that is a key of
# several indexes appears several times. Keep only the first occurrence (rows
# arrive ordered by index priority) so each column is profiled exactly once.
indexed_columns = list(
    dict.fromkeys(row['column_name'] for row in indexed_columns_df.collect())
)

if not indexed_columns:
    print("⚠️ WARNING: No indexed numeric columns found. Creating an index on your chosen partition column is CRITICAL for performance.")
    print("Consider creating an index before running ingestion.")
else:
    print(f"✓ Found {len(indexed_columns)} indexed columns to profile: {', '.join(indexed_columns)}")
    display(indexed_columns_df)

Discovering indexed columns on dbo.store_sales_1tb...
✓ Found 2 indexed columns to profile: ss_item_sk, ss_sold_date_sk


column_name,index_name,index_type,index_priority
ss_item_sk,idx_ss_item_sk,NONCLUSTERED,2
ss_sold_date_sk,idx_ss_sold_date_sk,NONCLUSTERED,2



#### 2. Profile Columns for Skew and Distribution

In [0]:
# Imported before branching: both branches below build a pandas DataFrame, and
# the "no indexed columns" branch previously failed with NameError on a fresh
# kernel because the import only ran inside the other branch.
import pandas as pd

if indexed_columns:
    # One dict per profiled column: either the metric row or {'column_name', 'error'}.
    results = []
    
    for column in indexed_columns:
        try:
            print(f"Profiling {column}... ", end="")
            
            # Full table profiling with index support - should be fast.
            #   freq       : number of rows per distinct value of the column
            #                (GROUP BY also yields one group for NULL, if any)
            #   stats      : summary of those per-value frequencies
            #   null_check : NULL count and total row count, read from the table again
            # coefficient_variation = stddev_freq / avg_freq
            # skew_ratio            = max_freq / avg_freq
            profile_query = f"""
            WITH freq AS (
                SELECT 
                    {column},
                    COUNT(*) AS frequency
                FROM {src_catalog}.{src_schema}.{src_table}
                WITH(fetchSize = 200000)
                GROUP BY {column}
            ),
            stats AS (
                SELECT
                    COUNT(*) as distinct_values,
                    AVG(CAST(frequency AS DOUBLE)) AS avg_freq,
                    STDDEV_POP(frequency) AS stddev_freq,
                    MAX(frequency) AS max_freq,
                    MIN(frequency) AS min_freq,
                    SUM(frequency) as total_rows
                FROM freq
            ),
            null_check AS (
                SELECT COUNT(*) - COUNT({column}) as null_count,
                       COUNT(*) as total
                FROM {src_catalog}.{src_schema}.{src_table}
            )
            SELECT
                '{column}' as column_name,
                s.distinct_values,
                ROUND(n.null_count * 100.0 / n.total, 2) as null_percentage,
                ROUND(s.stddev_freq / NULLIF(s.avg_freq, 0), 3) AS coefficient_variation,
                ROUND(s.max_freq * 1.0 / NULLIF(s.avg_freq, 0), 2) AS skew_ratio,
                s.total_rows,
                s.min_freq as min_rows_per_value,
                s.max_freq as max_rows_per_value
            FROM stats s
            CROSS JOIN null_check n
            """
            
            # The query returns exactly one summary row for the column.
            result = spark.sql(profile_query).collect()[0]
            results.append(result.asDict())
            print(f"✓ CV={result['coefficient_variation']}, Skew={result['skew_ratio']}")
            
        except Exception as e:
            # Best effort: record the failure and continue with the next column
            # so a single bad column does not abort the whole profiling run.
            print(f"✗ Error: {str(e)}")
            results.append({
                'column_name': column,
                'error': str(e)
            })
    
    # Convert to DataFrame
    results_df = pd.DataFrame(results)
else:
    print("No indexed columns to profile. Please create indexes first.")
    results_df = pd.DataFrame()

Profiling ss_item_sk... ✓ CV=0.503, Skew=2.1
Profiling ss_sold_date_sk... ✓ CV=0.715, Skew=21.62


#### 3. Partition Column Recommendations

In [0]:
if not results_df.empty and 'coefficient_variation' in results_df.columns:
    # Calculate partition score considering BOTH metrics
    def calculate_score(row):
        """Score a profiled column from 0 to 100 as a partition-column candidate.

        Args:
            row: Mapping-like record (pandas Series or dict) with the keys
                'coefficient_variation', 'skew_ratio', 'null_percentage'
                and 'distinct_values'.

        Returns:
            0 when either metric is missing, when more than 5% of the rows are
            NULL, or when there are fewer than 100 distinct values. Otherwise
            the sum of a CV score (0-50) and a skew score (0-50); higher is
            better.
        """
        cv = row.get('coefficient_variation')
        skew = row.get('skew_ratio')

        # Rows recorded for a failed profiling query carry no metrics.
        if pd.isna(cv) or pd.isna(skew):
            return 0
        if row.get('null_percentage', 0) > 5:
            return 0
        if row.get('distinct_values', 0) < 100:
            return 0
        
        # Score based on CoV (0-50 points)
        if cv < 0.5:
            cv_score = 50
        elif cv < 1.0:
            cv_score = 40
        elif cv < 2.0:
            cv_score = 25
        else:
            # Linear decay that reaches 0 at cv = 5, capped at the previous
            # tier's 25 points so a larger CV can never score higher than a
            # smaller one (uncapped, cv = 2.0 would earn 30 > 25).
            cv_score = max(0, min(25, 50 - cv * 10))
        
        # Score based on skew ratio (0-50 points)
        if skew < 3:
            skew_score = 50
        elif skew < 10:
            skew_score = 35
        elif skew < 20:
            skew_score = 20
        elif skew < 50:
            skew_score = 10
        else:
            skew_score = 0
        
        return cv_score + skew_score
    
    results_df['partition_score'] = results_df.apply(calculate_score, axis=1)
    
    # Add recommendations based on BOTH metrics
    def get_recommendation(row):
        """Return a human-readable verdict for one profiled column.

        Args:
            row: Mapping-like record (pandas Series or dict) with the keys
                'coefficient_variation', 'skew_ratio', 'null_percentage'
                and 'distinct_values'.

        Returns:
            An 'ERROR: ...' label when the metrics are missing, a
            'DISQUALIFIED: ...' label when the NULL or distinct-value limits
            are violated, otherwise a quality tier from 'EXCELLENT' to
            'VERY POOR'.
        """
        cv = row.get('coefficient_variation')
        skew = row.get('skew_ratio')

        # A row recorded for a failed profiling query holds NaN/None metrics.
        # Every comparison against NaN is False, so without this guard such a
        # row would fall through to the 'VERY POOR' label.
        if pd.isna(cv) or pd.isna(skew):
            return 'ERROR: profiling failed, no metrics available'

        if row.get('null_percentage', 0) > 5:
            return 'DISQUALIFIED: >5% NULLs'
        if row.get('distinct_values', 0) < 100:
            return 'DISQUALIFIED: <100 distinct values'
        
        # Must pass BOTH thresholds for each level
        if cv < 0.5 and skew < 3:
            return 'EXCELLENT - Use this!'
        elif cv < 1.0 and skew < 10:
            return 'GOOD - Acceptable'
        elif cv < 2.0 and skew < 20:
            return 'FAIR - Some skew expected'
        elif cv < 3.0 and skew < 50:
            return 'POOR - High skew, long-tail partitions likely'
        else:
            return 'VERY POOR - Severe skew, avoid if possible'
    
    results_df['recommendation'] = results_df.apply(get_recommendation, axis=1)
    
    # Sort by score, best candidate first
    results_df = results_df.sort_values('partition_score', ascending=False)
    
    # Display results
    print("\n📊 PROFILING RESULTS (Indexed Columns Only)")
    print("=" * 80)
    display(results_df[['column_name', 'null_percentage', 'distinct_values', 'coefficient_variation', 
                        'skew_ratio', 'min_rows_per_value', 'max_rows_per_value', 
                        'partition_score', 'recommendation']])
    
    # Best candidate = first row after sorting. The enclosing guard already
    # ensures results_df is non-empty, so there is always a first row.
    best_column = results_df.iloc[0]
    
    # Only recommend a column that scored more than 50 of the 100 points.
    if best_column['partition_score'] > 50:
        print(f"\n🎯 RECOMMENDED PARTITION COLUMN: {best_column['column_name']}")
        print(f"   - Coefficient of Variation: {best_column['coefficient_variation']}")
        print(f"   - Skew Ratio: {best_column['skew_ratio']} (max value is {best_column['skew_ratio']}x average)")
        # distinct_values is stored as float when another column's profiling
        # failed (NaN forces a float column); cast so it prints as "402,000"
        # rather than "402,000.0".
        print(f"   - Distinct Values: {int(best_column['distinct_values']):,}")
        print(f"   - NULL Percentage: {best_column['null_percentage']}%")
        print(f"   - Score: {best_column['partition_score']:.0f}/100")
        
        # Performance prediction based on skew ratio
        skew = best_column['skew_ratio']
        if skew < 3:
            perf = "Uniform partition execution (all partitions ±50% of average time)"
        elif skew < 10:
            perf = f"Most partitions fast, but slowest could take {skew:.0f}x average time"
        elif skew < 20:
            perf = f"Significant variance - expect some partitions to take {skew:.0f}x longer"
        else:
            perf = f"SEVERE SKEW WARNING: Some partitions may take {skew:.0f}x longer than average!"
        
        print(f"\n   📈 Expected Performance: {perf}")
        
        # Partition size recommendation based on skew: the more skew, the
        # smaller the suggested partitions.
        if skew < 5:
            print("   📦 Recommended partition_size_mb: 2048-4096 (larger partitions OK)")
        elif skew < 20:
            print("   📦 Recommended partition_size_mb: 1024-2048 (moderate size)")
        else:
            print("   📦 Recommended partition_size_mb: 512-1024 (smaller to limit impact)")
            
    else:
        print("\n⚠️ No ideal partition column found among indexed columns.")
        print("   All columns have either high skew or other issues.")
        print("   Consider:")
        print("   1. Creating an index on a different column")
        print("   2. Using a synthetic partition key (MOD of a sequential ID)")
        print("   3. Round-robin distribution if skew cannot be avoided")



📊 PROFILING RESULTS (Indexed Columns Only)


column_name,null_percentage,distinct_values,coefficient_variation,skew_ratio,min_rows_per_value,max_rows_per_value,partition_score,recommendation
ss_item_sk,0.0,402000,0.503,2.1,6030,36780,90,GOOD - Acceptable
ss_sold_date_sk,0.0,1827,0.715,21.62,126309,83202375,50,"POOR - High skew, long-tail partitions likely"



🎯 RECOMMENDED PARTITION COLUMN: ss_item_sk
   - Coefficient of Variation: 0.503
   - Skew Ratio: 2.1 (max value is 2.1x average)
   - Distinct Values: 402,000
   - NULL Percentage: 0.00%
   - Score: 90/100

   📈 Expected Performance: Uniform partition execution (all partitions ±50% of average time)
   📦 Recommended partition_size_mb: 2048-4096 (larger partitions OK)


#### 4. Detailed Comparison Matrix


In [0]:

# Columns this cell reads. They are missing when every profiling query failed
# (no metric columns) or when the recommendation cell did not add
# 'partition_score'; selecting them anyway would raise KeyError.
comparison_columns = ['column_name', 'coefficient_variation', 'skew_ratio', 'partition_score']
has_comparison_columns = set(comparison_columns).issubset(results_df.columns)

if len(results_df) > 1 and has_comparison_columns:
    print("\n📊 SIDE-BY-SIDE COMPARISON")
    print("=" * 80)
    
    # Create comparison matrix
    comparison_df = results_df[comparison_columns].copy()
    
    # Add visual indicators: ✅ good, ⚠️ borderline, ❌ poor. A missing (NaN)
    # metric fails both comparisons and is therefore shown as ❌.
    comparison_df['CV_indicator'] = comparison_df['coefficient_variation'].apply(
        lambda x: '✅' if x < 1 else '⚠️' if x < 2 else '❌'
    )
    comparison_df['Skew_indicator'] = comparison_df['skew_ratio'].apply(
        lambda x: '✅' if x < 5 else '⚠️' if x < 20 else '❌'
    )
    
    display(comparison_df[['column_name', 'coefficient_variation', 'CV_indicator', 
                           'skew_ratio', 'Skew_indicator', 'partition_score']])
    
    # Explain why date-like columns (name contains "date") might be problematic
    date_cols = results_df[results_df['column_name'].str.contains('date', case=False)]
    if not date_cols.empty:
        for _, col in date_cols.iterrows():
            if col['skew_ratio'] > 10:
                print(f"\n⚠️ WARNING: {col['column_name']} has skew ratio of {col['skew_ratio']:.1f}")
                print(f"   This means some dates have {col['skew_ratio']:.0f}x more data than average.")
                print(f"   Impact: Some partitions could take {col['skew_ratio']:.0f}x longer to process!")
elif len(results_df) > 1:
    print("\n⚠️ Comparison skipped: profiling metrics or partition scores are missing.")
    print("   Re-run the profiling and recommendation cells and check for profiling errors.")


📊 SIDE-BY-SIDE COMPARISON


column_name,coefficient_variation,CV_indicator,skew_ratio,Skew_indicator,partition_score
ss_item_sk,0.503,✅,2.1,✅,90
ss_sold_date_sk,0.715,✅,21.62,❌,50



   This means some dates have 22x more data than average.
   Impact: Some partitions could take 22x longer to process!
