In [22]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

# Load the gnomAD benchmark table that every later cell works on.
df = pd.read_csv(
    '/n/groups/marks/users/courtney/projects/regulatory_genomics/models/LOL-EVE/data/benchmark_data/gnomad_ultra_rare_full.csv'
)

# Boolean row masks over `df`, keyed by the display label of each indel-length class.
# The masks are tied to this frame's index, so they only apply to `df` itself
# (or frames sharing its index).
indel_cuts = {}
indel_cuts['Small Indel (≤2bp)'] = df['indel_length'] <= 2
indel_cuts['Medium Indel (3-15bp)'] = (df['indel_length'] <= 15) & (df['indel_length'] >= 3)
indel_cuts['Large Indel (16-50bp)'] = (df['indel_length'] <= 50) & (df['indel_length'] > 15)


In [23]:
def safe_ratio(ultra, common):
    """Return ``ultra / common``, falling back to 1.0 when the ratio is undefined.

    The fallback is used when ``common`` is zero, when either input is NaN,
    or when the division itself yields NaN or an infinity.
    """
    fallback = 1.0

    # Guard the division: zero denominator or missing inputs.
    if common == 0 or np.isnan(common) or np.isnan(ultra):
        return fallback

    quotient = ultra / common
    return fallback if (np.isnan(quotient) or np.isinf(quotient)) else quotient

def _threshold_counts(scored, model, percentile_thresholds):
    """Count variants and unique genes at or below each score quantile of one group.

    Returns a dict keyed by threshold with ``{'variants': int, 'genes': int}``;
    an empty group yields zero counts for every threshold.
    """
    counts = {}
    for p in percentile_thresholds:
        if len(scored) == 0:
            counts[p] = {'variants': 0, 'genes': 0}
            continue
        # Thresholds are fractions (0.01 == 1%), so use the [0, 1] quantile scale.
        # np.percentile would read 0.01 as the 0.01th percentile.
        cutoff = np.quantile(scored[model], p)
        # Keep the low-score tail (assumes lower scores are the more
        # deleterious ones for every model - confirm per score).
        tail = scored[scored[model] <= cutoff]
        counts[p] = {'variants': len(tail), 'genes': tail['GENE'].nunique()}
    return counts


def calculate_detailed_counts(df, models, percentile_thresholds):
    """Count variants and unique genes per model, in total and below each score quantile.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide boolean ``ultra_rare`` and ``common`` columns, a ``GENE``
        column, and one score column per model to be evaluated.
    models : iterable of str
        Score column names; names that are not columns of ``df`` are skipped.
    percentile_thresholds : iterable of float
        Quantile levels expressed as fractions in [0, 1] (e.g. 0.01 for 1%).

    Returns
    -------
    dict
        ``{model: {'ultra_rare': {...}, 'common': {...}}}`` where each group
        dict holds ``'total_variants'``, ``'total_genes'`` and, for every
        threshold ``p``, ``p -> {'variants': int, 'genes': int}``. Rows with a
        missing score for the model are excluded from all counts.

    Raises
    ------
    ValueError
        Raised by ``np.quantile`` if a threshold lies outside [0, 1].
    """
    detailed_counts = {}

    for model in models:
        # Skip if model column doesn't exist
        if model not in df.columns:
            continue

        has_score = df[model].notna()
        groups = {
            'ultra_rare': df[df.ultra_rare & has_score],
            'common': df[df.common & has_score],
        }

        detailed_counts[model] = {}
        for group_name, group_df in groups.items():
            entry = {
                'total_variants': len(group_df),
                'total_genes': group_df['GENE'].nunique(),
            }
            entry.update(_threshold_counts(group_df, model, percentile_thresholds))
            detailed_counts[model][group_name] = entry

    return detailed_counts


In [24]:
def calculate_weighted_ratios_with_errors(df, models, percentile_thresholds, bins=10):
    """Compute indel-length-weighted ultra-rare/common score ratios with standard errors.

    The rows of ``df`` are split into ``bins`` log-spaced indel-length bins.
    Within every bin holding at least 10 rows, and for every model and
    threshold, the ratio of the ultra-rare score percentile to the common
    score percentile is computed with ``safe_ratio`` and accumulated with the
    bin's weight.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide ``indel_length``, boolean ``ultra_rare`` / ``common``
        columns, and the columns needed by ``calculate_detailed_counts``.
    models : iterable of str
        Score column names; names that are not columns of ``df`` are skipped.
    percentile_thresholds : iterable of float
        Values forwarded unchanged to ``np.percentile`` (see the note in the body).
    bins : int, default 10
        Number of log-spaced indel-length bins.

    Returns
    -------
    tuple
        ``(weighted_ratios, weighted_errors, detailed_counts)``. The first two
        are ``{model: {threshold: float}}`` and contain only models that were
        seen in at least one sufficiently populated bin. ``detailed_counts`` is
        the result of ``calculate_detailed_counts`` on the whole ``df``.
    """
    weighted_ratios = {}
    weighted_errors = {}
    detailed_counts = calculate_detailed_counts(df, models, percentile_thresholds)

    lengths = df.indel_length

    # Log-spaced edges handle the skewed length distribution; the lower edge is
    # clamped to 1 so the logarithm is defined.
    bin_edges = np.logspace(np.log10(max(1, lengths.min())), np.log10(lengths.max()), bins + 1)

    # Per-bin weights: histogram *densities* rescaled to sum to 1 (kept as in
    # the original method). They are normalised over all bins, so skipping
    # sparse bins below leaves the accumulated weights summing to less than 1.
    hist, _ = np.histogram(lengths, bins=bin_edges, density=True)
    weights = hist / hist.sum()

    present_models = [model for model in models if model in df.columns]

    # Raw per-bin ratios and their weights, kept for the error estimate.
    bin_ratios = {model: {p: [] for p in percentile_thresholds} for model in present_models}
    bin_weights = {model: {p: [] for p in percentile_thresholds} for model in present_models}

    for i in range(bins):
        lower, upper = bin_edges[i], bin_edges[i + 1]
        # Half-open bins, except the last one which also includes its upper edge.
        if i < bins - 1:
            in_bin = (lengths >= lower) & (lengths < upper)
        else:
            in_bin = (lengths >= lower) & (lengths <= upper)
        bin_df = df[in_bin]

        # Skip bins with too few variants
        if len(bin_df) < 10:
            continue

        for model in present_models:
            if model not in weighted_ratios:
                weighted_ratios[model] = {p: 0.0 for p in percentile_thresholds}
                weighted_errors[model] = {p: 0.0 for p in percentile_thresholds}

            # The score subsets do not depend on the threshold: build them once per bin/model.
            ultra_data = bin_df[bin_df.ultra_rare][model].dropna()
            common_data = bin_df[bin_df.common][model].dropna()
            if len(ultra_data) == 0 or len(common_data) == 0:
                continue

            for p in percentile_thresholds:
                # NOTE(review): p is forwarded to np.percentile as given. np.percentile
                # reads q on a 0-100 scale, so a fractional threshold such as 0.01
                # selects the 0.01th percentile. The original code marks this as kept
                # on purpose to match earlier results - confirm before changing.
                ultra = np.percentile(ultra_data, p)
                common = np.percentile(common_data, p)

                ratio = safe_ratio(ultra, common)

                bin_ratios[model][p].append(ratio)
                bin_weights[model][p].append(weights[i])

                weighted_ratios[model][p] += ratio * weights[i]

    # Weighted standard error across the bins that contributed a ratio.
    for model in weighted_ratios:
        for p in percentile_thresholds:
            if len(bin_ratios[model][p]) > 1:
                bin_values = np.array(bin_ratios[model][p])
                bin_w = np.array(bin_weights[model][p])
                bin_w = bin_w / bin_w.sum()  # Renormalise over the contributing bins

                weighted_mean = np.average(bin_values, weights=bin_w)
                variance = np.average((bin_values - weighted_mean) ** 2, weights=bin_w)

                # Divide by the effective sample size of the weighted data.
                n_effective = 1.0 / np.sum(bin_w ** 2)
                weighted_errors[model][p] = np.sqrt(variance / n_effective)
            else:
                # Zero or one contributing bin: no spread to estimate.
                weighted_errors[model][p] = 0.0

    return weighted_ratios, weighted_errors, detailed_counts


In [25]:
def calculate_summary_statistics(weighted_ratios, model):
    """Summarise one model's weighted ratios across all thresholds.

    Returns a dict with the keys ``'mean'``, ``'median'``, ``'std'``,
    ``'variance'`` and ``'stderr'`` (population std divided by sqrt(n)).
    """
    per_threshold = list(weighted_ratios[model].values())
    spread = np.std(per_threshold)

    summary = {}
    summary['mean'] = np.mean(per_threshold)
    summary['median'] = np.median(per_threshold)
    summary['std'] = spread
    summary['variance'] = np.var(per_threshold)
    summary['stderr'] = spread / np.sqrt(len(per_threshold))
    return summary

def create_summary_table(df, models, name_mapping, percentile_thresholds):
    """Build the per-indel-class summary table of weighted ratio statistics.

    For every entry of the module-level ``indel_cuts`` the rows of ``df``
    selected by that mask are flagged as ultra-rare (MAF < 1e-5) or common
    (MAF >= 1e-3), the weighted ratios are computed with 10 length bins, and
    each model's ratios are summarised across thresholds.

    Parameters
    ----------
    df : pandas.DataFrame
        Variant table; must be indexable by the masks stored in ``indel_cuts``
        and provide ``MAF`` plus the columns required downstream.
    models : iterable of str
        Score column names to evaluate.
    name_mapping : dict
        Column name -> display name; unmapped models keep their column name.
    percentile_thresholds : iterable of float
        Thresholds forwarded to ``calculate_weighted_ratios_with_errors``.

    Returns
    -------
    tuple
        ``(summary_df, all_results)``. ``summary_df`` has one row per
        (indel class, model) with statistics formatted to three decimals,
        sorted by class and then by descending mean ratio. It is an empty
        frame with the same columns when no model produced a ratio.
        ``all_results`` maps ``cut_name -> {model: stats dict}``.
    """
    table_columns = [
        'Indel_Category', 'Model', 'Mean_Ratio', 'Median_Ratio',
        'Std_Dev', 'Std_Error', 'Variance',
    ]

    # Store results for each cut
    all_results = {}

    for cut_name, cut_mask in indel_cuts.items():
        df_cut = df[cut_mask].copy()
        df_cut['ultra_rare'] = df_cut.MAF < 0.00001
        df_cut['common'] = df_cut.MAF >= 0.001

        weighted_ratios, _errors, _counts = calculate_weighted_ratios_with_errors(
            df_cut, models, percentile_thresholds, bins=10
        )

        # Only models that actually produced ratios for this cut are summarised.
        all_results[cut_name] = {
            model: calculate_summary_statistics(weighted_ratios, model)
            for model in models
            if model in weighted_ratios
        }

    table_data = [
        {
            'Indel_Category': cut_name,
            'Model': name_mapping.get(model, model),
            'Mean_Ratio': f"{stats['mean']:.3f}",
            'Median_Ratio': f"{stats['median']:.3f}",
            'Std_Dev': f"{stats['std']:.3f}",
            'Std_Error': f"{stats['stderr']:.3f}",
            'Variance': f"{stats['variance']:.3f}"
        }
        for cut_name, cut_results in all_results.items()
        for model, stats in cut_results.items()
    ]

    # Explicit columns keep the schema stable when table_data is empty; without
    # them the 'Mean_Ratio' lookup below raises KeyError on an empty result.
    summary_df = pd.DataFrame(table_data, columns=table_columns)

    # Sort by category and mean ratio (numeric value of the formatted string)
    summary_df['Mean_Ratio_Numeric'] = summary_df['Mean_Ratio'].astype(float)
    summary_df = summary_df.sort_values(['Indel_Category', 'Mean_Ratio_Numeric'], ascending=[True, False])
    summary_df = summary_df.drop('Mean_Ratio_Numeric', axis=1)

    return summary_df, all_results


In [26]:
def create_detailed_counts_table(df, models, name_mapping, percentile_thresholds):
    """Tabulate variant and gene counts per indel class, model and threshold.

    For every mask in the module-level ``indel_cuts`` the selected rows are
    flagged as ultra-rare (MAF < 1e-5) or common (MAF >= 1e-3) and passed to
    ``calculate_detailed_counts``. Each model present in that result
    contributes one 'Total' row followed by one row per threshold, labelled
    ``p*100`` with one decimal and a percent sign.

    Returns a DataFrame with the columns Indel_Category, Model, Percentile,
    Ultra_Rare_Variants, Ultra_Rare_Genes, Common_Variants and Common_Genes.
    """
    rows = []

    for cut_name, cut_mask in indel_cuts.items():
        df_cut = df[cut_mask].copy()
        df_cut['ultra_rare'] = df_cut.MAF < 0.00001
        df_cut['common'] = df_cut.MAF >= 0.001

        detailed_counts = calculate_detailed_counts(df_cut, models, percentile_thresholds)

        for model in models:
            per_model = detailed_counts.get(model)
            if per_model is None:
                # Model column was not available for this cut.
                continue

            rare = per_model['ultra_rare']
            frequent = per_model['common']
            display_name = name_mapping.get(model, model)

            def make_row(label, rare_variants, rare_genes, common_variants, common_genes):
                """Assemble one output record for the current cut and model."""
                return {
                    'Indel_Category': cut_name,
                    'Model': display_name,
                    'Percentile': label,
                    'Ultra_Rare_Variants': rare_variants,
                    'Ultra_Rare_Genes': rare_genes,
                    'Common_Variants': common_variants,
                    'Common_Genes': common_genes
                }

            # Overall counts first ...
            rows.append(make_row(
                'Total',
                rare['total_variants'], rare['total_genes'],
                frequent['total_variants'], frequent['total_genes'],
            ))

            # ... then one record per threshold.
            for p in percentile_thresholds:
                rows.append(make_row(
                    f'{p*100:.1f}%',
                    rare[p]['variants'], rare[p]['genes'],
                    frequent[p]['variants'], frequent[p]['genes'],
                ))

    return pd.DataFrame(rows)

def create_summary_counts_table(counts_df):
    """Condense the detailed counts table to one 'Total' row per indel class and model.

    Parameters
    ----------
    counts_df : pandas.DataFrame
        Table with the columns Indel_Category, Model, Percentile,
        Ultra_Rare_Variants, Ultra_Rare_Genes, Common_Variants, Common_Genes.

    Returns
    -------
    pandas.DataFrame
        One row per (Indel_Category, Model) that has a row with
        ``Percentile == 'Total'``, carrying the total counts and the
        ultra-rare/common ratios for variants and genes. The ratio denominator
        is clamped to at least 1 so a zero common count does not divide by zero.
        An empty ``counts_df`` yields an empty frame with the same columns.
    """
    summary_columns = [
        'Indel_Category', 'Model',
        'Ultra_Rare_Variants_Total', 'Ultra_Rare_Genes_Total',
        'Common_Variants_Total', 'Common_Genes_Total',
        'Variant_Ratio_UR_to_Common', 'Gene_Ratio_UR_to_Common',
    ]

    # An empty table (e.g. no model column matched upstream) has no columns to
    # look up; return the expected schema instead of raising KeyError.
    if counts_df.empty:
        return pd.DataFrame(columns=summary_columns)

    # Only the 'Total' rows matter here: filter them once, not per pair.
    totals = counts_df[counts_df['Percentile'] == 'Total']

    summary_data = []
    for cut in counts_df['Indel_Category'].unique():
        cut_totals = totals[totals['Indel_Category'] == cut]
        for model in counts_df['Model'].unique():
            match = cut_totals[cut_totals['Model'] == model]
            if match.empty:
                continue

            row = match.iloc[0]
            summary_data.append({
                'Indel_Category': cut,
                'Model': model,
                'Ultra_Rare_Variants_Total': row['Ultra_Rare_Variants'],
                'Ultra_Rare_Genes_Total': row['Ultra_Rare_Genes'],
                'Common_Variants_Total': row['Common_Variants'],
                'Common_Genes_Total': row['Common_Genes'],
                'Variant_Ratio_UR_to_Common': row['Ultra_Rare_Variants'] / max(1, row['Common_Variants']),
                'Gene_Ratio_UR_to_Common': row['Ultra_Rare_Genes'] / max(1, row['Common_Genes'])
            })

    return pd.DataFrame(summary_data, columns=summary_columns)


In [27]:
# Define models and parameters
# `models` lists the score columns of `df` to evaluate. The count and ratio
# functions skip any name that is not a column of the frame they receive.
models = [
    'mean_cross_entropy_diff_hyenadna-tiny-1k-seqlen',
    'mean_cross_entropy_diff_hyenadna-medium-450k-seqlen',
    'mean_cross_entropy_diff_hyenadna-medium-160k-seqlen',
    'mean_cross_entropy_diff_hyenadna-large-1m-seqlen',
    'mean_cross_entropy_diff_hyenadna-small-32k-seqlen',
    'mean_cross_entropy_diff_caduceus-ph_seqlen-131k_d_model-256_n_layer-16',
    'mean_cross_entropy_diff_caduceus-ps_seqlen-131k_d_model-256_n_layer-16',
    'mean_cross_entropy_diff_DNABERT-2-117M',
    'mean_cross_entropy_diff_nucleotide-transformer-2.5b-multi-species',
    'mean_cross_entropy_diff_nucleotide-transformer-2.5b-1000g',
    'mean_cross_entropy_diff_nucleotide-transformer-500m-human-ref',
    'mean_cross_entropy_diff_nucleotide-transformer-v2-500m-multi-species',
    'CADD_raw_score',
    'LOL-EVE_AF',
    'PhyloP',
    'mean_cross_entropy_diff_convnet',
    'GC_percentage_delta',
    'dist_tss',
    'max_track_single',
    'mean_diff_evo_1_131k_base',
    'mean_cross_entropy_diff_johahi/specieslm-metazoa-upstream-k6',
    'mean_cross_entropy_diff_songlab/gpn-animal-promoter',
    'mean_cross_entropy_diff_evo2-7b'
]

# Score column name -> display name used in the output tables. A model without
# an entry here is reported under its raw column name (name_mapping.get(model, model)).
name_mapping = {
    'mean_cross_entropy_diff_hyenadna-tiny-1k-seqlen': 'HyenaDNA-tiny',
    'mean_cross_entropy_diff_hyenadna-medium-450k-seqlen': 'HyenaDNA-medium-450k',
    'mean_cross_entropy_diff_hyenadna-medium-160k-seqlen': 'HyenaDNA-medium-160k',
    'mean_cross_entropy_diff_hyenadna-large-1m-seqlen': 'HyenaDNA-large',
    'mean_cross_entropy_diff_hyenadna-small-32k-seqlen': 'HyenaDNA-small',
    'mean_cross_entropy_diff_caduceus-ph_seqlen-131k_d_model-256_n_layer-16': 'Caduceus-ph',
    'mean_cross_entropy_diff_caduceus-ps_seqlen-131k_d_model-256_n_layer-16': 'Caduceus-ps',
    'mean_cross_entropy_diff_DNABERT-2-117M': 'DNABERT-2',
    'mean_cross_entropy_diff_nucleotide-transformer-2.5b-multi-species': 'NT-2.5b-multi',
    'mean_cross_entropy_diff_nucleotide-transformer-2.5b-1000g': 'NT-2.5b-1000g',
    'mean_cross_entropy_diff_nucleotide-transformer-500m-human-ref': 'NT-500m',
    'mean_cross_entropy_diff_nucleotide-transformer-v2-500m-multi-species': 'NT-v2-500m',
    'CADD_raw_score': 'CADD',
    'LOL-EVE_AF': 'LOL-EVE',
    'PhyloP': 'PhyloP',
    'mean_cross_entropy_diff_convnet': 'GPN',
    'GC_percentage_delta': 'GC Content Δ',
    'mean_cross_entropy_diff_johahi/specieslm-metazoa-upstream-k6': 'speciesLM',
    'mean_cross_entropy_diff_songlab/gpn-animal-promoter': 'gpn_promoter',
    'mean_cross_entropy_diff_evo2-7b': 'evo2',
    'mean_diff_evo_1_131k_base': 'evo1',
    'max_track_single': 'Enformer',
    'dist_tss':'Distance TSS'
}

# Score thresholds written as fractions: the detailed counts table labels each
# one as p*100 percent (0.01 -> '1.0%').
# NOTE(review): np.percentile interprets q on a 0-100 scale; check how each
# consumer of these values scales them before relying on the labels.
percentile_thresholds = np.array([0.01, 0.025, 0.05, 0.1])

# Flag the two allele-frequency groups on the full table before building any output.
df['ultra_rare'] = df['MAF'] < 0.00001
df['common'] = df['MAF'] >= 0.001

print("Generating CSV tables...")

# 1) Summary statistics of the weighted ratios, per indel class and model
print("Creating summary statistics table...")
summary_table, summary_stats = create_summary_table(
    df,
    models,
    name_mapping,
    percentile_thresholds,
)
summary_table.to_csv('workshop_summary_table.csv', index=False)
print("✓ Summary table saved: 'workshop_summary_table.csv'")

# 2) Variant / gene counts in total and at every threshold
print("Creating detailed counts table...")
detailed_counts_table = create_detailed_counts_table(
    df,
    models,
    name_mapping,
    percentile_thresholds,
)
detailed_counts_table.to_csv('workshop_detailed_counts.csv', index=False)
print("✓ Detailed counts table saved: 'workshop_detailed_counts.csv'")

# 3) Totals only, with ultra-rare/common ratios
print("Creating summary counts table...")
summary_counts_table = create_summary_counts_table(detailed_counts_table)
summary_counts_table.to_csv('workshop_counts_summary.csv', index=False)
print("✓ Summary counts table saved: 'workshop_counts_summary.csv'")

# 4) Same content as the detailed counts, written under a second file name
print("Creating percentile counts table...")
percentile_counts_table = detailed_counts_table.copy()
percentile_counts_table.to_csv('workshop_percentile_counts.csv', index=False)
print("✓ Percentile counts table saved: 'workshop_percentile_counts.csv'")

# Closing banner (preceded by one blank line) and the list of written files
print(
    "",
    "=" * 60,
    "ALL CSV TABLES GENERATED SUCCESSFULLY!",
    "=" * 60,
    sep="\n",
)
print(
    "Files created:",
    "1. workshop_summary_table.csv - Summary statistics (mean, median, std, etc.)",
    "2. workshop_detailed_counts.csv - Detailed variant/gene counts by percentile",
    "3. workshop_counts_summary.csv - Summary of total counts and ratios",
    "4. workshop_percentile_counts.csv - Same as detailed counts (for compatibility)",
    sep="\n",
)

# Show the first rows of the summary table
print("\nSample of summary table:")
print(summary_table.head(10))


Generating CSV tables...
Creating summary statistics table...
✓ Summary table saved: 'workshop_summary_table.csv'
Creating detailed counts table...
✓ Detailed counts table saved: 'workshop_detailed_counts.csv'
Creating summary counts table...
✓ Summary counts table saved: 'workshop_counts_summary.csv'
Creating percentile counts table...
✓ Percentile counts table saved: 'workshop_percentile_counts.csv'

ALL CSV TABLES GENERATED SUCCESSFULLY!
Files created:
1. workshop_summary_table.csv - Summary statistics (mean, median, std, etc.)
2. workshop_detailed_counts.csv - Detailed variant/gene counts by percentile
3. workshop_counts_summary.csv - Summary of total counts and ratios
4. workshop_percentile_counts.csv - Same as detailed counts (for compatibility)

Sample of summary table:
           Indel_Category                 Model Mean_Ratio Median_Ratio  \
58  Large Indel (16-50bp)                  CADD      2.122        2.123   
59  Large Indel (16-50bp)               LOL-EVE      2.096    