In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import glob
import numpy as np
import seaborn as sns
from scipy import stats
from collections import Counter

In [None]:
# Load the table that create_plots() reads through the module-level `df`
# (it indexes a `MAF_mask` column plus one column per entry in its `models` list).
# NOTE(review): the path below is an empty string, which is not a readable
# file — fill in the real CSV location before running this cell.
df = pd.read_csv('')

In [None]:


def safe_ratio(ultra, common):
    """Return ``ultra / common``, or the neutral value 1.0 when undefined.

    The neutral value is returned when the denominator is zero, when either
    operand is NaN, or when the quotient itself is infinite or NaN.
    """
    unusable_inputs = common == 0 or np.isnan(common) or np.isnan(ultra)
    if unusable_inputs:
        return 1.0

    quotient = ultra / common
    # A non-finite quotient (inf or NaN) is reported as "no difference".
    return quotient if np.isfinite(quotient) else 1.0

def create_plots(data=None):
    """Plot the ultra-rare / common percentile ratio for every score column.

    For each column in ``models`` and each percentile in ``thresholds``, the
    percentile value is computed separately over the rows where ``MAF_mask``
    is true (treated as the ultra-rare group) and the rows where it is false
    (the common group). The ratio of the two is drawn as one line per column
    on a log-scaled percentile axis. The LOL-EVE line is highlighted in
    green; every other line takes the next pastel colour.

    Args:
        data: DataFrame with a boolean ``MAF_mask`` column and the score
            columns listed in ``models``. Defaults to the module-level ``df``.

    Returns:
        The matplotlib figure holding the plot.
    """
    import warnings

    if data is None:
        data = df

    # Use all models
    models = [
        'mean_cross_entropy_diff_hyenadna-tiny-1k-seqlen',
        'mean_cross_entropy_diff_hyenadna-medium-450k-seqlen',
        'mean_cross_entropy_diff_hyenadna-medium-160k-seqlen',
        'mean_cross_entropy_diff_hyenadna-large-1m-seqlen',
        'mean_cross_entropy_diff_hyenadna-small-32k-seqlen',
        'mean_cross_entropy_diff_caduceus-ph_seqlen-131k_d_model-256_n_layer-16',
        'mean_cross_entropy_diff_caduceus-ps_seqlen-131k_d_model-256_n_layer-16',
        'mean_cross_entropy_diff_DNABERT-2-117M',
        'mean_cross_entropy_diff_nucleotide-transformer-2.5b-multi-species',
        'mean_cross_entropy_diff_nucleotide-transformer-2.5b-1000g',
        'mean_cross_entropy_diff_nucleotide-transformer-500m-human-ref',
        'mean_cross_entropy_diff_nucleotide-transformer-v2-500m-multi-species',
        'CADD_raw_score',
        'LOL-EVE',
        'PhyloP',
        'mean_cross_entropy_diff_convnet',
        'dist_tss',
        'GC_percentage_delta',
        'Enformer',
    ]

    thresholds = np.array([0.001, 0.01, 0.1, 1])

    # Split once instead of re-filtering the whole frame for every
    # (model, threshold) pair.
    ultra_rows = data[data.MAF_mask]
    common_rows = data[~data.MAF_mask]

    # Calculate ratios; a failed calculation falls back to the neutral 1.0
    # but is reported instead of being swallowed silently.
    all_ratios = []
    for column in models:
        model_ratios = []
        for pct in thresholds:
            try:
                ultra = np.percentile(ultra_rows[column], pct)
                common = np.percentile(common_rows[column], pct)
            except (KeyError, TypeError, ValueError, IndexError) as err:
                warnings.warn(
                    f"Could not compute percentile {pct} for {column!r} "
                    f"({err!r}); using ratio 1.0",
                    RuntimeWarning,
                    stacklevel=2,
                )
                model_ratios.append(1.0)  # fallback value
            else:
                model_ratios.append(safe_ratio(ultra, common))
        all_ratios.append(model_ratios)

    all_ratios = np.array(all_ratios)

    # Filter out any rows with all ones (failed calculations)
    valid_indices = ~np.all(all_ratios == 1.0, axis=1)
    all_ratios = all_ratios[valid_indices]
    models = [m for m, keep in zip(models, valid_indices) if keep]

    # Name mapping
    name_mapping = {
        'mean_cross_entropy_diff_hyenadna-tiny-1k-seqlen': 'HyenaDNA-tiny',
        'mean_cross_entropy_diff_hyenadna-medium-450k-seqlen': 'HyenaDNA-medium-450k',
        'mean_cross_entropy_diff_hyenadna-medium-160k-seqlen': 'HyenaDNA-medium-160k',
        'mean_cross_entropy_diff_hyenadna-large-1m-seqlen': 'HyenaDNA-large',
        'mean_cross_entropy_diff_hyenadna-small-32k-seqlen': 'HyenaDNA-small',
        'mean_cross_entropy_diff_caduceus-ph_seqlen-131k_d_model-256_n_layer-16': 'Caduceus-ph',
        'mean_cross_entropy_diff_caduceus-ps_seqlen-131k_d_model-256_n_layer-16': 'Caduceus-ps',
        'mean_cross_entropy_diff_DNABERT-2-117M': 'DNABERT-2',
        'mean_cross_entropy_diff_nucleotide-transformer-2.5b-multi-species': 'NT-2.5b-multi',
        'mean_cross_entropy_diff_nucleotide-transformer-2.5b-1000g': 'NT-2.5b-1000g',
        'mean_cross_entropy_diff_nucleotide-transformer-500m-human-ref': 'NT-500m',
        'mean_cross_entropy_diff_nucleotide-transformer-v2-500m-multi-species': 'NT-v2-500m',
        'CADD_raw_score': 'CADD',
        # The column used in `models` is 'LOL-EVE'; map it explicitly so the
        # highlight does not depend on the fallback string parsing below.
        'LOL-EVE': 'LOL-EVE',
        'LOL-EVE_AF': 'LOL-EVE',
        'PhyloP': 'PhyloP',
        'mean_cross_entropy_diff_convnet': 'GPN',
        'dist_tss': 'Distance to TSS',
        'GC_percentage_delta': 'GC Content Δ',
        'Enformer': 'Enformer',
    }

    # Process names (unmapped columns fall back to a trimmed column name)
    model_names = [
        name_mapping.get(m, m.split('mean_cross_entropy_diff_')[-1].split('_seqlen')[0])
        for m in models
    ]

    # Create pastel colors for non-LOL-EVE models. Sizing the palette by the
    # number of non-highlighted lines and consuming it sequentially keeps the
    # colours distinct whether or not LOL-EVE survived the filter above.
    highlighted = 'LOL-EVE'
    n_other = sum(name != highlighted for name in model_names)
    pastel_colors = iter(sns.color_palette("pastel", n_colors=max(n_other, 1)))

    # Create figure
    fig, ax = plt.subplots(figsize=(15, 8))

    # Plot lines
    for name, ratios in zip(model_names, all_ratios):
        if name == highlighted:
            ax.plot(thresholds, ratios, marker='o', label=name,
                    color='#00aa55', linewidth=3, markersize=8, zorder=3)
        else:
            ax.plot(thresholds, ratios, marker='o', label=name,
                    color=next(pastel_colors), linewidth=2, markersize=6, alpha=0.7)

    ax.set_xscale('log')
    ax.set_xlabel('Percentile', fontsize=16)
    ax.set_ylabel('Ratio (Ultra-rare / Common)', fontsize=16)

    # Remove top and right spines
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)

    # Add red dashed line at y=1
    ax.axhline(y=1, color='red', linestyle='--', linewidth=2, alpha=0.8, zorder=1)

    # Add light gridlines
    ax.grid(True, alpha=0.3, zorder=0)

    # Customize legend with two columns (skipped when nothing was plotted)
    if model_names:
        ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=12, ncol=2)

    # Set y-axis limits safely: with no finite ratios left (every model
    # filtered out) fall back to the default upper bound instead of calling
    # max() on an empty array.
    finite_ratios = all_ratios[np.isfinite(all_ratios)]
    upper = max(finite_ratios.max() * 1.1, 2.2) if finite_ratios.size else 2.2
    ax.set_ylim(0.5, upper)

    fig.tight_layout()
    return fig

# Build the figure, then write it to disk through its own handle
fig = create_plots()
fig.savefig('ultra_rare_vs_common_appendix.png')