In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import precision_recall_curve, auc
import seaborn as sns

In [None]:
# Directory that contains the LOL-EVE checkout. The value below is a
# placeholder: replace it with a real path before running the notebook.
current_dir = '<path_to_repo>'

In [None]:
# Load the eQTL benchmark table. Later cells read its `pip` column and one
# score column per entry in `models`.
# NOTE(review): "caual" in the file name looks like a typo for "causal" --
# confirm against the file on disk before changing the path.
df_merged = pd.read_csv(f'{current_dir}/LOL-EVE/data/benchmark_data/caual_eqtls_benchmark.csv')

In [None]:
def classify_variant(row, threshold):
    """Label one variant as 'causal' or 'background' from its PIP value.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) with a 'pip' entry.
        threshold: Minimum 'pip' value, inclusive, for the 'causal' label.

    Returns:
        'causal' when ``row['pip'] >= threshold``, otherwise 'background'.
        A NaN 'pip' fails the comparison and is therefore 'background'.
    """
    if row['pip'] >= threshold:
        return 'causal'
    return 'background'

def cohen_d(x, y):
    """Standardized difference between the means of two samples.

    Computes ``(mean(x) - mean(y)) / sqrt((var(x) + var(y)) / 2)``. The two
    variances come from ``np.var`` called with its default arguments and are
    averaged without weighting by sample size.

    Args:
        x: First sample (array-like of numbers).
        y: Second sample (array-like of numbers).

    Returns:
        The effect size; positive when ``x`` has the larger mean.
    """
    mean_gap = np.mean(x) - np.mean(y)
    avg_variance = (np.var(x) + np.var(y)) / 2
    return mean_gap / np.sqrt(avg_variance)

def compute_metrics_and_counts(df, model, pip_cutoff):
    """Score how well ``|df[model]|`` separates causal from background variants.

    Rows with ``pip >= pip_cutoff`` are treated as causal and every other row
    (including rows whose ``pip`` is NaN) as background. ``df`` is only read:
    no helper columns are written to it, so sweeping many cutoffs leaves the
    caller's frame untouched.

    Args:
        df: DataFrame with a ``pip`` column and a numeric ``model`` column.
        model: Name of the score column to evaluate.
        pip_cutoff: Inclusive PIP threshold for the causal label.

    Returns:
        Tuple ``(cohens_d, normalized_auprc, causal_count, background_count)``:
        ``cohens_d`` is ``cohen_d`` of the causal vs. background absolute
        scores; ``normalized_auprc`` is the trapezoidal area under the
        precision-recall curve divided by the causal fraction of ``df``; the
        two counts are the group sizes as Python ints.
    """
    # Vectorized equivalent of labelling each row with `pip >= pip_cutoff`.
    is_causal = df['pip'] >= pip_cutoff
    abs_scores = df[model].abs()

    cohens_d = cohen_d(abs_scores[is_causal], abs_scores[~is_causal])

    precision, recall, _ = precision_recall_curve(is_causal.astype(int), abs_scores)
    auprc = auc(recall, precision)

    causal_count = int(is_causal.sum())
    background_count = len(df) - causal_count

    # Fraction of causal rows: the value the AUPRC is normalized by.
    baseline = causal_count / len(df)
    normalized_auprc = auprc / baseline

    return cohens_d, normalized_auprc, causal_count, background_count

# Score columns of df_merged to benchmark; each entry is used as a column name.
# NOTE(review): a later cell lists 'phylop' / 'loli_score' where this list has
# 'PhyloP' / 'LOL-EVE' -- confirm which spellings exist as columns in the CSV.
models = [
    'PhyloP', 'mean_cross_entropy_diff_hyenadna-tiny-1k-seqlen',
    'mean_cross_entropy_diff_hyenadna-medium-450k-seqlen',
    'mean_cross_entropy_diff_hyenadna-medium-160k-seqlen',
    'mean_cross_entropy_diff_hyenadna-large-1m-seqlen',
    'mean_cross_entropy_diff_hyenadna-small-32k-seqlen',
    'mean_cross_entropy_diff_caduceus-ph_seqlen-131k_d_model-256_n_layer-16',
    'mean_cross_entropy_diff_caduceus-ps_seqlen-131k_d_model-256_n_layer-16',
    'mean_cross_entropy_diff_DNABERT-2-117M',
    'mean_cross_entropy_diff_nucleotide-transformer-2.5b-multi-species',
    'mean_cross_entropy_diff_nucleotide-transformer-2.5b-1000g',
    'mean_cross_entropy_diff_nucleotide-transformer-500m-human-ref',
    'mean_cross_entropy_diff_nucleotide-transformer-v2-500m-multi-species',
    'LOL-EVE',
]

# PIP thresholds to sweep: 10 evenly spaced values running from 0.95 down to 0.7
pip_cutoffs = np.linspace(0.95, 0.7, num=10)

results = []  # one metrics record per (model, cutoff) pair
counts = []   # one DataFrame of group sizes per model, in the order of `models`

for model in models:
    print(f"Processing {model}")
    model_counts = []
    for pip_cutoff in pip_cutoffs:
        effect_size, auprc_ratio, n_causal, n_background = compute_metrics_and_counts(
            df_merged, model, pip_cutoff
        )
        results.append({
            'Model': model,
            'PIP Cutoff': pip_cutoff,
            "Cohen's D": effect_size,
            'Normalized AUPRC': auprc_ratio,
        })
        model_counts.append({
            'PIP Cutoff': pip_cutoff,
            'Causal Count': n_causal,
            'Background Count': n_background,
        })
    counts.append(pd.DataFrame(model_counts))

results_df = pd.DataFrame(results)

# Stack the per-model count tables, keep the model name as a regular column and
# discard the positional level that concat adds.
counts_df = (
    pd.concat(counts, keys=models, names=['Model', 'Index'])
    .reset_index()
    .drop(columns='Index')
)

# Line color for each model in the plots; keys are the entries of `models`.
colors = {
    "LOL-EVE": "#00aa55",  # green
    "PhyloP": "#FF9AA2",  # light pink
    "mean_cross_entropy_diff_hyenadna-tiny-1k-seqlen": "#A8E6CF",  # light green
    "mean_cross_entropy_diff_hyenadna-medium-450k-seqlen": "#A2D2FF",  # light blue
    "mean_cross_entropy_diff_hyenadna-medium-160k-seqlen": "#FDFD96",  # light yellow
    "mean_cross_entropy_diff_hyenadna-large-1m-seqlen": "#FFB347",  # light orange
    "mean_cross_entropy_diff_hyenadna-small-32k-seqlen": "#E0AAFF",  # light purple
    "mean_cross_entropy_diff_caduceus-ph_seqlen-131k_d_model-256_n_layer-16": "#A3C1AD",  # light sage
    "mean_cross_entropy_diff_caduceus-ps_seqlen-131k_d_model-256_n_layer-16": "#B19CD9",  # light lavender
    "mean_cross_entropy_diff_DNABERT-2-117M": "#FFD1DC",  # pale pink
    "mean_cross_entropy_diff_nucleotide-transformer-2.5b-multi-species": "#AFEEEE",  # pale turquoise
    "mean_cross_entropy_diff_nucleotide-transformer-2.5b-1000g": "#FFE4E1",  # misty rose
    "mean_cross_entropy_diff_nucleotide-transformer-500m-human-ref": "#D0F0C0",  # tea green
    "mean_cross_entropy_diff_nucleotide-transformer-v2-500m-multi-species": "#F0E68C",  # khaki
}

def get_color(model):
    """Return the plot color registered for `model` in the `colors` dict.

    Args:
        model: Model name to look up.

    Returns:
        The hex color string stored under `model`, or gray ("#808080") when
        the model has no entry.
    """
    fallback = "#808080"
    return colors[model] if model in colors else fallback


# Tall 20 x 24 canvas for the two stacked panels drawn further down.
plt.figure(figsize=(20, 24))
# NOTE(review): the style is set after the figure already exists; presumably it
# still applies to the axes created later -- confirm the ordering is intended.
sns.set_style("whitegrid")

def simplify_model_name(name):
    """Shorten a model column name for use as a legend label.

    Removes every occurrence of 'mean_cross_entropy_diff_' and then
    abbreviates 'nucleotide-transformer' to 'NT'. Names containing neither
    substring are returned unchanged.

    Args:
        name: Full model / column name.

    Returns:
        The shortened name.
    """
    # str.replace is a no-op when the substring is absent, so no guards needed.
    without_prefix = name.replace('mean_cross_entropy_diff_', '')
    return without_prefix.replace('nucleotide-transformer', 'NT')

def _draw_metric_lines(ax, metric):
    """Plot `metric` against PIP cutoff on `ax`, one labelled line per model."""
    for model in results_df['Model'].unique():
        model_data = results_df[results_df['Model'] == model]
        ax.plot(
            model_data['PIP Cutoff'],
            model_data[metric],
            label=simplify_model_name(model),
            color=get_color(model),
            linewidth=2.5,
            marker='o',
            markersize=8,
        )


fig = plt.gcf()

# Top panel: Cohen's D (x label left empty; the bottom panel carries it)
ax1 = plt.subplot(2, 1, 1)
_draw_metric_lines(ax1, "Cohen's D")
ax1.set_title("Cohen's D vs PIP Cutoff", fontsize=24)
ax1.set_xlabel('')
ax1.set_ylabel("Cohen's D", fontsize=22)
ax1.tick_params(axis='both', which='major', labelsize=18)
ax1.grid(True, which='both', linestyle='--', linewidth=0.5)
ax1.set_ylim(-0.25, 0.35)  # fixed y-range; values outside it are clipped

# Bottom panel: normalized AUPRC
ax2 = plt.subplot(2, 1, 2)
_draw_metric_lines(ax2, 'Normalized AUPRC')
ax2.set_title('Normalized AUPRC vs PIP Cutoff', fontsize=24)
ax2.set_xlabel('PIP Cutoff', fontsize=22)
ax2.set_ylabel('Normalized AUPRC', fontsize=22)
ax2.tick_params(axis='both', which='major', labelsize=18)
ax2.grid(True, which='both', linestyle='--', linewidth=0.5)
ax2.set_ylim(0.85, 1.55)  # fixed y-range; values outside it are clipped

# One figure-level legend, built from the bottom panel's lines (both panels
# plot the same models in the same order).
handles, labels = ax2.get_legend_handles_labels()
fig.legend(
    handles,
    labels,
    loc='center left',
    bbox_to_anchor=(.9, 0.5),
    fontsize=25,
    borderaxespad=0,
    frameon=True,
    fancybox=True,
    shadow=True,
)

# Keep the right-hand 15% of the canvas free for the legend.
fig.tight_layout(rect=[0, 0, 0.85, 1])
plt.savefig('improved_eqtl_metrics_vs_pip_cutoff.png', dpi=300, bbox_inches='tight')
plt.show()

print("Improved plot saved as 'improved_eqtl_metrics_vs_pip_cutoff.png'")

In [None]:
import numpy as np
import pandas as pd
from scipy.stats import mannwhitneyu
from sklearn.metrics import roc_auc_score, precision_recall_curve, auc

# Work on a private copy of the full dataset so that the columns this cell adds
# ('pip_group', 'binary_label') are not written back into df_merged.
temp = df_merged.copy()

def classify_variant(row, threshold=0.95):
    """Label one variant as 'causal' or 'background' from its PIP value.

    The threshold is optional so the function works both as a one-argument
    callback (``df.apply(classify_variant, axis=1)``) and with an explicit
    cutoff, matching the two-argument definition used earlier in the notebook.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) with a 'pip' entry.
        threshold: Minimum 'pip' value, inclusive, for the 'causal' label.
            Defaults to 0.95.

    Returns:
        'causal' when ``row['pip'] >= threshold``, otherwise 'background'.
        A missing (NaN) 'pip' fails the comparison and is labelled
        'background', so variants without a PIP are never counted as causal.
    """
    return 'causal' if row['pip'] >= threshold else 'background'

def cohen_d(x, y):
    """Effect size: mean difference scaled by the root of the averaged variance.

    Same formula as the ``cohen_d`` defined in the earlier cell; this
    definition replaces it once this cell has run.

    Args:
        x: First sample (array-like of numbers).
        y: Second sample (array-like of numbers).

    Returns:
        ``(mean(x) - mean(y)) / sqrt((var(x) + var(y)) / 2)``, with the
        variances taken from ``np.var`` using its default arguments.
    """
    variance_sum = np.var(x) + np.var(y)
    pooled_sd = np.sqrt(variance_sum / 2)
    return (np.mean(x) - np.mean(y)) / pooled_sd

def compute_ranking_statistic(df, model):
    """Fraction of genes whose causal variants out-score their background ones.

    Rows are grouped by 'PROMOTER_GENE'. A gene is counted only when it has at
    least one 'causal' and one 'background' row in 'pip_group'; it counts as a
    win when the mean absolute `model` score of its causal rows is strictly
    greater than that of its background rows.

    Args:
        df: DataFrame with 'PROMOTER_GENE', 'pip_group' and `model` columns.
        model: Name of the score column to compare.

    Returns:
        wins / counted genes, or 0 when no gene has both groups.
    """
    wins = 0
    eligible = 0

    for _, members in df.groupby('PROMOTER_GENE'):
        is_causal = members['pip_group'] == 'causal'
        is_background = members['pip_group'] == 'background'
        # Genes lacking either group cannot be ranked; skip them.
        if not (is_causal.any() and is_background.any()):
            continue

        eligible += 1
        abs_scores = members[model].abs()
        if abs_scores[is_causal].mean() > abs_scores[is_background].mean():
            wins += 1

    if eligible == 0:
        return 0
    return wins / eligible

# Label every row with classify_variant (called once per row) and store the
# result in a 'pip_group' column of temp.
temp['pip_group'] = temp.apply(classify_variant, axis=1)

# Score columns evaluated by the loop that follows; each entry is used as a
# column name of temp.
# NOTE(review): the earlier cell lists 'PhyloP' / 'LOL-EVE' where this list has
# 'phylop' / 'loli_score' -- confirm which spellings exist as columns in the CSV.
models = [
    'phylop', 'mean_cross_entropy_diff_hyenadna-tiny-1k-seqlen',
    'mean_cross_entropy_diff_hyenadna-medium-450k-seqlen',
    'mean_cross_entropy_diff_hyenadna-medium-160k-seqlen',
    'mean_cross_entropy_diff_hyenadna-large-1m-seqlen',
    'mean_cross_entropy_diff_hyenadna-small-32k-seqlen',
    'mean_cross_entropy_diff_caduceus-ph_seqlen-131k_d_model-256_n_layer-16',
    'mean_cross_entropy_diff_caduceus-ps_seqlen-131k_d_model-256_n_layer-16',
    'mean_cross_entropy_diff_DNABERT-2-117M',
    'mean_cross_entropy_diff_nucleotide-transformer-2.5b-multi-species',
    'mean_cross_entropy_diff_nucleotide-transformer-2.5b-1000g',
    'mean_cross_entropy_diff_nucleotide-transformer-500m-human-ref',
    'mean_cross_entropy_diff_nucleotide-transformer-v2-500m-multi-species',
    'loli_score',
]

results = []

# The group masks, the 0/1 labels and the causal fraction depend only on
# 'pip_group', not on the model, so they are computed once, outside the loop.
causal_mask = temp['pip_group'] == 'causal'
background_mask = temp['pip_group'] == 'background'
temp['binary_label'] = causal_mask.astype(int)
baseline = int(causal_mask.sum()) / len(temp)  # fraction of causal rows

for model in models:
    print(model)
    abs_scores = temp[model].abs()

    cohens_d = cohen_d(abs_scores[causal_mask], abs_scores[background_mask])

    # Trapezoidal area under the precision-recall curve of |score| vs. label.
    # NOTE(review): an AUROC used to be computed here but was never reported;
    # add it to the record below if it is wanted.
    precision, recall, _ = precision_recall_curve(temp['binary_label'], abs_scores)
    auprc = auc(recall, precision)

    results.append({
        'Model': model,
        "Cohen's d": cohens_d,
        # Stored under 'AUPRC', but the value is AUPRC divided by the baseline.
        'AUPRC': auprc / baseline,
    })

# Convert results to a DataFrame for easy viewing and further analysis
results_df = pd.DataFrame(results)
print(results_df)
