In [None]:
import glob
import os
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats

In [None]:
# Each list holds the lower-cased string form of every non-null entry in the
# 'Gene Symbol' column of the corresponding file.
consistent_expression_genes = [
    str(symbol).lower()
    for symbol in pd.read_csv('gtex_consistent_genes_500.txt')['Gene Symbol'].dropna()
]
variable_expression_genes = [
    str(symbol).lower()
    for symbol in pd.read_csv('gtex_variable_genes_500.txt')['Gene Symbol'].dropna()
]


In [None]:
# Directory that contains the `LOL-EVE/` folder; used as the prefix of the benchmark CSV path.
current_dir = ''

In [None]:
# Build the path with os.path.join so that an empty `current_dir` resolves relative to
# the working directory instead of producing a root-anchored '/LOL-EVE/...' path.
df = pd.read_csv(
    os.path.join(current_dir, 'LOL-EVE', 'data', 'benchmark_data', 'tfbs_disruptions.csv')
)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from statsmodels.stats.multitest import fdrcorrection
from scipy.stats import pointbiserialr
from mpl_toolkits.axes_grid1 import make_axes_locatable
import random

# 1. Function to compare transcription factors across groups and calculate statistics
def compare_tf_across_groups(df, group1_genes, group2_genes, score_columns):
    """
    Compare each TF's scores between two groups of genes for all score columns.

    For every (TF, score column) pair that has at least one non-missing score
    in both groups, this computes a two-sided Mann-Whitney U test, the
    point-biserial correlation between group membership (0 = group 1,
    1 = group 2) and the score, and each group's mean and median.
    Benjamini-Hochberg FDR correction is then applied jointly across all rows
    that have a finite p-value.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'TF' and 'GENE' columns plus every name in ``score_columns``.
    group1_genes, group2_genes : list-like
        Values matched against ``df['GENE']`` with ``isin`` (exact match).
    score_columns : list of str
        Names of the score columns to compare.

    Returns
    -------
    pandas.DataFrame
        One row per (TF, score column) with columns 'TF', 'score_column',
        'statistic', 'p_value', 'biserial_corr', 'group1_mean', 'group2_mean',
        'group1_median', 'group2_median' and 'q_value'. The frame is empty (but
        still has these columns) when no pair has data in both groups.
    """
    result_columns = [
        'TF', 'score_column', 'statistic', 'p_value', 'biserial_corr',
        'group1_mean', 'group2_mean', 'group1_median', 'group2_median',
    ]

    # Group membership does not depend on the TF or the score column: compute it once.
    in_group1 = df['GENE'].isin(group1_genes)
    in_group2 = df['GENE'].isin(group2_genes)

    results = []
    for tf in df['TF'].unique():
        is_tf = df['TF'] == tf
        group1_rows = df[in_group1 & is_tf]
        group2_rows = df[in_group2 & is_tf]
        if group1_rows.empty or group2_rows.empty:
            continue

        for score_col in score_columns:
            # Drop missing scores so the tests, means and medians all see the same values.
            group1_scores = group1_rows[score_col].dropna().to_numpy()
            group2_scores = group2_rows[score_col].dropna().to_numpy()
            if len(group1_scores) == 0 or len(group2_scores) == 0:
                continue

            # Binary group labels (0 for group 1, 1 for group 2) aligned with the pooled scores.
            all_scores = np.concatenate([group1_scores, group2_scores])
            group_labels = np.array([0] * len(group1_scores) + [1] * len(group2_scores))
            biserial_corr, _ = pointbiserialr(group_labels, all_scores)

            # The two gene groups are independent samples, so an unpaired rank test is
            # used regardless of whether the group sizes happen to match.
            statistic, p_value = stats.mannwhitneyu(
                group1_scores, group2_scores, alternative='two-sided'
            )

            results.append({
                'TF': tf,
                'score_column': score_col,
                'statistic': statistic,
                'p_value': p_value,
                'biserial_corr': biserial_corr,
                'group1_mean': np.mean(group1_scores),
                'group2_mean': np.mean(group2_scores),
                'group1_median': np.median(group1_scores),
                'group2_median': np.median(group2_scores),
            })

    # Passing `columns` keeps the schema stable even when `results` is empty.
    results_df = pd.DataFrame(results, columns=result_columns)
    results_df['q_value'] = np.nan

    # NaN p-values would propagate through the step-up procedure, so only the
    # finite ones are corrected; the rest keep a NaN q-value.
    testable = results_df['p_value'].notna()
    if testable.any():
        _, q_values = fdrcorrection(results_df.loc[testable, 'p_value'])
        results_df.loc[testable, 'q_value'] = q_values

    return results_df

# 2. Function to plot the biological expectation accuracy
def plot_biological_expectation_accuracy(results_df, file_name, group1_name, group2_name):
    """
    Bar-plot each score column's accuracy relative to the 50% baseline and save it.

    For every distinct ``score_column`` in ``results_df`` the accuracy is the
    percentage of its rows where ``group2_mean`` is strictly greater than
    ``group1_mean``. Bars show ``accuracy - 50`` in descending order; the
    ``LOL-EVE`` bar is green and all others blue. The
    ``mean_cross_entropy_diff_`` prefix is stripped from the tick labels.

    Parameters
    ----------
    results_df : pandas.DataFrame
        Needs 'score_column', 'group1_mean' and 'group2_mean' columns.
    file_name : str
        Path the figure is written to (300 dpi).
    group1_name, group2_name : str
        Not used by the plot; kept so existing calls keep working.

    Raises
    ------
    ValueError
        If ``results_df`` has no rows.
    """
    if len(results_df) == 0:
        raise ValueError('results_df is empty; there is nothing to plot')

    fontsize = 16
    label_offset = 0.5  # gap between a bar end and its value label, in y-axis units
    axis_padding = 5    # extra y-range so the value labels are not clipped

    percentages = []
    for score_col in results_df['score_column'].unique():
        score_results = results_df[results_df['score_column'] == score_col]
        n_expected = int((score_results['group2_mean'] > score_results['group1_mean']).sum())
        percentages.append({
            'score_column': score_col.split('mean_cross_entropy_diff_')[-1],
            'percentage': n_expected / len(score_results) * 100,
        })

    percentage_df = pd.DataFrame(percentages)
    percentage_df['delta_accuracy'] = percentage_df['percentage'] - 50
    percentage_df = percentage_df.sort_values('delta_accuracy', ascending=False)

    # Minimum width of 12, growing with the number of bars.
    fig_width = max(12, len(percentage_df) * 0.8)
    fig, ax = plt.subplots(figsize=(fig_width, 10))

    colors = {
        "LOL-EVE": "#00aa55",
        "Other": "#2f9aea"
    }
    bar_colors = [
        colors["LOL-EVE"] if col == "LOL-EVE" else colors["Other"]
        for col in percentage_df['score_column']
    ]

    sns.barplot(data=percentage_df, x='score_column', y='delta_accuracy', palette=bar_colors, ax=ax)
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)

    ax.axhline(y=0, color='red', linestyle='--')

    ax.set_xlabel('Model', fontsize=fontsize, labelpad=20)
    ax.set_ylabel('Delta Accuracy (%)', fontsize=fontsize, labelpad=20)

    # Put each label just beyond the end of its bar: above positive bars,
    # below negative ones, so it never overlaps the bar itself.
    for position, delta in enumerate(percentage_df['delta_accuracy']):
        label = f'{delta:.1f}%'
        if delta >= 0:
            ax.text(position, delta + label_offset, label,
                    ha='center', va='bottom', fontsize=fontsize - 4)
        else:
            ax.text(position, delta - label_offset, label,
                    ha='center', va='top', fontsize=fontsize - 4)

    plt.xticks(rotation=45, ha='right', fontsize=fontsize - 2)
    plt.yticks(fontsize=fontsize - 2)

    # Leave room for the labels; the bottom is only padded when negative bars exist.
    y_min, y_max = ax.get_ylim()
    has_negative = bool((percentage_df['delta_accuracy'] < 0).any())
    ax.set_ylim(y_min - axis_padding if has_negative else y_min, y_max + axis_padding)

    fig.tight_layout()
    fig.savefig(file_name, dpi=300, bbox_inches='tight')
    plt.show()


# Score columns compared between the two gene groups.
score_columns = [
    'mean_cross_entropy_diff_hyenadna-tiny-1k-seqlen',
    'mean_cross_entropy_diff_hyenadna-medium-450k-seqlen',
    'mean_cross_entropy_diff_hyenadna-medium-160k-seqlen',
    'mean_cross_entropy_diff_hyenadna-large-1m-seqlen',
    'mean_cross_entropy_diff_hyenadna-small-32k-seqlen',
    'LOL-EVE',
    'mean_cross_entropy_diff_DNABERT-2-117M',
    'mean_cross_entropy_diff_caduceus-ph_seqlen-131k_d_model-256_n_layer-16',
    'mean_cross_entropy_diff_caduceus-ps_seqlen-131k_d_model-256_n_layer-16',
    'mean_cross_entropy_diff_nucleotide-transformer-2.5b-multi-species',
    'mean_cross_entropy_diff_nucleotide-transformer-2.5b-1000g',
    'mean_cross_entropy_diff_nucleotide-transformer-500m-human-ref',
    'mean_cross_entropy_diff_nucleotide-transformer-v2-500m-multi-species',
    'Phylop',
]


# Group 1 = consistently expressed genes, group 2 = variably expressed genes.
# NOTE(review): both gene lists are lower-cased when loaded; this assumes
# df['GENE'] is lower-case as well -- confirm against the CSV.
group1, group2 = consistent_expression_genes, variable_expression_genes

results = compare_tf_across_groups(df, group1, group2, score_columns)

# Relabel the 'Phylop' column as 'PhyloP' for display.
results['score_column'] = results['score_column'].replace({'Phylop': 'PhyloP'})

plot_biological_expectation_accuracy(
    results,
    'biological_expectation_accuracy_full.png',
    'Consistent_Expression',
    'Variable_Expression',
)

In [None]:

def get_model_info(model_name):
    """
    Map a model/score-column name to ``(family_key, display_name)``.

    The match is a case-insensitive substring test, checked in the order
    listed below. Names that match no known family are returned unchanged as
    both elements of the tuple (e.g. LOL-EVE, PhyloP).
    """
    known_families = (
        ('hyenadna', 'HyenaDNA'),
        ('caduceus', 'Caduceus'),
        ('nucleotide-transformer', 'NT'),
        ('dnabert', 'DNABERT-2'),
    )
    lowered = model_name.lower()
    for family_key, display_name in known_families:
        if family_key in lowered:
            return family_key, display_name
    return model_name, model_name

def plot_biological_expectation_accuracy(results_df, file_name, group1_name, group2_name):
    """
    Plot the best-scoring model of each model family against the 50% baseline.

    For every distinct ``score_column`` in ``results_df`` the accuracy is the
    percentage of its rows where ``group2_mean`` is strictly greater than
    ``group1_mean``. Score columns are grouped into families with
    ``get_model_info`` and only the highest-accuracy member of each family is
    kept (the first one seen wins ties). Bars show ``accuracy - 50`` in
    descending order; the figure is saved to ``file_name``, displayed, and the
    selected models are printed.

    Parameters
    ----------
    results_df : pandas.DataFrame
        Needs 'score_column', 'group1_mean' and 'group2_mean' columns.
    file_name : str
        Path the figure is written to (300 dpi).
    group1_name, group2_name : str
        Not used by the plot; kept so existing calls keep working.

    Raises
    ------
    ValueError
        If ``results_df`` has no rows.
    """
    if len(results_df) == 0:
        raise ValueError('results_df is empty; there is nothing to plot')

    label_offset = 0.5  # gap between a bar end and its value label, in y-axis units
    axis_padding = 7    # extra y-range so the larger value labels are not clipped

    # Single pass: compute each score column's accuracy and keep the best per family.
    best_models = {}
    for score_col in results_df['score_column'].unique():
        score_results = results_df[results_df['score_column'] == score_col]
        n_expected = int((score_results['group2_mean'] > score_results['group1_mean']).sum())
        percentage = n_expected / len(score_results) * 100

        family, simplified_name = get_model_info(score_col)
        if family not in best_models or percentage > best_models[family][1]:
            best_models[family] = (score_col, percentage, simplified_name)

    best_models_df = pd.DataFrame([
        {'score_column': simplified_name, 'percentage': percentage, 'full_name': model}
        for family, (model, percentage, simplified_name) in best_models.items()
    ]).sort_values('percentage', ascending=False)

    best_models_df['delta_accuracy'] = best_models_df['percentage'] - 50

    # Minimum width of 14, growing with the number of bars.
    fig_width = max(14, len(best_models_df) * 1.5)
    fig, ax = plt.subplots(figsize=(fig_width, 10))

    colors = {
        "LOL-EVE": "#00aa55",
        "Other": "#2f9aea"
    }
    bar_colors = [
        colors["LOL-EVE"] if col == "LOL-EVE" else colors["Other"]
        for col in best_models_df['score_column']
    ]

    sns.barplot(data=best_models_df, x='score_column', y='delta_accuracy', palette=bar_colors, ax=ax)
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)

    ax.axhline(y=0, color='red', linestyle='--')

    ax.set_ylabel('Delta Accuracy (%)', fontsize=20, labelpad=20)
    ax.set_xlabel('Model', fontsize=20, labelpad=20)

    # Put each label just beyond the end of its bar: above positive bars,
    # below negative ones, so it never overlaps the bar itself.
    for position, delta in enumerate(best_models_df['delta_accuracy']):
        label = f'{delta:.1f}%'
        if delta >= 0:
            ax.text(position, delta + label_offset, label, ha='center', va='bottom', fontsize=16)
        else:
            ax.text(position, delta - label_offset, label, ha='center', va='top', fontsize=16)

    plt.xticks(rotation=45, ha='right', fontsize=16)
    plt.yticks(fontsize=16)

    # Leave room for the labels; the bottom is only padded when negative bars exist.
    y_min, y_max = ax.get_ylim()
    has_negative = bool((best_models_df['delta_accuracy'] < 0).any())
    ax.set_ylim(y_min - axis_padding if has_negative else y_min, y_max + axis_padding)

    fig.tight_layout()
    fig.savefig(file_name, dpi=300, bbox_inches='tight')
    plt.show()

    # Report which concrete model represents each family in the figure.
    print("Selected models:")
    for _, row in best_models_df.iterrows():
        print(f"{row['score_column']}: {row['full_name']}")


# Render the per-family comparison from the results computed in the previous cell.
plot_biological_expectation_accuracy(
    results_df=results,
    file_name='best_models_comparison.png',
    group1_name='Consistent_Expression',
    group2_name='Variable_Expression',
)