In [None]:
%load_ext autoreload
%autoreload 2
import sys
import os
ProjDIR = "/home/jw3514/Work/ASD_Circuits_CellType/" # Change to your project directory
sys.path.insert(1, f'{ProjDIR}/src/')
from ASD_Circuits import *
from plot import *

# Switch into the notebook directory so that the relative paths used by later
# cells (e.g. "../config/config.yaml") resolve inside the project tree.
try:
    os.chdir(f"{ProjDIR}/notebook_rebuttal/")
    print(f"Current working directory: {os.getcwd()}")
except FileNotFoundError as e:
    # The failure is only reported; execution continues from whatever the
    # current directory is, so later relative paths may not resolve.
    print(f"Error: Could not change directory - {e}")
except Exception as e:
    print(f"Unexpected error: {e}")

# Gene-identifier lookup tables from the project helper LoadGeneINFO (provided
# by the star-imported project modules). GeneSymbol2Entrez is used later with
# .get(symbol, 0) to map gene symbols to Entrez IDs.
HGNC, ENSID2Entrez, GeneSymbol2Entrez, Entrez2Symbol = LoadGeneINFO()

In [None]:
# Load config file
with open("../config/config.yaml", "r") as f:
    config = yaml.safe_load(f)

# Structure-level matrix; its path is stored in the config relative to the
# project root, hence the "../" prefix.
expr_matrix_path = config["analysis_types"]["STR_ISH"]["expr_matrix"]
STR_BiasMat = pd.read_parquet(f"../{expr_matrix_path}")
# Structure -> region lookup; used below via STR_Anno.get(structure, "Unknown").
STR_Anno = STR2Region()

# Cell-type-level matrix (expr_matrix_path is deliberately reused here).
expr_matrix_path = config["analysis_types"]["CT_Z2"]["expr_matrix"]
CT_BiasMat = pd.read_parquet(f"../{expr_matrix_path}")
# Cluster annotation table indexed by cluster_id_label; later cells read its
# 'class_id_label' and 'subclass_id_label' columns.
CT_Anno = pd.read_csv(ProjDIR + "dat/MouseCT_Cluster_Anno.csv", index_col="cluster_id_label")

# Structure level 

## 1. ALL DDD genes

In [None]:
# Gene-weight dict for the DDD_293 gene set (file path taken from the config).
DDD_GW = Fil2Dict(config["gene_sets"]["DDD_293"]["geneweights"])
# Structure-level bias table computed by the project helper from the bias
# matrix and the gene weights.
DDD_STR_Bias = MouseSTR_AvgZ_Weighted(STR_BiasMat, DDD_GW)
# Annotate each structure (row index) with its region; "Unknown" if unmapped.
DDD_STR_Bias["Region"] = [STR_Anno.get(ct_idx, "Unknown") for ct_idx in DDD_STR_Bias.index.values]

In [None]:
# Ipsilateral connectivity matrices (weight / information, plus the "Short" and
# "Long" 3900 variants), read from an absolute path outside this project.
ScoreMatDir="/home/jw3514/Work/ASD_Circuits/dat/allen-mouse-conn/ScoreingMat_jw_v3/"
WeightMat = pd.read_csv(ScoreMatDir + "WeightMat.Ipsi.csv", index_col=0)
IpsiInfoMat=pd.read_csv(ScoreMatDir + "InfoMat.Ipsi.csv", index_col=0)
IpsiInfoMatShort_v1=pd.read_csv(ScoreMatDir + "InfoMat.Ipsi.Short.3900.csv", index_col=0)
IpsiInfoMatLong_v1=pd.read_csv(ScoreMatDir + "InfoMat.Ipsi.Long.3900.csv", index_col=0)

# Rank cut-offs 200, 199, ..., 6 (the plotting cell rebuilds the same range as a list).
topNs = np.arange(200, 5, -1)
DIR = "/home/jw3514/Work/ASD_Circuits/scripts/RankScores/"
# Pre-computed control score arrays; the plotting cell summarises Cont_Distance
# along axis 0 (median and percentiles) and draws it against topNs.
Cont_Distance = np.load("{}/RankScore.Ipsi.Cont.npy".format(DIR))
Cont_DistanceShort = np.load("{}/RankScore.Ipsi.Short.3900.Cont.npy".format(DIR))
Cont_DistanceLong = np.load("{}/RankScore.Ipsi.Long.3900.Cont.npy".format(DIR))

In [None]:
# Circuit connectivity score profile for the DDD structure bias (project helper);
# it is plotted against topNs in the next cell, so it presumably holds one
# value per rank cut-off — TODO confirm against calculate_circuit_scores.
score = calculate_circuit_scores(DDD_STR_Bias, IpsiInfoMat, sort_by="EFFECT")

In [None]:
# Circuit connectivity score of the DD gene set versus the control distribution,
# drawn on a transparent figure/axes background.
plt.style.use('seaborn-v0_8-whitegrid')
fig, ax1 = plt.subplots(1,1, dpi=480, figsize=(12,6), facecolor='none')

fig.patch.set_alpha(0)
ax1.patch.set_alpha(0)

# Half-width of the control band in percentile points: 50 +/- 34.1 gives the
# 15.9th and 84.1th percentiles (about +/- 1 SD for a normal distribution).
BarLen = 34.1
#BarLen = 47.5

topNs = list(range(200, 5, -1))  # Define topNs based on the range used in calculate_circuit_scores
                        
ax1.plot(topNs, score, color='#1f77b4', marker="o", markersize=5, lw=1,
                    ls="dashed", label="DD", alpha = 0.5)

# Control summary along axis 0: median line with asymmetric percentile error bars.
cont = np.median(Cont_Distance, axis=0)
lower = np.percentile(Cont_Distance, 50-BarLen, axis=0)
upper = np.percentile(Cont_Distance, 50+BarLen, axis=0)
ax1.errorbar(topNs, cont, color="grey", marker="o", markersize=1.5, lw=1,
            yerr=(cont - lower, upper - cont ), ls="dashed", label="Siblings")
ax1.set_xlabel("Structure Rank\n", fontsize=17)
ax1.set_ylabel("Circuit Connectivity Score", fontsize=15)
ax1.grid(True, linestyle='--', alpha=0.7)
# Only ranks up to 121 are shown although topNs extends to 200.
ax1.set_xlim(0, 121)

# Place legend outside of plot
ax1.legend(fontsize=13, bbox_to_anchor=(1.01, 1), loc='upper left')
plt.tight_layout()  # Adjust layout to prevent legend cutoff

In [None]:
# Pre-computed ASD (Spark meta-analysis) structure bias table.
Spark_ASD_STR_Bias = pd.read_csv("../dat/Unionize_bias/Spark_Meta_EWS.Z2.bias.FDR.csv", index_col=0)
# Duplicate REGION under the 'Region' name expected by the merge/plot helpers below.
Spark_ASD_STR_Bias["Region"] = Spark_ASD_STR_Bias["REGION"]

In [None]:
# Shape of the subset of structures with qvalues < 0.05.
Spark_ASD_STR_Bias[Spark_ASD_STR_Bias["qvalues"]<0.05].shape

# Section 2: Structure Level Results

This section includes:
1. Correlation of DDD bias (ASD genes excluded) with ASD bias
2. Correlation of constraint-gene bias (top 25% LOEUF) with ASD bias  
3. Residual structures analysis

## 2.1 Helper Functions


# Section 3: Cell Type Results

This section includes cell type-level analyses:
1. Correlation of DDD bias (ASD genes excluded) with ASD bias
2. Correlation of constraint-gene bias (top 25% LOEUF) with ASD bias
3. Residual cell type analysis


In [None]:
from sklearn.linear_model import LinearRegression
def fit_structure_bias_linear_model(merged_data, metric='EFFECT', suffixes=('_1', '_2')):
    """
    Regress the first dataset's metric on the second's and attach the fit.

    An ordinary least-squares line is fitted with
    ``merged_data[metric + suffixes[1]]`` as the single predictor and
    ``merged_data[metric + suffixes[0]]`` as the response.

    Parameters:
    -----------
    merged_data : DataFrame
        Table holding both ``{metric}{suffix}`` columns.
    metric : str
        Base name of the column pair to regress.
    suffixes : tuple of str
        (response suffix, predictor suffix).

    Returns:
    --------
    DataFrame
        Copy of ``merged_data`` with two extra columns: 'predicted' (fitted
        response) and 'residual' (observed minus fitted response).
    """
    predictor_col = f'{metric}{suffixes[1]}'
    response_col = f'{metric}{suffixes[0]}'

    # scikit-learn expects a 2-D design matrix, hence the single-column reshape.
    predictor = merged_data[predictor_col].values.reshape(-1, 1)
    response = merged_data[response_col].values

    regression = LinearRegression()
    regression.fit(predictor, response)
    fitted = regression.predict(predictor)

    annotated = merged_data.copy()
    annotated['predicted'] = fitted
    annotated['residual'] = response - fitted
    return annotated

#results_df = fit_structure_bias_linear_model(merged_data2, metric='EFFECT', suffixes=('_ASD', '_DD'))

def merge_str_bias_datasets(dataset1, dataset2, suffixes=('_1', '_2')):
    """
    Join two structure-level bias tables on their index and compare them.

    Parameters:
    -----------
    dataset1 : DataFrame
        First table; must provide 'Rank', 'EFFECT' and 'Region' columns.
    dataset2 : DataFrame
        Second table; must provide 'Rank' and 'EFFECT' columns.
    suffixes : tuple of str
        Suffixes appended to the overlapping 'Rank'/'EFFECT' column names of
        dataset1 and dataset2 respectively.

    Returns:
    --------
    merged_data : DataFrame
        Index-matched rows (default merge, i.e. inner join), sorted by
        descending 'ABS_DIFF_EFFECT', with the columns 'DIFF_Rank',
        'ABS_DIFF_Rank', 'DIFF_EFFECT', 'ABS_DIFF_EFFECT' (dataset1 minus
        dataset2) plus the 'predicted' and 'residual' columns added by
        fit_structure_bias_linear_model for the EFFECT metric.
    """
    left = dataset1[['Rank', 'EFFECT', 'Region']]
    right = dataset2[['Rank', 'EFFECT']]

    # Align the two tables on structure name (the index).
    combined = left.merge(right, left_index=True, right_index=True, suffixes=suffixes)

    # Signed and absolute differences, first for Rank and then for EFFECT.
    for col in ('Rank', 'EFFECT'):
        delta = combined[f'{col}{suffixes[0]}'] - combined[f'{col}{suffixes[1]}']
        combined['DIFF_' + col] = delta
        combined['ABS_DIFF_' + col] = np.abs(delta)

    # Largest EFFECT discrepancies first, then attach the regression columns.
    ranked = combined.sort_values('ABS_DIFF_EFFECT', ascending=False)
    return fit_structure_bias_linear_model(ranked, metric='EFFECT', suffixes=suffixes)


# Call the function
# ASD is the first dataset, so its EFFECT is the regression response and the
# DD EFFECT is the predictor; the comparison plot comes from the project
# `plot` helpers.
merged_data = merge_str_bias_datasets(Spark_ASD_STR_Bias, DDD_STR_Bias, suffixes=('_ASD', '_DD'))
plot_structure_bias_comparison(merged_data, suffixes=('_ASD', '_DD'),  metric="EFFECT")

In [None]:
# ASD gene-weight dict; its keys also define ASD_GENES in a later cell.
ASD_GW = Fil2Dict(ProjDIR+"dat/Genetics/GeneWeights_DN/Spark_Meta_EWS.GeneWeight.DN.gw")
# The commented-out lines below are the code that recomputed the cell-type
# bias tables; the notebook now reads pre-computed CSVs instead.
# ASD_SC_Bias = MouseCT_AvgZ_Weighted(CT_BiasMat, ASD_GW)
# ASD_SC_Bias = add_class(ASD_SC_Bias, CT_Anno)
# ASD_SC_Bias.to_csv(ProjDIR + "/results/CT_Z2/ASD_Spark61.csv")

# DDD_SC_Bias = MouseCT_AvgZ_Weighted(CT_BiasMat, DDD_GW)
# DDD_SC_Bias = add_class(DDD_SC_Bias, CT_Anno)
# DDD_SC_Bias.to_csv(ProjDIR + "/results/CT_Z2/DDD_295.csv")
# NOTE(review): these file names differ from the ones written by the
# commented-out code above — confirm they hold the intended results.
ASD_SC_Bias = pd.read_csv(ProjDIR + "/results/CT_Z2/ASD_All_bias_addP_sibling.csv", index_col=0)
DDD_SC_Bias = pd.read_csv(ProjDIR + "/results/CT_Z2/DDD_293_bias_addP_sibling.csv", index_col=0)

In [None]:
# Scatter of DD versus ASD cell-type bias using the EFFECT column of each table (project plot helper).
plot_correlation_scatter_mouseCT(DDD_SC_Bias, ASD_SC_Bias, name1="DD Cell Type Bias", name2="ASD Cell Type Bias", effect_col1="EFFECT", effect_col2="EFFECT", dpi=240)

### Exclude ASD genes and see what's left

In [None]:
# Earlier alternative (disabled): take the ASD gene list from Fu et al. 2022.
# file_path = "/home/jw3514/Work/ASD_Circuits_CellType/dat/Genetics/Fu_et_al_2022.xlsx"
# fu_DF_Pval = pd.read_excel(file_path, sheet_name="Supplementary Table 11", skiprows=0)
# ASD_GENES = fu_DF_Pval[fu_DF_Pval["ASD185"] == 1]["gene_gencodeV33"]
# ASD_GENES = [GeneSymbol2Entrez[gene] for gene in ASD_GENES]

# Current definition: every gene that has an entry in the ASD gene-weight dict.
ASD_GENES = list(ASD_GW.keys())

In [None]:
# Drop every DDD gene that is also an ASD gene, then recompute the structure bias.
DDD_GW_filt_ASD = {k: v for k, v in DDD_GW.items() if k not in ASD_GENES}
print(len(DDD_GW_filt_ASD))
DDD_rmASD_STR_Bias = MouseSTR_AvgZ_Weighted(STR_BiasMat, DDD_GW_filt_ASD)
# Annotate each structure with its region; "Unknown" if unmapped.
DDD_rmASD_STR_Bias["Region"] = [STR_Anno.get(ct_idx, "Unknown") for ct_idx in DDD_rmASD_STR_Bias.index.values]

In [None]:
# Write the full and the ASD-excluded DDD gene-weight dicts to disk (project helper Dict2Fil).
# NOTE(review): the "245" in the second file name is hard-coded; check it
# against the count printed by the previous cell.
Dict2Fil(DDD_GW, ProjDIR+"/dat/Genetics/GeneWeights/DDD.top293.gw")
Dict2Fil(DDD_GW_filt_ASD, ProjDIR+"/dat/Genetics/GeneWeights/DDD.top245.ExcludeASD.gw")

In [None]:
# Same comparison as before, but with the ASD genes removed from the DD set.
# The '_DD_ExcludeASD' suffix yields the 'Rank_DD_ExcludeASD' column that the
# residual plot further down looks up via name2="DD_ExcludeASD".
merged_data2 = merge_str_bias_datasets(Spark_ASD_STR_Bias, DDD_rmASD_STR_Bias, suffixes=('_ASD', '_DD_ExcludeASD'))
plot_structure_bias_comparison(merged_data2, suffixes=('_ASD', '_DD_ExcludeASD'), metric='EFFECT')

In [None]:
# Structures flagged with 1 in the "Circuits.46" column of the GENCIC results table.
GENCIC = pd.read_csv('../results/GENCIC_MouseSTRBias.csv', index_col=0)
Circuit_STRs = GENCIC[GENCIC["Circuits.46"]==1]["Structure"].values

In [None]:
## 2.3 Residual Structures Analysis: DDD Exclude ASD vs ASD

def plot_top_residual_structures(merged_data, top_n=30, top_threshold=40,
                                name1="ASD", name2="DD", figsize=(10, 8)):
    """
    Plot brain structures with the largest regression residuals.

    Structures are first restricted to those ranked within ``top_threshold``
    in at least one of the two datasets; of these, the ``top_n`` with the
    largest absolute residual are drawn as a horizontal bar chart of the
    signed residual, coloured by region.

    Parameters:
    -----------
    merged_data : DataFrame
        Merged table indexed by structure name. Must contain the columns
        'Rank_{name1}', 'Rank_{name2}', 'residual' and 'Region'.
    top_n : int
        Maximum number of structures to display.
    top_threshold : int
        Keep structures whose rank is <= this value in at least one dataset.
    name1, name2 : str
        Dataset names; they select the rank columns and label the x axis.
    figsize : tuple
        Figure size (width, height).

    Returns:
    --------
    top_diff : DataFrame
        The displayed rows, sorted by ascending 'ABS_DIFF' (absolute
        residual), i.e. in the same bottom-to-top order as the bars.

    Raises:
    -------
    KeyError
        If a required column is missing from ``merged_data``.

    Notes:
    ------
    Updates the global matplotlib rcParams (font size / family) and calls
    ``plt.show()``.
    """
    rank_col1, rank_col2 = f"Rank_{name1}", f"Rank_{name2}"
    required = [rank_col1, rank_col2, "residual", "Region"]
    missing = [col for col in required if col not in merged_data.columns]
    if missing:
        raise KeyError(f"merged_data is missing required column(s): {missing}")

    # Filter to only structures that appear in top threshold of at least one dataset
    in_top = (merged_data[rank_col1] <= top_threshold) | (merged_data[rank_col2] <= top_threshold)
    top_structures = merged_data[in_top].copy()

    print(f"Total structures in top {top_threshold} of at least one dataset: {len(top_structures)}")

    # Absolute residual, computed on the filtered rows themselves rather than
    # through index alignment with the unfiltered frame.
    top_structures["ABS_DIFF"] = top_structures["residual"].abs()
    top_structures = top_structures.sort_values('ABS_DIFF', ascending=True)

    # Keep the top_n largest absolute residuals (fewer if not enough rows).
    top_n = min(top_n, len(top_structures))
    top_diff = top_structures.tail(top_n)

    print(f"Showing top {len(top_diff)} structures with largest differences (from top {top_threshold} filter)")

    # Define regions and colors
    REGIONS_seq = ['Isocortex','Olfactory_areas', 'Cortical_subplate',
                    'Hippocampus','Amygdala','Striatum',
                    "Thalamus", "Hypothalamus", "Midbrain",
                    "Medulla", "Pallidum", "Pons",
                    "Cerebellum"]
    REG_COR_Dic = dict(zip(REGIONS_seq, ["#268ad5", "#D5DBDB", "#7ac3fa",
                                        "#2c9d39", "#742eb5", "#ed8921",
                                        "#e82315", "#E6B0AA", "#f6b26b",
                                        "#20124d", "#2ECC71", "#D2B4DE",
                                        "#ffd966", ]))

    # NOTE: this changes matplotlib's global rcParams, so figures drawn after
    # this call inherit the font settings.
    plt.rcParams.update({'font.size': 12, 'font.family': 'Arial'})
    fig, ax = plt.subplots(figsize=figsize, dpi=300)

    # top_diff is already in ascending ABS_DIFF order, so the largest absolute
    # residual ends up as the top bar; no second sort is needed.
    top_diff_sorted = top_diff

    # Bar colour by region; regions without a palette entry are drawn grey.
    colors = [REG_COR_Dic.get(region, '#808080') for region in top_diff_sorted['Region']]

    # Horizontal bars show the signed residual.
    bars = ax.barh(range(len(top_diff_sorted)),
                   top_diff_sorted['residual'],
                   color=colors, alpha=0.8, edgecolor='black', linewidth=0.5)

    # Structure names as y tick labels (underscores shown as spaces).
    ax.set_yticks(range(len(top_diff_sorted)))
    ax.set_yticklabels([name.replace('_', ' ') for name in top_diff_sorted.index],
                       fontsize=12, fontweight='normal')
    ax.set_xlabel(f'Residuals ({name1} vs {name2})', fontsize=14, fontweight='bold')
    #ax.set_title(f'Top {len(top_diff)} Brain Structures with Largest Residuals',
    #             fontsize=16, fontweight='bold', pad=20)

    # Remove top and right spines for cleaner look
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.spines['left'].set_linewidth(1)
    ax.spines['bottom'].set_linewidth(1)

    # Add subtle grid
    ax.grid(True, axis='x', alpha=0.3, linestyle='-', linewidth=0.5)
    ax.set_axisbelow(True)

    # Reference line at zero residual.
    ax.axvline(x=0, color='black', linestyle='-', alpha=0.7, linewidth=1)

    # Legend lists only the regions that have a palette entry; grey fallback
    # bars get no legend entry.
    unique_regions = sorted(list(set(top_diff_sorted['Region'])))
    legend_regions = [region for region in unique_regions if region in REG_COR_Dic]
    legend_elements = [plt.Rectangle((0,0),1,1, facecolor=REG_COR_Dic[region],
                                    alpha=0.8, edgecolor='black', linewidth=0.5)
                       for region in legend_regions]
    legend_labels = [region.replace('_', ' ') for region in legend_regions]

    if legend_elements:
        ax.legend(
            legend_elements, legend_labels,
            loc='center left',           # legend point that is pinned to the anchor
            bbox_to_anchor=(0.70, 0.1),  # anchor in axes coordinates: lower-right area inside the axes
            fontsize=10,
            frameon=True,
            fancybox=True,
            shadow=True,
            framealpha=0.9
        )

    # Adjust layout and margins
    plt.tight_layout()
    plt.subplots_adjust(left=0.3)  # Make room for structure names
    plt.show()

    return top_diff

# Call the function
# Only structures listed in Circuit_STRs are evaluated; name2 must match the
# '_DD_ExcludeASD' suffix used when merged_data2 was built.
merged_data_eval = merged_data2[merged_data2.index.isin(Circuit_STRs)]
top_diff = plot_top_residual_structures(merged_data_eval, top_n=20, top_threshold=40,
                                       name1="ASD", name2="DD_ExcludeASD", figsize=(6, 8))

### Cell Types

In [None]:
# Disabled recomputation of the ASD-excluded DD cell-type bias; the
# pre-computed CSV below is read instead.
# DDD_rmASD_SC_Bias = MouseCT_AvgZ_Weighted(CT_BiasMat, DDD_GW_filt_ASD)
# DDD_rmASD_SC_Bias = add_class(DDD_rmASD_SC_Bias, CT_Anno)
# DDD_rmASD_SC_Bias.to_csv(ProjDIR + "/results/CT_Z2/DDD_245.rmASD.csv")

DDD_rmASD_SC_Bias = pd.read_csv(ProjDIR + "/results/CT_Z2/DDD_293_ExcludeASD_bias_addP_sibling.csv", index_col=0)

In [None]:
# Scatter of ASD-excluded DD versus ASD cell-type bias (EFFECT columns).
plot_correlation_scatter_mouseCT(DDD_rmASD_SC_Bias, ASD_SC_Bias, name1="DD (ASD Excluded) Cell Type Bias", name2="ASD Cell Type Bias", effect_col1="EFFECT", effect_col2="EFFECT", dpi=240)

In [None]:
# Quick look at the first two rows to check the available columns.
DDD_rmASD_SC_Bias.head(2)

In [None]:
## 3.4 Residual Cell Type Analysis: DDD Exclude ASD vs ASD

def merge_ct_bias_datasets(dataset1, dataset2, suffixes=('_1', '_2')):
    """
    Join two cell-type bias tables on their index and compare them.

    Parameters:
    -----------
    dataset1 : DataFrame
        First table; must provide 'Rank', 'EFFECT', 'class_id_label',
        'subclass_id_label', 'CCF_broad.freq' and 'CCF_acronym.freq'.
    dataset2 : DataFrame
        Second table; must provide 'Rank' and 'EFFECT'.
    suffixes : tuple of str
        Suffixes appended to the overlapping 'Rank'/'EFFECT' column names of
        dataset1 and dataset2 respectively.

    Returns:
    --------
    merged_data : DataFrame
        Index-matched rows (default merge, i.e. inner join), sorted by
        descending 'ABS_DIFF_EFFECT', carrying dataset1's annotation columns,
        the 'DIFF_*' / 'ABS_DIFF_*' columns for Rank and EFFECT (dataset1
        minus dataset2) and the 'predicted' / 'residual' columns added by
        fit_structure_bias_linear_model for the EFFECT metric.
    """
    annotated_cols = ['Rank', 'EFFECT', 'class_id_label', 'subclass_id_label',
                      'CCF_broad.freq', 'CCF_acronym.freq']

    # Align the two tables on cluster label (the index).
    paired = pd.merge(
        dataset1[annotated_cols],
        dataset2[['Rank', 'EFFECT']],
        left_index=True,
        right_index=True,
        suffixes=suffixes,
    )

    sfx_first, sfx_second = suffixes[0], suffixes[1]
    # Signed and absolute differences, first for Rank and then for EFFECT.
    for measure in ('Rank', 'EFFECT'):
        diff_col = 'DIFF_' + measure
        paired[diff_col] = paired[measure + sfx_first] - paired[measure + sfx_second]
        paired['ABS_' + diff_col] = np.abs(paired[diff_col])

    # Largest EFFECT discrepancies first, then attach the regression columns.
    paired = paired.sort_values('ABS_DIFF_EFFECT', ascending=False)
    return fit_structure_bias_linear_model(paired, metric='EFFECT', suffixes=suffixes)

In [None]:
# Print unique class_id_label values from CT_Anno
# (the next cell hard-codes several of these labels to build cluster groups).
print(CT_Anno['class_id_label'].unique())

In [None]:
# ASD is the regression response and ASD-excluded DD the predictor, so
# 'residual' is the ASD bias left unexplained by DD.
ct_merged_data = merge_ct_bias_datasets(ASD_SC_Bias, DDD_rmASD_SC_Bias, suffixes=('_ASD', '_DD'))


def _clusters_with_label(label_col, labels, exclude=()):
    """Return cluster ids whose CT_Anno[label_col] is one of `labels`.

    Only ids present in ct_merged_data are kept, in CT_Anno order; ids listed
    in `exclude` are dropped. `labels` may be one label or a list of labels.
    """
    if isinstance(labels, str):
        labels = [labels]
    excluded = set(exclude)  # set lookup instead of scanning a list per id
    candidates = CT_Anno[CT_Anno[label_col].isin(labels)].index.tolist()
    return [cid for cid in candidates
            if cid in ct_merged_data.index and cid not in excluded]


# Whole classes
CNU_LGE_Cluster = _clusters_with_label('class_id_label', '09 CNU-LGE GABA')
IT_ET_Cluster = _clusters_with_label('class_id_label', '01 IT-ET Glut')
NP_Cluster = _clusters_with_label('class_id_label', '02 NP-CT-L6b Glut')
CGE_Cluster = _clusters_with_label('class_id_label', '06 CTX-CGE GABA')
MGE_Cluster = _clusters_with_label('class_id_label', '07 CTX-MGE GABA')

# Striatal D1/D2 subclasses versus the rest of the CNU-LGE class
D1D2_labels = ['061 STR D1 Gaba', '062 STR D2 Gaba']
STR_D1D2 = _clusters_with_label('subclass_id_label', D1D2_labels)
Other_LGE = _clusters_with_label('class_id_label', '09 CNU-LGE GABA', exclude=STR_D1D2)

# Hippocampal and amygdalar glutamatergic subclasses versus the rest of IT-ET
HIP = ['016 CA1-ProS Glut', '017 CA3 Glut']
HIP_Glut = _clusters_with_label('subclass_id_label', HIP)

AMY =  ['012 MEA Slc17a7 Glut',
 '013 COAp Grxcr2 Glut',
 '014 LA-BLA-BMA-PA Glut',
 '015 ENTmv-PA-COAp Glut',]
AMY_Glut = _clusters_with_label('subclass_id_label', AMY)
# IT-ET clusters that are neither hippocampal nor amygdalar. An earlier
# assignment that excluded only HIP_Glut was immediately overwritten by this
# one, so it has been dropped.
Other_IT_ET = _clusters_with_label('class_id_label', '01 IT-ET Glut', exclude=AMY_Glut + HIP_Glut)

# Thalamic subclasses '152 RE-Xi Nox4 Glut' and '154 PF Fzd5 Glut' versus the rest of TH Glut
RU_Cluster = _clusters_with_label('subclass_id_label', '152 RE-Xi Nox4 Glut')
PF_Cluster = _clusters_with_label('subclass_id_label', '154 PF Fzd5 Glut')
RU_PF = RU_Cluster + PF_Cluster
Other_TH_Cluster = _clusters_with_label('class_id_label', '18 TH Glut', exclude=RU_PF)

# CNU-HYa classes
AMY_HYA_Glut = _clusters_with_label('class_id_label', '13 CNU-HYa Glut')
AMY_HYA_GABA = _clusters_with_label('class_id_label', '11 CNU-HYa GABA')


In [None]:
# Display label -> list of cluster ids; the dict order is the left-to-right
# order of the boxes.
cluster_dict = {
    "D1/D2 MSN": STR_D1D2,
    "CNU_LGE_GABA (Other)": Other_LGE,
    "PF_RE_TH_Glut": RU_PF,
    "TH_Glut (Other)": Other_TH_Cluster,
    "CNU_HYA_Glut": AMY_HYA_Glut,
    "CNU_HYA_GABA": AMY_HYA_GABA,
    "CTX_CGE_GABA": CGE_Cluster,
    "IT_ET_Glut": IT_ET_Cluster,
    "NP_CT_L6b_Glut": NP_Cluster,
    "CTX_MGE_GABA": MGE_Cluster,
}

# One colour per cluster_dict entry, in the same order.
plot_palette = [
    "orange",   # D1/D2 MSN
    "green",    # CNU_LGE_GABA (Other)
    "purple",   # PF_RE_TH_Glut
    "red",      # TH_Glut (Other)
    "blue",     # CNU_HYA_Glut
    "gold",     # CNU_HYA_GABA
    "pink",     # CTX_CGE_GABA
    "teal",     # IT_ET_Glut
    "sienna",   # NP_CT_L6b_Glut
    "indigo"    # CTX_MGE_GABA
]

# Each entry is (group A, group B); a group is one cluster_dict key or a list
# of keys whose values are pooled before the test.
pairwise_tests = [
    ("D1/D2 MSN", "CNU_LGE_GABA (Other)"),
    ("PF_RE_TH_Glut", "TH_Glut (Other)"),
    #("CTX_CGE_GABA", ["CTX_MGE_GABA", "NP_CT_L6b_Glut", "IT_ET_Glut"]),
    ("D1/D2 MSN", ["CTX_CGE_GABA", "CTX_MGE_GABA", "NP_CT_L6b_Glut", "IT_ET_Glut"]),
    #("PF_RE_TH_Glut", ["CTX_CGE_GABA", "CTX_MGE_GABA", "NP_CT_L6b_Glut", "IT_ET_Glut"]),
    ("CNU_HYA_Glut", ["CTX_CGE_GABA", "CTX_MGE_GABA", "NP_CT_L6b_Glut", "IT_ET_Glut"]),
    ("CNU_HYA_GABA", ["CTX_CGE_GABA", "CTX_MGE_GABA", "NP_CT_L6b_Glut", "IT_ET_Glut"]),
]
# Disabled call: cluster_residual_boxplot is only defined in a later cell.
# ct_merged_data = merge_ct_bias_datasets(ASD_SC_Bias, DDD_rmASD_SC_Bias, suffixes=('_ASD', '_DD'))
# _ = cluster_residual_boxplot(
#     ct_merged_data,
#     cluster_dict,
#     metric="residual",
#     palette=plot_palette,
#     figsize=(12,8),
#     pairwise_tests=pairwise_tests
# )

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import mannwhitneyu

def bh_fdr(pvals):
    """Benjamini–Hochberg FDR correction; returns adjusted p-values.

    Non-finite entries (NaN / inf) are left untouched and do not count towards
    the number of tests. The result is a float array aligned with the input.
    """
    pvals = np.asarray(pvals, dtype=float)
    adjusted = pvals.copy()

    finite_mask = np.isfinite(pvals)
    n_tests = int(np.sum(finite_mask))
    if n_tests == 0:
        return adjusted

    finite_pos = np.nonzero(finite_mask)[0]
    finite_p = pvals[finite_pos]

    # Scale the i-th smallest p-value by n / i.
    ascending = np.argsort(finite_p)
    ranks = np.arange(1, n_tests + 1)
    scaled = finite_p[ascending] * n_tests / ranks

    # Running minimum from the largest p-value downwards keeps the adjusted
    # values monotone in the raw p-values; then cap them to [0, 1].
    monotone = np.minimum.accumulate(scaled[::-1])[::-1]
    monotone = np.clip(monotone, 0, 1)

    # Undo the sort and write the results back to the finite slots.
    restored = np.empty_like(monotone)
    restored[ascending] = monotone
    adjusted[finite_pos] = restored
    return adjusted

def p_to_star(p):
    """Map a p-value to a significance label.

    Returns "****" for p < 1e-4, "***" for p < 1e-3, "**" for p < 1e-2,
    "*" for p < 5e-2 and "ns" otherwise (including NaN).
    """
    thresholds = (
        (1e-4, "****"),
        (1e-3, "***"),
        (1e-2, "**"),
        (5e-2, "*"),
    )
    # Thresholds are ordered strictest first, so the first hit wins.
    for cutoff, stars in thresholds:
        if p < cutoff:
            return stars
    return "ns"

def wrap_label(s, max_len=16):
    """Break a long tick label into at most two lines.

    Labels of at most ``max_len`` characters are returned unchanged. Longer
    labels are split at a space if they contain one, otherwise at an
    underscore: leading tokens are packed onto the first line while they fit
    within ``max_len``, and every remaining token goes to the second line in
    its original order. Tokens on a line are joined with spaces, so a label
    split at underscores is shown with spaces instead. A label with no
    separator is returned unchanged.

    The first token always stays on the first line, even if it alone exceeds
    ``max_len``; this avoids an empty first line and keeps the word order.
    """
    if len(s) <= max_len:
        return s
    for sep in (" ", "_"):
        if sep not in s:
            continue
        parts = s.split(sep)
        first_line = [parts[0]]
        used = len(parts[0])
        split_at = len(parts)
        for i, part in enumerate(parts[1:], start=1):
            # +1 accounts for the joining space.
            if used + 1 + len(part) > max_len:
                # Once a token overflows, it and everything after it belong to
                # line 2; later short tokens must not jump back to line 1.
                split_at = i
                break
            first_line.append(part)
            used += 1 + len(part)
        second_line = parts[split_at:]
        if second_line:
            return " ".join(first_line) + "\n" + " ".join(second_line)
    return s

def cluster_residual_boxplot(
    results_df,
    cluster_dict,
    metric="residual",
    palette=None,
    figsize=(12, 8),
    pairwise_tests=None,
    p_adjust="fdr_bh",     # None or "fdr_bh"
    p_style="stars",       # "stars" or "exact"
    show_ns=False,
    wrap_xticks=True,
    wrap_len=16,
    point_size=2.2,
    point_alpha=0.16,
    point_color="0.2",
    rasterize_points=True,
    box_width=0.6,
    fontsize=12,
    title=None,
    show=True
):
    """
    Box + strip plot of a metric per cluster group, with optional test brackets.

    Parameters
    ----------
    results_df : DataFrame
        Table indexed by cluster id that contains the ``metric`` column.
    cluster_dict : dict
        Display label -> list of index labels of ``results_df``. The dict
        order is the left-to-right order of the boxes.
    metric : str
        Column to plot.
    palette : None, dict or sequence
        None uses seaborn's "tab10"; a dict maps display label -> colour; a
        sequence needs at least one colour per cluster and is truncated.
    figsize : tuple
        Figure size.
    pairwise_tests : list of (group, group), optional
        Each group is one cluster label or a list of labels whose values are
        pooled. Every pair is compared with a two-sided Mann-Whitney U test.
        Labels absent from ``cluster_dict`` are ignored with a warning; a pair
        is skipped when either side ends up with no values.
    p_adjust : str or None
        "fdr_bh" applies Benjamini-Hochberg across the tests that were run;
        any other value leaves the p-values unadjusted.
    p_style : str
        "stars" annotates with significance stars, anything else with the
        p-value in scientific notation.
    show_ns : bool
        Whether brackets labelled "ns" are drawn (stars style only).
    wrap_xticks, wrap_len :
        Wrap tick labels with ``wrap_label`` at ``wrap_len`` characters.
    point_size, point_alpha, point_color, rasterize_points :
        Styling of the strip-plot points.
    box_width : float
        Width of the boxes.
    fontsize : int
        Base font size; axis label, tick and annotation sizes are scaled from it.
    title : str, optional
        Axes title.
    show : bool
        Call ``plt.show()`` before returning.

    Returns
    -------
    DataFrame
        Long-format table with columns "Cluster" and ``metric`` that was plotted.

    Raises
    ------
    ValueError
        If ``metric`` is not a column of ``results_df`` or a sequence palette
        has fewer colours than there are clusters.
    """
    import warnings  # local import: only used for the unknown-cluster warning

    # ---- checks ----
    if metric not in results_df.columns:
        raise ValueError(f"metric='{metric}' not in results_df.columns")
    if pairwise_tests is None:
        pairwise_tests = []

    cluster_labels = list(cluster_dict.keys())

    # palette
    if palette is None:
        palette = sns.color_palette("tab10", n_colors=len(cluster_labels))
    elif isinstance(palette, dict):
        palette = [palette[k] for k in cluster_labels]
    else:
        if len(palette) < len(cluster_labels):
            raise ValueError(f"palette has {len(palette)} colors but needs {len(cluster_labels)}.")
        palette = palette[:len(cluster_labels)]

    # ---- build plot_df ----
    vals_list, n_points = [], []
    for k in cluster_labels:
        v = results_df.loc[cluster_dict[k], metric].dropna().values
        vals_list.append(v)
        n_points.append(len(v))

    plot_df = pd.DataFrame({
        "Cluster": np.repeat(cluster_labels, n_points),
        metric: np.concatenate(vals_list) if len(vals_list) else np.array([])
    })

    # ---- plot ----
    sns.set_style("white")
    sns.set_context("paper", font_scale=1.0)
    fig, ax = plt.subplots(figsize=figsize, dpi=240)

    # `order` pins every cluster to its position in cluster_labels, so an empty
    # cluster keeps its slot and boxes, colours, tick labels and brackets all
    # stay aligned.
    # NOTE(review): recent seaborn releases warn when `palette` is passed
    # without `hue` — confirm against the installed version.
    sns.boxplot(
        x="Cluster", y=metric, data=plot_df,
        order=cluster_labels,
        palette=palette, width=box_width,
        showfliers=False, linewidth=1.0,
        showmeans=True,
        meanprops={"marker": "o", "markerfacecolor": "black",
                   "markeredgecolor": "black", "markersize": 5},
        ax=ax
    )
    # Box patches are in ax.artists on older matplotlib and in ax.patches on
    # newer versions; cover both so the transparency is always applied.
    for patch in list(ax.artists) + list(ax.patches):
        patch.set_alpha(0.88)

    sns.stripplot(
        x="Cluster", y=metric, data=plot_df,
        order=cluster_labels,
        color=point_color, alpha=point_alpha,
        jitter=0.22, size=point_size,
        ax=ax
    )
    if rasterize_points:
        for coll in ax.collections:
            coll.set_rasterized(True)

    ax.axhline(0, color="black", linewidth=2.0, alpha=0.85, linestyle="--", zorder=2)
    ax.grid(axis="y", color="0.86", linestyle="-", linewidth=0.8)
    ax.grid(axis="x", visible=False)

    #ax.set_ylabel(metric.capitalize(), fontsize=fontsize + 2)
    ax.set_ylabel("Bias Residual", fontsize=fontsize * 1.8)
    ax.set_xlabel("")
    ax.tick_params(axis="y", labelsize=fontsize)

    xticklabels = cluster_labels
    if wrap_xticks:
        xticklabels = [wrap_label(s, max_len=wrap_len) for s in xticklabels]
    # Fix the tick positions explicitly before assigning the label texts.
    ax.set_xticks(range(len(cluster_labels)))
    ax.set_xticklabels(xticklabels, rotation=35, ha="right", fontsize=fontsize*1.3)

    if title:
        ax.set_title(title, fontsize=fontsize + 2)

    # ---- prepare comparisons ----
    def known_names(group):
        """Cluster labels of `group` that exist in cluster_dict (warn about the rest)."""
        names = [group] if isinstance(group, str) else list(group)
        unknown = [k for k in names if k not in cluster_dict]
        if unknown:
            warnings.warn(f"pairwise_tests references unknown cluster(s) {unknown}; ignoring them.")
        return [k for k in names if k in cluster_dict]

    def get_vals(names):
        """Pooled non-NaN metric values of the given cluster labels."""
        arrs = [results_df.loc[cluster_dict[k], metric].dropna().values for k in names]
        return np.concatenate(arrs) if len(arrs) else np.array([])

    tests = []
    for gA, gB in pairwise_tests:
        gA_list = known_names(gA)
        gB_list = known_names(gB)
        A = get_vals(gA_list)
        B = get_vals(gB_list)
        if len(A) == 0 or len(B) == 0:
            continue
        # x positions (mean index if multi-group), using the same labels that
        # supplied the values
        x1 = float(np.mean([cluster_labels.index(k) for k in gA_list]))
        x2 = float(np.mean([cluster_labels.index(k) for k in gB_list]))
        _, p = mannwhitneyu(A, B, alternative="two-sided")
        local_top = max(np.max(A), np.max(B))
        tests.append({"x1": x1, "x2": x2, "p": p, "local_top": local_top})

    if len(tests) == 0:
        plt.subplots_adjust(bottom=0.28, top=0.92)
        if show:
            plt.show()
        return plot_df

    # adjust p
    raw_p = np.array([t["p"] for t in tests], dtype=float)
    adj_p = bh_fdr(raw_p) if p_adjust == "fdr_bh" else raw_p
    for t, p_adj in zip(tests, adj_p):
        t["p_adj"] = p_adj

    # ---- annotate ----
    y_min = float(np.nanmin(plot_df[metric].values))
    y_max = float(np.nanmax(plot_df[metric].values))
    y_range = (y_max - y_min) if y_max != y_min else 1.0
    h = 0.020 * y_range          # bracket height
    clearance = 0.03 * y_range   # gap between the data and the bracket
    y_step = 0.10 * y_range      # vertical shift used to resolve overlaps

    # sort: lower brackets first, then shorter spans
    tests_sorted = sorted(tests, key=lambda t: (t["local_top"], abs(t["x2"] - t["x1"])))

    placed = []  # (xlo, xhi, ylo, yhi)
    for t in tests_sorted:
        p_use = t["p_adj"] if p_adjust else t["p"]
        label = p_to_star(p_use) if p_style == "stars" else f"$p$={p_use:.2e}"
        if (label == "ns") and (not show_ns):
            continue

        x1, x2 = t["x1"], t["x2"]
        xlo, xhi = min(x1, x2), max(x1, x2)

        # Start just above the data and move up until the bracket no longer
        # collides with one that is already placed.
        y = t["local_top"] + clearance
        while True:
            yhi = y + h
            overlap = False
            for pxlo, pxhi, pylo, pyhi in placed:
                if not (xhi < pxlo - 0.3 or xlo > pxhi + 0.3):
                    if not (yhi < pylo - 0.01 or y > pyhi + 0.01):
                        overlap = True
                        break
            if not overlap:
                break
            y += y_step

        placed.append((xlo, xhi, y, y + h))
        ax.plot([x1, x1, x2, x2], [y, y + h, y + h, y], lw=1.1, c="k", alpha=0.9)
        # The text baseline sits one bracket height below the bracket base.
        ax.text((x1 + x2) / 2, y - h, label, ha="center", va="bottom", fontsize=fontsize*2.0)

    plt.subplots_adjust(bottom=0.28, top=0.92)
    if show:
        plt.show()
    return plot_df

In [None]:
# Residual boxplot for the cluster groups defined above, with BH-adjusted
# Mann-Whitney brackets; non-significant brackets are hidden.
_ = cluster_residual_boxplot(
    ct_merged_data,
    cluster_dict,
    metric="residual",
    palette=plot_palette,
    figsize=(12,8),
    pairwise_tests=pairwise_tests,
    p_adjust="fdr_bh",
    p_style="stars",
    show_ns=False,
    wrap_xticks=True,
    wrap_len=16,
    point_size=2.2,
    point_alpha=0.16
)

In [None]:
from scipy.stats import mannwhitneyu

# One-sided test of whether the ASD bias (EFFECT_ASD) of the PF clusters is
# higher than that of all other clusters in the merged table.
# Extract values for PF_Cluster
pf_values = ct_merged_data[ct_merged_data.index.isin(PF_Cluster)]["EFFECT_ASD"].dropna().values

# Non-PF_Cluster as background
background_values = ct_merged_data[~ct_merged_data.index.isin(PF_Cluster)]["EFFECT_ASD"].dropna().values

# Mann-Whitney U test: alternative='greater' tests if PF_Cluster > background
stat, pval = mannwhitneyu(pf_values, background_values, alternative='greater')

print("Mann-Whitney U test for PF_Cluster vs others (greater):")
print(f"U statistic: {stat:.2f}, p-value: {pval:.3g}")

In [None]:
# Sorted class labels present in the merged cell-type table.
sorted(ct_merged_data["class_id_label"].unique())

In [None]:
# Sorted subclass labels present in the merged cell-type table.
sorted(ct_merged_data["subclass_id_label"].unique())

In [None]:
# Scratch note: the two subclass labels used above to build RU_Cluster and
# PF_Cluster. These bare strings have no effect beyond displaying the last one.
'152 RE-Xi Nox4 Glut'
'154 PF Fzd5 Glut'

In [None]:
# Distribution of the regression residuals across all merged cell types.
ct_merged_data["residual"].hist(bins=200)

In [None]:
# Rebuilds HIP_Glut with the same expression used when the cluster groups were
# first defined, then displays the full Other_IT_ET list for inspection.
HIP_Glut = [x for x in CT_Anno[CT_Anno['subclass_id_label'].isin(HIP)].index.tolist() if x in ct_merged_data.index]
Other_IT_ET

In [None]:
# Second grouping: the IT-ET class is split into hippocampal, amygdalar and
# remaining clusters. The dict order is the left-to-right order of the boxes.
cluster_dict = {
    "D1/D2 MSN": STR_D1D2,
    "CNU_LGE_GABA (Other)": Other_LGE,
    "PF_RE_TH_Glut": RU_PF,
    "TH_Glut (Other)": Other_TH_Cluster,
    "HIP_Glut": HIP_Glut,
    "AMY_Glut": AMY_Glut,
    "Other_IT_ET": Other_IT_ET,
    "NP_CT_L6b_Glut": NP_Cluster,
    "CTX_MGE_GABA": MGE_Cluster,
}

# One colour per cluster_dict entry. The list previously held 7 colours for 9
# clusters, which makes cluster_residual_boxplot raise ValueError; the last two
# reuse the colours these groups have in the first grouping.
plot_palette = [
    "orange",   # D1/D2 MSN
    "green",    # CNU_LGE_GABA (Other)
    "purple",   # PF_RE_TH_Glut
    "red",      # TH_Glut (Other)
    "blue",     # HIP_Glut
    "yellow",   # AMY_Glut
    "pink",     # Other_IT_ET
    "sienna",   # NP_CT_L6b_Glut
    "indigo",   # CTX_MGE_GABA
]

# Each entry is (group A, group B), named by cluster_dict keys.
pairwise_tests = [
    ("D1/D2 MSN", "CNU_LGE_GABA (Other)"),
    ("PF_RE_TH_Glut", "TH_Glut (Other)"),
    #("CTX_CGE_GABA", ["CTX_MGE_GABA", "NP_CT_L6b_Glut", "IT_ET_Glut"]),
    ("HIP_Glut", "Other_IT_ET"),
    #("PF_RE_TH_Glut", ["CTX_MGE_GABA", "NP_CT_L6b_Glut", "IT_ET_Glut"]),
]
ct_merged_data = merge_ct_bias_datasets(ASD_SC_Bias, DDD_rmASD_SC_Bias, suffixes=('_ASD', '_DD'))
_ = cluster_residual_boxplot(
    ct_merged_data,
    cluster_dict,
    metric="residual",
    palette=plot_palette,
    figsize=(12,8),
    pairwise_tests=pairwise_tests
)

In [None]:
# Plot all cell type classes using the shown framework

# Re-merge first, so that the labels and the cluster dict below are derived
# from the same table that is plotted.
ct_merged_data = merge_ct_bias_datasets(ASD_SC_Bias, DDD_rmASD_SC_Bias, suffixes=('_ASD', '_DD'))

# Get all unique class labels that appear in ct_merged_data
all_class_labels = sorted(ct_merged_data["class_id_label"].unique())

# Build a cluster dict mapping each class label to its indices
cluster_dict = {
    label: [idx for idx in CT_Anno[CT_Anno['class_id_label'] == label].index if idx in ct_merged_data.index]
    for label in all_class_labels
}

# One tab20 colour per class
num_classes = len(cluster_dict)
plot_palette = sns.color_palette("tab20", num_classes)

# No pairwise tests here (too many possible comparisons)
pairwise_tests = []

_ = cluster_residual_boxplot(
    ct_merged_data,
    cluster_dict,
    metric="residual",
    palette=plot_palette,
    figsize=(max(12, num_classes*0.7), 8),  # widen the figure with the number of classes
    pairwise_tests=pairwise_tests
)

# Constraint Genes

In [None]:
# gnomAD v4.0 constraint metrics, restricted to Ensembl transcripts (ids
# containing "ENST") that are flagged as MANE Select.
gnomad4 = pd.read_csv("/home/jw3514/Work/data/gnomad/gnomad.v4.0.constraint_metrics.tsv", sep="\t")
search_text = 'ENST'
gnomad4 = gnomad4[(gnomad4["transcript"].str.contains(search_text))]
# .copy() gives an independent frame, so adding a column below does not write
# into a slice of the table that was read in.
gnomad4 = gnomad4[gnomad4["mane_select"]==True].copy()

# Map gene symbols to Entrez IDs in one vectorised pass; symbols without a
# mapping get 0. The column holds integers rather than the floats produced by
# row-wise .loc assignment.
gnomad4["Entrez"] = gnomad4["gene"].map(lambda symbol: int(GeneSymbol2Entrez.get(symbol, 0)))

In [None]:
# Distribution of the `lof.oe_ci.upper` column (presumably LOEUF, the upper
# bound of the LoF observed/expected confidence interval — TODO confirm).
gnomad4["lof.oe_ci.upper"].hist()

In [None]:
# Take subset where lof.oe_ci.upper is in the bottom 10%
bottom_10_percent_threshold = gnomad4["lof.oe_ci.upper"].quantile(0.1)
gnomad4_bottom10 = gnomad4[gnomad4["lof.oe_ci.upper"] <= bottom_10_percent_threshold]
columns_to_keep_g4 = ["Entrez", "gene", "lof.pLI", "lof.z_score", "lof.oe_ci.upper"]
gnomad4_bottom10 = gnomad4_bottom10[columns_to_keep_g4]
gnomad4_bottom10 = gnomad4_bottom10.sort_values(by="lof.oe_ci.upper", ascending=True)

# Make sure Entrez is int and exclude rows with Entrez = 0
gnomad4_bottom10["Entrez"] = gnomad4_bottom10["Entrez"].astype(int)
gnomad4_bottom10 = gnomad4_bottom10[gnomad4_bottom10["Entrez"] != 0]

# Remove the ASD genes. The previous filter compared the gene-symbol column
# with an Ensembl gene ID, so it never matched an ASD gene.
# NOTE(review): assumes ASD_GENES (keys of ASD_GW) are Entrez IDs of the same
# type as the "Entrez" column — confirm against Fil2Dict.
gnomad4_bottom10_excludeASD = gnomad4_bottom10[~gnomad4_bottom10["Entrez"].isin(ASD_GENES)]
gnomad4_bottom10

In [None]:
# The `lof.oe_ci.upper` cut-off at the 10th percentile.
bottom_10_percent_threshold

In [None]:
# Rows/columns of the bottom-10% table after dropping unmapped genes (Entrez 0).
gnomad4_bottom10.shape

### ASD vs Constraint with Different constraint thresholds

### pLI 0.99

In [None]:
# Define the constrained gene set by pLI (> 0.99), kept under its own variable
# names so the bottom-10% LOEUF objects above are not overwritten.
# Genes without an Entrez mapping (Entrez == 0) are dropped, as for the
# bottom-10% table; otherwise they would all collapse onto a bogus key 0 in
# the gene-weight dict and in the file written below.
gnomad4_top_PLI = gnomad4[(gnomad4["lof.pLI"] > 0.99) & (gnomad4["Entrez"] != 0)]
print(gnomad4_top_PLI.shape)
#constraint_gw_top_PLI = dict(zip(gnomad4_top_PLI["Entrez"], 1/gnomad4_top_PLI["lof.oe_ci.upper"]))
# Uniform weight of 1 per gene; keys are cast to int so the IDs are integers.
constraint_gw_top_PLI = dict.fromkeys(gnomad4_top_PLI["Entrez"].astype(int).tolist(), 1)
Dict2Fil(constraint_gw_top_PLI, ProjDIR+"/dat/Genetics/GeneWeights/"+"constraint_top_decile_PLI.gw")

# Structure bias of the constraint gene set, annotated with regions.
constraint_top_PLI_STR_Bias = MouseSTR_AvgZ_Weighted(STR_BiasMat, constraint_gw_top_PLI)
constraint_top_PLI_STR_Bias["Region"] = [STR_Anno.get(ct_idx, "Unknown") for ct_idx in constraint_top_PLI_STR_Bias.index.values]

# ASD vs constraint
merged_data_ASD_Constraint_PLI = merge_str_bias_datasets(Spark_ASD_STR_Bias, constraint_top_PLI_STR_Bias, suffixes=('_ASD', '_Constraint'))
plot_structure_bias_comparison(merged_data_ASD_Constraint_PLI, suffixes=('_ASD', '_Constraint'),  metric="EFFECT", show_region_legend=True)

# DD vs constraint
merged_data_DDD_Constraint_PLI = merge_str_bias_datasets(DDD_STR_Bias, constraint_top_PLI_STR_Bias, suffixes=('_DD', '_Constraint'))
plot_structure_bias_comparison(merged_data_DDD_Constraint_PLI, suffixes=('_DD', '_Constraint'),  metric="EFFECT")

In [None]:
# Calculate circuit score for pLI constraint genes
# NOTE(review): a later cell ("2.6 Circuit Connectivity Scores") rebinds score_Constraint
# to the LOEUF-top-25% scores; the pLI profile is also stored there as score_Constraint_PLI.
score_Constraint = calculate_circuit_scores(constraint_top_PLI_STR_Bias, IpsiInfoMat, sort_by="EFFECT")


In [None]:
## 2.5 Residual Structures: Constraint (pLI > 0.99) vs ASD / DD

def plot_top_residual_structures(merged_data, top_n=30, top_threshold=40, 
                                name1="ASD", name2="DD", figsize=(10, 8)):
    """
    Plot brain structures with largest residuals from regression analysis.

    Structures are first restricted to those ranked within `top_threshold` in
    at least one of the two datasets; among these, the `top_n` with the largest
    absolute residual are drawn as a horizontal bar chart coloured by region.

    Parameters:
    -----------
    merged_data : DataFrame
        Merged dataset indexed by structure name. Must contain the columns
        ``Rank_<name1>``, ``Rank_<name2>``, ``residual`` and ``Region``.
    top_n : int
        Number of top structures to display
    top_threshold : int
        Filter to structures in top N of at least one dataset
    name1, name2 : str
        Names of the two datasets being compared (used to build the rank
        column names and the x-axis label)
    figsize : tuple
        Figure size (width, height)

    Returns:
    --------
    top_diff : DataFrame
        The plotted structures, sorted by ascending absolute residual, with
        an added ``ABS_DIFF`` column.

    Raises:
    -------
    KeyError
        If a required column is missing from `merged_data`.

    Notes:
    ------
    Prints two summary lines, updates the global matplotlib rcParams
    (font size / family) and shows the figure with ``plt.show()``.
    """
    rank_col1, rank_col2 = f"Rank_{name1}", f"Rank_{name2}"
    missing_cols = [col for col in (rank_col1, rank_col2, "residual", "Region")
                    if col not in merged_data.columns]
    if missing_cols:
        raise KeyError(f"merged_data is missing required column(s): {missing_cols}")

    # Filter to only structures that appear in top threshold of at least one dataset
    in_either_top = (merged_data[rank_col1] <= top_threshold) | (merged_data[rank_col2] <= top_threshold)
    top_structures = merged_data[in_either_top].copy()

    print(f"Total structures in top {top_threshold} of at least one dataset: {len(top_structures)}")

    # Absolute residual is taken from the filtered frame itself, so the result
    # does not depend on index alignment with the unfiltered input.
    top_structures["ABS_DIFF"] = top_structures["residual"].abs()
    top_structures = top_structures.sort_values('ABS_DIFF', ascending=True)

    # Take the top N structures with largest differences from those in top threshold
    top_n = min(top_n, len(top_structures))  # never ask for more rows than are available
    top_diff = top_structures.tail(top_n)  # ascending sort -> largest |residual| are last

    print(f"Showing top {len(top_diff)} structures with largest differences (from top {top_threshold} filter)")

    # Define regions and colors
    REGIONS_seq = ['Isocortex','Olfactory_areas', 'Cortical_subplate', 
                    'Hippocampus','Amygdala','Striatum', 
                    "Thalamus", "Hypothalamus", "Midbrain", 
                    "Medulla", "Pallidum", "Pons", 
                    "Cerebellum"]
    REG_COR_Dic = dict(zip(REGIONS_seq, ["#268ad5", "#D5DBDB", "#7ac3fa", 
                                        "#2c9d39", "#742eb5", "#ed8921", 
                                        "#e82315", "#E6B0AA", "#f6b26b",  
                                        "#20124d", "#2ECC71", "#D2B4DE", 
                                        "#ffd966", ]))

    # Create publication-quality plot of residuals for top_diff structures
    # (rcParams.update changes the global matplotlib defaults, not just this figure)
    plt.rcParams.update({'font.size': 12, 'font.family': 'Arial'})
    fig, ax = plt.subplots(figsize=figsize, dpi=300)

    # top_diff is already ordered by ascending |residual|, so the largest
    # differences end up at the top of the horizontal bar chart.
    # Regions without an entry in REG_COR_Dic are drawn in grey.
    colors = [REG_COR_Dic.get(region, '#808080') for region in top_diff['Region']]

    # Bars show the signed residual; only the ordering uses the absolute value
    bar_positions = range(len(top_diff))
    ax.barh(bar_positions, 
            top_diff['residual'], 
            color=colors, alpha=0.8, edgecolor='black', linewidth=0.5)

    # Customize the plot with publication-quality styling
    ax.set_yticks(bar_positions)
    ax.set_yticklabels([name.replace('_', ' ') for name in top_diff.index], 
                       fontsize=12, fontweight='normal')
    ax.set_xlabel(f'Residuals ({name1} vs {name2})', fontsize=14, fontweight='bold')

    # Remove top and right spines for cleaner look
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.spines['left'].set_linewidth(1)
    ax.spines['bottom'].set_linewidth(1)

    # Add subtle grid
    ax.grid(True, axis='x', alpha=0.3, linestyle='-', linewidth=0.5)
    ax.set_axisbelow(True)

    # Add vertical line at x=0 with better styling
    ax.axvline(x=0, color='black', linestyle='-', alpha=0.7, linewidth=1)

    # Legend lists only the regions that are present and have a defined colour
    legend_regions = [region for region in sorted(set(top_diff['Region'])) if region in REG_COR_Dic]
    legend_elements = [plt.Rectangle((0,0),1,1, facecolor=REG_COR_Dic[region], 
                                    alpha=0.8, edgecolor='black', linewidth=0.5) 
                       for region in legend_regions]
    legend_labels = [region.replace('_', ' ') for region in legend_regions]

    if legend_elements:
        ax.legend(
            legend_elements, legend_labels,
            loc='center left',
            bbox_to_anchor=(0.85, 0.2),  # legend's left-centre anchored at (0.85, 0.2) in axes coordinates
            fontsize=10, 
            frameon=True,
            fancybox=True,
            shadow=True,
            framealpha=0.9
        )

    # Adjust layout and margins
    plt.tight_layout()
    plt.subplots_adjust(left=0.3)  # Make room for structure names
    plt.show()
    
    return top_diff

# ASD vs constraint (pLI): keep only structures listed in Circuit_STRs, then plot residuals
circuit_rows = merged_data_ASD_Constraint_PLI.index.isin(Circuit_STRs)
merged_data_eval = merged_data_ASD_Constraint_PLI[circuit_rows]
top_diff = plot_top_residual_structures(
    merged_data_eval,
    top_n=20,
    top_threshold=40,
    name1="ASD",
    name2="Constraint",
    figsize=(6, 6),
)

In [None]:

# DD vs constraint (pLI): same residual plot, restricted to structures in Circuit_STRs
circuit_rows = merged_data_DDD_Constraint_PLI.index.isin(Circuit_STRs)
merged_data_eval = merged_data_DDD_Constraint_PLI[circuit_rows]
residual_plot_opts = dict(top_n=20, top_threshold=40, name1="DD", name2="Constraint", figsize=(6, 8))
top_diff = plot_top_residual_structures(merged_data_eval, **residual_plot_opts)

In [None]:
# Cell-type bias for the pLI gene weights, passed through add_class with the
# cluster annotation table, then written to the CT_Z2 results folder.
pLI_SC_Bias = add_class(
    MouseCT_AvgZ_Weighted(CT_BiasMat, constraint_gw_top_PLI),
    CT_Anno,
)
pLI_SC_Bias.to_csv(f"{ProjDIR}/results/CT_Z2/pLI_SC_Bias.csv")

In [None]:
# pLI constraint cell-type bias against ASD, then against DD (both on the EFFECT column)
scatter_opts = dict(effect_col1="EFFECT", effect_col2="EFFECT", dpi=240)
plot_correlation_scatter_mouseCT(
    pLI_SC_Bias, ASD_SC_Bias,
    name1="Constraint Cell Type Bias", name2="ASD Cell Type Bias",
    **scatter_opts,
)
plot_correlation_scatter_mouseCT(
    pLI_SC_Bias, DDD_SC_Bias,
    name1="Constraint Cell Type Bias", name2="DD Cell Type Bias",
    **scatter_opts,
)

In [None]:


def _clusters_in_merged(column, labels, exclude=()):
    """Return CT_Anno index labels whose `column` value is one of `labels`.

    Order follows CT_Anno; only clusters present in ct_merged_data.index are
    kept, and any cluster listed in `exclude` is dropped.
    """
    candidates = CT_Anno[CT_Anno[column].isin(labels)].index.tolist()
    return [cluster for cluster in candidates
            if cluster in ct_merged_data.index and cluster not in exclude]


# Whole classes
CNU_LGE_Cluster = _clusters_in_merged('class_id_label', ['09 CNU-LGE GABA'])
IT_ET_Cluster = _clusters_in_merged('class_id_label', ['01 IT-ET Glut'])
NP_Cluster = _clusters_in_merged('class_id_label', ['02 NP-CT-L6b Glut'])
CGE_Cluster = _clusters_in_merged('class_id_label', ['06 CTX-CGE GABA'])
MGE_Cluster = _clusters_in_merged('class_id_label', ['07 CTX-MGE GABA'])

# Striatal D1/D2 subclasses vs the remainder of the CNU-LGE GABA class
D1D2_labels = ['061 STR D1 Gaba', '062 STR D2 Gaba']
STR_D1D2 = _clusters_in_merged('subclass_id_label', D1D2_labels)
Other_LGE = _clusters_in_merged('class_id_label', ['09 CNU-LGE GABA'], exclude=STR_D1D2)

# Hippocampal glutamatergic subclasses
HIP = ['016 CA1-ProS Glut', '017 CA3 Glut']
HIP_Glut = _clusters_in_merged('subclass_id_label', HIP)

# Two thalamic subclasses (RE-Xi, PF) vs the remainder of the TH Glut class
RU_Cluster = _clusters_in_merged('subclass_id_label', ['152 RE-Xi Nox4 Glut'])
PF_Cluster = _clusters_in_merged('subclass_id_label', ['154 PF Fzd5 Glut'])
RU_PF = RU_Cluster + PF_Cluster
Other_TH_Cluster = _clusters_in_merged('class_id_label', ['18 TH Glut'], exclude=RU_PF)


### LOEUF top 25%

In [None]:
# Cutoff marking the lowest 25% of lof.oe_ci.upper values
bottom_25_percent_threshold = gnomad4["lof.oe_ci.upper"].quantile(0.25)
columns_to_keep_g4 = ["Entrez", "gene", "lof.pLI", "lof.z_score", "lof.oe_ci.upper"]

# Rows at or below the cutoff, trimmed to the reporting columns and ordered by
# lof.oe_ci.upper; Entrez is then forced to int and unmapped genes (Entrez == 0) dropped.
gnomad4_bottom25 = (
    gnomad4.loc[gnomad4["lof.oe_ci.upper"] <= bottom_25_percent_threshold, columns_to_keep_g4]
    .sort_values(by="lof.oe_ci.upper", ascending=True)
    .assign(Entrez=lambda frame: frame["Entrez"].astype(int))
    .loc[lambda frame: frame["Entrez"] != 0]
)

# NOTE(review): despite the name, this only drops rows whose "gene" value equals
# one Ensembl gene ID -- confirm this is the intended exclusion.
gnomad4_bottom25_excludeASD = gnomad4_bottom25.loc[gnomad4_bottom25["gene"].ne("ENSG00000186092")]
gnomad4_bottom25

In [None]:
# Constrained gene set = genes in the lowest 25% of lof.oe_ci.upper
print(gnomad4_bottom25.shape)
# Every gene gets the same weight of 1 (as in the pLI analysis)
constraint_gw_top_LOEUF25 = {entrez_id: 1 for entrez_id in gnomad4_bottom25["Entrez"]}
Dict2Fil(constraint_gw_top_LOEUF25, f"{ProjDIR}/dat/Genetics/GeneWeights/constraint_top25_LOEUF.gw")

# Structure-level bias, annotated with each structure's region ("Unknown" if unannotated)
constraint_top_LOEUF25_STR_Bias = MouseSTR_AvgZ_Weighted(STR_BiasMat, constraint_gw_top_LOEUF25)
constraint_top_LOEUF25_STR_Bias["Region"] = [
    STR_Anno.get(structure, "Unknown")
    for structure in constraint_top_LOEUF25_STR_Bias.index.values
]

# ASD vs constraint
asd_suffixes = ('_ASD', '_Constraint')
merged_data_ASD_Constraint_LOEUF25 = merge_str_bias_datasets(
    Spark_ASD_STR_Bias, constraint_top_LOEUF25_STR_Bias, suffixes=asd_suffixes
)
plot_structure_bias_comparison(merged_data_ASD_Constraint_LOEUF25, suffixes=asd_suffixes, metric="EFFECT")

# DD (ASD genes removed) vs constraint
dd_suffixes = ('_DD (exclude ASD)', '_Constraint')
merged_data_DDD_Constraint_LOEUF25 = merge_str_bias_datasets(
    DDD_rmASD_STR_Bias, constraint_top_LOEUF25_STR_Bias, suffixes=dd_suffixes
)
plot_structure_bias_comparison(merged_data_DDD_Constraint_LOEUF25, suffixes=dd_suffixes, metric="EFFECT")


In [None]:
# ASD vs constraint (LOEUF top 25%): residual plot for structures listed in Circuit_STRs
circuit_rows = merged_data_ASD_Constraint_LOEUF25.index.isin(Circuit_STRs)
merged_data_eval_LOEUF25 = merged_data_ASD_Constraint_LOEUF25[circuit_rows]
top_diff_ASD_LOEUF25 = plot_top_residual_structures(
    merged_data_eval_LOEUF25,
    top_n=20,
    top_threshold=40,
    name1="ASD",
    name2="Constraint",
    figsize=(6, 6),
)


In [None]:

## 2.6 Circuit Connectivity Scores

# Circuit score profiles, one per gene-set structure bias. Each profile is
# computed exactly once.
score_ASD = calculate_circuit_scores(Spark_ASD_STR_Bias, IpsiInfoMat, sort_by="EFFECT")
score_DDD = calculate_circuit_scores(DDD_STR_Bias, IpsiInfoMat, sort_by="EFFECT")
score_DDD_rmASD = calculate_circuit_scores(DDD_rmASD_STR_Bias, IpsiInfoMat, sort_by="EFFECT")
score_Constraint_LOEUF25 = calculate_circuit_scores(constraint_top_LOEUF25_STR_Bias, IpsiInfoMat, sort_by="EFFECT")

# Calculate pLI constraint structure bias and circuit score
constraint_top_PLI_STR_Bias = MouseSTR_AvgZ_Weighted(STR_BiasMat, constraint_gw_top_PLI)
constraint_top_PLI_STR_Bias["Region"] = [STR_Anno.get(ct_idx, "Unknown") for ct_idx in constraint_top_PLI_STR_Bias.index.values]
score_Constraint_PLI = calculate_circuit_scores(constraint_top_PLI_STR_Bias, IpsiInfoMat, sort_by="EFFECT")

# The curve plotted below as "Constraint Genes" is the LOEUF-top-25% profile.
# NOTE(review): this rebinding replaces the pLI-based score_Constraint assigned
# in an earlier cell; use score_Constraint_PLI when the pLI profile is needed.
score_Constraint = score_Constraint_LOEUF25

plt.style.use('seaborn-v0_8-whitegrid')
fig, ax1 = plt.subplots(1,1, dpi=480, figsize=(10,6), facecolor='none')

# Transparent figure and axes backgrounds
fig.patch.set_alpha(0)
ax1.patch.set_alpha(0)

# Percentile half-width of the control band: the band spans the
# (50 - BarLen)th to (50 + BarLen)th percentile of Cont_Distance.
BarLen = 34.1
#BarLen = 47.5

topNs = list(range(200, 5, -1))  # Define topNs based on the range used in calculate_circuit_scores

# Use different colors for each score profile
ASD_color = "#d62728"  # changed ASD to red
DDD_color = "#1f77b4"
rmASD_color = "#ff7f0e"
Constraint_color = "#2ca02c"
siblings_color = "grey"
# ax1.plot(topNs, score_ASD, color=ASD_color, marker="o", markersize=5, lw=1,
#          ls="dashed", label="ASD", alpha=0.5)
# ax1.plot(topNs, score_DDD, color=DDD_color, marker="o", markersize=5, lw=1,
#          ls="dashed", label="DD", alpha=0.9)
ax1.plot(topNs, score_DDD_rmASD, color=rmASD_color, marker="o", markersize=5, lw=1,
         ls="dashed", label="DD (exclude ASD)", alpha=0.9)
ax1.plot(topNs, score_Constraint, color=Constraint_color, marker="o", markersize=5, lw=1,
         ls="dashed", label="Constraint Genes", alpha=0.9)

# Control curve: median of Cont_Distance with asymmetric percentile error bars
cont = np.median(Cont_Distance, axis=0)
lower = np.percentile(Cont_Distance, 50-BarLen, axis=0)
upper = np.percentile(Cont_Distance, 50+BarLen, axis=0)
ax1.errorbar(topNs, cont, color=siblings_color, marker="o", markersize=1.5, lw=1,
             yerr=(cont - lower, upper - cont), ls="dashed", label="Siblings")

ax1.set_xlabel("Structure Rank\n", fontsize=17)
ax1.set_ylabel("Circuit Connectivity Score", fontsize=15)
ax1.grid(True, linestyle='--', alpha=0.7)
ax1.set_xlim(0, 121)

# Place legend inside the figure (upper right, just inside plot)
ax1.legend(fontsize=13, loc='upper right', frameon=True)
plt.tight_layout()  # Adjust layout

In [None]:
# Cell type bias already calculated in Section 3.5; load the saved table
# instead of recomputing it here.
LOEUF25_SC_Bias = pd.read_csv(ProjDIR + "/results/CT_Z2/Constraint_top25_LOEUF_bias_addP_random.csv", index_col=0)

In [None]:
# Peek at the first two rows to check the loaded columns
LOEUF25_SC_Bias.head(2)

In [None]:
# LOEUF top 25% cell-type bias against ASD, then against DD with ASD genes removed
loeuf25_axis_name = "Constraint (LOEUF25) Cell Type Bias"
scatter_opts = dict(effect_col1="EFFECT", effect_col2="EFFECT", dpi=240)
plot_correlation_scatter_mouseCT(
    LOEUF25_SC_Bias, ASD_SC_Bias,
    name1=loeuf25_axis_name, name2="ASD Cell Type Bias",
    **scatter_opts,
)
plot_correlation_scatter_mouseCT(
    LOEUF25_SC_Bias, DDD_rmASD_SC_Bias,
    name1=loeuf25_axis_name, name2="DD (exclude ASD) \nCell Type Bias",
    **scatter_opts,
)


In [None]:
# cluster_dict = {
#     "D1/D2 MSN": STR_D1D2,
#     "CNU_LGE_GABA (Other)": Other_LGE,
#     "PF_RE_TH_Glut": RU_PF,
#     "TH_Glut (Other)": Other_TH_Cluster,
#     "CNU_HYA_Glut": AMY_HYA_Glut,
#     "CNU_HYA_GABA": AMY_HYA_GABA,
#     "CTX_CGE_GABA": CGE_Cluster,
#     "IT_ET_Glut": IT_ET_Cluster,
#     "NP_CT_L6b_Glut": NP_Cluster,
#     "CTX_MGE_GABA": MGE_Cluster,
# }

# plot_palette = ["orange", "green", "purple", "red", "blue", "yellow", "pink"]

# pairwise_tests = [
#     ("D1/D2 MSN", "CNU_LGE_GABA (Other)"),
#     ("PF_RE_TH_Glut", "TH_Glut (Other)"),
#     #("CTX_CGE_GABA", ["CTX_MGE_GABA", "NP_CT_L6b_Glut", "IT_ET_Glut"]),
#     ("D1/D2 MSN", ["CTX_CGE_GABA", "CTX_MGE_GABA", "NP_CT_L6b_Glut", "IT_ET_Glut"]),
#     #("PF_RE_TH_Glut", ["CTX_CGE_GABA", "CTX_MGE_GABA", "NP_CT_L6b_Glut", "IT_ET_Glut"]),
#     #("CNU_HYA_Glut", ["CTX_CGE_GABA", "CTX_MGE_GABA", "NP_CT_L6b_Glut", "IT_ET_Glut"]),
#     ("CNU_HYA_GABA", ["CTX_CGE_GABA", "CTX_MGE_GABA", "NP_CT_L6b_Glut", "IT_ET_Glut"]),
# ]
# _ = cluster_residual_boxplot(
#     ct_merged_data_LOEUF25,
#     cluster_dict,
#     metric="residual",
#     palette=plot_palette,
#     figsize=(12,8),
#     pairwise_tests=pairwise_tests
# )

In [None]:
# Cell-type residual analysis: ASD bias merged with LOEUF-top-25% constraint bias
ct_merged_data_LOEUF25 = merge_ct_bias_datasets(
    ASD_SC_Bias,
    LOEUF25_SC_Bias,
    suffixes=("_ASD", "_Constraint"),
)

# Colour per group; the group names are annotations only and presumably follow
# the order of cluster_dict -- TODO confirm against where cluster_dict is built.
palette_by_group = [
    ("D1/D2 MSN", "orange"),
    ("CNU_LGE_GABA (Other)", "green"),
    ("PF_RE_TH_Glut", "purple"),
    ("TH_Glut (Other)", "red"),
    ("CNU_HYA_Glut", "blue"),
    ("CNU_HYA_GABA", "gold"),
    ("CTX_CGE_GABA", "pink"),
    ("IT_ET_Glut", "teal"),
    ("NP_CT_L6b_Glut", "sienna"),
    ("CTX_MGE_GABA", "indigo"),
]
plot_palette = [colour for _, colour in palette_by_group]

# Comparisons to annotate: each entry pairs one group with another group or a list of groups
cortical_groups = ["CTX_CGE_GABA", "CTX_MGE_GABA", "NP_CT_L6b_Glut", "IT_ET_Glut"]
pairwise_tests = [
    ("D1/D2 MSN", "CNU_LGE_GABA (Other)"),
    ("PF_RE_TH_Glut", "TH_Glut (Other)"),
    ("D1/D2 MSN", cortical_groups),
]

boxplot_args = dict(
    metric="residual",
    palette=plot_palette,
    figsize=(12, 8),
    pairwise_tests=pairwise_tests,
)
_ = cluster_residual_boxplot(ct_merged_data_LOEUF25, cluster_dict, **boxplot_args)


In [None]:
# Same boxplot as the previous cell, with FDR (BH) adjustment shown as stars,
# non-significant pairs hidden, wrapped tick labels and smaller, fainter points.
annotation_opts = dict(p_adjust="fdr_bh", p_style="stars", show_ns=False)
layout_opts = dict(wrap_xticks=True, wrap_len=16, point_size=2.2, point_alpha=0.16)
_ = cluster_residual_boxplot(
    ct_merged_data_LOEUF25,
    cluster_dict,
    metric="residual",
    palette=plot_palette,
    figsize=(12, 8),
    pairwise_tests=pairwise_tests,
    **annotation_opts,
    **layout_opts,
)

In [None]:
# Calculate circuit scores for LOEUF top 25%
# NOTE(review): both profiles are also computed with the same arguments in the
# "2.6 Circuit Connectivity Scores" cell above; this cell repeats that work.
score_DDD_rmASD = calculate_circuit_scores(DDD_rmASD_STR_Bias, IpsiInfoMat, sort_by="EFFECT")
score_Constraint_LOEUF25 = calculate_circuit_scores(constraint_top_LOEUF25_STR_Bias, IpsiInfoMat, sort_by="EFFECT")


In [None]:
# Circuit connectivity score vs structure rank: DD (exclude ASD) and the
# LOEUF-top-25% constraint set, with the Cont_Distance band labelled "Siblings".
plt.style.use('seaborn-v0_8-whitegrid')
fig, ax1 = plt.subplots(nrows=1, ncols=1, dpi=480, figsize=(10, 6), facecolor='none')

# Transparent figure and axes backgrounds
for background in (fig.patch, ax1.patch):
    background.set_alpha(0)

# Percentile half-width of the control band around the median
BarLen = 34.1

topNs = list(range(200, 5, -1))

# Colours per score profile (DDD_color is defined but no plain DD curve is drawn here)
DDD_color = "#1f77b4"
rmASD_color = "#ff7f0e"
Constraint_LOEUF25_color = "#2ca02c"
siblings_color = "grey"

curve_specs = [
    (score_DDD_rmASD, rmASD_color, "DD (exclude ASD)"),
    (score_Constraint_LOEUF25, Constraint_LOEUF25_color, "Constraint Genes (LOEUF top 25%)"),
]
for scores, colour, curve_label in curve_specs:
    ax1.plot(topNs, scores, color=colour, marker="o", markersize=5, lw=1,
             ls="dashed", label=curve_label, alpha=0.9)

# Control curve: median with asymmetric percentile error bars
cont = np.median(Cont_Distance, axis=0)
lower = np.percentile(Cont_Distance, 50 - BarLen, axis=0)
upper = np.percentile(Cont_Distance, 50 + BarLen, axis=0)
band = (cont - lower, upper - cont)
ax1.errorbar(topNs, cont, color=siblings_color, marker="o", markersize=1.5, lw=1,
             yerr=band, ls="dashed", label="Siblings")

ax1.set_xlabel("Structure Rank\n", fontsize=17)
ax1.set_ylabel("Circuit Connectivity Score", fontsize=15)
ax1.grid(True, linestyle='--', alpha=0.7)
ax1.set_xlim(0, 121)

ax1.legend(fontsize=13, loc='upper right', frameon=True)
plt.tight_layout()


### Comparison: pLI≥0.99 vs LOEUF top 25%

Direct comparison between the two constraint gene selection criteria


In [None]:
# Overlap between the two constraint gene sets (keys of the two gene-weight dicts)
pLI_genes = set(constraint_gw_top_PLI)
LOEUF25_genes = set(constraint_gw_top_LOEUF25)
shared_genes = pLI_genes & LOEUF25_genes
union_genes = pLI_genes | LOEUF25_genes

print(f"pLI≥0.99 genes: {len(pLI_genes)}")
print(f"LOEUF top 25% genes: {len(LOEUF25_genes)}")
print(f"Overlap: {len(shared_genes)}")
print(f"Only in pLI: {len(pLI_genes - LOEUF25_genes)}")
print(f"Only in LOEUF top 25%: {len(LOEUF25_genes - pLI_genes)}")
# Jaccard index = |intersection| / |union|
print(f"Jaccard index: {len(shared_genes) / len(union_genes):.3f}")


In [None]:
# Structure bias of the pLI set plotted directly against that of the LOEUF-top-25% set
constraint_pair_suffixes = ('_pLI', '_LOEUF25')
merged_data_pLI_LOEUF25 = merge_str_bias_datasets(
    constraint_top_PLI_STR_Bias,
    constraint_top_LOEUF25_STR_Bias,
    suffixes=constraint_pair_suffixes,
)
plot_structure_bias_comparison(merged_data_pLI_LOEUF25, suffixes=constraint_pair_suffixes, metric="EFFECT")


In [None]:
# Summary comparison of correlations
from scipy.stats import pearsonr

# Column names follow the suffixes passed to merge_str_bias_datasets when each
# merged frame was built: the pLI/DD merge used '_DD', and the LOEUF25/DD merge
# used '_DD (exclude ASD)'. There is no 'EFFECT_DDD' column in either frame.
DD_EFFECT_COL_PLI = "EFFECT_DD"
DD_EFFECT_COL_LOEUF25 = "EFFECT_DD (exclude ASD)"

# Calculate correlations for pLI
corr_pLI_ASD, pval_pLI_ASD = pearsonr(merged_data_ASD_Constraint_PLI["EFFECT_ASD"], merged_data_ASD_Constraint_PLI["EFFECT_Constraint"])
corr_pLI_DDD, pval_pLI_DDD = pearsonr(merged_data_DDD_Constraint_PLI[DD_EFFECT_COL_PLI], merged_data_DDD_Constraint_PLI["EFFECT_Constraint"])

# Calculate correlations for LOEUF top 25%
# (the DD frame here was built from the DD gene set with ASD genes removed)
corr_LOEUF25_ASD, pval_LOEUF25_ASD = pearsonr(merged_data_ASD_Constraint_LOEUF25["EFFECT_ASD"], merged_data_ASD_Constraint_LOEUF25["EFFECT_Constraint"])
corr_LOEUF25_DDD, pval_LOEUF25_DDD = pearsonr(merged_data_DDD_Constraint_LOEUF25[DD_EFFECT_COL_LOEUF25], merged_data_DDD_Constraint_LOEUF25["EFFECT_Constraint"])

print("=" * 60)
print("Structure Bias Correlations")
print("=" * 60)
print(f"\npLI≥0.99 ({len(pLI_genes)} genes):")
print(f"  ASD correlation:  r = {corr_pLI_ASD:.3f}, p = {pval_pLI_ASD:.2e}")
print(f"  DDD correlation:  r = {corr_pLI_DDD:.3f}, p = {pval_pLI_DDD:.2e}")
print(f"\nLOEUF top 25% ({len(LOEUF25_genes)} genes):")
print(f"  ASD correlation:  r = {corr_LOEUF25_ASD:.3f}, p = {pval_LOEUF25_ASD:.2e}")
print(f"  DDD correlation:  r = {corr_LOEUF25_DDD:.3f}, p = {pval_LOEUF25_DDD:.2e}")
print("=" * 60)


In [None]:
# Cell Type bias correlation comparison
print("=" * 60)
print("Cell Type Bias Correlations")
print("=" * 60)

# Each merge joins the two EFFECT columns on the shared cell-type index; because
# both inputs have a column named 'EFFECT', pandas appends the given suffixes,
# so the merged columns are 'EFFECT<left suffix>' and 'EFFECT<right suffix>'.

# Calculate correlations for pLI cell type bias
merged_pLI_ASD_CT = pd.merge(pLI_SC_Bias[['EFFECT']], ASD_SC_Bias[['EFFECT']], 
                             left_index=True, right_index=True, suffixes=('_pLI', '_ASD'))
corr_ct_pLI_ASD, pval_ct_pLI_ASD = pearsonr(merged_pLI_ASD_CT['EFFECT_pLI'], merged_pLI_ASD_CT['EFFECT_ASD'])

merged_pLI_DDD_CT = pd.merge(pLI_SC_Bias[['EFFECT']], DDD_SC_Bias[['EFFECT']], 
                             left_index=True, right_index=True, suffixes=('_pLI', '_DD'))
# Right-hand column is 'EFFECT_DD' (suffix '_DD'), not 'EFFECT_DDD'
corr_ct_pLI_DDD, pval_ct_pLI_DDD = pearsonr(merged_pLI_DDD_CT['EFFECT_pLI'], merged_pLI_DDD_CT['EFFECT_DD'])

# Calculate correlations for LOEUF top 25% cell type bias
merged_LOEUF25_ASD_CT = pd.merge(LOEUF25_SC_Bias[['EFFECT']], ASD_SC_Bias[['EFFECT']], 
                                 left_index=True, right_index=True, suffixes=('_LOEUF25', '_ASD'))
corr_ct_LOEUF25_ASD, pval_ct_LOEUF25_ASD = pearsonr(merged_LOEUF25_ASD_CT['EFFECT_LOEUF25'], merged_LOEUF25_ASD_CT['EFFECT_ASD'])

merged_LOEUF25_DDD_CT = pd.merge(LOEUF25_SC_Bias[['EFFECT']], DDD_SC_Bias[['EFFECT']], 
                                 left_index=True, right_index=True, suffixes=('_LOEUF25', '_DD'))
# Right-hand column is 'EFFECT_DD' (suffix '_DD'), not 'EFFECT_DDD'
corr_ct_LOEUF25_DDD, pval_ct_LOEUF25_DDD = pearsonr(merged_LOEUF25_DDD_CT['EFFECT_LOEUF25'], merged_LOEUF25_DDD_CT['EFFECT_DD'])

print(f"\npLI≥0.99:")
print(f"  ASD correlation:  r = {corr_ct_pLI_ASD:.3f}, p = {pval_ct_pLI_ASD:.2e}")
print(f"  DDD correlation:  r = {corr_ct_pLI_DDD:.3f}, p = {pval_ct_pLI_DDD:.2e}")
print(f"\nLOEUF top 25%:")
print(f"  ASD correlation:  r = {corr_ct_LOEUF25_ASD:.3f}, p = {pval_ct_LOEUF25_ASD:.2e}")
print(f"  DDD correlation:  r = {corr_ct_LOEUF25_DDD:.3f}, p = {pval_ct_LOEUF25_DDD:.2e}")
print("=" * 60)


In [None]:
# Side-by-side comparison: pLI vs LOEUF top 25% circuit scores
plt.style.use('seaborn-v0_8-whitegrid')
fig, (ax1, ax2) = plt.subplots(1, 2, dpi=480, figsize=(18,6), facecolor='none')

# Transparent figure and axes backgrounds
fig.patch.set_alpha(0)
for ax in [ax1, ax2]:
    ax.patch.set_alpha(0)

# Percentile half-width of the control band around the median
BarLen = 34.1
topNs = list(range(200, 5, -1))

# Color scheme
ASD_color = "#d62728"
DDD_color = "#1f77b4"
rmASD_color = "#ff7f0e"
Constraint_color = "#2ca02c"
siblings_color = "grey"

# Control curve shared by both panels: median with asymmetric percentile error bars
cont = np.median(Cont_Distance, axis=0)
lower = np.percentile(Cont_Distance, 50-BarLen, axis=0)
upper = np.percentile(Cont_Distance, 50+BarLen, axis=0)

# Each panel differs only in its constraint profile, legend label and title.
# The pLI panel uses score_Constraint_PLI explicitly: score_Constraint is rebound
# to the LOEUF-top-25% profile in the "2.6 Circuit Connectivity Scores" cell, so
# plotting it here would show the LOEUF curve in both panels.
panel_specs = [
    (ax1, score_Constraint_PLI, "Constraint (pLI≥0.99)", "pLI≥0.99"),
    (ax2, score_Constraint_LOEUF25, "Constraint (LOEUF top 25%)", "LOEUF top 25%"),
]
for ax, constraint_scores, constraint_label, panel_title in panel_specs:
    ax.plot(topNs, score_DDD, color=DDD_color, marker="o", markersize=5, lw=1,
            ls="dashed", label="DD", alpha=0.9)
    ax.plot(topNs, score_DDD_rmASD, color=rmASD_color, marker="o", markersize=5, lw=1,
            ls="dashed", label="DD (exclude ASD)", alpha=0.9)
    ax.plot(topNs, constraint_scores, color=Constraint_color, marker="o", markersize=5, lw=1,
            ls="dashed", label=constraint_label, alpha=0.9)
    ax.errorbar(topNs, cont, color=siblings_color, marker="o", markersize=1.5, lw=1,
                yerr=(cont - lower, upper - cont), ls="dashed", label="Siblings")
    ax.set_xlabel("Structure Rank", fontsize=15)
    ax.set_ylabel("Circuit Connectivity Score", fontsize=15)
    ax.set_title(panel_title, fontsize=16, fontweight='bold')
    ax.grid(True, linestyle='--', alpha=0.7)
    ax.set_xlim(0, 121)
    ax.legend(fontsize=11, loc='upper right', frameon=True)

plt.tight_layout()


## Summary of pLI≥0.99 vs LOEUF top 25% Analysis

### Key Findings:

**Gene Selection:**
- **pLI≥0.99** (figure label): Uses genes with a high probability of being loss-of-function intolerant; the code selects genes with pLI strictly greater than 0.99
- **LOEUF top 25%**: Uses genes in the bottom quartile of LOEUF scores (most constrained)

**Analyses Completed:**

1. **Structure Bias Analysis**
   - Calculated bias for both constraint criteria
   - Compared correlations with ASD and DDD structure biases
   
2. **Residual Analysis** 
   - Identified brain structures with largest differences between ASD/DDD and constraint patterns
   - Focused on circuit-relevant structures
   
3. **Cell Type Bias Analysis**
   - Computed cell type-specific biases for both criteria
   - Assessed correlations with ASD and DDD cell type patterns
   - Performed residual boxplot analysis across cell type classes
   
4. **Circuit Connectivity Scores**
   - Calculated circuit scores based on structure bias rankings
   - Compared with DDD and control distributions

5. **Direct Comparison**
   - Gene overlap between the two selection criteria
   - Correlation comparison for both structure and cell type levels
   - Side-by-side visualization of circuit scores

This comprehensive analysis allows direct comparison of results using different constraint gene selection approaches.


In [None]:
# TOPN CONSTRAINED GENES
# NOTE(review): TOPN_CONS is not referenced by the cells that follow in this
# section (the next cell uses a literal 2000) -- confirm whether it is still needed.
TOPN_CONS = 1000


In [None]:
# First 2000 rows of the bottom-10% table (already sorted by ascending lof.oe_ci.upper).
# NOTE(review): the "top61" names do not match the 2000-row cut -- confirm the intended size.
gnomad4_constraint_top61 = gnomad4_bottom10.head(2000)

# Weight each gene by the reciprocal of lof.oe_ci.upper
loeuf_upper = gnomad4_constraint_top61["lof.oe_ci.upper"]
constraint_gw_top61 = dict(zip(gnomad4_constraint_top61["Entrez"], 1 / loeuf_upper))
Dict2Fil(constraint_gw_top61, f"{ProjDIR}/dat/Genetics/GeneWeights/constraint_top_decile_LOEUF_top61.gw")

# Structure-level bias, annotated with each structure's region ("Unknown" if unannotated)
constraint_top61_STR_Bias = MouseSTR_AvgZ_Weighted(STR_BiasMat, constraint_gw_top61)
constraint_top61_STR_Bias["Region"] = [
    STR_Anno.get(structure, "Unknown")
    for structure in constraint_top61_STR_Bias.index.values
]

In [None]:
# ASD structure bias vs the bias of the head(2000) constraint subset
asd_constraint_suffixes = ('_ASD', '_Constraint')
merged_data_ASD_Constraint = merge_str_bias_datasets(
    Spark_ASD_STR_Bias,
    constraint_top61_STR_Bias,
    suffixes=asd_constraint_suffixes,
)
plot_structure_bias_comparison(merged_data_ASD_Constraint, suffixes=asd_constraint_suffixes, metric="EFFECT")

In [None]:
# Gene weights for the whole bottom-10% table: reciprocal of lof.oe_ci.upper per Entrez ID
loeuf_upper_bottom10 = gnomad4_bottom10["lof.oe_ci.upper"]
constraint_gw = dict(zip(gnomad4_bottom10["Entrez"], 1 / loeuf_upper_bottom10))
Dict2Fil(constraint_gw, f"{ProjDIR}/dat/Genetics/GeneWeights/constraint_top_decile_LOEUF.gw")

# Structure-level bias, annotated with each structure's region ("Unknown" if unannotated)
constraint_STR_Bias = MouseSTR_AvgZ_Weighted(STR_BiasMat, constraint_gw)
constraint_STR_Bias["Region"] = [
    STR_Anno.get(structure, "Unknown")
    for structure in constraint_STR_Bias.index.values
]


In [None]:
# ASD structure bias vs the bias of the full bottom-10% constraint set
asd_constraint_suffixes = ('_ASD', '_Constraint')
merged_data_ASD_Constraint = merge_str_bias_datasets(
    Spark_ASD_STR_Bias,
    constraint_STR_Bias,
    suffixes=asd_constraint_suffixes,
)
plot_structure_bias_comparison(merged_data_ASD_Constraint, suffixes=asd_constraint_suffixes, metric="EFFECT")

In [None]:
# DD (ASD genes removed) structure bias vs the bias of the full bottom-10% constraint set
dd_constraint_suffixes = ('_DD', '_Constraint')
merged_data_DDD_Constraint = merge_str_bias_datasets(
    DDD_rmASD_STR_Bias,
    constraint_STR_Bias,
    suffixes=dd_constraint_suffixes,
)
plot_structure_bias_comparison(merged_data_DDD_Constraint, suffixes=dd_constraint_suffixes, metric="EFFECT")

In [None]:
# Constraint gene weights with every gene in ASD_GENES removed
constraint_GW_filt_ASD = dict(
    (entrez_id, weight)
    for entrez_id, weight in constraint_gw.items()
    if entrez_id not in ASD_GENES
)
Dict2Fil(constraint_GW_filt_ASD, f"{ProjDIR}/dat/Genetics/GeneWeights/constraint_top_decile_LOEUF_excludeASD.gw")
print(len(constraint_GW_filt_ASD))

# Structure-level bias, annotated with each structure's region ("Unknown" if unannotated)
Constraint_rmASD_STR_Bias = MouseSTR_AvgZ_Weighted(STR_BiasMat, constraint_GW_filt_ASD)
Constraint_rmASD_STR_Bias["Region"] = [
    STR_Anno.get(structure, "Unknown")
    for structure in Constraint_rmASD_STR_Bias.index.values
]

In [None]:
# ASD structure bias vs the bias of the constraint set with ASD genes removed
exclude_asd_suffixes = ('_ASD', '_Constrain_ExcludeASD')
merged_data_ASD_Constraint_excludeASD = merge_str_bias_datasets(
    Spark_ASD_STR_Bias,
    Constraint_rmASD_STR_Bias,
    suffixes=exclude_asd_suffixes,
)
plot_structure_bias_comparison(merged_data_ASD_Constraint_excludeASD, suffixes=exclude_asd_suffixes, metric="EFFECT")

### Cell Types

In [None]:
# Scratch arithmetic (evaluates to 17658).
# NOTE(review): what this number represents is not stated in this section -- confirm or remove.
218 * (3**4)

In [None]:
# fig, axes = plt.subplots(2, 1, figsize=(8, 8))
# plot_boxplot_mouseCT(Constraint_CT_Bias, ClusterAnn, ALL_Mouse_Class, "EFFECT", ax=axes[0]) 
# plot_boxplot_mouseCT(Constraint_CT_Bias, ClusterAnn, ALL_Mouse_Class, "-logP", ax=axes[1]) 
# plt.tight_layout()
# plt.show()

In [None]:
# Null model: draw random gene sets from the constraint genes, each the same
# size as the ASD gene set, and assign them the ASD weights in ASD_GW order.
NULL_SEED = 0          # fixed seed so the null distribution (and its p-values) is reproducible
N_NULL_DRAWS = 10000   # number of random gene sets

Geneset = list(constraint_gw.keys())
Weights = list(ASD_GW.values())
if len(Weights) > len(Geneset):
    raise ValueError(
        f"Cannot sample {len(Weights)} genes without replacement from a pool of {len(Geneset)}"
    )

# Dedicated generator: seeding it does not disturb the global `random` state
null_rng = random.Random(NULL_SEED)
tmp_bias_dfs = []
for _ in range(N_NULL_DRAWS):
    tmp_geneset = null_rng.sample(Geneset, len(Weights))
    tmp_gw = dict(zip(tmp_geneset, Weights))
    tmp_bias = MouseSTR_AvgZ_Weighted(STR_BiasMat, tmp_gw)
    tmp_bias["Region"] = [STR_Anno.get(ct_idx, "Unknown") for ct_idx in tmp_bias.index.values]
    tmp_bias_dfs.append(tmp_bias)

In [None]:
def plot_null_distribution_analysis(structure_name, null_dfs, observed_df, title_prefix="", plot=True):
    """
    Compare one structure's observed EFFECT against a null distribution.

    Parameters:
    - structure_name: Index label of the structure looked up in every dataframe
    - null_dfs: Iterable of dataframes, each providing an "EFFECT" column
    - observed_df: Dataframe providing the observed "EFFECT" value
    - title_prefix: Text placed in front of the plot title
    - plot: When True, draw the histogram and print summary statistics

    Returns:
    - Tuple (p_value, observed_effect, null_effects); null_effects is a NumPy array
    """
    observed_effect = observed_df.loc[structure_name, "EFFECT"]
    null_effects = np.array([df.loc[structure_name, "EFFECT"] for df in null_dfs])

    # One-tailed empirical p-value (observed > null); 1 is added to both the
    # count and the total.
    n_at_least_observed = np.sum(null_effects >= observed_effect)
    p_value = (n_at_least_observed + 1) / (len(null_effects) + 1)

    if not plot:
        return p_value, observed_effect, null_effects

    # Histogram of the null with the observed value marked
    plt.figure(figsize=(10, 6))
    plt.hist(
        null_effects,
        bins=50,
        alpha=0.7,
        color='lightblue',
        edgecolor='black',
        label='Null distribution (Constraint Genes)',
    )
    plt.axvline(
        observed_effect,
        color='red',
        linestyle='--',
        linewidth=2,
        label=f'Observed (Spark ASD): {observed_effect:.4f}',
    )
    plt.xlabel('EFFECT')
    plt.ylabel('Frequency')
    plt.title(f'{title_prefix}{structure_name} EFFECT: Null Distribution vs Observed')
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.text(
        0.05,
        0.95,
        f'P-value: {p_value:.4f}',
        transform=plt.gca().transAxes,
        bbox=dict(boxstyle='round', facecolor='white', alpha=0.8),
    )
    plt.show()

    # Text summary printed under the figure
    print(f"Observed Spark ASD effect: {observed_effect:.4f}")
    print(f"Null mean: {np.mean(null_effects):.4f}")
    print(f"Null std: {np.std(null_effects):.4f}")
    print(f"P-value: {p_value:.4f}")

    return p_value, observed_effect, null_effects
# Nucleus accumbens: observed SPARK ASD EFFECT against the nulls in tmp_bias_dfs
p_value, observed_effect, null_effects = plot_null_distribution_analysis(
    "Nucleus_accumbens", tmp_bias_dfs, Spark_ASD_STR_Bias
)

In [None]:
# Caudoputamen: observed SPARK ASD EFFECT against the nulls in tmp_bias_dfs
p_value, observed_effect, null_effects = plot_null_distribution_analysis(
    "Caudoputamen", tmp_bias_dfs, Spark_ASD_STR_Bias
)

In [None]:
# Empirical p-value for every structure in Spark_ASD_STR_Bias; plotting is
# switched off so no figures are drawn.
P_constraint = {}
for structure in Spark_ASD_STR_Bias.index:
    p_value, observed_effect, null_effects = plot_null_distribution_analysis(
        structure,
        tmp_bias_dfs,
        Spark_ASD_STR_Bias,
        title_prefix="",
        plot=False,
    )
    P_constraint[structure] = p_value

# Attach the p-values to a copy so Spark_ASD_STR_Bias itself is left unchanged.
Spark_ASD_STR_Bias_with_p = Spark_ASD_STR_Bias.copy()
Spark_ASD_STR_Bias_with_p['P_constraint'] = (
    Spark_ASD_STR_Bias_with_p.index.map(P_constraint)
)


In [None]:
# Display the SPARK ASD bias table together with its P_constraint column.
Spark_ASD_STR_Bias_with_p

In [None]:
# Structures with P_constraint below 0.05, smallest p-value first
is_below_005 = Spark_ASD_STR_Bias_with_p["P_constraint"].lt(0.05)
Spark_ASD_STR_Bias_with_p.loc[is_below_005].sort_values("P_constraint")

In [None]:
# Structures with P_constraint above 0.1
is_above_01 = Spark_ASD_STR_Bias_with_p["P_constraint"].gt(0.1)
Spark_ASD_STR_Bias_with_p.loc[is_above_01]

In [None]:
# Facial motor nucleus: observed SPARK ASD EFFECT against the nulls in tmp_bias_dfs
p_value, observed_effect, null_effects = plot_null_distribution_analysis(
    "Facial_motor_nucleus", tmp_bias_dfs, Spark_ASD_STR_Bias
)

## Comprehensive Decile Analysis

This section analyzes the correlation between ASD/DDD structure bias and constraint genes across **all 10 deciles** of LOEUF (Loss-Of-Function Observed/Expected Upper Bound Fraction).

**Decile Definition:**
- **Decile 1**: Most constrained genes (lowest LOEUF values)
- **Decile 10**: Least constrained genes (highest LOEUF values)

**Analysis:**
1. For each decile, we calculate structure bias using the genes within that constraint range, weighting each gene by 1/LOEUF
2. We compute correlation with ASD and DDD (excluding ASD genes) structure bias patterns
3. Statistical significance is tested using Pearson correlation p-values

**Key Questions:**
- Does the correlation strength change across constraint levels?
- Are the most constrained genes most similar to ASD/DDD patterns?
- Is there a gradient of similarity across the constraint spectrum?


In [None]:
### Analysis across all 10 deciles of constraint
# Split genes into 10 deciles of lof.oe_ci.upper (LOEUF). For each decile,
# compute a 1/LOEUF-weighted structure bias and correlate its EFFECT column
# with the SPARK ASD and the DDD (ASD-removed) structure biases.
loeuf_all = gnomad4["lof.oe_ci.upper"]
columns_to_keep = ["Entrez", "gene", "lof.pLI", "lof.z_score", "lof.oe_ci.upper"]
decile_results = []

for decile_num in range(1, 11):
    # Calculate decile boundaries
    lower_quantile = (decile_num - 1) / 10
    upper_quantile = decile_num / 10

    lower_threshold = loeuf_all.quantile(lower_quantile)
    upper_threshold = loeuf_all.quantile(upper_quantile)

    # Deciles are (lower, upper] intervals, except the first one, which is
    # closed on the left: quantile(0) is the minimum, so a strict ">" would
    # silently drop the gene(s) with the lowest LOEUF.
    if decile_num == 1:
        above_lower = loeuf_all >= lower_threshold
    else:
        above_lower = loeuf_all > lower_threshold
    in_decile = above_lower & (loeuf_all <= upper_threshold)

    # Prepare data; rows whose Entrez ID is 0 are discarded
    gnomad4_decile = gnomad4.loc[in_decile, columns_to_keep].copy()
    gnomad4_decile["Entrez"] = gnomad4_decile["Entrez"].astype(int)
    gnomad4_decile = gnomad4_decile[gnomad4_decile["Entrez"] != 0]

    # Create gene weights (inverse of LOEUF)
    constraint_gw_decile = dict(zip(gnomad4_decile["Entrez"], 1/gnomad4_decile["lof.oe_ci.upper"]))

    # Calculate structure bias
    constraint_STR_Bias_decile = MouseSTR_AvgZ_Weighted(STR_BiasMat, constraint_gw_decile)
    constraint_STR_Bias_decile["Region"] = [STR_Anno.get(ct_idx, "Unknown") for ct_idx in constraint_STR_Bias_decile.index.values]

    # Calculate correlations with ASD
    merged_ASD = merge_str_bias_datasets(Spark_ASD_STR_Bias, constraint_STR_Bias_decile, suffixes=('_ASD', '_Constraint'))
    corr_ASD = merged_ASD["EFFECT_ASD"].corr(merged_ASD["EFFECT_Constraint"])

    # Calculate correlations with DDD. The suffix must be '_DDD' so that the
    # merged column is the "EFFECT_DDD" read on the next line.
    merged_DDD = merge_str_bias_datasets(DDD_rmASD_STR_Bias, constraint_STR_Bias_decile, suffixes=('_DDD', '_Constraint'))
    corr_DDD = merged_DDD["EFFECT_DDD"].corr(merged_DDD["EFFECT_Constraint"])

    # Store results
    decile_results.append({
        'Decile': decile_num,
        'N_genes': len(gnomad4_decile),
        'LOEUF_min': lower_threshold,
        'LOEUF_max': upper_threshold,
        'LOEUF_mean': gnomad4_decile["lof.oe_ci.upper"].mean(),
        'Correlation_ASD': corr_ASD,
        'Correlation_DDD': corr_DDD
    })

    print(f"Decile {decile_num}: N={len(gnomad4_decile)}, LOEUF=[{lower_threshold:.3f}, {upper_threshold:.3f}], Corr_ASD={corr_ASD:.3f}, Corr_DDD={corr_DDD:.3f}")

# Convert to DataFrame
decile_results_df = pd.DataFrame(decile_results)
decile_results_df


In [None]:
### Visualize correlation trends across constraint deciles
def _draw_correlation_panel(panel_ax, x_column, x_label, panel_title, xticks=None):
    """Plot the ASD and DDD correlation curves from decile_results_df against one x column."""
    curves = (
        ('Correlation_ASD', 'o', 'ASD vs Constraint', '#1f77b4'),
        ('Correlation_DDD', 's', 'DD (excl. ASD) vs Constraint', '#ff7f0e'),
    )
    for y_column, marker_shape, legend_label, line_colour in curves:
        panel_ax.plot(
            decile_results_df[x_column],
            decile_results_df[y_column],
            marker=marker_shape,
            markersize=8,
            linewidth=2,
            label=legend_label,
            color=line_colour,
        )
    panel_ax.set_xlabel(x_label, fontsize=14, fontweight='bold')
    panel_ax.set_ylabel('Correlation with Structure Bias', fontsize=14, fontweight='bold')
    panel_ax.set_title(panel_title, fontsize=16, fontweight='bold', pad=20)
    panel_ax.legend(fontsize=12, loc='best')
    panel_ax.grid(True, alpha=0.3, linestyle='--')
    if xticks is not None:
        panel_ax.set_xticks(xticks)
    # Zero-correlation reference line
    panel_ax.axhline(y=0, color='gray', linestyle='--', alpha=0.5)


fig, axes = plt.subplots(1, 2, figsize=(16, 6), dpi=240)
ax1, ax2 = axes[0], axes[1]

# Left panel: correlation against decile number (one tick per decile)
_draw_correlation_panel(
    ax1,
    'Decile',
    'Constraint Decile\n(1=Most Constrained, 10=Least Constrained)',
    'Correlation vs Constraint Decile',
    xticks=range(1, 11),
)

# Right panel: correlation against the mean LOEUF of each decile
_draw_correlation_panel(
    ax2,
    'LOEUF_mean',
    'Mean LOEUF\n(Lower = More Constrained)',
    'Correlation vs Mean LOEUF',
)

plt.tight_layout()
plt.show()


In [None]:
### Statistical significance testing for correlations
from scipy.stats import pearsonr


def _significance_label(p_value):
    """Map a p-value to '***' (<0.001), '**' (<0.01), '*' (<0.05) or 'ns'."""
    if p_value < 0.001:
        return '***'
    if p_value < 0.01:
        return '**'
    if p_value < 0.05:
        return '*'
    return 'ns'


# Recompute the per-decile structure bias and, this time, keep the Pearson
# p-value alongside each correlation.
loeuf_all = gnomad4["lof.oe_ci.upper"]
columns_to_keep = ["Entrez", "gene", "lof.pLI", "lof.z_score", "lof.oe_ci.upper"]
decile_results_pval = []

for decile_num in range(1, 11):
    # Calculate decile boundaries
    lower_quantile = (decile_num - 1) / 10
    upper_quantile = decile_num / 10

    lower_threshold = loeuf_all.quantile(lower_quantile)
    upper_threshold = loeuf_all.quantile(upper_quantile)

    # Deciles are (lower, upper] intervals, except the first one, which is
    # closed on the left: quantile(0) is the minimum, so a strict ">" would
    # silently drop the gene(s) with the lowest LOEUF.
    if decile_num == 1:
        above_lower = loeuf_all >= lower_threshold
    else:
        above_lower = loeuf_all > lower_threshold
    in_decile = above_lower & (loeuf_all <= upper_threshold)

    # Prepare data; rows whose Entrez ID is 0 are discarded
    gnomad4_decile = gnomad4.loc[in_decile, columns_to_keep].copy()
    gnomad4_decile["Entrez"] = gnomad4_decile["Entrez"].astype(int)
    gnomad4_decile = gnomad4_decile[gnomad4_decile["Entrez"] != 0]

    # Create gene weights (inverse of LOEUF)
    constraint_gw_decile = dict(zip(gnomad4_decile["Entrez"], 1/gnomad4_decile["lof.oe_ci.upper"]))

    # Calculate structure bias
    constraint_STR_Bias_decile = MouseSTR_AvgZ_Weighted(STR_BiasMat, constraint_gw_decile)
    constraint_STR_Bias_decile["Region"] = [STR_Anno.get(ct_idx, "Unknown") for ct_idx in constraint_STR_Bias_decile.index.values]

    # Calculate correlations and p-values with ASD
    merged_ASD = merge_str_bias_datasets(Spark_ASD_STR_Bias, constraint_STR_Bias_decile, suffixes=('_ASD', '_Constraint'))
    corr_ASD, pval_ASD = pearsonr(merged_ASD["EFFECT_ASD"], merged_ASD["EFFECT_Constraint"])

    # Calculate correlations and p-values with DDD. The suffix must be '_DDD'
    # so that the merged column is the "EFFECT_DDD" read on the next line.
    merged_DDD = merge_str_bias_datasets(DDD_rmASD_STR_Bias, constraint_STR_Bias_decile, suffixes=('_DDD', '_Constraint'))
    corr_DDD, pval_DDD = pearsonr(merged_DDD["EFFECT_DDD"], merged_DDD["EFFECT_Constraint"])

    # Store results
    decile_results_pval.append({
        'Decile': decile_num,
        'N_genes': len(gnomad4_decile),
        'LOEUF_mean': gnomad4_decile["lof.oe_ci.upper"].mean(),
        'Correlation_ASD': corr_ASD,
        'P_value_ASD': pval_ASD,
        'Correlation_DDD': corr_DDD,
        'P_value_DDD': pval_DDD,
        'Sig_ASD': _significance_label(pval_ASD),
        'Sig_DDD': _significance_label(pval_DDD)
    })

# Convert to DataFrame
decile_results_with_pval = pd.DataFrame(decile_results_pval)
decile_results_with_pval


In [None]:
### Bar plot with significance annotations
# Grouped bars of the ASD and DDD correlations per decile, with significance
# stars drawn above positive bars and below non-positive ones.
fig, ax = plt.subplots(figsize=(14, 7), dpi=240)

x = np.arange(len(decile_results_with_pval))
width = 0.35

# Create bars
bars1 = ax.bar(x - width/2, decile_results_with_pval['Correlation_ASD'], width,
               label='ASD vs Constraint', color='#1f77b4', alpha=0.8, edgecolor='black', linewidth=1)
bars2 = ax.bar(x + width/2, decile_results_with_pval['Correlation_DDD'], width,
               label='DD (excl. ASD) vs Constraint', color='#ff7f0e', alpha=0.8, edgecolor='black', linewidth=1)

# Add significance annotations; 'ns' results get no label
annotated_series = (
    (-width/2, 'Correlation_ASD', 'Sig_ASD'),
    (width/2, 'Correlation_DDD', 'Sig_DDD'),
)
for i, (idx, row) in enumerate(decile_results_with_pval.iterrows()):
    for x_offset, corr_column, sig_column in annotated_series:
        if row[sig_column] == 'ns':
            continue
        corr_value = row[corr_column]
        is_positive = corr_value > 0
        ax.text(i + x_offset,
                corr_value + 0.02 if is_positive else corr_value - 0.05,
                row[sig_column], ha='center', va='bottom' if is_positive else 'top',
                fontsize=10, fontweight='bold')

# Customize plot
ax.set_xlabel('Constraint Decile (1=Most Constrained, 10=Least Constrained)', fontsize=14, fontweight='bold')
ax.set_ylabel('Correlation with Structure Bias', fontsize=14, fontweight='bold')
ax.set_title('Structure Bias Correlation Across Constraint Deciles\n(*** p<0.001, ** p<0.01, * p<0.05)',
             fontsize=16, fontweight='bold', pad=20)
ax.set_xticks(x)
# Label each tick with the decile number (1-10) from the 'Decile' column, not
# the 0-based row index, so the ticks agree with the x-axis title.
ax.set_xticklabels(
    [
        f'{int(decile)}\n({loeuf_mean:.2f})'
        for decile, loeuf_mean in zip(decile_results_with_pval['Decile'], decile_results_with_pval['LOEUF_mean'])
    ],
    fontsize=10,
)
ax.legend(fontsize=12, loc='best')
ax.axhline(y=0, color='black', linestyle='-', alpha=0.3, linewidth=1)
ax.grid(True, alpha=0.3, linestyle='--', axis='y')
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)

plt.tight_layout()
plt.show()


In [None]:
### Helper function to visualize specific deciles in detail
def plot_decile_comparison(decile_num, dataset1, dataset1_name="ASD", dataset2=None, dataset2_name="DDD"):
    """
    Plot detailed scatter comparison for a specific constraint decile.

    Reads the notebook-level gnomad4, STR_BiasMat and STR_Anno objects.

    Parameters:
    -----------
    decile_num : int
        Decile number (1-10) to visualize
    dataset1 : DataFrame
        First dataset for comparison (e.g., Spark_ASD_STR_Bias)
    dataset1_name : str
        Name of first dataset; used as the merge suffix f'_{dataset1_name}'
    dataset2 : DataFrame, optional
        Second dataset for comparison (e.g., DDD_rmASD_STR_Bias)
    dataset2_name : str
        Name of second dataset; used as the merge suffix f'_{dataset2_name}'

    Returns:
    --------
    tuple
        (constraint_STR_Bias_decile, gnomad4_decile): the structure bias
        computed for the decile and the gnomad4 rows that went into it.

    Raises:
    -------
    ValueError
        If decile_num is outside 1-10.
    """
    if not 1 <= decile_num <= 10:
        raise ValueError(f"decile_num must be between 1 and 10, got {decile_num!r}")

    # Calculate decile boundaries
    lower_quantile = (decile_num - 1) / 10
    upper_quantile = decile_num / 10

    loeuf_all = gnomad4["lof.oe_ci.upper"]
    lower_threshold = loeuf_all.quantile(lower_quantile)
    upper_threshold = loeuf_all.quantile(upper_quantile)

    # Deciles are (lower, upper] intervals, except the first one, which is
    # closed on the left: quantile(0) is the minimum, so a strict ">" would
    # silently drop the gene(s) with the lowest LOEUF.
    if decile_num == 1:
        above_lower = loeuf_all >= lower_threshold
    else:
        above_lower = loeuf_all > lower_threshold
    gnomad4_decile = gnomad4[above_lower & (loeuf_all <= upper_threshold)]

    # Prepare data; rows whose Entrez ID is 0 are discarded
    columns_to_keep = ["Entrez", "gene", "lof.pLI", "lof.z_score", "lof.oe_ci.upper"]
    gnomad4_decile = gnomad4_decile[columns_to_keep].copy()
    gnomad4_decile["Entrez"] = gnomad4_decile["Entrez"].astype(int)
    gnomad4_decile = gnomad4_decile[gnomad4_decile["Entrez"] != 0]

    # Create gene weights (inverse of LOEUF)
    constraint_gw_decile = dict(zip(gnomad4_decile["Entrez"], 1/gnomad4_decile["lof.oe_ci.upper"]))

    # Calculate structure bias
    constraint_STR_Bias_decile = MouseSTR_AvgZ_Weighted(STR_BiasMat, constraint_gw_decile)
    constraint_STR_Bias_decile["Region"] = [STR_Anno.get(ct_idx, "Unknown") for ct_idx in constraint_STR_Bias_decile.index.values]

    print(f"Decile {decile_num}: N_genes={len(gnomad4_decile)}, LOEUF range=[{lower_threshold:.3f}, {upper_threshold:.3f}]")

    # Create plots
    # NOTE(review): fig/axes are created here but never passed to
    # plot_structure_bias_comparison; if that helper opens its own figure,
    # this leaves an empty figure behind — confirm and either pass an axis
    # or drop this block.
    n_plots = 1 if dataset2 is None else 2
    fig, axes = plt.subplots(1, n_plots, figsize=(8*n_plots, 6), dpi=240)
    if n_plots == 1:
        axes = [axes]

    # Plot 1: dataset1 vs constraint
    merged1 = merge_str_bias_datasets(dataset1, constraint_STR_Bias_decile,
                                      suffixes=(f'_{dataset1_name}', '_Constraint'))
    plot_structure_bias_comparison(merged1, suffixes=(f'_{dataset1_name}', '_Constraint'), metric='EFFECT')

    # Plot 2: dataset2 vs constraint (if provided)
    if dataset2 is not None:
        merged2 = merge_str_bias_datasets(dataset2, constraint_STR_Bias_decile,
                                          suffixes=(f'_{dataset2_name}', '_Constraint'))
        plot_structure_bias_comparison(merged2, suffixes=(f'_{dataset2_name}', '_Constraint'), metric='EFFECT')

    return constraint_STR_Bias_decile, gnomad4_decile

# Example usage: Plot decile 1 (most constrained)
# constraint_bias_d1, genes_d1 = plot_decile_comparison(1, Spark_ASD_STR_Bias, "ASD", DDD_rmASD_STR_Bias, "DDD")

# Example usage: Plot decile 10 (least constrained)
# constraint_bias_d10, genes_d10 = plot_decile_comparison(10, Spark_ASD_STR_Bias, "ASD", DDD_rmASD_STR_Bias, "DDD")


In [None]:
# Mean EFFECT over the first 50 rows of every null bias table
records = [
    null_bias.head(50)["EFFECT"].mean()
    for null_bias in tmp_bias_dfs
]
#records


In [None]:
# Observed statistic: mean EFFECT over the first 50 rows of the SPARK ASD table
observed_effect = Spark_ASD_STR_Bias.head(50)["EFFECT"].mean()

# Null statistics collected in `records`, as an array for vectorised comparison
null_effects = np.array(records)

# One-tailed empirical p-value (observed > null); 1 is added to both the
# count and the total.
n_at_least_observed = np.sum(null_effects >= observed_effect)
p_value = (n_at_least_observed + 1) / (len(null_effects) + 1)

# Histogram of the null with the observed value marked (no title is set)
plt.figure(figsize=(10, 6))
plt.hist(
    null_effects,
    bins=50,
    alpha=0.7,
    color='lightblue',
    edgecolor='black',
    label='Null distribution (Constraint Genes)',
)
plt.axvline(
    observed_effect,
    color='red',
    linestyle='--',
    linewidth=2,
    label=f'Observed (Spark ASD): {observed_effect:.4f}',
)
plt.xlabel('EFFECT')
plt.ylabel('Frequency')
plt.legend()
plt.grid(True, alpha=0.3)
plt.text(
    0.05,
    0.95,
    f'P-value: {p_value:.4f}',
    transform=plt.gca().transAxes,
    bbox=dict(boxstyle='round', facecolor='white', alpha=0.8),
)
plt.show()

# Text summary printed under the figure
print(f"Observed Spark ASD effect: {observed_effect:.4f}")
print(f"Null mean: {np.mean(null_effects):.4f}")
print(f"Null std: {np.std(null_effects):.4f}")
print(f"P-value: {p_value:.4f}")

In [None]:
# For every null bias table in tmp_bias_dfs, correlate its EFFECT column with
# the SPARK ASD and the DDD (ASD-removed) EFFECT columns.
Corrs_ASD_Constraint = []
Corrs_DDD_Constraint = []
for null_bias in tmp_bias_dfs:
    # ASD correlation
    tmp_merged_data_ASD_Constraint = merge_str_bias_datasets(
        Spark_ASD_STR_Bias, null_bias, suffixes=('_ASD', '_Constraint'))
    corr_asd = tmp_merged_data_ASD_Constraint["EFFECT_ASD"].corr(
        tmp_merged_data_ASD_Constraint["EFFECT_Constraint"])
    Corrs_ASD_Constraint.append(corr_asd)

    # DDD correlation. The suffix must be '_DDD' so that the merged column is
    # the "EFFECT_DDD" read below.
    tmp_merged_data_DDD_Constraint = merge_str_bias_datasets(
        DDD_rmASD_STR_Bias, null_bias, suffixes=('_DDD', '_Constraint'))
    corr_ddd = tmp_merged_data_DDD_Constraint["EFFECT_DDD"].corr(
        tmp_merged_data_DDD_Constraint["EFFECT_Constraint"])
    Corrs_DDD_Constraint.append(corr_ddd)

Corrs_ASD_Constraint = np.array(Corrs_ASD_Constraint)
Corrs_DDD_Constraint = np.array(Corrs_DDD_Constraint)
#print(f"Correlation between EFFECT_ASD and EFFECT_Constraint: {np.mean(Corrs_ASD_Constraint):.4f}")
#print(f"Correlation between EFFECT_DDD and EFFECT_Constraint: {np.mean(Corrs_DDD_Constraint):.4f}")


In [None]:
# Compare hand-entered observed correlations against the null correlation
# distributions in Corrs_ASD_Constraint / Corrs_DDD_Constraint.
# NOTE(review): 0.87 and 0.90 are hard-coded; presumably they are the EFFECT
# correlations of the ASD / DDD biases with the full constraint set — recompute
# them from the merged tables and confirm before reporting the p-values.
observed_effect_asd = 0.87
observed_effect_ddd = 0.90

# Panel 1: ASD
null_effects_asd = np.asarray(Corrs_ASD_Constraint)
# One-tailed empirical p-value (null >= observed), with +1 on count and total
p_value_asd = (np.sum(null_effects_asd >= observed_effect_asd) + 1) / (len(null_effects_asd) + 1)

# Panel 2: DDD
null_effects_ddd = np.asarray(Corrs_DDD_Constraint)
p_value_ddd = (np.sum(null_effects_ddd >= observed_effect_ddd) + 1) / (len(null_effects_ddd) + 1)

# Plot both panels (2x1) on a shared x-axis
fig, axes = plt.subplots(2, 1, figsize=(10, 12), sharex=True)

panel_specs = (
    (axes[0], null_effects_asd, observed_effect_asd, p_value_asd, 'lightblue',
     'Observed (Spark ASD)', 'ASD (Spark): Correlation Null Distribution vs Observed'),
    (axes[1], null_effects_ddd, observed_effect_ddd, p_value_ddd, 'lightgreen',
     'Observed (DDD)', 'DDD: Correlation Null Distribution vs Observed'),
)
for ax, panel_null, panel_observed, panel_p, hist_colour, observed_label, panel_title in panel_specs:
    ax.hist(panel_null, bins=50, alpha=0.7, color=hist_colour, edgecolor='black',
            label='Null distribution (Constraint Genes)')
    ax.axvline(panel_observed, color='red', linestyle='--', linewidth=2,
               label=f'{observed_label}: {panel_observed:.4f}')
    ax.set_ylabel('Frequency')
    ax.legend()
    ax.grid(True, alpha=0.3)
    ax.text(0.05, 0.95, f'P-value: {panel_p:.4f}', transform=ax.transAxes,
            bbox=dict(boxstyle='round', facecolor='white', alpha=0.8), va='top')
    ax.set_title(panel_title)

# The plotted values are correlations of EFFECT columns (pandas Series.corr,
# Pearson by default), not EFFECT values, so the shared x-axis is labelled
# accordingly.
axes[1].set_xlabel('Pearson correlation')

plt.tight_layout()
plt.show()

print(f"Observed Spark ASD correlation: {observed_effect_asd:.4f}")
print(f"ASD Null mean: {np.mean(null_effects_asd):.4f}")
print(f"ASD Null std: {np.std(null_effects_asd):.4f}")
print(f"ASD P-value: {p_value_asd:.4f}")
print("")
print(f"Observed DDD correlation: {observed_effect_ddd:.4f}")
print(f"DDD Null mean: {np.mean(null_effects_ddd):.4f}")
print(f"DDD Null std: {np.std(null_effects_ddd):.4f}")
print(f"DDD P-value: {p_value_ddd:.4f}")