# Cross-Species Validation: GENCIC Mouse vs Human fMRI Connectivity

## Comparing ASD Risk Gene Enrichment with Functional Connectivity Abnormalities

**Reference:** Buch et al. (Nature Neuroscience 2024)

### Scientific Question
Do mouse brain regions with high ASD risk gene enrichment (GENCIC) correspond to human brain regions showing altered functional connectivity in autism?

### Analysis Strategy
- **Mouse data**: GENCIC bias scores (ASD gene enrichment per brain structure)
- **Human data**: fMRI connectivity abnormalities associated with autism symptoms
- **Cross-species mapping**: Three confidence tiers
  - **ALL**: All mapped regions
  - **TIER 1**: Score > 0.4 (higher confidence)
  - **TIER 2**: Score > 0.6 (highest confidence)

---
# 1. SETUP & DATA LOADING

In [None]:
%load_ext autoreload
%autoreload 2
import sys
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import spearmanr, linregress
from adjustText import adjust_text

# Absolute location of the project checkout; the src/ and notebook paths below are built from it.
ProjDIR = "/home/jw3514/Work/ASD_Circuits_CellType/"
# Put the project's src/ directory on the import path before importing from it.
sys.path.insert(1, f'{ProjDIR}/src/')
# NOTE(review): wildcard import; LoadGeneINFO used below presumably comes from this module — confirm.
from ASD_Circuits import *

# Relative data paths in later cells are resolved from this directory.
os.chdir(f"{ProjDIR}/notebook_rebuttal/")
HGNC, ENSID2Entrez, GeneSymbol2Entrez, Entrez2Symbol = LoadGeneINFO()

# Notebook-wide plotting defaults.
sns.set_palette("husl")
plt.rcParams['figure.figsize'] = (12, 8)
print(f"Working directory: {os.getcwd()}")

## 1.1 Load Data

In [None]:
# --- Mouse side: per-structure GENCIC bias table ---
gencic_data = pd.read_csv('../results/GENCIC_MouseSTRBias.csv')
print(f"GENCIC: {gencic_data.shape[0]} structures, Bias range: [{gencic_data['Bias'].min():.3f}, {gencic_data['Bias'].max():.3f}]")


def _load_connectivity_csv(csv_path):
    """Read one connectivity CSV and label its rows with its column names."""
    matrix = pd.read_csv(csv_path)
    matrix.columns = matrix.columns.tolist()
    # Rows reuse the column labels, so the file must have as many rows as columns.
    matrix.index = matrix.columns
    return matrix


# --- Human side: fMRI connectivity matrices from Buch et al. ---
# Fig2A: analysed under the label "Verbal IQ" later in this notebook
Fig2A = _load_connectivity_csv('Buch_et_al/Fig2a_ccu1.csv')
# Fig2B: Social Affect-related connectivity
Fig2B = _load_connectivity_csv('Buch_et_al/Fig2b_ccu2.csv')
# Fig2C: Repetitive/Restrictive Behaviors (RRB)-related connectivity
Fig2C = _load_connectivity_csv('Buch_et_al/Fig2c_ccu3.csv')
# Fig2D: Overall atypical connectivity
Fig2D = _load_connectivity_csv('Buch_et_al/Fig2d_atypical_all.csv')
print(f"fMRI: 4 connectivity matrices with {Fig2A.shape[0]} regions each")

# --- Cross-species mapping, plus two subsets thresholded on the mapping Score ---
RegionMapping = pd.read_excel('Buch_et_al/claude_mapping_v6.xlsx').drop(columns=["Bias"], errors='ignore')
RegionMappingT1 = RegionMapping.loc[RegionMapping["Score"] > 0.4]
RegionMappingT2 = RegionMapping.loc[RegionMapping["Score"] > 0.6]
print(f"Mappings: ALL={len(RegionMapping)} | T1={len(RegionMappingT1)} | T2={len(RegionMappingT2)}")

---
# 2. DATA PREPROCESSING

In [None]:
def aggregate_bias_by_tier(gencic_data, region_mapping):
    """Summarise mouse bias per human region for one mapping table.

    Args:
        gencic_data: DataFrame with 'Structure', 'Bias' and 'Pvalue' columns.
        region_mapping: DataFrame with 'Mouse_Structure' and
            'Representative_Human_Group' columns.

    Returns:
        DataFrame indexed by 'Representative_Human_Group' with columns
        Mean_Bias, Median_Bias, Max_Bias, Std_Bias, N_Structures and
        N_Significant (number of structures with Pvalue < 0.05), rounded to
        four decimals and sorted by Mean_Bias in descending order.
    """
    # Inner join: structures without a mapping row (and vice versa) are dropped.
    mapped = pd.merge(
        gencic_data,
        region_mapping,
        left_on='Structure',
        right_on='Mouse_Structure',
        how='inner',
    )
    per_region = mapped.groupby('Representative_Human_Group').agg(
        Mean_Bias=('Bias', 'mean'),
        Median_Bias=('Bias', 'median'),
        Max_Bias=('Bias', 'max'),
        Std_Bias=('Bias', 'std'),
        N_Structures=('Bias', 'count'),
        N_Significant=('Pvalue', lambda pvals: (pvals < 0.05).sum()),
    )
    return per_region.round(4).sort_values('Mean_Bias', ascending=False)

# One aggregated bias table per mapping tier, built in the order ALL, T1, T2.
bias_by_human_region, bias_by_human_region_T1, bias_by_human_region_T2 = (
    aggregate_bias_by_tier(gencic_data, tier_mapping)
    for tier_mapping in (RegionMapping, RegionMappingT1, RegionMappingT2)
)
print(f"Aggregated bias: ALL={len(bias_by_human_region)} | T1={len(bias_by_human_region_T1)} | T2={len(bias_by_human_region_T2)} human regions")

---
# 3. ANALYSIS FUNCTIONS

In [None]:
def calculate_fmri_metrics(fmri_matrix):
    """Compute per-column connectivity summaries of a region-by-region matrix.

    Args:
        fmri_matrix: Numeric DataFrame; one output row is produced per column.

    Returns:
        DataFrame indexed by ``fmri_matrix.columns`` with:
        Total_Strength (column sum), Total_Strength_Abs (sum of absolute
        values), Mean_Strength (mean over the column's connections, 0 when it
        has none) and N_Connections (number of connections).

    A cell counts as a connection only if it is non-zero and not missing.
    ``NaN != 0`` evaluates to True, so testing ``!= 0`` alone would count
    missing cells as connections.
    """
    present = fmri_matrix.notna() & (fmri_matrix != 0)

    metrics = pd.DataFrame(index=fmri_matrix.columns)
    metrics['Total_Strength'] = fmri_matrix.sum()
    metrics['Total_Strength_Abs'] = fmri_matrix.abs().sum()
    # Mask everything that is not a connection, then average what is left;
    # columns without any connection come out as NaN and are reported as 0.
    metrics['Mean_Strength'] = fmri_matrix.where(present).mean().fillna(0)
    metrics['N_Connections'] = present.sum()
    return metrics

def analyze_fmri_gencic_correlation(fmri_matrix, bias_data, dataset_name, verbose=True):
    """Correlate per-region mean bias with fMRI connectivity metrics.

    Args:
        fmri_matrix: Connectivity matrix passed to ``calculate_fmri_metrics``.
        bias_data: DataFrame indexed by region with a 'Mean_Bias' column.
        dataset_name: Label printed in the report header.
        verbose: When true, print the header, region count and correlations.

    Returns:
        ``(merged, stats)``. ``merged`` is the inner join of ``bias_data`` and
        the fMRI metrics on the region index. ``stats`` maps each tested metric
        ('Total_Strength_Abs', 'Mean_Strength') to a dict with keys 'rho',
        'pval' and 'n'; a metric with three or fewer usable regions is omitted.
    """
    fmri_metrics = calculate_fmri_metrics(fmri_matrix)
    merged = bias_data.merge(fmri_metrics, left_index=True, right_index=True, how='inner')

    # Compute the statistics regardless of verbosity so callers can use them.
    stats = {}
    for metric in ('Total_Strength_Abs', 'Mean_Strength'):
        valid = merged.dropna(subset=[metric, 'Mean_Bias'])
        if len(valid) > 3:
            rho, pval = spearmanr(valid['Mean_Bias'], valid[metric])
            stats[metric] = {'rho': rho, 'pval': pval, 'n': len(valid)}

    if verbose:
        print(f"\n{'='*80}\n{dataset_name}\n{'='*80}")
        print(f"Regions: {len(merged)}")
        for metric, result in stats.items():
            rho, pval = result['rho'], result['pval']
            sig = "***" if pval < 0.001 else "**" if pval < 0.01 else "*" if pval < 0.05 else "†" if pval < 0.10 else ""
            print(f"  {metric:20s}: rho = {rho:7.4f}, p = {pval:.4f} {sig}")

    return merged, stats

def mergeCP(df):
    """Return a copy of ``df`` with selected region pairs collapsed into one row.

    Caudate + Putamen become 'Caudate-Putamen' and SMA + Premotor become
    'SMA-Premotor'. A pair is merged only when both members are in the index;
    the new row holds the column-wise mean of the two rows and is appended at
    the end, after which the two original rows are removed.
    """
    merge_plan = {
        'Caudate-Putamen': ['Caudate', 'Putamen'],
        'SMA-Premotor': ['SMA', 'Premotor'],
    }
    merged = df.copy()
    for combined_label, members in merge_plan.items():
        if not all(member in merged.index for member in members):
            continue
        merged.loc[combined_label] = merged.loc[members].mean()
        merged = merged.drop(members)
    return merged

def plot_combined(Ys_dfs, title_text="", figsize=(11,8), ylabel=None):
    """Scatter summed fMRI strength against mean bias, with fit and labels.

    Args:
        Ys_dfs: Non-empty list of DataFrames indexed by region label, each with
            'Mean_Bias' and 'Total_Strength_Abs' columns. X values come from
            the first frame; a region's Y value is its 'Total_Strength_Abs'
            summed over all frames.
        title_text: Figure title.
        figsize: Figure size handed to matplotlib.
        ylabel: Y-axis label. Defaults to the combined SA + RRB + VIQ label,
            so pass an explicit label when plotting a single phenotype.

    Regions are paired by index label, not by row position, and a region
    missing from any frame is left out. Region labels are assumed unique.
    Prints the Spearman and linear-regression statistics after showing the
    figure.
    """
    base = Ys_dfs[0]
    # Keep only regions found in every frame, in the first frame's order.
    shared = base.index
    for other in Ys_dfs[1:]:
        shared = shared[shared.isin(other.index)]

    Xs = base.loc[shared, 'Mean_Bias'].to_numpy(dtype=float)
    # Explicit float accumulator so the in-place sum never depends on Xs' dtype.
    Ys = np.zeros(len(shared), dtype=float)
    for df in Ys_dfs:
        Ys += df.loc[shared, 'Total_Strength_Abs'].to_numpy(dtype=float)

    rho, pval = spearmanr(Xs, Ys)
    linres = linregress(Xs, Ys)

    plt.figure(figsize=figsize)
    plt.scatter(Xs, Ys, color='#377eb8', edgecolor='k', s=80, alpha=0.9, zorder=2)
    X_fit = np.linspace(Xs.min(), Xs.max(), 100)
    plt.plot(X_fit, linres.slope * X_fit + linres.intercept, color="#e6550d", linestyle="--", linewidth=2.2, zorder=1)

    texts = [
        plt.text(
            x, y, label, fontsize=15, fontweight='bold', alpha=0.91,
            bbox=dict(boxstyle="round,pad=0.30", fc="white", ec="gray", lw=1.0, alpha=0.65), zorder=3)
        for x, y, label in zip(Xs, Ys, shared)
    ]
    adjust_text(texts, arrowprops=dict(arrowstyle='-', color='black', lw=0.8), expand_points=(1.3, 2.0))

    if ylabel is None:
        ylabel = 'fMRI Connectivity-Symptom Association \nStrength (SA + RRB + VIQ)'
    plt.xlabel('ASD Mutation Bias', fontsize=17, fontweight='bold')
    plt.ylabel(ylabel, fontsize=17, fontweight='bold')
    plt.title(title_text, fontsize=16, fontweight='bold')

    # The on-figure box reports the linear-regression r and p.
    stat_text = f"$r$ = {linres.rvalue:.2f}\n$p$ = {linres.pvalue:.2f}"
    plt.gca().text(0.02, 0.98, stat_text, transform=plt.gca().transAxes, fontsize=25, va='top', ha='left',
                  bbox=dict(boxstyle="round,pad=0.35", fc="white", ec="gray", lw=1.1, alpha=0.70))
    plt.xticks(fontsize=14)
    plt.yticks(fontsize=14)
    plt.grid(alpha=0.3, linestyle=":", zorder=0)
    plt.tight_layout()
    plt.show()
    print(f"Spearman ρ={rho:.3f}, p={pval:.4f} | Linear r={linres.rvalue:.3f}, p={linres.pvalue:.4f}")

# Printed once the function definitions in this cell have run.
print("✓ Functions loaded")

---
# 4. MAIN RESULTS: ALL REGIONS

In [None]:
# Per-phenotype correlations against the ALL-regions bias table; only the
# merged table of each run is kept, the second return value goes to `_`.
print("\n" + "="*80 + "\nINDIVIDUAL fMRI ANALYSES (ALL REGIONS)\n" + "="*80)
(merged_viq, _), (merged_sa, _), (merged_rrb, _) = (
    analyze_fmri_gencic_correlation(matrix, bias_by_human_region, label)
    for matrix, label in (
        (Fig2A, "Verbal IQ"),
        (Fig2B, "Social Affect"),
        (Fig2C, "RRB"),
    )
)

In [None]:
# One scatter per phenotype, each drawn from a single merged table.
for _frame, _title in (
    (merged_sa, "ALL REGIONS: GENCIC Bias vs SA fMRI"),
    (merged_rrb, "ALL REGIONS: GENCIC Bias vs RRB fMRI"),
    (merged_viq, "ALL REGIONS: GENCIC Bias vs VIQ fMRI"),
):
    plot_combined([_frame], _title)

In [None]:
# Plot bias against the per-region connectivity strength summed over the
# SA, RRB and VIQ tables (ALL-regions mapping).
print("\n" + "="*80 + "\nCOMBINED: SA + RRB + VIQ (ALL REGIONS)\n" + "="*80 + "\n")
plot_combined([merged_sa, merged_rrb, merged_viq], "ALL REGIONS: GENCIC Bias vs Combined fMRI")

In [None]:
# Collapse the paired regions in each ALL-regions table, then redo the combined plot.
merged_viq_cp, merged_sa_cp, merged_rrb_cp = map(mergeCP, (merged_viq, merged_sa, merged_rrb))
print("\n" + "="*80 + "\nCOMBINED with CP MERGED (ALL REGIONS)\n" + "="*80 + "\n")
plot_combined(
    [merged_sa_cp, merged_rrb_cp, merged_viq_cp],
    title_text="ALL REGIONS (CP Merged): GENCIC vs fMRI",
)

---
# 5. TIER 1 ANALYSIS (Score > 0.4)

In [None]:
# Same per-phenotype correlations, restricted to the Tier 1 bias table.
print("\n" + "="*80 + "\nTIER 1 INDIVIDUAL ANALYSES\n" + "="*80)
(merged_viq_T1, _), (merged_sa_T1, _), (merged_rrb_T1, _) = (
    analyze_fmri_gencic_correlation(matrix, bias_by_human_region_T1, label)
    for matrix, label in (
        (Fig2A, "Verbal IQ (T1)"),
        (Fig2B, "Social Affect (T1)"),
        (Fig2C, "RRB (T1)"),
    )
)

In [None]:
# Combined SA + RRB + VIQ plot for the Tier 1 tables.
print("\n" + "="*80 + "\nTIER 1 COMBINED\n" + "="*80 + "\n")
plot_combined([merged_sa_T1, merged_rrb_T1, merged_viq_T1], "TIER 1 (Score > 0.4): GENCIC vs fMRI")

In [None]:
# Collapse the paired regions in each Tier 1 table, then redo the combined plot.
merged_viq_cp_T1, merged_sa_cp_T1, merged_rrb_cp_T1 = map(
    mergeCP, (merged_viq_T1, merged_sa_T1, merged_rrb_T1)
)
print("\n" + "="*80 + "\nTIER 1 COMBINED (CP MERGED)\n" + "="*80 + "\n")
plot_combined(
    [merged_sa_cp_T1, merged_rrb_cp_T1, merged_viq_cp_T1],
    title_text="TIER 1 - CP Merged: GENCIC vs fMRI",
)

---
# 6. TIER 2 ANALYSIS (Score > 0.6)

In [None]:
# Same per-phenotype correlations, restricted to the Tier 2 bias table.
print("\n" + "="*80 + "\nTIER 2 INDIVIDUAL ANALYSES\n" + "="*80)
(merged_viq_T2, _), (merged_sa_T2, _), (merged_rrb_T2, _) = (
    analyze_fmri_gencic_correlation(matrix, bias_by_human_region_T2, label)
    for matrix, label in (
        (Fig2A, "Verbal IQ (T2)"),
        (Fig2B, "Social Affect (T2)"),
        (Fig2C, "RRB (T2)"),
    )
)

In [None]:
# Combined SA + RRB + VIQ plot for the Tier 2 tables.
print("\n" + "="*80 + "\nTIER 2 COMBINED\n" + "="*80 + "\n")
plot_combined([merged_sa_T2, merged_rrb_T2, merged_viq_T2], "TIER 2 (Score > 0.6): GENCIC vs fMRI")

In [None]:
# Collapse the paired regions in each Tier 2 table, then redo the combined plot.
merged_viq_cp_T2, merged_sa_cp_T2, merged_rrb_cp_T2 = map(
    mergeCP, (merged_viq_T2, merged_sa_T2, merged_rrb_T2)
)
print("\n" + "="*80 + "\nTIER 2 COMBINED (CP MERGED)\n" + "="*80 + "\n")
# Drawn without a title; a titled variant ("TIER 2 - CP Merged: GENCIC vs fMRI")
# was used previously.
plot_combined([merged_sa_cp_T2, merged_rrb_cp_T2, merged_viq_cp_T2], title_text="")

In [None]:
# Number of rows (regions) in the CP-merged Tier 2 table.
len(merged_viq_cp_T2)

---
# 7. COMPARATIVE SUMMARY

In [None]:
# Summary table: one row of correlation statistics per mapping tier.
summary = []
for tier_name, dfs in [
    ('ALL', (merged_sa, merged_rrb, merged_viq)),
    ('T1', (merged_sa_T1, merged_rrb_T1, merged_viq_T1)),
    ('T2', (merged_sa_T2, merged_rrb_T2, merged_viq_T2))
]:
    # Pair values by region label rather than row position: keep the regions
    # present in every table, in the first table's order.
    shared_regions = dfs[0].index
    for other in dfs[1:]:
        shared_regions = shared_regions[shared_regions.isin(other.index)]
    Xs = dfs[0].loc[shared_regions, 'Mean_Bias']
    Ys = sum(df.loc[shared_regions, 'Total_Strength_Abs'].values for df in dfs)
    rho, pval = spearmanr(Xs, Ys)
    linres = linregress(Xs, Ys)
    summary.append({
        'Tier': tier_name,
        'N': len(Xs),
        'Spearman_rho': rho,
        'Spearman_p': pval,
        'Linear_r': linres.rvalue,
        'Linear_p': linres.pvalue,
        'Slope': linres.slope
    })

summary_df = pd.DataFrame(summary)
print("\n" + "="*120)
print("SUMMARY: Combined SA + RRB + VIQ Analysis")
print("="*120)
display(summary_df.round(4))

In [None]:
# Side-by-side comparison: one panel per mapping tier, same axes content as
# the combined plots (mean bias vs. summed connectivity strength).
fig, axes = plt.subplots(1, 3, figsize=(20, 6))
for idx, (tier_name, dfs) in enumerate([
    ('ALL', (merged_sa, merged_rrb, merged_viq)),
    ('T1 (>0.4)', (merged_sa_T1, merged_rrb_T1, merged_viq_T1)),
    ('T2 (>0.6)', (merged_sa_T2, merged_rrb_T2, merged_viq_T2))
]):
    ax = axes[idx]
    # Pair values by region label rather than row position: keep the regions
    # present in every table, in the first table's order.
    shared_regions = dfs[0].index
    for other in dfs[1:]:
        shared_regions = shared_regions[shared_regions.isin(other.index)]
    Xs = dfs[0].loc[shared_regions, 'Mean_Bias']
    Ys = sum(df.loc[shared_regions, 'Total_Strength_Abs'].values for df in dfs)
    ax.scatter(Xs, Ys, color='#377eb8', edgecolor='k', s=80, alpha=0.9, zorder=2)
    linres = linregress(Xs, Ys)
    X_fit = np.linspace(Xs.min(), Xs.max(), 100)
    ax.plot(X_fit, linres.slope * X_fit + linres.intercept, color="#e6550d", linestyle="--", linewidth=2.2)
    ax.text(0.02, 0.98, f"$r$={linres.rvalue:.3f}, $p$={linres.pvalue:.4f}\nN={len(Xs)}",
           transform=ax.transAxes, fontsize=11, va='top', ha='left',
           bbox=dict(boxstyle="round", fc="white", ec="gray", alpha=0.7))
    ax.set_xlabel('Mean_Bias', fontsize=13, fontweight='bold')
    ax.set_ylabel('fMRI Connectivity-Symptom Association Strength\n(SA+RRB+VIQ)', fontsize=13, fontweight='bold')
    ax.set_title(tier_name, fontsize=14, fontweight='bold')
    ax.grid(alpha=0.3, linestyle=":")
plt.suptitle('Cross-Tier Comparison', fontsize=16, fontweight='bold', y=1.02)
plt.tight_layout()
plt.show()

---
# 8. ATYPICAL CONNECTIVITY (Fig2D)

In [None]:
# Run the correlation report on the overall atypical-connectivity matrix (Fig2D)
# once per mapping tier; the reports are printed by the function itself.
print("\n" + "="*80 + "\nOVERALL ATYPICAL CONNECTIVITY\n" + "="*80)
analyze_fmri_gencic_correlation(Fig2D, bias_by_human_region, "Atypical (ALL)")
analyze_fmri_gencic_correlation(Fig2D, bias_by_human_region_T1, "Atypical (T1)")
# NOTE(review): return values are not stored; as the cell's last expression,
# this call's result is presumably echoed by the notebook — confirm that is intended.
analyze_fmri_gencic_correlation(Fig2D, bias_by_human_region_T2, "Atypical (T2)")

---
# 10. COMPREHENSIVE DATA TABLE

Generate a comprehensive table indexed by the regions in claude_mapping_v6_DCG, including:
- All unique regions from claude_mapping_v6_DCG as the index
- VIQ, Social Affect, and RRB phenotype measurements for each region
- Mean mouse GENCIC bias mapped to each region


In [None]:
# Step 1: Get all unique regions from claude_mapping_v6_DCG as the base index
# NOTE(review): RegionMapping is read from 'Buch_et_al/claude_mapping_v6.xlsx' earlier
# in this notebook, while the messages here say "claude_mapping_v6_DCG" — confirm the name.
all_mapped_regions = RegionMapping['Representative_Human_Group'].unique()
print(f"Total unique regions in claude_mapping_v6_DCG: {len(all_mapped_regions)}")

# Step 2: Calculate fMRI metrics for the 3 phenotype measurements (VIQ, Social, RRB)
def calculate_fmri_phenotype_metrics(fmri_matrix, phenotype_name):
    """Return the per-column sum of absolute values as a one-column table.

    The result is indexed by the columns of ``fmri_matrix`` and its single
    column is named ``'<phenotype_name>_Total_Strength_Abs'``.
    """
    column_label = f'{phenotype_name}_Total_Strength_Abs'
    return fmri_matrix.abs().sum().to_frame(name=column_label)

# Absolute connectivity strength per region for each phenotype matrix
# (Fig2A -> VIQ, Fig2B -> Social, Fig2C -> RRB).
viq_metrics, sa_metrics, rrb_metrics = (
    calculate_fmri_phenotype_metrics(matrix, prefix)
    for matrix, prefix in ((Fig2A, 'VIQ'), (Fig2B, 'Social'), (Fig2C, 'RRB'))
)

print("✓ fMRI phenotype metrics calculated")


In [None]:
# Step 3: Create table indexed by all regions from claude_mapping_v6_DCG
comprehensive_table = pd.DataFrame(index=all_mapped_regions)
comprehensive_table.index.name = 'Human_Region'

# Step 4: Add 3 phenotype measurements (VIQ, Social, RRB)
comprehensive_table = comprehensive_table.join(viq_metrics, how='left')
comprehensive_table = comprehensive_table.join(sa_metrics, how='left')
comprehensive_table = comprehensive_table.join(rrb_metrics, how='left')

# Step 5: Add mouse Mean bias mapped to each region
gencic_bias_all = aggregate_bias_by_tier(gencic_data, RegionMapping)
gencic_bias_all.index.name = 'Human_Region'
comprehensive_table = comprehensive_table.join(gencic_bias_all[['Mean_Bias']], how='left')

# Step 6: Add mapping information (optional, for reference)
# Named aggregation yields the final column names directly. Flattening the
# dict-style result instead names the lambda column 'Mouse_Structure_<lambda>',
# which a rename keyed on 'Mouse_Structure' would never match.
mapping_info = RegionMapping.groupby('Representative_Human_Group').agg(
    Mapping_Score_Mean=('Score', 'mean'),
    N_Mouse_Structures=('Score', 'count'),
    # First 5 distinct mouse structures; missing names are skipped so the
    # string join cannot fail on a non-string value.
    Mouse_Structures_Sample=(
        'Mouse_Structure',
        lambda names: ', '.join(names.dropna().astype(str).unique()[:5]),
    ),
).round(4)
mapping_info.index.name = 'Human_Region'
comprehensive_table = comprehensive_table.join(mapping_info, how='left')

# Reorder columns: Mean_Bias first, then 3 phenotypes
column_order = [
    'Mean_Bias',
    'VIQ_Total_Strength_Abs',
    'Social_Total_Strength_Abs',
    'RRB_Total_Strength_Abs',
    'Mapping_Score_Mean',
    'N_Mouse_Structures',
    'Mouse_Structures_Sample'
]

# Keep only columns that exist
column_order = [col for col in column_order if col in comprehensive_table.columns]
# Add any remaining columns
remaining_cols = [col for col in comprehensive_table.columns if col not in column_order]
comprehensive_table = comprehensive_table[column_order + remaining_cols]

# Sort by Mean_Bias (descending), with NaN values at the end
comprehensive_table = comprehensive_table.sort_values('Mean_Bias', ascending=False, na_position='last')

print(f"✓ Comprehensive table created: {comprehensive_table.shape[0]} regions, {comprehensive_table.shape[1]} columns")
print(f"  Regions with Mean_Bias: {comprehensive_table['Mean_Bias'].notna().sum()}")
print(f"  Regions with VIQ data: {comprehensive_table['VIQ_Total_Strength_Abs'].notna().sum()}")
print(f"  Regions with Social data: {comprehensive_table['Social_Total_Strength_Abs'].notna().sum()}")
print(f"  Regions with RRB data: {comprehensive_table['RRB_Total_Strength_Abs'].notna().sum()}")


In [None]:
# Display the comprehensive table
# Header banner followed by per-column counts of non-missing values.
print("\n" + "="*120)
print("COMPREHENSIVE DATA TABLE: Indexed by claude_mapping_v6_DCG regions")
print("="*120)
print(f"\nTotal regions from claude_mapping_v6_DCG: {len(comprehensive_table)}")
print(f"Regions with Mean_Bias: {comprehensive_table['Mean_Bias'].notna().sum()}")
print(f"Regions with VIQ data: {comprehensive_table['VIQ_Total_Strength_Abs'].notna().sum()}")
print(f"Regions with Social data: {comprehensive_table['Social_Total_Strength_Abs'].notna().sum()}")
print(f"Regions with RRB data: {comprehensive_table['RRB_Total_Strength_Abs'].notna().sum()}")

# Shown rounded to four decimals; the stored table itself is not modified.
display(comprehensive_table.round(4))

In [None]:
# Save the comprehensive table
# The path is relative to the current working directory; the unrounded table is written.
output_file = 'Buch_et_al/Comprehensive_fMRI_GENCIC_Table.csv'
comprehensive_table.to_csv(output_file)
print(f"\n✓ Table saved to: {output_file}")
print(f"  Table contains all {len(comprehensive_table)} regions from claude_mapping_v6_DCG")
print(f"  Columns: Mean_Bias + 3 phenotype measurements (VIQ, Social, RRB)")


In [None]:
# Regions usable for plotting: a mouse bias value plus at least one of the
# three phenotype measurements.
has_bias = comprehensive_table['Mean_Bias'].notna()
has_phenotype = comprehensive_table[[
    'VIQ_Total_Strength_Abs',
    'Social_Total_Strength_Abs',
    'RRB_Total_Strength_Abs',
]].notna().any(axis=1)
plotting_data = comprehensive_table[has_bias & has_phenotype].copy()

# Combined fMRI metric (SA + RRB + VIQ); a missing phenotype contributes 0.
plotting_data['Combined_fMRI_Total_Strength_Abs'] = sum(
    plotting_data[column].fillna(0)
    for column in (
        'Social_Total_Strength_Abs',
        'RRB_Total_Strength_Abs',
        'VIQ_Total_Strength_Abs',
    )
)

# Trimmed-down copy holding only the columns needed for plotting, highest bias first.
summary_columns = [
    'Mean_Bias',
    'VIQ_Total_Strength_Abs',
    'Social_Total_Strength_Abs',
    'RRB_Total_Strength_Abs',
    'Combined_fMRI_Total_Strength_Abs',
    'Mapping_Score_Mean',
    'N_Mouse_Structures',
]
plotting_summary = plotting_data[summary_columns].copy()
plotting_summary = plotting_summary.sort_values('Mean_Bias', ascending=False)

print("\n" + "="*120)
print("PLOTTING DATA SUMMARY: Regions with Mean_Bias and phenotype data")
print("="*120)
print(f"Total regions: {len(plotting_summary)}")
display(plotting_summary.round(4))

# Write the plotting summary next to the other Buch et al. outputs.
plotting_output_file = 'Buch_et_al/Plotting_Data_Summary.csv'
plotting_summary.to_csv(plotting_output_file)
print(f"\n✓ Plotting summary saved to: {plotting_output_file}")


# 9. CONCLUSIONS

## Key Findings

1. **Cross-Species Validity:** Mouse regions with high ASD gene enrichment correspond to human regions with altered connectivity.
2. **Mapping Quality Impact:** Tier 1 mappings (score > 0.4) show stronger correlations than all regions, supporting the mapping confidence approach.
3. **Combined Measures:** The sum of Social (SA), Restricted/Repetitive Behaviors (RRB), and Verbal IQ (VIQ) metrics provides the most robust cross-species correspondence.
4. **Anatomical Specificity:** Merging CP structures enhances correlation strength, suggesting functional unity.

## Implications

These results support the translational validity of mouse ASD models for understanding human autism neurobiology.