In [None]:
"""
Author: Wen-Jou Chang
Baylor College of Medicine

This script is used to generate the supplementary figures in the paper, with the exception of the following figures:
- Figure S2 (permutation results histograms for all categories): in main_figure_code.ipynb, same code for fig. 3G
- Figure S3 (annotated example): manual annotation
- Figure S4 (decay plots for the remaining 5 categories): in main_figure_code.ipynb, same code for fig. 3A-F
- Figure S5 (power advantage scatter plot): in main_figure_code.ipynb, same code for fig. 3H
- Figure S6 (neurological subcategory decay plots): in main_figure_code.ipynb, same code for fig. 4B-D
"""

In [25]:

"""
Initialization
"""
# imports
import importlib
import os
import re
import sys
from collections import Counter, defaultdict

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from graphviz import Digraph
from matplotlib.colors import LinearSegmentedColormap
from matplotlib.patches import Patch
from matplotlib_venn import venn2, venn3

import util
from util import CATEGORY_NAMES, COLOR_TEMPLATE, CORSIV_PROBE_LIST, CONTROLS, CATEGORIES, read_in_probes, calculate_points, plot_enrichment, breakdown, export_paper

# Directory prefix used when building figure output paths (placeholder value).
SFIG_PATH = "OUTPUT_FIGURE_PATH"
# Optional working directory; when set, all relative "data/..." paths below
# are resolved against it.
PROJECT_PATH = None
if PROJECT_PATH:
    os.chdir(PROJECT_PATH)


# Illumina array manifests: headerless, tab-separated. Column 3 is collected
# into the probe sets (presumably the probe ID column -- TODO confirm).
epic = pd.read_csv("data/humanData/Illumina/EPIC.hg38.txt", sep="\t", header=None)
hm450 = pd.read_csv("data/humanData/Illumina/HM450.hg38.txt", sep="\t", header=None)
epic_probe_list = set(epic[3])
hm450_probe_list = set(hm450[3])
# Every probe present on either array.
illumina = epic_probe_list | hm450_probe_list

# Global matplotlib styling shared by all figures in this notebook.
plt.rcParams.update({
    'font.family': 'Helvetica',
    'font.size': 16,
    'axes.linewidth': 2,
    'xtick.major.width': 2,
    'ytick.major.width': 2,
})



In [None]:
# fig. S1: venn diagram of overlap between our results, ewas atlas, and ewas catalog
atlas_studies = pd.read_csv("data/supp/EWAS_Atlas_studies.tsv", sep="\t")  # downloaded from ewas atlas
ewas_atlas = set(atlas_studies["PMID"].astype(int))

catalog_studies = pd.read_csv("data/supp/ewascatalog-studies.txt", sep="\t")  # downloaded from ewas catalog
# Rows whose PMID contains any non-digit character are discarded; missing
# PMIDs count as non-numeric (na=True) and are discarded as well.
pmid_has_non_digit = catalog_studies["PMID"].str.contains(r'\D', na=True)
ewas_catalog = set(catalog_studies.loc[~pmid_has_non_digit, "PMID"].astype(int))

our = set(pd.read_excel("data/2203_studies_info.xlsx")["PMID"].astype(int))

# Three-way overlap of the PMID sets.
venn = venn3([our, ewas_catalog, ewas_atlas], ("Our Results", "EWAS Catalog", "EWAS Atlas"))


In [None]:
# fig. S7B: adhd manual analysis
import math

# Manually collected ADHD probe table.
manual = pd.read_csv("data/supp/all_ADHD_probes.csv")

# Keep only IDs that start with "cg" or "ch." ...
probe_ids = manual["probeId"]
has_probe_prefix = probe_ids.str.startswith("cg") | probe_ids.str.startswith("ch.")
manual = manual[has_probe_prefix]

# ... and that are present in the EPIC or HM450 probe sets.
on_known_array = manual["probeId"].isin(epic_probe_list) | manual["probeId"].isin(hm450_probe_list)
manual = manual[on_known_array]
# Textual significance labels mapped to the numeric stand-in used for
# thresholding. Keys are matched exactly (no whitespace normalisation).
_P_VALUE_LABELS = {
    "<0.05": 0.049,
    "pass": 0.049,
    "not pass": math.nan,
    " ": math.nan,
    "> 0.1": 0.11,
    "≤ 0.1": 0.09,
    "≤ 0.05": 0.049,
    "≤0.01": 0.009,
}


def to_float(x):
    """Convert a significance cell to a float where possible.

    Parameters
    ----------
    x : any
        Raw cell value. Anything accepted by ``float()`` is converted
        directly; otherwise the value is looked up in ``_P_VALUE_LABELS``.

    Returns
    -------
    float or original value
        The converted number, the mapped stand-in for a known label, or ``x``
        unchanged when it is neither numeric nor a known label.
    """
    try:
        return float(x)
    except (TypeError, ValueError):
        # Only conversion failures are expected here; anything else
        # (e.g. KeyboardInterrupt) must propagate.
        pass
    if isinstance(x, str):
        return _P_VALUE_LABELS.get(x, x)
    return x
# Normalise the three significance columns (numbers and textual labels) via to_float.
for significance_col in ("p-value", "q-value", "adj-p-value"):
    manual[significance_col] = manual[significance_col].apply(to_float)

# Drop every (pmcid, From) group that contributes more than 1000 rows.
manual = manual.groupby(['pmcid', 'From']).filter(lambda group: len(group) <= 1000)


# Attach the per-source curation flags and keep only sources marked Keep == 1.
keep = pd.read_csv("data/supp/all_ADHD_source.csv")
manual = pd.merge(manual, keep, on=["pmcid", "Notes", "From", "Title"])
manual = manual[manual["Keep"]==1]
# Nominal significance threshold on the raw p-value column.
manual = manual[(manual["p-value"] < 0.05)]
# One row per (paper, probe) pair.
manual = manual.drop_duplicates(subset=["pmcid", "probeId"])
# Persist the pairs; the fig. S7A cell reads this file back.
manual[["pmcid","probeId"]].to_csv("data/supp/adhd_manual_probes.csv", index=False)

# Number of rows per probe; after the de-duplication above each row is a distinct paper.
c = Counter(manual["probeId"])
l1, l2, l3, p, p2, _ = calculate_points(c, manual)
# output_path = f"{SFIG_PATH}/adhd_manual_enrichment_nominal.svg"
paper, r = plot_enrichment(
    [l1, l2, l3, p, p2],
    "ADHD (Nominally Significant)",
    7,
    output=None,
    format="svg",
)

In [None]:
# fig. S7A venn diagram
# Manual (pmcid, probeId) pairs, as written by the ADHD manual-analysis cell.
manual = pd.read_csv("data/supp/adhd_manual_probes.csv")

# Automated pairs: rows of the neurological probe table whose "|"-separated
# MeSH term list contains the ADHD term.
ADHD_MESH_TERM = "Attention Deficit Disorder with Hyperactivity"
automated = pd.read_csv("data/probe/neurological_all_probes.csv")
is_adhd = automated["Filtered Mesh Term"].apply(
    lambda cell: ADHD_MESH_TERM in [term.strip() for term in cell.split("|")]
)
automated = automated.loc[is_adhd, ["pmcid", "probeId"]]

# Outer merge on the shared columns; `_merge` records which side each pair came from.
# NOTE(review): `automated` is not de-duplicated here, so repeated
# (pmcid, probeId) rows would be counted more than once -- confirm the input is unique.
m = pd.merge(manual, automated, how="outer", indicator=True)

# Create counts for Venn diagram
left_only = int((m['_merge'] == 'left_only').sum())
right_only = int((m['_merge'] == 'right_only').sum())
both = int((m['_merge'] == 'both').sum())

# Create and plot Venn diagram (venn2 expects sizes in the order: A only, B only, A and B)
plt.figure(figsize=(8,8))
venn2(subsets=(left_only, right_only, both),
      set_labels=('Manual', 'Automated'),
      set_colors=('lightblue', 'lightgreen'))
plt.title('Overlap between Manual and \nAutomated Probe Instances', pad=20, fontsize=24)
plt.show()


In [None]:
# fig. S7B pie chart
# Only the last three columns of the QC sheet are kept; two of them are summed below.
qc = pd.read_csv("data/supp/probes_unique_to_automated.csv").iloc[:,-3:]
# Column totals for the two mismatch reasons
excluded_count = qc['Excluded'].sum()
nonsig_count = qc['Nonsignificant'].sum()

# Two-slice pie chart of the totals
pie_fig, pie_ax = plt.subplots(figsize=(4,4))
slice_sizes = [excluded_count, nonsig_count]
slice_labels = [f'Excluded\n({excluded_count} probes)', f'Not Significant\n({nonsig_count} probes)']
pie_ax.pie(slice_sizes,
           labels=slice_labels,
           autopct='%1.1f%%',
           colors=['#fbb4ae', '#b3cde3'])
pie_ax.set_title('Reasons for Probe Mismatch \nBetween Automated and Manual Approaches')
plt.show()

In [32]:
# common code to generate decay plots for subcategories
current_category = "metabolic"
target_idx = CATEGORY_NAMES.index(current_category)

# Parse the MeSH tree file: one "term;code" pair per line.
file_path = 'data/humanData/mtrees2024.txt'
mesh_ttoc = defaultdict(set) #term:code
with open(file_path, 'r') as file:
    for line in file:
        parts = line.strip().split(';')
        if len(parts) != 2:
            # Lines that do not split into exactly two fields are echoed and skipped.
            print(parts)
            continue
        term, code = parts
        mesh_ttoc[term].add(code)

# Reverse lookup, code:term
mesh_ctot = {}
for term_name, term_codes in mesh_ttoc.items():
    for tree_code in term_codes:
        mesh_ctot[tree_code] = term_name

def starts_with_any(given_string, string_list):
    """Return True if `given_string` starts with at least one entry of `string_list`.

    An empty `string_list` yields False.
    """
    return any(given_string.startswith(prefix) for prefix in string_list)
# For each top-level keyword of the current category, collect every MeSH term
# that has at least one tree code starting with one of the keyword's codes.
neuro_mesh_tree = {}
keywords = CATEGORIES[target_idx]
for kw in keywords:
    # Look the keyword up once, outside the scan. `.get` avoids inserting a new
    # key into the defaultdict while `mesh_ttoc.items()` is being iterated,
    # which would raise RuntimeError for a keyword absent from the tree file.
    kw_codes = mesh_ttoc.get(kw, set())
    neuro_mesh_tree[kw] = {
        term
        for term, codes in mesh_ttoc.items()
        if any(starts_with_any(code, kw_codes) for code in codes)
    }


def filter_mesh_list(input):
    """Return True if the term `input` appears under any keyword in `neuro_mesh_tree`."""
    return any(input in sublist for sublist in neuro_mesh_tree.values())
# Load the probe table of the current category. The metabolic table lives in
# "metabolic_diseases_all_probes.csv"; every other category uses its own name.
probe_file_stem = "metabolic_diseases" if current_category == "metabolic" else current_category
neuro_df = pd.read_csv(f"data/probe/{probe_file_stem}_all_probes.csv")
if current_category in ("metabolic", "cancer"):
    # These two tables store the term list as a Python literal string.
    # NOTE(review): eval() executes arbitrary code from the CSV; only run on
    # trusted files (ast.literal_eval would be a safer parser if the cells are
    # plain list literals -- confirm before switching).
    neuro_df["Filtered Mesh Term"] = neuro_df["Filtered Mesh Term"].apply(eval)
else:
    # All other tables store the terms "|"-separated.
    neuro_df["Filtered Mesh Term"] = neuro_df["Filtered Mesh Term"].apply(
        lambda x: [term.strip() for term in x.split("|")]
    )

# One row per study, so each study contributes its term list once.
temp1 = neuro_df.drop_duplicates(subset="pmcid")
mesh_count_by_study = defaultdict(int)   # term -> number of studies
term_pmcid_map = defaultdict(set)        # term -> set of pmcids
for pmcid, study_terms in zip(temp1["pmcid"], temp1["Filtered Mesh Term"]):
    for t in study_terms:
        mesh_count_by_study[t] += 1
        term_pmcid_map[t].add(pmcid)

# Keep only terms under the current category's MeSH keywords, add one
# entry for the whole category, and order by term name (descending).
mesh_count_by_study = {key: count for key, count in mesh_count_by_study.items() if filter_mesh_list(key)}
mesh_count_by_study[current_category.capitalize()] = len(set(neuro_df["pmcid"]))
mesh_count_by_study = dict(sorted(mesh_count_by_study.items(), reverse=True))

# Run the enrichment breakdown once per term and collect its outputs.
terms, counts = zip(*mesh_count_by_study.items())
d1 = []          # -> "Highest Number of Papers"
d2 = []          # -> "Enrichment Ratio"
probes = []      # -> "CoRSIV Probes"
papers_ct = []   # -> "CoRSIV Papers"
paper_sets = []  # -> "Paper Sets"
for term in terms:
    p, paper, r, p2, curr_paper_set = breakdown(neuro_df, term_pmcid_map, [term], target_idx, output=None, show_figure=False, format="svg", export_all=True, show_y_label=True)
    probes.append(p)
    d1.append(paper)
    d2.append(r)
    papers_ct.append(p2)
    paper_sets.append(curr_paper_set)

# Summary table indexed by term, highest enrichment ratio first.
df = (
    pd.DataFrame({"Categories":terms, "Enrichment Ratio":d2, "CoRSIV Probes": probes, "CoRSIV Papers":papers_ct, "Highest Number of Papers": d1, "Total Number of Papers": counts, "Paper Sets": paper_sets})
    .sort_values("Enrichment Ratio", ascending=False)
    .set_index("Categories")
)
# Optional filter (disabled):
# df = df[(df["Enrichment Ratio"] > 1) & (df["Highest Number of Papers"]> 1) & (df.index!=current_category.capitalize())]


In [None]:
# common code for category specific mesh hierarchy
from graphviz import Digraph
import matplotlib.colors as mcolors
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches


def visualize(input_nodes, output_name=None, format="svg", target="Enrichment Ratio"):
    """Draw the MeSH hierarchy spanned by `input_nodes` as a Graphviz graph.

    Reads the notebook-level globals `mesh_ttoc`, `mesh_ctot`, `keywords`,
    `df`, `current_category`, `target_idx` and `COLOR_TEMPLATE`, so the
    subcategory cell that defines them must have been run first.

    Parameters
    ----------
    input_nodes : iterable of str
        MeSH terms; each is expanded to its tree codes via `mesh_ttoc`.
    output_name : str or None
        Passed to `Digraph.render`. When falsy, the graph is opened with
        `Digraph.view()` instead.
    format : str
        Output format handed to `Digraph.render`.
    target : str
        Column of `df` that drives the fill-colour intensity and the legend.

    Returns
    -------
    dict
        Mapping of tree code to term for every node placed in the graph.
    """
    input_codes = {code for term in input_nodes for code in mesh_ttoc[term]}
    dot = Digraph()
    dot.attr(rankdir='LR')  # Keep layout horizontal (Left-to-Right)

    edges = set()
    nodes = {}
    # Codes of the category keywords; a node is only kept if it sits below one of them.
    category_prefixes = tuple(c for k in keywords for c in mesh_ttoc[k])

    # Walk each code from its root down and register every known ancestor.
    # Sorted so node insertion order (and hence the DOT source) does not
    # depend on set iteration order, which varies between kernel sessions.
    for code in sorted(input_codes):
        parts = code.split('.')
        for depth in range(1, len(parts) + 1):
            partial_code = '.'.join(parts[:depth])
            if (
                partial_code not in nodes
                and partial_code in mesh_ctot
                and partial_code.startswith(category_prefixes)
            ):
                nodes[partial_code] = mesh_ctot[partial_code]

    def add_code_edges(code):
        """Add a parent->child edge for each consecutive pair of registered prefixes of `code`."""
        parts = code.split('.')
        for depth in range(1, len(parts)):
            parent_code = '.'.join(parts[:depth])
            child_code = '.'.join(parts[:depth + 1])
            if parent_code in nodes and child_code in nodes:
                edges.add((parent_code, child_code))

    # Loop-invariant inputs of the colour scale.
    max_target = df[target].max()
    base_rgb = mcolors.to_rgb(COLOR_TEMPLATE[target_idx])

    def get_node_color(node):
        """Return the fill colour (hex) for `node`; white when not enriched or not in `df`."""
        term = nodes[node]
        if term not in df.index:
            return "#FFFFFF"  # White for nodes not in df

        highest_papers = df.loc[term, 'Highest Number of Papers']
        enrichment_ratio = df.loc[term, 'Enrichment Ratio']
        if enrichment_ratio < 1 or highest_papers < 2:
            return "#FFFFFF"  # White for nodes with low papers or enrichment

        # Blend from white (intensity 0) to the category colour (intensity 1).
        intensity = min(1, df.loc[term, target] / max_target)
        return mcolors.to_hex(tuple(1 - (1 - c) * intensity for c in base_rgb))

    for code in nodes:
        add_code_edges(code)  # Add edges between parent and child nodes
    if current_category == "neurological":
        # Synthetic root joining the two neurological branches (codes F03 and C10).
        nodes["N"] = "Neurological"
        edges.add(("N", "F03"))
        edges.add(("N", "C10"))

    # One box per code, labelled with the term and (when available) its paper count.
    for code, term in nodes.items():
        if term in df.index:
            custom_label = f"{term} ({df.loc[term, 'Total Number of Papers']})"
        else:
            custom_label = term
        dot.node(code, label=custom_label, shape='box', style='filled',
                 fillcolor=get_node_color(code), fontname='Helvetica')
    # Sorted for a reproducible edge order in the DOT source.
    for parent_code, child_code in sorted(edges):
        dot.edge(parent_code, child_code)

    dot.attr(concentrate='true', splines='ortho')
    dot.attr(nodesep='0.2', ranksep='0.5')

    # Colour legend, drawn as a separate matplotlib figure.
    fig, ax = plt.subplots(figsize=(6, 1))
    cmap = mcolors.LinearSegmentedColormap.from_list("custom", ["white", COLOR_TEMPLATE[target_idx]])
    norm = mcolors.Normalize(vmin=0, vmax=max_target)
    fig.colorbar(plt.cm.ScalarMappable(norm=norm, cmap=cmap),
                 cax=ax, orientation='horizontal', label=target)
    # To save the legend as a separate image:
    # plt.savefig(f"{SFIG_PATH}/hierarchy_plots/{current_category}_legend.svg", bbox_inches='tight')

    if output_name:
        dot.render(output_name, format=format, cleanup=True)
    else:
        dot.view()
    return nodes

# Only terms reported by at least this many papers are drawn.
minimum_papers = 15
if minimum_papers > 0:
    output_name = f"{SFIG_PATH}/hierarchy_plots/{current_category}_hierarchy_{minimum_papers}_papers"
else:
    output_name = f"{SFIG_PATH}/hierarchy_plots/{current_category}_full_hierarchy"

tmp = {
    mesh_term: n_papers
    for mesh_term, n_papers in mesh_count_by_study.items()
    if n_papers >= minimum_papers
}

# output_name=None makes visualize() open the graph in a viewer instead of
# rendering it to `output_name`; pass output_name=output_name to save the file.
nodes = visualize(tmp.keys(), output_name=None, format="svg")

In [None]:
# fig. S9: heatmap of probes by subcategories for metabolic and endocrine

mesh_tree = {} #big category:set of subcategories
file_path = 'data/humanData/mtrees2024.txt'

# Parse the MeSH tree file: one "term;code" pair per line.
mesh_ttoc = defaultdict(set) #term:code
with open(file_path, 'r') as file:
    for line in file:
        parts = line.strip().split(';')
        if len(parts) != 2:
            # Lines that do not split into exactly two fields are echoed and skipped.
            print(parts)
            continue
        term, code = parts
        mesh_ttoc[term].add(code)

# Reverse lookup, code:term
mesh_ctot = {}
for term_name, term_codes in mesh_ttoc.items():
    for tree_code in term_codes:
        mesh_ctot[tree_code] = term_name


def next_layer(given_string, string_list):
    """Return True if `given_string` is a direct child code of any prefix in `string_list`.

    A direct child is the prefix followed by a single '.'-separated level,
    i.e. the remainder after the prefix starts with '.' and contains no
    further '.'.
    """
    for prefix in string_list:
        if not given_string.startswith(prefix):
            continue
        remainder = given_string[len(prefix):]
        if remainder.startswith('.') and '.' not in remainder[1:]:
            return True
    return False

# All top-level keywords across every category.
keywords = [cc for c in CATEGORIES for cc in c]

# For each keyword, the MeSH terms exactly one tree level below it.
for kw in keywords:
    # Look the keyword up once, outside the scan. `.get` avoids inserting a new
    # key into the defaultdict while `mesh_ttoc.items()` is being iterated,
    # which would raise RuntimeError for a keyword absent from the tree file.
    kw_codes = mesh_ttoc.get(kw, set())
    mesh_tree[kw] = {
        term
        for term, codes in mesh_ttoc.items()
        if any(next_layer(code, kw_codes) for code in codes)
    }

dfs = []
for key_category in ["metabolic", "endocrine"]:
    if key_category == "metabolic":
        df = pd.read_csv("data/probe/metabolic_diseases_all_probes.csv")
        # NOTE(review): eval() executes arbitrary code from the CSV; only run on
        # trusted files (ast.literal_eval would be safer if cells are plain literals).
        df["Filtered Mesh Term"] = df["Filtered Mesh Term"].apply(eval)
    else:
        df = pd.read_csv(f"data/probe/{key_category}_all_probes.csv")
        df["Filtered Mesh Term"] = df["Filtered Mesh Term"].apply(lambda x: [term.strip() for term in x.split("|")])

    # Union of the first-level subcategories of this category's keywords.
    target_categories = set([])
    for category_kw in CATEGORIES[CATEGORY_NAMES.index(key_category)]:
        target_categories |= mesh_tree[category_kw]
    # Restrict each row's term list to those subcategories.
    df["Filtered Mesh Term"] = df["Filtered Mesh Term"].apply(lambda x: [term for term in x if term in target_categories])

    # Keep CoRSIV probes that occur in 2 to 5 rows of the table
    # (presumably one row per paper -- TODO confirm).
    c = Counter(df["probeId"])
    c = {k:v for k, v in c.items() if v <= 5 and v >= 2 and k in CORSIV_PROBE_LIST}
    df = df[df["probeId"].isin(c)]
    # One row per (probe, subcategory) pair.
    df = df.explode('Filtered Mesh Term')
    dfs.append(df)

    # Probe x subcategory count matrix; pairs that never occur become 0.
    df_pivoted = df.groupby(['probeId', 'Filtered Mesh Term']).size().reset_index(name='count')
    df_pivoted = df_pivoted.pivot(index='probeId', columns='Filtered Mesh Term', values='count').fillna(0)
    df_pivoted.rename_axis(index=None, columns=None, inplace=True)
    dfs.append(df_pivoted)

    # White-to-blue colormap.
    colors = [(1, 1, 1),
            (0, 0, 1)]
    cmap = LinearSegmentedColormap.from_list("custom_blue", colors, N=100)
    g = sns.clustermap(df_pivoted, method='ward', metric='euclidean', cmap=cmap, figsize=(8, 6), annot=False, vmax=5)
    cbar = g.ax_heatmap.collections[0].colorbar
    cbar.set_ticks([0, 5])
    cbar.set_ticklabels(['0', '5'])
    # Hide the row dendrogram and thicken the column dendrogram lines.
    g.ax_row_dendrogram.set_visible(False)
    for dendrogram_lines in g.ax_col_dendrogram.collections:
        dendrogram_lines.set_linewidth(2)

    cbar.set_label("Number of \nPapers", rotation=90, fontsize=16)
    g.ax_heatmap.set_yticks([])
    g.ax_heatmap.set_xticklabels(g.ax_heatmap.get_xticklabels(), rotation=90, ha='center', va='top', fontsize=20)
    g.ax_heatmap.tick_params(axis='x', which='major', pad=10)
    # NOTE(review): the title says ">=2" papers, but probes with more than 5 rows are excluded above.
    g.figure.suptitle(f"{len(df_pivoted):,} CoRSIV Probes Reported in ≥2 {key_category.capitalize()} Papers", y=1.05, fontsize=26)
    # g.savefig(f"{SFIG_PATH}/heatmap/{key_category}_subcategory_2papers.jpeg", format="jpeg", dpi=300)

In [None]:
# fig. S12 and S13A density plots
import math
import scipy.stats as stats
import matplotlib.pyplot as plt

non_corsiv_baseline = illumina - CORSIV_PROBE_LIST

idx = 0 # change idx here based on data type: becon / icc / iir
target_cols = ["Mean Cor All Brain", "ICC", "iir1"]
fnames = ["data/humanData/becon_all_probes.csv", "data/humanData/Flanagan/Flanagan_icc_results.csv", "data/humanData/Flanagan/Flanagan_iir_results.csv"]
probe_cols = ["CpG ID", "ID", "ID"]
xlabels = ["Brain-Blood Correlation (BECon)", "Intraclass Correlation Coefficient (ICC)", r"IIR$_{2-98\%}$"]
output_path = ["becon", "icc", "iir"]

# Per-category probe -> paper-count mappings, in CATEGORY_NAMES order.
cat_probes_dict = [read_in_probes(cat) for cat in CATEGORY_NAMES]

target_col = target_cols[idx]
df = pd.read_csv(fnames[idx])
colname = probe_cols[idx]
regions = list(zip(["Non-CoRSIV", "CoRSIV"], [non_corsiv_baseline, CORSIV_PROBE_LIST]))
xlabel = xlabels[idx]
output_id = output_path[idx]


def _plot_region_densities(ax, region_frames, value_col, corsiv_color):
    """Draw one filled Gaussian-KDE curve per (frame, region name) pair on `ax`.

    The CoRSIV curve uses `corsiv_color`; every other region is grey. Each
    curve is evaluated on 200 points in [-1, 1] and labelled at its peak.
    """
    xs = np.linspace(-1, 1, 200)
    for df_subset, rname in region_frames:
        density = stats.gaussian_kde(df_subset[value_col])
        ys = density(xs)
        color = corsiv_color if rname == 'CoRSIV' else 'grey'
        ax.plot(xs, ys, "-", color=color, label=rname, linewidth=3)
        ax.fill_between(xs, ys, alpha=0.5, color=color)
        peak_index = np.argmax(ys)
        ax.text(xs[peak_index], ys[peak_index]+0.04, f"{rname}", fontsize=15, #\n(n={len(df_subset):,})
                verticalalignment='bottom', horizontalalignment='center',
                color=color)


def _style_density_axes(ax, data_idx):
    """Apply the tick and limit settings that belong to the selected data type."""
    ax.tick_params(axis='both', which='major', labelsize=16, length=5)
    if data_idx == 2:
        ax.set_xticks([0, 0.5, 1.0])
        ax.set_xlim(-0.1, 1.1)
    elif data_idx == 1:
        ax.set_xticks([-1, -0.5, 0.0, 0.5, 1.0])
    else:
        ax.set_yticks([0, 1.0, 2.0])
        ax.set_ylim(0, 2.5)
        ax.set_xticks([-1, 0.0, 1.0])


# Create subplots with 3 rows and 4 columns
fig, axes = plt.subplots(3, 4, figsize=(12, 9))
axes = axes.flatten()
# For idx == 0 the neurological panel is left out.
lst = [c for c in CATEGORY_NAMES if c != "neurological"] if idx == 0 else CATEGORY_NAMES

# One panel per category: probes with a paper count of at least 2, split by region.
for cat_idx, catname in enumerate(lst):
    ax = axes[cat_idx]
    i = CATEGORY_NAMES.index(catname)

    papers_threshold = 2
    p = set(k for k, v in cat_probes_dict[i].items() if v >= papers_threshold)
    dfs_for_plot = [
        (df[df[colname].isin(rset.intersection(p))], rname)
        for rname, rset in regions
    ]

    _plot_region_densities(ax, dfs_for_plot, target_col, COLOR_TEMPLATE[i])
    _style_density_axes(ax, idx)
    ax.set_title(catname.capitalize(),
                 fontsize=20, pad=10)

# Final panel: every probe in the data file, split by region only.
ax = axes[10] if idx == 0 else axes[11]
dfs_for_plot = [(df[df[colname].isin(rset)], rname) for rname, rset in regions]
_plot_region_densities(ax, dfs_for_plot, target_col, 'black')
_style_density_axes(ax, idx)
ax.set_title("All Probes",
                fontsize=20, pad=10)
if idx == 0:
    # The last grid cell is unused for idx == 0.
    fig.delaxes(axes[-1])

# Add a vertical line with the shared y-axis label to its left
x = 0
fig.add_artist(plt.Line2D([x, x], [0.06,0.94], transform=fig.transFigure, color='black', linestyle='-', linewidth=2))
fig.text(x-0.04, 0.5, 'Density', va='center', rotation='vertical', fontsize=24)

# Add a horizontal line with the shared x-axis label below it
y = 0
fig.add_artist(plt.Line2D([0.04, 0.98], [y, y], transform=fig.transFigure, color='black', linestyle='-', linewidth=2))
fig.text(0.5, y-0.04, xlabel, ha='center', rotation='horizontal', fontsize=24)
plt.tight_layout()
plt.show()
# output = f"{SFIG_PATH}/{output_id}_kde_all_categories.svg"
# plt.savefig(output, format="svg")

In [None]:
# fig. S13B: becon regression plots for all categories except neurological
import math
import scipy.stats as stats
import matplotlib.pyplot as plt
import numpy as np
import statsmodels.api as sm

non_corsiv_baseline = illumina - CORSIV_PROBE_LIST
target_col = "Mean Cor All Brain"
df = pd.read_csv("data/humanData/becon_all_probes.csv")
regions = list(zip(["CoRSIV", "Non-CoRSIV"], [CORSIV_PROBE_LIST, non_corsiv_baseline]))

# Create subplot grid
fig, axs = plt.subplots(3, 2, figsize=(12, 11))
axs = axs.flatten()

# Plot each category except neurological
for plot_idx, cat in enumerate(["cancer", "endocrine", "immune", "metabolic", "urogenital"]):
    cat_probes = read_in_probes(cat)
    ax = axs[plot_idx]
    max_papers = max(cat_probes.values())

    for rname, rset in regions:
        # Median of `target_col` for probes with exactly `pidx` papers, for
        # pidx = 1, 2, ... until a bin has fewer than 15 probes.
        medians = []
        paper_counts = []
        for pidx in range(1, max_papers + 1):
            p = set(k for k, v in cat_probes.items() if v == pidx)
            probes_in_region = rset.intersection(p)
            filtered_df = df[df["CpG ID"].isin(probes_in_region)]
            if len(filtered_df) < 15:
                # Truncating max_papers here also caps the range used for the
                # next region and the y-axis ticks below.
                max_papers = pidx - 1
                break
            medians.append(filtered_df[target_col].median())
            paper_counts.append(pidx)

        if len(medians) < 2:
            # A line cannot be fitted (and X[-1] below would fail on an empty
            # array), so this region is skipped for this category.
            continue

        # Ordinary least squares of paper count on median correlation.
        X = np.array(medians)
        y = np.array(paper_counts)
        X_const = sm.add_constant(X)
        model = sm.OLS(y, X_const).fit()

        color = COLOR_TEMPLATE[CATEGORY_NAMES.index(cat)] if rname == 'CoRSIV' else 'grey'
        marker = 'D' if rname == 'CoRSIV' else 'o'
        ax.scatter(X, y, color=color, label=rname, s=200, marker=marker)
        ax.plot(X, model.predict(X_const), color=color, linestyle='--', linewidth=4, zorder=10, alpha=0.6)

        # Format straight from the model values: rounding to 3 decimals first
        # and then formatting to 2 can round twice.
        annotation_text = f"R² = {model.rsquared:.2f}\nP = {model.f_pvalue:.2f}"

        # Add annotation next to the last point, at half its height
        ax.annotate(annotation_text,
                    xy=(X[-1]-0.04, y[-1]/2),
                    xytext=(10, 0),
                    textcoords='offset points',
                    color=color,
                    fontsize=20,
                    ha='left',
                    va='center',
                    bbox=dict(boxstyle='round,pad=0.5', fc='none', ec='none', alpha=1))

    # ax.set_xlabel("Median Brain-Blood Correlation (BECon)", fontsize=22)
    # ax.set_ylabel('Number of Papers', fontsize=18)
    ax.set_title(cat.capitalize(), fontsize=28, pad=15)
    ax.set_xlim(-0.05, 0.6)
    ax.set_xticks(np.arange(0.0, 0.7, 0.1))
    ax.tick_params(axis='x', which='major', length=5)
    ax.set_yticks(range(1, max_papers + 1))
    ax.set_ylim(0, max_papers + 1)
    ax.tick_params(axis='both', which='major', labelsize=25)

# The sixth grid cell only hosts the legend.
axs[-1].axis('off')
# NOTE(review): the legend calls the grey series "Control" while the plotted label is "Non-CoRSIV".
handles = [plt.Line2D([0], [0], marker='D', color='w', markerfacecolor="k",
                      label="CoRSIV", markersize=15)]
handles.append(plt.Line2D([0], [0], marker='o', color='w', label='Control', markersize=20, markerfacecolor='grey'))

axs[-1].legend(handles=handles, loc='center', fontsize=24, frameon=False)

# Add a vertical line with the shared y-axis label to its left
x = 0
fig.add_artist(plt.Line2D([x, x], [0.06,0.94], transform=fig.transFigure, color='black', linestyle='-', linewidth=2))
fig.text(x-0.04, 0.5, 'Number of Papers', va='center', rotation='vertical', fontsize=24)

# Add a horizontal line with the shared x-axis label below it
y = 0
fig.add_artist(plt.Line2D([0.04, 0.96], [y, y], transform=fig.transFigure, color='black', linestyle='-', linewidth=2))
fig.text(0.5, y-0.04, "Median Brain-Blood Correlation (BECon)", ha='center', rotation='horizontal', fontsize=24)
plt.tight_layout()

# output = f"{SFIG_PATH}/becon_regression_all_categories.svg"
# plt.savefig(output, format="svg")
plt.show()

In [2]:
# fig. S14: distribution of papers by the number of probes reported
# Stack the per-category probe tables and keep one row per (paper, probe) pair.
dfs = [pd.read_csv(f"before_filter1000/{cat}_all_probes.csv") for cat in CATEGORY_NAMES]
df = pd.concat(dfs).drop_duplicates(subset=["pmcid", "probeId"])

ref = pd.read_excel("data/2203_studies_info.xlsx")["PMCID"]
# NOTE(review): this is a left join whose added column is dropped right away,
# so it does not restrict `df` to the studies in `ref` -- confirm whether an
# inner join was intended.
df = pd.merge(df, ref, right_on="PMCID", left_on="pmcid", how="left")[["pmcid", "probeId"]]

# Number of distinct probes per paper.
grouped_df = df.groupby('pmcid')['probeId'].nunique().reset_index(name='unique_probe_count')

# Only papers with fewer than 10000 distinct probes are shown, in bins of 100.
tmp = grouped_df[grouped_df['unique_probe_count'] < 10000]
hist_fig, hist_ax = plt.subplots(figsize=(6,6))
bin_edges = range(0, int(tmp['unique_probe_count'].max()) + 100, 100)
hist_ax.hist(tmp['unique_probe_count'], bins=bin_edges, edgecolor='black', color='skyblue')
# Dashed reference line at 1000 probes.
hist_ax.axvline(x=1000, color='red', linestyle='--', alpha=0.7)

hist_ax.set_xlabel('Number of Probes Reported', fontsize=24)
hist_ax.set_ylabel('Number of Papers', fontsize=24)
hist_ax.set_title('Distribution of Papers by the Number of Probes Reported', fontsize=24, pad=20)

hist_ax.set_xlim(0, 10000)
hist_fig.tight_layout()
plt.show()
# plt.savefig(f"{SFIG_PATH}/supp_table_size_zoomedin.jpeg", dpi=300, bbox_inches='tight')
# plt.close()


In [None]:
# fig. S15: control metrics
control_info = pd.read_csv("data/humanData/corsiv_control/corsiv_control_matching.csv")
control_info.columns = ["Control_ID", "Control Chr", "Control Start", "Control End", "Control Region Size (bp)",
                       "Control CpG Count", "Control TSS Count", "Control Gene Body Count", "Control TES Count",
                       "Control Probe Count", "CoRSIV_ID", "CoRSIV Chr", "CoRSIV Start", "CoRSIV End",
                       "CoRSIV Region Size (bp)", "CoRSIV CpG Count", "CoRSIV TSS Count", "CoRSIV Gene Body Count",
                       "CoRSIV TES Count", "CoRSIV Probe Count"]
# Only pairs whose CoRSIV region has at least one probe.
control_info = control_info[control_info["CoRSIV Probe Count"] > 0]
metrics = ["Region Size (bp)", "Probe Count", "CpG Count", "TSS Count", "Gene Body Count", "TES Count"]
fig, axes = plt.subplots(2, 3, figsize=(8, 6), gridspec_kw={'hspace': 0.2, 'wspace':0.6})
axes = axes.flatten()

# One linear colour scale shared by all panels and the colorbar.
norm = plt.Normalize(vmin=1, vmax=1000)

for i, m in enumerate(metrics):
    ax = axes[i]
    data = pd.DataFrame({
        'x': control_info[f'CoRSIV {m}'],
        'y': control_info[f'Control {m}']
    })
    # Colour each point by how many region pairs share the same (x, y) value.
    data['frequency'] = data.groupby(['x', 'y'])['x'].transform('count')
    scatter = ax.scatter(data['x'], data['y'],
                        c=data['frequency'],
                        cmap='Blues',
                        norm=norm,
                        s=50,
                        edgecolor='grey',
                        alpha=0.8)
    ax.set_aspect('equal')
    # Square axes: same upper limit and the same tick positions on x and y.
    max_val = max(ax.get_xlim()[1], ax.get_ylim()[1])
    ax.set_xlim(0, max_val)
    ax.set_ylim(0, max_val)
    ax.set_title(m, fontsize=16)
    ax.set_xticks(ax.get_yticks())
    ax.set_yticks(ax.get_yticks())
    # Re-apply limits after fixing the ticks, with a small margin below zero.
    ax.set_xlim(-1, max_val)
    ax.set_ylim(-1, max_val)

plt.subplots_adjust(right=0.88)
# Named `color_mappable` so the `sm` alias of statsmodels used by the
# regression cell is not overwritten.
color_mappable = plt.cm.ScalarMappable(cmap='Blues', norm=norm)
color_mappable.set_array([])
cbar_ax = fig.add_axes([0.92, 0.15, 0.02, 0.7])
cbar = fig.colorbar(color_mappable, cax=cbar_ax, label='Number of Regions')
cbar.ax.set_ylabel('Number of Regions', fontsize=20)
cbar.ax.tick_params(labelsize=16)
# Linear ticks every 200; the top tick (1000) is labelled ">1000".
cbar.set_ticks(range(200, 1200, 200))
labels = [str(i) if i < 1000 else ">1000" for i in range(200, 1200, 200)]
cbar.set_ticklabels(labels)

# Vertical rule with the shared y-axis label to its left
x = 0.03
fig.add_artist(plt.Line2D([x, x], [0.15,0.85], transform=fig.transFigure, color='black', linestyle='-', linewidth=2))
fig.text(x-0.08, 0.5, 'Control', va='center', rotation='vertical', fontsize=24)

# Horizontal rule with the shared x-axis label below it
y = 0.05
fig.add_artist(plt.Line2D([0.12, 0.88], [y, y], transform=fig.transFigure, color='black', linestyle='-', linewidth=2))
fig.text(0.5, y-0.08, "CoRSIV", ha='center', rotation='horizontal', fontsize=24)
# plt.tight_layout()
plt.show()
# plt.savefig(f"{SFIG_PATH}/control_metrics.jpeg", dpi=300, bbox_inches='tight')

