In [2]:

"""
Initialization
"""
# imports
import os
import pandas as pd
from collections import Counter
import time
import matplotlib.pyplot as plt
import numpy as np

# constant, data
# Every relative path in this notebook ("../control", "probe/", "becon/", ...) is resolved
# against this working directory. Set TEXT_MINING_CATEGORIES_DIR to run on another machine;
# the default is the original location.
os.chdir(os.environ.get(
    "TEXT_MINING_CATEGORIES_DIR",
    "/Users/antata/Library/CloudStorage/OneDrive-BaylorCollegeofMedicine/text-mining/categories/",
))

# Disease categories and their plotting colours (parallel lists, same order).
category_names = ["cancer", "cardiovascular", "digestive", "endocrine", "hematological", "immune", "metabolic", "neurological", "obesity", "respiratory", "urogenital"]#, "T2D"]
category_name_to_idx = {c: i for i, c in enumerate(category_names)}
color_template = ['#e6194B','#f58231','#f3c300','#469990','#808000','#2f8e3b','#0db7dd','#4363d8','#800000','#f032e6','#911eb4']#, "#000075"]#'#8298e5'

# CoRSIV probe IDs.
corsiv_probe_df = pd.read_csv("../control/corsiv_all_probes_id.txt", sep="\t", names=["chr", "start", "end", "probeId", "corsiv_start", "corsiv_end", "corsiv_id"])
corsiv_probe_list = set(corsiv_probe_df["probeId"])

# Ten control probe sets (files control_probes_1.txt ... control_probes_10.txt).
controls = []
for i in range(1, 11):
    control_probe_df = pd.read_csv(f"../control/control_probes_{i}.txt", sep="\t", names=["chr", "start", "end", "probeId", "_", "corsiv_start", "corsiv_end", "corsiv_id"])
    controls.append(set(control_probe_df["probeId"]))

# Combined control probe IDs; control_probe_df / control_probe_list end up referring to this file.
control_probe_df = pd.read_csv("../control/control_all_probes_id.txt", sep="\t", names=["chr", "start", "end", "probeId", "id"])
control_probe_list = set(control_probe_df["probeId"])

# Array manifests have no header row; the probe ID is read from the fourth column.
epic = pd.read_csv("../humanData/database/EPIC.hg38.txt", sep="\t", header=None)
epic_probe_list = set(epic.iloc[:,3])
hm450 = pd.read_csv("../humanData/database/HM450.hg38.txt", sep="\t", header=None)
hm450_probe_list = set(hm450.iloc[:,3])
illumina = epic_probe_list.union(hm450_probe_list)

# 1.5x the fraction of all array probes that are CoRSIV probes.
enrichment_cutoff = (len(corsiv_probe_list) / len(illumina)*1.5)
# Baseline: every array probe that is not a CoRSIV probe.
non_corsiv_baseline = illumina - corsiv_probe_list

# Global matplotlib styling.
plt.rcParams['font.family'] = 'Helvetica'
plt.rcParams['font.size'] = 18
plt.rcParams['axes.linewidth'] = 2  # Thicker outer box
plt.rcParams['xtick.major.width'] = 2  # Thicker x-axis ticks
plt.rcParams['ytick.major.width'] = 2  # Thicker y-axis ticks

In [3]:
"""
Read every category's probes, assuming the combined per-category file already exists.
"""
cat_probes_dict = []

def read_in_probes(input_cat):
    """Tally probe occurrences for one category and store them in ``cat_probes_dict``.

    Reads ``probe/<input_cat>_all_probes.csv`` (the "metabolic" category is read from
    ``probe/metabolic_diseases_all_probes.csv``), counts how many times each value
    of the ``probeId`` column occurs, and appends the resulting ``{probeId: count}``
    dict to the module-level ``cat_probes_dict`` list. Returns None.
    """
    csv_path = ("probe/metabolic_diseases_all_probes.csv"
                if input_cat == "metabolic"
                else f"probe/{input_cat}_all_probes.csv")
    probe_ids = pd.read_csv(csv_path)["probeId"].to_list()
    cat_probes_dict.append(dict(Counter(probe_ids)))
    return


# Load the categories in `category_names` order so that cat_probes_dict[i]
# lines up with category_names[i]; report the wall-clock time of each load.
for cat in category_names:
    t0 = time.time()
    print(cat)
    read_in_probes(cat)
    elapsed = time.time() - t0
    print(f'Time for {cat} code to run: ', elapsed)


cancer
Time for cancer code to run:  0.15421319007873535
cardiovascular
Time for cardiovascular code to run:  0.020445823669433594
digestive
Time for digestive code to run:  0.03392982482910156
endocrine
Time for endocrine code to run:  0.03881025314331055
hematological
Time for hematological code to run:  0.02509284019470215
immune
Time for immune code to run:  0.04961729049682617
metabolic
Time for metabolic code to run:  0.045599937438964844
neurological
Time for neurological code to run:  0.11101698875427246
obesity
Time for obesity code to run:  0.01691579818725586
respiratory
Time for respiratory code to run:  0.029135942459106445
urogenital
Time for urogenital code to run:  0.09289073944091797


In [None]:
# Histogram of the "Mean Cor All Brain" column over every probe in the BECon table,
# using 0.2-wide bins spanning [-1, 1].
bins = [-1, -0.8, -0.6, -0.4, -0.2, 0, 0.2, 0.4, 0.6, 0.8, 1.0]
df = pd.read_csv("becon/becon_all_probes.csv")

mean_cor_all_brain = df["Mean Cor All Brain"]
mean_cor_all_brain.hist(color='grey', edgecolor='black', bins=bins)
plt.show()


In [47]:
import math
import scipy.stats as stats

# Collect, per category, the median of `target_col` among probes reported by exactly
# 1, 2, 3, ... papers, separately for CoRSIV and Non-CoRSIV probes.
target_col = "Mean Cor All Brain"
df = pd.read_csv("becon/becon_all_probes.csv")
# CoRSIV is listed first so a paper count is abandoned before any Non-CoRSIV value
# is recorded for it.
regions = list(zip(["CoRSIV", "Non-CoRSIV"], [corsiv_probe_list, non_corsiv_baseline]))
subfig_size = 4
# Two slots per category: median_pairs[2*i] holds the CoRSIV medians and
# median_pairs[2*i + 1] the Non-CoRSIV medians of category_names[i].
# (The previous i / i+1 indexing made neighbouring categories share a slot.)
median_pairs = [[] for _ in category_names*2]
for i, catname in enumerate(category_names):
    max_papers = max(cat_probes_dict[i].values())
    for pidx in range(1, max_papers + 1):
        # Probes reported by exactly `pidx` papers in this category.
        p = set(k for k, v in cat_probes_dict[i].items() if v == pidx)
        skip = False
        for rname, rset in regions:
            probes_in_region = rset.intersection(p)
            filtered_df = df[df["CpG ID"].isin(probes_in_region)]
            if rname == "CoRSIV" and len(filtered_df) < 10:
                # Too few CoRSIV probes at this paper count: stop this category here.
                skip = True
                break
            slot = 2*i if rname == "CoRSIV" else 2*i + 1
            median_pairs[slot].append(filtered_df[target_col].median())
        if skip:
            break
    # Only the first category is processed; a later cell reads median_pairs[0].
    # NOTE(review): remove this break to fill the slots of every category.
    break

In [None]:
import math
import scipy.stats as stats
import numpy as np
import statsmodels.api as sm
# Combined figure: for up to six categories, side-by-side box plots of `target_col`
# for Non-CoRSIV (grey) vs CoRSIV (category colour) probes, grouped by the exact
# number of papers reporting each probe.
# `bins` and `subfig_size` are defined here but not used by the code in this cell.
bins = [-1, -0.8, -0.6, -0.4, -0.2, 0, 0.2, 0.4, 0.6, 0.8, 1.0]
target_col = "Mean Cor All Brain"
df = pd.read_csv("becon/becon_all_probes.csv")
# CoRSIV comes first so that a paper count is abandoned before its Non-CoRSIV frame
# is appended; dfs_for_plot therefore always holds (CoRSIV, Non-CoRSIV) pairs.
regions = list(zip(["CoRSIV", "Non-CoRSIV"], [corsiv_probe_list, non_corsiv_baseline]))
subfig_size = 4


fig = plt.figure(figsize=(8, 8))
gs = fig.add_gridspec(3, 6)  # 3 rows x 6 grid columns; each row is split into two panels
ax1 = fig.add_subplot(gs[0, 0:4])  # row 0, grid columns 0-3
ax2 = fig.add_subplot(gs[0, 4:6])  # row 0, grid columns 4-5
ax3 = fig.add_subplot(gs[1, 0:3])  # row 1, grid columns 0-2
ax4 = fig.add_subplot(gs[1, 3:6])  # row 1, grid columns 3-5
ax5 = fig.add_subplot(gs[2, 0:4])  # row 2, grid columns 0-3
ax6 = fig.add_subplot(gs[2, 4:6])  # row 2, grid columns 4-5
axs = [ax1, ax2, ax3, ax4, ax5, ax6]
# NOTE(review): plt.tight_layout() is called at the end of this cell and may
# recompute the spacing requested here -- confirm the rendered result.
fig.subplots_adjust(hspace=0.01, wspace=0.01)  # Adjust vertical and horizontal spacing to be tiny

# Index of the next free panel in `axs`.
# NOTE(review): only six panels exist; a seventh qualifying category would raise IndexError.
plot_idx = 0
for i, catname in enumerate(category_names):
    if catname == "obesity":
        continue
    dfs_for_plot = []
    max_papers = max(cat_probes_dict[i].values())
    for pidx in range(1, max_papers + 1):
        # Probes reported by exactly `pidx` papers in this category.
        p = set(k for k, v in cat_probes_dict[i].items() if v == pidx)
        skip = False
        for rname, rset in regions:
            probes_in_region = rset.intersection(p)
            filtered_df = df[df["CpG ID"].isin(probes_in_region)]
            # Stop at the first paper count with fewer than 10 CoRSIV probes.
            if rname == "CoRSIV" and len(filtered_df) < 10:
                skip = True
                break
            else:
                dfs_for_plot.append(filtered_df)
        if skip:
            break
    # Number of paper counts that were kept (two frames per paper count).
    nrows = int(len(dfs_for_plot)/2)
    # Categories with fewer than three usable paper counts are not plotted.
    if nrows < 3:
        continue
    ax = axs[plot_idx]
    box_positions = []
    box_data = []
    # `labels` is filled below but never passed to matplotlib in this cell.
    # NOTE(review): the label text says "≥ n papers" while the selection above
    # uses v == pidx (exactly n papers).
    labels = []
    colors = []
    for j in range(0, len(dfs_for_plot), 2):
        non_corsiv_df = dfs_for_plot[j+1]
        corsiv_df = dfs_for_plot[j]
        position = (j // 2) + 1
        # Non-CoRSIV box sits left of the integer tick, CoRSIV box right of it.
        box_positions.extend([position - 0.2, position + 0.2])
        box_data.append(non_corsiv_df[target_col])
        labels.append(f"Non-CoRSIV\n≥ {j//2+1} papers")
        colors.append('grey')
        box_data.append(corsiv_df[target_col])
        labels.append(f"CoRSIV\n≥ {j//2+1} papers")
        colors.append(color_template[i])

    bp = ax.boxplot(box_data, positions=box_positions, patch_artist=True, widths=0.3)
    for median in bp['medians']:
        median.set_color('black')

    for patch, color in zip(bp['boxes'], colors):
        patch.set_facecolor(color)

    # ax.set_xlabel('Number of Papers')
    # ax.set_ylabel(target_col)
    ax.set_title(category_names[i].capitalize())
    ax.set_xticks(range(1, nrows + 1))
    ax.set_xticklabels(range(1, nrows + 1))
    ax.set_yticks(np.arange(-1.0, 1.1, 0.5))
    plot_idx += 1
# Shared axis captions, positioned in figure coordinates just outside the panels.
fig.text(0.5, -0.03, 'Number of Papers Reporting Probes', ha='center', va='center', fontsize=18)
fig.text(-0.03, 0.5, "Mean Blood-Brain Correlation", ha='center', va='center', rotation='vertical', fontsize=18)
# Shared axis lines in figure coordinates: a vertical line at x=0 and a horizontal line at y=0.
fig.add_artist(plt.Line2D([0, 0], [0.08, 0.92], color='black', linestyle='-', linewidth=2, transform=fig.transFigure))
fig.add_artist(plt.Line2D([0.08, 0.98], [0, 0], color='black', linestyle='-', linewidth=2, transform=fig.transFigure))
# plt.savefig(f"/Users/antata/Desktop/text-mining-figures/becon/{target_col}_boxplot_combined.pdf", format="pdf")
plt.tight_layout()
plt.show()


In [None]:
import math
import scipy.stats as stats
import numpy as np
import statsmodels.api as sm
# Per category: (1) a box plot of `target_col` for Non-CoRSIV vs CoRSIV probes grouped by
# the exact number of reporting papers, annotated with OLS statistics, and (2) a scatter
# plot of the per-paper-count medians with their OLS regression lines.
# `bins` and `subfig_size` are defined here but not used by the code in this cell.
bins = [-1, -0.8, -0.6, -0.4, -0.2, 0, 0.2, 0.4, 0.6, 0.8, 1.0]
target_col = "Mean Cor All Brain"
df = pd.read_csv("becon/becon_all_probes.csv")
# CoRSIV first, so dfs_for_plot always holds (CoRSIV, Non-CoRSIV) pairs.
regions = list(zip(["CoRSIV", "Non-CoRSIV"], [corsiv_probe_list, non_corsiv_baseline]))
subfig_size = 4

for i, catname in enumerate(category_names):
    dfs_for_plot = []
    max_papers = max(cat_probes_dict[i].values())
    # Median of target_col per paper count, one list per region.
    corsiv_median = []
    non_corsiv_median = []
    for pidx in range(1, max_papers + 1):
        # Probes reported by exactly `pidx` papers in this category.
        p = set(k for k, v in cat_probes_dict[i].items() if v == pidx)
        skip = False
        for rname, rset in regions:
            probes_in_region = rset.intersection(p)
            filtered_df = df[df["CpG ID"].isin(probes_in_region)]
            # Stop at the first paper count with fewer than 10 CoRSIV probes.
            if rname == "CoRSIV" and len(filtered_df) < 10:
                skip = True
                break
            else:
                if rname == "CoRSIV":
                    corsiv_median.append(filtered_df[target_col].median())
                else:
                    non_corsiv_median.append(filtered_df[target_col].median())
                dfs_for_plot.append(filtered_df)
        if skip:
            break
    # Number of paper counts kept; categories with fewer than three are skipped.
    nrows = int(len(dfs_for_plot)/2)
    if nrows < 3:
        continue
    fig, ax = plt.subplots(figsize=(10, 4))  # Single figure with adjusted size

    box_positions = []
    box_data = []
    # `labels` is filled below but never passed to matplotlib in this cell.
    # NOTE(review): label text says "≥ n papers" while the selection uses v == pidx.
    labels = []
    colors = []
    for j in range(0, len(dfs_for_plot), 2):
        non_corsiv_df = dfs_for_plot[j+1]
        corsiv_df = dfs_for_plot[j]

        position = (j // 2) + 1
        # Non-CoRSIV box left of the integer tick, CoRSIV box right of it.
        box_positions.extend([position - 0.2, position + 0.2])

        box_data.append(non_corsiv_df[target_col])
        labels.append(f"Non-CoRSIV\n≥ {j//2+1} papers")
        colors.append('grey')

        # Always true here: dfs_for_plot only ever receives DataFrames.
        if corsiv_df is not None:
            box_data.append(corsiv_df[target_col])
            labels.append(f"CoRSIV\n≥ {j//2+1} papers")
            colors.append(color_template[i])

    bp = ax.boxplot(box_data, positions=box_positions, patch_artist=True, widths=0.3)
    for median in bp['medians']:
        median.set_color('black')

    for patch, color in zip(bp['boxes'], colors):
        patch.set_facecolor(color)

    ax.set_xlabel('Number of Papers')
    ax.set_ylabel(target_col)
    ax.set_title(f"{category_names[i].capitalize()} - {target_col}")

    ax.set_xticks(range(1, nrows + 1))
    ax.set_xticklabels(range(1, nrows + 1))

    # Add legend
    # non_corsiv_patch = plt.Rectangle((0, 0), 1, 1, fc="grey")
    # corsiv_patch = plt.Rectangle((0, 0), 1, 1, fc=color_template[i])
    # ax.legend([non_corsiv_patch, corsiv_patch], ['Non-CoRSIV', 'CoRSIV'], loc='upper right')

    # models[0]/data[0] belong to CoRSIV, models[1]/data[1] to Non-CoRSIV.
    models = []
    data = []
    # OLS of the per-paper-count median (y) on the paper count 1..n (x), once per region.
    for enum, lst in enumerate([corsiv_median, non_corsiv_median]):
        X = np.array(list(range(1, len(lst)+1)))
        y = np.array(lst)
        # Column 0 becomes the intercept term, column 1 the paper count.
        X = sm.add_constant(X)
        model = sm.OLS(y, X).fit()

        # Store the model results for later use
        models.append(model)
        data.append((X[:, 1], y))

        # Index 0 is the intercept, index 1 the slope (see add_constant above).
        # f_stat and f_pvalue are computed but not shown in the annotation.
        r_squared = round(model.rsquared, 3)
        f_stat = round(model.fvalue, 3)
        f_pvalue = round(model.f_pvalue, 3)
        x1_pvalue = round(model.pvalues[1], 3)
        slope = round(model.params[1], 3)
        b_pvalue = round(model.pvalues[0], 3)
        intercept = round(model.params[0], 3)
        # Annotate the plot with regression statistics
        region = "CoRSIV" if enum == 0 else "Non-CoRSIV"
        # x position in axes-fraction units; values > 1 place the box right of the axes.
        pos = 1.05 if enum == 0 else 1.35
        annotation_text = (f"{region}:\nR² = {r_squared}\n"
                           f"Slope = {slope}\n"
                        f"Slope p-value = {x1_pvalue}\n"
                        f"Intercept = {intercept}\n"
                        f"Intercept p-value = {b_pvalue}")

        # Place the annotation to the right of the axes, aligned near the top.
        ax.annotate(annotation_text, xy=(pos, 0.95), xycoords='axes fraction',
                    fontsize=8, ha='left', va='top',
                    bbox=dict(boxstyle='round', facecolor='white', alpha=0.8))
    ax.set_ylim(-1, 1)
    # Replaces the x ticks set above with one tick per CoRSIV data point.
    ax.set_xticks(range(1, len(data[0][0]) + 1))
    plt.tight_layout()
    plt.show()
    # plt.savefig(f"/Users/antata/Desktop/text-mining-figures/becon/{target_col}_{category_names[i]}_boxplot.jpeg", format="jpeg")

    plt.close()
    # Second figure: medians vs paper count with the fitted lines.
    # NOTE(review): this figure is neither shown nor closed explicitly (those calls
    # are commented out below), so one figure per category stays open.
    fig, ax = plt.subplots(figsize=(5, 4)) 

    # Plot CoRSIV data and regression
    ax.scatter(data[0][0], data[0][1], color=color_template[i], label='Data (CoRSIV)')
    ax.plot(data[0][0], models[0].predict(sm.add_constant(data[0][0])), color=color_template[i], linestyle='--', label='Regression (CoRSIV)')

    # Plot Non-CoRSIV data and regression
    ax.scatter(data[1][0], data[1][1], color='grey', label='Data (Non-CoRSIV)')
    ax.plot(data[1][0], models[1].predict(sm.add_constant(data[1][0])), color='grey', linestyle='--', label='Regression (Non-CoRSIV)')

    ax.set_xlabel('Number of Papers')
    ax.set_ylabel(target_col)
    ax.set_ylim(-0.2, 1)
    ax.set_xticks(range(1, len(data[0][0]) + 1))
    plt.tight_layout()
    # plt.show()
    # plt.savefig(f"/Users/antata/Desktop/text-mining-figures/becon/{target_col}_{category_names[i]}_regression.jpeg", format="jpeg")
    # plt.close()
    # break



In [None]:
import math
import scipy.stats as stats
import numpy as np
import statsmodels.api as sm
# Per category: scatter of paper count (y) against the mean of `target_col` (x) for
# CoRSIV and Non-CoRSIV probes, with an OLS line and its statistics for each region.
# `bins` and `subfig_size` are defined here but not used by the code in this cell.
bins = [-1, -0.8, -0.6, -0.4, -0.2, 0, 0.2, 0.4, 0.6, 0.8, 1.0]
target_col = "Mean Cor All Brain"
df = pd.read_csv("becon/becon_all_probes.csv")
# CoRSIV first, so a paper count is abandoned before its Non-CoRSIV value is recorded.
regions = list(zip(["CoRSIV", "Non-CoRSIV"], [corsiv_probe_list, non_corsiv_baseline]))
subfig_size = 4

for i, catname in enumerate(category_names):
    max_papers = max(cat_probes_dict[i].values())
    # NOTE(review): despite their names, these lists store .mean() values (see below).
    corsiv_median = []
    non_corsiv_median = []
    # Number of probes behind each value; only referenced by the commented-out
    # point annotations further down.
    corsiv_size = []
    non_corsiv_size = []
    for pidx in range(1, max_papers + 1):
        # Probes reported by exactly `pidx` papers in this category.
        p = set(k for k, v in cat_probes_dict[i].items() if v == pidx)
        skip = False
        for rname, rset in regions:
            probes_in_region = rset.intersection(p)
            filtered_df = df[df["CpG ID"].isin(probes_in_region)]
            # Minimum CoRSIV probe count is 15 here (other cells in this notebook use 10).
            if rname == "CoRSIV" and len(filtered_df) < 15:
                skip = True
                break
            else:
                if rname == "CoRSIV":
                    corsiv_median.append(filtered_df[target_col].mean())
                    corsiv_size.append(len(filtered_df))
                else:
                    non_corsiv_median.append(filtered_df[target_col].mean())
                    non_corsiv_size.append(len(filtered_df))
        if skip:
            break

    # Need at least three paper counts to fit and draw a line.
    if len(corsiv_median) < 3:
        continue

    fig, ax = plt.subplots(figsize=(6, 4))

    # models[0]/data[0] belong to CoRSIV, models[1]/data[1] to Non-CoRSIV.
    models = []
    data = []
    for enum, lst in enumerate([corsiv_median, non_corsiv_median]):
        # The paper count 1..n is the response (y) and the mean of target_col the
        # predictor (X), matching the axis labels set below.
        y = np.array(list(range(1, len(lst)+1)))
        X = np.array(lst)
        # Column 0 becomes the intercept term, column 1 the predictor.
        X_const = sm.add_constant(X)
        model = sm.OLS(y, X_const).fit()

        models.append(model)
        data.append((X, y))

        # Index 0 is the intercept, index 1 the slope; the intercept values are
        # computed but left out of the annotation text below.
        r_squared = round(model.rsquared, 3)
        x1_pvalue = round(model.pvalues[1], 3)
        slope = round(model.params[1], 3)
        b_pvalue = round(model.pvalues[0], 3)
        intercept = round(model.params[0], 3)

        region = "CoRSIV" if enum == 0 else "Non-CoRSIV"
        color = color_template[i] if enum == 0 else 'grey'
        label = f'Data ({region})'

        ax.scatter(X, y, color=color, label=label)
        ax.plot(X, model.predict(X_const), color=color, linestyle='--', label=f'Regression ({region})')
        # for k, txt in enumerate(corsiv_size if region == "CoRSIV" else non_corsiv_size):
        #     ax.annotate(f"{txt:,}", (X[k], y[k]), textcoords="offset points", xytext=(0,10), ha='center', fontsize=10)

        # Vertical position (axes fraction) of the statistics box: CoRSIV above Non-CoRSIV.
        pos = 0.45 if enum == 1 else 0.75
        annotation_text = (f"{region}:\nR² = {r_squared}\n"
                           f"Slope = {slope}\n"
                           f"Slope p-value = {x1_pvalue}")
        #                           f"Intercept = {intercept}\n"
        #                   f"Intercept p-value = {b_pvalue}"

        # x = 1.05 in axes-fraction units places the box just right of the axes.
        ax.annotate(annotation_text, xy=(1.05, pos), xycoords='axes fraction',
                    fontsize=8, ha='left', va='top',
                    bbox=dict(boxstyle='round', facecolor='white', alpha=0.8))

    ax.set_title(category_names[i].capitalize())
    ax.set_xlabel(target_col)
    ax.set_xlim(-0.2, 0.6)
    ax.set_xticks(np.arange(-0.2, 0.6, 0.2))
    ax.set_ylabel('Number of Papers')
    # One y tick per CoRSIV data point.
    ax.set_yticks(range(1, len(data[0][0]) + 1))
    # ax.legend()
    plt.tight_layout()
    # NOTE(review): the figure is neither shown nor closed explicitly (calls are
    # commented out), so one figure per category stays open.
    # plt.show()
    # plt.savefig(f"/Users/antata/Desktop/text-mining-figures/becon/{target_col}_{category_names[i]}_regression.jpeg", format="jpeg")
    # plt.close()
    # break


In [None]:
from statsmodels.nonparametric.kernel_regression import KernelReg
import numpy as np

# Kernel regression of y on x, plotted over the raw points.
# NOTE(review): this expects `data` to expose 'x' and 'y' columns with a `.values`
# attribute (e.g. a DataFrame). No cell visible in this notebook defines such an
# object -- elsewhere `data` is a list of (X, y) tuples or a NumPy array -- so
# confirm the intended input before running.
x = data['x'].values
y = data['y'].values
model = KernelReg(y, x, var_type='c')  # 'c' for continuous data
# fit() returns a pair; only the first element is used as the fitted curve.
y_pred, _ = model.fit(x)

plt.scatter(x, y)
plt.plot(x, y_pred, color='blue')


In [None]:
import numpy as np
import statsmodels.api as sm

# `stats` is otherwise only imported in other cells; import it here so this cell
# does not depend on which cells happened to run before it.
import matplotlib.pyplot as plt
import scipy.stats as stats

# Regress the values stored in median_pairs[0] on their position 1..n.
y = np.array(median_pairs[0])
# Derive x from the data rather than hard-coding six points, so the cell still
# runs when a different number of values was collected upstream.
X = np.arange(1, len(y) + 1)
slope, intercept, r_value, p_value, std_err = stats.linregress(X, y)
print(slope, intercept, r_value, p_value, std_err)

# Same fit through statsmodels to get the full summary table.
# add_constant prepends the intercept column, so X[:, 1] is the original x.
X = sm.add_constant(X)
model = sm.OLS(y, X).fit()

# Scatter of the data points with the fitted regression line.
plt.figure(figsize=(10, 6))
plt.scatter(X[:, 1], y, color='blue', alpha=0.7, label='Data points')

y_pred = model.predict(X)
plt.plot(X[:, 1], y_pred, color='red', label='Regression line')

plt.xlabel('X')
plt.ylabel('Y')

plt.grid(True, alpha=0.3)
plt.show()

# Print model summary
print(model.summary())


In [None]:
import math
import scipy.stats as stats
import matplotlib.pyplot as plt

# 2x5 grid of kernel-density curves of `target_col`, one panel per category
# (hematological is left out), comparing Non-CoRSIV (grey) and CoRSIV (category
# colour) probes that were reported by at least two papers.
target_col = "Mean Cor All Brain"
bins = [-1, 0, 1.0]  # only min/max are used, as the x range of the curves
subfig_size = 3  # inches per panel
df = pd.read_csv("becon/becon_all_probes.csv")
regions = list(zip(["Non-CoRSIV", "CoRSIV"], [non_corsiv_baseline, corsiv_probe_list]))

fig, axs = plt.subplots(2, 5, figsize=(subfig_size*5, subfig_size*2))
axs = axs.flatten()  # index panels with a single running counter

plot_index = 0
for i, catname in enumerate(category_names):
    if catname.lower() == 'hematological':
        continue

    max_papers = max(cat_probes_dict[i].values())
    # Probes reported by two or more papers in this category.
    multi_paper_probes = {probe for probe, n_papers in cat_probes_dict[i].items() if n_papers >= 2}
    dfs_for_plot = [
        (df[df["CpG ID"].isin(region_probes.intersection(multi_paper_probes))], region_name)
        for region_name, region_probes in regions
    ]

    ax = axs[plot_index]

    for df_subset, rname in dfs_for_plot:
        curve_color = 'grey' if rname != 'CoRSIV' else color_template[i]
        density = stats.gaussian_kde(df_subset[target_col])
        xs = np.linspace(min(bins), max(bins), 200)
        ys = density(xs)
        ax.plot(xs, ys, "-", color=curve_color, label=f"{rname} Probes ≥ 2 papers", linewidth=3)
        ax.fill_between(xs, ys, alpha=0.5, color=curve_color)

    # Identical ticks and y range on every panel.
    ax.set_xticks([-1, 0.0, 1.0])
    ax.set_yticks([0, 1.0, 2.0])
    ax.set_ylim(0, 2.1)
    ax.set_title(category_names[i].capitalize())

    plot_index += 1

plt.tight_layout()
output = f"/Users/antata/Desktop/text-mining-figures/Fig5/becon_kde.svg"
plt.savefig(output, format="svg")
plt.show()
plt.close()


In [None]:
import math
import scipy.stats as stats
import matplotlib.pyplot as plt

# 2x5 grid of normalised histograms of `target_col`, one panel per category
# (hematological is left out), comparing Non-CoRSIV (grey) and CoRSIV (category
# colour) probes that were reported by at least two papers.
bins = np.arange(-1, 1.1, 0.1)
target_col = "Mean Cor All Brain"
df = pd.read_csv("becon/becon_all_probes.csv")
regions = list(zip(["Non-CoRSIV", "CoRSIV"], [non_corsiv_baseline, corsiv_probe_list]))
subfig_size = 3  # Size for each subplot

# Create a 2x5 grid of subplots
fig, axs = plt.subplots(2, 5, figsize=(subfig_size*5, subfig_size*2))
axs = axs.flatten()  # Flatten the 2D array of axes for easier indexing

plot_index = 0
for i, catname in enumerate(category_names):
    if catname.lower() == 'hematological':
        continue  # Skip hematological category

    # Probes reported by two or more papers in this category.
    p = set(k for k, v in cat_probes_dict[i].items() if v >= 2)
    ax = axs[plot_index]

    for rname, rset in regions:
        df_subset = df[df["CpG ID"].isin(rset.intersection(p))]
        # The kernel-density estimate that used to be computed here was never
        # drawn and could raise on very small subsets, so it has been dropped.
        hist, bin_edges = np.histogram(df_subset[target_col], bins=bins)
        total = np.sum(hist)
        if total == 0:
            # Nothing to draw; also avoids a 0/0 when normalising.
            continue
        hist = hist / total  # Bar heights are fractions of the probes in this subset
        bin_centers = (bin_edges[:-1] + bin_edges[1:]) / 2
        ax.bar(bin_centers, hist, width=np.diff(bin_edges), align='center', alpha=0.7,
               color=color_template[i] if rname == 'CoRSIV' else 'grey',
               label=f"{rname} Probes ≥ 2 papers", edgecolor='black')

    # Set x-ticks
    ax.set_xticks([-1, 0.0, 1.0])
    ax.set_title(category_names[i].capitalize())

    plot_index += 1

plt.tight_layout()
plt.show()
plt.close(fig)


In [None]:
import math
import scipy.stats as stats

# Per category: kernel-density curves of `target_col` for probes reported by at
# least 1, 2, 3, ... papers, Non-CoRSIV in grey and CoRSIV in the category colour.
# Curves become more opaque as the paper threshold rises.
bins = [-1, -0.8, -0.6, -0.4, -0.2, 0, 0.2, 0.4, 0.6, 0.8, 1.0]
target_col = "Mean Cor All Brain"
df = pd.read_csv("becon/becon_all_probes.csv")
regions = list(zip(["Non-CoRSIV", "CoRSIV"], [non_corsiv_baseline, corsiv_probe_list]))
subfig_size = 8  # Increased figure size for better visibility
# x grid for the density curves; identical for every curve.
xs = np.linspace(min(bins), max(bins), 200)
for i, catname in enumerate(category_names):
    dfs_for_plot = []
    max_papers = max(cat_probes_dict[i].values())
    for pidx in range(1, max_papers + 1):
        # Probes reported by at least `pidx` papers in this category.
        p = set(k for k, v in cat_probes_dict[i].items() if v >= pidx)
        per_region = []
        for rname, rset in regions:
            probes_in_region = rset.intersection(p)
            filtered_df = df[df["CpG ID"].isin(probes_in_region)]
            per_region.append((filtered_df, rname, pidx))
        # Keep a threshold only when its CoRSIV subset has at least 10 probes.
        # Both regions are judged together so that no unpaired Non-CoRSIV curve
        # is drawn for the threshold at which CoRSIV was rejected.
        corsiv_count = next(len(d) for d, r, _ in per_region if r == "CoRSIV")
        if corsiv_count < 10:
            break
        dfs_for_plot.extend(per_region)
    if not dfs_for_plot:
        # Not even the "≥ 1 paper" threshold had enough CoRSIV probes.
        continue
    # Highest threshold kept for THIS category; it is reset for every category so
    # the alpha scale does not depend on categories processed earlier.
    paper_limit = dfs_for_plot[-1][2]
    print(paper_limit)
    fig, ax = plt.subplots(figsize=(subfig_size, subfig_size*0.75))

    for df_subset, rname, pidx in dfs_for_plot:
        # Plot density curve instead of histogram
        density = stats.gaussian_kde(df_subset[target_col])
        ys = density(xs)

        line_style = '-'
        color = color_template[i] if rname == 'CoRSIV' else 'grey'
        # Opacity from 0.2 upwards, reaching 1.0 at this category's highest threshold.
        alpha = min(1.0, 0.2 + (pidx / paper_limit) * 0.8)
        ax.plot(xs, ys, line_style, color=color, alpha=alpha,
                label=f"{rname} Probes ≥ {pidx} papers")

    ax.set_xlabel(target_col)
    ax.set_ylabel('Density')
    ax.set_title(f"{category_names[i].capitalize()} - {target_col}")
    # ax.legend()

    plt.tight_layout()
    plt.show()
    # plt.savefig(f"/Users/antata/Desktop/text-mining-figures/becon/{target_col}_{category_names[i]}_sensitivity.pdf", format="pdf")
    plt.close(fig)
    # break


In [None]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde

# Example data: 1000 draws from a Beta(2, 5) distribution.
# A seeded generator makes the figure identical on every run.
rng = np.random.default_rng(0)
data = rng.beta(2, 5, size=1000)

# KDE estimation
density = gaussian_kde(data)

# Create values over which to evaluate the density
x_vals = np.linspace(min(data), max(data), 100)

# Overlay the KDE on a density-normalised histogram of the same sample.
plt.plot(x_vals, density(x_vals), label='KDE')
plt.hist(data, bins=30, density=True, alpha=0.5, label='Histogram (Normalized)')
plt.legend()
plt.show()


In [4]:
import math
import scipy.stats as stats

# Build one summary table per category (appended to `outer_dfs` in category_names
# order). Each table has one row per (paper threshold, region) with the median,
# a skewness measure, a Mann-Whitney U p-value and the probe count of `target_col`.
# `bins` and `subfig_size` are defined here but not used by the code in this cell.
bins = [-1, -0.8, -0.6, -0.4, -0.2, 0, 0.2, 0.4, 0.6, 0.8, 1.0]
target_col = "Mean Cor All Brain"
df = pd.read_csv("becon/becon_all_probes.csv")
# Non-CoRSIV comes first here, so in dfs_for_plot even indices are Non-CoRSIV
# frames and odd indices are CoRSIV frames.
regions = list(zip(["Non-CoRSIV", "CoRSIV"], [non_corsiv_baseline, corsiv_probe_list]))
subfig_size = 4
outer_dfs = []
for i, catname in enumerate(category_names):
    dfs_for_plot = []
    max_papers = max(cat_probes_dict[i].values())
    # Column buffers for this category's table (parallel lists, one entry per row).
    median_output = []
    skewness_output = []
    pvalue_output = []
    paper_output = []
    region_output = []
    probes_output = []
    for pidx in range(1, max_papers + 1):
        # Probes reported by at least `pidx` papers in this category.
        p = set(k for k, v in cat_probes_dict[i].items() if v >= pidx)
        skip = False
        for rname, rset in regions:
            probes_in_region = rset.intersection(p)
            filtered_df = df[df["CpG ID"].isin(probes_in_region)]
            # Stop at the first threshold with fewer than 10 CoRSIV probes. The
            # Non-CoRSIV frame for that threshold has already been appended.
            if rname == "CoRSIV" and len(filtered_df) < 10:
                skip = True
                break
            else:
                dfs_for_plot.append(filtered_df)
        if skip:
            break
    # Drop the unpaired Non-CoRSIV frame left behind by the early exit above.
    if len(dfs_for_plot) % 2 != 0:
        dfs_for_plot = dfs_for_plot[:-1]
    for j, df_subset in enumerate(dfs_for_plot):
        # num_probes is not used below; the table takes len(data) instead.
        num_probes = len(df_subset)
        if j % 2:
            # Odd index = CoRSIV frame: compare it with the Non-CoRSIV frame of the
            # same threshold (the previous entry).
            stat, p_value = stats.mannwhitneyu(dfs_for_plot[j-1][target_col], df_subset[target_col])
        data = np.array(df_subset[target_col])
        mean = np.mean(data)
        median = np.median(data)
        std_dev = np.std(data, ddof=1)  # ddof=1: sample standard deviation
        # Skewness measure computed as 3 * (mean - median) / standard deviation.
        skewness_2 = 3* (mean - median) / std_dev
        skewness_output.append(round(skewness_2, 2))
        # The p-value is stored (as a formatted string) on the CoRSIV row only;
        # Non-CoRSIV rows get NaN.
        pvalue_output.append(f"{p_value:.1e}" if j % 2 else np.nan)
        # pvalue_output.append(p_value if j % 2 else np.nan)
        median_output.append(round(median, 2))
        region_output.append("CoRSIV" if j % 2 else "Non-CoRSIV")
        # Paper threshold of this row: 1 for the first pair, 2 for the second, ...
        paper_output.append(j//2+1)
        probes_output.append(len(data))
    outer_dfs.append(pd.DataFrame({"Region": region_output, "Papers": paper_output, "Median": median_output, "Skewness": skewness_output, "P-value": pvalue_output, "Number of Probes": probes_output}))
    # break

In [None]:
import matplotlib.pyplot as plt
# 2x3 grid: median of the blood-brain correlation vs "≥ number of papers" for
# Non-CoRSIV (grey circles) and CoRSIV (category colour, squares), one panel per
# category that has at least three paper thresholds. Obesity is left out.
linesize = 1.3
plt.rcParams['font.family'] = 'Helvetica'
plt.rcParams['font.size'] = 16
plt.rcParams['axes.linewidth'] = linesize  # Thicker outer box
plt.rcParams['xtick.major.width'] = linesize  # Thicker x-axis ticks
plt.rcParams['ytick.major.width'] = linesize  # Thicker y-axis ticks
figsize = 3
fig, axs = plt.subplots(2, 3, figsize=(2.8*figsize, 1.8*figsize))
fig.set_tight_layout({'h_pad': -0.5, 'w_pad': -0.5})
axs = axs.flatten()
col = "Median"
# Index of the next free panel. The loop variable is deliberately not called
# `df`, so the DataFrame other cells keep in `df` is not overwritten.
j = 0
for i, summary_df in enumerate(outer_dfs):
    # Each threshold contributes two rows (one per region): require three thresholds.
    # outer_dfs[i] belongs to category_names[i], so the category is matched by name.
    if len(summary_df) < 3*2 or category_names[i] == "obesity":
        continue
    ax = axs[j]
    non_corsiv = summary_df[summary_df['Region'] == 'Non-CoRSIV']
    corsiv = summary_df[summary_df['Region'] == 'CoRSIV']

    ax.plot(non_corsiv['Papers'], non_corsiv[col],
            color="grey", linestyle='-', marker='o',
            label='Non-CoRSIV')
    ax.plot(corsiv['Papers'], corsiv[col],
            color=color_template[i], linestyle='-', marker='s',
            label='CoRSIV')
    ax.set_ylim(-0.2, 1)
    ax.set_yticks(np.arange(-0.2, 1.1, 0.6))
    ax.set_title(f'{category_names[i].capitalize()}')
    ax.set_xticks(range(1, len(non_corsiv['Papers']) + 1))
    ax.set_xticklabels(range(1, len(non_corsiv['Papers']) + 1))
    j += 1
# Shared axis captions, positioned in figure coordinates just outside the panels.
fig.text(0.5, -0.05, '≥ Number of Papers', ha='center', va='center', fontsize=18)
fig.text(-0.03, 0.5, "Median Blood-Brain Correlation", ha='center', va='center', rotation='vertical', fontsize=18)
# Shared axis lines in figure coordinates: vertical at x=0, horizontal at y=0.
fig.add_artist(plt.Line2D([0, 0], [0.08, 0.92], color='black', linestyle='-', linewidth=2, transform=fig.transFigure))
fig.add_artist(plt.Line2D([0.08, 0.98], [0, 0], color='black', linestyle='-', linewidth=2, transform=fig.transFigure))
plt.tight_layout()
# Figure-level legend. Handles are taken from the last panel that was actually
# drawn; the last grid cell (axs[-1]) is empty whenever fewer than six
# categories qualify, which used to yield an empty legend.
if j > 0:
    handles, labels = axs[j - 1].get_legend_handles_labels()
    # CoRSIV colours differ per panel, so the legend shows a neutral black marker.
    handles = [handle if label == 'Non-CoRSIV' else plt.Line2D([], [], color='black', marker='s', linestyle='-', markersize=6) for handle, label in zip(handles, labels)]
    fig.legend(handles[::-1], labels[::-1], bbox_to_anchor=(1.25, 0.5), ncol=1, fontsize=14, frameon=False)

plt.show()


In [None]:
import matplotlib.pyplot as plt
# 2x5 grid: median vs paper threshold for every category except the one at
# index 4 of outer_dfs, Non-CoRSIV in grey and CoRSIV in the category colour.
linesize = 1.3
plt.rcParams.update({
    'axes.linewidth': linesize,      # thicker outer box
    'xtick.major.width': linesize,   # thicker x-axis ticks
    'ytick.major.width': linesize,   # thicker y-axis ticks
})
figsize = 3
fig, axs = plt.subplots(2, 5, figsize=(5*figsize, 2*figsize))
axs = axs.flatten()
col = "Median"
for i, df in enumerate(outer_dfs):
    if i == 4:
        continue
    # Tables after the skipped one move up by one panel.
    panel_idx = i - 1 if i >= 4 else i
    ax = axs[panel_idx]
    non_corsiv = df[df['Region'] == 'Non-CoRSIV']
    corsiv = df[df['Region'] == 'CoRSIV']

    # (rows, colour, marker, legend label), drawn in this order.
    series_specs = [
        (non_corsiv, "grey", 'o', 'Non-CoRSIV'),
        (corsiv, color_template[i], 's', 'CoRSIV'),
    ]
    for rows, line_color, marker_shape, series_label in series_specs:
        ax.plot(rows['Papers'], rows[col],
                color=line_color, linestyle='-', marker=marker_shape,
                label=series_label)

    ax.set_title(f'{category_names[i].capitalize()}')
    paper_ticks = range(1, len(non_corsiv['Papers']) + 1)
    ax.set_xticks(paper_ticks)
    ax.set_xticklabels(paper_ticks)

# Shared axis captions in figure coordinates.
fig.text(0.5, -0.05, 'Number of Papers', ha='center', va='center', fontsize=18)
fig.text(-0.01, 0.5, col, ha='center', va='center', rotation='vertical', fontsize=18)

# Shared axis lines in figure coordinates: vertical at x=0, horizontal at y=0.
fig.add_artist(plt.Line2D([0, 0], [0.05, 0.95], color='black', linestyle='-', linewidth=2, transform=fig.transFigure))
fig.add_artist(plt.Line2D([0.03, 1], [0, 0], color='black', linestyle='-', linewidth=2, transform=fig.transFigure))

plt.tight_layout()
plt.show()


In [4]:
# Union of the probe IDs (dict keys) of every per-category dictionary.
all_probes_union = set().union(*cat_probes_dict)

# # Export the union set to a text file
# with open(f'becon/all_probes.txt', 'w') as f:
#     for probe in all_probes_union:
#         f.write(f"{probe}\n")

print(f"Total number of unique probes across all categories: {len(all_probes_union)}")


Total number of unique probes across all categories: 234586


In [None]:
# Re-save the BECon table in place with a fresh 0..n-1 index and no index column.
becon_csv = "becon/becon_all_probes.csv"
df = pd.read_csv(becon_csv).reset_index(drop=True)
df.to_csv(becon_csv, index=False)

In [None]:
# Load the BECon table and select the probe ID and "Mean Cor All Brain" columns.
df = pd.read_csv("becon/becon_all_probes.csv")
# df.reset_index(inplace=True, drop=True)
# df.to_csv("becon/becon_all_probes.csv", index=False)
# NOTE(review): a notebook cell only displays its final expression, so this
# selection is shown only if it is the last statement of its cell.
df[["CpG ID", "Mean Cor All Brain"]]

def generate_plot_all_categories(column_to_plot, baseline, paper_threshold = 2, step_size = 0.05, output = False, show = False):
    """Plot per-category distributions of one BECon column for CoRSIV vs. baseline probes.

    One panel is drawn per entry of ``category_names``. Each panel shows, as
    percentage-per-bin line histograms over [-1, 1]:

    * all CoRSIV probes and all baseline probes (grey),
    * CoRSIV / baseline probes listed for the category (black),
    * CoRSIV / baseline probes listed in at least ``k`` papers, for every ``k``
      from 2 up to ``paper_threshold`` (category colour).

    CoRSIV curves are solid and baseline curves dotted.

    The function relies on notebook-level state: ``category_names``,
    ``category_name_to_idx``, ``color_template``, ``cat_probes_dict``,
    ``corsiv_probe_list``, ``control_probe_list`` and ``non_corsiv_baseline``.
    The data is read from ``becon/becon_all_probes.csv`` relative to the
    current working directory.

    Parameters
    ----------
    column_to_plot : str
        Column of the BECon table whose distribution is plotted.
    baseline : {"Control", "Non-CoRSIV"}
        Which probe set the CoRSIV probes are compared against.
    paper_threshold : int, default 2
        Largest "at least k papers" cut-off to draw. Values above 2 add one
        extra pair of curves per additional cut-off (drawn with decreasing
        opacity so they remain distinguishable).
    step_size : float, default 0.05
        Histogram bin width.
    output : bool, default False
        If true, save the figure as a PDF.
    show : bool, default False
        If true, call ``plt.show()``.

    Raises
    ------
    ValueError
        If ``baseline`` is not one of the two supported names.
    """
    if baseline == "Control":
        baseline_dataset = control_probe_list
    elif baseline == "Non-CoRSIV":
        baseline_dataset = non_corsiv_baseline
    else:
        raise ValueError("Invalid baseline value")

    df = pd.read_csv("becon/becon_all_probes.csv")

    # Region membership does not depend on the category, so compute it once
    # instead of once per panel.
    in_corsiv = df["CpG ID"].isin(corsiv_probe_list)
    in_baseline = df["CpG ID"].isin(baseline_dataset)

    fig, axs = plt.subplots(6, 2, figsize=(20, 30))
    axs = axs.flatten()
    axs[-1].axis('off')  # Turn off the last subplot
    axs = axs[:-1]  # Remove the last subplot from the list
    bins = np.arange(-1, 1+step_size, step_size)

    for j, category_name in enumerate(category_names):
        ax = axs[j]
        category_label = category_name.capitalize()
        category_color = color_template[category_name_to_idx[category_name]]
        in_category = df["CpG ID"].isin(cat_probes_dict[j])

        # Each entry: (row mask, legend label, colour, line style, opacity).
        # Styles are generated together with the series so their number always
        # matches, whatever paper_threshold is.
        series = [
            (in_corsiv, "CoRSIV", "grey", "-", 1.0),
            (in_baseline, f"{baseline}", "grey", ":", 1.0),
            (in_corsiv & in_category, f"CoRSIV {category_label}", "black", "-", 1.0),
            (in_baseline & in_category, f"{baseline} {category_label}", "black", ":", 1.0),
        ]
        for min_papers in range(2, paper_threshold+1):
            probes_at_threshold = {k for k, v in cat_probes_dict[j].items() if v >= min_papers}
            at_threshold = df["CpG ID"].isin(probes_at_threshold)
            # The first cut-off (>= 2 papers) is fully opaque; higher ones fade.
            alpha = max(0.3, 1.0 - 0.25 * (min_papers - 2))
            series.append((at_threshold & in_corsiv, f"CoRSIV {category_label} ≥ {min_papers} Papers", category_color, "-", alpha))
            series.append((at_threshold & in_baseline, f"{baseline} {category_label} ≥ {min_papers} Papers", category_color, ":", alpha))

        for mask, label, color, linestyle, alpha in series:
            df_subset = df[mask]
            number = df_subset["CpG ID"].nunique()
            median = df_subset[column_to_plot].median()
            counts, _ = np.histogram(df_subset[column_to_plot], bins=bins)
            total_count = np.sum(counts)
            if total_count > 0:
                percentages = (counts / total_count) * 100
            else:
                # Nothing to draw for an empty subset; keep the legend entry
                # (n=0) without triggering a divide-by-zero warning.
                percentages = np.full(len(counts), np.nan)
            ax.plot(bins[:-1], percentages, linewidth=2, label=f"{label} (n={number:,}, median={median:.2f})", color=color, linestyle=linestyle, alpha=alpha)
        ax.set_xlabel(f"{column_to_plot}")
        ax.set_ylabel("Percentage (%)")
        ax.set_title(category_label)
        ax.legend(bbox_to_anchor=(0, 1), loc='upper left', frameon=False, fontsize=10)
    fig.suptitle(f"BECon - {column_to_plot}", y=1, fontsize=16)
    fig.tight_layout()
    if output:
        fig.savefig(f"/Users/antata/Desktop/text-mining-figures/becon/becon_{column_to_plot}_{baseline}_{step_size}.pdf", format="pdf")
    if show:
        plt.show()
# Produce and save the "Mean Cor All Brain" figure once per baseline, then
# report how many figures were generated.
counters = 0
for baseline in ("Control", "Non-CoRSIV"):
    generate_plot_all_categories("Mean Cor All Brain", baseline, output=True)
    counters = counters + 1
print(counters)


In [None]:
def get_median_df(output=False):
    """Build a table of per-region medians of the BECon columns.

    For every category and every "at least k papers" cut-off, one row is
    produced for each of the CoRSIV, Control and Non-CoRSIV probe sets. A
    category stops being extended as soon as its CoRSIV subset has fewer than
    10 rows. Three further rows (category and papers set to NaN) summarise
    each probe set as a whole.

    Reads ``becon/becon_all_probes.csv`` and the notebook-level
    ``category_names``, ``cat_probes_dict``, ``corsiv_probe_list``,
    ``control_probe_list`` and ``non_corsiv_baseline``.

    Parameters
    ----------
    output : bool, default False
        If true, also write the table to
        ``becon/becon_median_df_10_probes_minimum.csv``.

    Returns
    -------
    pandas.DataFrame
        One row per (region, category, cut-off) plus the three overall rows.
    """
    region_sets = [
        ("CoRSIV", corsiv_probe_list),
        ("Control", control_probe_list),
        ("Non-CoRSIV", non_corsiv_baseline),
    ]
    value_columns = ["Cor Blood-BA7", "Cor Blood- BA10", "Cor Blood- BA20",
                     "Mean Cor All Brain", "Var in All Brain", "Var in Blood"]
    becon = pd.read_csv("becon/becon_all_probes.csv")[["CpG ID"] + value_columns]

    def summarise(region_name, category, papers, subset):
        # One output row: the median of every value column plus the row count.
        med = {column: subset[column].median() for column in value_columns}
        return {
            "region_type": region_name,
            "category": category,
            "papers": papers,
            "Median Correlation Blood-BA7": med["Cor Blood-BA7"],
            "Median Correlation Blood-BA10": med["Cor Blood- BA10"],
            "Median Correlation Blood-BA20": med["Cor Blood- BA20"],
            "Median Mean Correlation All Brain": med["Mean Cor All Brain"],
            "Median Variability in All Brain": med["Var in All Brain"],
            "Median Variability in Blood": med["Var in Blood"],
            "Number of Probes": len(subset),
        }

    records = []
    for cat_idx, catname in enumerate(category_names):
        paper_counts = cat_probes_dict[cat_idx]
        for pidx in range(1, max(paper_counts.values()) + 1):
            probes_at_threshold = {probe for probe, n_papers in paper_counts.items() if n_papers >= pidx}
            for rname, rset in region_sets:
                subset = becon[becon["CpG ID"].isin(rset.intersection(probes_at_threshold))]
                if rname == "CoRSIV" and len(subset) < 10:
                    break
                records.append(summarise(rname, catname, pidx, subset))
            else:
                # Every region was summarised: move on to the next cut-off.
                continue
            # The CoRSIV subset became too small: stop extending this category.
            break

    # Overall rows, not restricted to any category or paper count.
    for rname, rset in region_sets:
        records.append(summarise(rname, np.nan, np.nan, becon[becon["CpG ID"].isin(rset)]))

    median_df = pd.DataFrame(records)
    print(f"DataFrame shape: {median_df.shape}")

    if output:
        median_df.to_csv("becon/becon_median_df_10_probes_minimum.csv", index=False)
    return median_df

# Build the median summary table; output=True also writes it to
# becon/becon_median_df_10_probes_minimum.csv.
get_median_df(output=True)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

df = pd.read_csv("becon/becon_median_df_10_probes_minimum.csv")

# 4x3 grid of panels: one per disease category (11 used, the 12th is removed below).
size = 3
fig, axes = plt.subplots(4, 3, figsize=(3*size, 4.5*size))
fig.suptitle('Median Mean Correlation All Brain by Disease Category', fontsize=16)
axes = axes.flatten()

for i, category in enumerate(category_names):
    ax = axes[i]
    cat_data = df.loc[df['category'] == category]

    # One line per region type; CoRSIV uses the category colour.
    region_colors = {'CoRSIV': color_template[i], 'Control': 'grey', 'Non-CoRSIV': 'black'}
    for region_type, color in region_colors.items():
        region_data = cat_data.loc[cat_data['region_type'] == region_type]
        ax.plot(region_data['papers'], region_data['Median Mean Correlation All Brain'],
                marker='o', linestyle='-', color=color, label=region_type)

    ax.set_xlabel('Number of Papers')
    ax.set_ylabel('Median Mean Correlation All Brain')
    ax.set_title(category)
    # ax.legend()

    # Paper counts are integers, so keep the x ticks on integers.
    ax.xaxis.set_major_locator(plt.MaxNLocator(integer=True))

    # Fixed y range 0..1 with ticks every 0.2, labelled with one decimal place.
    ax.set_ylim(0, 1.0)
    ax.set_yticks(np.arange(0, 1.01, 0.2))
    ax.yaxis.set_major_formatter(plt.FuncFormatter(lambda x, p: f"{x:.1f}"))

# Drop the unused 12th panel.
fig.delaxes(axes[-1])

plt.tight_layout()
plt.show()


In [None]:
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.colors import LinearSegmentedColormap
from mpl_toolkits.axes_grid1 import make_axes_locatable

def plot_median_scatter(col1, col2, ax, tissue, category, cmap, min_papers, max_papers, show_legend=False):
    """Scatter per-threshold medians of one category on ``ax`` and attach a colour bar.

    Rows are taken from the notebook-level DataFrame ``df`` (expected to have
    ``region_type``, ``category`` and ``papers`` columns plus ``col1`` and
    ``col2``). CoRSIV rows are drawn as diamonds and Non-CoRSIV rows as
    circles, coloured by their ``papers`` value. The overall CoRSIV /
    Non-CoRSIV rows (those whose category is NaN) are drawn as a filled and a
    hollow black star.

    Parameters
    ----------
    col1, col2 : str
        Columns plotted on the x and y axis.
    ax : matplotlib.axes.Axes
        Target axes; a colour-bar axes is appended to its right.
    tissue : str
        Inserted into the x-axis label.
    category : str
        Category whose rows are plotted; only "cancer" gets a colour-bar label.
    cmap : matplotlib.colors.Colormap
        Colour map used for the markers and the colour bar.
    min_papers, max_papers : number
        Range of ``papers`` mapped onto the colour map.
    show_legend : bool, default False
        Accepted for backward compatibility only; it has no effect. The
        original branch built legend handles but never attached them to the
        axes, so nothing was ever drawn for it.
    """
    region_markers = {"CoRSIV": "D", "Non-CoRSIV": "o"}

    # A single normalisation is shared by the markers and the colour bar so
    # that a marker's colour is exactly the colour shown on the bar for its
    # paper count. (Previously markers were scaled by max - min + 1 while the
    # bar used max - min, so the two disagreed.) Normalize maps every value to
    # 0 when min_papers == max_papers, so no division by zero can occur.
    norm = plt.Normalize(vmin=min_papers, vmax=max_papers)

    for region_type, marker in region_markers.items():
        region_df = df[(df["region_type"] == region_type) & (df["category"] == category)]
        if region_df.empty:
            continue
        # One RGBA row per point.
        point_colors = cmap(norm(region_df["papers"].to_numpy(dtype=float)))
        ax.scatter(region_df[col1], region_df[col2], c=point_colors, s=60, alpha=1, marker=marker)

    # Filled black star: all CoRSIVs, regardless of category.
    all_corsiv = df[(df["region_type"] == "CoRSIV") & (df["category"].isna())]
    ax.scatter(all_corsiv[col1], all_corsiv[col2], c='black', s=120, marker='*')

    # Hollow black star: all Non-CoRSIVs, regardless of category.
    all_non_corsiv = df[(df["region_type"] == "Non-CoRSIV") & (df["category"].isna())]
    ax.scatter(all_non_corsiv[col1], all_non_corsiv[col2], facecolors='none', edgecolors='black', s=120, marker='*')

    ax.set_xlabel(f"Median IRR₁₀₋₉₀ in {tissue}", fontsize=14)
    ax.set_ylabel("Median Mean Cor \nAll Brain vs. Blood", fontsize=12)

    ax.set_xlim(0, 1.0)
    ax.set_ylim(-0.2, 1.0)
    ax.set_xticks(np.arange(0, 1.2, 0.2))
    ax.set_yticks(np.arange(-0.2, 1.2, 0.2))
    # x spans 1.0 data units and y spans 1.2, so this ratio makes the panel square.
    ax.set_aspect(1/1.2)
    ax.tick_params(axis='both', which='major', labelsize=12)

    # Colour bar in its own axes to the right of the panel.
    divider = make_axes_locatable(ax)
    cax = divider.append_axes("right", size="5%", pad=0.05)
    sm = plt.cm.ScalarMappable(cmap=cmap, norm=norm)
    sm.set_array([])
    cbar = plt.colorbar(sm, cax=cax)
    cax.set_box_aspect(20)  # Tall, narrow bar matching the height of the main panel
    if category == "cancer":
        cbar.set_label('≥ Number of Papers', rotation=90, labelpad=5)
    paper_ticks = np.arange(int(min_papers), int(max_papers)+1, 1)
    cbar.set_ticks(paper_ticks)
    cbar.set_ticklabels(paper_ticks)
# One 4x3 figure per tissue: variability (x) against mean blood-brain correlation (y),
# one panel per category and the legend in the unused last panel.
for tissue in ["All Brain", "Blood"]:
    fig, axes = plt.subplots(4, 3, figsize=(12, 16), gridspec_kw={'width_ratios': [1, 1, 1], 'wspace': 0.6})
    fig.suptitle("BECon", fontsize=28, y=0.97)

    df = pd.read_csv("becon/becon_median_df_10_probes_minimum.csv")

    # Smallest and largest paper cut-off present for each category.
    category_paper_ranges = {}
    for category in category_names:
        category_df = df.loc[df["category"] == category]
        papers = category_df["papers"]
        category_paper_ranges[category] = (papers.min(), papers.max())

    for i, category in enumerate(category_names):
        row, col = divmod(i, 3)

        # Colour ramp for this category: from the base colour darkened to 30%
        # of its RGB values up to the unmodified base colour.
        base_color = color_template[i]
        base_rgb = matplotlib.colors.to_rgb(base_color)
        light_color = tuple(base_rgb)
        dark_color = tuple(0.3 * channel for channel in base_rgb)
        cmap = LinearSegmentedColormap.from_list('custom', [dark_color, light_color], N=100)

        min_papers, max_papers = category_paper_ranges[category]

        ax = axes[row, col]
        plot_median_scatter(f"Median Variability in {tissue}", "Median Mean Correlation All Brain", ax, tissue, category, cmap, min_papers, max_papers, show_legend=(i==0))
        ax.set_title(category.capitalize(), fontsize=18)

    # Legend handles: grey region markers, then the two "all probes" stars.
    legend_elements = []
    for region, marker in {"CoRSIV": "D", "Non-CoRSIV": "o"}.items():
        legend_elements.append(plt.Line2D([0], [0], marker=marker, color='gray', label=region, markersize=12, linestyle='None'))
    legend_elements.append(plt.Line2D([0], [0], marker='*', color='black', label='All CoRSIVs', markersize=12, linestyle='None'))
    legend_elements.append(plt.Line2D([0], [0], marker='*', markerfacecolor='none', markeredgecolor='black', label='All Non-CoRSIVs', markersize=12, linestyle='None'))

    # The last panel carries only the legend, so its axes are hidden.
    legend_ax = axes[-1, -1]
    legend_ax.legend(handles=legend_elements, loc='center', fontsize=14, ncol=1, frameon=False)
    legend_ax.axis('off')

    plt.tight_layout()
    plt.subplots_adjust(top=0.93)  # Make room for the main title
    tissue_slug = "".join(tissue.lower().split())
    plt.savefig(f"/Users/antata/Desktop/text-mining-figures/becon/becon_median_scatter_{tissue_slug}_variability_vs_mean_correlation.pdf", format="pdf")


In [None]:
def get_probes(count_dictionary, min_bucket_size=10):
    """Group probes by their exact paper count and trim sparse high-count buckets.

    Bucket ``k`` (0-based) holds the probes whose count equals ``k + 1``.
    Scanning from the highest count downwards (bucket 0 is never inspected),
    the first bucket with at least ``min_bucket_size`` probes becomes the last
    one kept; every bucket above it is dropped. If no such bucket is found,
    all buckets are kept.

    The two returned lists are always the same length. Previously the CoRSIV
    list was built with the cut-off interpreted as a count rather than an
    index, so it came out one bucket shorter than the "all" list whenever a
    cut-off was found.

    Parameters
    ----------
    count_dictionary : dict
        Maps probe ID to the number of papers it appears in (integers >= 1;
        other values are ignored, as before).
    min_bucket_size : int, default 10
        Minimum number of probes a bucket needs in order to serve as cut-off.

    Returns
    -------
    (list of set, list of set)
        ``all_probes`` and ``corsiv_probes``; ``corsiv_probes[k]`` is
        ``all_probes[k]`` restricted to the notebook-level ``corsiv_probe_list``.
        Both are empty for an empty ``count_dictionary``.
    """
    if not count_dictionary:
        return [], []

    max_probe_count = max(count_dictionary.values())

    # Single pass over the dictionary instead of one full scan per count.
    buckets = [set() for _ in range(max(int(max_probe_count), 0))]
    for probe, count in count_dictionary.items():
        if count >= 1 and count == int(count):
            buckets[int(count) - 1].add(probe)

    # Index of the last bucket to keep; falls back to "keep everything".
    probe_cutoff = max_probe_count
    for idx in range(len(buckets) - 1, 0, -1):
        if len(buckets[idx]) >= min_bucket_size:
            probe_cutoff = idx
            break

    all_probes = buckets[:probe_cutoff + 1]
    # Derived from the trimmed list so both results stay aligned bucket by bucket.
    corsiv_probes = [bucket.intersection(corsiv_probe_list) for bucket in all_probes]
    return all_probes, corsiv_probes

# For every category, write the probes found in at least `paper_threshold`
# papers (within the buckets get_probes keeps) to text files, one probe per
# line: once for all probes and once for the CoRSIV subset.
paper_threshold = 3
for i, category_name in enumerate(category_names):
    # Named so that the builtin all() is not shadowed for the rest of the notebook.
    all_probes_becon, corsiv_probes_becon = get_probes(cat_probes_dict[i])
    # Bucket k holds count k + 1, so slicing from paper_threshold - 1 keeps
    # the buckets with count >= paper_threshold.
    all_becon = set().union(*all_probes_becon[paper_threshold-1:])
    corsiv_becon = set().union(*corsiv_probes_becon[paper_threshold-1:])
    # Sorted so that re-running the cell reproduces identical files.
    with open(f'becon/{category_name}_all_{paper_threshold}.txt', 'w') as f:
        f.writelines(f"{item}\n" for item in sorted(all_becon))
    with open(f'becon/{category_name}_corsiv_{paper_threshold}.txt', 'w') as f:
        f.writelines(f"{item}\n" for item in sorted(corsiv_becon))

In [None]:
import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import requests


# Upload each category's probe list to the BECon web app and save the result CSV.
becon_url = 'https://redgar598.shinyapps.io/BECon/'

for i in range(11):
    # A fresh browser per category; the matching driver for the browser must be installed.
    driver = webdriver.Chrome()
    try:
        driver.get(becon_url)

        # Wait (up to 10 s) for the file-upload input, then hand it the probe list.
        # File inputs need an absolute path; it is resolved from the same working
        # directory that the relative output path below already depends on.
        wait = WebDriverWait(driver, 10)
        upload_element = wait.until(EC.presence_of_element_located((By.ID, 'datafile')))
        upload_element.send_keys(os.path.abspath(f'becon/{category_names[i]}_all_3.txt'))
        # # wait.until(EC.presence_of_element_located((By.ID, 'plot1')))  # Change to actual result element
        time.sleep(30)  # Fixed 30-second pause to let the app process the upload (adjust as needed)

        download_link_element = driver.find_element(By.ID, 'downloadData')
        download_href = download_link_element.get_attribute('href')
        # A timeout keeps one unresponsive download from hanging the whole loop.
        response = requests.get(download_href, timeout=60)
        if response.status_code == 200:
            file_path = f'becon/{category_names[i]}_all_3_result.csv'
            with open(file_path, 'wb') as file:
                file.write(response.content)
            print("File downloaded successfully")
        else:
            print(f"Failed to download file. Status code: {response.status_code}")
    finally:
        # Always close the browser, even when a step above raises.
        driver.quit()

In [None]:
# Histogram of the mean blood-brain correlation for every category and probe set.
# NOTE: the savefig call is commented out and each figure is closed right away,
# so as written this cell produces no visible or saved output.
bins = [-1, -0.8, -0.6, -0.4, -0.2, 0, 0.2, 0.4, 0.6, 0.8, 1.0]
# `probe_type` rather than `type`, so the builtin type() is not shadowed.
for probe_type in ["corsiv", "all"]:
    for i in range(11):
        # Index 5 ("immune" in category_names) is skipped for the CoRSIV set.
        if i == 5 and probe_type == "corsiv":
            continue
        df = pd.read_csv(f"becon/{category_names[i]}_{probe_type}_3_result.csv", index_col=0)

        fig, ax = plt.subplots()
        ax.hist(df['Mean Cor All Brain'], bins=bins, edgecolor='black', color='skyblue')
        ax.set_xlabel('Mean Correlation between Blood vs. Brain')
        ax.set_ylabel('Number of Probes')
        ax.set_title(f"{category_names[i].capitalize()} - {probe_type.capitalize()} Probes")
        # fig.savefig(f"/Users/antata/Desktop/text-mining-figures/becon/{category_names[i]}_{probe_type}.jpeg")
        plt.close(fig)


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# One 4x3 figure per (column, probe set): a histogram panel for every category.
bins = [-1, -0.8, -0.6, -0.4, -0.2, 0, 0.2, 0.4, 0.6, 0.8, 1.0]
columns = ["Cor Blood-BA7", "Cor Blood- BA10", "Cor Blood- BA20", "Mean Cor All Brain"]
for col in columns:
    # `probe_type` rather than `type`, so the builtin type() is not shadowed.
    for probe_type in ["corsiv", "all"]:
        fig, axes = plt.subplots(4, 3, figsize=(12, 16))  # 4 rows and 3 columns
        axes = axes.ravel()  # Flatten axes to make indexing easier
        used_axes = set()

        for i in range(11):
            # Index 5 ("immune" in category_names) is skipped for the CoRSIV set.
            if i == 5 and probe_type == "corsiv":
                continue

            df = pd.read_csv(f"becon/{category_names[i]}_{probe_type}_3_result.csv", index_col=0)

            axes[i].hist(df[col], bins=bins, edgecolor='black', color='skyblue')
            axes[i].set_xlabel(col)
            axes[i].set_ylabel('Number of Probes')
            axes[i].set_title(f"{category_names[i].capitalize()}")
            used_axes.add(i)

        # Remove every panel that received no histogram: the spare 12th one and
        # the panel of a skipped category (previously left as an empty frame).
        for j, unused_ax in enumerate(axes):
            if j not in used_axes:
                fig.delaxes(unused_ax)

        fig.tight_layout()
        fig.savefig(f"/Users/antata/Desktop/text-mining-figures/becon/{col}_{probe_type}_combined.jpeg")
        plt.close(fig)


In [None]:
import pandas as pd
import math

# Min / max / median / mean of "Mean Cor All Brain" for each category's result file.
# `probe_type` rather than `type`, so the builtin type() stays usable afterwards.
probe_type = "corsiv"
stat_names = ["min", "max", "median", "mean"]
summary_stats = []

for i in range(11):
    if i == 5 and probe_type == "corsiv":
        # Index 5 ("immune" in category_names) is skipped for the CoRSIV set:
        # keep a NaN row so the table still has one row per category.
        summary_stats.append(dict.fromkeys(stat_names, math.nan))
    else:
        df = pd.read_csv(f"becon/{category_names[i]}_{probe_type}_3_result.csv")
        summary_stats.append(df["Mean Cor All Brain"].agg(stat_names).round(2).to_dict())

# One row per category, in category_names order.
summary_df = pd.DataFrame(summary_stats)
summary_df.index = category_names

print(summary_df)


In [None]:
# Figure 5B: becon density plot
import math
import scipy.stats as stats
import matplotlib.pyplot as plt

non_corsiv_baseline = illumina - CORSIV_PROBE_LIST
bins = [-1, 0, 1.0]  # only the smallest and largest value are used, as the x range of the curves
target_col = "Mean Cor All Brain"
df = pd.read_csv("becon/becon_all_probes.csv")
regions = list(zip(["Non-CoRSIV", "CoRSIV"], [non_corsiv_baseline, CORSIV_PROBE_LIST]))


# 2x5 grid: one panel per category except the skipped one.
fig, axs = plt.subplots(2, 5, figsize=(16, 6), gridspec_kw={'width_ratios': [1, 1, 1, 1, 1], 'wspace': 0.5, 'hspace': 0.3})
axs = axs.flatten()  # Flatten the 2D array of axes for easier indexing

# The evaluation grid is identical for every curve, so build it once.
xs = np.linspace(min(bins), max(bins), 200)

plot_index = 0
for i, catname in enumerate(category_names):
    if catname.lower() == 'hematological':
        continue  # Skip hematological category

    # Probes reported for this category in at least two papers, split by region.
    p = set(k for k, v in cat_probes_dict[i].items() if v >= 2)
    dfs_for_plot = []
    for rname, rset in regions:
        probes_in_region = rset.intersection(p)
        filtered_df = df[df["CpG ID"].isin(probes_in_region)]
        dfs_for_plot.append((filtered_df, rname))

    ax = axs[plot_index]

    for df_subset, rname in dfs_for_plot:
        values = df_subset[target_col].dropna()
        if values.nunique() < 2:
            # gaussian_kde needs at least two distinct finite values; skip the
            # curve instead of aborting the whole figure.
            continue
        density = stats.gaussian_kde(values)
        ys = density(xs)
        color = color_template[i] if rname == 'CoRSIV' else 'grey'
        ax.plot(xs, ys, "-", color=color, label=f"{rname} Probes ≥ 2 papers", linewidth=3)
        ax.fill_between(xs, ys, alpha=0.5, color=color)
    ax.set_aspect('equal')

    # Fixed ticks and y range so all panels are directly comparable.
    ax.set_xticks([-1, 0.0, 1.0])
    ax.set_yticks([0, 1.0, 2.0])
    ax.set_ylim(0, 2.1)
    # ax.set_xlabel(target_col)
    # ax.set_ylabel('Density')
    ax.set_title(category_names[i].capitalize(), fontsize=22)

    plot_index += 1

plt.tight_layout()
output = f"{FIGURE_PATH}/Fig5/becon_kde.svg"
plt.savefig(output, format="svg")



In [None]:
# Figure 5B: becon regression plot
import math
import scipy.stats as stats
import matplotlib.pyplot as plt
import numpy as np
import statsmodels.api as sm

non_corsiv_baseline = illumina - CORSIV_PROBE_LIST
target_col = "Mean Cor All Brain"
df = pd.read_csv("becon/becon_all_probes.csv")
regions = list(zip(["CoRSIV", "Non-CoRSIV"], [CORSIV_PROBE_LIST, non_corsiv_baseline]))
cat_probes_dict = [read_in_probes(cat) for cat in CATEGORY_NAMES]

# 2x3 grid: one panel per category that is not skipped below.
fig, axs = plt.subplots(2, 3, figsize=(10, 6), gridspec_kw={'width_ratios': [1, 1, 1], 'wspace': 0.2, 'hspace': 0.3})
axs = axs.flatten()  # Flatten the 2D array of axes for easier indexing

plot_index = 0
for i, catname in enumerate(CATEGORY_NAMES):
    if i in [1,2,4,8,9]:
        continue  # Categories at these indices are left out of the figure

    ax = axs[plot_index]
    max_papers = max(cat_probes_dict[i].values())
    for rname, rset in regions:
        # NOTE(review): these are per-bucket MEANS of target_col, although the
        # list used to be called `medians` — confirm which statistic is intended.
        mean_values = []
        paper_counts = []
        for pidx in range(1, max_papers + 1):
            # Probes reported in exactly `pidx` papers that fall in this region.
            p = set(k for k, v in cat_probes_dict[i].items() if v == pidx)
            probes_in_region = rset.intersection(p)
            filtered_df = df[df["CpG ID"].isin(probes_in_region)]
            if len(filtered_df) < 15:
                # Too few probes: stop here. Lowering max_papers also caps the
                # range used for the next region and for the y ticks below.
                max_papers = pidx - 1
                break
            mean_values.append(filtered_df[target_col].mean())
            paper_counts.append(pidx)

        X = np.array(mean_values)
        y = np.array(paper_counts)
        color = COLOR_TEMPLATE[i] if rname == 'CoRSIV' else 'grey'
        ax.scatter(X, y, color=color, label=f"{rname} Probes")

        if len(paper_counts) < 2:
            # A line cannot be fitted through fewer than two points.
            continue

        # has_constant="add" always adds the intercept column; the default
        # silently skips it when X is constant, after which params[1] below
        # would raise IndexError.
        X_const = sm.add_constant(X, has_constant="add")
        model = sm.OLS(y, X_const).fit()
        ax.plot(X, model.predict(X_const), color=color, linestyle='--', label=f'Regression ({rname})')

        # Fit statistics for the (currently disabled) annotation below.
        r_squared = round(model.rsquared, 3)
        slope = round(model.params[1], 3)
        x1_pvalue = round(model.pvalues[1], 3)

        pos = 0.75 if rname == 'CoRSIV' else 0.45
        annotation_text = (f"{rname}:\nR² = {r_squared}\n"
                           f"Slope = {slope}\n"
                           f"Slope p-value = {x1_pvalue}")

        # ax.annotate(annotation_text, xy=(0.6, pos), xycoords='axes fraction',
        #             fontsize=8, ha='left', va='top',
        #             bbox=dict(boxstyle='round', facecolor='white', alpha=0.8))

    # ax.set_xlabel(target_col)
    # ax.set_ylabel('Number of Papers')
    ax.set_title(CATEGORY_NAMES[i].capitalize(), fontsize=20)
    ax.set_xlim(-0.2, 1)
    # linspace guarantees exactly 7 ticks for the 7 labels below; a float-step
    # arange can yield one tick more or less through rounding.
    ax.set_xticks(np.linspace(-0.2, 1.0, 7))
    ax.set_xticklabels(['-0.2', '', '', '', '', '', '1.0'])
    ax.set_yticks(range(1, max_papers + 1))
    # ax.set_aspect('equal')


    plot_index += 1

# plt.tight_layout()
# output = f"{FIGURE_PATH}/Fig5/becon_regression.svg"
# plt.savefig(output, format="svg")
