In [1]:

"""
Author: Wen-Jou Chang
Baylor College of Medicine

This script is used to calculate the median ICC and IIR values for different probe regions and categories.
"""

# imports
import os
import pandas as pd
from collections import Counter
import time
import numpy as np

# constant, data

# Category labels. Order matters: cat_probes_dict is filled in this order and
# get_median_df pairs the two by position.
category_names = [
    "cancer",
    "cardiovascular",
    "digestive",
    "endocrine",
    "hematological",
    "immune",
    "metabolic",
    "neurological",
    "obesity",
    "respiratory",
    "urogenital",
]

# CoRSIV probes: tab-separated file without a header row, column names supplied here.
corsiv_probe_df = pd.read_csv(
    "data/humanData/corsiv_control/corsiv_all_probes_id.txt",
    sep="\t",
    names=["chr", "start", "end", "probeId", "corsiv_start", "corsiv_end", "corsiv_id"],
)
# NOTE: the *_probe_list names below are sets of unique probe IDs, not lists.
corsiv_probe_list = set(corsiv_probe_df["probeId"])

# Control probes: same layout, with a single trailing "id" column.
control_probe_df = pd.read_csv(
    "data/humanData/corsiv_control/control_all_probes_id.txt",
    sep="\t",
    names=["chr", "start", "end", "probeId", "id"],
)
control_probe_list = set(control_probe_df["probeId"])

# Illumina array probes: headerless files, so the probe ID is the column labelled 3.
epic = pd.read_csv("data/humanData/Illumina/EPIC.hg38.txt", sep="\t", header=None)
epic_probe_list = set(epic[3])
hm450 = pd.read_csv("data/humanData/Illumina/HM450.hg38.txt", sep="\t", header=None)
hm450_probe_list = set(hm450[3])

# Every probe on either array, and the subset of those that is not a CoRSIV probe.
illumina = epic_probe_list | hm450_probe_list
non_corsiv_baseline = illumina.difference(corsiv_probe_list)

In [None]:
# Sanity check: sizes of the three probe sets, then the size of each pairwise overlap
# (CoRSIV/Illumina, CoRSIV/control, Illumina/control).
print(len(corsiv_probe_list), len(illumina), len(control_probe_list))
for left, right in (
    (corsiv_probe_list, illumina),
    (corsiv_probe_list, control_probe_list),
    (illumina, control_probe_list),
):
    print(len(left & right))


In [None]:
# concatenate data into one file, based on input from Alan
# Maps the source dataset ID (used in the per-subset input file names) to the
# study label (used for the data directory and the combined output file name).
study = {"GSE61151": "Flanagan"}
for accession, label in study.items():
    # Derive the directory from the label instead of hard-coding "Flanagan",
    # so adding another entry to `study` reads/writes in that study's folder.
    study_dir = f"data/humanData/{label}"
    for metric in ["iir", "icc"]:
        # index_col=0: the first CSV column is loaded as the index and is not
        # written back out (index=False below).
        parts = [
            pd.read_csv(f"{study_dir}/{accession}_{subset}_{metric}_results.csv", index_col=0)
            for subset in ["noncorsiv", "corsiv"]
        ]
        # The non-CoRSIV file comes first, so for an ID present in both files
        # drop_duplicates (keep="first" by default) retains the non-CoRSIV row.
        combined = pd.concat(parts, axis=0).drop_duplicates(subset=["ID"])
        combined.to_csv(f"{study_dir}/{label}_{metric}_results.csv", index=False)

In [None]:
"""
Read all probes if the combined file already exists.
"""
# Despite the name this is a list: one {probeId: occurrence count} dict per
# category, appended in category_names order by the loop further down this cell.
cat_probes_dict = []

def read_in_probes(input_cat):
    """
    Count how often each probe ID occurs in a category's combined probe file.

    Args:
        input_cat (str): Category name. The file read is
            ``probe/{input_cat}_all_probes.csv``, except for "metabolic",
            whose file is ``probe/metabolic_diseases_all_probes.csv``.

    Returns:
        dict: Maps each value of the file's ``probeId`` column to the number
        of rows in which it appears.
    """
    csv_path = (
        "probe/metabolic_diseases_all_probes.csv"
        if input_cat == "metabolic"
        else f"probe/{input_cat}_all_probes.csv"
    )
    probe_ids = pd.read_csv(csv_path)["probeId"].to_list()
    return dict(Counter(probe_ids))


# Load the probe counts of every category in order, reporting the wall-clock
# time each one takes.
for cat in category_names:
    t_begin = time.time()
    print(cat)
    cat_probes_dict.append(read_in_probes(cat))
    elapsed = time.time() - t_begin
    print(f'Time for {cat} code to run: ', elapsed)

In [None]:
def get_median_df(dataset, output=False, min_corsiv_probes=15):
    """
    Calculate median ICC and IIR values for different probe regions and categories.

    Reads ``iir_icc/{dataset}_icc_results.csv`` and ``iir_icc/{dataset}_iir_results.csv``,
    inner-joins them on ``ID`` and rounds ICC/iir1/iir2 to 3 decimals. For each
    category and each minimum paper count, it reports the medians over the probes
    that appear in at least that many papers, separately for the CoRSIV, Control
    and Non-CoRSIV probe sets. A (category, paper count) combination is only
    reported when it has at least ``min_corsiv_probes`` CoRSIV probes. Three
    overall rows (one per region, category/papers = NaN) are appended at the end.

    Relies on the notebook-level names ``category_names``, ``cat_probes_dict``,
    ``corsiv_probe_list``, ``control_probe_list`` and ``non_corsiv_baseline``.

    Args:
        dataset (str): Name of the dataset (e.g. "Flanagan")
        output (bool): Whether to save results to
            ``data/humanData/{dataset}/{dataset}_median_{min_corsiv_probes}_probes_minimum.csv``
        min_corsiv_probes (int): Minimum number of CoRSIV probes required for a
            (category, paper count) combination to be reported. Defaults to 15.

    Returns:
        pandas.DataFrame: DataFrame containing median values and probe counts
    """
    value_cols = ["ICC", "iir1", "iir2"]

    # Define the three probe region types we'll analyze
    regions = [
        ("CoRSIV", corsiv_probe_list),
        ("Control", control_probe_list),
        ("Non-CoRSIV", non_corsiv_baseline),
    ]

    # Read in ICC and IIR results and merge them
    icc = pd.read_csv(f"iir_icc/{dataset}_icc_results.csv")
    iir = pd.read_csv(f"iir_icc/{dataset}_iir_results.csv")
    m = pd.merge(icc[["ID", "ICC"]], iir[["ID", "iir1", "iir2"]], on="ID", how="inner")
    m[value_cols] = m[value_cols].round(3)

    def _summary_row(rname, catname, papers, subset):
        # One output record: medians of the value columns plus the probe count.
        return {
            "region_type": rname,
            "category": catname,
            "papers": papers,
            "Median ICC": subset["ICC"].median(),
            "Median iir1": subset["iir1"].median(),
            "Median iir2": subset["iir2"].median(),
            "Number of Probes": len(subset),
        }

    data = []
    # For each category and minimum paper count
    for i, catname in enumerate(category_names):
        paper_counts = cat_probes_dict[i]
        # default=0: a category without any probes yields no rows instead of
        # raising ValueError from max() on an empty sequence.
        max_papers = max(paper_counts.values(), default=0)
        for pidx in range(1, max_papers + 1):
            # Get probes that appear in at least pidx papers
            p = {probe for probe, n_papers in paper_counts.items() if n_papers >= pidx}
            too_few_corsiv = False

            # Calculate medians for each region type (CoRSIV is checked first)
            for rname, rset in regions:
                filtered_df = m[m["ID"].isin(rset.intersection(p))]
                if rname == "CoRSIV" and len(filtered_df) < min_corsiv_probes:
                    too_few_corsiv = True
                    break
                data.append(_summary_row(rname, catname, pidx, filtered_df))

            # Raising pidx can only shrink p, so once CoRSIV drops below the
            # minimum the remaining paper counts of this category are skipped.
            if too_few_corsiv:
                break

    # Calculate overall medians for each region type
    for rname, rset in regions:
        data.append(_summary_row(rname, np.nan, np.nan, m[m["ID"].isin(rset)]))

    df = pd.DataFrame(data)
    print(f"DataFrame shape: {df.shape}")

    if output:
        df.to_csv(
            f"data/humanData/{dataset}/{dataset}_median_{min_corsiv_probes}_probes_minimum.csv",
            index=False,
        )
    return df

# Compute the summary for the "Flanagan" dataset and save it to CSV (output=True).
get_median_df("Flanagan", output=True)