In [2]:
import os
import pandas as pd
import pybedtools
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Optional
from tqdm import tqdm
import numpy as np
import pyranges as pr

# ----------------------------------------------------------------------
# Run configuration
# ----------------------------------------------------------------------
organism = "hg38"  # genome build used in the genome_data paths: mm10 or hg38
dataset_name = "iPSC"  # must be one of the keys of `chipatlas_files`

# ChIP-Atlas BED file name for each supported dataset.
chipatlas_files = {
    "t_cell": "Oth.Bld.05.AllAg.CD8PULUS_T_cells.bed",
    "K562": "Oth.Bld.05.AllAg.K-562.bed",
    "macrophage": "Oth.Bld.05.AllAg.Macrophages.bed",
    "mESC": "Oth.Emb.05.AllAg.AllCell.bed",
    "iPSC": "Oth.PSC.05.AllAg.iPS_cells.bed",
}

# ----------------------------------------------------------------------
# Paths (all resolved against the project root)
# ----------------------------------------------------------------------
PROJECT_DIR = "/gpfs/Labs/Uzun/SCRIPTS/PROJECTS/2024.SINGLE_CELL_GRN_INFERENCE.MOELLER/"

# Build-specific genome resources: gene TSS annotation and chromosome sizes.
tss_file = os.path.join(
    PROJECT_DIR,
    f"data/genome_data/genome_annotation/{organism}/gene_tss.bed",
)
genome_file = os.path.join(
    PROJECT_DIR,
    f"data/genome_data/reference_genome/{organism}/{organism}.chrom.sizes",
)

# Directory holding the ChIP-Atlas inputs; the derived CSVs are written here too.
ground_truth_dir = os.path.join(PROJECT_DIR, "data/ground_truth_files/")

# ChIP-Atlas BED file for the dataset selected above.
chipatlas_file = os.path.join(ground_truth_dir, chipatlas_files[dataset_name])


In [3]:

def build_chip_tf_peak_tg_distance_df(
    chip_atlas_bed_file: str,
    gene_tss_bed_file: str,
    genome_file: Optional[str] = None,
    max_peak_distance: int = 100_000,
    distance_factor_scale: float = 20_000.0
) -> pd.DataFrame:
    """
    Build a TF–peak–TG dataframe from ChIP-Atlas peaks and a gene TSS BED file.

    Each peak is linked to *all* TSS intervals that ``bedtools window`` reports
    within ``max_peak_distance``, and every (peak, TSS) pair is scored with
    ``exp(-TSS_dist / distance_factor_scale)``.

    Parameters
    ----------
    chip_atlas_bed_file : str
        Path to the ChIP-Atlas peak BED file.
    gene_tss_bed_file : str
        Path to the gene TSS BED file. Records whose chromosome name contains
        "random" or "chrUn" are dropped before pairing.
    genome_file : str, optional
        Chromosome-sizes file passed to ``bedtools sort -g`` so both inputs
        share one chromosome order. When None a plain sort is used.
    max_peak_distance : int
        Window size (bp) passed to ``bedtools window -w``; also the upper
        bound applied to the computed ``TSS_dist``.
    distance_factor_scale : float
        Length scale of the exponential decay; must be strictly positive.

    Returns
    -------
    pd.DataFrame
        One row per (peak, TSS) pair with columns, in order:
          - tf_name
          - peak_id, peak_chr, peak_start, peak_end
          - tg_name, tss_chr, tss_start, tss_end
          - TSS_dist, TSS_dist_score
          - peak_annot, peak_score, peak_strand, peak_rgb
        If no pairs are found, an empty frame with these columns is returned.

    Raises
    ------
    ValueError
        If ``distance_factor_scale`` is not strictly positive.
    """
    # Fail fast: a zero/negative scale would yield inf/NaN or inverted scores.
    if distance_factor_scale <= 0:
        raise ValueError(
            f"distance_factor_scale must be > 0, got {distance_factor_scale!r}"
        )

    # -----------------------------
    # 1. Load and clean BedTools
    # -----------------------------
    chip_bed = pybedtools.BedTool(chip_atlas_bed_file)
    tss_bed  = pybedtools.BedTool(gene_tss_bed_file)

    # Drop random / chrUn contigs from TSS
    tss_bed = tss_bed.filter(
        lambda f: "random" not in f.chrom and "chrUn" not in f.chrom
    ).saveas()

    # Sort both consistently
    if genome_file is not None:
        chip_sorted = chip_bed.sort(g=genome_file)
        tss_sorted  = tss_bed.sort(g=genome_file)
    else:
        chip_sorted = chip_bed.sort()
        tss_sorted  = tss_bed.sort()

    # -----------------------------
    # 2. Get all peak–TSS pairs within window
    # -----------------------------
    # bedtools window -w max_peak_distance
    # This returns all (peak, TSS) pairs whose intervals are within ±max_peak_distance
    chip_tss_pairs = chip_sorted.window(tss_sorted, w=max_peak_distance)

    # ChIP-Atlas peaks are typically BED9:
    #  chrom, start, end, annot, score, strand, thickStart, thickEnd, rgb
    # TSS BED is usually: chrom, start, end, gene_name, strand
    # -> total 9 + 5 = 14 columns
    # NOTE(review): the names below are only correct for that 9 + 5 layout —
    # confirm the column count of both inputs if either file format changes.
    cols = [
        "peak_chr", "peak_start", "peak_end",
        "peak_annot", "peak_score", "peak_strand",
        "peak_thick_start", "peak_thick_end", "peak_rgb",
        "tss_chr", "tss_start", "tss_end",
        "tg_name", "tss_strand"
    ]
    # Tidy TF–peak–TG layout of the returned frame
    output_cols = [
        "tf_name",
        "peak_id", "peak_chr", "peak_start", "peak_end",
        "tg_name", "tss_chr", "tss_start", "tss_end",
        "TSS_dist", "TSS_dist_score",
        "peak_annot", "peak_score", "peak_strand", "peak_rgb",
    ]

    df = chip_tss_pairs.to_dataframe(names=cols)

    # No peak–TSS pairs at all: the frame may come back without any columns,
    # so return a well-formed empty result instead of failing on a column lookup.
    if df.empty:
        return pd.DataFrame(columns=output_cols)

    # Ensure numeric
    df = df.astype({col: int for col in ("peak_start", "peak_end", "tss_start", "tss_end")})

    # -----------------------------
    # 3. Compute distance & score
    # -----------------------------
    # Vectorised interval distance (same branch order as a row-wise if/elif/else):
    #   0 if the intervals overlap or touch (inclusive comparison),
    #   tss_start - peak_end if the peak lies before the TSS,
    #   peak_start - tss_end otherwise.
    # NOTE(review): this is a plain edge-to-edge gap on the BED coordinates;
    # bedtools' own reported distances may differ by one — confirm if exact
    # parity with `bedtools closest -d` matters.
    peak_start = df["peak_start"].to_numpy()
    peak_end   = df["peak_end"].to_numpy()
    tss_start  = df["tss_start"].to_numpy()
    tss_end    = df["tss_end"].to_numpy()

    overlapping = (peak_end >= tss_start) & (tss_end >= peak_start)
    peak_before_tss = peak_end < tss_start
    df["TSS_dist"] = np.where(
        overlapping,
        0,
        np.where(peak_before_tss, tss_start - peak_end, peak_start - tss_end),
    )

    # Filter to actual max_peak_distance based on the computed distance
    df = df[df["TSS_dist"] <= max_peak_distance].copy()

    # Exponential distance score
    df["TSS_dist_score"] = np.exp(-df["TSS_dist"] / float(distance_factor_scale))

    # -----------------------------
    # 4. Extract TF name & peak_id
    # -----------------------------
    # ChIP-Atlas annot looks like:
    # ID=SRX4061019;Name=Smad4%20(@%20Forelimb);Title=...
    # The TF name is everything after "Name=" up to the first "%";
    # annotations without a "Name=" field yield NaN.
    df["tf_name"] = (
        df["peak_annot"]
        .astype(str)
        .str.extract(r"Name=([^%]+)", expand=False)
        .str.upper()  # optional: normalize to uppercase
    )

    # Peak ID like chr:start-end
    df["peak_id"] = (
        df["peak_chr"].astype(str)
        + ":" +
        df["peak_start"].astype(str)
        + "-" +
        df["peak_end"].astype(str)
    )

    # Reorder columns to a tidy TF–peak–TG layout
    return df[output_cols]


In [5]:
MAX_PEAK_DISTANCE      = 100_000  # bp; window passed to build_chip_tf_peak_tg_distance_df
DISTANCE_SCALE_FACTOR  = 20_000   # bp; decay length passed as distance_factor_scale

# Build the TF–peak–TG table, keep only the columns that are exported, and
# rename them to the source/target naming used in the output files.
# The rename is chained (not inplace=True) so the column subset is never
# modified in place.
chip_tf_peak_tg_df = (
    build_chip_tf_peak_tg_distance_df(
        chip_atlas_bed_file=chipatlas_file,
        gene_tss_bed_file=tss_file,
        genome_file=genome_file,
        max_peak_distance=MAX_PEAK_DISTANCE,
        distance_factor_scale=DISTANCE_SCALE_FACTOR
    )[["tf_name", "peak_id", "tg_name", "TSS_dist", "TSS_dist_score"]]
    .rename(columns={"tf_name":"source_id", "tg_name":"target_id", "TSS_dist":"tss_distance", "TSS_dist_score":"tss_distance_score"})
)
# Per-peak table: one row per (TF, peak, target) with distance and score.
chip_tf_peak_tg_df.to_csv(os.path.join(ground_truth_dir, f"chipatlas_{dataset_name}_tf_peak_distance.csv"), index=False)

# Edge list: unique (TF, target) pairs regardless of which peak links them.
edge_chip_df = chip_tf_peak_tg_df[["source_id", "target_id"]].drop_duplicates()
edge_chip_df.to_csv(os.path.join(ground_truth_dir, f"chipatlas_{dataset_name}.csv"), index=False)

print(f"Number of TFs: {edge_chip_df['source_id'].nunique()}")
print(f"Number of Targets: {edge_chip_df['target_id'].nunique()}")
print(f"Number of edges: {edge_chip_df.shape[0]}")

In [6]:
# Summary of the de-duplicated TF -> target edge list.
n_tfs = edge_chip_df["source_id"].nunique()
n_targets = edge_chip_df["target_id"].nunique()
n_edges = len(edge_chip_df)

print(
    f"Number of TFs: {n_tfs}",
    f"Number of Targets: {n_targets}",
    f"Number of edges: {n_edges}",
    sep="\n",
)

Number of TFs: 76
Number of Targets: 41341
Number of edges: 967565


In [18]:
def split_df_peak_id_to_chrom_start_end(df: pd.DataFrame, peak_id_col: str = "peak_id", pad_peak_range: int = 1000) -> pd.DataFrame:
    """
    Parse peak IDs of the form "chr:start-end" into 'Chromosome', 'Start' and
    'End' columns, widening each interval by `pad_peak_range` on both sides.

    Parameters:
    - df: Input DataFrame containing the peak ID column. It is modified in
      place: the three columns are added to (or overwritten on) this object.
    - peak_id_col: Name of the column in df that contains the peak IDs.
    - pad_peak_range: Number of bases subtracted from the start and added to
      the end. The padded start is clamped at 0; the end is not clamped.

    Returns:
    - The same DataFrame object that was passed in, now carrying the
      'Chromosome', 'Start' and 'End' columns.

    Raises:
    - ValueError: if any peak ID does not match "chr:start-end".
    """
    # Parse all three fields in one pass. The greedy `.+` anchors on the LAST
    # ':' so chromosome names that themselves contain ':' or '-' stay intact,
    # and an empty input still yields the three (empty) parsed columns.
    parts = df[peak_id_col].str.extract(r"^(?P<chrom>.+):(?P<start>\d+)-(?P<end>\d+)$")

    # Rows that failed to match come back as NaN; report the first offender
    # rather than failing later inside the integer conversion.
    malformed = parts.isna().any(axis=1).to_numpy()
    if malformed.any():
        example = df[peak_id_col].to_numpy()[malformed][0]
        raise ValueError(
            f"Column {peak_id_col!r} contains peak IDs that are not in "
            f"'chr:start-end' format, e.g. {example!r}"
        )

    df["Chromosome"] = parts["chrom"]

    # Pad the interval; the start cannot go below coordinate 0.
    df["Start"] = parts["start"].astype(int).sub(pad_peak_range).clip(lower=0)
    df["End"] = parts["end"].astype(int).add(pad_peak_range)

    return df