In [6]:
import os
import pandas as pd
import pybedtools
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Optional
from tqdm import tqdm
import numpy as np

# --- Run configuration -------------------------------------------------------
# Genome build used to pick the annotation / chrom-sizes files (mm10 or hg38).
organism = "hg38"
# Short label for the dataset being processed.
dataset_name = "t_cell"

# Root of the project tree; every path below is derived from it.
PROJECT_DIR = "/gpfs/Labs/Uzun/SCRIPTS/PROJECTS/2024.SINGLE_CELL_GRN_INFERENCE.MOELLER/"

# --- Input locations ---------------------------------------------------------
# Directory holding ground-truth files (also where the ChIP-Atlas BED lives).
ground_truth_dir = os.path.join(PROJECT_DIR, "data/ground_truth_files/")

# NOTE(review): "CD8PULUS" looks like a typo for "CD8PLUS"; it is kept verbatim
# because it has to match the file name on disk — confirm before renaming.
chipatlas_file = os.path.join(ground_truth_dir, "Oth.Bld.05.AllAg.CD8PULUS_T_cells.bed")

# Organism-specific genome resources: gene TSS BED and chromosome sizes.
genome_file = os.path.join(PROJECT_DIR, f"data/genome_data/reference_genome/{organism}/{organism}.chrom.sizes")
tss_file = os.path.join(PROJECT_DIR, f"data/genome_data/genome_annotation/{organism}/gene_tss.bed")


In [None]:


# Read the ChIP-Atlas BED as a plain tab-separated table for inspection.
# The file has no column-name row (header=None) and its first line is skipped
# (presumably the BED "track" header line — TODO confirm against the file).
df = pd.read_csv(
    chipatlas_file,
    sep="\t",
    header=None,
    skiprows=1,
)


In [8]:

def build_chip_tf_peak_tg_distance_df(
    chip_atlas_bed_file: str,
    gene_tss_bed_file: str,
    genome_file: Optional[str] = None,
    max_peak_distance: int = 100_000,      # 100 Kb window
    distance_factor_scale: float = 20_000.0  # e.g. your DISTANCE_SCALE_FACTOR
) -> pd.DataFrame:
    """
    Build a TF–peak–TG dataframe from ChIP-Atlas peaks and a gene TSS BED.

    Every peak is paired with *all* TSS entries found within
    ``max_peak_distance`` of it (via ``bedtools window``), and each pair gets
    an exponential distance score ``exp(-TSS_dist / distance_factor_scale)``.

    Parameters
    ----------
    chip_atlas_bed_file : str
        Path to the ChIP-Atlas peak file. The column naming below assumes
        9 columns per peak (BED9).
    gene_tss_bed_file : str
        Path to the TSS BED. The column naming below assumes 5 columns
        (chrom, start, end, gene_name, strand).
    genome_file : str, optional
        Chromosome-sizes file passed to ``bedtools sort -g``. When ``None``
        the default ``bedtools sort`` ordering is used.
    max_peak_distance : int
        Window size (bp) for pairing peaks with TSSs; also used as the upper
        bound on the computed ``TSS_dist``.
    distance_factor_scale : float
        Decay length of the exponential score. Must be positive.

    Returns
    -------
    pd.DataFrame
        One row per (peak, TSS) pair with columns, in this order:
          - tf_name
          - peak_id, peak_chr, peak_start, peak_end
          - tg_name, tss_chr, tss_start, tss_end
          - TSS_dist, TSS_dist_score
          - peak_annot, peak_score, peak_strand, peak_rgb
        If no pair is found, an empty frame with these columns is returned.

    Raises
    ------
    ValueError
        If ``distance_factor_scale`` is not strictly positive.
    """
    if distance_factor_scale <= 0:
        raise ValueError(
            f"distance_factor_scale must be > 0, got {distance_factor_scale!r}"
        )

    # Final column layout; also used for the empty-result case.
    output_cols = [
        "tf_name",
        "peak_id", "peak_chr", "peak_start", "peak_end",
        "tg_name", "tss_chr", "tss_start", "tss_end",
        "TSS_dist", "TSS_dist_score",
        "peak_annot", "peak_score", "peak_strand", "peak_rgb",
    ]

    # -----------------------------
    # 1. Load and clean BedTools
    # -----------------------------
    chip_bed = pybedtools.BedTool(chip_atlas_bed_file)
    tss_bed  = pybedtools.BedTool(gene_tss_bed_file)

    # Drop random / chrUn contigs from TSS.
    # .saveas() materializes the filtered stream so it can be reused below.
    tss_bed = tss_bed.filter(
        lambda f: "random" not in f.chrom and "chrUn" not in f.chrom
    ).saveas()

    # Sort both consistently
    if genome_file is not None:
        chip_sorted = chip_bed.sort(g=genome_file)
        tss_sorted  = tss_bed.sort(g=genome_file)
    else:
        chip_sorted = chip_bed.sort()
        tss_sorted  = tss_bed.sort()

    # -----------------------------
    # 2. Get all peak–TSS pairs within window
    # -----------------------------
    # bedtools window -w max_peak_distance
    # This returns all (peak, TSS) pairs whose intervals are within ±max_peak_distance
    chip_tss_pairs = chip_sorted.window(tss_sorted, w=max_peak_distance)

    # ChIP-Atlas peaks are typically BED9:
    #  chrom, start, end, annot, score, strand, thickStart, thickEnd, rgb
    # TSS BED is usually: chrom, start, end, gene_name, strand
    # -> total 9 + 5 = 14 columns
    # NOTE(review): if either input has a different column count these names
    # will not line up with the data — confirm the inputs match this layout.
    cols = [
        "peak_chr", "peak_start", "peak_end",
        "peak_annot", "peak_score", "peak_strand",
        "peak_thick_start", "peak_thick_end", "peak_rgb",
        "tss_chr", "tss_start", "tss_end",
        "tg_name", "tss_strand"
    ]
    df = chip_tss_pairs.to_dataframe(names=cols)

    # No pairs at all: to_dataframe may hand back a frame without any columns,
    # so return a well-formed empty result instead of failing on df[col] below.
    if df.empty:
        return pd.DataFrame(columns=output_cols)

    # Ensure numeric
    coord_cols = ["peak_start", "peak_end", "tss_start", "tss_end"]
    df = df.astype({col: int for col in coord_cols})

    # -----------------------------
    # 3. Compute distance & score
    # -----------------------------
    # Distance between nearest edges, 0 when the intervals overlap.
    # Vectorized form of:
    #   overlap (pe >= gs and ge >= ps) -> 0
    #   peak before gene (pe < gs)      -> gs - pe
    #   peak after gene                 -> ps - ge
    # NOTE(review): endpoints are treated as inclusive here; bedtools' own
    # distance for half-open BED intervals may differ by 1 — confirm if exact
    # parity with `bedtools closest -d` matters.
    ps = df["peak_start"].to_numpy()
    pe = df["peak_end"].to_numpy()
    gs = df["tss_start"].to_numpy()
    ge = df["tss_end"].to_numpy()
    df["TSS_dist"] = np.where(
        pe < gs,
        gs - pe,
        np.where(ps > ge, ps - ge, 0),
    )

    # Filter to actual max_peak_distance based on the computed distance
    df = df[df["TSS_dist"] <= max_peak_distance].copy()

    # Exponential distance score
    df["TSS_dist_score"] = np.exp(-df["TSS_dist"] / float(distance_factor_scale))

    # -----------------------------
    # 4. Extract TF name & peak_id
    # -----------------------------
    # ChIP-Atlas annot looks like:
    # ID=SRX4061019;Name=Smad4%20(@%20Forelimb);Title=...
    # Stop at the first '%' (start of the URL-encoded " (@ cell type)" suffix)
    # or at ';' so a name with no encoded suffix does not swallow the next
    # field. Names that themselves contain encoded characters are still cut
    # at the first '%'. Rows without a Name= field get NaN.
    df["tf_name"] = (
        df["peak_annot"]
        .astype(str)
        .str.extract(r"Name=([^%;]+)", expand=False)
        .str.upper()  # optional: normalize to uppercase
    )

    # Peak ID like chr:start-end
    df["peak_id"] = (
        df["peak_chr"].astype(str)
        + ":" +
        df["peak_start"].astype(str)
        + "-" +
        df["peak_end"].astype(str)
    )

    # Reorder columns to a tidy TF–peak–TG layout
    return df[output_cols]


In [9]:
# Pairing window and score decay length (both in bp).
MAX_PEAK_DISTANCE      = 1_000_000     # or 100_000 if you want to match your other code
DISTANCE_SCALE_FACTOR  = 20_000

# Pair every ChIP-Atlas peak with all TSSs inside the window and score them.
chip_tf_peak_tg_df = build_chip_tf_peak_tg_distance_df(
    chip_atlas_bed_file=chipatlas_file,
    gene_tss_bed_file=tss_file,
    genome_file=genome_file,
    max_peak_distance=MAX_PEAK_DISTANCE,
    distance_factor_scale=DISTANCE_SCALE_FACTOR
)

print(chip_tf_peak_tg_df.head())

# Keep only the edge-level columns and switch to the source/target naming.
# rename() returns a new frame, which avoids mutating a column slice in place.
chip_tf_peak_tg_df = (
    chip_tf_peak_tg_df[["tf_name", "peak_id", "tg_name", "TSS_dist", "TSS_dist_score"]]
    .rename(
        columns={
            "tf_name": "source_id",
            "tg_name": "target_id",
            "TSS_dist": "tss_distance",
            "TSS_dist_score": "tss_distance_score",
        }
    )
)

# The edge list carries no peak information, so the same TF–TG pair would
# otherwise be written once per supporting peak. Rows whose TF name could not
# be parsed (missing source_id) or that lack a target are dropped as well.
edge_chip_df = (
    chip_tf_peak_tg_df[["source_id", "target_id"]]
    .dropna()
    .drop_duplicates()
)

# With dataset_name = "t_cell" this is the same "chipatlas_t_cell.csv" as before.
edge_chip_df.to_csv(
    os.path.join(ground_truth_dir, f"chipatlas_{dataset_name}.csv"),
    index=False,
)

  tf_name          peak_id peak_chr  peak_start  peak_end      tg_name  \
0    CTCF  chr1:9891-10361     chr1        9891     10361      DDX11L1   
1    CTCF  chr1:9891-10361     chr1        9891     10361       WASH7P   
2    CTCF  chr1:9891-10361     chr1        9891     10361    MIR6859-1   
3    CTCF  chr1:9891-10361     chr1        9891     10361  MIR1302-2HG   
4    CTCF  chr1:9891-10361     chr1        9891     10361    MIR1302-2   

  tss_chr  tss_start  tss_end  TSS_dist  TSS_dist_score  \
0    chr1      12010    12011      1649        0.920857   
1    chr1      15100    15101      4739        0.789031   
2    chr1      17436    17437      7075        0.702050   
3    chr1      28589    28590     18228        0.401961   
4    chr1      30366    30367     20005        0.367787   

                                          peak_annot  peak_score peak_strand  \
0  ID=SRX9867030;Name=CTCF%20(@%20CD8%2B%20T%20ce...         278           .   
1  ID=SRX9867030;Name=CTCF%20(@%20CD8%2B