In [1]:
import os
import pandas as pd
import pybedtools
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Optional
from tqdm import tqdm
import numpy as np
import pyranges as pr

# Run configuration: genome build and the ChIP-Atlas dataset to analyse.
organism = "hg38"  # "mm10" or "hg38"
dataset_name = "K562"  # must be one of the keys of `chipatlas_files`

# Project root; the input paths defined in this cell are all built from it.
PROJECT_DIR = "/gpfs/Labs/Uzun/SCRIPTS/PROJECTS/2024.SINGLE_CELL_GRN_INFERENCE.MOELLER/"
ground_truth_dir = os.path.join(PROJECT_DIR, "data/ground_truth_files/")
tss_file = os.path.join(
    PROJECT_DIR,
    f"data/genome_data/genome_annotation/{organism}/gene_tss.bed",
)
genome_file = os.path.join(
    PROJECT_DIR,
    f"data/genome_data/reference_genome/{organism}/{organism}.chrom.sizes",
)

# ChIP-Atlas BED file name for each supported dataset label.
chipatlas_files = dict(
    t_cell="Oth.Bld.05.AllAg.CD8PULUS_T_cells.bed",
    K562="Oth.Bld.05.AllAg.K-562.bed",
    macrophage="Oth.Bld.05.AllAg.Macrophages.bed",
    mESC="Oth.Emb.05.AllAg.AllCell.bed",
)

# Full path of the BED file for the dataset selected above.
chipatlas_file = os.path.join(ground_truth_dir, chipatlas_files[dataset_name])


In [2]:

def build_chip_tf_peak_tg_distance_df(
    chip_atlas_bed_file: str,
    gene_tss_bed_file: str,
    genome_file: Optional[str] = None,
    max_peak_distance: int = 100_000,
    distance_factor_scale: float = 20_000.0
) -> pd.DataFrame:
    """
    Build a TF–peak–TG dataframe from ChIP-Atlas peaks and gene TSS BED,
    linking each peak to *all* genes with TSS within max_peak_distance,
    and assigning an exponential distance-based score.

    Parameters:
      - chip_atlas_bed_file: path to the ChIP-Atlas peak BED file.
      - gene_tss_bed_file: path to the gene TSS BED file.
      - genome_file: optional chromosome-sizes file; when given, both BED
        files are sorted in that chromosome order (``sort(g=...)``),
        otherwise bedtools' default sort is used.
      - max_peak_distance: window (bp) passed to ``bedtools window -w`` and
        also the cut-off applied to the computed peak–TSS distance.
      - distance_factor_scale: decay length (bp) of the score
        ``exp(-TSS_dist / distance_factor_scale)``; must be > 0.

    Returns a DataFrame with columns:
      - tf_name
      - peak_id, peak_chr, peak_start, peak_end
      - tg_name, tss_chr, tss_start, tss_end
      - TSS_dist, TSS_dist_score
      - peak_annot, peak_score, peak_strand, peak_rgb
    An empty DataFrame with these columns is returned when no peak–TSS
    pair falls inside the window.

    Raises:
      - ValueError: if distance_factor_scale is not strictly positive.
    """
    if distance_factor_scale <= 0:
        raise ValueError(
            f"distance_factor_scale must be > 0, got {distance_factor_scale!r}"
        )

    output_cols = [
        "tf_name",
        "peak_id", "peak_chr", "peak_start", "peak_end",
        "tg_name", "tss_chr", "tss_start", "tss_end",
        "TSS_dist", "TSS_dist_score",
        "peak_annot", "peak_score", "peak_strand", "peak_rgb",
    ]

    # -----------------------------
    # 1. Load and clean BedTools
    # -----------------------------
    chip_bed = pybedtools.BedTool(chip_atlas_bed_file)
    tss_bed  = pybedtools.BedTool(gene_tss_bed_file)

    # Drop random / chrUn contigs from TSS (the peak file is left unfiltered)
    tss_bed = tss_bed.filter(
        lambda f: "random" not in f.chrom and "chrUn" not in f.chrom
    ).saveas()

    # Sort both consistently
    if genome_file is not None:
        chip_sorted = chip_bed.sort(g=genome_file)
        tss_sorted  = tss_bed.sort(g=genome_file)
    else:
        chip_sorted = chip_bed.sort()
        tss_sorted  = tss_bed.sort()

    # -----------------------------
    # 2. Get all peak–TSS pairs within window
    # -----------------------------
    # bedtools window -w max_peak_distance
    # This returns all (peak, TSS) pairs whose intervals are within ±max_peak_distance
    chip_tss_pairs = chip_sorted.window(tss_sorted, w=max_peak_distance)

    # ChIP-Atlas peaks are typically BED9:
    #  chrom, start, end, annot, score, strand, thickStart, thickEnd, rgb
    # TSS BED is usually: chrom, start, end, gene_name, strand
    # -> total 9 + 5 = 14 columns
    # NOTE(review): assumes exactly this 9 + 5 layout; inputs with a different
    # number of columns will be mislabelled — confirm against the input files.
    cols = [
        "peak_chr", "peak_start", "peak_end",
        "peak_annot", "peak_score", "peak_strand",
        "peak_thick_start", "peak_thick_end", "peak_rgb",
        "tss_chr", "tss_start", "tss_end",
        "tg_name", "tss_strand"
    ]
    df = chip_tss_pairs.to_dataframe(names=cols)

    # Nothing within the window: hand back an empty frame with the final schema
    # instead of failing on the column operations below.
    if df.empty:
        return pd.DataFrame(columns=output_cols)

    # Ensure numeric
    for col in ["peak_start", "peak_end", "tss_start", "tss_end"]:
        df[col] = df[col].astype(int)

    # -----------------------------
    # 3. Compute distance & score
    # -----------------------------
    # Vectorised edge-to-edge distance: 0 when the intervals overlap or touch
    # (inclusive comparison), otherwise the gap between the nearest edges.
    peak_start = df["peak_start"].to_numpy()
    peak_end = df["peak_end"].to_numpy()
    tss_start = df["tss_start"].to_numpy()
    tss_end = df["tss_end"].to_numpy()

    overlaps = (peak_end >= tss_start) & (tss_end >= peak_start)
    peak_before_tss = peak_end < tss_start
    df["TSS_dist"] = np.where(
        overlaps,
        0,
        np.where(peak_before_tss, tss_start - peak_end, peak_start - tss_end),
    )

    # Filter to actual max_peak_distance based on the computed distance
    df = df[df["TSS_dist"] <= max_peak_distance].copy()

    # Exponential distance score
    df["TSS_dist_score"] = np.exp(-df["TSS_dist"] / float(distance_factor_scale))

    # -----------------------------
    # 4. Extract TF name & peak_id
    # -----------------------------
    # ChIP-Atlas annot looks like:
    # ID=SRX4061019;Name=Smad4%20(@%20Forelimb);Title=...
    # The name ends at the first "%" (URL-encoded space) or ";" (next field),
    # so a bare "Name=XYZ;Title=..." does not swallow the following fields.
    df["tf_name"] = (
        df["peak_annot"]
        .astype(str)
        .str.extract(r"Name=([^%;]+)", expand=False)
        .str.upper()  # normalize to uppercase
    )

    # Peak ID like chr:start-end
    df["peak_id"] = (
        df["peak_chr"].astype(str)
        + ":" +
        df["peak_start"].astype(str)
        + "-" +
        df["peak_end"].astype(str)
    )

    # Reorder columns to a tidy TF–peak–TG layout
    return df[output_cols]


In [3]:
# MAX_PEAK_DISTANCE      = 1_000_000      # or 100_000 if you want to match your other code
# DISTANCE_SCALE_FACTOR  = 20_000

# chip_tf_peak_tg_df = build_chip_tf_peak_tg_distance_df(
#     chip_atlas_bed_file=chipatlas_file,
#     gene_tss_bed_file=tss_file,
#     genome_file=genome_file,
#     max_peak_distance=MAX_PEAK_DISTANCE,
#     distance_factor_scale=DISTANCE_SCALE_FACTOR
# )

# chip_tf_peak_tg_df = chip_tf_peak_tg_df[["tf_name", "peak_id", "tg_name", "TSS_dist", "TSS_dist_score"]]
# chip_tf_peak_tg_df.rename(columns={"tf_name":"source_id", "tg_name":"target_id", "TSS_dist":"tss_distance", "TSS_dist_score":"tss_distance_score"}, inplace=True)
# chip_tf_peak_tg_df

# edge_chip_df = chip_tf_peak_tg_df[["source_id", "target_id"]].drop_duplicates()
# edge_chip_df.to_csv(os.path.join(ground_truth_dir, f"chipatlas_{dataset_name}.csv"), index=False)

In [18]:
def split_df_peak_id_to_chrom_start_end(df: pd.DataFrame, peak_id_col: str = "peak_id", pad_peak_range=1000) -> pd.DataFrame:
    """
    Given a DataFrame with a column containing peak IDs in the format "chr:start-end",
    return a copy with 'Chromosome', 'Start' and 'End' columns added, where the
    interval is widened by `pad_peak_range` on each side.

    Parameters:
    - df: Input DataFrame containing the peak ID column. It is not modified.
    - peak_id_col: Name of the column in df that contains the peak IDs.
    - pad_peak_range: Number of bases subtracted from the start and added to the
      end. The padded start is clamped at 0.

    Returns:
    - A new DataFrame with additional 'Chromosome', 'Start', and 'End' columns
      (existing columns with those names are overwritten in the copy).

    Raises:
    - ValueError: if any value in `peak_id_col` is not of the form "chr:start-end".
    """
    # Greedy chromosome group: only the last ":" separates the coordinates,
    # so contig names that themselves contain ":" are still parsed.
    parts = df[peak_id_col].str.extract(r"^(?P<chrom>.+):(?P<start>\d+)-(?P<end>\d+)$")

    malformed = parts.isna().any(axis=1)
    if malformed.any():
        examples = df.loc[malformed, peak_id_col].head(5).tolist()
        raise ValueError(
            f"{int(malformed.sum())} value(s) in column {peak_id_col!r} are not of the "
            f"form 'chr:start-end', e.g. {examples}"
        )

    start = parts["start"].astype(int)
    end = parts["end"].astype(int)

    # assign() works on a copy, so the caller's frame is left untouched.
    return df.assign(
        Chromosome=parts["chrom"],
        Start=(start - pad_peak_range).clip(lower=0),
        End=end + pad_peak_range,
    )

In [5]:
# Read the ChIP-Atlas edge table CSV for the selected dataset from the ground-truth directory.
chipatlas_edges_csv = os.path.join(ground_truth_dir, f"chipatlas_{dataset_name}.csv")
edge_chip_df = pd.read_csv(chipatlas_edges_csv)
edge_chip_df

Unnamed: 0,source_id,peak_id,target_id,distance
0,ADNP,chr1:9818-10460,DDX11L1,1551
1,ADNP,chr1:9825-10454,DDX11L1,1557
2,NR2C2,chr1:9846-10627,DDX11L1,1384
3,RELA,chr1:9856-10319,DDX11L1,1692
4,EP400,chr1:9866-10466,DDX11L1,1545
...,...,...,...,...
17417545,KDM1A,chrUn_KI270423v1:523-696,.,-1
17417546,ZNF133,chrUn_KI270423v1:525-702,.,-1
17417547,ATRX,chrUn_KI270423v1:540-675,.,-1
17417548,LMNA,chrUn_KI270392v1:576-785,.,-1


In [29]:
# Keep only the ChIP-Atlas rows whose target gene is APOL4 and drop rows with any missing value.
targets_apol4 = edge_chip_df["target_id"] == "APOL4"
apol4_edges = edge_chip_df.loc[targets_apol4].dropna().copy()

# Add Chromosome/Start/End (peak coordinates padded by 10 kb on each side) and rename
# the ID column so it stays distinct from the sliding-window peak IDs.
selected_edge_chip_df = split_df_peak_id_to_chrom_start_end(
    apol4_edges,
    peak_id_col="peak_id",
    pad_peak_range=10_000,
).rename(columns={"peak_id": "chip_peak_id"})

selected_edge_chip_df

Unnamed: 0,source_id,chip_peak_id,target_id,distance,Chromosome,Start,End
16683156,CTCF,chr22:36194734-36195349,APOL4,6461,chr22,36184734,36205349
16683157,CTCF,chr22:36194772-36195404,APOL4,6406,chr22,36184772,36205404
16683158,CTCF,chr22:36194844-36195371,APOL4,6439,chr22,36184844,36205371
16683159,CTCF,chr22:36194864-36195320,APOL4,6490,chr22,36184864,36205320
16683160,CTCF,chr22:36194867-36195281,APOL4,6529,chr22,36184867,36205281
...,...,...,...,...,...,...,...
16683528,ZBTB7A,chr22:36213087-36213232,APOL4,11278,chr22,36203087,36223232
16683529,TEAD4,chr22:36213101-36213269,APOL4,11292,chr22,36203101,36223269
16683530,TAL1,chr22:36213118-36213267,APOL4,11309,chr22,36203118,36223267
16683531,SPI1,chr22:36213137-36213262,APOL4,11328,chr22,36203137,36223262


In [30]:
# Sliding-window scores for chr22, read from the K562 training-data cache.
sliding_window_path = os.path.join(
    PROJECT_DIR,
    "data/training_data_cache/K562_base_settings/chr22/sliding_window_chr22.parquet",
)
sliding_window_raw = pd.read_parquet(sliding_window_path, engine="pyarrow")

# Add padded Chromosome/Start/End columns (10 kb per side), then rename the TF and
# peak-ID columns to the names used when comparing against the ChIP-Atlas table.
sliding_window = split_df_peak_id_to_chrom_start_end(
    sliding_window_raw,
    peak_id_col="peak_id",
    pad_peak_range=10_000,
).rename(columns={"TF": "source_id", "peak_id": "sliding_window_peak_id"})

sliding_window

Unnamed: 0,source_id,sliding_window_peak_id,sliding_window_score,Chromosome,Start,End
0,ATF3,chr22:16447038-16447935,0.169131,chr22,16437038,16457935
1,ATF3,chr22:16543361-16544112,0.809747,chr22,16533361,16554112
2,ATF3,chr22:16562363-16562849,0.000000,chr22,16552363,16572849
3,ATF3,chr22:16585791-16588168,1.345531,chr22,16575791,16598168
4,ATF3,chr22:16595480-16596804,0.136343,chr22,16585480,16606804
...,...,...,...,...,...,...
93679,ZSCAN31,chr22:50568753-50569154,0.225804,chr22,50558753,50579154
93680,ZSCAN31,chr22:50705274-50705753,0.000000,chr22,50695274,50715753
93681,ZSCAN31,chr22:50732078-50732614,0.000000,chr22,50722078,50742614
93682,ZSCAN31,chr22:50775049-50775502,0.000000,chr22,50765049,50785502


In [31]:
# Build the two interval sets for the overlap join.
# Both tables repeat the same peak on many rows (one row per TF), and the
# Chromosome/Start/End columns are derived from the peak ID, so repeated peaks are
# exact duplicates here. Dropping them before the join avoids generating every
# overlapping pair once per repeated row only to discard the copies afterwards.
chip_interval_cols = ["Chromosome", "Start", "End", "chip_peak_id"]
sw_interval_cols = ["Chromosome", "Start", "End", "sliding_window_peak_id"]

a = pr.PyRanges(selected_edge_chip_df[chip_interval_cols].drop_duplicates())
b = pr.PyRanges(sliding_window[sw_interval_cols].drop_duplicates())

hits = a.join(b)   # pairs of padded intervals that intersect
hits_df = hits.df
# Kept as a safety net: guarantees one row per (ChIP peak, sliding-window peak) pair.
hits_df = hits_df.drop_duplicates(subset=["chip_peak_id", "sliding_window_peak_id"])

print("Overlapping interval pairs:", len(hits_df))
hits_df

Overlapping interval pairs: 634


Unnamed: 0,Chromosome,Start,End,chip_peak_id,Start_b,End_b,sliding_window_peak_id
0,chr22,36184734,36205349,chr22:36194734-36195349,36184982,36205349,chr22:36194982-36195349
111,chr22,36184772,36205404,chr22:36194772-36195404,36184982,36205349,chr22:36194982-36195349
222,chr22,36184844,36205371,chr22:36194844-36195371,36184982,36205349,chr22:36194982-36195349
333,chr22,36184864,36205320,chr22:36194864-36195320,36184982,36205349,chr22:36194982-36195349
444,chr22,36184867,36205281,chr22:36194867-36195281,36184982,36205349,chr22:36194982-36195349
...,...,...,...,...,...,...,...
69930,chr22,36203118,36223267,chr22:36213118-36213267,36184982,36205349,chr22:36194982-36195349
70041,chr22,36203118,36223267,chr22:36213118-36213267,36210350,36230742,chr22:36220350-36220742
70152,chr22,36203137,36223262,chr22:36213137-36213262,36184982,36205349,chr22:36194982-36195349
70263,chr22,36203137,36223262,chr22:36213137-36213262,36210350,36230742,chr22:36220350-36220742


In [None]:
# Step 1: attach the ChIP-Atlas columns to every overlapping peak pair.
# Chromosome/Start/End exist on both sides, so the right-hand copies get "_chip".
hits_with_chip = hits_df.merge(
    selected_edge_chip_df,
    how="left",
    on="chip_peak_id",
    suffixes=("", "_chip"),
)

# Step 2: attach the sliding-window columns. Colliding right-hand columns,
# including the sliding-window TF ("source_id"), get the "_sw" suffix.
merged = hits_with_chip.merge(
    sliding_window,
    how="left",
    on="sliding_window_peak_id",
    suffixes=("", "_sw"),
)
print("Merged rows:", len(merged))

# Keep only rows where the ChIP-Atlas TF and the sliding-window TF are the same.
same_tf = merged["source_id"] == merged["source_id_sw"]
merged_same_source = merged.loc[same_tf]
print("Merged rows with same source:", len(merged_same_source))

# Reduce to the reporting columns and remove repeated rows.
report_cols = [
    "source_id",
    "chip_peak_id",
    "sliding_window_peak_id",
    "target_id",
    "distance",
    "sliding_window_score",
]
merged_same_source = merged_same_source[report_cols].drop_duplicates()
merged_same_source

Merged rows: 70485
Merged rows with same source: 49


Unnamed: 0,source_id,chip_peak_id,sliding_window_peak_id,target_id,distance,sliding_window_score
7073,ZNF143,chr22:36195003-36195232,chr22:36194982-36195349,APOL4,6578,0.0
11023,MAX,chr22:36197627-36197764,chr22:36194982-36195349,APOL4,4046,0.0
11185,ZNF281,chr22:36197679-36197957,chr22:36194982-36195349,APOL4,3853,0.290717
14044,SOX6,chr22:36201104-36201341,chr22:36194982-36195349,APOL4,469,1.160521
14155,SOX6,chr22:36201104-36201341,chr22:36220350-36220742,APOL4,469,1.132278
14266,SOX6,chr22:36201125-36201331,chr22:36194982-36195349,APOL4,479,1.160521
14377,SOX6,chr22:36201125-36201331,chr22:36220350-36220742,APOL4,479,1.132278
18255,PRDM10,chr22:36201570-36201986,chr22:36194982-36195349,APOL4,0,0.0
18366,PRDM10,chr22:36201570-36201986,chr22:36220350-36220742,APOL4,0,1.158187
22450,KDM5B,chr22:36201649-36201864,chr22:36194982-36195349,APOL4,0,0.0


In [33]:
# Write the same-TF matches for APOL4 to CSV, without the row index.
apol4_matches_csv = "/gpfs/Home/esm5360/general_code/alphagenome/chip_atlas_sliding_window_apol4.csv"
merged_same_source.to_csv(apol4_matches_csv, index=False)

In [17]:
# Display the same-TF match table.
# NOTE(review): this cell's execution count (17) is lower than that of the cells
# above it, and its saved output differs from the merge cell's output, so the
# stored result is likely stale — re-run after the merge cell to confirm.
merged_same_source

Unnamed: 0,source_id,chip_peak_id,sliding_window_peak_id,target_id,distance,sliding_window_score
800945,ZNF143,chr22:36195003-36195232,chr22:36194982-36195349,APOL4,6578,0.0
