In [12]:
import os
import pandas as pd
import pybedtools
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Optional
from tqdm import tqdm

PROJECT_DIR = "/gpfs/Labs/Uzun/SCRIPTS/PROJECTS/2024.SINGLE_CELL_GRN_INFERENCE.MOELLER/"

organism = "mm10" # mm10 or hg38
dataset_name = "mESC"

encode_dir = os.path.join(PROJECT_DIR, "data/ground_truth_files/encode")
tss_file = os.path.join(PROJECT_DIR, f"data/genome_data/genome_annotation/{organism}/gene_tss.bed")
genome_file = os.path.join(PROJECT_DIR, f"data/genome_data/reference_genome/{organism}/{organism}.chrom.sizes")

download_path = os.path.join(encode_dir, f"{dataset_name}_files.txt")


In [13]:
def load_metadata(download_path):
    with open(download_path, "r") as f:
        first_line = f.readline().strip().replace('"', "")
        print(f"Metadata file download URL: {first_line}")

    metadata_df = pd.read_csv(first_line, sep="\t")
    return metadata_df

def filter_metadata(metadata_df):    
    metadata_filter_df = metadata_df[["File accession", "File download URL", "Experiment target"]].copy()
    metadata_filter_df["Experiment target"] = metadata_filter_df["Experiment target"].str.replace("-human", "", regex=False).str.replace("-mouse", "", regex=False)
    metadata_filter_df = metadata_filter_df[metadata_filter_df["File download URL"].str.endswith(".bed.gz")]
    metadata_filter_df = metadata_filter_df.rename(columns={"Experiment target":"TF"})
        
    return metadata_filter_df

PEAK_COLS = ["chr","start","end","name","score","strand","signalValue","pValue","qValue","peak"]

def _fetch_one(url: str, source_id: str) -> pd.DataFrame:
    df = pd.read_csv(
        url,
        compression="gzip",
        header=None,
        sep="\t",
        names=PEAK_COLS,
        engine="c",
    )
    df["source_id"] = source_id
    return df

def retrieve_chip_seq_binding_sites(metadata_filter_df: pd.DataFrame,
                                         num_rows: Optional[int] = None,
                                         max_workers: int = 16) -> pd.DataFrame:
    df0 = metadata_filter_df if num_rows is None else metadata_filter_df.iloc[:num_rows]

    # itertuples(name=None) -> plain tuples, so unpack directly
    tasks = list(df0[["File download URL", "TF"]].itertuples(index=False, name=None))

    dfs = []
    failures = 0

    with ThreadPoolExecutor(max_workers=max_workers) as ex:
        futures = [ex.submit(_fetch_one, url, tf) for (url, tf) in tasks]
        for fut in tqdm(as_completed(futures), total=len(futures), desc="Downloading ChIP-seq peaks", ncols=100):
            try:
                dfs.append(fut.result())
            except Exception:
                failures += 1

    out = pd.concat(dfs, ignore_index=True) if dfs else pd.DataFrame(columns=PEAK_COLS + ["source_id"])
    if failures:
        print(f"Failed downloads/parses: {failures} / {len(tasks)}")
    return out

def filter_low_confidence_peaks(df, min_width=50, max_width=2000, signal_value_quantile=0.75):
    # Filter peaks with low confidence
    df = df[df["qValue"] > 0]
    
    # Filter peaks that are too small or too large
    df["width"] = df["end"] - df["start"]
    
    df = df[(df["width"] >= min_width) & (df["width"] <= max_width)]
    
    df = df.drop(columns=["width"])
    
    # Remove weak signal (less than 3rd quantile)
    df = df[df["signalValue"] > df["signalValue"].quantile(signal_value_quantile)]
    
    return df

def map_tf_to_closest_gene_tss(chip_df, tss_bed_file, genome_file=None):
    chip_bed = pybedtools.BedTool.from_dataframe(chip_df)
    tss_bed = pybedtools.BedTool(tss_bed_file)
    genome_file = "/gpfs/Labs/Uzun/SCRIPTS/PROJECTS/2024.SINGLE_CELL_GRN_INFERENCE.MOELLER/data/genome_data/reference_genome/hg38/hg38.chrom.sizes"  # or None if you don't have one
    tss_bed = tss_bed.filter(lambda f: "random" not in f.chrom and "chrUn" not in f.chrom and "chrEBV" not in f.chrom).saveas()  # drop random/chrUn/chrEBV contigs").saveas()

    if genome_file:
        chip_sorted = chip_bed.sort(g=genome_file)
        tss_sorted  = tss_bed.sort(g=genome_file)
    else:
        chip_sorted = chip_bed.sort()
        tss_sorted  = tss_bed.sort()

    chip_closest_tss = chip_sorted.closest(tss_sorted, d=True, g=genome_file)


    raw_chip_closest_tss_df = chip_closest_tss.to_dataframe(
        names=[
            "peak_name",
            "peak_score",
            "peak_strand",
            "peak_thick_start",
            "peak_thick_end",
            "source_id",
            "tss_chr",
            "tss_start",
            "tss_end",
            "tss_gene",
            "distance"
        ]
    ).reset_index()

    raw_chip_closest_tss_df = raw_chip_closest_tss_df.rename(
        columns={
            "level_0": "peak_chr",
            "level_1": "peak_start",
            "level_2": "peak_end",
        }
    )

    raw_chip_closest_tss_df["peak_id"] = (
        raw_chip_closest_tss_df["peak_chr"].astype(str)
        + ":" +
        raw_chip_closest_tss_df["peak_start"].astype(str)
        + "-" +
        raw_chip_closest_tss_df["peak_end"].astype(str)
    )
    raw_chip_closest_tss_df["source_id"] = raw_chip_closest_tss_df["source_id"].str.upper()
    raw_chip_closest_tss_df["target_id"] = raw_chip_closest_tss_df["tss_gene"].str.upper()

    chip_closest_tss_df = raw_chip_closest_tss_df[["source_id", "peak_id", "target_id", "distance"]]
    chip_closest_tss_df.rename(columns={"distance":"tss_distance"}, inplace=True)
    
    return chip_closest_tss_df



## ChIP-seq

In [14]:
metadata_df = load_metadata(download_path)
metadata_filter_df = filter_metadata(metadata_df)
metadata_df.head()

Metadata file download URL: https://www.encodeproject.org/metadata/?assay_title=TF+ChIP-seq&status=released&replicates.library.biosample.donor.organism.scientific_name=Mus+musculus&replicates.library.biosample.life_stage=embryonic&biosample_ontology.cell_slims=stem+cell&perturbed=false&type=Experiment&files.analyses.status=released&files.preferred_default=true


Unnamed: 0,File accession,File format,File type,File format type,Output type,File assembly,Experiment accession,Assay,Donor(s),Biosample term id,...,Platform,Controlled by,File Status,s3_uri,Azure URL,File analysis title,File analysis status,Audit WARNING,Audit NOT_COMPLIANT,Audit ERROR
0,ENCFF932EZQ,bigWig,bigWig,,signal p-value,mm10,ENCSR531HWD,TF ChIP-seq,/mouse-donors/ENCDO015AAA/,EFO:0007075,...,,,released,s3://encode-public/2020/11/24/2187e742-569a-4f...,https://datasetencode.blob.core.windows.net/da...,ENCODE4 v1.6.1 mm10,released,"inconsistent platforms, low read depth, mild t...",,
1,ENCFF927JDA,bed narrowPeak,bed,narrowPeak,IDR thresholded peaks,mm10,ENCSR531HWD,TF ChIP-seq,/mouse-donors/ENCDO015AAA/,EFO:0007075,...,,,released,s3://encode-public/2020/11/24/62f467e9-ecee-4d...,https://datasetencode.blob.core.windows.net/da...,ENCODE4 v1.6.1 mm10,released,"inconsistent platforms, low read depth, mild t...",,
2,ENCFF061FIN,bigBed narrowPeak,bigBed,narrowPeak,IDR thresholded peaks,mm10,ENCSR531HWD,TF ChIP-seq,/mouse-donors/ENCDO015AAA/,EFO:0007075,...,,,released,s3://encode-public/2020/11/24/eda90b29-537d-4f...,https://datasetencode.blob.core.windows.net/da...,ENCODE4 v1.6.1 mm10,released,"inconsistent platforms, low read depth, mild t...",,
3,ENCFF608GQG,bigWig,bigWig,,signal p-value,mm10,ENCSR779CZG,TF ChIP-seq,/mouse-donors/ENCDO015AAA/,EFO:0007751,...,,,released,s3://encode-public/2020/11/24/fb06a998-2a68-44...,https://datasetencode.blob.core.windows.net/da...,ENCODE4 v1.6.1 mm10,released,"borderline replicate concordance, mixed read l...",,
4,ENCFF097ISB,bed narrowPeak,bed,narrowPeak,IDR thresholded peaks,mm10,ENCSR779CZG,TF ChIP-seq,/mouse-donors/ENCDO015AAA/,EFO:0007751,...,,,released,s3://encode-public/2020/11/24/e482d4c7-00f8-46...,https://datasetencode.blob.core.windows.net/da...,ENCODE4 v1.6.1 mm10,released,"borderline replicate concordance, mixed read l...",,


In [15]:
tf_binding_sites_df = retrieve_chip_seq_binding_sites(metadata_filter_df)

print(f"Number of TFs before filtering: {tf_binding_sites_df['source_id'].nunique():,}")
print(f"Number of peaks before filtering: {tf_binding_sites_df.shape[0]:,}")

Downloading ChIP-seq peaks: 100%|███████████████████████████████████| 11/11 [00:01<00:00,  9.94it/s]


Number of TFs before filtering: 10
Number of peaks before filtering: 217,224


In [16]:
high_confidence_tf_binding_sites_df = filter_low_confidence_peaks(
    tf_binding_sites_df,
    min_width=50,
    max_width=2000,
    signal_value_quantile=0.75
    )

print(f"Number of TFs after filtering: {high_confidence_tf_binding_sites_df['source_id'].nunique():,}")
print(f"Number of peaks after filtering: {high_confidence_tf_binding_sites_df.shape[0]:,}")

high_confidence_tf_binding_sites_df.head()


Number of TFs after filtering: 10
Number of peaks after filtering: 54,289


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["width"] = df["end"] - df["start"]


Unnamed: 0,chr,start,end,name,score,strand,signalValue,pValue,qValue,peak,source_id
0,chr6,125215065,125215359,.,1000,.,436.24231,-1.0,3.63236,140,POU5F1
1,chr5,144083579,144083844,.,1000,.,376.32668,-1.0,3.63236,133,POU5F1
2,chr7,99980384,99980658,.,1000,.,363.74136,-1.0,3.63236,146,POU5F1
3,chr18,65893220,65893482,.,1000,.,361.63903,-1.0,3.63236,122,POU5F1
4,chr14,86396078,86396363,.,1000,.,358.2097,-1.0,3.63236,134,POU5F1


In [17]:
valid_croms = [f"chr{i}" for i in range(1, 19)]

high_confidence_tf_binding_sites_df = high_confidence_tf_binding_sites_df[high_confidence_tf_binding_sites_df["chr"].isin(valid_croms)].copy()

tf_to_tg_df = map_tf_to_closest_gene_tss(high_confidence_tf_binding_sites_df, tss_file, genome_file).sort_values(by=["source_id", "target_id"])
tf_to_tg_df.head()
print(f"Number of unique TFs: {tf_to_tg_df['source_id'].nunique():,}")
print(f"Number of unique TGs: {tf_to_tg_df['target_id'].nunique():,}")
print(f"Number of unique edges: {tf_to_tg_df.shape[0]:,}")

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chip_closest_tss_df.rename(columns={"distance":"tss_distance"}, inplace=True)


Number of unique TFs: 10
Number of unique TGs: 15,135
Number of unique edges: 52,187


In [18]:
tf_to_tg_df = tf_to_tg_df[["source_id", "target_id"]]
tf_to_tg_df.to_csv(os.path.join(encode_dir, f"{dataset_name}_encode_tf_peak_tg_dist.csv"), index=False)