In [1]:
import os
import numpy as np
from scipy.stats import fisher_exact
import pandas as pd
import pybedtools

CURRENT_DIR_PATH = os.path.dirname(os.path.abspath("__file__"))
pybedtools.helpers.set_tempdir(CURRENT_DIR_PATH)




In [2]:
negative_controls_file = "/data5/deepro/starrseq/papers/reproducibility/5_peak_qc/data/negative_controls.bed"
master_fragments_file = "/data5/deepro/starrseq/papers/results/1_compare_activity_ko_vs_wt/data/window/IN/master_filtered_windows.bed"
refseq_file = "/data5/deepro/starrseq/papers/reproducibility/0_in-house_dataset/data/tss/Refseq_hg38_gene_coordinates.txt"

In [3]:
def parse_gtf_to_get_gene_coords(gtf_file):
    gtf_df = pd.read_csv(gtf_file, sep="\t", low_memory=False, usecols=["chrom", "txStart", "txEnd", "name2"])
    gtf_df = gtf_df.groupby("name2").agg({"chrom": lambda x: list(x)[0], "txStart": min, "txEnd": max})
    return gtf_df.reset_index().rename(columns={"name2": "gene"}).loc[:, ["chrom", "txStart", "txEnd", "gene"]]

In [4]:
exon_bed = pybedtools.BedTool(negative_controls_file)
master_bed = pybedtools.BedTool(master_fragments_file)
gene_df = parse_gtf_to_get_gene_coords(refseq_file)
gene_bed = pybedtools.BedTool.from_dataframe(gene_df)
intron_bed = gene_bed - exon_bed

In [5]:
# exonic fragments within the master
exon_fragments = master_bed.intersect(exon_bed, f=0.95, F=0.95, e=True, u=True)

In [6]:
# intronic fragments within master
intron_fragments = master_bed.intersect(intron_bed, f=0.95, F=0.95, e=True, u=True)

In [7]:
# intronic and intergenic fragments within master
other_fragments = master_bed.intersect(exon_bed, f=0.95, F=0.95, e=True, v=True)

In [8]:
nexons_fragments, nintron_fragments, nother_fragments = len(exon_fragments), len(intron_fragments), len(other_fragments)

In [9]:
# 
libs= ["CC", "ATF2", "FOXA1", "LEF1", "SCRT1", "TCF7L2", "16P12_1"]
peak_dir = "/data5/deepro/starrseq/papers/results/2_categorize_fragments_on_activity/data"
peak_dict = dict()

for lib in libs:
    peak_file = os.path.join(peak_dir, lib, "peaks.bed")
    peak_bed = pybedtools.BedTool(peak_file)
    peaks_in_exons = peak_bed.intersect(exon_fragments, f=1, u=True)
    peaks_in_others = peak_bed.intersect(intron_fragments, f=1, u=True)
    peak_dict[lib] = (len(peaks_in_exons), len(peaks_in_others))
    cont_table = np.array([[peak_dict[lib][0], peak_dict[lib][1]], [nexons_fragments - peak_dict[lib][0], nintron_fragments-peak_dict[lib][1]]])
    res = fisher_exact(cont_table, alternative='less')
    print(lib)
    print(cont_table)
    print(res)

CC
[[    59   1091]
 [  6482 117667]]
(0.9816867184342681, 0.4794330602242858)
ATF2
[[    22    395]
 [  6519 118363]]
(1.0112547354276982, 0.5765590906601616)
FOXA1
[[    62   1334]
 [  6479 117424]]
(0.8423348134545168, 0.10259678436706286)
LEF1
[[    76    934]
 [  6465 117824]]
(1.4829685789566949, 0.999300660095073)
SCRT1
[[    45   1074]
 [  6496 117684]]
(0.759066172221152, 0.03677733456328781)
TCF7L2
[[    78   1296]
 [  6463 117462]]
(1.0938375711313975, 0.7972446665779745)
16P12_1
[[    56   1204]
 [  6485 117554]]
(0.8431191838052035, 0.11717863426451983)


In [10]:
# 
libs= ["CC", "ATF2", "FOXA1", "LEF1", "SCRT1", "TCF7L2", "16P12_1"]
peak_dir = "/data5/deepro/starrseq/papers/results/2_categorize_fragments_on_activity/data"
peak_dict = dict()

for lib in libs:
    peak_file = os.path.join(peak_dir, lib, "peaks.bed")
    peak_bed = pybedtools.BedTool(peak_file)
    peaks_in_exons = peak_bed.intersect(exon_fragments, f=1, u=True)
    peaks_in_others = peak_bed.intersect(other_fragments, f=1, u=True)
    peak_dict[lib] = (len(peaks_in_exons), len(peaks_in_others))
    cont_table = np.array([[peak_dict[lib][0], peak_dict[lib][1]], [nexons_fragments - peak_dict[lib][0], nother_fragments-peak_dict[lib][1]]])
    res = fisher_exact(cont_table, alternative='less')
    print(lib)
    print(cont_table)
    print(res)

CC
[[    59   2317]
 [  6482 245303]]
(0.9636510761117038, 0.4226513088678629)
ATF2
[[    22    856]
 [  6519 246764]]
(0.9728586317779947, 0.5052792943231644)
FOXA1
[[    62   2916]
 [  6479 244704]]
(0.8030401480693878, 0.04644108315935814)
LEF1
[[    76   2154]
 [  6465 245466]]
(1.339648029781101, 0.9931589834765502)
SCRT1
[[    45   2221]
 [  6496 245399]]
(0.7654040002173609, 0.03979213599539022)
TCF7L2
[[    78   2853]
 [  6463 244767]]
(1.035408056830168, 0.6464546280212538)
16P12_1
[[    56   2562]
 [  6485 245058]]
(0.8259767180252032, 0.08579971537218026)


In [11]:
pybedtools.helpers.cleanup(remove_all=True)