In [1]:
import os
import numpy as np
import pandas as pd

In [2]:
# Library identifiers. Each one is used as a sub-directory name under
# `sda_sde_table_dir` and as the stem of a `<lib>.tsv` file under `tgi_dir`.
libraries = [
    "ATF2",
    "CTCF",
    "FOXA1",
    "LEF1",
    "SCRT1",
    "TCF7L2",
    "16P12_1",
]

# Both input directories live under the same results folder.
results_data_dir = "/data5/deepro/starrseq/papers/results/6_link_da_enhancers_to_de_genes/data"
sda_sde_table_dir = f"{results_data_dir}/da_de_peaks"  # per-library peak/gene tables
tgi_dir = f"{results_data_dir}/targets"  # per-library target-gene lists

In [3]:
def get_top_targets(lib_df, tgi_dir, lib):
    """Keep the rows of ``lib_df`` whose gene is a listed target for ``lib``.

    The target list is read from the tab-separated file ``<tgi_dir>/<lib>.tsv``
    (column ``Name.Target``).

    Args:
        lib_df: DataFrame with a ``gene_name`` column plus the columns
            ``<lib>_log2FoldChange_act`` and ``<lib>_log2FoldChange_exp``.
        tgi_dir: Directory holding one ``<lib>.tsv`` target file per library.
        lib: Library name; selects the target file and the sort columns.

    Returns:
        The matching rows, sorted ascending by ``<lib>_log2FoldChange_act`` and
        then by ``<lib>_log2FoldChange_exp``. The original index labels are kept.
    """
    targets_path = os.path.join(tgi_dir, f"{lib}.tsv")
    target_genes = pd.read_csv(targets_path, sep="\t")["Name.Target"].values

    is_target = lib_df.gene_name.isin(target_genes)
    sort_keys = [f"{lib}_log2FoldChange_act", f"{lib}_log2FoldChange_exp"]
    return lib_df.loc[is_target].sort_values(sort_keys)

In [35]:
# look for unique associations

def create_unique_df(sda_sde_table_dir, libraries, proportional_filename, tgi_dir):
    """Stack the per-library target tables and label gene/direction pairs.

    For every library the CSV ``<sda_sde_table_dir>/<lib>/<proportional_filename>``
    is read, filtered and sorted by ``get_top_targets``, tagged with a ``lib``
    column, and every ``<lib>_`` fragment in its column names is rewritten to
    ``lib_`` so that all libraries share one set of column names.

    Args:
        sda_sde_table_dir: Directory containing one sub-directory per library.
        libraries: Iterable of library names to include.
        proportional_filename: Name of the CSV file inside each library directory.
        tgi_dir: Directory with the ``<lib>.tsv`` target files used by
            ``get_top_targets``.

    Returns:
        One DataFrame with the rows of all libraries (per-library index labels
        are kept, so the index may contain duplicates) and these extra columns:

        * ``lib``: library the row came from.
        * ``dir``: ``"up"`` when ``lib_log2FoldChange_act`` > 0, otherwise
          ``"down"`` (zero and missing values therefore count as ``"down"``).
        * ``unique``: ``<gene_name>_<dir>``.
        * ``unique_counts``: number of rows in the combined table that share the
          same ``unique`` label. NOTE(review): this counts rows, so a gene that
          appears on several rows of a single library also gets a count > 1.

    Raises:
        ValueError: If ``libraries`` is empty.
    """
    libraries = list(libraries)
    if not libraries:
        # Without any table there is no `lib_log2FoldChange_act` column to work on.
        raise ValueError("libraries must contain at least one library name")

    # Collect the per-library frames and concatenate once at the end instead of
    # re-copying a growing frame (seeded with an empty DataFrame) on every pass.
    per_lib_frames = []
    for lib_name in libraries:
        lib_file = os.path.join(sda_sde_table_dir, lib_name, proportional_filename)
        lib_df = pd.read_csv(lib_file)
        top_targets_df = (
            get_top_targets(lib_df, tgi_dir, lib_name)
            .assign(lib=lib_name)
            # Same substitution as before: "<lib>_" -> "lib_" wherever it occurs.
            .rename(columns=lambda col, prefix=f"{lib_name}_": col.replace(prefix, "lib_"))
        )
        per_lib_frames.append(top_targets_df)

    df = pd.concat(per_lib_frames)

    # Bracket access throughout: columns called "unique" and "dir" are easy to
    # confuse with attributes/builtins when accessed with dot notation.
    df["dir"] = df["lib_log2FoldChange_act"].gt(0).map({True: "up", False: "down"})
    df["unique"] = df["gene_name"] + "_" + df["dir"]
    df["unique_counts"] = df["unique"].map(df["unique"].value_counts())
    return df

In [44]:
# Both tables are built from every entry of `libraries` except the last one.
selected_libraries = libraries[:-1]

abc_df = create_unique_df(
    sda_sde_table_dir,
    selected_libraries,
    "abc_sda_sde_table_peaks_proportional.csv",
    tgi_dir,
)
nearest_df = create_unique_df(
    sda_sde_table_dir,
    selected_libraries,
    "nearest_sda_sde_table_peaks_proportional.csv",
    tgi_dir,
)

In [45]:
lib_name = "ATF2"
# Rows of `abc_df` from this library whose gene/direction label occurs exactly once.
occurs_once = abc_df["unique_counts"] == 1
from_this_lib = abc_df["lib"] == lib_name
abc_df.loc[occurs_once & from_this_lib]

Unnamed: 0,chrom_coord,gene_name,CC_act,lib_act,lib_padj_act,lib_log2FoldChange_act,lib_padj_exp,lib_log2FoldChange_exp,CC_peak,lib_peak,lib,dir,unique,unique_counts
19,chr2_74862400_74862900,HK2,3.150507,0.987984,1.304872e-38,-2.233672,2.810095e-05,-0.322688,1,0,ATF2,down,HK2_down,1
64,chr8_46617217_46617717,MCM4,4.934769,2.795126,5.535309e-21,-2.200575,0.0,-0.887344,1,1,ATF2,down,MCM4_down,1
65,chr8_46617217_46617717,PRKDC,4.934769,2.795126,5.535309e-21,-2.200575,1.50357e-104,-0.55007,1,1,ATF2,down,PRKDC_down,1
26,chr3_100105778_100106278,LNP1,4.234542,2.184595,4.065076e-46,-2.135866,1.376698e-46,-1.704871,1,1,ATF2,down,LNP1_down,1
36,chr3_100105778_100106278,TOMM70,4.234542,2.184595,4.065076e-46,-2.135866,7.438003e-138,-0.805085,1,1,ATF2,down,TOMM70_down,1
31,chr3_100105778_100106278,TBC1D23,4.234542,2.184595,4.065076e-46,-2.135866,1.626771e-17,-0.374797,1,1,ATF2,down,TBC1D23_down,1
8,chr11_57493699_57494199,CLP1,3.562769,1.856032,1.657817e-10,-1.753301,1.29381e-20,-0.752075,1,0,ATF2,down,CLP1_down,1
12,chr11_57493699_57494199,SSRP1,3.562769,1.856032,1.657817e-10,-1.753301,3.169645e-131,-0.71546,1,0,ATF2,down,SSRP1_down,1
9,chr11_57493699_57494199,RTN4RL2,3.562769,1.856032,1.657817e-10,-1.753301,5.601844e-09,-0.670923,1,0,ATF2,down,RTN4RL2_down,1
13,chr11_57493699_57494199,TMX2,3.562769,1.856032,1.657817e-10,-1.753301,1.997752e-48,-0.567873,1,0,ATF2,down,TMX2_down,1


In [46]:
lib_name = "ATF2"
# Rows of `nearest_df` from this library whose gene/direction label occurs exactly once.
occurs_once = nearest_df["unique_counts"] == 1
from_this_lib = nearest_df["lib"] == lib_name
nearest_df.loc[occurs_once & from_this_lib]

Unnamed: 0,chrom_coord,gene_name,CC_act,lib_act,lib_padj_act,lib_log2FoldChange_act,lib_padj_exp,lib_log2FoldChange_exp,CC_peak,lib_peak,distance,lib,dir,unique,unique_counts
57,chr2_74862400_74862900,HK2,3.150507,0.987984,1.304872e-38,-2.233672,2.810095e-05,-0.322688,1,0,0,ATF2,down,HK2_down,1
66,chr22_24039506_24040006,CABIN1,3.089341,1.086732,1.6565489999999998e-20,-2.068205,1.1327049999999999e-155,-0.981409,1,0,0,ATF2,down,CABIN1_down,1
18,chr11_74135378_74135878,C2CD3,1.679147,-0.205276,2.076161e-16,-1.991694,2.8536850000000002e-33,-0.874822,1,0,0,ATF2,down,C2CD3_down,1
78,chr22_28982887_28983387,ZNRF3,2.425711,0.665261,1.257687e-34,-1.860764,8.368742e-43,-0.787573,1,0,0,ATF2,down,ZNRF3_down,1
100,chr9_20357023_20357523,MLLT3,1.937186,0.197386,6.849494e-14,-1.857139,2.005487e-70,-0.759579,1,0,0,ATF2,down,MLLT3_down,1
71,chr22_31218687_31219187,LIMK2,1.995648,0.468998,3.289379e-24,-1.821062,1.201939e-28,-0.56303,1,0,0,ATF2,down,LIMK2_down,1
84,chr5_177269621_177270121,NSD1,2.218033,1.27295,1.259167e-09,-1.814686,0.000500566,-0.249249,1,0,0,ATF2,down,NSD1_down,1
89,chr7_6015498_6015998,AIMP2,3.145962,1.753227,1.060616e-18,-1.800756,0.02306911,-0.165972,1,0,0,ATF2,down,AIMP2_down,1
69,chr22_25061478_25061934,KIAA1671,2.408395,0.707905,1.0123070000000001e-22,-1.78774,8.397069e-59,-1.110979,1,0,0,ATF2,down,KIAA1671_down,1
85,chr5_55592682_55593182,SLC38A9,1.256289,-0.27829,5.53169e-31,-1.76822,1.011385e-05,-0.219807,1,0,32364,ATF2,down,SLC38A9_down,1


In [47]:
lib_name = "CTCF"
# Rows of `abc_df` from this library whose gene/direction label occurs exactly once.
occurs_once = abc_df["unique_counts"] == 1
from_this_lib = abc_df["lib"] == lib_name
abc_df.loc[occurs_once & from_this_lib]

Unnamed: 0,chrom_coord,gene_name,CC_act,lib_act,lib_padj_act,lib_log2FoldChange_act,lib_padj_exp,lib_log2FoldChange_exp,CC_peak,lib_peak,lib,dir,unique,unique_counts
4,chr2_88599821_88600321,KRCC1,4.187284,3.348038,0.03998625,-0.871514,3.244994e-08,-0.357255,1,1,CTCF,down,KRCC1_down,1
10,chr3_126261092_126261592,SLC41A3,3.672116,3.070218,1.679495e-07,-0.632362,2.955749e-28,-0.484671,1,1,CTCF,down,SLC41A3_down,1
6,chr2_74862400_74862900,POLE4,3.150507,2.615122,5.316381e-08,-0.563227,0.00378585,-0.216459,1,1,CTCF,down,POLE4_down,1
1,chr11_57493699_57494199,SLC43A1,3.562769,3.158577,0.000118407,-0.432845,1.0674679999999999e-19,-0.743708,1,1,CTCF,down,SLC43A1_down,1
3,chr11_57493699_57494199,UBE2L6,3.562769,3.158577,0.000118407,-0.432845,1.983648e-06,-0.638086,1,1,CTCF,down,UBE2L6_down,1


In [55]:
# Set the library explicitly so this cell does not depend on whatever value
# `lib_name` was left with by a previously executed cell.
lib_name = "CTCF"
# Rows of `nearest_df` from this library whose gene/direction label occurs exactly once.
nearest_df.loc[(nearest_df.unique_counts==1)&(nearest_df.lib==lib_name)]

Unnamed: 0,chrom_coord,gene_name,CC_act,lib_act,lib_padj_act,lib_log2FoldChange_act,lib_padj_exp,lib_log2FoldChange_exp,CC_peak,lib_peak,distance,lib,dir,unique,unique_counts
29,chr15_40098705_40099205,BMF,3.204333,2.231486,0.02071611,-1.047436,1.301484e-12,-0.603277,1,1,0,CTCF,down,BMF_down,1
35,chr15_99703804_99704304,MEF2A,3.179155,2.216252,0.0006791237,-0.993953,3.482866e-59,-0.93029,1,1,0,CTCF,down,MEF2A_down,1
61,chr3_5197293_5197793,EDEM1,3.343506,2.659346,1.465315e-10,-0.792086,1.987391e-06,-0.168738,1,1,0,CTCF,down,EDEM1_down,1
65,chr6_73713524_73714024,CD109,1.563678,0.808465,4.098986e-08,-0.784926,0.03715494,-0.154325,1,0,0,CTCF,down,CD109_down,1
28,chr15_89139514_89140014,ABHD2,3.749875,3.017944,1.969056e-11,-0.761663,7.897390999999999e-50,-0.826556,1,1,0,CTCF,down,ABHD2_down,1
1,chr1_153881783_153882283,GATAD2B,2.246217,1.526699,8.388367e-12,-0.747222,1.937269e-07,-0.27459,1,0,0,CTCF,down,GATAD2B_down,1
0,chr1_184457602_184458102,C1orf21,3.457039,2.735262,1.272677e-13,-0.746679,2.393004e-20,-0.453617,1,1,0,CTCF,down,C1orf21_down,1
64,chr5_83509321_83509821,VCAN,2.454515,1.756807,1.379569e-08,-0.726578,9.811578e-05,-0.205858,1,0,0,CTCF,down,VCAN_down,1
22,chr12_124923925_124924425,UBC,2.247392,1.549391,5.980722e-10,-0.726533,1.483876e-52,-0.316738,1,0,6558,CTCF,down,UBC_down,1
37,chr15_99155778_99156278,TTC23,2.759035,2.062596,3.929951e-12,-0.724456,8.249036e-58,-0.928407,1,0,0,CTCF,down,TTC23_down,1
