In [1]:
import time
import pandas as pd
from pymed import PubMed

In [2]:
# Transcription-factor identifiers for this analysis. The visible cells below
# set `tf` by hand per section rather than iterating over this list.
tfs = ["ATF2", "CTCF", "FOXA1", "LEF1", "SCRT1", "TCF7L2"]

In [3]:
def read_tf_df(filename, libn):
    """Load a headerless, tab-separated peak table and label its 13 columns.

    Args:
        filename: Path of the tab-separated file to read (no header row).
        libn: Library / TF label used as the prefix of the library-specific
            column names (e.g. ``f"{libn}_act"``).

    Returns:
        pandas.DataFrame whose columns are named in the fixed order below.
    """
    column_labels = [
        "chrom",
        "start",
        "end",
        "chrom_coord",
        f"{libn}_act",
        "strand",
        f"{libn}_log2FoldChange_act",
        "gene_name",
        f"{libn}_log2FoldChange_exp",
        f"{libn}_padj_act",
        f"{libn}_padj_exp",
        "CC_peak",
        f"{libn}_peak",
    ]
    peaks = pd.read_csv(filename, sep="\t", header=None, names=column_labels)
    return peaks

def read_target_df(lib, targets_dir="/data5/deepro/starrseq/papers/results/6_link_da_enhancers_to_de_genes/data/targets"):
    """Read the tab-separated target table for one library.

    Args:
        lib: Library / TF name; the file read is ``{targets_dir}/{lib}.tsv``.
        targets_dir: Directory holding the per-library ``.tsv`` files.
            Defaults to the original hard-coded location so existing calls
            behave exactly as before; pass another directory to run the
            notebook against a different data checkout.

    Returns:
        pandas.DataFrame parsed from the file, using its first row as header.
    """
    filename = f"{targets_dir}/{lib}.tsv"
    return pd.read_csv(filename, sep="\t")
    
def get_tf_df(tf):
    """Load the four peak tables stored for one transcription factor.

    Args:
        tf: TF name; selects the sub-directory to read from and is passed to
            ``read_tf_df`` as the column-name prefix.

    Returns:
        Tuple of four DataFrames, in the order
        ``(direct, direct_loss, indirect, indirect_gained)``.
    """
    peak_dir = f"/data5/deepro/starrseq/papers/results/6_link_da_enhancers_to_de_genes/data/da_de_peaks/{tf}"
    categories = ("direct", "direct_loss", "indirect", "indirect_gained")
    frames = []
    for category in categories:
        # One .bed file per category, read in the order callers unpack them.
        frames.append(read_tf_df(f"{peak_dir}/{category}.bed", tf))
    return tuple(frames)

def get_genewise_count(gene_name):
    """Count the PubMed results returned for ``"<gene_name>[Title]"``.

    Uses the module-level ``pubmed`` client, which must be defined before this
    function is called.

    Args:
        gene_name: Gene symbol to search for.

    Returns:
        int: Number of results yielded by the query.
        NOTE(review): the query is issued with ``max_results=5000``, so the
        count presumably cannot exceed 5000 — confirm against pymed.
    """
    title_query = f"{gene_name}[Title]"
    articles = pubmed.query(title_query, max_results=5000)
    # Only the number of results matters, so count them as they are yielded.
    n_articles = 0
    for _ in articles:
        n_articles += 1
    return n_articles

def sort_by_gene_counts(df):
    """Look up a PubMed title-hit count for every unique gene in ``df``.

    Despite the name, nothing is sorted here: the result is a plain dict and
    callers do the ordering themselves.

    Args:
        df: DataFrame with a ``gene_name`` column.

    Returns:
        dict mapping each unique ``gene_name`` value to the count returned by
        ``get_genewise_count``.
    """
    counts_by_gene = {}
    for gene in df.gene_name.unique():
        counts_by_gene[gene] = get_genewise_count(gene)
        # Pause one second after each lookup before issuing the next one.
        time.sleep(1)
    return counts_by_gene

In [4]:
# Shared PubMed client; get_genewise_count reads this module-level name.
# NOTE(review): the tool/email values look like placeholders — confirm and
# replace with real contact details before running large batches.
pubmed = PubMed(tool="MyTool", email="my@email.address")

# ATF2

In [7]:
# ATF2: load its four peak tables. `tf` is reused by later cells in this
# section (read_target_df(tf)), so it must be set before they run.
tf = "ATF2"
direct, direct_loss, indirect, indirect_gained = get_tf_df(tf)


In [8]:
# Slow cell: one PubMed query plus a 1 s pause per unique gene in direct_loss.
dl_tf_counts = sort_by_gene_counts(direct_loss)


In [9]:
# Slow cell: one PubMed query plus a 1 s pause per unique gene in indirect_gained.
idg_tf_counts = sort_by_gene_counts(indirect_gained)


In [10]:
# Rank genes by PubMed title-hit count, highest first.
# dict(...) keeps this cell re-runnable: after the first run these names hold
# lists of (gene, count) pairs, which have no .items(); dict() accepts both a
# dict and such a list, and the stable sort leaves an already-sorted list as is.
dl_tf_counts = sorted(dict(dl_tf_counts).items(), key=lambda x:x[1], reverse=True)
idg_tf_counts = sorted(dict(idg_tf_counts).items(), key=lambda x:x[1], reverse=True)
# Target table for the current TF; its "Name.Target" column is used below.
target_df = read_target_df(tf)

In [11]:
# Lost (direct_loss) genes that also appear in target_df["Name.Target"],
# in the order of dl_tf_counts (highest PubMed title-hit count first).
[
    (gene, n_hits)
    for gene, n_hits in dl_tf_counts
    if gene in target_df["Name.Target"].values
]

[('FOXO1', 2110),
 ('G6PD', 1856),
 ('BRD4', 907),
 ('HK2', 765),
 ('BMP7', 638),
 ('TCF7L2', 630),
 ('NF2', 616),
 ('MYH9', 389),
 ('MDM4', 247),
 ('MEIS1', 230),
 ('MAPK1', 194),
 ('NSD1', 176),
 ('TACC3', 136),
 ('MBNL1', 114),
 ('SRSF3', 92),
 ('UNC5B', 81),
 ('CLP1', 60),
 ('LIMK2', 60),
 ('ANKRD11', 57),
 ('FAF1', 54),
 ('SSRP1', 51),
 ('TRIM33', 48),
 ('GAK', 47),
 ('ZNRF3', 47),
 ('MLLT3', 44),
 ('AIMP2', 41),
 ('ASXL2', 41),
 ('MTHFD1L', 32),
 ('FLNB', 31),
 ('SLBP', 27),
 ('CABIN1', 26),
 ('CECR2', 24),
 ('POGLUT1', 20),
 ('TMEM175', 20),
 ('UVSSA', 20),
 ('RNF40', 20),
 ('SLC38A9', 19),
 ('RPL3', 19),
 ('ZDHHC5', 17),
 ('TP53BP2', 13),
 ('TMEM39A', 12),
 ('KMT2E', 12),
 ('L3MBTL2', 11),
 ('STK17A', 10),
 ('C2CD3', 10),
 ('MAEA', 9),
 ('PIGG', 8),
 ('RNF187', 8),
 ('HM13', 8),
 ('SCFD1', 8),
 ('CLUAP1', 8),
 ('TMX2', 7),
 ('TIMMDC1', 7),
 ('GNA12', 7),
 ('TTLL5', 7),
 ('FNDC3A', 6),
 ('GSE1', 6),
 ('PPP2R5E', 6),
 ('RNF157', 6),
 ('OSBPL3', 6),
 ('TNKS1BP1', 5),
 ('PHC2', 4),

In [12]:
# Gained (indirect_gained) genes that also appear in target_df["Name.Target"],
# in the order of idg_tf_counts (highest PubMed title-hit count first).
[
    (gene, n_hits)
    for gene, n_hits in idg_tf_counts
    if gene in target_df["Name.Target"].values
]

[('ERCC1', 991), ('PSEN1', 286), ('SETX', 39), ('SEC14L1', 5), ('SLC9A7', 2)]

# CTCF

In [13]:
# CTCF: load its four peak tables. This rebinds `tf` and the four frame names
# used in the ATF2 section, so the cells below now operate on CTCF data.
tf = "CTCF"
direct, direct_loss, indirect, indirect_gained = get_tf_df(tf)

In [16]:
# Slow cell: one PubMed query plus a 1 s pause per unique gene in direct_loss.
dl_tf_counts = sort_by_gene_counts(direct_loss)


In [17]:
# Slow cell: one PubMed query plus a 1 s pause per unique gene in indirect_gained.
idg_tf_counts = sort_by_gene_counts(indirect_gained)


In [18]:
# Rank genes by PubMed title-hit count, highest first.
# dict(...) keeps this cell re-runnable: after the first run these names hold
# lists of (gene, count) pairs, which have no .items(); dict() accepts both a
# dict and such a list, and the stable sort leaves an already-sorted list as is.
dl_tf_counts = sorted(dict(dl_tf_counts).items(), key=lambda x:x[1], reverse=True)
idg_tf_counts = sorted(dict(idg_tf_counts).items(), key=lambda x:x[1], reverse=True)
# Target table for the current TF; its "Name.Target" column is used below.
target_df = read_target_df(tf)

In [19]:
# Lost (direct_loss) genes that also appear in target_df["Name.Target"],
# in the order of dl_tf_counts (highest PubMed title-hit count first).
[
    (gene, n_hits)
    for gene, n_hits in dl_tf_counts
    if gene in target_df["Name.Target"].values
]

[('SP1', 3659),
 ('MR1', 617),
 ('NRP1', 242),
 ('UBC', 139),
 ('CD109', 99),
 ('BMF', 84),
 ('UNC5B', 81),
 ('CHD2', 59),
 ('PTPA', 44),
 ('VCAN', 43),
 ('SMOC1', 27),
 ('CHN1', 23),
 ('LARP6', 21),
 ('GATAD2B', 16),
 ('EHBP1', 14),
 ('TP53BP2', 13),
 ('CCNY', 13),
 ('DNAJB12', 10),
 ('RNF187', 8),
 ('SIPA1L2', 6),
 ('SEMA4B', 6),
 ('ZNF638', 5),
 ('ERGIC1', 4),
 ('POLR3GL', 4),
 ('ZNF704', 3),
 ('USP54', 3),
 ('SLC22A23', 3),
 ('NDUFC1', 2),
 ('OSBPL6', 1),
 ('TTC23', 0)]

In [20]:
# Gained (indirect_gained) genes that also appear in target_df["Name.Target"],
# in the order of idg_tf_counts (highest PubMed title-hit count first).
[
    (gene, n_hits)
    for gene, n_hits in idg_tf_counts
    if gene in target_df["Name.Target"].values
]

[('MYH9', 389),
 ('MERTK', 265),
 ('SDC1', 44),
 ('EBF3', 37),
 ('STEAP3', 25),
 ('GNA13', 20),
 ('LRCH1', 8),
 ('PIN4', 6),
 ('MOB3A', 1),
 ('C19orf47', 0)]