In [1]:
import os
import pandas as pd
import scanpy as sc

In [2]:
# Load the tab-separated table of valid genes; its "ID" and "genes" columns are used below.
gene_table = pd.read_csv("../../result/network_perturb_phyloP/valid_genes", sep="\t")

# Lookup dictionaries in both directions between the "genes" and "ID" columns.
genes2id = gene_table.set_index("genes")["ID"].to_dict()
id2genes = gene_table.set_index("ID")["genes"].to_dict()

# `genes` is the plain array of values from the "genes" column.
genes = gene_table["genes"].values

In [3]:
meta = pd.read_csv("../../data/TIP/metadata.tsv", sep="\t")

# TF name = the part of "Experiment target" that precedes "-human".
meta['TF'] = meta['Experiment target'].str.split("-human", expand=True)[0]

# Keep a row only if it passes all three filters:
#   - its TF is one of the valid genes,
#   - it is an "IDR thresholded peaks" output,
#   - its biosample is K562.
tf_is_valid = meta['TF'].isin(genes)
is_idr_peaks = meta['Output type'] == "IDR thresholded peaks"
is_k562 = meta['Biosample term name'] == "K562"
meta = meta[tf_is_valid & is_idr_peaks & is_k562]

In [4]:
# Names of everything in the weight directory, cut at the first ".txt".
all_files = {fname.split(".txt")[0] for fname in os.listdir("../../data/TIP/weight/")}

# File accessions that survived the metadata filters.
sample_files = set(meta['File accession'])

# The two printed counts are equal when every selected accession has a weight file.
# NOTE(review): presence is checked against weight/, while the per-file tables are
# later read from qval/ — confirm both directories hold the same accessions.
n_with_weight_file = len(sample_files & all_files)
print(len(sample_files), n_with_weight_file)

144 144


### Parse results into a network

In [5]:
# Re-bind `meta` to the perturbation metadata table, restricted to the accessions
# collected in `sample_files`.
perturb_meta = pd.read_csv("../../data/TIP/meta_perturb.tsv", sep="\t")
is_selected = perturb_meta['File accession'].isin(sample_files)
meta = perturb_meta[is_selected]

In [6]:
data = sc.read_h5ad("../../data/perturb/K562_gwps_normalized_bulk_01.h5ad")
# Map each `data.var` index label to its gene name, keeping only names present in `genes`.
eid2name = data.var.loc[data.var['gene_name'].isin(genes), 'gene_name'].to_dict()

# TF -> set of target gene names, pooled over every file listed for that TF.
result = {}
# Each row of `meta` is unpacked into exactly two values: the first is used as the
# file id, the second as the TF key (so `meta` must have exactly two columns).
for fid, TF in meta.values:
    df = pd.read_csv("../../data/TIP/qval/%s.txt" % fid, sep="\t")
    # Convert target identifiers to gene names; identifiers not in `eid2name`
    # (i.e. targets outside `genes`) become NaN and are dropped. Only the mapped
    # name column decides what is dropped, so a missing value in some unrelated
    # column of the file cannot silently remove a valid target.
    targets = df["name"].map(eid2name).dropna()
    result.setdefault(TF, set()).update(targets)

In [7]:
# Flatten the TF -> targets mapping into two parallel lists:
# source[k] is the TF and target[k] one of its targets.
source, target = [], []
for tf_name, tf_targets in result.items():
    for target_name in tf_targets:
        source.append(tf_name)
        target.append(target_name)

In [8]:
# Edge table: column 0 holds the source of each edge, column 1 its target.
# Built column-wise rather than as a transposed 2 x N frame, which would
# create one column per edge before transposing.
net = pd.DataFrame({0: source, 1: target})

# Translate gene names to IDs; names absent from `genes2id` become NaN.
net[0] = net[0].map(genes2id)
net[1] = net[1].map(genes2id)

# Drop edges with an unmapped endpoint, then cast the IDs to int.
net = net.dropna().copy()
net[0] = net[0].astype(int)
net[1] = net[1].astype(int)

In [9]:
# Remove repeated edges, order rows by column 0 then column 1, and write the
# result as a tab-separated file with no header row and no index column.
unique_edges = net.drop_duplicates()
net = unique_edges.sort_values(by=[0, 1])
net.to_csv(
    "../../result/network_perturb_phyloP/ChIP_TIP_K562.tsv",
    sep="\t",
    header=False,
    index=False,
)