In [1]:
import os
import pandas as pd
import scanpy as sc

In [2]:
# Load the valid-gene table and derive lookups between its "ID" and "genes" columns.
valid_genes = pd.read_csv(
    "../../result/network_GTEx_phastcons/valid_genes",
    sep="\t",
)
id2genes = valid_genes.set_index("ID")["genes"].to_dict()  # "ID" value -> "genes" value
genes2id = valid_genes.set_index("genes")["ID"].to_dict()  # "genes" value -> "ID" value
# `genes` is the plain array of the "genes" column; later cells use it for membership tests.
genes = valid_genes["genes"].values

In [3]:
# Load the TIP metadata table and label each row with its TF:
# the part of "Experiment target" that precedes the first "-human".
meta = pd.read_csv("../../data/TIP/metadata.tsv", sep="\t")
meta['TF'] = meta['Experiment target'].str.split("-human", expand=True)[0]

# Build a gene_name -> gene_id lookup from the perturbation dataset's `var` table.
# reset_index() turns the var index into a column; the code reads it back as
# "gene_id", so the index is expected to carry that name.
data = sc.read_h5ad("../../data/perturb/K562_gwps_normalized_bulk_01.h5ad")
gene_name_table = data.var["gene_name"].to_frame().reset_index()
name2id = gene_name_table.set_index("gene_name")["gene_id"].to_dict()
meta['TFname'] = meta['TF'].map(name2id)

# Keep a row only if it passes all three criteria at once.
keep_row = (
    meta['TFname'].isin(genes)  # keep TF in genes only
    & (meta['Output type'] == "IDR thresholded peaks")
    & (meta['Biosample term name'] == "GM12878")  # filter for GM12878 only
)
meta = meta[keep_row]

In [4]:
# File stems (text before ".txt") present in the TIP weight directory.
all_files = {fname.split(".txt")[0] for fname in os.listdir("../../data/TIP/weight/")}
# File accessions that survived the metadata filtering.
sample_files = set(meta['File accession'])
# Equal counts mean every selected accession has a file in the weight directory
# (GM12878 is a subset of all).
print(len(sample_files), len(sample_files & all_files))

36 36


### Parse the results into a network

In [10]:
# Re-bind `meta` to the GTEx-side table, restricted to the file accessions selected earlier.
meta = pd.read_csv("../../data/TIP/meta_GTEx.tsv", sep="\t").loc[
    lambda table: table['File accession'].isin(sample_files)
]

In [11]:
# Map each TF to the union of the "name" values found across all of its q-value files.
# Each row of `meta` is unpacked into exactly two values, so the table must have
# two columns: the first is used as the file stem, the second as the TF key.
# NOTE(review): file availability was checked against ../../data/TIP/weight/ but the
# files are read from ../../data/TIP/qval/ — confirm both hold the same accessions.
result = {}
for fid, TF in meta.values:
    qval_table = pd.read_csv("../../data/TIP/qval/%s.txt" % fid, sep="\t")
    # setdefault creates the TF's set on first sight; update merges in the new names.
    result.setdefault(TF, set()).update(qval_table['name'].unique())

In [12]:
# Flatten the TF -> target-set mapping into two parallel lists, one entry per edge.
# Both comprehensions walk `result` in the same order, so source[i] pairs with target[i].
source = [tf for tf, hits in result.items() for _ in hits]
target = [hit for hits in result.values() for hit in hits]

In [13]:
# Edge table: column 0 holds the source (TF), column 1 the target.
net = pd.DataFrame({0: source, 1: target})

# Translate both endpoints through genes2id; names missing from the lookup become NaN.
for endpoint in (0, 1):
    net[endpoint] = net[endpoint].map(genes2id)

# Drop edges with an unmapped endpoint, then store the IDs as integers.
net = net.dropna().astype(int)

In [14]:
# De-duplicate the edges and order them by source, then target, before writing.
net = net.drop_duplicates()
net = net.sort_values(by=[0, 1])
# Written as a headerless, index-free, tab-separated two-column edge list.
net.to_csv(
    "../../result/network_GTEx_phastcons/ChIP_TIP_GM12878.tsv",
    sep="\t",
    header=False,
    index=False,
)