In [1]:
'''Created on Jan 4, 2018

Merge TAD info, gene location and name info, and gexes across roadmap samples in a single dataframe.

Note that for the roadmap samples E001-E129, hg19 refseq genes were used (counting to ~27k). However, 
the annotation file is not used because the merged gex file has both the 
location of TSS+/-500bp and gene names present.
'''

'Created on Jan 4, 2018\n\nMerge TAD info, gene location and name info, and gexes across roadmap samples in a single dataframe.\n\nNote that for the roadmap samples E001-E129, hg19 refseq genes were used (counting to ~27k). However, \nthe annotation file is not used because the merged gex file has both the \nlocation of TSS+/-500bp and gene names present.\n'

In [2]:
import pandas as pd
import os
import re
import collections as col
import pybedtools
from pybedtools import BedTool as bedtools

# Input files, resolved to absolute paths relative to the current working directory:
#   csv_gex - merged expression table across roadmap samples
#   csv_tad - TAD domains in BED format
csv_gex, csv_tad = (
    os.path.abspath(rel_path)
    for rel_path in (
        "../pooling_roadmap_gexes/roadmap.rnase_imputed.LogRPKM.signal.txt",
        "../getting_tad_domains/tad_domains_nIs4celltypes_noCentrTelo.bed",
    )
)

In [3]:
# Load the tab-separated expression table; the first line of the file is the header row.
df_gex = pd.read_table(csv_gex, header=0)
print(df_gex.shape)

(27544, 129)


In [4]:
'''Get df_locs and bed_locs with gene loc info'''
# Each "loc" entry looks like "chr1:11373-12373"; split it on ":" or "-" and keep
# the chromosome as a string and the two coordinates as integers.
locs = []
for loc_str in df_gex["loc"].tolist():
    fields = re.split(":|-", loc_str)
    locs.append([fields[0], int(fields[1]), int(fields[2])])

# Same intervals as a DataFrame and as a BedTool for the intersection step.
df_locs = pd.DataFrame.from_records(locs, columns=["chrom", "tss_us", "tss_ds"])
bed_locs = bedtools.from_dataframe(df_locs)

In [5]:
'''Get the df_tads with tad_locs'''
# The TAD BED file has no header line, so the three column names are supplied here.
tad_columns = ["chrom", "tad_ss", "tad_es"]
df_tads = pd.read_table(csv_tad, header=None, names=tad_columns)
print(df_tads.shape)
df_tads.head(2)

(3081, 3)


Unnamed: 0,chrom,tad_ss,tad_es
0,chr1,10000,3460000
1,chr1,3460000,3720000


In [6]:
'''Helper function to compact list representation of location to string.'''
def compact_aloclist(aloc):
    """Turn a location sequence into a "chrom:start-end" string.

    aloc: sequence whose first three items are chromosome, start and end,
          e.g. ["chr1", 3242, 3525]. Items are converted with str().
    Returns the compact string, e.g. "chr1:3242-3525".
    """
    fields = list(map(str, aloc))
    return "{}:{}-{}".format(fields[0], fields[1], fields[2])

In [7]:
'''Get tss_loc to tad_loc map for each gene.'''
# Maps a TSS window string ("chrom:start-end") to the TAD string it intersects.
dict_loc_to_tad = col.OrderedDict()

# Iterate by position (enumerate + itertuples) instead of looking up
# df_tads.iloc[label]: the result no longer depends on df_tads having a default
# 0..n-1 index, and it avoids a per-row .iloc lookup.
for ix, tad_fields in enumerate(df_tads.itertuples(index=False, name=None)):
    # One-line BED record for this TAD: chrom <tab> start <tab> end
    bed_atad = bedtools("\t".join([str(x) for x in tad_fields]), from_string=True)

    # wb=True appends the original bed_locs record after the intersected interval,
    # so columns 4-6 of the result hold the full TSS window.
    bed_locs_in_atad = bed_atad.intersect(bed_locs, wb=True)
    df_locs_in_atad = pd.read_table(
        bed_locs_in_atad.fn,
        names=["chr", "ss_intersection", "es_intersection", "chrom", "ss", "es"],
    )
    df_locs_in_atad = df_locs_in_atad[["chrom", "ss", "es"]]

    atad = compact_aloclist(list(tad_fields))  # loc for the tad with aloc
    # NOTE(review): a TSS window reported for more than one TAD keeps only the
    # TAD processed last, because the dict entry is overwritten.
    for loc_fields in df_locs_in_atad.itertuples(index=False, name=None):
        aloc = compact_aloclist(list(loc_fields))  # loc for a tss+/-some kb
        dict_loc_to_tad[aloc] = atad

    # Progress marker every 200 TADs.
    if (ix % 200 == 0):
        print("tad index:{}".format(ix))

tad index:0
tad index:200
tad index:400
tad index:600
tad index:800
tad index:1000
tad index:1200
tad index:1400
tad index:1600
tad index:1800
tad index:2000
tad index:2200
tad index:2400
tad index:2600
tad index:2800
tad index:3000


In [20]:
# Number of TSS windows that received a TAD assignment.
print(len(dict_loc_to_tad))
# Same message as before, assembled from adjacent string literals.
print(
    (
        "Note: df_gex has its shape {}. So about 500 TSSes do not have their TAD info. "
        "These overlap the centro- and telo-mere regions that were removed in the "
        "TAD dataframe."
    ).format(df_gex.shape)
)

27068
Note: df_gex has its shape (27544, 129). So about 500 TSSes do not have their TAD info. These overlap the centro- and telo-mere regions that were removed in the TAD dataframe.


In [19]:
'''Include the tad list into the df_gex - which will merge the dfs'''
# TAD string for every row of df_gex, in row order; "-" marks a TSS with no TAD.
tads_inOrder = [dict_loc_to_tad.get(aloc, "-") for aloc in df_gex["loc"].tolist()]

# contains about 500 locs/tsses without TAD info
locs_woTads = [aloc for aloc in df_gex["loc"].tolist() if aloc not in dict_loc_to_tad]

df_gex["TAD_loc"] = tads_inOrder

In [33]:
'''Reorder the columns'''
# Put the three annotation columns first and keep every other column in its
# current order. Selecting the rest by name (rather than the positional slice
# [2:-1]) makes this cell safe to re-run: a second run of the positional version
# would duplicate "TAD_loc" and drop the last sample column.
lead_columns = ["geneName", "loc", "TAD_loc"]
sample_columns = [c for c in df_gex.columns.tolist() if c not in lead_columns]
df_gex = df_gex[lead_columns + sample_columns]

In [35]:
# Preview the first three rows to check the column order after reordering.
df_gex.head(3)

Unnamed: 0,geneName,loc,TAD_loc,E017_LNG.IMR90_IMR90,E002_ESC.WA7_ESC,E008_ESC.H9_ESC,E001_ESC.I3_ESC,E015_ESC.HUES6_ESC,E014_ESC.HUES48_ESC,E016_ESC.HUES64_ESC,...,E120_MUS.HSMM_ENCODE2012,E121_MUS.HSMMT_ENCODE2012,E122_VAS.HUVEC_ENCODE2012,E123_BLD.K562.CNCR_ENCODE2012,E124_BLD.CD14.MONO_ENCODE2012,E125_BRN.NHA_ENCODE2012,E126_SKIN.NHDFAD_ENCODE2012,E127_SKIN.NHEK_ENCODE2012,E128_LNG.NHLF_ENCODE2012,E129_BONE.OSTEO_ENCODE2012
0,DDX11L1,chr1:11373-12373,chr1:10000-3460000,112.19,103.55,83.26,80.15,76.86,65.11,106.22,...,88.75,101.48,105.5,212.38,300.45,81.02,91.73,98.23,124.92,88.48
1,MIR6859-2,chr1:16936-17936,chr1:10000-3460000,2423.37,2169.75,2051.7,2119.4,2058.41,2070.94,2215.21,...,2230.61,2041.39,2249.71,2783.41,3069.73,2173.7,2248.84,2402.39,2787.36,2168.68
2,WASH7P,chr1:28870-29870,chr1:10000-3460000,300.15,436.45,395.85,353.0,383.25,385.55,332.05,...,316.5,275.8,311.4,346.3,385.75,313.15,301.25,334.8,333.45,287.7


In [37]:
'''Save the merged df'''
# Tab-separated output with a header row and without the DataFrame index,
# written to the current working directory.
df_gex.to_csv(
    "roadmap.rnase_imputed.LogRPKM.signal.mergedWTADlocs.txt",
    sep="\t",
    header=True,
    index=False,
)

In [None]:
# - EOF -