In [1]:
import pandas as pd
import tqdm
import bioframe as bf
from collections import Counter
from multiprocessing import Pool
import numpy as np

In [2]:
# 0-based positions of the GTF columns to load, mapped to the names used in
# this notebook; positions 5 and 7 are not loaded.
gtf_columns = {
    0: "chrom",
    1: "source",
    2: "type",
    3: "start",
    4: "end",
    6: "strand",
    8: "data",
}
df = pd.read_csv(
    "gencode.vM25.basic.annotation.gtf.gz",
    sep="\t",
    skiprows=5,  # skip the first five lines (presumably the header comments -- confirm)
    header=None,
    names=list(gtf_columns.values()),
    usecols=list(gtf_columns.keys()),
    low_memory=False,
)

In [3]:
# Preview the first rows of the GTF table loaded above.
df.head()

Unnamed: 0,chrom,source,type,start,end,strand,data
0,chr1,HAVANA,gene,3073253,3074322,+,"gene_id ""ENSMUSG00000102693.1""; gene_type ""TEC..."
1,chr1,HAVANA,transcript,3073253,3074322,+,"gene_id ""ENSMUSG00000102693.1""; transcript_id ..."
2,chr1,HAVANA,exon,3073253,3074322,+,"gene_id ""ENSMUSG00000102693.1""; transcript_id ..."
3,chr1,ENSEMBL,gene,3102016,3102125,+,"gene_id ""ENSMUSG00000064842.1""; gene_type ""snR..."
4,chr1,ENSEMBL,transcript,3102016,3102125,+,"gene_id ""ENSMUSG00000064842.1""; transcript_id ..."


In [4]:
# Keep only the rows whose feature type is "gene" and renumber them from 0.
is_gene = df["type"] == "gene"
genes_df = df.loc[is_gene].reset_index(drop=True)
genes_df.head()

Unnamed: 0,chrom,source,type,start,end,strand,data
0,chr1,HAVANA,gene,3073253,3074322,+,"gene_id ""ENSMUSG00000102693.1""; gene_type ""TEC..."
1,chr1,ENSEMBL,gene,3102016,3102125,+,"gene_id ""ENSMUSG00000064842.1""; gene_type ""snR..."
2,chr1,HAVANA,gene,3205901,3671498,-,"gene_id ""ENSMUSG00000051951.5""; gene_type ""pro..."
3,chr1,HAVANA,gene,3252757,3253236,+,"gene_id ""ENSMUSG00000102851.1""; gene_type ""pro..."
4,chr1,HAVANA,gene,3365731,3368549,-,"gene_id ""ENSMUSG00000103377.1""; gene_type ""TEC..."


In [5]:
# Number of gene rows.
len(genes_df)

55401

In [6]:
# Build {gene_id: {"gene_name": ..., "gene_type": ...}} from the GTF attribute
# column ("data") of every gene row.
gene_metadata = {}
# Iterate over the attribute strings directly instead of fetching each row
# with .iloc; the progress bar still counts one step per gene.
for attribute_field in tqdm.tqdm(genes_df["data"], total=genes_df.shape[0]):
    attributes = {}
    # Attributes look like `key "value"; key value; ...`. Split on ';' first
    # and then on the first space only, so a value containing spaces stays
    # attached to its key instead of shifting every later key/value pair.
    for entry in attribute_field.split(";"):
        key, _sep, value = entry.strip().partition(" ")
        if key:
            # A key that occurs several times keeps its last value.
            attributes[key] = value.strip().strip('"')
    gene_metadata[attributes["gene_id"]] = {
        "gene_name": attributes["gene_name"],
        "gene_type": attributes["gene_type"],
    }

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 55401/55401 [00:02<00:00, 23015.05it/s]


In [7]:
# Select the transcript rows and renumber them from 0.
transcripts_df = df.loc[df["type"] == "transcript"].reset_index(drop=True)


def _first_attribute_value(attribute_field):
    """Return the value of the first ';'-separated attribute, quotes removed."""
    first_attribute = attribute_field.split(";")[0]
    return first_attribute.split()[1].replace('"', '')


# In the rows previewed in this notebook the first attribute is gene_id.
transcripts_df["gene_id"] = transcripts_df["data"].map(_first_attribute_value)

In [8]:
# (rows, columns) of the transcript table, including the added gene_id column.
transcripts_df.shape

(81540, 8)

In [9]:
# Preview the transcript rows together with the extracted gene_id column.
transcripts_df.head()

Unnamed: 0,chrom,source,type,start,end,strand,data,gene_id
0,chr1,HAVANA,transcript,3073253,3074322,+,"gene_id ""ENSMUSG00000102693.1""; transcript_id ...",ENSMUSG00000102693.1
1,chr1,ENSEMBL,transcript,3102016,3102125,+,"gene_id ""ENSMUSG00000064842.1""; transcript_id ...",ENSMUSG00000064842.1
2,chr1,HAVANA,transcript,3214482,3671498,-,"gene_id ""ENSMUSG00000051951.5""; transcript_id ...",ENSMUSG00000051951.5
3,chr1,HAVANA,transcript,3252757,3253236,+,"gene_id ""ENSMUSG00000102851.1""; transcript_id ...",ENSMUSG00000102851.1
4,chr1,HAVANA,transcript,3365731,3368549,-,"gene_id ""ENSMUSG00000103377.1""; transcript_id ...",ENSMUSG00000103377.1


In [10]:
# Build {transcript_id: {"gene_id": ..., "gene_name": ..., "gene_type": ...}}
# from the GTF attribute column ("data") of every transcript row.
transcript_metadata = {}
# Iterate over the attribute strings directly instead of fetching each row
# with .iloc; the progress bar still counts one step per transcript.
for attribute_field in tqdm.tqdm(transcripts_df["data"], total=transcripts_df.shape[0]):
    attributes = {}
    # Attributes look like `key "value"; key value; ...`. Split on ';' first
    # and then on the first space only, so a value containing spaces stays
    # attached to its key instead of shifting every later key/value pair.
    for entry in attribute_field.split(";"):
        key, _sep, value = entry.strip().partition(" ")
        if key:
            # A key that occurs several times keeps its last value.
            attributes[key] = value.strip().strip('"')
    transcript_metadata[attributes["transcript_id"]] = {
        "gene_id": attributes["gene_id"],
        "gene_name": attributes["gene_name"],
        "gene_type": attributes["gene_type"],
    }

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 81540/81540 [00:03<00:00, 20546.92it/s]


In [11]:
# Same preview as before: the metadata loop above only reads transcripts_df.
transcripts_df.head()

Unnamed: 0,chrom,source,type,start,end,strand,data,gene_id
0,chr1,HAVANA,transcript,3073253,3074322,+,"gene_id ""ENSMUSG00000102693.1""; transcript_id ...",ENSMUSG00000102693.1
1,chr1,ENSEMBL,transcript,3102016,3102125,+,"gene_id ""ENSMUSG00000064842.1""; transcript_id ...",ENSMUSG00000064842.1
2,chr1,HAVANA,transcript,3214482,3671498,-,"gene_id ""ENSMUSG00000051951.5""; transcript_id ...",ENSMUSG00000051951.5
3,chr1,HAVANA,transcript,3252757,3253236,+,"gene_id ""ENSMUSG00000102851.1""; transcript_id ...",ENSMUSG00000102851.1
4,chr1,HAVANA,transcript,3365731,3368549,-,"gene_id ""ENSMUSG00000103377.1""; transcript_id ...",ENSMUSG00000103377.1


In [12]:
def get_TSS(start, end, strand):
    """Return the transcription start site of a transcript as a (pos, pos) pair.

    Parameters
    ----------
    start, end
        Start and end coordinates of the transcript.
    strand
        Strand of the transcript; "+" selects ``start``, any other value
        selects ``end``.

    Returns
    -------
    tuple
        ``(start, start)`` for a "+" transcript, otherwise ``(end, end)``.

    NOTE(review): the pair describes a zero-length interval at the input
    coordinate -- confirm this matches the coordinate convention of the
    intervals it is later compared against.
    """
    tss = start if strand == "+" else end
    return tss, tss
        
# Compute the TSS of every transcript row; result_type="expand" turns the
# returned pair into two columns.
tss_frame = transcripts_df.apply(
    lambda row: get_TSS(row["start"], row["end"], row["strand"]),
    axis=1,
    result_type="expand",
)
transcripts_df[["TSS_start", "TSS_end"]] = tss_frame

In [13]:
# Preview the new TSS_start / TSS_end columns.
transcripts_df.head()

Unnamed: 0,chrom,source,type,start,end,strand,data,gene_id,TSS_start,TSS_end
0,chr1,HAVANA,transcript,3073253,3074322,+,"gene_id ""ENSMUSG00000102693.1""; transcript_id ...",ENSMUSG00000102693.1,3073253,3073253
1,chr1,ENSEMBL,transcript,3102016,3102125,+,"gene_id ""ENSMUSG00000064842.1""; transcript_id ...",ENSMUSG00000064842.1,3102016,3102016
2,chr1,HAVANA,transcript,3214482,3671498,-,"gene_id ""ENSMUSG00000051951.5""; transcript_id ...",ENSMUSG00000051951.5,3671498,3671498
3,chr1,HAVANA,transcript,3252757,3253236,+,"gene_id ""ENSMUSG00000102851.1""; transcript_id ...",ENSMUSG00000102851.1,3252757,3252757
4,chr1,HAVANA,transcript,3365731,3368549,-,"gene_id ""ENSMUSG00000103377.1""; transcript_id ...",ENSMUSG00000103377.1,3368549,3368549


In [14]:
# Make the TSS the "start"/"end" interval of each row; the full transcript
# span is kept under transcript_start / transcript_end.
# NOTE: running this cell twice renames the TSS columns again -- run it once.
tss_column_names = {
    "start": "transcript_start",
    "end": "transcript_end",
    "TSS_start": "start",
    "TSS_end": "end",
}
transcripts_df = transcripts_df.rename(columns=tss_column_names)

In [15]:
# Preview after the rename: "start"/"end" now hold the TSS position.
transcripts_df.head()

Unnamed: 0,chrom,source,type,transcript_start,transcript_end,strand,data,gene_id,start,end
0,chr1,HAVANA,transcript,3073253,3074322,+,"gene_id ""ENSMUSG00000102693.1""; transcript_id ...",ENSMUSG00000102693.1,3073253,3073253
1,chr1,ENSEMBL,transcript,3102016,3102125,+,"gene_id ""ENSMUSG00000064842.1""; transcript_id ...",ENSMUSG00000064842.1,3102016,3102016
2,chr1,HAVANA,transcript,3214482,3671498,-,"gene_id ""ENSMUSG00000051951.5""; transcript_id ...",ENSMUSG00000051951.5,3671498,3671498
3,chr1,HAVANA,transcript,3252757,3253236,+,"gene_id ""ENSMUSG00000102851.1""; transcript_id ...",ENSMUSG00000102851.1,3252757,3252757
4,chr1,HAVANA,transcript,3365731,3368549,-,"gene_id ""ENSMUSG00000103377.1""; transcript_id ...",ENSMUSG00000103377.1,3368549,3368549


In [16]:
# Count how many transcript ids are recorded for each gene id.
transcript_counts = Counter(
    record["gene_id"] for record in transcript_metadata.values()
)

In [17]:
# The ten genes with the most transcripts.
transcript_counts.most_common(10)

[('ENSMUSG00000052613.16', 28),
 ('ENSMUSG00000034101.14', 24),
 ('ENSMUSG00000053477.17', 24),
 ('ENSMUSG00000051331.16', 23),
 ('ENSMUSG00000063889.16', 23),
 ('ENSMUSG00000074813.14', 22),
 ('ENSMUSG00000069601.14', 22),
 ('ENSMUSG00000061723.18', 21),
 ('ENSMUSG00000062991.9', 21),
 ('ENSMUSG00000097451.11', 21)]

In [18]:
# Largest number of transcripts recorded for any single gene.
n_most = transcript_counts.most_common(1)[0][1]

In [19]:
summary_line = "A gene could have at most {} transcripts".format(n_most)
print(summary_line)

A gene could have at most 28 transcripts


In [20]:
# Load the cCRE table: a headerless, tab-separated file with six columns.
# NOTE(review): absolute path -- consider making the data location configurable.
ccre_columns = ["chrom", "start", "end", "rDHS", "cCRE", "cCRE_type"]
cCREs = pd.read_csv(
    "/data/projects/encode/Registry/V4/mm10/mm10-cCREs.bed",
    sep="\t",
    header=None,
    names=ccre_columns,
)
cCREs.head()

Unnamed: 0,chrom,start,end,rDHS,cCRE,cCRE_type
0,chr1,3012681,3012836,EM10D1036230,EM10E0932225,CA-CTCF
1,chr1,3035821,3035995,EM10D1036232,EM10E1235196,CA
2,chr1,3059627,3059790,EM10D1036235,EM10E1235197,CA
3,chr1,3062547,3062749,EM10D1036236,EM10E1235198,CA
4,chr1,3062789,3063138,EM10D1036237,EM10E1235199,CA


In [21]:
# Order both tables by chromosome and then by interval start (ascending),
# and renumber the rows from 0.
position_keys = ["chrom", "start"]
cCREs = cCREs.sort_values(by=position_keys).reset_index(drop=True)
transcripts_df = transcripts_df.sort_values(by=position_keys).reset_index(drop=True)

In [22]:
def get_n_closest(chrom, n_genes=3):
    """Return, for every cCRE on one chromosome, the rows of its closest distinct genes.

    Reads the notebook-level ``cCREs``, ``transcripts_df`` and ``n_most``.

    Parameters
    ----------
    chrom
        Chromosome name used to filter both tables.
    n_genes : int, default 3
        Number of distinct genes to keep per cCRE.

    Returns
    -------
    pandas.DataFrame
        Rows of the ``bf.closest`` result (columns suffixed ``_cCRE`` /
        ``_transcript`` plus ``distance``), at most ``n_genes`` per cCRE.
    """
    ccre_rows = cCREs[cCREs["chrom"] == chrom]
    tss_rows = transcripts_df[transcripts_df["chrom"] == chrom]
    # n_most is the largest transcript count of any gene, so asking for
    # n_genes * n_most neighbours leaves room for n_genes distinct genes even
    # when the nearest genes all have the maximum number of transcripts.
    pairs = bf.closest(
        ccre_rows,
        tss_rows,
        suffixes=["_cCRE", "_transcript"],
        k=n_genes * n_most,
    )
    # Keep one row per (cCRE, gene) pair: the first one listed.
    # NOTE(review): that is the gene's nearest TSS only if bf.closest lists
    # neighbours nearest-first -- confirm against the bioframe documentation.
    per_gene = pairs.drop_duplicates(subset=["cCRE_cCRE", "gene_id_transcript"])
    nearest = per_gene.groupby("cCRE_cCRE").distance.nsmallest(n_genes)
    # The last index level holds the row labels of `pairs`; they are labels,
    # not positions, so select with .loc rather than .iloc.
    row_labels = nearest.index.get_level_values(-1)
    return pairs.loc[row_labels]

In [23]:
def run_imap_multiprocessing(func, argument_list, num_processes):
    """Apply ``func`` to every item of ``argument_list`` in a process pool.

    Parameters
    ----------
    func
        Callable taking one item; it is sent to the worker processes.
    argument_list
        Sized collection of items (``len()`` is used for the progress bar).
    num_processes
        Number of worker processes.

    Returns
    -------
    list
        One result per item, in the order of ``argument_list``.
    """
    # Nothing to do; this also avoids Pool(processes=0), which raises
    # ValueError when the caller sizes the pool with len(argument_list).
    if len(argument_list) == 0:
        return []

    # The context manager shuts the pool down on exit, so no worker processes
    # are left behind. All results are consumed before the block ends.
    with Pool(processes=num_processes) as pool:
        results = pool.imap(func=func, iterable=argument_list)
        return list(tqdm.tqdm(results, total=len(argument_list)))

# One task per chromosome present in the cCRE table, one worker per task.
chroms = cCREs["chrom"].unique().tolist()
# NOTE: this rebinds `df` (until now the GTF table) to a list with one
# result table per chromosome.
df = run_imap_multiprocessing(
    func=get_n_closest,
    argument_list=chroms,
    num_processes=len(chroms),
)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 21/21 [00:57<00:00,  2.76s/it]


In [24]:
# Stack the per-chromosome tables into one frame ordered by cCRE position.
ccre_sort_keys = ["chrom_cCRE", "start_cCRE"]
df = pd.concat(df).sort_values(by=ccre_sort_keys).reset_index(drop=True)

In [25]:
# Preview the combined cCRE / closest-transcript table.
df.head()

Unnamed: 0,chrom_cCRE,start_cCRE,end_cCRE,rDHS_cCRE,cCRE_cCRE,cCRE_type_cCRE,chrom_transcript,source_transcript,type_transcript,transcript_start_transcript,transcript_end_transcript,strand_transcript,data_transcript,gene_id_transcript,start_transcript,end_transcript,distance
0,chr1,3012681,3012836,EM10D1036230,EM10E0932225,CA-CTCF,chr1,HAVANA,transcript,3073253,3074322,+,"gene_id ""ENSMUSG00000102693.1""; transcript_id ...",ENSMUSG00000102693.1,3073253,3073253,60417
1,chr1,3012681,3012836,EM10D1036230,EM10E0932225,CA-CTCF,chr1,ENSEMBL,transcript,3102016,3102125,+,"gene_id ""ENSMUSG00000064842.1""; transcript_id ...",ENSMUSG00000064842.1,3102016,3102016,89180
2,chr1,3012681,3012836,EM10D1036230,EM10E0932225,CA-CTCF,chr1,HAVANA,transcript,3252757,3253236,+,"gene_id ""ENSMUSG00000102851.1""; transcript_id ...",ENSMUSG00000102851.1,3252757,3252757,239921
3,chr1,3035821,3035995,EM10D1036232,EM10E1235196,CA,chr1,HAVANA,transcript,3073253,3074322,+,"gene_id ""ENSMUSG00000102693.1""; transcript_id ...",ENSMUSG00000102693.1,3073253,3073253,37258
4,chr1,3035821,3035995,EM10D1036232,EM10E1235196,CA,chr1,ENSEMBL,transcript,3102016,3102125,+,"gene_id ""ENSMUSG00000064842.1""; transcript_id ...",ENSMUSG00000064842.1,3102016,3102016,66021


In [26]:
# Lookup table from gene id to gene name.
gene_names = {
    gene_id: record["gene_name"] for gene_id, record in gene_metadata.items()
}

In [27]:
# Attach the gene name of each matched transcript's gene.
closest_gene_ids = df["gene_id_transcript"]
df["gene_name"] = closest_gene_ids.map(gene_names)

In [28]:
# Columns, in order, that make up the exported table.
columns = [
    "chrom_cCRE",
    "start_cCRE",
    "end_cCRE",
    "cCRE_cCRE",
    "rDHS_cCRE",
    "cCRE_type_cCRE",
    "gene_id_transcript",
    "gene_name",
    "start_transcript",
    "end_transcript",
    "distance",
]
df[columns].head()

Unnamed: 0,chrom_cCRE,start_cCRE,end_cCRE,cCRE_cCRE,rDHS_cCRE,cCRE_type_cCRE,gene_id_transcript,gene_name,start_transcript,end_transcript,distance
0,chr1,3012681,3012836,EM10E0932225,EM10D1036230,CA-CTCF,ENSMUSG00000102693.1,4933401J01Rik,3073253,3073253,60417
1,chr1,3012681,3012836,EM10E0932225,EM10D1036230,CA-CTCF,ENSMUSG00000064842.1,Gm26206,3102016,3102016,89180
2,chr1,3012681,3012836,EM10E0932225,EM10D1036230,CA-CTCF,ENSMUSG00000102851.1,Gm18956,3252757,3252757,239921
3,chr1,3035821,3035995,EM10E1235196,EM10D1036232,CA,ENSMUSG00000102693.1,4933401J01Rik,3073253,3073253,37258
4,chr1,3035821,3035995,EM10E1235196,EM10D1036232,CA,ENSMUSG00000064842.1,Gm26206,3102016,3102016,66021


In [29]:
# Write the selected columns as a tab-separated file without header or index.
closest_genes_table = df[columns]
closest_genes_table.to_csv(
    "mm10-cCREs-closest_genes.bed",
    sep="\t",
    index=False,
    header=False,
)