# iGD: an integrated genomic data source


In [19]:
import os
import struct
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import glob, functools, tqdm, PIL
import time
from multiprocess import Pool
import _pickle as pkl
import gzip
from operator import itemgetter
import gc

#-----------------------------------------------------------------
#Divide each HG chromosome into tiles of 16384 (2**14) base pairs
#
#chr1:  248,956,422+12,151,146-->15,937*16384
#chr2:  242,193,529+12,945,965-->15,573
#chr3:  198,295,559+10,638,715-->12,753
#chr4:  190,214,555+10,165,685-->12,231
#chr5:  181,538,259+ 9,519,995-->11,662
#chr6:  170,805,979+ 9,130,476-->10,983
#chr7:  159,345,973+ 8,613,298-->10,252
#chr8:  145,138,636+ 8,221,520--> 9,361
#chr9:  138,394,717+ 6,590,811--> 8,850
#chr10: 133,797,422+ 7,223,944--> 8,608
#chr11: 135,086,622+ 7,535,370--> 8,705
#chr12: 133,275,309+ 7,228,129--> 8,576
#chr13: 114,364,328+ 5,082,574--> 7,291
#chr14: 107,043,718+ 4,865,950--> 6,831
#chr15: 101,991,189+ 4,515,076--> 6,501
#chr16:  90,338,345+ 5,101,702--> 5,826
#chr17:  83,257,441+ 4,614,972--> 5,364
#chr18:  80,373,285+ 4,035,966--> 5,152
#chr19:  58,617,616+ 3,858,269--> 3,814
#chr20:  64,444,167+ 3,439,621--> 4,144
#chr21:  46,709,983+ 2,049,697--> 2,977
#chr22:  50,818,468+ 2,135,311--> 3,233
#chrX:  156,040,895+ 5,753,881--> 9,876
#chrY:   57,227,415+   211,643--> 3,506
#
#------------------------------------------------------------------------------------------------------------
# file/tile name base: blocksize 2**14=16384 bps
# Tile/file naming base: every tile (block) spans 2**14 = 16384 base pairs.
fileBase = "_b14_"          # tag used in file names for 14-bit blocks (tiles)
nbp = 16384
# Number of tiles reserved for each chromosome, in the same order as `folder`.
nmax = [15940, 15580, 12760, 12240, 11670, 10990, 10260, 9370, 8860, 8610, 8710,
        8580, 7300, 6840, 6510, 5830, 5370, 5160, 3820, 4150, 2980, 3240, 9880, 3510]
folder = ['chr1', 'chr2', 'chr3', 'chr4', 'chr5', 'chr6', 'chr7', 'chr8', 'chr9', 'chr10', 'chr11', 'chr12',
    'chr13', 'chr14', 'chr15', 'chr16', 'chr17', 'chr18', 'chr19', 'chr20', 'chr21', 'chr22', 'chrX', 'chrY']
# gstart[i] is the global index of the first tile of chromosome i (running
# sum of nmax with a leading 0); gstart[24] is the total number of tiles.
gstart = [0]
for ntile in nmax:
    gstart.append(gstart[-1] + ntile)
nTiles = gstart[24]        #198160
nTiles1 = nTiles-6
# g2ichr[b] is the chromosome index (0..23) that global tile b belongs to.
g2ichr = np.repeat(np.arange(24, dtype='uint8'), nmax)
#[0, 15940, 31520, 44280, 56520, 68190, 79180, 89440, 98810, 107670, 116280, 124990, 133570,
#140870, 147710, 154220, 160050, 165420, 170580, 174400, 178550, 181530, 184770, 194650, 198160]

In [20]:
#Append genomic object index file (head file)
#genomic object: annotation data sets (eg, ChIP_seq data)
def append_index(path, dbName, igDf):
    """Append the rows of an index table to ``<path>/<dbName>_index.tsv``.

    ``igDf`` is a pandas data frame; it is written tab-separated, without a
    header row, in append mode.
    """
    index_file = f"{path}/{dbName}_index.tsv"
    igDf.to_csv(index_file, sep='\t', header=False, mode='a')

#Append igd data
def append_igd(path, dbName, tmpData):
    """Append per-tile region data to the per-chromosome tile files.

    Each tile is stored in its own file
    ``<path>/<chromosome folder>/<dbName>_<fileBase>_<local tile index>.igd``,
    which is opened in binary append mode.

    Parameters
    ----------
    path : str
        Root directory that contains one sub-folder per chromosome (the
        names listed in ``folder``).
    dbName : str
        Data base name used as the file-name prefix.
    tmpData : indexable
        ``tmpData[ichr][m]`` is the data appended for chromosome ``ichr``
        (0-based) and tile ``m``.  NOTE(review): ``m`` is the *global* tile
        index, exactly as in the original code; the entries are presumably
        bytes-like blocks of packed records -- confirm against the caller.

    Tiles whose entry is ``None`` or empty are skipped, so no empty files
    are created for them.
    """
    for ichr, chrom in enumerate(folder):
        first_tile = gstart[ichr]
        for m in range(first_tile, gstart[ichr+1]):
            chunk = tmpData[ichr][m]
            if chunk is None or len(chunk) == 0:
                continue
            fname = path+'/'+chrom+'/'+dbName+'_'+fileBase+'_'+str(m-first_tile)+'.igd'
            # 'ab': the tile files hold packed binary records (they are read
            # back with 'rb' elsewhere in this file).
            with open(fname, 'ab') as file:
                file.write(chunk)
    return

In [21]:
#create a single file igd from bed (.gz) files 
def create_igd_w(ifilePath, ofilePath, dbName):
    """Build a single-file igd data base from a folder of tab-separated region files.

    Every regular file directly inside ``ifilePath`` is read with
    ``pd.read_csv(..., delimiter='\\t', header=None)``; column 0 is the
    chromosome name, columns 1 and 2 are the region start and end and,
    when the first file has at least 5 columns, column 4 is stored as the
    record value (otherwise 1 is stored).

    Each region becomes one 16-byte record ``struct.pack('IIII', file_index,
    start, end, value)``.  A region that spans several tiles is stored in
    every tile it touches.  Only chrX, chrY and ``chr<number>`` names that
    map to chromosome indices 0..23 are kept.

    Outputs
    -------
    ``<ofilePath><dbName>_index.tsv``
        One row per input file: file name, number of regions, mean region size.
    ``<ofilePath><dbName>_<fileBase>.igd``
        The records of all non-empty tiles in global tile order (sorted by
        region end inside a tile), followed by a table of ``nTiles`` uint32
        values holding the number of records of each tile.

    Both paths are concatenated as-is, so they must end with a path separator.

    Raises
    ------
    FileNotFoundError
        If ``ifilePath`` contains no files.
    """
    record = struct.Struct('IIII')     #16 bytes for fast pack/unpack

    #1. Read head info
    file_ids = next(os.walk(ifilePath))[2]
    file_ids.sort()
    n_files = len(file_ids)
    if n_files == 0:
        raise FileNotFoundError('no input files found in ' + ifilePath)
    # The column count of the first file decides, for all files, whether
    # column 4 is available as the record value.
    ncols = pd.read_csv(ifilePath+file_ids[0], delimiter='\t', header=None).shape[1]

    #2. Read region data
    nRegions = np.zeros(n_files, dtype='uint32')
    mRegion = np.zeros(n_files, dtype='uint32')
    count = np.zeros(nTiles, dtype=np.uint32)
    data = np.empty(nTiles, dtype=object)        #one bytearray per non-empty tile
    for i, id_ in tqdm.tqdm(enumerate(file_ids)):
        regionData = pd.read_csv(ifilePath + id_, delimiter='\t', header=None)
        df = regionData.sort_values(by=[0, 1])   #first by chromosome name, then by start
        rc1 = df[1].values
        rc2 = df[2].values
        if ncols<5:
            rc3 = np.ones(len(rc1), dtype='uint32')
        else:
            rc3 = df[4].values
        n1 = rc1//nbp                            #first tile of each region (within its chromosome)
        n2 = rc2//nbp-n1                         #number of additional tiles the region reaches into
        itmp = len(n1)
        nRegions[i] = itmp
        mRegion[i] = int(np.sum(df[2]-df[1])/itmp)
        # df is sorted by chromosome, so the rows of chromosome rchr[m] are
        # the rcnt[m] consecutive rows starting at position ridx[m].
        rchr, ridx, rcnt = np.unique(df[0].values, return_index=True, return_counts=True)
        for m in range(len(rchr)):
            ichr = -1
            if rchr[m] == 'chrX':
                ichr = 22
            elif rchr[m] == 'chrY':
                ichr = 23
            else:
                tmps = rchr[m][3:]
                if tmps.isdigit():
                    ichr = int(tmps)-1
            if not 0 <= ichr < 24:
                continue
            base = gstart[ichr]
            for idx0 in range(ridx[m], ridx[m]+rcnt[m]):
                rec = record.pack(i, rc1[idx0], rc2[idx0], rc3[idx0])
                first = n1[idx0]+base
                #if a record crosses the block boundary, list it under every block it touches
                for tile in range(first, first+n2[idx0]+1):
                    if tile<nTiles:
                        if data[tile] is None:
                            data[tile] = bytearray(rec)
                        else:
                            # bytearray grows in place; repeated bytes
                            # concatenation would copy the tile every time
                            data[tile] += rec

    #save the head info
    headInfo = {'File-id':file_ids, 'number_of_regions':nRegions, 'mean_region_size':mRegion}
    headInfo = pd.DataFrame(headInfo)
    headInfo.to_csv(ofilePath+dbName+'_index.tsv', sep='\t')
    #---------------------------------------------------------
    t0 = time.time()
    for m in range(nTiles):
        # number of records stored in tile m
        count[m] = 0 if data[m] is None else len(data[m])//record.size

    with open(ofilePath+dbName+'_'+fileBase+'.igd', 'wb') as file:
        for m in range(nTiles):
            if count[m]>0:
                #unpack, sort by region end, then repack
                tmpd = sorted(record.iter_unpack(data[m]), key=itemgetter(2))
                file.write(b''.join(record.pack(*rec) for rec in tmpd))
        #put count[] at the end
        file.write(count.tobytes())

    print('t_save=', time.time()-t0)

In [22]:
# Get specified block data: igdlist=[(1, 1008), (1, 3890), (6, 1010), (6, 2000), ....]
# return a list of m lists of tuples: should add chr info 
def get_regions(igdlist, dbPath, dbName):
    """Load the records of the requested tiles from the per-tile files.

    Parameters
    ----------
    igdlist : sequence of (ichr, k) pairs
        ``ichr`` indexes ``folder`` and ``k`` is the tile number used in the
        file name.
    dbPath : str
        Data base root; concatenated as-is, so it must end with a path
        separator.
    dbName : str
        Data base name used as the file-name prefix.

    Returns
    -------
    list of list of tuple
        One list of 4-integer records (unpacked with format ``'IIII'``) per
        tile file that exists.  Tiles without a file are skipped, so the
        result can be shorter than ``igdlist``.
    """
    t0 = time.time()
    tmpd = []
    for ichr, k in igdlist:
        fname = dbPath +folder[ichr]+'/'+dbName+'_' + fileBase+'_'+str(k)+'.igd'
        if os.path.exists(fname):
            # the with-block closes the file even if unpacking fails
            with open(fname, 'rb') as file:
                tmpd.append(list(struct.iter_unpack('IIII', file.read())))
    print('time for get_regions: ', time.time()-t0)
    return tmpd

In [23]:
# Get specified block data: igdlist=[(1, 1008), (1, 3890), (6, 1010), (6, 2000), ....]
def get_regions_w(igdlist, dbPath, dbName):
    """Load records for the requested blocks from ``<dbName>_<fileBase>.igd`` files.

    For every ``(ichr, k)`` pair in ``igdlist`` the file
    ``<dbPath><folder[ichr]>/<dbName>_<fileBase>.igd`` is read completely and
    unpacked into 4-integer records (format ``'IIII'``).

    The original code never opened the file it had just named (``file`` was
    undefined); the file is now opened explicitly.  Missing files are
    skipped, as in ``get_regions``.

    NOTE(review): ``k`` is not used to select a part of the file, so every
    pair of the same chromosome returns the whole file content -- confirm
    whether a per-tile slice was intended.

    Returns
    -------
    list of list of tuple
        One list of records per file that exists.
    """
    t0 = time.time()
    tmpd = []
    for ichr, k in igdlist:
        fname = dbPath +folder[ichr]+'/'+dbName+'_' + fileBase+'.igd'
        if os.path.exists(fname):
            with open(fname, 'rb') as file:
                tmpd.append(list(struct.iter_unpack('IIII', file.read())))
    print('time for get_regions: ', time.time()-t0)
    return tmpd

In [24]:
#build query set list from bed file: each list of query<-->each igdlist (block)
def get_igdlist(file_path):
    """Build the tile-indexed query list from a tab-separated region file.

    The file is read with ``header=None``; column 0 is the chromosome name
    and columns 1 and 2 are the region start and end.  A query that spans
    several tiles is listed once for every tile it touches.

    Only chrX, chrY and ``chr<number>`` names that map to chromosome indices
    0..23 are used; any other name (for example chrM or a scaffold name) is
    skipped instead of raising.

    Returns
    -------
    numpy.ndarray, dtype uint32, shape (n, 3)
        Rows of ``(global tile index, start, end)`` sorted by tile index.
        The shape is ``(0, 3)`` when no query is kept.
    """
    regionData = pd.read_csv(file_path, delimiter='\t', header=None)
    df = regionData.sort_values(by=[0, 1])   #df[i]--ith column not row
    df.reset_index(drop=True, inplace=True)  #df normally keeps the index!
    # plain arrays: positional access is much cheaper than df[col][row]
    starts = df[1].values
    ends = df[2].values
    n1 = starts//nbp
    n2 = ends//nbp-n1
    rchr, ridx, rcnt = np.unique(df[0].values, return_index=True, return_counts=True)
    igdlist = []
    for m in range(len(rchr)):
        name = rchr[m]
        if name == 'chrX':
            ichr = 22
        elif name == 'chrY':
            ichr = 23
        else:
            digits = name[3:]
            ichr = int(digits)-1 if digits.isdigit() else -1
        if not 0 <= ichr < 24:
            continue
        base = gstart[ichr]
        # rows of this chromosome are contiguous because df is sorted by name
        for idx0 in range(ridx[m], ridx[m]+rcnt[m]):
            first = n1[idx0]+base
            for tile in range(first, first+n2[idx0]+1):
                igdlist.append((tile, starts[idx0], ends[idx0]))
    igdlist.sort(key=itemgetter(0))
    # reshape keeps the result two-dimensional even when the list is empty
    igdlist = np.asarray(igdlist, dtype='uint32').reshape(-1, 3)
    return igdlist #sort

In [25]:
#directly examine each block, attach chr info to the result (add tuple item +(100,))
def get_overlaps(igdlist):
    """Find stored regions that overlap the queries, reading per-tile files.

    Parameters
    ----------
    igdlist : array of rows ``(global tile index, start, end)``
        Rows that share a tile index are processed together, so the tile
        file ``igdata/<chromosome folder>/bb14_<local tile index>.igd`` is
        read once per tile.  Tiles without a file are skipped.

    Returns
    -------
    list of tuple
        For every (query, record) pair whose closed intervals intersect, the
        4-integer record extended by the chromosome index.  A record is
        reported once per matching query and per tile it is stored in.

    The tile bound is ``nTiles`` (as in ``get_overlaps_w``) rather than the
    former hard-coded 198006, which no longer matches the tile layout.
    """
    t0 = time.time()
    # keeps an empty query list two-dimensional so the column slice works
    igdlist = np.asarray(igdlist).reshape(-1, 3)
    rblk, ridx, rcnt = np.unique(igdlist[:,0], return_index=True, return_counts=True)
    nblocks = len(rblk)
    overlaps = []
    for m in range(nblocks):
        bk = int(rblk[m])
        if bk >= nTiles:
            continue
        ichr = int(g2ichr[bk])
        k = bk - gstart[ichr]
        fname = 'igdata/'+folder[ichr]+'/bb14_'+str(k)+'.igd'
        if not os.path.exists(fname):
            continue
        with open(fname, 'rb') as file:
            regiondb = list(struct.iter_unpack('IIII', file.read()))
        #--find overlaps in this block
        for n in range(rcnt[m]):
            q, q1, q2 = igdlist[ridx[m]+n]
            for item in regiondb:   #list of tuples (234,52312312,52312612,156), (256,52307985,52308160,590)
                if q2 >= item[1] and q1 <= item[2]:
                    overlaps.append(item+(ichr,))

    print('nBlocks,', nblocks)
    print('time for get_overlaps:', time.time()-t0)
    return overlaps

In [26]:
#directly examine each block, attach chr info to the result (add tuple item +(100,))
#using np array in the block
def get_overlaps_w(igdlist, igdName):
    """Find stored regions that overlap the queries, reading a single-file igd.

    The file ``igdata/<igdName>.igd`` is read in the layout written by
    ``create_igd_w``: the 16-byte records (format ``'IIII'``) of all tiles in
    global tile order, followed by a table of ``nTiles`` uint32 values giving
    the number of records of each tile.  The previous version expected a
    table of byte counts at the start of the file, which does not match that
    writer.

    Parameters
    ----------
    igdlist : array of rows ``(global tile index, start, end)``
    igdName : str
        File name without the ``.igd`` extension.

    Returns
    -------
    list of tuple
        For every (query, record) pair whose closed intervals intersect, the
        4-integer record extended by the chromosome index.

    Raises
    ------
    ValueError
        If the file size does not agree with its tile-count table.
    """
    t0 = time.time()
    # keeps an empty query list two-dimensional so the column slice works
    igdlist = np.asarray(igdlist).reshape(-1, 3)
    rblk, ridx, rcnt = np.unique(igdlist[:,0], return_index=True, return_counts=True)
    nblocks = len(rblk)
    record = struct.Struct('IIII')
    len0 = nTiles*4                     #size in bytes of the count table
    overlaps = []
    with open('igdata/' + igdName + '.igd', 'rb') as file:
        fsize = file.seek(0, os.SEEK_END)
        if fsize < len0:
            raise ValueError('igd file is too small to hold the tile count table')
        file.seek(fsize-len0)
        count = np.frombuffer(file.read(len0), dtype=np.uint32)
        # mloc[b] is the byte offset of tile b; mloc[nTiles] is the end of the data
        mloc = np.concatenate(([0], np.cumsum(count, dtype=np.int64)))*record.size
        if int(mloc[nTiles])+len0 != fsize:
            raise ValueError('igd file size does not match its tile count table')
        #-----------------------------------------------------------
        for m in range(nblocks):
            bk = int(rblk[m])
            if bk<nTiles and count[bk]>0:
                ichr = int(g2ichr[bk])
                file.seek(int(mloc[bk]))
                regiondb = list(record.iter_unpack(file.read(int(count[bk])*record.size)))
                #--find overlaps in this block
                for n in range(rcnt[m]):
                    q, q1, q2 = igdlist[ridx[m]+n]
                    for item in regiondb:   #list of tuples (234,52312312,52312612,156)
                        if q2 >= item[1] and q1 <= item[2]:
                            overlaps.append(item+(ichr,))
    #-----------------------------------------------------------
    print('nBlocks,', nblocks)
    print('time for get_overlaps:', time.time()-t0)
    return overlaps

In [27]:
#0. Create the igd data base; both paths must end with '/' because they
#   are joined to file names by plain string concatenation.
ifilePath = "/media/john/CE30F6EE30F6DC81/roadmap_sort/"
ofilePath = "/media/john/CE30F6EE30F6DC81/roadmap_igd/"
create_igd_w(ifilePath=ifilePath, ofilePath=ofilePath, dbName="roadmap")

1005it [02:35,  6.48it/s]


t_save= 39.84963870048523
