# iGD: an integrated genomic data source


In [63]:
import os
import struct
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import glob, functools, tqdm, PIL
import time
from multiprocess import Pool
import _pickle as pkl

#0. Prepare:
# The genome is cut into fixed-width tiles ("blocks") of 2**12 = 4096 bp.
# fileBase is the base name of the .igd file and encodes that 12-bit width.
fileBase = "bb12"
nbp = 4096
# Number of tiles reserved for each chromosome, in the same order as `folder`.
nmax = [63748, 62292, 51012, 48924, 46648, 43932, 41008, 37444, 35400, 34432, 34820, 34304,
        29164, 27324, 26004, 23304, 21456, 20608, 15256, 16576, 11908, 12932, 39504, 14024]
# Chromosome names: chr1..chr22 followed by chrX and chrY.
folder = ['chr%d' % c for c in range(1, 23)] + ['chrX', 'chrY']
# gstart[c] is the global index of the first tile of chromosome c and
# gstart[24] is the total tile count. It is built as a fresh list so that
# nmax itself is left untouched.
gstart = [0]
for i in range(24):
    gstart.append(gstart[i] + nmax[i])
nTiles = gstart[24]
# g2ichr[t] is the chromosome index (0..23) that global tile t belongs to.
g2ichr = np.repeat(np.arange(24, dtype='uint8'), nmax)
# Resulting gstart:
#[0, 63748, 126040, 177052, 225976, 272624, 316556, 357564, 395008, 430408, 464840, 499660, 533964,
# 563128, 590452, 616456, 639760, 661216, 681824, 697080, 713656, 725564, 738496, 778000, 792024]

In [64]:
#Create the encode_tfbs binary data file (.igd): store the whole data set in a single file
def create_igd(file_path="/home/john/LOLA/LOLACore/hg19/encode_tfbs/", out_dir='igdata/'):
    """Build the single-file .igd database from a directory of region files.

    Reads ``index.txt`` from ``file_path`` and copies it to
    ``<out_dir>/igd_index.tsv``, then reads every file in
    ``<file_path>/regions/`` (tab-separated, no header) and distributes its
    records over the global tiles described by the module-level ``nbp``,
    ``gstart`` and ``nTiles``.

    Each record is stored as four native unsigned 32-bit integers
    ``(file index, column 1, column 2, column 4)``; columns 1 and 2 are used
    as the start and end coordinates (column 4 is presumably the BED score --
    confirm against the input files). A record that touches several tiles is
    duplicated in every one of them, including the tile that holds its end
    coordinate.

    Output layout of ``<out_dir>/<fileBase>.igd``: a header of ``nTiles``
    uint32 values giving the byte length of each tile, followed by the
    non-empty tiles in ascending tile order.

    Parameters
    ----------
    file_path : str
        Directory holding ``index.txt`` and the ``regions/`` sub-directory.
    out_dir : str
        Directory the index copy and the .igd file are written to; it is
        created when missing.
    """
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    #1. Read head info
    headInfo = pd.read_csv(os.path.join(file_path, "index.txt"), delimiter='\t')
    headInfo.to_csv(os.path.join(out_dir, 'igd_index.tsv'), sep='\t')

    #2. Read region data
    regions_dir = os.path.join(file_path, "regions")
    file_ids = sorted(next(os.walk(regions_dir))[2])

    pack = struct.Struct('IIII').pack      #4x4 bytes for fast pack/unpack
    tiles = {}                             #tile index -> bytearray of packed records
    for i, id_ in enumerate(tqdm.tqdm(file_ids)):
        regionData = pd.read_csv(os.path.join(regions_dir, id_), delimiter='\t', header=None)
        df = regionData.sort_values(by=[0, 1])   #first by chromosome name, then by start
        rc1 = df[1].values
        rc2 = df[2].values
        rc3 = df[4].values
        n1 = rc1//nbp                      #tile (within the chromosome) holding the start
        n2 = rc2//nbp-n1                   #number of additional tiles up to the end
        rchr, ridx, rcnt = np.unique(df[0].values, return_index=True, return_counts=True)
        for m in range(len(rchr)):
            name = rchr[m]
            if name == 'chrX':
                ichr = 22
            elif name == 'chrY':
                ichr = 23
            else:
                #skip names that carry no usable chromosome number (chrM, chrUn_..., *_random)
                try:
                    ichr = int(name[3:])-1
                except ValueError:
                    continue
                if not 0 <= ichr < len(gstart)-1:
                    continue
            first_tile = gstart[ichr]
            end_tile = gstart[ichr+1]      #one past the last tile of this chromosome
            for idx0 in range(ridx[m], ridx[m]+rcnt[m]):
                rec = pack(i, rc1[idx0], rc2[idx0], rc3[idx0])
                idx = n1[idx0]+first_tile
                #A record crossing tile boundaries is listed under every tile it
                #touches (duplicates), n2+1 tiles in total; never spill into the
                #next chromosome's tile range.
                last = min(idx+n2[idx0], end_tile-1)
                for tile in range(idx, last+1):
                    buf = tiles.get(tile)
                    if buf is None:
                        buf = tiles[tile] = bytearray()
                    buf += rec             #in-place append, no re-copy of the tile

    #save all in a single file
    t0 = time.time()
    #Header info: byte length of each tile--nTiles*4 bytes
    count = np.zeros(nTiles, dtype=np.uint32)
    for tile, buf in tiles.items():
        count[tile] = len(buf)
    with open(os.path.join(out_dir, fileBase+'.igd'), 'wb') as out:
        out.write(count.tobytes())         #tostring() no longer exists in NumPy 2
        for tile in sorted(tiles):
            out.write(tiles[tile])
    print('t_save=', time.time()-t0)

In [65]:
# Get the entire sets: for small igd within memory limit
def get_allRegionSets():
    """Load every record of ``igdata/<fileBase>.igd`` into memory.

    The file starts with a header of ``nTiles`` uint32 values, which is
    skipped; everything after it is decoded as consecutive records of four
    native unsigned 32-bit integers.

    Returns
    -------
    list of tuple
        One 4-tuple of ints per stored record, in file order.

    Raises
    ------
    struct.error
        If the file is shorter than its header or the payload is not a whole
        number of 16-byte records.
    """
    t0 = time.time()
    with open('igdata/' + fileBase + '.igd', 'rb') as file:
        data = file.read()
    #skip head: nTiles uint32 values
    i = nTiles*4
    if len(data) < i:
        raise struct.error('igd file is shorter than its %d-byte header' % i)
    #'IIII' keeps the records 16 bytes wide, so there is no alignment padding;
    #the memoryview avoids copying the whole payload just to drop the header
    igdata = list(struct.iter_unpack('IIII', memoryview(data)[i:]))
    dt0 = time.time()-t0
    print('time for get_allRegions:', dt0)
    return igdata

In [66]:
#build query set list from bed file: each list of query<-->each igdlist (block)
from operator import itemgetter
def get_igdlist(file_path):
    """Build the tile-indexed query list for a tab-separated region file.

    Column 0 is read as the chromosome name and columns 1 and 2 as the start
    and end coordinates. Every region is emitted once for each tile it
    touches (``end//nbp - start//nbp + 1`` tiles), using the module-level
    ``nbp`` and ``gstart`` to compute global tile indices.

    Chromosome names are mapped as chrX -> 22, chrY -> 23 and chr<N> -> N-1;
    names without a usable number (e.g. chrM, chrUn_*, *_random) are skipped.

    Parameters
    ----------
    file_path : str
        Path of the headerless, tab-separated query file.

    Returns
    -------
    numpy.ndarray
        ``uint32`` array of shape (n, 3) with rows ``(tile, start, end)``,
        ordered by tile; rows of the same tile keep their
        (chromosome, start) order. Shape (0, 3) when nothing qualifies.
    """
    regionData = pd.read_csv(file_path, delimiter='\t', header=None)
    df = regionData.sort_values(by=[0, 1])
    #work on positional arrays: the sorted frame still carries the original
    #index labels, and array access is much cheaper inside the loop
    starts = df[1].values
    ends = df[2].values
    n1 = starts//nbp
    n2 = ends//nbp-n1
    rchr, ridx, rcnt = np.unique(df[0].values, return_index=True, return_counts=True)
    igdlist = []
    for m in range(len(rchr)):
        name = rchr[m]
        if name == 'chrX':
            ichr = 22
        elif name == 'chrY':
            ichr = 23
        else:
            try:
                ichr = int(name[3:])-1
            except ValueError:
                continue
            if not 0 <= ichr < len(gstart)-1:
                continue
        base = gstart[ichr]
        for idx0 in range(ridx[m], ridx[m]+rcnt[m]):
            idx = n1[idx0] + base
            #one entry per touched tile; n2 == 0 gives exactly one entry
            for j in range(n2[idx0]+1):
                igdlist.append((idx+j, starts[idx0], ends[idx0]))
    if not igdlist:
        return np.empty((0, 3), dtype='uint32')
    igdlist = np.asarray(igdlist, dtype='uint32')
    #a single stable sort by tile replaces the earlier list sort + argsort
    return igdlist[igdlist[:, 0].argsort(kind='stable')]

In [67]:
#directly examine each block, attach chr info to the result (add tuple item +(100,))
def get_overlaps(igdlist):
    """Find the stored records that overlap the query regions.

    Only the tiles named in ``igdlist`` are read from
    ``igdata/<fileBase>.igd``; each query row is compared against every
    record of its tile, treating both intervals as closed.

    Parameters
    ----------
    igdlist : array-like of shape (n, 3)
        Rows ``(tile, start, end)``. Rows of the same tile must be adjacent,
        i.e. the array is ordered by its first column.

    Returns
    -------
    list of tuple
        One tuple per (query row, record) hit: the four stored record values
        followed by the chromosome index of the tile (from ``g2ichr``). A
        record stored in several tiles can be reported more than once.
    """
    t0 = time.time()
    igdlist = np.asarray(igdlist)
    if igdlist.size == 0:
        #nothing to query; avoids indexing a possibly 1-D empty array
        rblk = ridx = rcnt = np.empty(0, dtype=np.int64)
    else:
        rblk, ridx, rcnt = np.unique(igdlist[:,0], return_index=True, return_counts=True)
    nblocks = len(rblk)
    len0 = nTiles*4
    unpack_records = struct.Struct('IIII').iter_unpack
    overlaps = []
    with open('igdata/' + fileBase + '.igd', 'rb') as file:
        #header: byte length of every tile
        count = struct.unpack('%dI' % nTiles, file.read(len0))
        #mloc[t] = byte offset of tile t = header size + bytes of all earlier tiles
        mloc = len0 + np.concatenate(([0], np.cumsum(count, dtype=np.int64)))
        #-----------------------------------------------------------
        for m in range(nblocks):
            bk = int(rblk[m])
            if 0 <= bk < nTiles and count[bk] > 0:
                ichr = int(g2ichr[bk])
                file.seek(int(mloc[bk]))
                #list of tuples such as (234, 52312312, 52312612, 156)
                regiondb = list(unpack_records(file.read(count[bk])))
                #--find overlaps in this block
                for idx0 in range(ridx[m], ridx[m]+rcnt[m]):
                    q1 = int(igdlist[idx0, 1])
                    q2 = int(igdlist[idx0, 2])
                    for item in regiondb:
                        if not (q2 < item[1] or q1 > item[2]):
                            overlaps.append(item+(ichr,))
        #-----------------------------------------------------------
    print('nBlocks,', nblocks)
    print('time for get_overlaps:', time.time()-t0)
    return overlaps

In [69]:
#0. Create igd: only needed when the .igd file has to be (re)built,
#   so the call is left disabled here.
#create_igd()
# Load every stored record into memory and report how many there are.
igdata = get_allRegionSets()
print(len(igdata))

time for get_allRegions: 2.5122313499450684
13204602


In [89]:
from operator import itemgetter
# Query driver. The .igd file must already exist (run create_igd() once
# beforehand if it does not).
#1. Read a query region set from file (a small "test.bed" can be used instead)
file = "/home/john/LOLA/lola_vignette_data/setC_100.bed"
igdlist = get_igdlist(file)
#2. Collect the hits and order them by tuple items 0, 4 and 1
#   (file index, chromosome index, start)
overlaps = get_overlaps(igdlist)
overlaps.sort(key=itemgetter(0, 4, 1))
#3. Distinct values of item 1 (the record start) among the hits
unilaps = pd.DataFrame(overlaps)[1].unique()

nBlocks, 138
time for get_overlaps: 0.15026545524597168


In [90]:
# Sizes of the query list, the raw hit list and the distinct hit starts.
print(len(igdlist), len(overlaps), len(unilaps), sep='\n')
# Print entries 1..9 of the distinct start values as a sample.
for i in range(1, 10):
    print(i, unilaps[i])

138
2744
2454
1 35576437
2 45374400
3 26021708
4 45374459
5 26021714
6 35576303
7 35576503
8 15312174
9 45374461
