In [1]:
import numpy as np
import os

In [2]:
# directory holding the input files, given relative to the current working directory
data_dir = "./data"

In [3]:
# path to the signal .bed file inside data_dir
signal_local_fn = os.path.join(data_dir, "WM20201125_multiDHS_domains.hg38.17.S1.bed")

In [4]:
import pandas as pd

In [5]:
# read the tab-separated .bed file into a pandas dataframe; the file has no
# header row, so columns get integer labels (note: the path ends in .bed, not .gz)
signal_df = pd.read_csv(signal_local_fn, sep='\t', header=None)

In [6]:
# preview the first five rows of the raw table
signal_df.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,chr1,0,200,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.47587
1,chr1,200,400,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.47587
2,chr1,400,600,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.47587
3,chr1,600,800,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.47587
4,chr1,800,1000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.47587


In [7]:
# columns 3-19 are 16 DHS factors and a NaN category

In [8]:
# name the first three positional columns; the remaining columns keep their integer labels
data = signal_df.rename(columns=dict(enumerate(["chromosome", "start", "end"])))

In [20]:
# integer labels of the signal columns (3 through 19 inclusive)
factors = list(range(3, 20))

In [23]:
# show the selected column labels
factors

[3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]

In [9]:
# preview the table after renaming the first three columns
data.head()

Unnamed: 0,chromosome,start,end,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,chr1,0,200,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.47587
1,chr1,200,400,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.47587
2,chr1,400,600,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.47587
3,chr1,600,800,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.47587
4,chr1,800,1000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.47587


In [10]:
%load_ext autoreload

In [11]:
%autoreload 2

In [26]:
import hilbertgenome

In [27]:
def aggregator(data):
    """Select the `factors` entries of `data`, sum them along axis 0, and return a NumPy array.

    Relies on the notebook-level `factors` list being defined before this is called.
    """
    selected = data[factors]
    totals = selected.sum(axis=0)
    return totals.to_numpy()

In [28]:
def accessor(d):
    """Return the `factors` entries of `d` as a NumPy array.

    The dtype handed to HilbertGenome should match the values produced here.
    Relies on the notebook-level `factors` list being defined before this is called.
    """
    selected = d[factors]
    return selected.to_numpy()

In [29]:
# build the HilbertGenome object from the renamed table and the two callbacks
hg = hilbertgenome.HilbertGenome(
    data=data,
    name="dhs_sum",
    aggregator=aggregator,
    accessor=accessor,
    # should match the values accessor() returns
    dtype="float32",
    # the rows previewed from the table each span 200 positions (0-200, 200-400, ...)
    signal_resolution=200,
    missing_value=0,
)

data width 17
order 6 individual False
order 7 individual False
order 8 individual False
order 9 individual False
order 10 individual False
order 11 individual False
order 12 individual True
order 13 individual True
order 14 individual True
order 15 individual True


In [33]:
# ask for the aggregation work items for chrX at order 6; the result is indexable,
# and its first element is a dict of keyword arguments (see the next cells)
params = hg.aggregate_chromosome("chrX", 6)

In [34]:
# inspect the first element of params
params[0]

{'chromosome': 'chrX',
 'order': 6,
 'hstart': 3813,
 'hstop': 4020,
 'gstart': 2875001522}

In [35]:
# run the aggregation for the first work item, unpacking its dict as keyword arguments
hg.aggregate_range_region(**params[0])

array([[ 0.0000000e+00,  0.0000000e+00,  4.4497334e+03, ...,
         0.0000000e+00,  0.0000000e+00,  9.0229538e+01],
       [ 9.7429840e+01,  0.0000000e+00,  8.0816782e+03, ...,
         0.0000000e+00,  0.0000000e+00, -1.0417406e+03],
       [ 0.0000000e+00,  0.0000000e+00,  3.2021868e+03, ...,
         1.2648690e+01,  0.0000000e+00, -1.5226178e+02],
       ...,
       [ 0.0000000e+00,  0.0000000e+00,  4.6277099e+00, ...,
         0.0000000e+00,  0.0000000e+00,  1.3544891e+03],
       [ 3.7038841e+01,  0.0000000e+00,  0.0000000e+00, ...,
         6.2563562e+02,  0.0000000e+00,  1.3717739e+03],
       [ 0.0000000e+00,  0.0000000e+00,  0.0000000e+00, ...,
         4.3365978e+01,  0.0000000e+00,  1.6628633e+03]], dtype=float32)

In [39]:
# generate the order-9 output for every chromosome; this is slow
# (the logged run took roughly 5-10 s per chromosome)
hg.generate_order(9)

generating 24 files
chr1 9 0 starting
chr1 9 0 done in: 10.645295143127441
chr2 9 21132 starting
chr2 9 21132 done in: 10.148818731307983
chr3 9 41690 starting
chr3 9 41690 done in: 8.408786058425903
chr4 9 58522 starting
chr4 9 58522 done in: 8.137749910354614
chr5 9 74668 starting
chr5 9 74668 done in: 7.735008001327515
chr6 9 90078 starting
chr6 9 90078 done in: 7.416348934173584
chr7 9 104577 starting
chr7 9 104577 done in: 6.897778034210205
chr8 9 118103 starting
chr8 9 118103 done in: 6.39188814163208
chr9 9 130422 starting
chr9 9 130422 done in: 6.0618040561676025
chr10 9 142170 starting
chr10 9 142170 done in: 5.944384813308716
chr11 9 153527 starting
chr11 9 153527 done in: 5.950679063796997
chr12 9 164994 starting
chr12 9 164994 done in: 5.856544017791748
chr13 9 176307 starting
chr13 9 176307 done in: 5.104655027389526
chr14 9 186014 starting
chr14 9 186014 done in: 4.8400959968566895
chr15 9 195101 starting
chr15 9 195101 done in: 4.623320817947388
chr16 9 203758 starting
c