In [1]:
import numpy as np

In [2]:
data_dir = "./data"

In [3]:
# Get the data file and store it locally
#
# ref. /net/seq/data/projects/Epilogos
#          /multivec-for-browser-2022-redo/epilogos_tracks/single/human
#          /Boix_et_al_833_sample/hg19/18/All_833_biosamples/S1/scores.txt.filledGap.versionSorted.txt.gz
#
import os
import requests
import io
signal_remote_URI = 'https://resources.altius.org/~areynolds/public/Boix_et_al_833_sample.hg19.18.All_833_biosamples.S1.scores.txt.gz'
signal_local_fn = os.path.join(data_dir, 'Boix_et_al_833_sample.hg19.18.All_833_biosamples.S1.scores.txt.gz')
if not os.path.exists(signal_local_fn):
    try:
        r = requests.get(signal_remote_URI)
        with open(signal_local_fn, "wb") as ofh:
            b = io.BytesIO(r.content)
            ofh.write(b.getbuffer())
    except requests.exceptions.RequestException as e:
        raise SystemExit(e)

In [4]:
import pandas as pd

In [5]:
# read gzip'd file into pandas dataframe
signal_df = pd.read_csv(signal_local_fn, sep='\t', header=None)

In [361]:
signal_df.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,13,14,15,16,17,18,19,20,max,sum
0,chr1,0,200,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.40997,17,0.40997
1,chr1,200,400,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.40997,17,0.40997
2,chr1,400,600,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.40997,17,0.40997
3,chr1,600,800,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.40997,17,0.40997
4,chr1,800,1000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.40997,17,0.40997


In [141]:
signal_df['max'] = signal_df.iloc[:, 3:21].idxmax(axis=1) - 3

In [363]:
# the data is actually empty for chrY
signal_df["max"] = np.where(signal_df[0] == "chrY", 17, signal_df["max"])

In [378]:
signal_df[signal_df[0] == "chrY"][2000:2020]["max"]

15183531    17
15183532    17
15183533    17
15183534    17
15183535    17
15183536    17
15183537    17
15183538    17
15183539    17
15183540    17
15183541    17
15183542    17
15183543    17
15183544    17
15183545    17
15183546    17
15183547    17
15183548    17
15183549    17
15183550    17
Name: max, dtype: int64

In [379]:
signal_df['sum'] = signal_df.iloc[:, 3:21].sum(axis=1)

In [415]:
data = signal_df[[0,1,2,"max","sum"]].rename(columns={0: "chromosome", 1: "start", 2: "end"})

In [416]:
data.head()

Unnamed: 0,chromosome,start,end,max,sum
0,chr1,0,200,17,0.40997
1,chr1,200,400,17,0.40997
2,chr1,400,600,17,0.40997
3,chr1,600,800,17,0.40997
4,chr1,800,1000,17,0.40997


10

In [12]:
%load_ext autoreload

In [24]:
%autoreload 2

In [25]:
import hilbertgenome

In [15]:
def aggregator(data):
    # our dtype will want to match the value here
    # return data[np.argmax(np.array([i['sum'] for i in data]))]['max']
    # return data[np.argmax(np.array([i[4] for i in data]))][3]
    idx = data["sum"].idxmax()
    return data.loc[idx]["max"]

In [17]:
def accessor(d):
    # our dtype will want to match the value here
    return d["max"]

In [540]:
hg = hilbertgenome.HilbertGenome(data=data, 
                                 name="cs_max2",
                                 aggregator=aggregator,
                                 accessor=accessor,
                                 dtype="int8",
                                 signal_resolution=200,
                                 missing_value=17
                                )

data width 1
order 6 individual False
order 7 individual False
order 8 individual False
order 9 individual False
order 10 individual False
order 11 individual False
order 12 individual True
order 13 individual True
order 14 individual True
order 15 individual True


In [541]:
hg.aggregate_chromosome("chr1", 5)

[{'chromosome': 'chr1', 'order': 5, 'hstart': 0, 'hstop': 82, 'gstart': 0}]

In [542]:
hg.generate_order(6)

generating 24 files
chr1 6 0 starting
chr1 6 0 done in: 4.491639137268066
chr2 6 330 starting
chr2 6 330 done in: 4.311007022857666
chr3 6 651 starting
chr3 6 651 done in: 3.677361011505127
chr4 6 914 starting
chr4 6 914 done in: 3.505523204803467
chr5 6 1166 starting
chr5 6 1166 done in: 3.3581840991973877
chr6 6 1407 starting
chr6 6 1407 done in: 3.2263801097869873
chr7 6 1634 starting
chr7 6 1634 done in: 3.040771245956421
chr8 6 1845 starting
chr8 6 1845 done in: 2.815991163253784
chr9 6 2037 starting
chr9 6 2037 done in: 2.691575050354004
chr10 6 2221 starting
chr10 6 2221 done in: 2.6124508380889893
chr11 6 2398 starting
chr11 6 2398 done in: 2.6677801609039307
chr12 6 2578 starting
chr12 6 2578 done in: 2.6171629428863525
chr13 6 2754 starting
chr13 6 2754 done in: 2.3349878787994385
chr14 6 2906 starting
chr14 6 2906 done in: 2.2156968116760254
chr15 6 3048 starting
chr15 6 3048 done in: 2.1383450031280518
chr16 6 3183 starting
chr16 6 3183 done in: 1.961364984512329
chr17 6 33

In [87]:
# hg.generate_order_parallel(6)

In [143]:
files = hg.files_for_order(6)

In [404]:
import multiprocess as mp
import time

In [405]:
from os import fork, getpid


In [406]:
def test(order):
    print(order)
    hg.generate_order(order)
    print("done with order", order)

In [407]:
workers = mp.cpu_count() - 1
print("workers", workers)

orders = range(6, 12)
start_time = time.time()
with mp.Pool(workers) as p:
    p.map(hg.generate_order, orders)
    
print("done in: %s" % (time.time() - start_time))
# with mp.Pool(workers) as p:
#     p.map(hg.aggregate_parallel, files)

workers 9
98671011





generatinggeneratinggeneratinggeneratinggeneratinggenerating   24 24  2424 24 24  files  filesfilesfiles

filesfiles

{'chromosome': 'chr1', 'order': 6, 'hstart': 0, 'hstop': 330, 'gstart': 0, 'individual': False, 'dtype': 'int8'}{'chromosome': 'chr1', 'order': 8, 'hstart': 0, 'hstop': 5283, 'gstart': 0, 'individual': False, 'dtype': 'int8'}

{'chromosome': 'chr1', 'order': 7, 'hstart': 0, 'hstop': 1320, 'gstart': 0, 'individual': False, 'dtype': 'int8'}{'chromosome': 'chr1', 'order': 9, 'hstart': 0, 'hstop': 21132, 'gstart': 0, 'individual': False, 'dtype': 'int8'}


{'chromosome': 'chr1', 'order': 11, 'hstart': 0, 'hstop': 338117, 'gstart': 0, 'individual': False, 'dtype': 'int8'}{'chromosome': 'chr1', 'order': 10, 'hstart': 0, 'hstop': 84529, 'gstart': 0, 'individual': False, 'dtype': 'int8'}


chr1chr1chr1chr1chr1chr1      11691078      000000    starting  startingstartingstartingstartingstarting    1244819  124478212449961244790
12462541244996




chr1 6 0 

chr19 8 56329 starting 293315
chr21 6 3683 done in: 0.7815449237823486
{'chromosome': 'chr22', 'order': 6, 'hstart': 3745, 'hstop': 3813, 'gstart': 2824183054, 'individual': False, 'dtype': 'int8'}
chr14 9 186014 starting 535265
chr22 6 3745 starting 256523
chr21 7 14735 done in: 0.7913558483123779
{'chromosome': 'chr22', 'order': 7, 'hstart': 14982, 'hstop': 15252, 'gstart': 2824183054, 'individual': False, 'dtype': 'int8'}
chr19 8 56329 done in: 1.0687658786773682
{'chromosome': 'chr20', 'order': 8, 'hstart': 57573, 'hstop': 58940, 'gstart': 2713028904, 'individual': False, 'dtype': 'int8'}
chr22 7 14982 starting 254505
chr22 6 3745 done in: 0.8405299186706543
{'chromosome': 'chrX', 'order': 6, 'hstart': 3813, 'hstop': 4020, 'gstart': 2875001522, 'individual': False, 'dtype': 'int8'}
chr20 8 57573 starting 315128
chrX 6 3813 starting 776353
chr22 7 14982 done in: 0.8507740497589111
{'chromosome': 'chrX', 'order': 7, 'hstart': 15252, 'hstop': 16080, 'gstart': 2875001522, 'individual':

{'chromosome': 'chr12', 'order': 10, 'hstart': 659977, 'hstop': 705228, 'gstart': 1943767673, 'individual': False, 'dtype': 'int8'}
chr12 10 659977 starting 666376
chr12 10 659977 done in: 5.501842975616455
{'chromosome': 'chr13', 'order': 10, 'hstart': 705228, 'hstop': 744059, 'gstart': 2077042982, 'individual': False, 'dtype': 'int8'}
chr13 10 705228 starting 571826
chr13 10 705228 done in: 4.729810953140259
{'chromosome': 'chr14', 'order': 10, 'hstart': 744059, 'hstop': 780404, 'gstart': 2191407310, 'individual': False, 'dtype': 'int8'}
chr14 10 744059 starting 535221
chr14 10 744059 done in: 4.419811248779297
{'chromosome': 'chr15', 'order': 10, 'hstart': 780404, 'hstop': 815034, 'gstart': 2298451028, 'individual': False, 'dtype': 'int8'}
chr15 10 780404 starting 509965
chr4 11 936363 done in: 22.15097689628601
{'chromosome': 'chr5', 'order': 11, 'hstart': 1194701, 'hstop': 1441256, 'gstart': 879660065, 'individual': False, 'dtype': 'int8'}
chr5 11 1194701 starting 904577
chr15 10 

In [408]:
workers = mp.cpu_count() - 1
print("workers", workers)

orders = range(12, 17)
start_time = time.time()
with mp.Pool(workers) as p:
    p.map(hg.generate_order, orders)
    
print("done in: %s" % (time.time() - start_time))
# with mp.Pool(workers) as p:
#     p.map(hg.aggregate_parallel, files)

workers 9
generating 24 files
{'chromosome': 'chr1', 'order': 12, 'hstart': 0, 'hstop': 1352471, 'gstart': 0, 'individual': True, 'dtype': 'int8'}
chr1 12 0 starting 1244782
generating 27 files
{'chromosome': 'chr1', 'order': 13, 'hstart': 0, 'hstop': 4194304, 'gstart': 0, 'individual': True, 'dtype': 'int8'}
chr1 13 0 starting 965084
chr1 12 0 done in: 22.319047927856445
{'chromosome': 'chr2', 'order': 12, 'hstart': 1352471, 'hstop': 2668202, 'gstart': 248956422, 'individual': True, 'dtype': 'int8'}
chr2 12 1352471 starting 1210968
generating 77 files
{'chromosome': 'chr1', 'order': 14, 'hstart': 0, 'hstop': 4194304, 'gstart': 0, 'individual': True, 'dtype': 'int8'}
chr1 14 0 starting 241271
chr1 14 0 done in: 4.435200929641724
{'chromosome': 'chr1', 'order': 14, 'hstart': 4194304, 'hstop': 8388608, 'gstart': 0, 'individual': True, 'dtype': 'int8'}
 
241270starting
{'chromosome': 'chr1', 'order': 13, 'hstart': 4194304, 'hstop': 5409884, 'gstart': 0, 'individual': True, 'dtype': 'int8'

chr1 16 138412032 starting 15079
chr1 16 138412032 done in: 0.27614569664001465
{'chromosome': 'chr1', 'order': 16, 'hstart': 142606336, 'hstop': 146800640, 'gstart': 0, 'individual': True, 'dtype': 'int8'}
chr3 13 10672809 done in: 17.640193939208984
{'chromosome': 'chr3', 'order': 13, 'hstart': 14867113, 'hstop': 14981820, 'gstart': 491149951, 'individual': True, 'dtype': 'int8'}
chr2 15 99141061 done in: 1.1125400066375732
{'chromosome': 'chr2', 'order': 15, 'hstart': 103335365, 'hstop': 107529669, 'gstart': 248956422, 'individual': True, 'dtype': 'int8'}
chr1 16 142606336 starting 15078
chr3 13 14867113 starting 25028
chr1 16 142606336 done in: 0.2796900272369385
{'chromosome': 'chr1', 'order': 16, 'hstart': 146800640, 'hstop': 150994944, 'gstart': 0, 'individual': True, 'dtype': 'int8'}
chr2 15 103335365 starting 60316
chr3 13 14867113 done in: 0.46367692947387695
{'chromosome': 'chr4', 'order': 13, 'hstart': 14981820, 'hstop': 19115229, 'gstart': 689445510, 'individual': True, 'd

{'chromosome': 'chr1', 'order': 16, 'hstart': 322961408, 'hstop': 327155712, 'gstart': 0, 'individual': True, 'dtype': 'int8'}
chr4 14 72510194 starting 227258
chr1 16 322961408 starting 15078
chr1 chr316  15322961408  done in: 0.28290367126464844187542166
 {'chromosome': 'chr1', 'order': 16, 'hstart': 327155712, 'hstop': 331350016, 'gstart': 0, 'individual': True, 'dtype': 'int8'}done in: 1.1147639751434326

{'chromosome': 'chr3', 'order': 15, 'hstart': 191736470, 'hstop': 195930774, 'gstart': 491149951, 'individual': True, 'dtype': 'int8'}
chr1 16 327155712 starting 15078
chr3 15 191736470 starting 60317
chr1 16 327155712 done in: 0.2795140743255615
{'chromosome': 'chr1', 'order': 16, 'hstart': 331350016, 'hstop': 335544320, 'gstart': 0, 'individual': True, 'dtype': 'int8'}
chr5 13 19115229 done in: 16.55770993232727
{'chromosome': 'chr6', 'order': 13, 'hstart': 23060100, 'hstop': 26771756, 'gstart': 1061198324, 'individual': True, 'dtype': 'int8'}
chr1 16 331350016 starting 15079
ch

{'chromosome': 'chr2', 'order': 16, 'hstart': 497227543, 'hstop': 501421847, 'gstart': 248956422, 'individual': True, 'dtype': 'int8'}
chr8 13 30234384 starting 725693
chr2 16 497227543 starting 15079
chr4 15 273263563 done in: 1.1085009574890137
{'chromosome': 'chr4', 'order': 15, 'hstart': 277457867, 'hstop': 281652171, 'gstart': 689445510, 'individual': True, 'dtype': 'int8'}
chr2 16 497227543 done in: 0.278609037399292
{'chromosome': 'chr2', 'order': 16, 'hstart': 501421847, 'hstop': 505616151, 'gstart': 248956422, 'individual': True, 'dtype': 'int8'}
chr4 15 277457867 starting 60317
chr2 16 501421847 starting 15078
chr2 16 501421847 done in: 0.2812461853027344
{'chromosome': 'chr2', 'order': 16, 'hstart': 505616151, 'hstop': 509810455, 'gstart': 248956422, 'individual': True, 'dtype': 'int8'}
chr2 16 505616151 starting 15079
chr2 16 505616151 done in: 0.2798781394958496
{'chromosome': 'chr2', 'order': 16, 'hstart': 509810455, 'hstop': 514004759, 'gstart': 248956422, 'individual': 

chr2 16 673388311 done in: 0.27682924270629883
{'chromosome': 'chr2', 'order': 16, 'hstart': 677582615, 'hstop': 681776919, 'gstart': 248956422, 'individual': True, 'dtype': 'int8'}
chr5 15 360369629 done in: 1.11393404006958
{'chromosome': 'chr5', 'order': 15, 'hstart': 364563933, 'hstop': 368758237, 'gstart': 879660065, 'individual': True, 'dtype': 'int8'}
chr2 16 677582615 starting 15079
chr2 16 677582615 done in: 0.27883386611938477
{'chromosome': 'chr2', 'order': 16, 'hstart': 681776919, 'hstop': 683059801, 'gstart': 248956422, 'individual': True, 'dtype': 'int8'}
chr5 15 364563933 starting 60128
chr2 16 681776919 starting 4611
chr2 16 681776919 done in: 0.08899879455566406
{'chromosome': 'chr3', 'order': 16, 'hstart': 683059801, 'hstop': 687254105, 'gstart': 491149951, 'individual': True, 'dtype': 'int8'}
chr3 16 683059801 starting 15079
chr5 15 364563933 done in: 1.0977060794830322
{'chromosome': 'chr5', 'order': 15, 'hstart': 368758237, 'hstop': 368961614, 'gstart': 879660065, 

{'chromosome': 'chr7', 'order': 15, 'hstart': 445125326, 'hstop': 449319630, 'gstart': 1232004303, 'individual': True, 'dtype': 'int8'}
chr10 14 153971142 done in: 3.4286978244781494
{'chromosome': 'chr11', 'order': 14, 'hstart': 157212338, 'hstop': 161406642, 'gstart': 1808681051, 'individual': True, 'dtype': 'int8'}
chr3 16 846637657 starting 15078
chr7 15 445125326 starting 60316
chr3 16 846637657 done in: 0.2766742706298828
{'chromosome': 'chr3', 'order': 16, 'hstart': 850831961, 'hstop': 855026265, 'gstart': 491149951, 'individual': True, 'dtype': 'int8'}
chr11 14 157212338 starting 241271
chr3 16 850831961 starting 15079
chr3 16 850831961 done in: 0.27701520919799805
{'chromosome': 'chr3', 'order': 16, 'hstart': 855026265, 'hstop': 859220569, 'gstart': 491149951, 'individual': True, 'dtype': 'int8'}
chr7 15 445125326 done in: 1.1382219791412354
{'chromosome': 'chr7', 'order': 15, 'hstart': 449319630, 'hstop': 453513934, 'gstart': 1232004303, 'individual': True, 'dtype': 'int8'}
c

{'chromosome': 'chr13', 'order': 14, 'hstart': 184732926, 'hstop': 188927230, 'gstart': 2077042982, 'individual': True, 'dtype': 'int8'}
chr4 16 1017556782 starting 15078
chr13 14 184732926 starting 241270
chr4 16 1017556782 done in: 0.275270938873291
{'chromosome': 'chr4', 'order': 16, 'hstart': 1021751086, 'hstop': 1025945390, 'gstart': 689445510, 'individual': True, 'dtype': 'int8'}
chr8 15 525693186 done in: 1.1303369998931885
{'chromosome': 'chr8', 'order': 15, 'hstart': 529887490, 'hstop': 534081794, 'gstart': 1391350276, 'individual': True, 'dtype': 'int8'}
chr4 16 1021751086 starting 15079
chr8 15 529887490 starting 60317
chr4 16 1021751086 done in: 0.27285003662109375
{'chromosome': 'chr4', 'order': 16, 'hstart': 1025945390, 'hstop': 1030139694, 'gstart': 689445510, 'individual': True, 'dtype': 'int8'}
chr4 16 1025945390 starting 15078
chr4 16 1025945390 done in: 0.27537012100219727
{'chromosome': 'chr4', 'order': 16, 'hstart': 1030139694, 'hstop': 1034333998, 'gstart': 689445

chr9 15 563572647 done in: 1.1105670928955078
{'chromosome': 'chr9', 'order': 15, 'hstart': 567766951, 'hstop': 571961255, 'gstart': 1536488912, 'individual': True, 'dtype': 'int8'}
chr4 16 1101442862 starting 15078
chr9 15 567766951 starting 60316
chr4 16 1101442862 done in: 0.28015899658203125
{'chromosome': 'chr4', 'order': 16, 'hstart': 1105637166, 'hstop': 1109831470, 'gstart': 689445510, 'individual': True, 'dtype': 'int8'}
chr14 14 194673586 done in: 4.43839693069458
{'chromosome': 'chr14', 'order': 14, 'hstart': 198867890, 'hstop': 199783627, 'gstart': 2191407310, 'individual': True, 'dtype': 'int8'}
chr4 16 1105637166 starting 15078
chr14 14 198867890 starting 52675
chr4 16 1105637166 done in: 0.2831296920776367
{'chromosome': 'chr4', 'order': 16, 'hstart': 1109831470, 'hstop': 1114025774, 'gstart': 689445510, 'individual': True, 'dtype': 'int8'}
chr9 15 567766951 done in: 1.1174800395965576
{'chromosome': 'chr9', 'order': 15, 'hstart': 571961255, 'hstop': 576155559, 'gstart':

{'chromosome': 'chr5', 'order': 16, 'hstart': 1256929142, 'hstop': 1261123446, 'gstart': 879660065, 'individual': True, 'dtype': 'int8'}
chr5 16 1256929142 starting 15078
chr5 16 1256929142 done in: 0.27890586853027344
{'chromosome': 'chr5', 'order': 16, 'hstart': 1261123446, 'hstop': 1265317750, 'gstart': 879660065, 'individual': True, 'dtype': 'int8'}
chr11 15 641432266 done in: 1.1229820251464844
{'chromosome': 'chr11', 'order': 15, 'hstart': 645626570, 'hstop': 649820874, 'gstart': 1808681051, 'individual': True, 'dtype': 'int8'}
chr17 14 220695404 done in: 3.0141520500183105
{'chromosome': 'chr18', 'order': 14, 'hstart': 223737918, 'hstop': 227932222, 'gstart': 2574038003, 'individual': True, 'dtype': 'int8'}
chr5 16 1261123446 starting 15079
chr11 15 645626570 starting 60316
chr18 14 223737918 starting 241271
chr5 16 1261123446 done in: 0.27819228172302246
{'chromosome': 'chr5', 'order': 16, 'hstart': 1265317750, 'hstop': 1269512054, 'gstart': 879660065, 'individual': True, 'dtyp

{'chromosome': 'chr12', 'order': 15, 'hstart': 696788323, 'hstop': 700982627, 'gstart': 1943767673, 'individual': True, 'dtype': 'int8'}
chr5 16 1365981046 starting 15078
chr5 16 1365981046 done in: 0.277418851852417
{'chromosome': 'chr5', 'order': 16, 'hstart': 1370175350, 'hstop': 1374369654, 'gstart': 879660065, 'individual': True, 'dtype': 'int8'}
chr12 15 696788323 starting 60317
chr5 16 1370175350 starting 15078
chr20 14 240013448 done in: 1.3488481044769287
{'chromosome': 'chr21', 'order': 14, 'hstart': 241420695, 'hstop': 245480773, 'gstart': 2777473071, 'individual': True, 'dtype': 'int8'}
chr5 16 1370175350 done in: 0.27708911895751953
{'chromosome': 'chr5', 'order': 16, 'hstart': 1374369654, 'hstop': 1378563958, 'gstart': 879660065, 'individual': True, 'dtype': 'int8'}
chr12 15 696788323 done in: 1.1114580631256104
{'chromosome': 'chr12', 'order': 15, 'hstart': 700982627, 'hstop': 705176931, 'gstart': 1943767673, 'individual': True, 'dtype': 'int8'}
chr21 14 241420695 starti

combining files
Combining files ['./data/chromosome_bytes/cs_max_chr1_14_0.int8', './data/chromosome_bytes/cs_max_chr1_14_4194304.int8', './data/chromosome_bytes/cs_max_chr1_14_8388608.int8', './data/chromosome_bytes/cs_max_chr1_14_12582912.int8', './data/chromosome_bytes/cs_max_chr1_14_16777216.int8', './data/chromosome_bytes/cs_max_chr1_14_20971520.int8', './data/chromosome_bytes/cs_max_chr2_14_21639537.int8', './data/chromosome_bytes/cs_max_chr2_14_25833841.int8', './data/chromosome_bytes/cs_max_chr2_14_30028145.int8', './data/chromosome_bytes/cs_max_chr2_14_34222449.int8', './data/chromosome_bytes/cs_max_chr2_14_38416753.int8', './data/chromosome_bytes/cs_max_chr2_14_42611057.int8', './data/chromosome_bytes/cs_max_chr3_14_42691237.int8', './data/chromosome_bytes/cs_max_chr3_14_46885541.int8', './data/chromosome_bytes/cs_max_chr3_14_51079845.int8', './data/chromosome_bytes/cs_max_chr3_14_55274149.int8', './data/chromosome_bytes/cs_max_chr3_14_59468453.int8', './data/chromosome_bytes

{'chromosome': 'chr15', 'order': 15, 'hstart': 807523118, 'hstop': 811717422, 'gstart': 2298451028, 'individual': True, 'dtype': 'int8'}
chr6 16 1593286971 starting 15078
chr15 15 807523118 starting 60317
chr6 16 1593286971 done in: 0.27168822288513184
{'chromosome': 'chr6', 'order': 16, 'hstart': 1597481275, 'hstop': 1601675579, 'gstart': 1061198324, 'individual': True, 'dtype': 'int8'}
chr6 16 1597481275 starting 15079
chr6 16 1597481275 done in: 0.27201294898986816
{'chromosome': 'chr6', 'order': 16, 'hstart': 1601675579, 'hstop': 1605869883, 'gstart': 1061198324, 'individual': True, 'dtype': 'int8'}
chr15 15 807523118 done in: 1.0906567573547363
{'chromosome': 'chr15', 'order': 15, 'hstart': 811717422, 'hstop': 815911726, 'gstart': 2298451028, 'individual': True, 'dtype': 'int8'}
chr6 16 1601675579 starting 15078
chr6 16 1601675579 chr15done in: 0.27289390563964844 
15{'chromosome': 'chr6', 'order': 16, 'hstart': 1605869883, 'hstop': 1610064187, 'gstart': 1061198324, 'individual': 

{'chromosome': 'chr18', 'order': 15, 'hstart': 907534587, 'hstop': 911728891, 'gstart': 2574038003, 'individual': True, 'dtype': 'int8'}
chr7 16 1797278522 starting 15079
chr18 15 907534587 starting 60317
chr7 16 1797278522 done in: 0.27664899826049805
{'chromosome': 'chr7', 'order': 16, 'hstart': 1801472826, 'hstop': 1805667130, 'gstart': 1232004303, 'individual': True, 'dtype': 'int8'}
chr7 16 1801472826 starting 15078
chr7 16 1801472826 done in: 0.2726469039916992
{'chromosome': 'chr7', 'order': 16, 'hstart': 1805667130, 'hstop': 1809861434, 'gstart': 1232004303, 'individual': True, 'dtype': 'int8'}
chr18 15 907534587 done in: 1.0826618671417236
{'chromosome': 'chr18', 'order': 15, 'hstart': 911728891, 'hstop': 915923195, 'gstart': 2574038003, 'individual': True, 'dtype': 'int8'}
chr7 16 1805667130 starting 15079
chr18 15 911728891 startingchr7 60316 
16 1805667130 done in: 0.2732970714569092
{'chromosome': 'chr7', 'order': 16, 'hstart': 1809861434, 'hstop': 1814055738, 'gstart': 12

{'chromosome': 'chr8', 'order': 16, 'hstart': 2002109449, 'hstop': 2006303753, 'gstart': 1391350276, 'individual': True, 'dtype': 'int8'}
chr8 16 2002109449 starting 15078
chrX 15 1003786161 done in: 1.0908639430999756
{'chromosome': 'chrX', 'order': 15, 'hstart': 1007980465, 'hstop': 1012174769, 'gstart': 2875001522, 'individual': True, 'dtype': 'int8'}
chr8 16 2002109449 done in: 0.2743041515350342
{'chromosome': 'chr8', 'order': 16, 'hstart': 2006303753, 'hstop': 2010498057, 'gstart': 1391350276, 'individual': True, 'dtype': 'int8'}
chrX 15 1007980465 starting 60317
chr8 16 2006303753 starting 15078
chr8 16 2006303753 done in: 0.2771151065826416
{'chromosome': 'chr8', 'order': 16, 'hstart': 2010498057, 'hstop': 2014692361, 'gstart': 1391350276, 'individual': True, 'dtype': 'int8'}
chr8 16 2010498057 starting 15079
chrX 15 1007980465 done in: 1.0791332721710205
{'chromosome': 'chrX', 'order': 15, 'hstart': 1012174769, 'hstop': 1016369073, 'gstart': 2875001522, 'individual': True, 'dt

Combining files ['./data/chromosome_bytes/cs_max_chr1_15_0.int8', './data/chromosome_bytes/cs_max_chr1_15_4194304.int8', './data/chromosome_bytes/cs_max_chr1_15_8388608.int8', './data/chromosome_bytes/cs_max_chr1_15_12582912.int8', './data/chromosome_bytes/cs_max_chr1_15_16777216.int8', './data/chromosome_bytes/cs_max_chr1_15_20971520.int8', './data/chromosome_bytes/cs_max_chr1_15_25165824.int8', './data/chromosome_bytes/cs_max_chr1_15_29360128.int8', './data/chromosome_bytes/cs_max_chr1_15_33554432.int8', './data/chromosome_bytes/cs_max_chr1_15_37748736.int8', './data/chromosome_bytes/cs_max_chr1_15_41943040.int8', './data/chromosome_bytes/cs_max_chr1_15_46137344.int8', './data/chromosome_bytes/cs_max_chr1_15_50331648.int8', './data/chromosome_bytes/cs_max_chr1_15_54525952.int8', './data/chromosome_bytes/cs_max_chr1_15_58720256.int8', './data/chromosome_bytes/cs_max_chr1_15_62914560.int8', './data/chromosome_bytes/cs_max_chr1_15_67108864.int8', './data/chromosome_bytes/cs_max_chr1_15_

chr9 16 2136850077 starting 15079
chr9 16 2136850077 done in: 0.2859930992126465
{'chromosome': 'chr9', 'order': 16, 'hstart': 2141044381, 'hstop': 2145238685, 'gstart': 1536488912, 'individual': True, 'dtype': 'int8'}
all done!
Combining files ['./data/chromosome_bytes/cs_max_chr1_15_0.json', './data/chromosome_bytes/cs_max_chr1_15_4194304.json', './data/chromosome_bytes/cs_max_chr1_15_8388608.json', './data/chromosome_bytes/cs_max_chr1_15_12582912.json', './data/chromosome_bytes/cs_max_chr1_15_16777216.json', './data/chromosome_bytes/cs_max_chr1_15_20971520.json', './data/chromosome_bytes/cs_max_chr1_15_25165824.json', './data/chromosome_bytes/cs_max_chr1_15_29360128.json', './data/chromosome_bytes/cs_max_chr1_15_33554432.json', './data/chromosome_bytes/cs_max_chr1_15_37748736.json', './data/chromosome_bytes/cs_max_chr1_15_41943040.json', './data/chromosome_bytes/cs_max_chr1_15_46137344.json', './data/chromosome_bytes/cs_max_chr1_15_50331648.json', './data/chromosome_bytes/cs_max_chr

all done!
done with 15
chr9 16 2141044381 starting 15078
chr9 16 2141044381 done in: 0.2716081142425537
{'chromosome': 'chr9', 'order': 16, 'hstart': 2145238685, 'hstop': 2149432989, 'gstart': 1536488912, 'individual': True, 'dtype': 'int8'}
chr9 16 2145238685 starting 15079
chr9 16 2145238685 done in: 0.2679009437561035
{'chromosome': 'chr9', 'order': 16, 'hstart': 2149432989, 'hstop': 2153627293, 'gstart': 1536488912, 'individual': True, 'dtype': 'int8'}
chr9 16 2149432989 starting 15078
chr9 16 2149432989 done in: 0.27042317390441895
{'chromosome': 'chr9', 'order': 16, 'hstart': 2153627293, 'hstop': 2157821597, 'gstart': 1536488912, 'individual': True, 'dtype': 'int8'}
chr9 16 2153627293 starting 15079
chr9 16 2153627293 done in: 0.2677149772644043
{'chromosome': 'chr9', 'order': 16, 'hstart': 2157821597, 'hstop': 2162015901, 'gstart': 1536488912, 'individual': True, 'dtype': 'int8'}
chr9 16 2157821597 starting 15078
chr9 16 2157821597 done in: 0.26576995849609375
{'chromosome': 'ch

chr10 16 2450955375 starting 15079
chr10 16 2450955375 done in: 0.2693960666656494
{'chromosome': 'chr10', 'order': 16, 'hstart': 2455149679, 'hstop': 2459343983, 'gstart': 1674883629, 'individual': True, 'dtype': 'int8'}
chr10 16 2455149679 starting 15078
chr10 16 2455149679 done in: 0.2660999298095703
{'chromosome': 'chr10', 'order': 16, 'hstart': 2459343983, 'hstop': 2463538287, 'gstart': 1674883629, 'individual': True, 'dtype': 'int8'}
chr10 16 2459343983 starting 15079
chr10 16 2459343983 done in: 0.2695450782775879
{'chromosome': 'chr10', 'order': 16, 'hstart': 2463538287, 'hstop': 2467732591, 'gstart': 1674883629, 'individual': True, 'dtype': 'int8'}
chr10 16 2463538287 starting 15078
chr10 16 2463538287 done in: 0.26548290252685547
{'chromosome': 'chr10', 'order': 16, 'hstart': 2467732591, 'hstop': 2471926895, 'gstart': 1674883629, 'individual': True, 'dtype': 'int8'}
chr10 16 2467732591 starting 15079
chr10 16 2467732591 done in: 0.2674140930175781
{'chromosome': 'chr10', 'ord

chr12 16 2753598863 done in: 0.26993894577026367
{'chromosome': 'chr12', 'order': 16, 'hstart': 2757793167, 'hstop': 2761987471, 'gstart': 1943767673, 'individual': True, 'dtype': 'int8'}
chr12 16 2757793167 starting 15079
chr12 16 2757793167 done in: 0.26474595069885254
{'chromosome': 'chr12', 'order': 16, 'hstart': 2761987471, 'hstop': 2766181775, 'gstart': 1943767673, 'individual': True, 'dtype': 'int8'}
chr12 16 2761987471 starting 15078
chr12 16 2761987471 done in: 0.26796698570251465
{'chromosome': 'chr12', 'order': 16, 'hstart': 2766181775, 'hstop': 2770376079, 'gstart': 1943767673, 'individual': True, 'dtype': 'int8'}
chr12 16 2766181775 starting 15079
chr12 16 2766181775 done in: 0.2739992141723633
{'chromosome': 'chr12', 'order': 16, 'hstart': 2770376079, 'hstop': 2774570383, 'gstart': 1943767673, 'individual': True, 'dtype': 'int8'}
chr12 16 2770376079 starting 15078
chr12 16 2770376079 done in: 0.28742003440856934
{'chromosome': 'chr12', 'order': 16, 'hstart': 2774570383, '

chr14 16 3056057122 done in: 0.27452802658081055
{'chromosome': 'chr14', 'order': 16, 'hstart': 3060251426, 'hstop': 3064445730, 'gstart': 2191407310, 'individual': True, 'dtype': 'int8'}
chr14 16 3060251426 starting 15078
chr14 16 3060251426 done in: 0.26839208602905273
{'chromosome': 'chr14', 'order': 16, 'hstart': 3064445730, 'hstop': 3068640034, 'gstart': 2191407310, 'individual': True, 'dtype': 'int8'}
chr14 16 3064445730 starting 15079
chr14 16 3064445730 done in: 0.26796817779541016
{'chromosome': 'chr14', 'order': 16, 'hstart': 3068640034, 'hstop': 3072834338, 'gstart': 2191407310, 'individual': True, 'dtype': 'int8'}
chr14 16 3068640034 starting 15078
chr14 16 3068640034 done in: 0.27199721336364746
{'chromosome': 'chr14', 'order': 16, 'hstart': 3072834338, 'hstop': 3077028642, 'gstart': 2191407310, 'individual': True, 'dtype': 'int8'}
chr14 16 3072834338 starting 15079
chr14 16 3072834338 done in: 0.26399803161621094
{'chromosome': 'chr14', 'order': 16, 'hstart': 3077028642, 

chr16 16 3359352354 done in: 0.26680779457092285
{'chromosome': 'chr16', 'order': 16, 'hstart': 3363546658, 'hstop': 3367740962, 'gstart': 2400442217, 'individual': True, 'dtype': 'int8'}
chr16 16 3363546658 starting 15079
chr16 16 3363546658 done in: 0.26465582847595215
{'chromosome': 'chr16', 'order': 16, 'hstart': 3367740962, 'hstop': 3371935266, 'gstart': 2400442217, 'individual': True, 'dtype': 'int8'}
chr16 16 3367740962 starting 15078
chr16 16 3367740962 done in: 0.2690751552581787
{'chromosome': 'chr16', 'order': 16, 'hstart': 3371935266, 'hstop': 3376129570, 'gstart': 2400442217, 'individual': True, 'dtype': 'int8'}
chr16 16 3371935266 starting 15078
chr16 16 3371935266 done in: 0.26505303382873535
{'chromosome': 'chr16', 'order': 16, 'hstart': 3376129570, 'hstop': 3380323874, 'gstart': 2400442217, 'individual': True, 'dtype': 'int8'}
chr16 16 3376129570 starting 15079
chr16 16 3376129570 done in: 0.2686593532562256
{'chromosome': 'chr16', 'order': 16, 'hstart': 3380323874, 'h

chr18 16 3663692782 done in: 0.2659018039703369
{'chromosome': 'chr18', 'order': 16, 'hstart': 3667887086, 'hstop': 3672081390, 'gstart': 2574038003, 'individual': True, 'dtype': 'int8'}
chr18 16 3667887086 starting 15078
chr18 16 3667887086 done in: 0.27025604248046875
{'chromosome': 'chr18', 'order': 16, 'hstart': 3672081390, 'hstop': 3676275694, 'gstart': 2574038003, 'individual': True, 'dtype': 'int8'}
chr18 16 3672081390 starting 15079
chr18 16 3672081390 done in: 0.2707710266113281
{'chromosome': 'chr18', 'order': 16, 'hstart': 3676275694, 'hstop': 3680469998, 'gstart': 2574038003, 'individual': True, 'dtype': 'int8'}
chr18 16 3676275694 starting 15078
chr18 16 3676275694 done in: 0.26609301567077637
{'chromosome': 'chr18', 'order': 16, 'hstart': 3680469998, 'hstop': 3684664302, 'gstart': 2574038003, 'individual': True, 'dtype': 'int8'}
chr18 16 3680469998 starting 15079
chr18 16 3680469998 done in: 0.265078067779541
{'chromosome': 'chr18', 'order': 16, 'hstart': 3684664302, 'hst

{'chromosome': 'chr22', 'order': 16, 'hstart': 3965441106, 'hstop': 3969635410, 'gstart': 2824183054, 'individual': True, 'dtype': 'int8'}
chr22 16 3965441106 starting 15079
chr22 16 3965441106 done in: 0.2658669948577881
{'chromosome': 'chr22', 'order': 16, 'hstart': 3969635410, 'hstop': 3973829714, 'gstart': 2824183054, 'individual': True, 'dtype': 'int8'}
chr22 16 3969635410 starting 15078
chr22 16 3969635410 done in: 0.26721787452697754
{'chromosome': 'chr22', 'order': 16, 'hstart': 3973829714, 'hstop': 3978024018, 'gstart': 2824183054, 'individual': True, 'dtype': 'int8'}
chr22 16 3973829714 starting 15079
chr22 16 3973829714 done in: 0.2645730972290039
{'chromosome': 'chr22', 'order': 16, 'hstart': 3978024018, 'hstop': 3982218322, 'gstart': 2824183054, 'individual': True, 'dtype': 'int8'}
chr22 16 3978024018 starting 15078
chr22 16 3978024018 done in: 0.2648649215698242
{'chromosome': 'chr22', 'order': 16, 'hstart': 3982218322, 'hstop': 3986412626, 'gstart': 2824183054, 'individu

{'chromosome': 'chrY', 'order': 16, 'hstart': 4274099339, 'hstop': 4278293643, 'gstart': 3031042417, 'individual': True, 'dtype': 'int8'}
chrY 16 4274099339 starting 15078
chrY 16 4274099339 done in: 0.26598405838012695
{'chromosome': 'chrY', 'order': 16, 'hstart': 4278293643, 'hstop': 4282487947, 'gstart': 3031042417, 'individual': True, 'dtype': 'int8'}
chrY 16 4278293643 starting 15079
chrY 16 4278293643 done in: 0.2688760757446289
{'chromosome': 'chrY', 'order': 16, 'hstart': 4282487947, 'hstop': 4286682251, 'gstart': 3031042417, 'individual': True, 'dtype': 'int8'}
chrY 16 4282487947 starting 15078
chrY 16 4282487947 done in: 0.2659108638763428
{'chromosome': 'chrY', 'order': 16, 'hstart': 4286682251, 'hstop': 4290876555, 'gstart': 3031042417, 'individual': True, 'dtype': 'int8'}
chrY 16 4286682251 starting 15078
chrY 16 4286682251 done in: 0.2653319835662842
{'chromosome': 'chrY', 'order': 16, 'hstart': 4290876555, 'hstop': 4294967296, 'gstart': 3031042417, 'individual': True, 'd

all done!
Combining files ['./data/chromosome_bytes/cs_max_chr1_16_0.json', './data/chromosome_bytes/cs_max_chr1_16_4194304.json', './data/chromosome_bytes/cs_max_chr1_16_8388608.json', './data/chromosome_bytes/cs_max_chr1_16_12582912.json', './data/chromosome_bytes/cs_max_chr1_16_16777216.json', './data/chromosome_bytes/cs_max_chr1_16_20971520.json', './data/chromosome_bytes/cs_max_chr1_16_25165824.json', './data/chromosome_bytes/cs_max_chr1_16_29360128.json', './data/chromosome_bytes/cs_max_chr1_16_33554432.json', './data/chromosome_bytes/cs_max_chr1_16_37748736.json', './data/chromosome_bytes/cs_max_chr1_16_41943040.json', './data/chromosome_bytes/cs_max_chr1_16_46137344.json', './data/chromosome_bytes/cs_max_chr1_16_50331648.json', './data/chromosome_bytes/cs_max_chr1_16_54525952.json', './data/chromosome_bytes/cs_max_chr1_16_58720256.json', './data/chromosome_bytes/cs_max_chr1_16_62914560.json', './data/chromosome_bytes/cs_max_chr1_16_67108864.json', './data/chromosome_bytes/cs_ma

all done!
done with 16
done in: 869.3135411739349


In [None]:
# Show how to encode a simple dataset

In [None]:
# Show what happens when we do individual for a small dataset

In [None]:
# Test that the parallel splicing the data matches a simpler straightforward

In [62]:
df = pd.DataFrame({'A': [5, 5, 1, 1, 2, 2, 3, 3], 'B': [1, 1, 14, 5, 6, 7, 8, 9]})

In [63]:
df.head()

Unnamed: 0,A,B
0,5,1
1,5,1
2,1,14
3,1,5
4,2,6


In [318]:
gb = df.groupby("A").apply(lambda x: x.sum())

In [319]:
gb.head()

Unnamed: 0_level_0,A,B
A,Unnamed: 1_level_1,Unnamed: 2_level_1
1,2,19
2,4,13
3,6,17
5,10,2


In [320]:
gb.index

Int64Index([1, 2, 3, 5], dtype='int64', name='A')

In [321]:
type(gb.index)

pandas.core.indexes.numeric.Int64Index

In [112]:
for i in gb.index:
    print(i, gb.loc[i]['A'])

1 2
2 4
3 6
5 10


In [95]:
df2 = df.apply(lambda x: [x['A'], x['A'] + 1], axis=1)

In [96]:
df2.head()

0    [5, 6]
1    [5, 6]
2    [1, 2]
3    [1, 2]
4    [2, 3]
dtype: object

In [148]:
f = hg.aggregate_chromosome("chr1", 6)[0]

In [167]:
samples = hg.slice_data(f)

{'chromosome': 'chr1', 'order': 6, 'hstart': 0, 'hstop': 330, 'gstart': 0, 'individual': False, 'dtype': 'int8'}


In [168]:
# {'chromosome': 'chr1', 'order': 6, 'hstart': 0, 'hstop': 330, 'gstart': 0, 'individual': False, 'dtype': 'int8'}
dtype = f['dtype']
hstart = f['hstart']
hstop = f['hstop']
gstart = f['gstart']
order = f['order']
v = np.arange(hstart, hstop, dtype=np.dtype(dtype))

In [215]:
hstop - hstart

330

In [474]:
# samples_copy = samples.copy()
samples_copy = data[data["chromosome"] == "chrX"].copy()
samples_copy["hpos"] = samples_copy.apply(lambda x: 
    hg.get_hilbert_from_genome_order(x['start'] + gstart, order)
, axis=1)

In [475]:
# samples_copy["max"] = 17

In [476]:
samples_copy[10000:].head()

Unnamed: 0,chromosome,start,end,max,sum,hpos
14415178,chrX,2000000,2000200,17,0.40997,679
14415179,chrX,2000200,2000400,17,0.40997,679
14415180,chrX,2000400,2000600,17,0.40997,679
14415181,chrX,2000600,2000800,17,0.40997,679
14415182,chrX,2000800,2001000,17,0.40997,679


In [536]:
def aggregator_counts(data):
    return data["max"].value_counts(sort=False).reindex([0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17]).fillna(0).to_numpy()    

In [537]:
result = samples_copy.groupby("hpos").apply(aggregator_counts)

In [538]:
# result.columns[0]

In [539]:
result.head()

hpos
0    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
1    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
2    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
3    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
4    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
dtype: object

In [532]:
type(result)

pandas.core.series.Series

In [533]:
result.index

Int64Index([    0,     1,     2,     3,     4,     5,     6,     7,     8,
                9,
            ...
            52710, 52711, 52712, 52713, 52714, 52715, 52716, 52717, 52718,
            52719],
           dtype='int64', name='hpos', length=52720)

In [534]:
# result.loc[(0, 9)]

In [535]:
type(result.index)

pandas.core.indexes.numeric.Int64Index

In [274]:
width = 18
rows = hstop - hstart
missing_value = 0
v = np.full((rows, width), missing_value, dtype=np.dtype("uint32"))
for idx in result.index:
    i = idx[0] - hstart # the hilbert pos
    ii = idx[1] # the column of the aggregate
    if(i < 0 or i >= rows):
        continue
    v[i][ii] = result.loc[idx]
#     v[i] = result.loc[pos]
#     print(pos)
#     print(result.loc[pos])

In [275]:
v

array([[   4,    7,    0, ...,    0,    0, 3751],
       [  91,   55,   77, ...,  418,  554,  347],
       [  69,   19,  114, ...,  205,  857,   60],
       ...,
       [  47,   26,    8, ...,   12,  157,  828],
       [  14,   42,   18, ...,    3,  169, 1136],
       [   0,    2,    0, ...,    0,    0, 1088]], dtype=uint32)

In [278]:
np.amax(v, axis=0)

array([ 143,   69,  199,   35,  869, 2466,   59,   59,  555,  353,  519,
        728, 2537,  122,  163,  964, 3153, 3770], dtype=uint32)

In [280]:
foo = np.amax(v, axis=0)

In [283]:
np.amax(foo, axis=0)

3770

In [296]:
a = [{'min': [0, 3], 'max': [10,40]}, {'min': [5, 2], 'max': [10,3]}]

In [297]:
dfa = pd.DataFrame(a)

In [298]:
dfa

Unnamed: 0,min,max
0,"[0, 3]","[10, 40]"
1,"[5, 2]","[10, 3]"


In [301]:
dfa.values

array([[list([0, 3]), list([10, 40])],
       [list([5, 2]), list([10, 3])]], dtype=object)

In [302]:
b = [np.array([0,3]), np.array([5,2])]

In [386]:
npb = np.array(b)

In [388]:
npb

array([[0, 3],
       [5, 2]])

In [387]:
npb.flatten()

array([0, 3, 5, 2])

In [389]:
npb.tobytes()

b'\x00\x00\x00\x00\x00\x00\x00\x00\x03\x00\x00\x00\x00\x00\x00\x00\x05\x00\x00\x00\x00\x00\x00\x00\x02\x00\x00\x00\x00\x00\x00\x00'

In [390]:
npb.flatten().tobytes()

b'\x00\x00\x00\x00\x00\x00\x00\x00\x03\x00\x00\x00\x00\x00\x00\x00\x05\x00\x00\x00\x00\x00\x00\x00\x02\x00\x00\x00\x00\x00\x00\x00'

In [307]:
files6 = hg.files_for_order(6)

In [313]:
hg.combine_meta_files(files6, 6)

Combining files ['./data/chromosome_bytes/example_chr1_6_0.json', './data/chromosome_bytes/example_chr2_6_330.json', './data/chromosome_bytes/example_chr3_6_651.json', './data/chromosome_bytes/example_chr4_6_914.json', './data/chromosome_bytes/example_chr5_6_1166.json', './data/chromosome_bytes/example_chr6_6_1407.json', './data/chromosome_bytes/example_chr7_6_1634.json', './data/chromosome_bytes/example_chr8_6_1845.json', './data/chromosome_bytes/example_chr9_6_2037.json', './data/chromosome_bytes/example_chr10_6_2221.json', './data/chromosome_bytes/example_chr11_6_2398.json', './data/chromosome_bytes/example_chr12_6_2578.json', './data/chromosome_bytes/example_chr13_6_2754.json', './data/chromosome_bytes/example_chr14_6_2906.json', './data/chromosome_bytes/example_chr15_6_3048.json', './data/chromosome_bytes/example_chr16_6_3183.json', './data/chromosome_bytes/example_chr17_6_3303.json', './data/chromosome_bytes/example_chr18_6_3413.json', './data/chromosome_bytes/example_chr19_6_352

In [543]:
hg_counts = hilbertgenome.HilbertGenome(data=data, 
                                 name="cs_counts2",
                                 aggregator=aggregator_counts,
                                 accessor=accessor,
                                 dtype="uint32",
                                 signal_resolution=200,
                                 missing_value=0
                                )

data width 18
order 6 individual False
order 7 individual False
order 8 individual False
order 9 individual False
order 10 individual False
order 11 individual False
order 12 individual True
order 13 individual True
order 14 individual True
order 15 individual True


In [544]:
hg_counts.generate_order(6)

generating 24 files
chr1 6 0 starting
chr1 6 0 done in: 4.446446180343628
chr2 6 330 starting
chr2 6 330 done in: 4.3661789894104
chr3 6 651 starting
chr3 6 651 done in: 3.650153875350952
chr4 6 914 starting
chr4 6 914 done in: 3.516226053237915
chr5 6 1166 starting
chr5 6 1166 done in: 3.431173086166382
chr6 6 1407 starting
chr6 6 1407 done in: 3.2898149490356445
chr7 6 1634 starting
chr7 6 1634 done in: 3.143502950668335
chr8 6 1845 starting
chr8 6 1845 done in: 2.9246537685394287
chr9 6 2037 starting
chr9 6 2037 done in: 2.762754201889038
chr10 6 2221 starting
chr10 6 2221 done in: 2.6905930042266846
chr11 6 2398 starting
chr11 6 2398 done in: 2.780296802520752
chr12 6 2578 starting
chr12 6 2578 done in: 2.6865549087524414
chr13 6 2754 starting
chr13 6 2754 done in: 2.3563458919525146
chr14 6 2906 starting
chr14 6 2906 done in: 2.277864933013916
chr15 6 3048 starting
chr15 6 3048 done in: 2.1490018367767334
chr16 6 3183 starting
chr16 6 3183 done in: 1.9597198963165283
chr17 6 3303 

In [409]:
workers = mp.cpu_count() - 1
print("workers", workers)

# order 10 takes about 300 seconds
# order 11 takes about 2000 seconds
orders = range(7, 12)
start_time = time.time()
with mp.Pool(workers) as p:
    p.map(hg_counts.generate_order, orders)
    
print("done in: %s" % (time.time() - start_time))
# with mp.Pool(workers) as p:
#     p.map(hg.aggregate_parallel, files)

workers 9
generating 24 files
{'chromosome': 'chr1', 'order': 7, 'hstart': 0, 'hstop': 1320, 'gstart': 0, 'individual': False, 'dtype': 'uint32'}
chr1 7 0 starting 1244996
chr1 7 0 done in: 4.314413070678711
{'chromosome': 'chr2', 'order': 7, 'hstart': 1320, 'hstop': 2605, 'gstart': 248956422, 'individual': False, 'dtype': 'uint32'}
chr2 7 1320 starting 1211282
chr2 7 1320 done in: 4.166431188583374
{'chromosome': 'chr3', 'order': 7, 'hstart': 2605, 'hstop': 3657, 'gstart': 491149951, 'individual': False, 'dtype': 'uint32'}
chr3 7 2605 starting 990113
generating 24 files
{'chromosome': 'chr1', 'order': 8, 'hstart': 0, 'hstop': 5283, 'gstart': 0, 'individual': False, 'dtype': 'uint32'}
chr1 8 0 starting 1244996
chr3 7 2605 done in: 3.4942359924316406
{'chromosome': 'chr4', 'order': 7, 'hstart': 3657, 'hstop': 4666, 'gstart': 689445510, 'individual': False, 'dtype': 'uint32'}
chr4 7 3657 starting 951257
chr4 7 3657 done in: 3.337125062942505
{'chromosome': 'chr5', 'order': 7, 'hstart': 4

Combining files ['./data/chromosome_bytes/cs_counts_chr1_8_0.uint32', './data/chromosome_bytes/cs_counts_chr2_8_5283.uint32', './data/chromosome_bytes/cs_counts_chr3_8_10422.uint32', './data/chromosome_bytes/cs_counts_chr4_8_14630.uint32', './data/chromosome_bytes/cs_counts_chr5_8_18667.uint32', './data/chromosome_bytes/cs_counts_chr6_8_22519.uint32', './data/chromosome_bytes/cs_counts_chr7_8_26144.uint32', './data/chromosome_bytes/cs_counts_chr8_8_29525.uint32', './data/chromosome_bytes/cs_counts_chr9_8_32605.uint32', './data/chromosome_bytes/cs_counts_chr10_8_35542.uint32', './data/chromosome_bytes/cs_counts_chr11_8_38381.uint32', './data/chromosome_bytes/cs_counts_chr12_8_41248.uint32', './data/chromosome_bytes/cs_counts_chr13_8_44076.uint32', './data/chromosome_bytes/cs_counts_chr14_8_46503.uint32', './data/chromosome_bytes/cs_counts_chr15_8_48775.uint32', './data/chromosome_bytes/cs_counts_chr16_8_50939.uint32', './data/chromosome_bytes/cs_counts_chr17_8_52856.uint32', './data/chr

Combining files ['./data/chromosome_bytes/cs_counts_chr1_10_0.json', './data/chromosome_bytes/cs_counts_chr2_10_84529.json', './data/chromosome_bytes/cs_counts_chr3_10_166762.json', './data/chromosome_bytes/cs_counts_chr4_10_234090.json', './data/chromosome_bytes/cs_counts_chr5_10_298675.json', './data/chromosome_bytes/cs_counts_chr6_10_360314.json', './data/chromosome_bytes/cs_counts_chr7_10_418308.json', './data/chromosome_bytes/cs_counts_chr8_10_472412.json', './data/chromosome_bytes/cs_counts_chr9_10_521691.json', './data/chromosome_bytes/cs_counts_chr10_10_568681.json', './data/chromosome_bytes/cs_counts_chr11_10_614110.json', './data/chromosome_bytes/cs_counts_chr12_10_659977.json', './data/chromosome_bytes/cs_counts_chr13_10_705228.json', './data/chromosome_bytes/cs_counts_chr14_10_744059.json', './data/chromosome_bytes/cs_counts_chr15_10_780404.json', './data/chromosome_bytes/cs_counts_chr16_10_815034.json', './data/chromosome_bytes/cs_counts_chr17_10_845707.json', './data/chro

In [328]:
len(aggregator_counts(data))

18

In [333]:
str(type(result.index))

"<class 'pandas.core.indexes.multi.MultiIndex'>"

In [342]:
data[data["chromosome"] == "chrY"]["max"].value_counts().sort_index()

0    296868
Name: max, dtype: int64

In [343]:
aggregator_counts(data[data["chromosome"] == "chrY"])

0    296868
Name: max, dtype: int64

In [414]:
# try out parallelizing the whole thing again
workers = mp.cpu_count() - 1
print("workers", workers)

order = 10

# not parallel: order 10 done in: 311.15400218963623
# for some reason this is almost the same speed
# probably the dataframe being big and not shared memory or something
files = hg_counts.files_for_order(order)
    
print("starting parallel work")
start_time = time.time()
with mp.Pool(workers) as p:
    p.map(hg_counts.aggregate_parallel, files)
    
print("done in: %s" % (time.time() - start_time))

hg_counts.combine_bytes_files(files, order)
hg_counts.combine_meta_files(files, order)
print("done with", order)

workers 9
starting parallel work
chr1 10 0 starting
chr2 10 84529 starting
chr3 10 166762 starting
chr1 10 0 done in: 29.363043069839478
chr4 10 234090 starting
chr2 10 84529 done in: 27.869900226593018
chr3 10 166762 done in: 21.17950987815857
chr5 10 298675 starting
chr4 10 234090 done in: 20.481046199798584
chr6 10 360314 starting
chr5 10 298675 done in: 19.67300820350647
chr7 10 418308 starting
chr6 10 360314 done in: 18.057921171188354
chr8 10 472412 starting
chr7 10 418308 done in: 16.512941122055054
chr9 10 521691 starting
chr8 10 472412 done in: 14.847740888595581
chr10 10 568681 starting
chr9 10 521691 done in: 13.920643091201782
chr11 10 614110 starting
chr10 10 568681 done in: 13.523697137832642
chr12 10 659977 starting
chr11 10 614110 done in: 13.710996866226196
chr13 10 705228 starting
chr12 10 659977 done in: 13.18272089958191
chr13 10 705228 done in: 10.953779935836792
chr14 10 744059 starting
chr14 10 744059 done in: 10.419567823410034
chr15 10 780404 starting
chr15 10 