In [1]:
import pandas as pd
import multiprocessing
import numpy as np
import time
from omilayers import Omilayers
import io

# Connect to database

In [2]:
omi = Omilayers("project.duckdb", engine="duckdb")

# Helper functions

In [3]:
# Cohort size shared by every simulated layer in this notebook.
Nsamples = 100
# Zero-padded sample identifiers: SA001, SA002, ..., up to Nsamples.
sampleIDs = [f"SA{number:03d}" for number in range(1, Nsamples + 1)]

def timeit(func):
    """
    Decorator that times the execution of a function.

    The decorated function returns a ``(duration, result)`` tuple, where
    ``duration`` is the elapsed time in seconds rounded to two decimals and
    ``result`` is whatever ``func`` returned.
    """
    import functools  # local import keeps this helper self-contained

    @functools.wraps(func)  # keep the wrapped function's name and docstring
    def wrapper(*args, **kwargs):
        # perf_counter is a monotonic, high-resolution clock; time.time can
        # jump if the system clock is adjusted during the measurement.
        start_time = time.perf_counter()
        result = func(*args, **kwargs)
        end_time = time.perf_counter()
        duration = np.round(end_time - start_time, 2)
        return (duration, result)

    return wrapper

@timeit
def simulate_data(featuresName, features, minValue, maxValue, integers=False):
    """
    Build a features-by-samples table of random values.

    One vector of ``Nsamples`` values is drawn per entry of ``features``:
    uniform floats rounded to 4 decimals by default, or random integers when
    ``integers`` is true. Vectors are keyed by feature name in a dict, so a
    repeated name keeps only the vector drawn last for it. The resulting
    frame has one row per feature, a leading ``featuresName`` column holding
    the feature names, and one column per entry of ``sampleIDs``.
    """
    def draw():
        # One fresh vector of Nsamples values in the requested flavour.
        if integers:
            return np.random.randint(minValue, maxValue, Nsamples)
        return np.round(np.random.uniform(minValue, maxValue, Nsamples), 4)

    valuesByFeature = {name: draw() for name in features}
    samplesByFeature = pd.DataFrame(valuesByFeature, index=sampleIDs)
    # Transpose so features become rows, then expose the names as a column.
    return (
        samplesByFeature.T
        .reset_index()
        .rename(columns={"index": featuresName})
    )

def memory_usage_simulated_data(data):
    """
    Report the deep memory footprint of a pandas object as text.

    The report written by ``data.info(memory_usage='deep')`` is captured in
    an in-memory buffer, and the text following the last colon of its
    second-to-last newline-separated chunk is returned, stripped of
    surrounding whitespace.
    """
    buffer = io.StringIO()
    data.info(memory_usage='deep', buf=buffer)
    # Index -2 is used because the report is expected to end with a newline,
    # which leaves an empty final chunk after splitting.
    memoryLine = buffer.getvalue().split("\n")[-2]
    return memoryLine.rpartition(":")[2].strip()

@timeit
def store_data(dataToStore, layerName, tagName, description):
    """
    Write ``dataToStore`` as the layer ``layerName`` and annotate it.

    After the layer is assigned, ``tagName`` is applied with ``set_tag`` and
    ``description`` with ``set_info``. The function itself returns nothing.
    """
    layers = omi.layers
    layers[layerName] = dataToStore
    # Re-fetch the layer for each annotation, as the assignment above does
    # not hand back a layer object.
    layers[layerName].set_tag(tagName)
    layers[layerName].set_info(description)

@timeit
def load_layer(layerName):
    """
    Fetch the stored layer ``layerName`` and return the result of its
    ``to_df()`` call.
    """
    return omi.layers[layerName].to_df()

@timeit
def get_sample(layer, sample):
    """
    Look up ``sample`` in the stored layer ``layer`` and return what the
    layer's item access yields for it.
    """
    return omi.layers[layer][sample]

@timeit
def add_sample(layer, sampleName, data):
    """
    Assign ``data`` to ``sampleName`` in the stored layer ``layer``.

    The function itself returns nothing.
    """
    targetLayer = omi.layers[layer]
    targetLayer[sampleName] = data



# Cohort features

## Create synthetic data

In [4]:
data = pd.DataFrame({
    "sample_id": sampleIDs,
    "gender": np.random.choice(["female", "male"], Nsamples),
    "age": np.random.randint(20, 50, Nsamples),
    "bmi": np.round(np.random.uniform(20, 40, Nsamples), 2)
})

Get memory usage of simulated data

In [5]:
memoryUsage = memory_usage_simulated_data(data)

In [6]:
print(f"Memory usage of simulated data: {memoryUsage}")

Memory usage of simulated data: 12.2 KB


## Store synthetic data

In [7]:
duration, _ = store_data(data, "cohort", "raw", "Cohort features.")

In [8]:
print(f"Time to store simulated data: {duration} s")

Time to store simulated data: 0.07 s


## Retrieve stored data

In [9]:
duration, layerData = load_layer("cohort")

In [10]:
print(f"Time to load stored layer: {duration} s")

Time to load stored layer: 0.02 s


In [11]:
layerData

Unnamed: 0,sample_id,gender,age,bmi
0,SA001,female,26,22.37
1,SA002,male,33,25.09
2,SA003,female,37,22.41
3,SA004,female,33,29.19
4,SA005,male,22,29.68
...,...,...,...,...
95,SA096,female,28,29.65
96,SA097,female,27,37.64
97,SA098,male,23,36.92
98,SA099,male,42,26.65


# Blood metabolomic data

## Create synthetic data

Load features

In [12]:
features = pd.read_csv("omic_features/blood_metabolites-2024-06-25.csv")['NAME']
print(f"Number of features: {len(features)}")

Number of features: 37228


In [13]:
duration, data = simulate_data("metabolite", features, minValue=0, maxValue=1000, integers=False)

In [14]:
print(f"Time to create simulated data: {duration} s")

Time to create simulated data: 0.49 s


Get memory usage of simulated data

In [15]:
memoryUsage = memory_usage_simulated_data(data)

In [16]:
print(f"Memory usage of simulated data: {memoryUsage}")

Memory usage of simulated data: 31.3 MB


## Store synthetic data

In [17]:
duration, _ = store_data(data, "blood_metas", "raw", "Raw blood metabolomic data.")

In [18]:
print(f"Time to store simulated data: {duration} s")

Time to store simulated data: 0.4 s


## Retrieve stored data

In [19]:
duration, layerData = load_layer("blood_metas")

In [20]:
print(f"Time to load stored layer: {duration} s")

Time to load stored layer: 0.11 s


In [21]:
layerData

Unnamed: 0,metabolite,SA001,SA002,SA003,SA004,SA005,SA006,SA007,SA008,SA009,...,SA091,SA092,SA093,SA094,SA095,SA096,SA097,SA098,SA099,SA100
0,1-Methylhistidine,160.5345,613.0872,127.6444,261.1609,621.7449,163.1826,277.1065,244.2859,922.7444,...,740.8128,732.1407,855.9692,858.5214,394.7873,273.1119,918.8326,155.8283,253.7521,987.6875
1,"1,3-Diaminopropane",315.3933,301.8827,843.2439,517.3792,509.7382,940.6679,443.3848,506.7704,54.3492,...,134.0646,451.9462,432.7803,311.5084,584.2263,912.7995,333.6045,808.4649,406.1256,635.3067
2,2-Ketobutyric acid,739.3905,913.4635,891.7950,568.7992,980.7496,358.0732,209.7491,298.1375,64.5342,...,336.5384,271.8927,179.8703,204.6860,402.5722,810.4362,634.1774,797.1647,970.7482,144.5507
3,2-Hydroxybutyric acid,948.0209,8.5789,296.1401,404.8405,14.6976,892.1954,237.7730,874.0592,211.6430,...,749.8965,184.0260,665.3067,125.3999,14.2509,670.3545,134.9766,121.5945,682.5727,344.5291
4,2-Methoxyestrone,619.4599,593.1441,631.9754,328.8788,711.9863,187.6999,899.7451,888.5594,125.1285,...,855.0462,803.3730,293.6359,380.2945,329.6876,495.5465,335.5249,114.6146,885.8145,791.4863
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37223,Succinylserine,669.4016,990.2062,599.3050,537.0790,874.1201,930.3231,589.1364,395.5103,507.1417,...,38.2632,978.0679,81.1957,542.2210,521.2700,576.0537,949.5406,985.1538,927.6703,933.4608
37224,4-Hydroxylidocaine,764.5399,455.3041,147.6259,974.3035,210.6123,249.4724,819.1735,679.4274,880.6852,...,214.3124,833.3597,45.0000,45.1954,531.3629,370.4195,117.6715,688.8661,805.6011,153.1466
37225,Tryptophan N-glucoside,820.1227,558.2926,539.3568,636.5652,33.2268,948.3939,874.4618,597.4105,533.9874,...,672.8176,578.2594,663.4910,299.0170,447.4534,240.2909,421.9770,302.3849,781.0342,473.3501
37226,"6-Amino-5-formamido-1,3-dimethyluracil",40.7246,692.4722,682.1125,604.7818,847.6135,153.5956,356.7477,679.9632,990.8804,...,59.9737,27.3775,482.4652,899.7126,745.7765,37.4367,276.2628,625.4594,961.8168,44.9775


# Urine metabolomic data

## Create synthetic data

Load features

In [22]:
features = pd.read_csv("omic_features/urine_metabolites-2024-06-25.csv")['NAME']
print(f"Number of features: {len(features)}")

Number of features: 5661


In [23]:
duration, data = simulate_data("metabolite", features, minValue=0, maxValue=1000, integers=False)

In [24]:
print(f"Time to create simulated data: {duration} s")

Time to create simulated data: 0.1 s


Get memory usage of simulated data

In [25]:
memoryUsage = memory_usage_simulated_data(data)

In [26]:
print(f"Memory usage of simulated data: {memoryUsage}")

Memory usage of simulated data: 4.7 MB


## Store synthetic data

In [27]:
duration, _ = store_data(data, "urine_metas", "raw", "Raw urine metabolomic data.")

In [28]:
print(f"Time to store simulated data: {duration} s")

Time to store simulated data: 0.27 s


## Retrieve stored data

In [29]:
duration, layerData = load_layer("urine_metas")

In [30]:
print(f"Time to load stored layer: {duration} s")

Time to load stored layer: 0.06 s


In [31]:
layerData

Unnamed: 0,metabolite,SA001,SA002,SA003,SA004,SA005,SA006,SA007,SA008,SA009,...,SA091,SA092,SA093,SA094,SA095,SA096,SA097,SA098,SA099,SA100
0,1-Methylhistidine,599.5326,130.8074,5.6231,112.1568,342.8687,825.3048,313.3522,434.3370,182.1469,...,842.5498,579.9794,446.1824,702.9679,351.9723,887.4945,389.0576,987.1664,76.3255,920.9628
1,"1,3-Diaminopropane",864.8976,45.2125,415.1271,518.9421,115.4816,548.1262,416.3232,647.2947,196.4134,...,102.7487,980.1498,435.4666,180.7155,727.0736,539.1433,196.8919,27.8801,839.1451,310.0854
2,2-Ketobutyric acid,137.2715,434.5991,279.7073,756.8208,267.1745,733.7809,599.9630,1.0733,629.0622,...,205.2774,49.5604,306.0978,734.6848,298.8214,570.9356,669.0936,950.1628,354.7930,641.6338
3,2-Hydroxybutyric acid,530.7086,593.2817,220.4830,599.8230,280.7625,59.0322,33.4500,401.8823,321.2285,...,76.5761,185.1008,147.1899,67.2840,142.3333,592.0218,675.3555,759.8203,647.9762,306.1477
4,2-Methoxyestrone,400.3906,414.7068,524.1155,904.2617,280.1326,32.9417,319.0162,580.6262,396.3317,...,4.5932,256.3173,990.8723,93.3672,241.3523,436.7753,225.4027,622.8292,374.4432,544.2500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5656,Succinylserine,761.5220,503.0347,506.3095,11.2133,547.3082,93.5416,929.4531,682.6190,38.7209,...,886.8764,435.0977,276.9569,40.2080,359.3003,180.9991,860.2736,420.7622,84.7589,967.8542
5657,4-Hydroxylidocaine,434.1394,38.8280,423.8500,324.7733,640.4306,170.4279,225.4530,935.5911,995.6767,...,682.8840,147.4198,376.4324,320.7335,658.2887,507.5782,195.1674,165.1324,353.5497,308.9591
5658,Tryptophan N-glucoside,882.6050,25.2865,339.2979,328.8618,547.7629,394.7974,27.5363,57.3540,461.4065,...,770.3846,765.3990,922.4892,265.3067,97.8000,837.7626,563.9811,738.4937,137.3807,892.9729
5659,"6-Amino-5-formamido-1,3-dimethyluracil",441.6934,485.2161,78.5605,370.8682,342.3166,778.4752,56.2918,695.7316,262.4303,...,670.5072,845.0429,672.2825,681.5266,602.5948,980.9233,31.5021,250.5693,687.8369,463.4555


# Bulk RNASeq data

## Create synthetic data

Load features

In [32]:
features = open("omic_features/hg38_ensembl_ids.txt").read().splitlines()
print(f"Number of features: {len(features)}")

Number of features: 60649


In [33]:
duration, data = simulate_data("gene", features, minValue=0, maxValue=1000, integers=True)

In [34]:
print(f"Time to create simulated data: {duration} s")

Time to create simulated data: 0.88 s


Get memory usage of simulated data

In [35]:
memoryUsage = memory_usage_simulated_data(data)

In [36]:
print(f"Memory usage of simulated data: {memoryUsage}")

Memory usage of simulated data: 49.9 MB


## Store synthetic data

In [37]:
duration, _ = store_data(data, "rnaseq", "raw", "Raw bulk RNASeq data.")

In [38]:
print(f"Time to store simulated data: {duration} s")

Time to store simulated data: 0.42 s


## Retrieve stored data

In [39]:
duration, layerData = load_layer("rnaseq")

In [40]:
print(f"Time to load stored layer: {duration} s")

Time to load stored layer: 0.15 s


In [41]:
layerData

Unnamed: 0,gene,SA001,SA002,SA003,SA004,SA005,SA006,SA007,SA008,SA009,...,SA091,SA092,SA093,SA094,SA095,SA096,SA097,SA098,SA099,SA100
0,ENSG00000223972,698,944,287,810,161,665,806,739,322,...,884,704,889,389,740,654,949,982,542,661
1,ENSG00000227232,410,745,235,34,808,923,806,655,898,...,294,902,446,593,230,937,709,439,936,791
2,ENSG00000278267,324,645,403,207,231,457,433,694,721,...,242,315,770,890,45,959,665,660,23,397
3,ENSG00000243485,858,323,446,933,665,967,428,455,752,...,707,166,42,987,70,538,683,750,736,866
4,ENSG00000284332,32,867,836,595,153,71,441,845,210,...,58,886,414,734,667,964,609,850,856,431
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60600,ENSG00000198695,475,64,149,615,335,635,32,920,397,...,43,254,94,244,702,417,721,941,78,548
60601,ENSG00000210194,673,438,743,489,565,625,461,821,82,...,513,453,118,30,136,579,349,834,797,542
60602,ENSG00000198727,567,632,940,112,900,304,443,525,351,...,71,547,104,597,658,428,322,786,563,628
60603,ENSG00000210195,434,985,226,221,16,22,461,341,446,...,559,676,598,420,673,703,670,585,482,30


## Retrieve all stored features for sample

In [42]:
duration, sample = get_sample("rnaseq", "SA090")

In [43]:
print(f"Time to load sample with {len(sample)} features: {duration} s")

Time to load sample with 60605 features: 0.05 s


In [44]:
print(f"Memory footprint of data: {memory_usage_simulated_data(pd.Series(sample))}")

Memory footprint of data: 473.6 KB


## Add new sample

In [45]:
duration, _ = add_sample("rnaseq", "SA101", sample)

In [46]:
print(f"Time to add new sample with {len(sample)} features: {duration} s")

Time to add new sample with 60605 features: 0.14 s


# Gut microbiome data

## Create synthetic data

Load features

In [47]:
features = open("omic_features/species.txt").read().splitlines()
print(f"Number of features: {len(features)}")

Number of features: 6914


In [48]:
duration, data = simulate_data("species", features, minValue=0, maxValue=1000, integers=True)

In [49]:
print(f"Time to create simulated data: {duration} s")

Time to create simulated data: 0.13 s


Get memory usage of simulated data

In [50]:
memoryUsage = memory_usage_simulated_data(data)

In [51]:
print(f"Memory usage of simulated data: {memoryUsage}")

Memory usage of simulated data: 5.8 MB


## Store synthetic data

In [52]:
duration, _ = store_data(data, "microbiome", "raw", "Raw gut microbiome data.")

In [53]:
print(f"Time to store simulated data: {duration} s")

Time to store simulated data: 0.34 s


## Retrieve stored data

In [54]:
duration, layerData = load_layer("microbiome")

In [55]:
print(f"Time to load stored layer: {duration} s")

Time to load stored layer: 0.07 s


In [56]:
layerData

Unnamed: 0,species,SA001,SA002,SA003,SA004,SA005,SA006,SA007,SA008,SA009,...,SA091,SA092,SA093,SA094,SA095,SA096,SA097,SA098,SA099,SA100
0,Bacteroides uniformis,511,348,820,450,841,77,418,683,342,...,755,784,45,979,522,531,999,59,253,146
1,Bacteroides ovatus,644,431,772,913,790,290,366,239,251,...,15,270,279,903,963,958,489,429,541,794
2,Bacteroides vulgatus,411,267,241,212,574,919,685,59,539,...,882,739,998,418,94,493,528,27,134,784
3,Blautia obeum,433,564,113,503,675,528,722,934,249,...,548,363,902,807,146,633,744,334,607,349
4,[Eubacterium] rectale,647,167,946,617,638,652,971,297,197,...,130,207,417,469,661,225,389,730,268,122
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6909,Rathayibacter festucae,638,269,527,126,671,94,30,383,22,...,422,358,582,128,992,69,410,838,350,826
6910,Thermosipho sp. DSM 6568,642,821,623,23,578,542,131,999,410,...,198,20,442,797,371,85,456,566,179,158
6911,Demequina globuliformis,50,212,712,532,721,63,141,430,642,...,442,64,41,960,954,676,791,994,162,63
6912,Streptomyces abyssalis,182,490,323,93,919,12,813,746,283,...,844,832,423,274,774,488,706,606,417,877


# Genomic data (VCF)

## Download data

The synthetic VCF file is hosted on Zenodo, where it can be downloaded.

[![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.12790872.svg)](https://doi.org/10.5281/zenodo.12790872)

## Store synthetic data

In [57]:
@timeit
def store_vcf(filename, layer='vcf', sep='\t', chunksize=100000):
    """
    Import a delimited text file into an omic layer via ``from_csv``.

    Parameters
    ----------
    filename : path of the file to import.
    layer : name of the layer to create; defaults to ``'vcf'``.
    sep : field separator passed to ``from_csv``; defaults to a tab.
    chunksize : chunk size passed to ``from_csv``; defaults to 100000.

    The defaults reproduce the previously hard-coded values, so calling
    ``store_vcf(filename)`` behaves as before. The function itself returns
    ``None``.
    """
    omi.layers.from_csv(layer=layer, filename=filename, sep=sep, chunksize=chunksize)
    return None

duration, _ = store_vcf("simulated.vcf.gz")

In [58]:
print(f"Time to store simulated data: {duration} s")

Time to store simulated data: 1928.88 s


Add tag and description to layer

In [59]:
omi.layers['vcf'].set_tag("raw")
omi.layers['vcf'].set_info("Cohort imputed VCF.")

## Retrieve all stored features for sample

In [60]:
duration, sample = get_sample("vcf", "SA090")

In [61]:
print(f"Time to load sample with {len(sample)} features: {duration} s")

Time to load sample with 9640953 features: 1.86 s


In [62]:
print(f"Memory footprint of data: {memory_usage_simulated_data(pd.Series(sample))}")

Memory footprint of data: 803.0 MB


## Add new sample

In [63]:
duration, _ = add_sample("vcf", "SA101", sample)

In [64]:
print(f"Time to add sample with {len(sample)} features: {duration} s")

Time to add sample with 9640953 features: 9.87 s


## Retrieve stored data

The VCF header prefixes the 'CHROM' column with the character "#". Rename the column to remove that character.

In [65]:
omi.layers['vcf'].rename("'#CHROM'", 'CHROM')

In [66]:
@timeit
def parse_vcf(chromo, pos, columns):
    """
    Fetch ``columns`` of the 'vcf' layer for a single position.

    Parameters
    ----------
    chromo : chromosome label matched against ``CHROM``, or ``None`` to
        skip the chromosome filter.
    pos : position matched against ``POS``.
    columns : list of column names to return.

    When ``chromo`` is given, the layer is queried for rows matching both
    chromosome and position and the result is restricted to ``columns``.
    When ``chromo`` is ``None``, ``select`` is used with ``POS`` as the
    filter column instead; the notebook output for that case shows the
    ``POS`` column alongside the requested ones.
    """
    if chromo is not None:
        # POS is compared as a bare value rather than a quoted string,
        # consistent with the other POS filters in this notebook.
        result = omi.layers['vcf'].query(f"CHROM == '{chromo}' and POS == {pos}")[columns]
    else:
        result = omi.layers['vcf'].select(cols=columns, where='POS', values=pos)
    return result

@timeit
def parse_range_vcf(chromo, start, end, columns):
    """
    Fetch ``columns`` of the 'vcf' layer for rows on chromosome ``chromo``
    whose ``POS`` satisfies ``BETWEEN start AND end``.
    """
    condition = f"CHROM == '{chromo}' and POS BETWEEN {start} AND {end}"
    return omi.layers['vcf'].query(condition)[columns]
    

In [67]:
duration, result = parse_vcf('chr3', 100000, ['ID', 'SA010', 'SA090'])
result     

Unnamed: 0_level_0,ID,SA010,SA090
rowid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1654040,3:100000:G:T,"1|0:0.184:0.051,0.062:0.353,0.285,0.362","0|1:0.363:0.761,0.894:0.995,0.002,0.003"


In [68]:
print(f"Time to parse vcf: {duration} s")  

Time to parse vcf: 0.1 s


In [69]:
duration, result = parse_vcf('chr22', 100000, ['ID', 'SA010', 'SA090'])
result

Unnamed: 0_level_0,ID,SA010,SA090
rowid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
9588773,22:100000:T:A,"1|1:0.542:0.438,0.166:0.957,0.016,0.027","1|0:0.888:0.686,0.946:0.231,0.194,0.575"


In [70]:
print(f"Time to parse vcf: {duration} s")

Time to parse vcf: 0.1 s


In [71]:
duration, result = parse_vcf(chromo=None, pos=100000, columns=['ID', 'SA010', 'SA090'])
result

Unnamed: 0_level_0,POS,ID,SA010,SA090
rowid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
90000,100000,1:100000:G:A,"0|1:0.641:0.88,0.203:0.934,0.052,0.014","0|1:0.322:0.518,0.689:0.519,0.382,0.099"
832932,100000,2:100000:C:T,"1|0:0.144:0.462,0.846:0.759,0.197,0.044","1|0:0.72:0.378,0.539:0.059,0.664,0.277"
1654040,100000,3:100000:G:T,"1|0:0.184:0.051,0.062:0.353,0.285,0.362","0|1:0.363:0.761,0.894:0.995,0.002,0.003"
2355176,100000,4:100000:C:G,"1|0:0.363:0.021,0.732:0.172,0.233,0.595","1|0:0.119:0.441,0.84:0.413,0.083,0.504"
3066329,100000,5:100000:C:G,"0|0:0.423:0.948,0.166:0.015,0.454,0.531","0|0:0.62:0.254,0.99:0.903,0.049,0.048"
3702775,100000,6:100000:C:A,"1|1:0.17:0.174,0.625:0.935,0.006,0.059","1|0:0.07:0.751,0.039:0.772,0.095,0.133"
4349834,100000,7:100000:A:C,"1|1:0.498:0.045,0.372:0.818,0.081,0.101","0|1:0.078:0.716,0.819:0.151,0.258,0.591"
4911808,100000,8:100000:T:C,"0|1:0.757:0.785,0.161:0.778,0.207,0.015","1|0:0.185:0.632,0.472:0.83,0.133,0.037"
5459234,100000,9:100000:A:C,"0|1:0.774:0.915,0.858:0.975,0.003,0.022","1|0:0.732:0.647,0.803:0.607,0.005,0.388"
5876401,100000,10:100000:T:A,"1|1:0.235:0.982,0.52:0.729,0.11,0.161","0|1:0.041:0.004,0.031:0.805,0.015,0.18"


In [72]:
print(f"Time to parse vcf: {duration} s")

Time to parse vcf: 0.09 s


In [73]:
duration, result = parse_range_vcf('chr15', 50000, 50010, ['ID', 'SA010', 'SA090'])
result

Unnamed: 0_level_0,ID,SA010,SA090
rowid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
7937317,15:50000:A:G,"1|1:0.502:0.24,0.938:0.434,0.19,0.376","1|0:0.975:0.318,0.538:0.176,0.674,0.15"
7937318,15:50001:C:T,"1|1:0.214:0.732,0.514:0.236,0.158,0.606","0|0:0.732:0.979,0.842:0.1,0.066,0.834"
7937319,15:50002:A:G,"1|1:0.396:0.537,0.581:0.534,0.277,0.189","0|0:0.446:0.276,0.065:0.747,0.047,0.206"
7937320,15:50003:A:G,"0|0:0.677:0.191,0.832:0.589,0.254,0.157","0|1:0.336:0.97,0.848:0.737,0.194,0.069"
7937321,15:50004:A:G,"1|0:0.947:0.946,0.091:0.468,0.477,0.055","0|1:0.252:0.284,0.098:0.888,0.037,0.075"
7937322,15:50005:T:G,"1|0:0.055:0.06,0.958:0.846,0.102,0.052","1|0:0.344:0.956,0.419:0.204,0.495,0.301"
7937323,15:50006:C:A,"0|0:0.108:0.658,0.017:0.771,0.056,0.173","0|1:0.85:0.109,0.291:0.932,0.028,0.04"
7937324,15:50007:T:C,"1|1:0.53:0.347,0.236:0.381,0.571,0.048","1|1:0.764:0.96,0.585:0.152,0.184,0.664"
7937325,15:50008:T:A,"1|1:0.542:0.539,0.925:0.344,0.492,0.164","1|1:0.799:0.346,0.185:0.125,0.456,0.419"
7937326,15:50009:C:T,"0|1:0.699:0.793,0.185:0.085,0.404,0.511","0|1:0.741:0.316,0.701:0.717,0.029,0.254"


In [74]:
print(f"Time to parse vcf: {duration} s")

Time to parse vcf: 0.11 s


## Retrieve stored data in parallel

For parallel data retrieval, the database should be accessed in read-only mode.

In [75]:
omi = Omilayers("project.duckdb", read_only=True)

In [76]:
def getSamples(chromo, pos):
    """
    Query the 'vcf' layer for one chromosome/position pair.

    Returns a ``(chromo, pos, values)`` tuple, where ``values`` is the
    ``.values`` array of the SA096, SA098 and SA099 columns of the match.
    """
    condition = f"CHROM == '{chromo}' and POS == {pos}"
    matches = omi.layers['vcf'].query(condition)
    return (chromo, pos, matches[['SA096', 'SA098', 'SA099']].values)

@timeit
def queryInParallel(queries, processes=8):
    """
    Run ``getSamples`` over ``queries`` using a pool of worker processes.

    Parameters
    ----------
    queries : iterable of argument tuples, each unpacked into a
        ``getSamples`` call by ``Pool.starmap``.
    processes : number of worker processes; defaults to 8, the value that
        was previously hard-coded.

    Returns the list of ``getSamples`` results in the order of ``queries``.
    """
    # The context manager tears the pool down once all results are in.
    with multiprocessing.Pool(processes=processes) as pool:
        results = pool.starmap(getSamples, queries)
    return results

# One (chromosome, position) pair per position 100001-100099 on each of
# chr1, chr10 and chr20, grouped by chromosome.
queries = [
    (f"chr{chromosome}", 100000 + offset)
    for chromosome in [1, 10, 20]
    for offset in range(1, 100)
]
        
duration, results = queryInParallel(queries)

In [77]:
results[:5]

[('chr1',
  100001,
  array([['0|0:0.508:0.702,0.432:0.962,0.013,0.025',
          '1|1:0.913:0.092,0.546:0.795,0.028,0.177',
          '1|0:0.721:0.193,0.479:0.051,0.172,0.777']], dtype=object)),
 ('chr1',
  100002,
  array([['1|0:0.25:0.389,0.383:0.668,0.059,0.273',
          '0|1:0.681:0.042,0.261:0.72,0.034,0.246',
          '0|0:0.455:0.177,0.445:0.776,0.003,0.221']], dtype=object)),
 ('chr1',
  100003,
  array([['0|1:0.981:0.986,0.122:0.157,0.182,0.661',
          '1|0:0.513:0.84,0.457:0.292,0.133,0.575',
          '1|1:0.766:0.471,0.88:0.145,0.423,0.432']], dtype=object)),
 ('chr1',
  100004,
  array([['0|1:0.676:0.346,0.759:0.183,0.637,0.18',
          '1|0:0.061:0.735,0.559:0.236,0.309,0.455',
          '1|1:0.895:0.97,0.068:0.275,0.665,0.06']], dtype=object)),
 ('chr1',
  100005,
  array([['1|1:0.364:0.393,0.319:0.193,0.313,0.494',
          '0|1:0.569:0.968,0.188:0.516,0.027,0.457',
          '0|1:0.896:0.018,0.915:0.057,0.866,0.077']], dtype=object))]

In [78]:
print(f"Time to parse vcf in parallel: {duration} s")

Time to parse vcf in parallel: 8.47 s


# View stored omic layers

In [79]:
omi.layers

       name tag       shape                        info
     cohort raw       100x4            Cohort features.
blood_metas raw   37228x101 Raw blood metabolomic data.
urine_metas raw    5661x101 Raw urine metabolomic data.
     rnaseq raw   60605x102       Raw bulk RNASeq data.
 microbiome raw    6914x101    Raw gut microbiome data.
        vcf raw 9640953x110         Cohort imputed VCF.