In [1]:
import functools
import io
import multiprocessing
import time

import numpy as np
import pandas as pd
from omilayers import Omilayers

# Connect to database

In [2]:
omi = Omilayers("project.sqlite", engine='sqlite')

# Helper functions

In [3]:
# Size of the simulated cohort; the helper functions below read this global.
Nsamples = 100
# Sample identifiers "SA001" ... "SA100" (number zero-padded to three digits).
sampleIDs = [f"SA{number:03d}" for number in range(1, Nsamples + 1)]

def timeit(func):
    """
    Decorator that times the execution of a function.

    The decorated function no longer returns its own result directly.
    Instead it returns a tuple ``(duration, result)`` where ``duration`` is
    the elapsed time in seconds, rounded to two decimals, and ``result`` is
    whatever ``func`` returned.
    """
    # Preserve the wrapped function's name and docstring so that help(),
    # tracebacks and introspection still refer to the original function.
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        # perf_counter() is a monotonic, high-resolution clock intended for
        # measuring intervals; time.time() can jump if the system clock is
        # adjusted while the function is running.
        start_time = time.perf_counter()
        result = func(*args, **kwargs)
        end_time = time.perf_counter()
        duration = np.round(end_time - start_time, 2)
        return (duration, result)

    return wrapper

@timeit
def simulate_data(featuresName, features, minValue, maxValue, integers=False):
    """
    Build a synthetic features-by-samples table.

    For every entry of ``features`` a vector of ``Nsamples`` random values is
    drawn: floats from ``np.random.uniform(minValue, maxValue)`` rounded to
    four decimals by default, or integers from
    ``np.random.randint(minValue, maxValue)`` when ``integers`` is True.

    The vectors are keyed by feature name, so a name that occurs more than
    once in ``features`` yields a single row (holding the last draw).

    The resulting DataFrame has the feature names in a column called
    ``featuresName`` followed by one column per entry of ``sampleIDs``.
    Because of the ``timeit`` decorator, callers receive
    ``(duration, DataFrame)``.
    """
    def drawValues():
        # One random vector covering all samples for a single feature.
        if integers:
            return np.random.randint(minValue, maxValue, Nsamples)
        return np.round(np.random.uniform(minValue, maxValue, Nsamples), 4)

    valuesByFeature = {feature: drawValues() for feature in features}
    # Samples are the index at construction time; transpose so that features
    # become rows, then move the feature names into a regular column.
    table = (
        pd.DataFrame(valuesByFeature, index=sampleIDs)
        .T
        .reset_index()
        .rename(columns={"index": featuresName})
    )
    return table

def memory_usage_simulated_data(data):
    """
    Report the deep memory footprint of a pandas object as text.

    The figure is scraped from the summary written by ``data.info``: the
    report is captured in an in-memory buffer and the text after the last
    colon of its memory-usage line (e.g. "12.2 KB") is returned.
    """
    with io.StringIO() as report:
        data.info(memory_usage='deep', buf=report)
        reportLines = report.getvalue().split("\n")
    # Second-to-last element: splitting leaves a trailing empty string when
    # the report ends with a newline, so the memory line sits just before it.
    summaryLine = reportLines[-2]
    _, _, footprint = summaryLine.rpartition(":")
    return footprint.strip()

@timeit
def store_data(dataToStore, layerName, tagName, description):
    """
    Save a table as an omic layer and annotate it.

    ``dataToStore`` is assigned to the layer ``layerName`` of the global
    ``omi`` handle, after which the layer's tag and info text are set to
    ``tagName`` and ``description``. Nothing is returned, so through the
    ``timeit`` decorator callers receive ``(duration, None)``.
    """
    omi.layers[layerName] = dataToStore
    # Annotate only after the layer has been written.
    omi.layers[layerName].set_tag(tagName)
    omi.layers[layerName].set_info(description)

@timeit
def load_layer(layerName):
    """
    Fetch a complete stored omic layer.

    Returns whatever ``to_df()`` yields for the layer ``layerName`` of the
    global ``omi`` handle. Through the ``timeit`` decorator callers receive
    ``(duration, layerData)``.
    """
    return omi.layers[layerName].to_df()

@timeit
def get_sample(layer, sample):
    """
    Read a single sample from a stored layer.

    Indexes the layer ``layer`` of the global ``omi`` handle with ``sample``
    and returns the outcome. Through the ``timeit`` decorator callers
    receive ``(duration, result)``.
    """
    return omi.layers[layer][sample]

@timeit
def add_sample(layer, sampleName, data):
    """
    Write a sample into a stored layer.

    Assigns ``data`` to the key ``sampleName`` of the layer ``layer`` on the
    global ``omi`` handle. Nothing is returned, so through the ``timeit``
    decorator callers receive ``(duration, None)``.
    """
    omi.layers[layer][sampleName] = data



# Cohort features

## Create synthetic data

In [4]:
# Synthetic cohort table: one row per sample with a random gender, age and BMI.
# The random generator is not seeded here, so values differ between runs.
data = pd.DataFrame(dict(
    sample_id=sampleIDs,
    gender=np.random.choice(["female", "male"], Nsamples),
    age=np.random.randint(20, 50, Nsamples),
    bmi=np.round(np.random.uniform(20, 40, Nsamples), 2),
))

Get memory usage of simulated data

In [5]:
memoryUsage = memory_usage_simulated_data(data)

In [6]:
print(f"Memory usage of simulated data: {memoryUsage}")

Memory usage of simulated data: 12.2 KB


## Store synthetic data

In [7]:
duration, _ = store_data(data, "cohort", "raw", "Cohort features.")

In [8]:
print(f"Time to store simulated data: {duration} s")

Time to store simulated data: 0.03 s


## Retrieve stored data

In [9]:
duration, layerData = load_layer("cohort")

In [10]:
print(f"Time to load stored layer: {duration} s")

Time to load stored layer: 0.0 s


In [11]:
layerData

Unnamed: 0,sample_id,gender,age,bmi
1,SA001,female,22,23.83
2,SA002,female,45,24.63
3,SA003,male,39,29.54
4,SA004,male,21,28.51
5,SA005,female,35,36.35
...,...,...,...,...
96,SA096,male,47,33.93
97,SA097,female,20,24.12
98,SA098,female,23,37.86
99,SA099,male,27,27.00


# Blood metabolomic data

## Create synthetic data

Load features

In [12]:
features = pd.read_csv("omic_features/blood_metabolites-2024-06-25.csv")['NAME']
print(f"Number of features: {len(features)}")

Number of features: 37228


In [13]:
duration, data = simulate_data("metabolite", features, minValue=0, maxValue=1000, integers=False)

In [14]:
print(f"Time to create simulated data: {duration} s")

Time to create simulated data: 0.61 s


Get memory usage of simulated data

In [15]:
memoryUsage = memory_usage_simulated_data(data)

In [16]:
print(f"Memory usage of simulated data: {memoryUsage}")

Memory usage of simulated data: 31.3 MB


## Store synthetic data

In [17]:
duration, _ = store_data(data, "blood_metas", "raw", "Raw blood metabolomic data.")

In [18]:
print(f"Time to store simulated data: {duration} s")

Time to store simulated data: 0.89 s


## Retrieve stored data

In [19]:
duration, layerData = load_layer("blood_metas")

In [20]:
print(f"Time to load stored layer: {duration} s")

Time to load stored layer: 1.57 s


In [21]:
layerData

Unnamed: 0,metabolite,SA001,SA002,SA003,SA004,SA005,SA006,SA007,SA008,SA009,...,SA091,SA092,SA093,SA094,SA095,SA096,SA097,SA098,SA099,SA100
1,1-Methylhistidine,45.3706,478.6055,328.2729,377.1951,775.6611,232.0408,307.5129,120.0841,106.7276,...,959.9922,392.5347,276.8086,378.3022,882.3151,499.0548,326.1606,202.9317,636.0588,191.3103
2,"1,3-Diaminopropane",225.4026,872.0941,146.7751,755.5319,732.7744,224.7002,737.7373,733.0400,410.1918,...,823.6695,982.3547,89.2593,775.2037,301.4046,342.9952,476.2215,164.1154,933.5161,382.7093
3,2-Ketobutyric acid,700.1429,772.7507,307.6208,134.1636,553.8676,192.4303,967.0663,820.5887,984.5681,...,132.9107,241.6282,132.8417,35.6768,953.3979,233.4450,11.9798,120.9393,365.6358,20.5123
4,2-Hydroxybutyric acid,102.5729,742.4443,631.6390,252.9539,695.4263,358.3840,47.3662,676.4373,835.8446,...,251.5147,126.7600,982.0232,723.5378,651.4645,370.3369,936.0268,239.9355,173.7366,712.1068
5,2-Methoxyestrone,119.0940,243.2109,344.9199,517.7811,423.5300,806.0178,272.3354,764.6878,628.3809,...,759.6443,813.6800,237.7346,458.7994,663.9205,523.5593,388.8600,571.5247,65.2239,61.3496
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37224,Succinylserine,621.8102,822.8640,543.8633,536.7050,682.2519,526.2239,209.0388,676.9137,70.4114,...,777.6977,240.7427,516.7844,352.4962,367.6868,757.7798,729.1987,639.0870,785.3333,124.5689
37225,4-Hydroxylidocaine,669.6796,522.7420,276.9837,143.7012,263.2380,856.7332,817.9850,110.7852,408.0559,...,657.5754,271.2418,838.7848,458.2431,806.2078,974.7964,787.2245,46.0643,879.9023,811.1177
37226,Tryptophan N-glucoside,400.4810,699.4903,83.5269,228.9070,652.2775,524.0437,538.5030,803.0403,434.3476,...,381.9734,181.5014,902.7167,445.6827,942.7741,438.2394,27.1573,89.1503,317.7183,505.8846
37227,"6-Amino-5-formamido-1,3-dimethyluracil",763.1958,434.0678,785.8578,495.8024,988.3055,975.1518,821.4391,833.9318,52.5221,...,873.8257,154.6516,618.6343,931.8189,729.1797,139.2886,246.8257,2.4672,537.3168,742.0887


# Urine metabolomic data

## Create synthetic data

Load features

In [22]:
features = pd.read_csv("omic_features/urine_metabolites-2024-06-25.csv")['NAME']
print(f"Number of features: {len(features)}")

Number of features: 5661


In [23]:
duration, data = simulate_data("metabolite", features, minValue=0, maxValue=1000, integers=False)

In [24]:
print(f"Time to create simulated data: {duration} s")

Time to create simulated data: 0.09 s


Get memory usage of simulated data

In [25]:
memoryUsage = memory_usage_simulated_data(data)

In [26]:
print(f"Memory usage of simulated data: {memoryUsage}")

Memory usage of simulated data: 4.7 MB


## Store synthetic data

In [27]:
duration, _ = store_data(data, "urine_metas", "raw", "Raw urine metabolomic data.")

In [28]:
print(f"Time to store simulated data: {duration} s")

Time to store simulated data: 0.16 s


## Retrieve stored data

In [29]:
duration, layerData = load_layer("urine_metas")

In [30]:
print(f"Time to load stored layer: {duration} s")

Time to load stored layer: 0.14 s


In [31]:
layerData

Unnamed: 0,metabolite,SA001,SA002,SA003,SA004,SA005,SA006,SA007,SA008,SA009,...,SA091,SA092,SA093,SA094,SA095,SA096,SA097,SA098,SA099,SA100
1,1-Methylhistidine,792.3147,826.0664,120.9468,999.0884,980.6263,460.5441,685.4633,88.6773,599.5874,...,191.7147,201.6404,508.7420,306.9755,560.6245,934.2456,321.2244,759.3123,981.3626,593.3694
2,"1,3-Diaminopropane",630.0177,926.0808,629.3713,531.3176,588.3823,960.9402,668.0743,211.6372,965.2359,...,874.0533,695.9193,747.2220,432.2935,970.6788,694.2447,67.7990,559.3346,151.4413,48.9508
3,2-Ketobutyric acid,6.5942,170.8984,983.4950,968.7197,693.0017,782.7152,617.6278,475.4380,565.4883,...,981.2040,360.9761,782.3894,15.0917,411.9089,500.6740,795.7136,732.8313,542.0177,879.2099
4,2-Hydroxybutyric acid,84.3965,548.5687,287.6209,196.6327,610.0506,881.5024,703.7766,166.0133,17.1374,...,81.2190,473.9587,576.4383,746.4798,413.0391,654.7386,229.2008,49.4307,406.0790,613.5022
5,2-Methoxyestrone,449.9186,597.0519,614.5517,13.4089,410.0955,370.9341,62.0765,493.6511,77.2316,...,887.4833,21.7080,617.3623,33.9727,992.0038,807.8894,539.3513,325.7954,719.8290,357.6753
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5657,Succinylserine,974.5281,205.3004,968.2135,320.2487,31.4898,686.9594,123.1166,753.6679,888.5543,...,652.3773,721.6010,977.8972,935.4676,128.4339,863.8998,808.6005,106.9796,22.2139,212.7434
5658,4-Hydroxylidocaine,357.9649,637.6479,998.4878,62.1030,785.6727,422.4691,479.3546,29.3356,620.6457,...,828.2840,224.1049,200.1766,497.1765,86.8982,742.3863,91.6526,929.2157,654.8987,110.4697
5659,Tryptophan N-glucoside,242.3413,864.5070,974.4031,406.9387,350.9016,272.5244,662.1457,233.6749,853.3353,...,438.1893,790.7892,711.5321,549.6436,332.7752,657.7612,440.8769,774.4058,930.9068,80.6949
5660,"6-Amino-5-formamido-1,3-dimethyluracil",366.2841,16.2556,186.1512,938.3840,644.4475,827.5003,56.3099,156.7200,267.7437,...,856.8287,254.3741,32.6121,360.6982,368.6537,659.7074,97.7146,170.2626,212.4574,504.3694


# Bulk RNASeq data

## Create synthetic data

Load features

In [32]:
# Read one feature name per line. The context manager closes the file handle
# as soon as the content has been read instead of leaving it open.
with open("omic_features/hg38_ensembl_ids.txt") as handle:
    features = handle.read().splitlines()
print(f"Number of features: {len(features)}")

Number of features: 60649


In [33]:
duration, data = simulate_data("gene", features, minValue=0, maxValue=1000, integers=True)

In [34]:
print(f"Time to create simulated data: {duration} s")

Time to create simulated data: 0.85 s


Get memory usage of simulated data

In [35]:
memoryUsage = memory_usage_simulated_data(data)

In [36]:
print(f"Memory usage of simulated data: {memoryUsage}")

Memory usage of simulated data: 49.9 MB


## Store synthetic data

In [37]:
duration, _ = store_data(data, "rnaseq", "raw", "Raw bulk RNASeq data.")

In [38]:
print(f"Time to store simulated data: {duration} s")

Time to store simulated data: 1.23 s


## Retrieve stored data

In [39]:
duration, layerData = load_layer("rnaseq")

In [40]:
print(f"Time to load stored layer: {duration} s")

Time to load stored layer: 3.27 s


In [41]:
layerData

Unnamed: 0,gene,SA001,SA002,SA003,SA004,SA005,SA006,SA007,SA008,SA009,...,SA091,SA092,SA093,SA094,SA095,SA096,SA097,SA098,SA099,SA100
1,ENSG00000223972,392,263,859,549,296,762,741,103,573,...,274,628,68,592,419,932,155,983,133,343
2,ENSG00000227232,480,550,134,849,225,448,259,671,746,...,873,515,513,451,165,891,580,730,83,383
3,ENSG00000278267,239,178,279,712,526,325,966,326,87,...,465,679,730,695,182,748,983,396,409,948
4,ENSG00000243485,727,517,267,877,432,613,743,5,917,...,839,847,527,821,119,999,792,584,314,947
5,ENSG00000284332,846,586,666,773,123,74,613,944,485,...,527,62,626,240,597,723,375,847,300,154
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60601,ENSG00000198695,822,850,746,602,234,265,840,432,372,...,449,697,412,106,344,176,202,640,826,953
60602,ENSG00000210194,32,899,10,399,64,339,108,759,801,...,569,17,266,567,833,327,764,579,119,167
60603,ENSG00000198727,217,168,389,997,137,892,294,775,805,...,391,714,659,548,285,878,585,942,890,750
60604,ENSG00000210195,417,915,760,573,323,362,621,394,4,...,39,28,323,475,510,101,587,154,291,995


## Retrieve all stored features for sample

In [42]:
duration, sample = get_sample("rnaseq", "SA090")

In [43]:
print(f"Time to load sample with {len(sample)} features: {duration} s")

Time to load sample with 60605 features: 0.1 s


In [44]:
print(f"Memory footprint of data: {memory_usage_simulated_data(pd.Series(sample))}")

Memory footprint of data: 473.6 KB


## Add new sample

In [45]:
duration, _ = add_sample("rnaseq", "SA101", sample)

In [46]:
print(f"Time to add new sample with {len(sample)} features: {duration} s")

Time to add new sample with 60605 features: 0.52 s


# Gut microbiome data

## Create synthetic data

Load features

In [47]:
# Read one feature name per line. The context manager closes the file handle
# as soon as the content has been read instead of leaving it open.
with open("omic_features/species.txt") as handle:
    features = handle.read().splitlines()
print(f"Number of features: {len(features)}")

Number of features: 6914


In [48]:
duration, data = simulate_data("species", features, minValue=0, maxValue=1000, integers=True)

In [49]:
print(f"Time to create simulated data: {duration} s")

Time to create simulated data: 0.13 s


Get memory usage of simulated data

In [50]:
memoryUsage = memory_usage_simulated_data(data)

In [51]:
print(f"Memory usage of simulated data: {memoryUsage}")

Memory usage of simulated data: 5.8 MB


## Store synthetic data

In [52]:
duration, _ = store_data(data, "microbiome", "raw", "Raw gut microbiome data.")

In [53]:
print(f"Time to store simulated data: {duration} s")

Time to store simulated data: 0.19 s


## Retrieve stored data

In [54]:
duration, layerData = load_layer("microbiome")

In [55]:
print(f"Time to load stored layer: {duration} s")

Time to load stored layer: 0.25 s


In [56]:
layerData

Unnamed: 0,species,SA001,SA002,SA003,SA004,SA005,SA006,SA007,SA008,SA009,...,SA091,SA092,SA093,SA094,SA095,SA096,SA097,SA098,SA099,SA100
1,Bacteroides uniformis,46,677,487,960,137,504,525,918,644,...,197,757,662,786,325,613,222,601,355,935
2,Bacteroides ovatus,464,455,929,100,363,319,332,356,925,...,243,659,984,959,773,910,727,325,918,615
3,Bacteroides vulgatus,412,905,213,458,372,119,476,455,335,...,629,340,491,518,310,649,123,115,428,145
4,Blautia obeum,209,363,903,177,250,332,883,831,187,...,172,277,718,652,611,380,736,738,942,441
5,[Eubacterium] rectale,246,120,751,878,558,525,84,831,929,...,417,986,521,346,580,682,909,517,387,234
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6910,Rathayibacter festucae,549,517,782,363,668,540,531,105,229,...,26,119,152,6,944,745,480,911,720,231
6911,Thermosipho sp. DSM 6568,981,823,23,431,454,822,898,661,511,...,469,242,445,606,585,71,221,972,775,926
6912,Demequina globuliformis,456,327,585,934,697,817,955,903,3,...,132,103,488,348,780,116,907,932,116,861
6913,Streptomyces abyssalis,533,273,163,681,678,348,86,417,626,...,55,830,207,926,217,335,410,108,737,646


# Genomic data (VCF)

## Download data

The synthetic VCF file is hosted on Zenodo, where it can be downloaded.

[![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.12790872.svg)](https://doi.org/10.5281/zenodo.12790872)

## Store synthetic data

In [57]:
@timeit
def store_vcf(filename):
    """
    Import a tab-separated VCF file into the layer named 'vcf'.

    The file is handed to ``omi.layers.from_csv`` with a chunk size of
    100000. Nothing is returned, so through the ``timeit`` decorator callers
    receive ``(duration, None)``.
    """
    omi.layers.from_csv(layer='vcf', filename=filename, sep='\t', chunksize=100000)

duration, _ = store_vcf("simulated.vcf.gz")

In [58]:
print(f"Time to store simulated data: {duration} s")

Time to store simulated data: 1109.62 s


Add tag and description to layer

In [59]:
omi.layers['vcf'].set_tag("raw")
omi.layers['vcf'].set_info("Cohort imputed VCF.")

## Retrieve all stored features for sample

In [62]:
duration, sample = get_sample("vcf", "SA090")

In [63]:
print(f"Time to load sample with {len(sample)} features: {duration} s")

Time to load sample with 9640953 features: 70.88 s


In [64]:
print(f"Memory footprint of data: {memory_usage_simulated_data(pd.Series(sample))}")

Memory footprint of data: 803.0 MB


## Add new sample

In [65]:
duration, _ = add_sample("vcf", "SA101", sample)

In [66]:
print(f"Time to add new sample with {len(sample)} features: {duration} s")

Time to add new sample with 9640953 features: 316.49 s


## Retrieve stored data

The VCF header includes the character "#" in front of the "CHROM" column name. Rename the column to remove that character.

In [69]:
omi.layers['vcf'].rename("#CHROM", 'CHROM')

In [70]:
@timeit
def parse_vcf(chromo, pos, columns):
    """
    Look up the variants stored at a given position in the 'vcf' layer.

    Parameters
    ----------
    chromo : str or None
        Chromosome label matched against the CHROM column. When None, the
        position is looked up without a chromosome filter.
    pos : int
        Position matched against the POS column.
    columns : list of str
        Columns to return.

    Returns
    -------
    The query result. Through the ``timeit`` decorator callers receive
    ``(duration, result)``.

    Notes
    -----
    The two branches go through different layer methods (``query`` followed
    by column selection vs. ``select``); in the recorded run the ``select``
    branch also returned the POS column alongside ``columns``.
    """
    if chromo is not None:
        # POS is compared as a number, matching the other VCF queries in this
        # notebook, rather than as a quoted string literal.
        result = omi.layers['vcf'].query(f"CHROM == '{chromo}' and POS == {pos}")[columns]
    else:
        result = omi.layers['vcf'].select(cols=columns, where='POS', values=pos)
    return result

@timeit
def parse_range_vcf(chromo, start, end, columns):
    """
    Fetch the variants of one chromosome within a position range.

    Queries the 'vcf' layer for rows whose CHROM equals ``chromo`` and whose
    POS satisfies ``BETWEEN start AND end``, then keeps only ``columns``.
    Through the ``timeit`` decorator callers receive ``(duration, result)``.
    """
    condition = f"CHROM == '{chromo}' and POS BETWEEN {start} AND {end}"
    return omi.layers['vcf'].query(condition)[columns]
    

In [71]:
duration, result = parse_vcf('chr3', 100000, ['ID', 'SA010', 'SA090'])
result     

Unnamed: 0_level_0,ID,SA010,SA090
rowid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1654041,3:100000:G:T,"1|0:0.184:0.051,0.062:0.353,0.285,0.362","0|1:0.363:0.761,0.894:0.995,0.002,0.003"


In [72]:
print(f"Time to parse vcf: {duration} s")  

Time to parse vcf: 33.29 s


In [73]:
duration, result = parse_vcf('chr22', 100000, ['ID', 'SA010', 'SA090'])
result

Unnamed: 0_level_0,ID,SA010,SA090
rowid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
9588774,22:100000:T:A,"1|1:0.542:0.438,0.166:0.957,0.016,0.027","1|0:0.888:0.686,0.946:0.231,0.194,0.575"


In [74]:
print(f"Time to parse vcf: {duration} s")

Time to parse vcf: 33.21 s


In [75]:
duration, result = parse_vcf(chromo=None, pos=100000, columns=['ID', 'SA010', 'SA090'])
result

Unnamed: 0_level_0,POS,ID,SA010,SA090
rowid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
90001,100000,1:100000:G:A,"0|1:0.641:0.88,0.203:0.934,0.052,0.014","0|1:0.322:0.518,0.689:0.519,0.382,0.099"
832933,100000,2:100000:C:T,"1|0:0.144:0.462,0.846:0.759,0.197,0.044","1|0:0.72:0.378,0.539:0.059,0.664,0.277"
1654041,100000,3:100000:G:T,"1|0:0.184:0.051,0.062:0.353,0.285,0.362","0|1:0.363:0.761,0.894:0.995,0.002,0.003"
2355177,100000,4:100000:C:G,"1|0:0.363:0.021,0.732:0.172,0.233,0.595","1|0:0.119:0.441,0.84:0.413,0.083,0.504"
3066330,100000,5:100000:C:G,"0|0:0.423:0.948,0.166:0.015,0.454,0.531","0|0:0.62:0.254,0.99:0.903,0.049,0.048"
3702776,100000,6:100000:C:A,"1|1:0.17:0.174,0.625:0.935,0.006,0.059","1|0:0.07:0.751,0.039:0.772,0.095,0.133"
4349835,100000,7:100000:A:C,"1|1:0.498:0.045,0.372:0.818,0.081,0.101","0|1:0.078:0.716,0.819:0.151,0.258,0.591"
4911809,100000,8:100000:T:C,"0|1:0.757:0.785,0.161:0.778,0.207,0.015","1|0:0.185:0.632,0.472:0.83,0.133,0.037"
5459235,100000,9:100000:A:C,"0|1:0.774:0.915,0.858:0.975,0.003,0.022","1|0:0.732:0.647,0.803:0.607,0.005,0.388"
5876402,100000,10:100000:T:A,"1|1:0.235:0.982,0.52:0.729,0.11,0.161","0|1:0.041:0.004,0.031:0.805,0.015,0.18"


In [76]:
print(f"Time to parse vcf: {duration} s")

Time to parse vcf: 33.19 s


In [77]:
duration, result = parse_range_vcf('chr15', 50000, 50010, ['ID', 'SA010', 'SA090'])
result

Unnamed: 0_level_0,ID,SA010,SA090
rowid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
7937318,15:50000:A:G,"1|1:0.502:0.24,0.938:0.434,0.19,0.376","1|0:0.975:0.318,0.538:0.176,0.674,0.15"
7937319,15:50001:C:T,"1|1:0.214:0.732,0.514:0.236,0.158,0.606","0|0:0.732:0.979,0.842:0.1,0.066,0.834"
7937320,15:50002:A:G,"1|1:0.396:0.537,0.581:0.534,0.277,0.189","0|0:0.446:0.276,0.065:0.747,0.047,0.206"
7937321,15:50003:A:G,"0|0:0.677:0.191,0.832:0.589,0.254,0.157","0|1:0.336:0.97,0.848:0.737,0.194,0.069"
7937322,15:50004:A:G,"1|0:0.947:0.946,0.091:0.468,0.477,0.055","0|1:0.252:0.284,0.098:0.888,0.037,0.075"
7937323,15:50005:T:G,"1|0:0.055:0.06,0.958:0.846,0.102,0.052","1|0:0.344:0.956,0.419:0.204,0.495,0.301"
7937324,15:50006:C:A,"0|0:0.108:0.658,0.017:0.771,0.056,0.173","0|1:0.85:0.109,0.291:0.932,0.028,0.04"
7937325,15:50007:T:C,"1|1:0.53:0.347,0.236:0.381,0.571,0.048","1|1:0.764:0.96,0.585:0.152,0.184,0.664"
7937326,15:50008:T:A,"1|1:0.542:0.539,0.925:0.344,0.492,0.164","1|1:0.799:0.346,0.185:0.125,0.456,0.419"
7937327,15:50009:C:T,"0|1:0.699:0.793,0.185:0.085,0.404,0.511","0|1:0.741:0.316,0.701:0.717,0.029,0.254"


In [78]:
print(f"Time to parse vcf: {duration} s")

Time to parse vcf: 32.61 s


## Retrieve stored data in parallel

In [5]:
def getSamples(chromo, pos):
    """
    Fetch three fixed samples for one chromosome/position pair.

    Queries the 'vcf' layer for the row(s) matching ``chromo`` and ``pos``
    and returns ``(chromo, pos, values)``, where ``values`` is the
    ``.values`` of the SA096, SA098 and SA099 columns of the match.
    """
    condition = f"CHROM == '{chromo}' and POS == {pos}"
    matches = omi.layers['vcf'].query(condition)[['SA096', 'SA098', 'SA099']]
    return (chromo, pos, matches.values)

@timeit
def queryInParallel(queries, processes=8):
    """
    Run ``getSamples`` for many (chromosome, position) pairs in a process pool.

    Parameters
    ----------
    queries : iterable of tuple
        Argument tuples; each one is unpacked into a ``getSamples`` call.
    processes : int, optional
        Number of worker processes in the pool. Defaults to 8, the value
        that was previously hard-coded.

    Returns
    -------
    list
        One ``getSamples`` result per query, in the order of ``queries``.
        Through the ``timeit`` decorator callers receive
        ``(duration, results)``.
    """
    # NOTE(review): every worker calls getSamples, which reads the
    # module-level `omi` handle. Whether that handle can be shared safely
    # across worker processes is not visible here - confirm.
    with multiprocessing.Pool(processes=processes) as pool:
        results = pool.starmap(getSamples, queries)
    return results

# 99 consecutive positions (100001-100099) on each of chr1, chr10 and chr20.
queries = [
    (f"chr{chromosome}", 100000 + offset)
    for chromosome in [1, 10, 20]
    for offset in range(1, 100)
]
        
duration, results = queryInParallel(queries)

In [6]:
results[:5]

[('chr1',
  100001,
  array([['0|0:0.508:0.702,0.432:0.962,0.013,0.025',
          '1|1:0.913:0.092,0.546:0.795,0.028,0.177',
          '1|0:0.721:0.193,0.479:0.051,0.172,0.777']], dtype=object)),
 ('chr1',
  100002,
  array([['1|0:0.25:0.389,0.383:0.668,0.059,0.273',
          '0|1:0.681:0.042,0.261:0.72,0.034,0.246',
          '0|0:0.455:0.177,0.445:0.776,0.003,0.221']], dtype=object)),
 ('chr1',
  100003,
  array([['0|1:0.981:0.986,0.122:0.157,0.182,0.661',
          '1|0:0.513:0.84,0.457:0.292,0.133,0.575',
          '1|1:0.766:0.471,0.88:0.145,0.423,0.432']], dtype=object)),
 ('chr1',
  100004,
  array([['0|1:0.676:0.346,0.759:0.183,0.637,0.18',
          '1|0:0.061:0.735,0.559:0.236,0.309,0.455',
          '1|1:0.895:0.97,0.068:0.275,0.665,0.06']], dtype=object)),
 ('chr1',
  100005,
  array([['1|1:0.364:0.393,0.319:0.193,0.313,0.494',
          '0|1:0.569:0.968,0.188:0.516,0.027,0.457',
          '0|1:0.896:0.018,0.915:0.057,0.866,0.077']], dtype=object))]

In [7]:
print(f"Time to parse vcf in parallel: {duration} s")

Time to parse vcf in parallel: 1800.07 s


# View stored omic layers

In [70]:
omi.layers

       name tag       shape                        info
     cohort raw       100x4            Cohort features.
blood_metas raw   37228x101 Raw blood metabolomic data.
urine_metas raw    5661x101 Raw urine metabolomic data.
     rnaseq raw   60605x101       Raw bulk RNASeq data.
 microbiome raw    6914x101    Raw gut microbiome data.
        vcf raw 9640953x110         Cohort imputed VCF.