## Dask GWAS 1KG QC Prototype

This analysis runs variant/sample call rate QC on 1KG (1000 Genomes) data using Dask, so that its efficiency can be compared to Glow/Hail on the same operation.

In [1]:
from pysnptools.snpreader import Bed
from dask.distributed import Client, progress
import dask.array as da
import os.path as osp
import numpy as np
%run codecs.py
# Path prefix of the PLINK fileset; the `.bed` extension is appended where the file itself is sized on disk
plink_file = osp.expanduser('~/data/gwas/tutorial/2_PS_GWAS/ALL.2of4intersection.20100804.genotypes')
# Shape that the QC-filtered call matrix is asserted to have in the timing cells
expected_shape = (8240745, 629) # Expected (n_variant, n_sample) after QC filtering

In [2]:
# Create a wrapper around pysnptools Bed reader class so that dask can 
# access slices directly from PLINK files
class BedArray(object):
    """Array-like adapter that lets dask slice a pysnptools ``Bed`` reader.

    The wrapped reader is indexed as (sample, variant); this adapter presents
    the transpose, i.e. a 2-D ``uint8`` array of shape (n_variant, n_sample).

    Values returned by ``__getitem__`` are allele counts shifted by one so
    that missing calls can be stored in an unsigned type:
    0 = missing, 1..3 = original count 0..2.

    Parameters
    ----------
    bed : object
        Reader exposing ``sid_count``, ``iid_count`` and tuple indexing that
        returns an object with ``read(dtype=...)`` whose result has a ``val``
        array (as the pysnptools ``Bed`` class used in this notebook does).
    """

    def __init__(self, bed):
        self.bed = bed
        # Attributes dask inspects when wrapping an array-like object
        self.shape = (bed.sid_count, bed.iid_count)
        self.dtype = np.uint8
        self.ndim = 2

    def __getitem__(self, idx):
        """Read the slice ``idx`` = (variant_index, sample_index) as ``uint8``.

        Returns
        -------
        numpy.ndarray
            Array of shape (n_selected_variant, n_selected_sample) with
            missing calls encoded as 0.
        """
        assert isinstance(idx, tuple)
        # The reader expects (sample, variant) order, so reverse the index
        # on the way in and transpose the values on the way out
        chunk = self.bed[idx[::-1]].read(dtype=np.float32)
        arr = chunk.val.T
        # Add one to leave allele count in [0, 3] (0 = missing)
        arr = np.nan_to_num(arr, nan=-1) + 1
        # No per-chunk printing here: this runs once for every chunk read
        return arr.astype(np.uint8)

In [3]:
def get_client(n_workers, n_threads=1, total_memory_gb=128):
    """Start a local, process-based dask cluster and return a client for it.

    Parameters
    ----------
    n_workers : int
        Number of worker processes to start.
    n_threads : int, optional
        Threads per worker (default 1).
    total_memory_gb : int, optional
        Memory budget in GB that is split evenly (integer division) across
        the workers to form each worker's memory limit. Defaults to 128,
        the value previously hard-coded here.

    Returns
    -------
    Client
        Connected client with a ``CodecPlugin`` registered on the workers.
    """
    # Per-worker limit, e.g. 128 GB over 4 workers -> '32GB'
    ml = str(total_memory_gb // n_workers)
    client = Client(processes=True, threads_per_worker=n_threads, n_workers=n_workers, memory_limit=ml + 'GB')
    # Register the codec plugin so it is set up on every worker process
    client.register_worker_plugin(CodecPlugin())
    return client

In [4]:
# Define functions for basic QC ops

def filter_by_variant_call_rate(m, threshold):
    """Keep the rows (variants) of ``m`` whose row mean is at least ``threshold``.

    ``m`` is a 2-D (variant, sample) array; when it is a boolean presence
    matrix the row mean is the variant call rate.
    """
    row_rate = m.mean(axis=1)
    keep_rows = row_rate >= threshold
    return m[keep_rows, :]

def filter_by_sample_call_rate(m, threshold):
    """Keep the columns (samples) of ``m`` whose column mean is at least ``threshold``.

    ``m`` is a 2-D (variant, sample) array; when it is a boolean presence
    matrix the column mean is the sample call rate.
    """
    col_rate = m.mean(axis=0)
    keep_cols = col_rate >= threshold
    return m[:, keep_cols]

def qc(m, thresholds=(.8, .98)):
    """Run successive variant-then-sample call rate filters over ``m``.

    Parameters
    ----------
    m : dask array
        2-D boolean (variant, sample) presence matrix.
    thresholds : sequence of float, optional
        Minimum call rates, applied in order; each one is used for a variant
        filter followed by a sample filter. The default ``(.8, .98)``
        reproduces the original fixed two-pass QC.

    Returns
    -------
    dask array
        The filtered matrix with its chunk sizes computed.
    """
    for threshold in thresholds:
        m = filter_by_variant_call_rate(m, threshold)
        m = filter_by_sample_call_rate(m, threshold)
    # Boolean-mask indexing leaves the dask chunk sizes unknown; computing
    # them forces graph computation so that `.shape` is concrete
    m.compute_chunk_sizes()
    return m

### QC Over PLINK

Test QC times over uncompressed PLINK data.

In [5]:
# Expose the PLINK data as a lazy dask array through the BedArray adapter.
# Each chunk spans 50,000 variants and all 629 samples, so the array is split along the variant axis only;
# lock=False means dask does not wrap reads from the adapter in a lock
gt = da.from_array(BedArray(Bed(plink_file, count_A1=True)), chunks=(50000, 629), lock=False)
gt

(0, 0)


Unnamed: 0,Array,Chunk
Bytes,16.03 GB,31.45 MB
Shape,"(25488488, 629)","(50000, 629)"
Count,511 Tasks,510 Chunks
Type,uint8,numpy.ndarray
"Array Chunk Bytes 16.03 GB 31.45 MB Shape (25488488, 629) (50000, 629) Count 511 Tasks 510 Chunks Type uint8 numpy.ndarray",629  25488488,

Unnamed: 0,Array,Chunk
Bytes,16.03 GB,31.45 MB
Shape,"(25488488, 629)","(50000, 629)"
Count,511 Tasks,510 Chunks
Type,uint8,numpy.ndarray


In [6]:
# Start a local cluster with 4 worker processes for the PLINK-backed run
client = get_client(n_workers=4)
client

0,1
Client  Scheduler: tcp://127.0.0.1:36627  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 4  Cores: 4  Memory: 128.00 GB


In [7]:
%%time
# Determine the shape of the filtered dataset and compare it to the expected shape.
# The call matrix (gt) holds allele counts in which 0 marks a missing call, so `gt > 0` is a
# boolean presence matrix and its mean along either axis is the call rate for that axis.
assert qc(gt > 0).shape == expected_shape

CPU times: user 20.4 s, sys: 7.51 s, total: 27.9 s
Wall time: 1min 24s


In [8]:
!du -ch {plink_file}.bed

3.8G	/home/eczech/data/gwas/tutorial/2_PS_GWAS/ALL.2of4intersection.20100804.genotypes.bed
3.8G	total


### QC over Bitpacked Zarr

Test QC times over Blosc + LZ4 compression of bitpacked allele counts.

In [9]:
# Place the zarr copy under /tmp, reusing the base name of the PLINK fileset
zarr_path = osp.join('/tmp', osp.basename(plink_file))
zarr_path

'/tmp/ALL.2of4intersection.20100804.genotypes'

In [10]:
%%time
# Write the zarr files first, passing PackGeneticBits as a storage filter;
# overwrite=True replaces any store already present at zarr_path
da.to_zarr(gt, url=zarr_path, overwrite=True, filters=[PackGeneticBits()])

CPU times: user 13.5 s, sys: 7.04 s, total: 20.5 s
Wall time: 1min 48s


*Reset Client*: Use the maximum number of workers (each with less memory) now that the far more memory-intensive PLINK read is done.

In [11]:
# Close the existing client, if one is defined in this kernel, before starting a larger worker pool
if 'client' in globals():
    client.close()
client = get_client(n_workers=16)
client

0,1
Client  Scheduler: tcp://127.0.0.1:36977  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 16  Cores: 16  Memory: 128.00 GB


In [12]:
# Rebind gt to a lazy dask array backed by the zarr store at zarr_path
gt = da.from_zarr(url=zarr_path)
gt

Unnamed: 0,Array,Chunk
Bytes,16.03 GB,31.45 MB
Shape,"(25488488, 629)","(50000, 629)"
Count,511 Tasks,510 Chunks
Type,uint8,numpy.ndarray
"Array Chunk Bytes 16.03 GB 31.45 MB Shape (25488488, 629) (50000, 629) Count 511 Tasks 510 Chunks Type uint8 numpy.ndarray",629  25488488,

Unnamed: 0,Array,Chunk
Bytes,16.03 GB,31.45 MB
Shape,"(25488488, 629)","(50000, 629)"
Count,511 Tasks,510 Chunks
Type,uint8,numpy.ndarray


In [13]:
%%time
# Same QC and shape check as the PLINK-backed run, now with gt read from the zarr store
assert qc(gt > 0).shape == expected_shape

CPU times: user 11.1 s, sys: 557 ms, total: 11.7 s
Wall time: 36.7 s


In [7]:
!du -ch $zarr_path

1.4G	/tmp/ALL.2of4intersection.20100804.genotypes
1.4G	total
