In [57]:
import xarray as xr
import numpy as np
import zarr
import numcodecs
from sgkit.testing import simulate_genotype_call_dataset

In [25]:
# ds = simulate_genotype_call_dataset(n_variant=100, n_sample=10, n_ploidy=3, n_allele=10, seed=0)
# ds.to_zarr('/tmp/ds.zarr')

In [105]:
# Simulate a small call dataset to attach synthetic genotype probabilities to.
ds = simulate_genotype_call_dataset(n_variant=1000, n_sample=100, n_ploidy=3, n_allele=2, seed=0)
ds.attrs['contigs'][0] = int(ds.attrs['contigs'][0]) # Fix type for to_zarr

# Length of the trailing 'genotypes' axis of the probability array.
n_genotypes = 3
# `Dataset.sizes` is the dimension-name -> length mapping; using `Dataset.dims`
# as a mapping is deprecated in newer xarray releases.
shape = (ds.sizes['variants'], ds.sizes['samples'], n_genotypes)
rs = np.random.RandomState(0)
# Both draws come from the same RandomState, so their order determines the
# data: the missingness mask is drawn first, the values second.
# Entries whose mask draw is below 0.3 become NaN.
is_missing = rs.rand(*shape) < .3
gp = np.where(is_missing, np.nan, rs.rand(*shape))
# Rescale the non-missing values so that they span exactly [0, 1].
gp -= np.nanmin(gp)
gp /= np.nanmax(gp)
# Pin one call to both extremes plus a missing value so that the boundary
# cases of the encoding are easy to find at index [0, 0].
gp[0, 0, :] = np.array([0.0, 1.0, np.nan])
ds['call_genotype_probability'] = xr.DataArray(gp, dims=('variants', 'samples', 'genotypes'))
ds['call_genotype_probability'] = ds.call_genotype_probability.chunk(chunks=(100, 50))

In [106]:
# Sanity check: after rescaling, the non-missing probabilities should span exactly [0, 1].
ds['call_genotype_probability'].min().compute().item(0), ds['call_genotype_probability'].max().compute().item(0)

(0.0, 1.0)

In [107]:
# Keep only the probability variable; this single-variable dataset is what gets saved below.
ds = ds[['call_genotype_probability']]
ds

Unnamed: 0,Array,Chunk
Bytes,2.40 MB,120.00 kB
Shape,"(1000, 100, 3)","(100, 50, 3)"
Count,21 Tasks,20 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 2.40 MB 120.00 kB Shape (1000, 100, 3) (100, 50, 3) Count 21 Tasks 20 Chunks Type float64 numpy.ndarray",3  100  1000,

Unnamed: 0,Array,Chunk
Bytes,2.40 MB,120.00 kB
Shape,"(1000, 100, 3)","(100, 50, 3)"
Count,21 Tasks,20 Chunks
Type,float64,numpy.ndarray


In [108]:
# Number of missing (NaN) entries that the encoding has to be able to represent.
np.sum(np.isnan(ds.call_genotype_probability.values))

90179

In [109]:
# Encoding check for the largest value: (x - add_offset) / scale_factor with
# x = 1, add_offset = -1/254 and scale_factor = 1/254 must land on the top
# uint8 code, 255.
(1 + 1./254) / (1./254), np.uint8((1 + 1./254) / (1./254))

(255.0, 255)

In [110]:
def save(ds, path):
    """Write `ds` to a Zarr store at `path`, packing genotype probabilities into uint8.

    Every variable is compressed with Blosc/zstd. In addition,
    `call_genotype_probability` is stored as uint8 using a scale/offset
    encoding that decodes as ``stored * scale_factor + add_offset``, i.e.
    ``(stored - 1) / 254``:

    * stored 1   -> 0.0
    * stored 255 -> 1.0
    * stored 0   -> the fill value, used for missing (NaN) entries

    Parameters
    ----------
    ds : dataset to write; it must contain `call_genotype_probability`
        (a KeyError is raised otherwise).
    path : location of the Zarr store; an existing store is overwritten
        (mode="w").
    """
    # shuffle=2 selects Blosc's bit-shuffle filter (see the numcodecs Blosc docs).
    compressor = zarr.Blosc(cname="zstd", clevel=5, shuffle=2)

    # Start with the same compressor for every variable in the dataset.
    encoding = {}
    for name in ds:
        encoding[name] = {"compressor": compressor}

    # Quantize probabilities to one byte, reserving code 0 for missing values.
    encoding['call_genotype_probability'].update(
        dtype='uint8',
        _FillValue=0,
        add_offset=-1./254.,
        scale_factor=1./254.,
    )

    ds.to_zarr(store=path, mode="w", consolidated=True, encoding=encoding)
    
# Write the encoded dataset; the following cells read this store back.
save(ds, '/tmp/ds.1.zarr')

In [111]:
# Read the stored codes as-is (mask_and_scale=False skips fill-value/scale/offset
# decoding) to inspect the raw uint8 values and their range.
gp_enc = xr.open_zarr('/tmp/ds.1.zarr', mask_and_scale=False).call_genotype_probability.values
gp_enc.dtype, gp_enc.min(), gp_enc.max()

(dtype('uint8'), 0, 255)

In [112]:
# Default open decodes the stored codes back to floating point.
# NOTE: this rebinds `gp`, which previously held the original float64 array;
# from here on `gp` is the round-tripped data.
gp = xr.open_zarr('/tmp/ds.1.zarr').call_genotype_probability.values
gp.dtype

dtype('float32')

In [113]:
# Original float64 probabilities (first 3 variants x 3 samples), for comparison with the cells below.
ds.call_genotype_probability.values[:3, :3]

array([[[0.        , 1.        ,        nan],
        [0.06766507, 0.20946076, 0.73306022],
        [0.03978927, 0.55413637, 0.29987843]],

       [[0.09016546, 0.11399314, 0.88251526],
        [       nan, 0.55368075,        nan],
        [       nan,        nan, 0.18646903]],

       [[       nan, 0.83744135, 0.53700164],
        [       nan, 0.84629747, 0.19177986],
        [       nan,        nan, 0.64122095]]])

In [114]:
# The same slice after the uint8 round trip (decoded values).
gp[:3, :3]

array([[[0.        , 1.        ,        nan],
        [0.06692913, 0.20866142, 0.7322835 ],
        [0.03937008, 0.5551181 , 0.2992126 ]],

       [[0.09055118, 0.11417323, 0.88188976],
        [       nan, 0.5551181 ,        nan],
        [       nan,        nan, 0.18503937]],

       [[       nan, 0.8385827 , 0.53543305],
        [       nan, 0.8464567 , 0.19291338],
        [       nan,        nan, 0.6417323 ]]], dtype=float32)

In [115]:
# The same slice as raw stored uint8 codes.
gp_enc[:3, :3]

array([[[  1, 255,   0],
        [ 18,  54, 187],
        [ 11, 142,  77]],

       [[ 24,  30, 225],
        [  0, 142,   0],
        [  0,   0,  48]],

       [[  0, 214, 137],
        [  0, 216,  50],
        [  0,   0, 164]]], dtype=uint8)

In [92]:
# Round-trip check: decoded values must match the originals to within atol=.01,
# with NaNs in the same positions (equal_nan=True).
np.allclose(gp, ds.call_genotype_probability.values, atol=.01, equal_nan=True)

True

In [93]:
# Equivalent quantization expressed as a numcodecs filter, for comparison with
# the xarray scale/offset encoding used in save().
codec = numcodecs.FixedScaleOffset(
    offset=0, 
    scale=254, # 254 = 2^B - 2 with B=8: encode computes round((x - offset) * scale), so [0, 1] maps onto codes 0..254
    dtype='f4', # Decoded data type (float32 here; it doesn't have to be 16-bit)
    astype='uint8' # Encoded data type
)

In [94]:
# Round-trip the original probabilities through the numcodecs codec.
x = ds.call_genotype_probability.values.astype('float32')
# NOTE(review): x still contains NaNs, and casting NaN to uint8 is not well
# defined; the comparison further down assumes they come back as 0 -- confirm
# on the target platform.
y = codec.encode(x)
# Pre-allocated output buffer; decode() writes the decoded values into it.
z = np.zeros(x.shape, dtype='float32')
codec.decode(y, z);

In [95]:
# Compare shapes of the input (x), the encoded array (y) and the decode buffer (z).
x.shape, y.shape, z.shape

((1000, 100, 3), (300000,), (1000, 100, 3))

In [96]:
# Is rounding the same for both? Compare the numcodecs round trip (z) with the
# xarray-decoded values (gp). Missing entries in gp are NaN, so they are mapped
# to 0 before comparing.
np.allclose(z, np.nan_to_num(gp, nan=0))

True