Calculating which variants are segregating using xarray, without first computing allele counts.

See: https://scikit-allel.readthedocs.io/en/stable/model/ndarray.html?highlight=is_segregating#allel.AlleleCountsArray.is_segregating

In [1]:
import xarray as xr
import numpy as np

# Toy genotype calls laid out as (variants, samples, ploidy):
# 8 variants x 2 samples x 2 allele calls per sample.
# Negative entries (-1) are the ones the segregation check below masks out
# as missing.
dims = ('variants', 'samples', 'ploidy')
calls = np.array([
    [[0, 0], [0, 0]],      # every call is allele 0
    [[0, 0], [0, 1]],      # alleles 0 and 1 present
    [[0, 2], [1, 1]],      # alleles 0, 1 and 2 present
    [[2, 2], [-1, -1]],    # second sample fully missing, rest allele 2
    [[2, -1], [2, -1]],    # half of each sample missing, rest allele 2
    [[2, -1], [1, -1]],    # half of each sample missing, alleles 2 and 1
    [[2, -1], [-1, -1]],   # a single non-missing call
    [[-1, -1], [-1, -1]],  # no non-missing calls at all
])
ds = xr.Dataset({'call_genotype': xr.DataArray(calls, dims=dims)})

# A variant is segregating when its non-missing calls are not all the same
# allele. Rather than counting alleles, compare each call to the mean of the
# non-missing calls for that variant: with a single allele present, every
# non-missing call equals that mean, so any mismatch signals segregation.
call_dims = ('samples', 'ploidy')

# cgo: the original calls; cgf: mask that is True where a call is >= 0.
staged = ds.assign(cgo=ds.call_genotype, cgf=ds.call_genotype >= 0)

# cgm: per-variant mean with the mask as weights, so masked-out calls
# contribute nothing to the average.
staged = staged.assign(
    cgm=staged.cgo.weighted(staged.cgf).mean(dim=call_dims)
)

# Only non-missing calls are allowed to count as a mismatch; a variant with
# no non-missing calls therefore comes out False.
differs_from_mean = staged.cgf & (staged.cgo != staged.cgm)
is_segregating = differs_from_mean.any(dim=call_dims)
is_segregating.values

array([False,  True,  True, False, False,  True, False, False])