### Linear Regression Extension

Comparison of the single-outcome regression function (`regression_single`) with the multi-outcome regression function (`regression`).

In [6]:
import dask.array as da
import numpy as np
from dask.array import stats
from scipy.sparse import csc_matrix

In [151]:
# Toy dataset: two 3x2 integer variables that share the dimensions (d1, d2).
# NOTE(review): `xr` (presumably xarray) is not imported in any visible cell — confirm.
ds = xr.Dataset({
    "x": xr.DataArray([[1, 2], [3, 4], [5, 6]], dims=("d1", "d2")),
    "y": xr.DataArray([[1, 3], [4, 4], [7, 8]], dims=("d1", "d2")),
})
ds

In [186]:
# Small dense/sparse pair used by the following cells to try `ndarray @ sparse`.
x = np.array([[1, 2], [3, 4], [5, 6]])
# 2x2 sparse matrix in CSC format built from (data, (row_idx, col_idx)):
# y[0, 1] = 2 and y[1, 0] = 3; every other entry is an implicit zero.
y = csc_matrix((np.array([2, 3]), (np.array([0, 1]), np.array([1, 0]))))
y.toarray()

array([[0, 2],
       [3, 0]])

In [187]:
# Dense ndarray times the scipy sparse matrix defined above; the recorded
# output is a dense (3, 2) array.
x @ y

array([[ 6,  2],
       [12,  6],
       [18, 10]])

In [181]:
# NOTE(review): in the visible cells `y` is a scipy `csc_matrix`, which presumably has
# no `.mask` attribute. This cell's execution count (181) predates that definition (186),
# so the recorded output likely came from a different `y` — confirm or remove this cell.
y.mask

array([[ True, False],
       [False,  True]])

In [147]:
# `.T` is applied to a 1-D dask array before `atleast_2d`; the recorded result
# has shape (1, 3), i.e. a single row rather than a column.
da.atleast_2d(da.array([1, 2, 3]).T)

Unnamed: 0,Array,Chunk
Bytes,24 B,24 B
Shape,"(1, 3)","(1, 3)"
Count,3 Tasks,1 Chunks
Type,int64,numpy.ndarray
"Array Chunk Bytes 24 B 24 B Shape (1, 3) (1, 3) Count 3 Tasks 1 Chunks Type int64 numpy.ndarray",3  1,

Unnamed: 0,Array,Chunk
Bytes,24 B,24 B
Shape,"(1, 3)","(1, 3)"
Count,3 Tasks,1 Chunks
Type,int64,numpy.ndarray


In [7]:
# Fix the legacy global RNG so the simulated data are reproducible.
np.random.seed(0)
# Problem sizes: m loop covariates, n observations, c core covariates, y outcomes.
m, n, c, y = 20, 25, 2, 3

# Core covariates and their effects; BX has a single column, so the same
# core effect is broadcast across every outcome column.
X = np.random.normal(size=(n, c))
BX = np.random.normal(size=(c, 1))

# Loop covariates take values in {0, 1, 2}, stored as (m, n), with one
# effect per (loop covariate, outcome) pair.
G = np.random.choice([0, 1, 2], size=(m, n))
BG = np.random.normal(size=(m, y))

# Outcomes: linear signal plus low-variance Gaussian noise.
Y = X @ BX + G.T @ BG + np.random.normal(scale=.001, size=(n, y))
X.shape, G.shape, Y.shape

((25, 2), (20, 25), (25, 3))

In [118]:
# NOTE(review): this cell needs an `x` that supports `.map_blocks(...)` and `.values`
# (presumably a dask-backed xarray DataArray); the `x` defined in the visible cells is a
# NumPy array and `xr` is not imported in the visible cells — confirm the intended input.
n_variant = x.shape[0]
def bincount(x):
    """Count occurrences of each value per row as a (variants, alleles) DataArray.

    Each row of `x` is passed to `np.bincount` with `minlength=3`, so every
    row yields at least three counts. A zero-size `x` short-circuits to an
    uninitialized `(n_variant, 3)` array, where `n_variant` is read from the
    enclosing (global) scope rather than from `x`.
    """
    if x.size == 0:
        # Presumably hit when `map_blocks` probes the function with empty input — TODO confirm.
        return xr.DataArray(np.empty((n_variant, 3)), dims=("variants", "alleles"))
    # Assumes non-negative integer values no larger than 2, so every row has length 3 — TODO confirm.
    x = np.apply_along_axis(np.bincount, 1, x, minlength=3)
    return xr.DataArray(x, dims=("variants", "alleles"))
x.map_blocks(bincount).values

array([[1, 0, 0],
       [0, 0, 1],
       [0, 1, 0],
       [0, 0, 1],
       [1, 0, 0]])

In [61]:
def regression(G, X, Y):
    """Fit one linear model per (loop covariate, outcome) pair, adjusting for core covariates.

    The core covariates are removed from both the loop covariates and the
    outcomes by least-squares projection; every outcome residual is then
    regressed on every loop-covariate residual individually.

    Parameters
    ----------
    G : array-like, shape (n_obs, n_loop_covar)
        Loop covariates, one column per separately tested covariate.
    X : array-like, shape (n_obs, n_core_covar)
        Core covariates shared by every model.
    Y : array, shape (n_obs, n_outcome)
        Outcomes. Used as given (not coerced), so it must expose `ndim` and `shape`.

    Returns
    -------
    tuple
        `(B, T, P)`, each of shape (n_loop_covar, n_outcome): coefficient
        estimates, t-statistics and two-sided p-values.

    Raises
    ------
    ValueError
        If any argument is not two dimensional, or if the residual degrees
        of freedom (n_obs - n_core_covar - 1) are below one.
    """
    # `lstsq` needs dask inputs, so coerce the two design matrices
    G, X = da.asarray(G), da.asarray(X)
    ndims = [arr.ndim for arr in (G, X, Y)]
    if any(nd != 2 for nd in ndims):
        raise ValueError('All arguments must be two dimensional')

    n_core_covar = X.shape[1]
    n_loop_covar = G.shape[1]
    n_obs, n_outcome = Y.shape[0], Y.shape[1]
    dof = n_obs - n_core_covar - 1
    if dof < 1:
        raise ValueError(
            'Number of observations (N) too small to calculate sampling statistics. '
            'N must be greater than number of core covariates (C) plus one. '
            f'Arguments provided: N={n_obs}, C={n_core_covar}.'
        )

    # Residualize against the core covariates. Least squares yields the OLS
    # residuals directly, without forming a large projection matrix; the
    # original author also notes that dask's lstsq fails on NumPy arrays.
    core_fit_G = da.linalg.lstsq(X, G)[0]
    G_res = G - X @ core_fit_G
    assert G_res.shape == (n_obs, n_loop_covar)
    core_fit_Y = da.linalg.lstsq(X, Y)[0]
    Y_res = Y - X @ core_fit_Y
    assert Y_res.shape == (n_obs, n_outcome)

    # Slope of each outcome residual on each loop-covariate residual.
    # Original author's caveat: treating the projected residuals as 0-mean
    # (no extra variance terms) is only valid when an intercept is present.
    G_ss = (G_res ** 2).sum(axis=0, keepdims=True).T
    assert G_ss.shape == (n_loop_covar, 1)
    coef = (G_res.T @ Y_res) / G_ss
    assert coef.shape == (n_loop_covar, n_outcome)

    # Residuals for every (loop covariate, outcome) model, stacked as a
    # 3-D array, then reduced over observations to residual sums of squares
    resid = Y_res[:, None, :] - G_res[:, :, None] * coef[None, :, :]
    assert resid.shape == (n_obs, n_loop_covar, n_outcome)
    rss = (resid ** 2).sum(axis=0)
    assert rss.shape == (n_loop_covar, n_outcome)

    # t-statistic = estimate / standard error
    std_err = np.sqrt(rss / dof / G_ss)
    t_stat = coef / std_err
    assert t_stat.shape == (n_loop_covar, n_outcome)

    # Two-sided p-values. Per the original note, the t distribution is not
    # implemented in dask, so this result is coerced to NumPy while the
    # t-statistics stay a dask array.
    p_val = 2 * stats.distributions.t.sf(np.abs(t_stat), dof)
    assert p_val.shape == (n_loop_covar, n_outcome)
    return coef, t_stat, p_val

    
# `G` is stored as (m, n), so transpose it to put observations on the rows.
B, T, P = regression(G=G.T, X=X, Y=Y)
P.shape

(20, 3)

In [62]:
def regression_single(G, X, y):
    """Single-outcome counterpart of `regression`, used as a reference.

    Parameters
    ----------
    G : array-like, shape (n_obs, n_loop_covar)
        Loop covariates, one column per separately tested covariate.
    X : array-like, shape (n_obs, n_core_covar)
        Core covariates shared by every model.
    y : array, shape (n_obs,)
        A single outcome vector; used as given (not coerced).

    Returns
    -------
    tuple
        `(b, t_val, p_val)`, each with one entry per loop covariate:
        coefficient estimates, t-statistics and two-sided p-values.
    """
    G, X = da.asarray(G), da.asarray(X)

    # Residualize loop covariates and outcome against the core covariates.
    # Least squares yields the OLS residuals directly, without forming a large
    # projection matrix; the original author also notes that dask's lstsq
    # will not work with NumPy arrays.
    core_fit_G = da.linalg.lstsq(X, G)[0]
    G_res = G - X @ core_fit_G
    core_fit_y = da.linalg.lstsq(X, y)[0]
    y_res = y - X @ core_fit_y

    # Slope of the outcome residual on each loop-covariate residual.
    # Original author's caveat: treating the projected residuals as 0-mean
    # (no extra variance terms) is only valid when an intercept is present.
    G_ss = (G_res ** 2).sum(axis=0)
    slope = (G_res.T @ y_res) / G_ss

    # Per-covariate residual sum of squares, t-statistic and p-value
    resid_dof = y.shape[0] - X.shape[1] - 1
    residuals = y_res[:, None] - G_res * slope
    rss = (residuals ** 2).sum(axis=0)
    std_err = np.sqrt(rss / resid_dof / G_ss)
    t_stat = slope / std_err
    p_value = 2 * stats.distributions.t.sf(np.abs(t_stat), resid_dof)
    return slope, t_stat, p_value

# Fit each outcome column on its own and keep only the p-values, stacked as
# one column per outcome, for comparison with the multi-outcome result.
PS = np.stack(
    [
        p_val
        for _, _, p_val in (
            regression_single(G.T, X, Y[:, col]) for col in range(Y.shape[1])
        )
    ],
    axis=1,
)
PS.shape

(20, 3)

In [63]:
# The multi-outcome p-values must agree with the per-outcome reference fits.
np.testing.assert_allclose(actual=P, desired=PS)

In [64]:
# `T` comes back from `regression` as a dask array (per the note in that
# function); `np.asarray` materializes it so the t-statistics can be displayed.
np.asarray(T)

array([[-7.01245314, -5.39531762, -0.50627279],
       [-7.49986849, -4.76582726, -0.99916552],
       [-3.84843158, -4.94721653, -1.27098152],
       [-4.89860156, -3.74223951,  0.67205359],
       [-3.04283058, -5.61583047, -1.71197538],
       [-8.28514785, -6.1363946 , -1.34811411],
       [-3.69259499, -3.09104025, -2.1676783 ],
       [-4.00525921, -5.03075027,  0.56599415],
       [-2.54091026, -7.64289292, -0.07601142],
       [-3.24016026, -5.42272435, -2.38751541],
       [-4.24798628, -5.51651349, -1.96740872],
       [-4.19257039, -6.17302875, -2.90175349],
       [-3.1692477 , -4.47815777, -0.87970506],
       [-2.93411117, -5.65576694, -2.79789092],
       [-6.22892409, -4.59339143, -1.63120909],
       [-4.97425675, -3.67790575, -1.74215976],
       [-5.58661166, -5.37640233, -1.17616024],
       [-5.86609168, -6.67038058, -0.7463263 ],
       [-4.65452102, -6.12184637, -0.73481925],
       [-3.94112122, -5.0070558 , -1.1259018 ]])

In [65]:
# Two-sided p-values from the multi-outcome `regression` call: one row per
# loop covariate, one column per outcome.
P

array([[4.89092548e-07, 2.03402793e-05, 6.17701188e-01],
       [1.69468215e-07, 9.30244563e-05, 3.28578260e-01],
       [8.72233294e-04, 5.98718681e-05, 2.17010409e-01],
       [6.73660878e-05, 1.12857068e-03, 5.08546564e-01],
       [5.97033146e-03, 1.20292647e-05, 1.00963878e-01],
       [3.28448880e-08, 3.54954972e-06, 1.91339486e-01],
       [1.27266768e-03, 5.33567105e-03, 4.12773099e-02],
       [5.95461247e-04, 4.89052902e-05, 5.77126049e-01],
       [1.86214137e-02, 1.24918865e-07, 9.40097020e-01],
       [3.75853993e-03, 1.90502654e-05, 2.59836305e-02],
       [3.29246062e-04, 1.52314337e-05, 6.18710311e-02],
       [3.76979714e-04, 3.26107258e-06, 8.27203719e-03],
       [4.44218984e-03, 1.87617579e-04, 3.88527525e-01],
       [7.67892594e-03, 1.09430424e-05, 1.04856567e-02],
       [2.86631380e-06, 1.41611966e-04, 1.17080208e-01],
       [5.60737842e-05, 1.31867952e-03, 9.54454614e-02],
       [1.28929573e-05, 2.12820147e-05, 2.52101594e-01],
       [6.66555303e-06, 1.04837