In [2]:
import numpy as np
import pandas as pd
import dask.array as da
from sklearn.linear_model import Ridge, RidgeCV
%load_ext autoreload
%autoreload 2

### Helper Functions

-----
Finding block boundaries within contigs (blocks have a maximum size and never span a change of contig):

In [163]:
def get_block_boundaries(x, size):
    """Split a 1D label vector into blocks that never span a change in label.

    Every run of equal consecutive values in ``x`` is cut into blocks of at
    most ``size`` elements; the last block of a run may be shorter.

    Parameters
    ----------
    x : array-like, 1D
        Labels (e.g. contig of each variant). Any dtype supporting ``!=`` works,
        including strings. Runs are detected between *adjacent* elements only,
        so equal labels that are not contiguous form separate runs.
    size : int
        Maximum block length, must be > 0.

    Returns
    -------
    index : ndarray
        Start offset of each block within ``x``.
    sizes : ndarray
        Length of each block; ``sizes.sum() == x.size``.
        Both arrays are empty when ``x`` is empty.

    Raises
    ------
    AssertionError
        If ``x`` is not 1D or ``size`` is not positive.
    """
    x = np.asarray(x)
    assert x.ndim == 1
    assert size > 0
    # Offsets at which the value differs from its predecessor. Using `!=` on
    # shifted views (rather than np.diff) avoids arithmetic on the labels, so
    # string/object labels work and an empty input needs no `x[0]`.
    breaks = np.argwhere(x[1:] != x[:-1])[:, 0] + 1
    # Close the list of runs with the array start and end
    breaks = np.concatenate(([0], breaks, [x.size]))
    # Within each run, step forward `size` elements at a time
    index = np.concatenate([
        np.arange(breaks[i], breaks[i + 1], size)
        for i in range(breaks.size - 1)
    ])
    # Block length is the distance to the next block start (or to the end)
    sizes = np.diff(index, append=x.size)
    return index, sizes
    

def test_block_boundaries():
    """Check get_block_boundaries invariants on small hand-written arrays.

    For every array and block size, the blocks must tile the input exactly,
    stay within the size limit, and reassemble into the original array.
    """
    def check(x, size):
        starts, lengths = get_block_boundaries(x, size)
        # Blocks cover the whole array and the two outputs line up 1:1
        assert lengths.sum() == x.size
        assert starts.ndim == lengths.ndim == 1
        assert starts.size == lengths.size
        pieces = []
        for begin, length in zip(starts, lengths):
            piece = x[begin:begin + length]
            assert len(piece) <= size
            pieces.append(piece)
        # Gluing the blocks back together must reproduce the input
        np.testing.assert_equal(np.concatenate(pieces), x)

    cases = (
        np.array([0]),
        np.array([0, 0]),
        np.array([0, 1]),
        np.array([0, 1, 1, 1]),
        np.array([0, 1, 1, 1, 1, 10]),
        np.array([0, 1, 1, 1, 2, 2, 3, 5]),
        np.array([0, 1, 1, 2, 2, 2, 5, 5, 5, 5, 5, 8, 8, 8, 8, 10]),
    )
    for x in cases:
        # Small fixed sizes first, then a single block as large as the array
        for size in (1, 2, 3, x.size):
            check(x, size)

# Run the checks as part of the cell so a failing assertion surfaces immediately
test_block_boundaries()

In [165]:
# Worked example: runs of length 1, 2, 3, 4 and 1 cut into blocks of at most 2
get_block_boundaries(np.array([0, 1, 1, 5, 5, 5, 8, 8, 8, 8, 10]), 2)

(array([ 0,  1,  3,  5,  6,  8, 10]), array([1, 2, 2, 1, 2, 2, 1]))

-----

Ridge regression within blocks:

In [719]:
from sklearn.linear_model import Ridge

# TODO: Is there a way to solve for B in XtX @ B = XtY without inverse?
# - SVD is equivalent, but only when n >= p and alpha = 0
def ridge_regression_svd(X, Y, a):
    """Multi-outcome, multi-parameter ridge regression via SVD.

    A single thin SVD ``X = U @ diag(s) @ Vt`` is reused for every penalty:
    ``B(alpha) = V @ diag(s / (s**2 + alpha)) @ U.T @ Y``.

    Parameters
    ----------
    X : ndarray, shape (n, n_covariate)
        Design matrix.
    Y : ndarray, shape (n, n_outcome)
        Outcomes.
    a : array-like, shape (n_alpha,)
        Ridge penalties.

    Returns
    -------
    ndarray, shape (n_alpha, n_covariate, n_outcome)
        Coefficients for each penalty.

    Notes
    -----
    Singular values at or below 1e-15 are treated as zero and contribute
    nothing to the solution. Without this, a rank-deficient ``X`` (e.g. an
    all-zero column) combined with ``alpha = 0`` evaluates ``0 / 0`` and fills
    the result with nan.
    """
    U, s, Vt = np.linalg.svd(X, full_matrices=False)
    UtY = np.dot(U.T, Y)
    V = np.expand_dims(Vt.T, 0)
    s = np.expand_dims(s, 0)   # (1, k)
    a = np.expand_dims(a, -1)  # (n_alpha, 1)
    denom = s ** 2 + a         # (n_alpha, k)
    # Shrinkage factors; entries for negligible singular values stay at 0
    d = np.zeros(denom.shape, dtype=denom.dtype)
    np.divide(s, denom, out=d, where=s > 1e-15)
    d = np.expand_dims(d, -1)  # (n_alpha, k, 1)
    d_UtY = d * UtY
    # returns (n_alpha, n_covariate, n_outcome)
    return np.matmul(V, d_UtY)


def _test_data(n, p, y):
    np.random.seed(0)
    X = np.random.normal(size=(n, p))
    B = np.random.normal(size=(p, y))
    Y = X @ B 
    return X, B, Y


def test_ridge_projection():
    """Check that SVD ridge gives the same answer on (X, Y) and on (XtX, XtY).

    The two formulations only coincide without regularization (alpha = 0)
    and when n >= p, so only those cases are exercised.
    """
    def check(n, p, y, alphas):
        X, B, Y = _test_data(n, p, y)
        XtX, XtY = X.T @ X, X.T @ Y
        b1 = ridge_regression_svd(X, Y, alphas)
        b2 = ridge_regression_svd(XtX, XtY, alphas)
        assert b1.ndim == 3
        assert b1.shape == (alphas.size,) + B.shape
        # Compare the two results with each other (previously b2 was compared to itself)
        assert b1.shape == b2.shape
        np.testing.assert_allclose(b1, b2)

    # In order for SVD ridge to have the same solution for (X, Y)
    # as (XtX, XtY), alpha must be 0 and n >= p
    shapes = [
        (10, 1, 1),
        (10, 5, 3),
        (10, 10, 3),
        (1000, 50, 10),
    ]
    alphas = np.array([0])
    for shape in shapes:
        check(*shape, alphas)
        
def test_ridge_regression():
    """Validate ridge_regression_svd against the true coefficients and scikit-learn.

    For each (n, p, y) shape:
    - with alpha = 0 and n >= p the noiseless simulation must recover B exactly;
    - for every alpha > 0 the result must match ``Ridge(solver='svd')`` fit
      without an intercept.
    """
    def check(n, p, y, alphas):
        X, B, Y = _test_data(n, p, y)
        b = ridge_regression_svd(X, Y, alphas)
        assert b.ndim == 3
        assert b.shape == (alphas.size,) + B.shape
        if n >= p and alphas[0] == 0:
            # Check no regularization case only if (n >= p); this relies on
            # the first penalty being 0, which is now checked explicitly
            np.testing.assert_allclose(b[0], B)
        for i, a in enumerate(alphas):
            if a == 0:
                continue
            # `normalize` is no longer passed: scikit-learn removed the
            # parameter from Ridge and its default was already False
            est = Ridge(alpha=a, fit_intercept=False, solver='svd', random_state=0)
            est.fit(X, Y)
            np.testing.assert_allclose(est.coef_.T, b[i])

    shapes = [
        (1, 1, 1),
        (10, 1, 1),
        (1, 10, 1),
        (10, 5, 3),
        (5, 10, 3),
        (10, 10, 3),
        (1000, 50, 10),
        (50, 1000, 10)
    ]
    alphas = np.array([0., .001, .01, .1, 1, 10, 100, 1000])
    for shape in shapes:
        check(*shape, alphas)
        
# Run both SVD ridge checks as part of the cell; a failed assertion aborts it
test_ridge_regression()
test_ridge_projection()

In [738]:
from sklearn.linear_model import Ridge

def ridge_regression(XtX, XtY, alphas):
    """Multi-outcome, multi-parameter ridge regression from normal equations.

    Solves ``(XtX + alpha * I) @ B = XtY`` for every ``alpha`` at once.

    Parameters
    ----------
    XtX : ndarray, shape (p, p)
        Gram matrix ``X.T @ X``.
    XtY : ndarray, shape (p, n_outcome) or (p,)
        Cross-product ``X.T @ Y``.
    alphas : array-like, shape (n_alpha,)
        Ridge penalties.

    Returns
    -------
    ndarray, shape (n_alpha, p, n_outcome), or (n_alpha, p) for 1D ``XtY``.

    Raises
    ------
    AssertionError
        If ``XtX`` is not square or its size does not match ``XtY``.
    numpy.linalg.LinAlgError
        If a penalized system is exactly singular.
    """
    assert XtX.shape[0] == XtX.shape[1] == XtY.shape[0]
    alphas = np.asarray(alphas)
    # One penalized system per alpha: (n_alpha, p, p)
    systems = XtX + alphas[:, None, None] * np.eye(XtX.shape[0])
    vector_rhs = XtY.ndim == 1
    rhs = XtY[:, None] if vector_rhs else XtY
    # Give the right-hand side an explicit stack axis so that every NumPy
    # version interprets it as a matrix per system rather than as a stack
    # of vectors
    rhs = np.broadcast_to(rhs, systems.shape[:1] + rhs.shape)
    # Solve the systems directly instead of forming the explicit inverse
    B = np.linalg.solve(systems, rhs)
    return B[..., 0] if vector_rhs else B


def _test_data(n, p, y):
    np.random.seed(0)
    X = np.random.normal(size=(n, p))
    B = np.random.normal(size=(p, y))
    Y = X @ B 
    return X, B, Y
        
def test_ridge_regression():
    """Compare normal-equation ridge_regression against scikit-learn's Ridge.

    NOTE(review): this redefines (shadows) the ``test_ridge_regression``
    declared in the earlier SVD ridge cell of this notebook.

    Only alpha > 0 is compared, with a loose absolute tolerance of 1e-2.
    """
    def check(n, p, y, alphas):
        X, B, Y = _test_data(n, p, y)
        XtX, XtY = X.T @ X, X.T @ Y
        b = ridge_regression(XtX, XtY, alphas)
        assert b.ndim == 3
        assert b.shape == (alphas.size,) + B.shape
#         if n >= p:
#             # Check no regularization case only if (n >= p)
#             np.testing.assert_allclose(b[0], B)
        for i, a in enumerate(alphas):
            if a == 0:
                continue
            # `normalize` is no longer passed: scikit-learn removed the
            # parameter from Ridge and its default was already False
            est = Ridge(alpha=a, fit_intercept=False, solver='lsqr', random_state=0)
            est.fit(X, Y)
            np.testing.assert_allclose(est.coef_.T, b[i], atol=1e-2)

    # Shapes left commented out are currently excluded from the comparison
    shapes = [
#         (1, 1, 1),
#         (10, 1, 1),
#         (1, 10, 1),
        (10, 5, 3),
        #(5, 10, 3),
        (10, 9, 3),
        (1000, 50, 10),
        #(50, 1000, 10)
    ]
    alphas = np.array([.001, .01, .1, 1, 10, 100, 1000])
    for shape in shapes:
        check(*shape, alphas)
        
# Ridge using inv of XtX and XtY is not the same as sklearn?
# NOTE(review): the comparison in this cell only uses a loose atol=1e-2 and has
# several shapes commented out. Possibly the gap comes from the `lsqr` solver
# being iterative rather than exact -- TODO confirm by comparing against
# solver='svd' or solver='cholesky' with a tight tolerance.
test_ridge_regression()

------

R2 score

In [521]:
def r2_score(YP, Y):
    """Coefficient of determination computed along the last axis.

    ``YP`` (predictions) and ``Y`` (observations) are broadcast against each
    other; observations must be in the last dimension, and that dimension is
    reduced away in the result.

    Adapted from:
    https://github.com/projectglow/glow/blob/f3edf5bb8fe9c2d2e1a374d4402032ba5ce08e29/python/glow/wgr/linear_model/functions.py#L227

    Edge cases:
    - with a single observation the total sum of squares is zero, so the
      result is -inf (or nan if the prediction is exact);
    - nan anywhere along the last axis yields nan for that entry.
    """
    assert YP.shape[-1] == Y.shape[-1]
    predicted, observed = np.broadcast_arrays(YP, Y)
    deviation = observed - observed.mean(axis=-1, keepdims=True)
    residual = observed - predicted
    # Keep the reduced axis until the end so the ratio stays aligned
    ss_total = np.power(deviation, 2).sum(axis=-1, keepdims=True)
    ss_residual = np.power(residual, 2).sum(axis=-1, keepdims=True)
    return (1 - (ss_residual / ss_total))[..., 0]
    

def test_r2_score():
    """Check r2_score on exact predictions and on broadcast, noisy predictions."""
    n, p, y = 20, 5, 3
    np.random.seed(0)
    covariates = np.random.normal(size=(n, p))
    effects = np.random.normal(size=(p, y))
    # Outcomes laid out as (outcome, observation) so observations are last
    observed = (covariates @ effects).T
    noise = np.random.normal(size=(6, 8, y, n), scale=.1)
    predicted = observed + noise

    # Perfect predictions give a score of exactly 1
    np.testing.assert_allclose(r2_score(observed, observed), 1)

    # Near-perfect predictions carrying two extra leading loop dimensions
    actual = r2_score(predicted, observed)
    loop_shape = predicted.shape[:-1]
    assert actual.shape == loop_shape
    # Recompute every entry one vector at a time, in C order, to confirm the
    # reduction ran over the observation axis and the loop axes are intact
    expected = np.array([
        r2_score(predicted[i, j, k], observed[k])
        for i, j, k in np.ndindex(*loop_shape)
    ])
    np.testing.assert_allclose(actual.ravel(), expected)

# Run the R2 checks as part of the cell so a failing assertion surfaces immediately
test_r2_score()

### Implementation

In [93]:
from sgkit.stats import regenie

In [94]:
# Five entries labelled 1 followed by four labelled 2, split with block size 3
# using the packaged implementation (regenie.get_block_boundaries)
contigs = np.array([1, 1, 1, 1, 1, 2, 2, 2, 2])
variant_chunk_start, variant_chunk_size = regenie.get_block_boundaries(contigs, 3)
variant_chunk_start, variant_chunk_size

(array([0, 3, 5, 8]), array([3, 2, 3, 1]))

In [95]:
# Contig label found at the start offset of each block
contigs[variant_chunk_start]

array([1, 1, 2, 2])

In [125]:
# Simulated inputs for the regenie call in a later cell.
# Dimensions: m = rows of G (one per entry of `contigs`), n = observations
# (rows of X and Y), c = columns of X, y = columns of Y.
# NOTE(review): presumably m = variants, n = samples, c = covariates,
# y = outcomes -- confirm against sgkit.stats.regenie.
m, n, c, y = 20, 25, 2, 3
np.random.seed(0)
X = np.random.normal(size=(n, c))
# BX has a single column, so X @ BX is (n, 1) and broadcasts across all y columns of Y
BX = np.random.normal(size=(X.shape[1], 1)) 
# Integer matrix with entries drawn uniformly from {0, 1, 2}
G = np.random.choice([0, 1, 2], size=(m, n))
BG = np.random.normal(size=(m, y))
# Two consecutive rows of G per contig label; arange(m) // 2 is already
# non-decreasing, so np.sort leaves the order unchanged
contigs = np.sort(np.arange(m) // 2)
#contigs = np.ones(m, dtype=int)

# Linear model in X and G plus small Gaussian noise (scale .001)
Y = X @ BX + G.T @ BG + np.random.normal(size=(n, y), scale=.001)
X.shape, G.shape, Y.shape, contigs.shape

((25, 2), (20, 25), (25, 3), (20,))

In [126]:
# Inspect the contig labels: each label appears twice, in increasing order
contigs

array([0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9])

In [127]:
# Run the packaged regenie implementation on the simulated data.
# G is passed transposed, i.e. with shape (n, m).
# NOTE(review): the meaning of `alphas` (presumably ridge penalties) and of the
# four returned arrays is defined in sgkit.stats.regenie and is not visible
# here -- confirm there.
alphas = np.array([1000., 2500.])
res = regenie.regenie(
    G.T, X, Y, contigs, 
    variant_block_size=8, 
    sample_block_size=6, 
    normalize=False, 
    alphas=alphas
)
YP1, YP2, B2, L3 = res
# Display the third result (a lazy array; see the repr below)
B2

Unnamed: 0,Array,Chunk
Bytes,2.76 kB,552 B
Shape,"(115, 3)","(23, 3)"
Count,2546 Tasks,5 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 2.76 kB 552 B Shape (115, 3) (23, 3) Count 2546 Tasks 5 Chunks Type float64 numpy.ndarray",3  115,

Unnamed: 0,Array,Chunk
Bytes,2.76 kB,552 B
Shape,"(115, 3)","(23, 3)"
Count,2546 Tasks,5 Chunks
Type,float64,numpy.ndarray


In [128]:
# Display the lazy repr of the fourth regenie result (nothing is computed here)
L3

Unnamed: 0,Array,Chunk
Bytes,6.00 kB,144 B
Shape,"(10, 25, 3)","(1, 6, 3)"
Count,4481 Tasks,50 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 6.00 kB 144 B Shape (10, 25, 3) (1, 6, 3) Count 4481 Tasks 50 Chunks Type float64 numpy.ndarray",3  25  10,

Unnamed: 0,Array,Chunk
Bytes,6.00 kB,144 B
Shape,"(10, 25, 3)","(1, 6, 3)"
Count,4481 Tasks,50 Chunks
Type,float64,numpy.ndarray


In [129]:
# Materialize the fourth result and check its shape
L3.compute().shape

(10, 25, 3)

In [107]:
# Shape of the last slice along the first axis of L3
L3[-1].compute().shape

(25, 3)

In [90]:
# NOTE(review): the recorded output below, (3, 5, 3, 2), is not a possible
# slice of the (10, 25, 3) array shown in the earlier L3 cells; it appears to
# come from an earlier run (execution count 90) -- re-run to refresh.
L3[-2].compute().shape

(3, 5, 3, 2)

In [85]:
# Shape of the first regenie result once computed
YP1.compute().shape

(3, 2, 25, 3)

In [86]:
# Shape of the second regenie result once computed; the recorded (25, 3) matches Y's shape
YP2.compute().shape

(25, 3)

In [87]:
# NOTE(review): the recorded output below, (45, 3), disagrees with the
# (115, 3) shape in the B2 repr shown earlier; the two outputs come from
# different runs (execution counts 87 vs 127) -- re-run to refresh.
B2.compute().shape

(45, 3)

In [70]:
# Scratch check of reshape behaviour: build a 4D array in C order, then
# request Fortran memory layout for it (this rebinds the name `x`)
x = np.arange(120).reshape(2, 3, 4, 5)
x = np.asarray(x, order='F')
x.shape

(2, 3, 4, 5)

In [76]:
# Reference row taken by direct indexing
x[0, 1, 0]

array([20, 21, 22, 23, 24])

In [75]:
# Collapse the last two axes and expand them again; the recorded output matches
# the row obtained by direct indexing in the previous cell
x.reshape((2, 3, 20)).reshape((2, 3, 4, 5))[0, 1, 0]

array([20, 21, 22, 23, 24])