In [57]:
import numpy as np
import pandas as pd
from sklearn.linear_model import Ridge, RidgeCV

### Helper Functions

-----
Finding block boundaries within each contig (a block holds at most `size` elements and never spans two contigs):

In [33]:
def get_block_boundaries(x, size):
    """Return the start index of every block when `x` is cut into blocks.

    A new block starts wherever the value of `x` differs from the previous
    element (the start of a new run of equal values) and, inside a run, every
    `size` elements. Blocks therefore hold at most `size` elements and never
    span two runs.

    Parameters
    ----------
    x : array-like, 1-D
        Labels, one per element. Only adjacent equal values form a run.
    size : int
        Maximum number of elements per block; must be positive.

    Returns
    -------
    numpy.ndarray
        Increasing 1-D integer array of block start indices; empty when `x`
        is empty.

    Raises
    ------
    AssertionError
        If `x` is not 1-D or `size` is not positive.
    """
    x = np.asarray(x)
    assert x.ndim == 1
    assert size > 0
    if x.size == 0:
        # x[0] below would raise IndexError; no elements means no blocks.
        return np.empty(0, dtype=int)
    # Indices whose value differs from the previous element, i.e. run starts.
    # Index 0 is never reported because x[0] is prepended to itself.
    run_starts = np.argwhere(np.diff(x, prepend=x[0]))[:, 0]
    edges = np.concatenate(([0], run_starts, [x.size]))
    # Within each run [edges[i], edges[i+1]) start a new block every `size` elements.
    return np.concatenate([
        list(range(edges[i], edges[i + 1], size))
        for i in range(edges.size - 1)
    ])


def test_block_boundaries():
    """Check get_block_boundaries on small hand-written arrays.

    For several arrays and block sizes, verifies that the blocks implied by
    the returned start indices (a) are non-empty and hold at most `size`
    elements, (b) contain a single distinct value, i.e. never cross the
    boundary between two runs, and (c) concatenate back to the input.
    """
    def check(x, size):
        idx = get_block_boundaries(x, size)
        chunks = []
        for i in range(idx.size):
            # The last block runs to the end of the array (stop=None).
            start, stop = idx[i], idx[i+1] if i+1 < idx.size else None
            chunk = x[slice(start, stop)]
            assert 0 < len(chunk) <= size
            # A block must never mix values from two different runs.
            assert np.all(chunk == chunk[0])
            chunks.append(chunk)
        # Together the blocks must cover x exactly once, in order.
        np.testing.assert_equal(np.concatenate(chunks), x)

    arrays = [
        np.array([0]),
        np.array([0, 0]),
        np.array([0, 1]),
        np.array([0, 1, 1, 1]),
        np.array([0, 1, 1, 1, 1, 10]),
        np.array([0, 1, 1, 1, 2, 2, 3, 5]),
        np.array([0, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5])
    ]
    for x in arrays:
        for size in [1, 2, 3]:
            check(x, size)
        # A block size equal to the array length: only run changes split it.
        check(x, x.size)

# Run the checks now; a failed check raises AssertionError.
test_block_boundaries()

In [22]:
# Example: three runs ([0], [1], [3, 3]), each short enough to fit in a single block of size 2.
get_block_boundaries(np.array([0, 1, 3, 3]), 2)

array([0, 1, 2])

### Implementation

In [54]:
# Simulated problem size: m variants, n samples, c covariates, y outcomes.
m = 20
n = 25
c = 2
y = 3

# Variants are assigned to contigs in consecutive groups of six.
contigs = np.sort(np.floor_divide(np.arange(m), 6))

# NOTE: the order of the random draws below determines the simulated values.
np.random.seed(0)
X = np.random.normal(size=(n, c))              # covariates, one row per sample
BX = np.random.normal(size=(c, 1))             # covariate effects, broadcast across outcomes
G = np.random.choice([0, 1, 2], size=(m, n))   # variant matrix with entries in {0, 1, 2}
BG = np.random.normal(size=(m, y))             # variant effects, one column per outcome

# Outcomes = covariate signal + variant signal + small Gaussian noise.
Y = (
    np.matmul(X, BX)
    + np.matmul(G.T, BG)
    + np.random.normal(scale=.001, size=(n, y))
)
X.shape, G.shape, Y.shape, contigs.shape

((25, 2), (20, 25), (25, 3), (20,))

In [55]:
# Contig label assigned to each of the m variants.
contigs

array([0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3])

In [56]:
def _level_0(G, X, Y, contigs, alphas, block_size):
    """Fit one ridge regression per variant block and alpha; return in-sample predictions.

    Variants are split into blocks with get_block_boundaries(contigs,
    block_size). For every block, the block's columns of `G` are appended to
    the static covariates `X`, and a ridge model without intercept is fit to
    `Y` once per value in `alphas`.

    Parameters
    ----------
    G : array, (n_sample, n_variant)
        Variant matrix; blocks are taken along the second axis.
    X : array, (n_sample, n_covariate)
        Static covariates included in every block's design matrix.
    Y : array, (n_sample, n_outcome)
        Outcomes. Assumed 2-D; the stacking below relies on it.
    contigs : array, (n_variant,)
        Contig label for each column of `G`.
    alphas : 1-D array-like of float
        Ridge penalties to evaluate.
    block_size : int
        Maximum number of variants per block.

    Returns
    -------
    YB : numpy.ndarray, (n_outcome, n_block, n_alpha, n_sample)
        In-sample predictions for each outcome, block and alpha.
    bc : numpy.ndarray, (n_block,)
        Contig label of each block (label of the block's first variant).
    """
    alphas = np.asarray(alphas)
    blocks = get_block_boundaries(contigs, block_size)
    YB = []
    bc = []
    for i in range(blocks.size):
        # Extract variants for this block (the last block runs to the end)
        start, stop = blocks[i], blocks[i+1] if i+1 < blocks.size else None
        bc.append(contigs[start])
        GB = G[:, slice(start, stop)]
        # Concatenate to static covariates
        XB = np.concatenate((X, GB), axis=1)
        # Solve, predict, and stack to (n_outcome, n_alpha, n_sample).
        # `normalize` is deliberately not passed: it had no effect with
        # fit_intercept=False and was removed in scikit-learn 1.2.
        PB = np.stack([
            Ridge(alpha=alpha, fit_intercept=False, random_state=0, solver='svd')
            .fit(XB, Y).predict(XB).T
            for alpha in alphas
        ], axis=1)
        YB.append(PB)

    # (n_outcome, n_block, n_alpha, n_sample)
    YB = np.stack(YB, axis=1)
    bc = np.array(bc)
    return YB, bc

def _level_1(YB, Y, bc):
    assert YB.ndim == 4
    assert Y.ndim == 2
    assert bc.ndim == 1
    n_outcome, n_block, n_alpha, n_sample = YB.shape
    assert bc.size == n_block
    
    for i in range(n_outcome):
        for contig in np.unique(bc):
            train_bidx = np.argwhere(bc != contig)[:,0]
            pred_bidx = np.argwhere(bc == contig)[:,0]
            # Extract (n_block_subset, n_sample, n_alpha)
            X_train = YB[i, train_bidx]
            X_pred = YB[i, pred_bidx]
            Y_train = Y[i]
            print(X_train.shape, X_pred.shape, Y_train.shape)
    

def regenie(G, X, Y, contigs, alphas=(0.01, 0.25, 0.50, 0.75, 0.99), block_size=None):
    """Run the level-0 and level-1 steps on a variant matrix.

    Work in progress: runs _level_0 and _level_1 and prints the shape of the
    level-0 predictions; nothing is returned yet.

    Parameters
    ----------
    G : array, (n_sample, n_variant)
        Variant matrix.
    X : array, (n_sample, n_covariate)
        Covariates; a column of ones is prepended before fitting.
    Y : array, (n_sample, n_outcome)
        Outcomes.
    contigs : array-like, (n_variant,)
        Contig label for each column of `G`.
    alphas : 1-D array-like of float
        Ridge penalties passed to the level-0 step.
    block_size : int, optional
        Maximum number of variants per block; defaults to all variants, so
        blocks are split only at contig changes.

    Raises
    ------
    AssertionError
        If the inputs disagree on the number of samples or variants, or if
        `alphas` is not 1-D.
    """
    assert len(set(map(len, [G, X, Y]))) == 1
    alphas = np.asarray(alphas)
    assert alphas.ndim == 1
    contigs = np.asarray(contigs)
    n_sample, n_variant = Y.shape[0], G.shape[1]
    # One contig label per variant column; a mismatch would otherwise
    # silently mis-slice G in the level-0 step.
    assert contigs.shape == (n_variant,)
    # Prepend an intercept column to the covariates.
    X = np.concatenate((np.ones(shape=(n_sample, 1)), X), axis=1)

    if block_size is None:
        block_size = n_variant

    YB, bc = _level_0(G, X, Y, contigs, alphas=alphas, block_size=block_size)
    # Every contig must be represented by at least one block.
    assert len(np.unique(bc)) == len(np.unique(contigs))
    _level_1(YB, Y, bc)
    print(YB.shape)

# G was simulated as (m, n); regenie checks len(G) == len(X) == len(Y), so pass the transpose.
regenie(G.T, X, Y, contigs, block_size=8)

(3, 5, 25) (1, 5, 25) (3,)
(3, 5, 25) (1, 5, 25) (3,)
(3, 5, 25) (1, 5, 25) (3,)
(3, 5, 25) (1, 5, 25) (3,)
(3, 5, 25) (1, 5, 25) (3,)
(3, 5, 25) (1, 5, 25) (3,)
(3, 5, 25) (1, 5, 25) (3,)
(3, 5, 25) (1, 5, 25) (3,)
(3, 5, 25) (1, 5, 25) (3,)
(3, 5, 25) (1, 5, 25) (3,)
(3, 5, 25) (1, 5, 25) (3,)
(3, 5, 25) (1, 5, 25) (3,)
(3, 4, 5, 25)


In [52]:
# Scratch example: positions where the label changes, and the gaps between them.
x = np.array([0, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3, 4, 4, 4, 4])
blocks = np.flatnonzero(np.diff(x, prepend=x[0]))
diffs = np.diff(blocks, prepend=0)
# To inspect: blocks, diffs
def get_block_boundaries(x, size):
    """Return the start index of every block when `x` is cut into blocks.

    A new block starts wherever the value of `x` differs from the previous
    element (the start of a new run of equal values) and, inside a run, every
    `size` elements. Blocks therefore hold at most `size` elements and never
    span two runs.

    Parameters
    ----------
    x : array-like, 1-D
        Labels, one per element. Only adjacent equal values form a run.
    size : int
        Maximum number of elements per block; must be positive.

    Returns
    -------
    numpy.ndarray
        Increasing 1-D integer array of block start indices; empty when `x`
        is empty.

    Raises
    ------
    AssertionError
        If `x` is not 1-D or `size` is not positive.
    """
    x = np.asarray(x)
    assert x.ndim == 1
    assert size > 0
    if x.size == 0:
        # x[0] below would raise IndexError; no elements means no blocks.
        return np.empty(0, dtype=int)
    # Take column 0 instead of .squeeze(): with exactly one change point
    # squeeze() yields a 0-d array, which np.concatenate rejects.
    run_starts = np.argwhere(np.diff(x, prepend=x[0]))[:, 0]
    edges = np.concatenate(([0], run_starts, [x.size]))
    # Within each run [edges[i], edges[i+1]) start a new block every `size` elements.
    return np.concatenate([
        list(range(edges[i], edges[i + 1], size))
        for i in range(edges.size - 1)
    ])

# Block start indices for the example labels, with at most 2 elements per block.
idx = get_block_boundaries(x, 2)
idx

array([ 0,  1,  3,  5,  6,  8, 10, 11, 13])