# Prototype of core update in M&M ASH model
This is the core update for the VEM step of the M&M ASH model (version 2): a single-SNP calculation under the MASH model.

This notebook prototypes the code in the `libgaow` repo.

$\newcommand{\bs}[1]{\boldsymbol{#1}}$
$\DeclareMathOperator*{\diag}{diag}$
$\DeclareMathOperator*{\cov}{cov}$
$\DeclareMathOperator*{\rank}{rank}$
$\DeclareMathOperator*{\var}{var}$
$\DeclareMathOperator*{\tr}{tr}$
$\DeclareMathOperator*{\veco}{vec}$
$\DeclareMathOperator*{\uniform}{\mathcal{U}niform}$
$\DeclareMathOperator*{\argmin}{arg\ min}$
$\DeclareMathOperator*{\argmax}{arg\ max}$
$\DeclareMathOperator*{\N}{N}$
$\DeclareMathOperator*{\gm}{Gamma}$
$\DeclareMathOperator*{\dif}{d}$

In [1]:
# Read the serialized R object from disk and print its structure.
dat = readRDS('/home/gaow/Documents/GTExV8/Thyroid.Lung.FMO2.filled.rds')
str(dat)
# Make the list elements (X and Y, per the str() output) available as top-level variables.
attach(dat)

List of 2
 $ Y:'data.frame':	698 obs. of  2 variables:
  ..$ Thyroid: num [1:698] 0.163 0.436 -0.212 0.327 -0.698 ...
  ..$ Lung   : num [1:698] 0.77011 0.77799 -0.65361 0.00672 -0.36792 ...
 $ X: num [1:698, 1:7492] 1 0 0 0 0 1 1 0 1 1 ...
  ..- attr(*, "dimnames")=List of 2
  .. ..$ : chr [1:698] "GTEX-111CU" "GTEX-111FC" "GTEX-111VG" "GTEX-111YS" ...
  .. ..$ : chr [1:7492] "chr1_170185243_G_A_b38" "chr1_170185272_T_C_b38" "chr1_170185405_C_A_b38" "chr1_170185417_G_A_b38" ...


In [2]:
%get X Y --from R
import numpy as np

Loading required package: feather


## Data preview

In [3]:
X

array([[ 1.,  0.,  0., ...,  0.,  1.,  0.],
       [ 0.,  0.,  0., ...,  0.,  1.,  0.],
       [ 0.,  1.,  0., ...,  0.,  1.,  0.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  1.,  0.],
       [ 0.,  0.,  0., ...,  0.,  1.,  0.]])

In [4]:
# Convert the data frame to a plain ndarray. `DataFrame.as_matrix()` was removed
# in pandas 1.0; `.values` returns the same array on both old and new pandas.
Y = Y.values

In [5]:
Y

array([[ 0.16348104,  0.77010917],
       [ 0.43588995,  0.77798736],
       [-0.21237311, -0.65361193],
       ..., 
       [ 0.62036618, -0.0035004 ],
       [ 0.00279156, -0.05439095],
       [-0.14650835,  0.29935286]])

## Utility: mash model

In [6]:
%save /home/gaow/GIT/software/libgaow/py/src/model_mash.py -f
#!/usr/bin/env python3
__author__ = "Gao Wang"
__copyright__ = "Copyright 2016, Stephens lab"
__email__ = "gaow@uchicago.edu"
__license__ = "MIT"
__version__ = "0.1.0"

import numpy as np, scipy as sp
from scipy.stats import norm, multivariate_normal as mvnorm
from collections import OrderedDict

def inv_sympd(m):
    '''
    Inverse of a symmetric positive definite matrix via Cholesky (LAPACK potrf + potri)
    https://stackoverflow.com/questions/40703042/more-efficient-way-to-invert-a-matrix-knowing-it-is-symmetric-and-positive-semi

    m: square, symmetric positive definite matrix
    Returns the full (symmetric) inverse.
    Raises np.linalg.LinAlgError when the factorization or the inversion fails,
    e.g. because m is not positive definite.
    '''
    # Import the submodule explicitly: a bare `import scipy` does not guarantee
    # that `scipy.linalg` has been loaded.
    from scipy.linalg import lapack
    chol, info = lapack.dpotrf(m, lower=False, clean=False)
    if info != 0:
        raise np.linalg.LinAlgError(f"dpotrf failed (info={info}): matrix is not symmetric positive definite")
    inv_m, info = lapack.dpotri(chol)
    if info != 0:
        raise np.linalg.LinAlgError(f"dpotri failed (info={info}): matrix cannot be inverted")
    # lapack only returns the upper triangular part; mirror it to the lower one
    return np.triu(inv_m) + np.triu(inv_m, k=1).T

def get_svs(s, V):
    '''
    Scale V on both sides by the vector s, i.e. compute
    diag(s) @ V @ diag(s) without forming the diagonal matrices.
    '''
    scale = np.asarray(s)
    # scale row i by s[i] first ...
    row_scaled = scale[:, np.newaxis] * V
    # ... then column j by s[j]
    return row_scaled * scale

def safe_mvnorm_logpdf(val, cov):
    '''
    Zero-mean multivariate normal log density that tolerates a singular covariance.

    val: a single vector, or a matrix with one observation per row
    cov: covariance matrix
    Returns the log density: a scalar for a vector input, one value per row for a matrix.
    When scipy rejects `cov` as singular the distribution is treated as a point
    mass at zero: +inf for (numerically) zero observations, -inf otherwise.
    '''
    try:
        return mvnorm.logpdf(val, cov=cov)
    except np.linalg.LinAlgError:
        val = np.asarray(val)
        # Use the sum of absolute values: a signed sum would classify vectors
        # such as [-1, 0.5] as zero.
        at_origin = np.sum(np.abs(val), axis=-1) < 1E-6
        if val.ndim == 1:
            return np.inf if at_origin else -np.inf
        # one entry per observation (row), matching what mvnorm.logpdf returns
        return np.where(at_origin, np.inf, -np.inf)

class LikelihoodMASH:
    '''
    Likelihood computations for the MASH mixture model.

    `data` must provide B and S (J by R), V (R by R), U (mapping of P prior
    covariance matrices whose first entry is the null component), pi (P mixture
    weights) and an `is_common_cov()` method.
    Results are stored in `data.lik` and `data.log10bf`.
    '''
    def __init__(self, data):
        self.J = data.B.shape[0]
        self.R = data.B.shape[1]
        self.P = len(data.U)
        self.data = data
        self.data.lik = {'relative_likelihood' : None,
                         'lfactor': None,
                         'marginal_loglik': None,
                         'loglik': None,
                         'null_loglik': None,
                         'alt_loglik': None}
        self.debug = None

    def compute_log10bf(self):
        '''
        log10 Bayes factor (alternative vs null); requires alt_loglik and null_loglik
        '''
        self.data.log10bf = (self.data.lik['alt_loglik'] -  self.data.lik['null_loglik']) / np.log(10)

    def compute_relative_likelihood_matrix(self):
        '''
        Fill data.lik['relative_likelihood'] (J by P, each row scaled so that its
        largest entry is 1) and data.lik['lfactor'] (J by 1 row-wise log maxima)
        '''
        matrix_llik = self._calc_likelihood_matrix_comcov() if self.data.is_common_cov() \
                      else self._calc_likelihood_matrix()
        lfactors = np.amax(matrix_llik, axis = 1, keepdims = True)
        self.data.lik['relative_likelihood'] = np.exp(matrix_llik - lfactors)
        self.data.lik['lfactor'] = lfactors

    def _calc_likelihood_matrix(self):
        '''
        J by P log-likelihood matrix, using a separate covariance for each row of S
        '''
        loglik = np.zeros((self.J, self.P))
        for j in range(self.J):
            sigma_mat = get_svs(self.data.S[j,:], self.data.V)
            loglik[j,:] = np.array([safe_mvnorm_logpdf(self.data.B[j,:], sigma_mat + self.data.U[p]) for p in self.data.U])
        return loglik

    def _calc_likelihood_matrix_comcov(self):
        '''
        J by P log-likelihood matrix when every row of S is the same
        '''
        sigma_mat = get_svs(self.data.S[0,:], self.data.V)
        # each component yields J values; stack them as rows, then transpose to J by P
        loglik = [np.ravel(safe_mvnorm_logpdf(self.data.B, sigma_mat + self.data.U[p])) for p in self.data.U]
        return np.array(loglik).T

    def compute_loglik_from_matrix(self, options = ('all', 'alt', 'null')):
        '''
        data.lik.relative_likelihood first column is null, the rest are alt

        options: any of 'marginal' (or its alias 'all'), 'alt', 'null'.
        Every per-row result is a vector of length J.
        '''
        rel_lik = self.data.lik['relative_likelihood']
        # lfactor is stored as a J by 1 column; flatten it so that adding it to a
        # length-J vector does not broadcast to a J by J matrix
        lfactor = np.ravel(self.data.lik['lfactor'])
        log_s = np.sum(np.log(self.data.S), axis = 1)
        if 'marginal' in options or 'all' in options:
            self.data.lik['marginal_loglik'] = np.log(rel_lik @ self.data.pi) + lfactor - log_s
            self.data.lik['loglik'] = np.sum(self.data.lik['marginal_loglik'])
        if 'alt' in options:
            # renormalize the non-null weights so that they sum to one
            alt_pi = self.data.pi[1:] / (1 - self.data.pi[0])
            self.data.lik['alt_loglik'] = np.log(rel_lik[:,1:] @ alt_pi) + lfactor - log_s
        if 'null' in options:
            self.data.lik['null_loglik'] = np.log(rel_lik[:,0]) + lfactor - log_s

    @classmethod
    def apply(cls, data):
        '''
        Run the full likelihood pipeline on `data` and return the helper object
        '''
        obj = cls(data)
        obj.compute_relative_likelihood_matrix()
        obj.compute_loglik_from_matrix()
        obj.compute_log10bf()
        return obj

class PosteriorMASH:
    '''
    Posterior summaries under the MASH mixture prior.

    Results are written to `data` as R by J matrices: post_mean_mat (posterior
    mean), post_mean2_mat (posterior second moment), neg_prob_mat (probability
    that the effect is negative) and zero_prob_mat (probability that it is zero).
    '''
    def __init__(self, data):
        '''
        `data` is expected to provide
          B: J by R matrix
          S: J by R matrix
          V: R by R matrix
          U: mapping from component name to R by R prior covariance (P components)
        plus `lik['relative_likelihood']` (J by P) and `pi` (length P) before
        the posterior weights are computed.
        '''
        self.J = data.B.shape[0]
        self.R = data.B.shape[1]
        self.P = len(data.U)
        self.data = data
        self._reset_summaries()

    def _reset_summaries(self):
        '''
        Zero the R by J result matrices stored on `data`
        '''
        self.data.post_mean_mat = np.zeros((self.R, self.J))
        self.data.post_mean2_mat = np.zeros((self.R, self.J))
        self.data.neg_prob_mat = np.zeros((self.R, self.J))
        self.data.zero_prob_mat = np.zeros((self.R, self.J))

    def compute_posterior_weights(self):
        '''
        P by J matrix of posterior mixture weights; each column sums to one
        '''
        d = (self.data.pi * self.data.lik['relative_likelihood'])
        self.data.posterior_weights = (d.T / np.sum(d, axis = 1))

    def compute_posterior(self):
        '''
        Posterior summaries when each row of S has its own covariance
        '''
        for j in range(self.J):
            Vinv_mat = inv_sympd(get_svs(self.data.S[j,:], self.data.V))
            mu1_mat = np.zeros((self.R, self.P))
            mu2_mat = np.zeros((self.R, self.P))
            zero_mat = np.zeros((self.R, self.P))
            neg_mat = np.zeros((self.R, self.P))
            for p, name in enumerate(self.data.U.keys()):
                U1_mat = self.get_posterior_cov(Vinv_mat, self.data.U[name])
                mu1_mat[:,p] = self.get_posterior_mean_vec(self.data.B[j,:], Vinv_mat, U1_mat)
                sigma_vec = np.sqrt(np.diag(U1_mat))
                null_cond = (sigma_vec == 0)
                active = ~null_cond
                mu2_mat[:,p] = np.square(mu1_mat[:,p]) + np.diag(U1_mat)
                if active.any():
                    # P(effect < 0) under N(mu1, sigma^2)
                    neg_mat[active,p] = norm.cdf(0, loc=mu1_mat[active,p], scale=sigma_vec[active])
                zero_mat[null_cond,p] = 1.0
            self.data.post_mean_mat[:,j] = mu1_mat @ self.data.posterior_weights[:,j]
            self.data.post_mean2_mat[:,j] = mu2_mat @ self.data.posterior_weights[:,j]
            self.data.neg_prob_mat[:,j] = neg_mat @ self.data.posterior_weights[:,j]
            self.data.zero_prob_mat[:,j] = zero_mat @ self.data.posterior_weights[:,j]

    def compute_posterior_comcov(self):
        '''
        Posterior summaries when all rows of S are identical, so that the
        posterior covariance of each component is shared by every j
        '''
        # the results are accumulated over components, so start from zero
        self._reset_summaries()
        Vinv_mat = inv_sympd(get_svs(self.data.S[0,:], self.data.V))
        for p, name in enumerate(self.data.U.keys()):
            U1_mat = self.get_posterior_cov(Vinv_mat, self.data.U[name])
            # get_posterior_mean_mat returns J by R; the accumulators are R by J
            mu1_mat = self.get_posterior_mean_mat(self.data.B, Vinv_mat, U1_mat).T
            var_vec = np.diag(U1_mat)
            sigma_vec = np.sqrt(var_vec)
            null_cond = (sigma_vec == 0)
            active = ~null_cond
            neg_mat = np.zeros((self.R, self.J))
            zero_mat = np.zeros((self.R, self.J))
            if active.any():
                # P(effect < 0) under N(mu1, sigma^2); sigma is broadcast across the J columns
                neg_mat[active,:] = norm.cdf(0, loc=mu1_mat[active,:], scale=sigma_vec[active][:,np.newaxis])
            zero_mat[null_cond,:] = 1.0
            mu2_mat = np.square(mu1_mat) + var_vec[:,np.newaxis]
            weights = self.data.posterior_weights[p,:]
            self.data.post_mean_mat += weights * mu1_mat
            self.data.post_mean2_mat += weights * mu2_mat
            self.data.neg_prob_mat += weights * neg_mat
            self.data.zero_prob_mat += weights * zero_mat

    @staticmethod
    def get_posterior_mean_vec(B, V_inv, U):
        '''
        Posterior mean for one observation B (length R) given posterior covariance U
        '''
        return U @ (V_inv @ B)

    @staticmethod
    def get_posterior_mean_mat(B, V_inv, U):
        '''
        Posterior means for all rows of B (J by R) at once; returns J by R
        '''
        return B @ V_inv @ U

    @staticmethod
    def get_posterior_cov(V_inv, U):
        '''
        Posterior covariance U (V_inv U + I)^{-1}.
        V_inv @ U + I is not symmetric in general, so a general inverse is
        used here rather than a Cholesky based one.
        '''
        return U @ np.linalg.inv(V_inv @ U + np.identity(U.shape[0]))

    @classmethod
    def apply(cls, data):
        '''
        Compute posterior weights and summaries for `data`, picking the
        common-covariance path when `data.is_common_cov()` is true
        '''
        obj = cls(data)
        obj.compute_posterior_weights()
        if data.is_common_cov():
            obj.compute_posterior_comcov()
        else:
            obj.compute_posterior()

class PriorMASH:
    '''
    Prepares the set of prior covariance matrices stored in `data.U`
    '''
    def __init__(self, data):
        self.data = data
        self.R = data.B.shape[1]

    def expand_cov(self, use_pointmass = True):
        '''
        Replace data.U by its expansion over the grid: every matrix in data.U is
        multiplied by each squared grid value and stored under "<name>.<k>"
        (k counts grid values from 1). With use_pointmass an R by R zero
        matrix is placed first under the key 'null'.
        '''
        expanded = OrderedDict()
        if use_pointmass:
            expanded['null'] = np.zeros((self.R, self.R))
        scales = np.square(self.data.grid)
        for label, base_cov in self.data.U.items():
            for idx, scale in enumerate(scales, start=1):
                expanded[f"{label}.{idx}"] = base_cov * scale
        self.data.U = expanded

## Utility: regression data

In [7]:
%save /home/gaow/GIT/software/libgaow/py/src/regression_data.py -f
#!/usr/bin/env python3
__author__ = "Gao Wang"
__copyright__ = "Copyright 2016, Stephens lab"
__email__ = "gaow@uchicago.edu"
__license__ = "MIT"
__version__ = "0.1.0"

import numpy as np
from scipy.stats import linregress
from sklearn.linear_model import LinearRegression

from model_mash import PriorMASH, LikelihoodMASH, PosteriorMASH

class RegressionData:
    '''
    Container for regression input and per-column summary statistics.

    X: N by J matrix of predictors
    Y: N by R matrix of responses
    Z: optional covariates; Y is residualized on them before regressing on X
    B, S: J by R matrices of slopes and their standard errors. They are computed
          from X and Y on construction unless at least one of them is supplied.
    '''
    def __init__(self, X = None, Y = None, Z = None, B = None, S = None):
        self.X = X
        self.Y = Y
        self.Z = Z
        self.B = B
        self.S = S
        self.lik = None
        self.l10bf = None
        if (self.X is not None and self.Y is not None) and (self.B is None and self.S is None):
            self.get_summary_stats()

    def get_summary_stats(self):
        '''
        perform univariate regression of every column of Y on every column of X
        FIXME: it is slower than lapply + .lm.fit in R
        FIXME: this faster implementation is on my watch list:
        https://github.com/ajferraro/fastreg
        '''
        self.B = np.zeros((self.X.shape[1], self.Y.shape[1]))
        self.S = np.zeros((self.X.shape[1], self.Y.shape[1]))
        for r, y in enumerate(self.Y.T):
            # columns of the result are (slope, intercept, stderr)
            fit = self.univariate_simple_regression(self.X, y, Z = self.Z)
            self.B[:,r] = fit[:,0]
            self.S[:,r] = fit[:,2]

    def set_prior(self):
        pass

    def calc_likelihood(self):
        pass

    def calc_posterior(self):
        pass

    def calc_bf(self):
        pass

    @staticmethod
    def univariate_simple_regression(X, y, Z=None):
        '''
        Simple linear regression of y on each column of X separately.
        When Z is given, y is first replaced by its residuals from a linear fit on Z.
        Returns an array with one row per column of X and the columns
        (slope, intercept, stderr) taken from scipy.stats.linregress.
        '''
        if Z is not None:
            model = LinearRegression()
            model.fit(Z, y)
            y = y - model.predict(Z)
        return np.vstack([linregress(x, y) for x in X.T])[:,[0,1,4]]


class MASHData(RegressionData):
    '''
    Regression data extended with the quantities used by the MASH model:
    prior covariances U, matrix V, mixture weights pi, the scaling grid and the
    posterior summaries filled in by PosteriorMASH.
    '''
    def __init__(self, X = None, Y = None, Z = None, B = None, S = None):
        RegressionData.__init__(self, X, Y, Z, B, S)
        self.post_mean_mat = None
        self.post_mean2_mat = None
        self.neg_prob_mat = None
        self.zero_prob_mat = None
        self._is_common_cov = None
        self.V = None
        self.U = None
        self.pi = None
        self.posterior_weights = None
        self.grid = None

    def is_common_cov(self):
        '''
        True when every row of S equals the first row. The common-covariance
        code paths only ever read S[0,:], so this is the condition they need.
        The answer is cached after the first call; None is returned while S is unset.
        '''
        if self._is_common_cov is None and self.S is not None:
            self._is_common_cov = bool((self.S == self.S[0,:]).all())
        return self._is_common_cov

    def calc_posterior(self):
        PosteriorMASH.apply(self)

    def calc_likelihood(self):
        '''
        Compute the relative likelihood matrix, the alt / null log-likelihoods
        and the log10 Bayes factors, all stored on this object
        '''
        lik = LikelihoodMASH(self)
        lik.compute_relative_likelihood_matrix()
        lik.compute_loglik_from_matrix(options = ['alt', 'null'])
        lik.compute_log10bf()

## Main function calls

In [None]:
# Build the data object; with only X and Y given, B and S are computed on construction.
data = MASHData(X = X, Y = Y)
# A single prior covariance pattern for the two response columns.
data.U = {'identity': np.identity(2)}
# V enters the likelihood as diag(s) @ V @ diag(s) for each row s of S.
data.V = np.identity(2)
# One weight per expanded component: 'null' first, then 'identity' at each of the two grid values.
data.pi = np.array([0.9, 0.05, 0.05])
# Scaling grid; expand_cov multiplies every matrix in U by the squared grid values.
data.grid = [0.5, 1]
prior = PriorMASH(data)
prior.expand_cov()

In [None]:
# Fill data.lik['relative_likelihood'] and data.lik['lfactor'].
lik = LikelihoodMASH(data)
lik.compute_relative_likelihood_matrix()

In [None]:
# Alternative and null log-likelihoods, then their difference on the log10 scale (data.log10bf).
lik.compute_loglik_from_matrix(options = ['alt', 'null'])
lik.compute_log10bf()

In [None]:
import warnings
# Turn every warning into an exception before computing the posterior.
warnings.filterwarnings("error")
# Computes posterior weights and summaries and stores them on `data`.
PosteriorMASH.apply(data)

## VEM updates
The Core function:

In [None]:
def singe_snp_multivariate(data, Y):
    '''
    single snp bayes regression of Y on each column of X
    under MASH model
    Y is N by R matrix
    X is N by J matrix
    Assume residual variance is identity for now

    Returns a dict with
      alpha: Bayes factors normalized to sum to one
      mu:    posterior mean (data.post_mean_mat)
      s2:    posterior variance, i.e. second moment minus squared mean
    '''
    # NOTE(review): `reset` is not defined on MASHData in this notebook - presumably
    # it should replace Y and recompute the summary statistics; confirm before use.
    data.reset(Y=Y)
    lik = LikelihoodMASH(data)
    lik.compute_relative_likelihood_matrix()
    lik.compute_loglik_from_matrix(options = ['alt', 'null'])
    lik.compute_log10bf()
    PosteriorMASH.apply(data)
    # The weights must be proportional to the Bayes factors themselves, not to
    # their logarithms (which can be negative). Subtracting the maximum before
    # exponentiating avoids overflow and does not change the normalized result.
    log10bf = np.asarray(data.log10bf)
    bf = np.power(10.0, log10bf - np.max(log10bf))
    return {'alpha': bf / np.sum(bf), 'mu': data.post_mean_mat, 's2': data.post_mean2_mat - np.square(data.post_mean_mat)}

In [None]:
res = .lm.fit(cbind(1, X[,1:2]), Y[,1])
head(res$residuals)
head(Y[,1])

In [None]:
head(Y[,1] - cbind(1, X[,1:2]) %*% coef(res))

In [None]:
res = lm(Y[,1] ~ X[,1:10])
head(res$residuals)

In [None]:
head(Y[,1] - cbind(1, X[,1:10]) %*% coef(res))

In [None]:
coef(res)

In [None]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(X[:,0:2], Y[:,0])
res = Y[:,0] - model.predict(X[:,0:2])

In [None]:
res[:,(0,10)]

In [None]:
from scipy.stats import linregress
def univariate_simple_regression(X, y, Z=None):
    '''
    Regress y on every column of X, one column at a time.
    If Z is supplied, y is replaced by its residuals from a linear fit on Z first.
    The result has one row per column of X holding (slope, intercept, stderr).
    '''
    response = y
    if Z is not None:
        covariate_fit = LinearRegression()
        covariate_fit.fit(Z, response)
        response = response - covariate_fit.predict(Z)
    fits = []
    for column in X.T:
        fits.append(linregress(column, response))
    # linregress fields: 0 = slope, 1 = intercept, 4 = stderr
    return np.vstack(fits)[:, [0, 1, 4]]

In [None]:
res = univariate_simple_regression(X,Y[:,0])

In [None]:
res[:,2]

In [None]:
sebetahat[:,0]

In [None]:
betahat

In [None]:
data.B

In [None]:
data = MASHData(X = X, Y = Y)