Here I test the RAISS model function between Python code from https://gitlab.pasteur.fr/statistical-genetics/raiss/-/blob/master/raiss/stat_models.py and the ChatGPT 4 generated R codes.

R code:

In [26]:
raiss_model <- function(zt, sig_t, sig_i_t, lamb=0.01, rcond=0.01, batch=TRUE, report_condition_number=T) {
  # Translated content from Python function
  sig_t_inv <- invert_sig_t(sig_t, lamb, rcond)
  if (is.null(sig_t_inv)) {
    return(NULL)
  }

  if (batch) {
    
    condition_number <- if(report_condition_number) rep(kappa(sig_t, exact=T, norm="2"), nrow(sig_i_t)) else NA
    correct_inversion <- rep(check_inversion(sig_t, sig_t_inv), nrow(sig_i_t))
  } else {
    condition_number <- if(report_condition_number) kappa(sig_t, exact=T, norm="2") else NA
    correct_inversion <- check_inversion(sig_t, sig_t_inv)
  }


  var_ld_score <- compute_var(sig_i_t, sig_t_inv, lamb, batch)
  var <- var_ld_score$var
  ld_score <- var_ld_score$ld_score

  mu <- compute_mu(sig_i_t, sig_t_inv, zt)
  var_norm <- var_in_boundaries(var, lamb)

  R2 <- ((1 + lamb) - var_norm)
  mu <- mu / sqrt(R2)

  return(list(var=var_norm, mu=mu, ld_score=ld_score, condition_number=condition_number, correct_inversion=correct_inversion))
}

compute_mu <- function(sig_i_t, sig_t_inv, zt) {
  return(sig_i_t %*% (sig_t_inv %*% zt))
}

compute_var <- function(sig_i_t, sig_t_inv, lamb, batch=TRUE) {
  if (batch) {
    var <- (1 + lamb) - rowSums((sig_i_t %*% sig_t_inv) * sig_i_t)
    ld_score <- rowSums(sig_i_t^2)
  } else {
    var <- (1 + lamb) - (sig_i_t %*% (sig_t_inv %*% t(sig_i_t)))
    ld_score <- sum(sig_i_t^2)
  }
  return(list(var=var, ld_score=ld_score))
}

check_inversion <- function(sig_t, sig_t_inv) {
  return(all.equal(sig_t, sig_t %*% (sig_t_inv %*% sig_t), tolerance=1e-5))
}

var_in_boundaries <- function(var, lamb) {
  var[var < 0] <- 0
  var[var > (0.99999 + lamb)] <- 1
  return(var)
}

invert_sig_t <- function(sig_t, lamb, rcond) {
  diag(sig_t) <- 1 + lamb
  sig_t_inv <- MASS::ginv(sig_t, tol=rcond)
  return(sig_t_inv)
}

Python code:

In [27]:
"""
This module contain the statistical library for imputation.

Notation style of matrices subset and vectors are based on the publication:

Bogdan Pasaniuc, Noah Zaitlen, Huwenbo Shi, Gaurav Bhatia, Alexander Gusev,
Joseph Pickrell, Joel Hirschhorn, David P. Strachan, Nick Patterson,
Alkes L. Price;
Fast and accurate imputation of summary statistics enhances evidence
of functional enrichment, Bioinformatics, Volume 30, Issue 20, 15 October 2014,
Pages 2906–2914

"""

import numpy as np
import scipy as sc
import scipy.linalg

def compute_mu(sig_i_t, sig_t_inv, zt):
    """
    Compute the estimation of z-score from neighborring snp

    Args:
        sig_i_t (matrix?) : correlation matrix with line corresponding to
        unknown Snp (snp to impute) and column to known SNPs
        sig_t_inv (np.ndarray): inverse of the correlation matrix of known
        matrix
        zt (np.array?): Zscores of known snp
    Returns:
        mu_i (np.array): a vector of length i containing the estimate of zscore

    """
    return np.dot(sig_i_t, np.dot(sig_t_inv, zt))

def compute_var(sig_i_t, sig_t_inv, lamb, batch=True):
    """
    Compute the expected variance of the imputed SNPs
    Args:
        sig_i_t (matrix?) : correlation matrix with line corresponding to
        unknown Snp (snp to impute) and column to known SNPs
        sig_t_inv (np.ndarray): inverse of the correlation matrix of known
        matrix
        lamb (float): regularization term added to matrix

    """

    if batch:
        var = (1 + lamb) - np.einsum('ij,jk,ki->i', sig_i_t, sig_t_inv ,sig_i_t.transpose())
        ld_score = (sig_i_t**2).sum(1)
    else:
        var = (1 + lamb) - np.dot(sig_i_t, np.dot(sig_t_inv, sig_i_t.transpose()))
        ld_score = (sig_i_t**2).sum()
    return var, ld_score

def check_inversion(sig_t, sig_t_inv):
    return np.allclose(sig_t, np.dot(sig_t, np.dot(sig_t_inv, sig_t)))

def var_in_boundaries(var,lamb):
    """
    Forces the variance to be in the 0 to 1+lambda boundary
    theoritically we shouldn't have to do that
    """
    id_neg = np.where(var < 0)
    var_norm = var
    var[id_neg] = 0
    id_inf = np.where(var > (0.99999+lamb))
    var[id_inf] = 1

    return var

def invert_sig_t(sig_t, lamb, rcond):
    try:
        np.fill_diagonal(sig_t, (1+lamb))
        sig_t_inv = scipy.linalg.pinv(sig_t, rcond=rcond)
        return(sig_t_inv)
    except np.linalg.LinAlgError:
        invert_sig_t(sig_t, lamb*1.1, rcond*1.1)

def raiss_model(zt, sig_t, sig_i_t, lamb=0.01, rcond=0.01,  batch=True):
    """
    Compute the variance
    Args:
        zt (np.array): the vector of known Z scores
        sig_t (np.ndarray) : the matrix of known Linkage desiquilibrium
         correlation
        sig_i_t (np.ndarray): correlation matrix of known matrix
        lamb (float): regularization term added to the diagonal of the sig_t matrix
        rcond (float): threshold to filter eigenvector with a eigenvalue under rcond
        make inversion biased but much more numerically robust
    """
    sig_t_inv = invert_sig_t(sig_t, lamb, rcond)
    if sig_t_inv is None:
        return None
    else:
        if batch:
            condition_number = np.array([np.linalg.cond(sig_t)]*sig_i_t.shape[0])
            correct_inversion = np.array([check_inversion(sig_t, sig_t_inv)]*sig_i_t.shape[0])
        else:
            condition_number = np.linalg.cond(sig_t)
            correct_inversion = check_inversion(sig_t, sig_t_inv)
        var, ld_score = compute_var(sig_i_t, sig_t_inv, lamb, batch)

        mu = compute_mu(sig_i_t, sig_t_inv, zt)
        var_norm = var_in_boundaries(var, lamb)

        R2 = ((1+lamb)-var_norm)

        mu = mu / np.sqrt(R2)
        return({"var" : var, "mu" : mu, "ld_score" : ld_score, "condition_number" : condition_number, "correct_inversion":correct_inversion })

Simulate some data,

In [28]:
import numpy as np
import pandas as pd

np.random.seed(0)  # Set seed for reproducibility

# Generate test data
n_known = 5   # Number of known SNPs
n_unknown = 3  # Number of unknown SNPs

# Known linkage disequilibrium (LD) matrix
sig_t = np.random.rand(n_known, n_known)
sig_t = np.dot(sig_t, sig_t.transpose())  # Make it symmetric and positive definite

# Correlation matrix with rows for unknown SNPs and columns for known SNPs
sig_i_t = np.random.rand(n_unknown, n_known)

# Vector of known z-scores
zt = np.random.rand(n_known)

# Save to CSV files
np.savetxt("sig_t.csv", sig_t, delimiter=",")
np.savetxt("sig_i_t.csv", sig_i_t, delimiter=",")
np.savetxt("zt.csv", zt, delimiter=",")


Analys with Python:

In [29]:
raiss_model(zt, sig_t, sig_i_t)

{'var': array([1.        , 1.        , 0.76142815]),
 'mu': array([1.2253736 , 0.89356343, 0.36750296]),
 'ld_score': array([1.76671864, 1.20097081, 2.49228169]),
 'condition_number': array([249.9064389, 249.9064389, 249.9064389]),
 'correct_inversion': array([False, False, False])}

Load data to R:

In [30]:
# Read the data
sig_t <- read.csv("sig_t.csv", header = FALSE, sep = ",")
sig_i_t <- read.csv("sig_i_t.csv", header = FALSE, sep = ",")
zt <- read.csv("zt.csv", header = FALSE, sep = ",")

# Convert to matrices/vectors
sig_t <- as.matrix(sig_t)
sig_i_t <- as.matrix(sig_i_t)
zt <- as.vector(t(zt))

In [31]:
raiss_model(zt,sig_t,sig_i_t)

0
1.2253736
0.8935634
0.367503


Condition number does not match. Not sure why because the following looks okay:

In [32]:
import numpy as np

# Example matrix
A = np.array([[1, 1], [0, 2]])

# Compute the condition number using SVD (default)
np.linalg.cond(A)


2.6180339887498953

In [33]:
A <- matrix(c(1, 1, 0, 2), nrow = 2, byrow = TRUE)
kappa(A, exact=T, norm="2")

But this is different:

In [34]:
np.linalg.cond(sig_t)

249.90643890226198

In [35]:
kappa(sig_t, exact=T, norm="2")

However, consider this quantity is not used any where in the rest of the code, I skip computing it in my implementation with an option