# Description

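Loads the gene correlation matrix of one chromosome, checks whether it is positive definite and, if it is not, corrects it and saves the corrected matrix to a separate output folder.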

# Modules

In [1]:
library(reticulate)
pd <- import("pandas")

# Settings

In [2]:
# Reference panel; its lower-cased value is used as a path component when the
# input directory is built further down.
REFERENCE_PANEL <- "GTEX_V8"
# REFERENCE_PANEL = "1000G"

# Prediction models; the lower-cased value is also used as a path component.
## mashr
EQTL_MODEL <- "MASHR"

# ## elastic net
# EQTL_MODEL = "ELASTIC_NET"

# Chromosome to process. It stays NULL unless a parameter value is supplied;
# a later cell stops the notebook if it is still NULL.
chromosome <- NULL

In [3]:
# Parameters
chromosome <- 7
REFERENCE_PANEL <- "GTEX_V8"
EQTL_MODEL <- "MASHR"


In [4]:
# Echo the reference panel selected for this run.
paste("Using reference panel:", REFERENCE_PANEL)

In [5]:
# Echo the eQTL model selected for this run.
paste("Using eQTL model:", EQTL_MODEL)

In [6]:
# chromosome must be provided as parameter: its default in the settings is
# NULL, so stop here if no value was supplied.
stopifnot(!is.null(chromosome))

# Paths

In [7]:
# Root directory of the gene correlation files, read from the environment.
GENE_CORRS_DIR <- Sys.getenv("PHENOPLIER_PHENOMEXCAN_LD_BLOCKS_GENE_CORRS_DIR")
# Sys.getenv() returns "" for an unset variable; stop right away instead of
# building every later path from an empty root.
stopifnot(nzchar(GENE_CORRS_DIR))
IRdisplay::display(GENE_CORRS_DIR)

In [8]:
# Per-chromosome input folder: <root>/<reference panel>/<eQTL model>/by_chr,
# with the panel and model names lower-cased.
INPUT_DIR <- file.path(
  GENE_CORRS_DIR,
  tolower(REFERENCE_PANEL),
  tolower(EQTL_MODEL),
  "by_chr"
)
IRdisplay::display(INPUT_DIR)

In [9]:
# Pickle file with the gene correlations of the selected chromosome.
INPUT_FILE <- file.path(INPUT_DIR, sprintf("gene_corrs-chr%s.pkl", chromosome))
IRdisplay::display(INPUT_FILE)
# The input must already exist; there is nothing to correct otherwise.
stopifnot(file.exists(INPUT_FILE))

In [10]:
# Corrected matrices are written to a subfolder of the input directory.
OUTPUT_DIR <- file.path(INPUT_DIR, "corrected_positive_definite")
IRdisplay::display(OUTPUT_DIR)
# Warnings are silenced so that an already existing folder is not reported.
dir.create(OUTPUT_DIR, recursive = TRUE, showWarnings = FALSE)
# Silencing warnings also hides a failed creation; check now so the problem
# does not surface only when the result is saved at the end.
stopifnot(dir.exists(OUTPUT_DIR))

In [11]:
# Output pickle; it has the same file name as the input, inside OUTPUT_DIR.
OUTPUT_FILE <- file.path(OUTPUT_DIR, sprintf("gene_corrs-chr%s.pkl", chromosome))
IRdisplay::display(OUTPUT_FILE)

# Only a notice: an existing output is not removed here.
if (file.exists(OUTPUT_FILE)) {
  IRdisplay::display("Output file exists, it will be overwritten")
}

# Functions

In [12]:
# Correct a pseudo-correlation matrix by flooring its eigenvalues.
#
# Adapted from
# https://www.r-bloggers.com/2013/08/correcting-a-pseudo-correlation-matrix-to-be-positive-semidefinite/
#
# The matrix is decomposed with eigen(), every eigenvalue smaller than `p` is
# replaced by `p`, the matrix is rebuilt from the adjusted eigenvalues, and the
# result is rescaled so that its diagonal is 1 again.
#
# Args:
#   CM: square numeric matrix, or a data frame that as.matrix() turns into one.
#   p:  lower bound applied to the eigenvalues. With the default of 0, zero
#       eigenvalues are kept, so use a positive value when a strictly positive
#       definite result is required.
#
# Returns:
#   A numeric matrix with the same dimensions as `CM`. It is built from the
#   eigen() output, so re-apply row/column names afterwards if needed.
CorrectCM <- function(CM, p = 0) {
  # Coerce once so data frames and matrices are handled the same way.
  CM <- as.matrix(CM)
  n <- ncol(CM)
  E <- eigen(CM)
  CM1 <- E$vectors %*% tcrossprod(diag(pmax(E$values, p), n), E$vectors)
  # Pass `n` explicitly: diag() applied to a single number builds an identity
  # matrix of that order instead of a 1 x 1 scaling matrix.
  Balance <- diag(1 / sqrt(diag(CM1)), n)
  CM2 <- Balance %*% CM1 %*% Balance
  return(CM2)
}

In [13]:
# Check whether all eigenvalues of `data` are strictly positive.
#
# A message describing the outcome is displayed as a side effect.
#
# Args:
#   data: square numeric matrix or data frame accepted by eigen().
#
# Returns:
#   TRUE when no eigenvalue is zero or negative, FALSE otherwise.
is_positive_definite <- function(data) {
  spectrum <- eigen(data)$values

  # Guard clause: a single zero or negative eigenvalue is enough to fail.
  if (any(spectrum <= 0)) {
    IRdisplay::display("We need to correct the data and make the matrix positive definite")
    return(FALSE)
  }

  IRdisplay::display("Matrix is already positive definite!")
  TRUE
}

# Load data

In [14]:
# Read the pickled gene correlations with pandas (imported through reticulate).
# NOTE(review): unpickling can execute code stored in the file; only load files
# produced by a trusted pipeline.
gene_corrs <- pd$read_pickle(INPUT_FILE)

In [15]:
# Number of rows and columns of the loaded correlations.
dim(gene_corrs)

In [16]:
# Preview the top-left corner; head() keeps only the first rows of the
# 10 x 10 selection.
head(gene_corrs[1:10, 1:10])

Unnamed: 0_level_0,ENSG00000197461,ENSG00000188191,ENSG00000105963,ENSG00000198517,ENSG00000002822,ENSG00000106263,ENSG00000136213,ENSG00000106003,ENSG00000146535,ENSG00000198286
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
ENSG00000197461,1.0,-0.029848843,-0.02182156,0.20879455,0.0009010742,-0.204422847,-0.03045959,-0.008917525,-0.043456995,-0.001200841
ENSG00000188191,-0.0298488429,1.0,0.07576802,0.01603634,-0.0185407859,-0.007383636,-0.10630895,0.036843596,-0.075719704,-0.045024394
ENSG00000105963,-0.0218215643,0.075768017,1.0,-0.02747716,-0.0208687365,0.123523055,0.04601975,0.050243888,0.005391165,0.042752751
ENSG00000198517,0.2087945457,0.016036336,-0.02747716,1.0,0.0297400722,-0.143564429,0.01133198,0.0612233,-0.026798648,-0.084529813
ENSG00000002822,0.0009010742,-0.018540786,-0.02086874,0.02974007,1.0,0.067276238,0.08154279,-0.015789123,-0.080906463,-0.006509731
ENSG00000106263,-0.2044228467,-0.007383636,0.12352306,-0.14356443,0.0672762383,1.0,0.1030807,-0.132459911,-0.076416367,0.032850805


# Check positive definiteness

In [17]:
# Also displays a message saying whether a correction is needed.
is_positive_definite(gene_corrs)

In [18]:
# Eigenvalues of the original matrix, kept for the checks in the next cells.
eigenvalues <- eigen(gene_corrs)$values

In [19]:
# Show how many eigenvalues are zero or negative, and their values.
nonpositive_eigenvalues <- eigenvalues[eigenvalues <= 0]
IRdisplay::display(length(nonpositive_eigenvalues))
IRdisplay::display(nonpositive_eigenvalues)

In [20]:
# Every eigenvalue is strictly positive, so there is nothing to correct: end
# the session here. The correction and save cells below do not run in that case.
# NOTE(review): confirm that writing no output file for matrices that are
# already positive definite is intended.
if (length(eigenvalues[eigenvalues <= 0]) == 0) { quit() }

# Make matrix positive definite if needed

In [21]:
# Floor the eigenvalues at 1 (the `p` argument) and rescale to a unit diagonal.
# NOTE(review): p = 1 raises every eigenvalue below 1, not only the zero or
# negative ones; confirm this threshold is intended.
gene_corrs_corrected <- CorrectCM(gene_corrs, 1)

In [22]:
# Copy the row names of the original matrix to the corrected one.
dimnames(gene_corrs_corrected)[[1]] <- rownames(gene_corrs)

In [23]:
# Copy the column names of the original matrix to the corrected one.
dimnames(gene_corrs_corrected)[[2]] <- colnames(gene_corrs)

In [24]:
# Convert the corrected matrix to a data frame; this is the object saved at
# the end of the notebook.
gene_corrs_corrected <- as.data.frame(gene_corrs_corrected)

In [25]:
# Number of rows and columns after the correction, to compare with the
# dimensions of the original shown earlier.
dim(gene_corrs_corrected)

# Check positive definiteness of corrected matrix

In [26]:
# Same check as for the original matrix, now on the corrected one.
is_positive_definite(gene_corrs_corrected)

In [27]:
# Eigenvalues of the corrected matrix; this overwrites the `eigenvalues`
# computed for the original one.
eigenvalues <- eigen(gene_corrs_corrected)$values

In [28]:
# Show how many eigenvalues of the corrected matrix are zero or negative,
# and their values.
nonpositive_eigenvalues <- eigenvalues[eigenvalues <= 0]
IRdisplay::display(length(nonpositive_eigenvalues))
IRdisplay::display(nonpositive_eigenvalues)

In [29]:
# Stop before saving if any zero or negative eigenvalue remains.
stopifnot(length(eigenvalues[eigenvalues <= 0]) == 0)

In [30]:
# Quick visual comparison of the two matrices: the same top-left corner of
# the original (first) and of the corrected matrix (second).
IRdisplay::display(head(gene_corrs[1:10, 1:10]))
IRdisplay::display(head(gene_corrs_corrected[1:10, 1:10]))

Unnamed: 0_level_0,ENSG00000197461,ENSG00000188191,ENSG00000105963,ENSG00000198517,ENSG00000002822,ENSG00000106263,ENSG00000136213,ENSG00000106003,ENSG00000146535,ENSG00000198286
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
ENSG00000197461,1.0,-0.029848843,-0.02182156,0.20879455,0.0009010742,-0.204422847,-0.03045959,-0.008917525,-0.043456995,-0.001200841
ENSG00000188191,-0.0298488429,1.0,0.07576802,0.01603634,-0.0185407859,-0.007383636,-0.10630895,0.036843596,-0.075719704,-0.045024394
ENSG00000105963,-0.0218215643,0.075768017,1.0,-0.02747716,-0.0208687365,0.123523055,0.04601975,0.050243888,0.005391165,0.042752751
ENSG00000198517,0.2087945457,0.016036336,-0.02747716,1.0,0.0297400722,-0.143564429,0.01133198,0.0612233,-0.026798648,-0.084529813
ENSG00000002822,0.0009010742,-0.018540786,-0.02086874,0.02974007,1.0,0.067276238,0.08154279,-0.015789123,-0.080906463,-0.006509731
ENSG00000106263,-0.2044228467,-0.007383636,0.12352306,-0.14356443,0.0672762383,1.0,0.1030807,-0.132459911,-0.076416367,0.032850805


Unnamed: 0_level_0,ENSG00000197461,ENSG00000188191,ENSG00000105963,ENSG00000198517,ENSG00000002822,ENSG00000106263,ENSG00000136213,ENSG00000106003,ENSG00000146535,ENSG00000198286
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
ENSG00000197461,1.0,0.012102013,-0.03454392,0.088940126,0.006413368,-0.094096187,-0.03403603,0.0055683,-0.005442539,0.007825555
ENSG00000188191,0.012102013,1.0,0.03334762,-0.004284231,-0.009380047,-0.004469148,-0.05600176,0.02135948,-0.027613595,0.010304937
ENSG00000105963,-0.03454392,0.033347616,1.0,-0.054786949,0.016138747,0.051164732,0.01241237,0.02915232,-0.015853479,0.012776099
ENSG00000198517,0.088940126,-0.004284231,-0.05478695,1.0,0.015380675,-0.053850669,0.01306831,0.03757618,-0.048669795,-0.028472289
ENSG00000002822,0.006413368,-0.009380047,0.01613875,0.015380675,1.0,0.058641989,0.02366272,-0.0113325,-0.034375518,-0.008273867
ENSG00000106263,-0.094096187,-0.004469148,0.05116473,-0.053850669,0.058641989,1.0,0.04533068,-0.05170435,-0.095446772,-0.01408419


Both matrices should "look" similar. We are not interested in perfectly accurate correlation values (they are already inaccurate).

# Save

In [31]:
# Write the corrected data frame to OUTPUT_FILE with reticulate's
# py_save_object().
py_save_object(gene_corrs_corrected, OUTPUT_FILE)