# Description

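Reads the gene correlation matrix of a single chromosome, checks whether it is positive definite, corrects it by raising its non-positive eigenvalues to a small positive value and rescaling the diagonal back to 1, and saves the corrected matrix.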

# Modules

In [1]:
library(reticulate)
pd <- import("pandas")

# Settings

In [2]:
# Reference panel: selects the <panel> component of the input directory
# (the name is lowercased when the path is built).
REFERENCE_PANEL <- "GTEX_V8"
# Alternative: REFERENCE_PANEL <- "1000G"

# Prediction (eQTL) model: selects the <model> component of the input directory.
## mashr
EQTL_MODEL <- "MASHR"

# ## elastic net
# Alternative: EQTL_MODEL <- "ELASTIC_NET"

# Chromosome to process. NULL is only a placeholder: a character value must be
# supplied as a parameter, otherwise the check further down stops the run.
chromosome <- NULL

In [3]:
# Echo the reference panel this run is configured for.
paste("Using reference panel:", REFERENCE_PANEL)

In [4]:
# Echo the eQTL prediction model this run is configured for.
paste("Using eQTL model:", EQTL_MODEL)

In [7]:
# chromosome must be provided as parameter: exactly one non-missing, non-empty
# string, because it is pasted verbatim into the input and output file names.
# is.character() alone would also accept character(0), NA, "" or a vector.
stopifnot(
  is.character(chromosome),
  length(chromosome) == 1L,
  !is.na(chromosome),
  nzchar(chromosome)
)

# Paths

In [8]:
# Root directory of the gene correlation files, taken from the environment.
GENE_CORRS_DIR <- Sys.getenv("PHENOPLIER_PHENOMEXCAN_LD_BLOCKS_GENE_CORRS_DIR")
# Sys.getenv() returns "" for an unset variable; stop here with a clear message
# instead of silently building paths under an empty root.
if (!nzchar(GENE_CORRS_DIR)) {
  stop(
    "Environment variable PHENOPLIER_PHENOMEXCAN_LD_BLOCKS_GENE_CORRS_DIR is not set",
    call. = FALSE
  )
}
IRdisplay::display(GENE_CORRS_DIR)

In [9]:
# Per-chromosome correlation files live under
# <root>/<reference panel>/<eQTL model>/by_chr, with both names lowercased.
INPUT_DIR <- file.path(
  GENE_CORRS_DIR,
  tolower(REFERENCE_PANEL),
  tolower(EQTL_MODEL),
  "by_chr"
)
IRdisplay::display(INPUT_DIR)

In [11]:
# Pickle holding this chromosome's gene correlation matrix; the run stops here
# if the file does not exist.
INPUT_FILE <- file.path(INPUT_DIR, sprintf("gene_corrs-chr%s.pkl", chromosome))
IRdisplay::display(INPUT_FILE)
stopifnot(file.exists(INPUT_FILE))

In [10]:
# Corrected matrices are written to a sub-directory of the input directory.
OUTPUT_DIR <- file.path(INPUT_DIR, "corrected_positive_definite")
IRdisplay::display(OUTPUT_DIR)
# showWarnings = FALSE keeps re-runs quiet when the directory already exists,
# but it also hides genuine failures (e.g. permissions), so the outcome is
# verified explicitly instead of surfacing later as a failed save.
dir.create(OUTPUT_DIR, recursive = TRUE, showWarnings = FALSE)
stopifnot(dir.exists(OUTPUT_DIR))

In [12]:
# Destination pickle; it uses the same file name as the input, inside OUTPUT_DIR.
OUTPUT_FILE <- file.path(OUTPUT_DIR, sprintf("gene_corrs-chr%s.pkl", chromosome))
IRdisplay::display(OUTPUT_FILE)
# Purely informational: nothing is deleted or written at this point.
if (file.exists(OUTPUT_FILE)) {
  IRdisplay::display("Output file exists, it will be overwritten")
}

# Functions

In [13]:
#' Correct a pseudo-correlation matrix by clamping its eigenvalues.
#'
#' Adapted from
#' https://www.r-bloggers.com/2013/08/correcting-a-pseudo-correlation-matrix-to-be-positive-semidefinite/
#'
#' The matrix is eigendecomposed, every eigenvalue below `p` is raised to `p`,
#' the matrix is rebuilt from the adjusted eigenvalues, and the result is
#' rescaled so that its diagonal is 1 again.
#'
#' @param CM Square numeric matrix, or an object `as.matrix()` converts to one
#'   (such as an all-numeric data frame).
#' @param p Lower bound applied to the eigenvalues before the matrix is
#'   rebuilt. Defaults to 0.
#' @return A numeric matrix with the same dimensions as `CM` and no dimnames.
CorrectCM <- function(CM, p = 0) {
  CM <- as.matrix(CM)
  # The size comes straight from the input; no need to compute var(CM) for it.
  n <- nrow(CM)
  stopifnot(is.numeric(CM), n == ncol(CM))

  E <- eigen(CM)
  # The size is always passed to diag(): with a single value and no size,
  # diag(x) treats x as a matrix dimension rather than as the diagonal entry,
  # which breaks 1 x 1 inputs.
  CM1 <- E$vectors %*% tcrossprod(diag(pmax(E$values, p), n), E$vectors)
  Balance <- diag(1 / sqrt(diag(CM1)), n)
  Balance %*% CM1 %*% Balance
}

In [34]:
# Check whether every eigenvalue of `data` is strictly positive.
#
# Displays a short status message through IRdisplay and returns TRUE when no
# eigenvalue is <= 0, FALSE otherwise.
is_positive_definite <- function(data) {
  spectrum <- eigen(data)$values
  offending <- spectrum[spectrum <= 0]

  if (length(offending) == 0) {
    IRdisplay::display("Matrix is already positive definite!")
    return(TRUE)
  }

  IRdisplay::display("We need to correct the data and make the matrix positive definite")
  FALSE
}

# Load data

In [17]:
# Load the correlation matrix from the pickle through pandas (`pd`, imported
# via reticulate).
# NOTE(review): unpickling can execute arbitrary code; only load files that
# come from a trusted source.
gene_corrs <- pd$read_pickle(INPUT_FILE)

In [18]:
# Number of rows and columns of the loaded matrix.
c(nrow(gene_corrs), ncol(gene_corrs))

In [19]:
# Preview the top-left corner of the matrix. The index ranges are capped at the
# actual dimensions so a matrix smaller than 10 x 10 does not trigger an
# out-of-range selection.
head(gene_corrs[
  seq_len(min(10L, nrow(gene_corrs))),
  seq_len(min(10L, ncol(gene_corrs))),
  drop = FALSE
])

Unnamed: 0_level_0,ENSG00000177663,ENSG00000131100,ENSG00000099968,ENSG00000015475,ENSG00000243156,ENSG00000215193,ENSG00000183785,ENSG00000184979,ENSG00000100033,ENSG00000100075
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
ENSG00000177663,1.0,-0.07717207,0.07890063,-0.03464399,-0.008794338,0.0359547,0.001726432,0.095401984,0.0391027,0.32156489
ENSG00000131100,-0.077172067,1.0,-0.25827894,0.15425866,-0.102483855,-0.03833009,-0.069450615,-0.077466787,-0.25148724,-0.36351196
ENSG00000099968,0.078900627,-0.25827894,1.0,-0.15236055,0.151567921,0.06397927,-0.028284257,0.094149609,0.02634172,0.23893616
ENSG00000015475,-0.034643993,0.15425866,-0.15236055,1.0,-0.075705611,-0.02813188,-0.160490888,0.014910651,-0.14498552,-0.06459698
ENSG00000243156,-0.008794338,-0.10248385,0.15156792,-0.07570561,1.0,-0.05797458,-0.07788508,0.010540094,0.03839897,0.01825553
ENSG00000215193,0.035954696,-0.03833009,0.06397927,-0.02813188,-0.05797458,1.0,0.007340717,-0.002354151,0.07588282,0.17708923


# Check positive definiteness

In [38]:
# Report whether the loaded matrix is positive definite; the helper displays a
# status message and returns TRUE/FALSE.
is_positive_definite(gene_corrs)

In [39]:
# Keep the eigenvalues of the loaded matrix for inspection in the next cell.
eigenvalues <- eigen(gene_corrs)[["values"]]

In [40]:
# Count and list the eigenvalues that violate positive definiteness (<= 0).
nonpositive_eigenvalues <- eigenvalues[which(eigenvalues <= 0)]
IRdisplay::display(length(nonpositive_eigenvalues))
IRdisplay::display(nonpositive_eigenvalues)

# Make matrix positive definite if needed

In [41]:
# Raise every eigenvalue below 1e-14 to that bound and rescale the diagonal.
# The correction is applied unconditionally: the result of the earlier
# positive-definiteness check is not consulted here.
gene_corrs_corrected <- CorrectCM(gene_corrs, p = 1e-14)

In [42]:
# CorrectCM returns a matrix without dimnames; restore the original row labels.
rownames(gene_corrs_corrected) <- rownames(gene_corrs)

In [43]:
# Restore the original column labels on the corrected matrix as well.
colnames(gene_corrs_corrected) <- colnames(gene_corrs)

In [44]:
# CorrectCM returns a plain matrix; convert it to a data frame before it is
# previewed and saved.
gene_corrs_corrected <- as.data.frame(gene_corrs_corrected)

In [45]:
# Number of rows and columns of the corrected matrix.
c(nrow(gene_corrs_corrected), ncol(gene_corrs_corrected))

# Check positive definiteness of corrected matrix

In [46]:
# Re-run the positive-definiteness check on the corrected matrix; the helper
# displays a status message and returns TRUE/FALSE.
is_positive_definite(gene_corrs_corrected)

In [47]:
# Keep the eigenvalues of the corrected matrix for inspection in the next cell.
eigenvalues <- eigen(gene_corrs_corrected)[["values"]]

In [48]:
# Count and list any eigenvalues of the corrected matrix that are still <= 0.
nonpositive_eigenvalues <- eigenvalues[which(eigenvalues <= 0)]
IRdisplay::display(length(nonpositive_eigenvalues))
IRdisplay::display(nonpositive_eigenvalues)

In [49]:
# quick and visual comparison of the two matrices (top-left corner of each).
# The index ranges are capped at the actual dimensions so matrices smaller
# than 10 x 10 do not trigger an out-of-range selection.
IRdisplay::display(head(gene_corrs[
  seq_len(min(10L, nrow(gene_corrs))),
  seq_len(min(10L, ncol(gene_corrs))),
  drop = FALSE
]))
IRdisplay::display(head(gene_corrs_corrected[
  seq_len(min(10L, nrow(gene_corrs_corrected))),
  seq_len(min(10L, ncol(gene_corrs_corrected))),
  drop = FALSE
]))

Unnamed: 0_level_0,ENSG00000177663,ENSG00000131100,ENSG00000099968,ENSG00000015475,ENSG00000243156,ENSG00000215193,ENSG00000183785,ENSG00000184979,ENSG00000100033,ENSG00000100075
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
ENSG00000177663,1.0,-0.07717207,0.07890063,-0.03464399,-0.008794338,0.0359547,0.001726432,0.095401984,0.0391027,0.32156489
ENSG00000131100,-0.077172067,1.0,-0.25827894,0.15425866,-0.102483855,-0.03833009,-0.069450615,-0.077466787,-0.25148724,-0.36351196
ENSG00000099968,0.078900627,-0.25827894,1.0,-0.15236055,0.151567921,0.06397927,-0.028284257,0.094149609,0.02634172,0.23893616
ENSG00000015475,-0.034643993,0.15425866,-0.15236055,1.0,-0.075705611,-0.02813188,-0.160490888,0.014910651,-0.14498552,-0.06459698
ENSG00000243156,-0.008794338,-0.10248385,0.15156792,-0.07570561,1.0,-0.05797458,-0.07788508,0.010540094,0.03839897,0.01825553
ENSG00000215193,0.035954696,-0.03833009,0.06397927,-0.02813188,-0.05797458,1.0,0.007340717,-0.002354151,0.07588282,0.17708923


Unnamed: 0_level_0,ENSG00000177663,ENSG00000131100,ENSG00000099968,ENSG00000015475,ENSG00000243156,ENSG00000215193,ENSG00000183785,ENSG00000184979,ENSG00000100033,ENSG00000100075
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
ENSG00000177663,1.0,-0.07960362,0.08104729,-0.03069212,-0.006166357,0.04145404,0.004236295,0.092786278,0.03758189,0.29453645
ENSG00000131100,-0.079603618,1.0,-0.25909894,0.15218979,-0.102135541,-0.03941496,-0.070705927,-0.075919537,-0.2474391,-0.34484725
ENSG00000099968,0.081047295,-0.25909894,1.0,-0.15089347,0.15189194,0.06525966,-0.027356017,0.093445488,0.02537699,0.2270727
ENSG00000015475,-0.030692124,0.15218979,-0.15089347,1.0,-0.074442656,-0.02540813,-0.15905442,0.013779881,-0.14546432,-0.07206088
ENSG00000243156,-0.006166357,-0.10213554,0.15189194,-0.07444266,1.0,-0.05544506,-0.077240338,0.009803215,0.03899841,0.01283286
ENSG00000215193,0.041454039,-0.03941496,0.06525966,-0.02540813,-0.055445058,1.0,0.008892585,-0.004056183,0.07554576,0.16017802


Both matrices look similar. Perfectly accurate correlation values are not required here, since the original values are themselves only approximate.

# Save

In [50]:
# Write the corrected data frame to OUTPUT_FILE; an existing file at that path
# is replaced.
py_save_object(object = gene_corrs_corrected, filename = OUTPUT_FILE)