In [1]:
# data.table supplies copy() and fwrite(); matrixStats supplies rowVars().
library(data.table)
library(matrixStats)
# Project-local helpers; presumably where calc_params_from_Z() comes from -- TODO confirm.
source("analysis.utils.r")
# Fixed seed so the sample() draw for the random CpG subset is repeatable on a fresh run.
set.seed(2023)

“package ‘MASS’ was built under R version 4.1.3”


In [2]:
# Folder (relative to the working directory) that holds the input .rds files
# and receives the per-version output sub-folders.
data.dir <- "../Data/Methylation/Purified-Reinius/"

# Shared set of CpGs

In [3]:
# Load the two unmodified input objects from data.dir.
Reinius <- readRDS(file = file.path(data.dir, "reinius.rds"))
Hannum  <- readRDS(file = file.path(data.dir, "hannum.rds"))

In [4]:
# Structure of Hannum exactly as loaded from hannum.rds, before any filtering.
str(Hannum)

List of 4
 $ X       : num [1:97792, 1:656] 0.464 0.911 0.132 0.718 0.687 ...
  ..- attr(*, "dimnames")=List of 2
  .. ..$ : chr [1:97792] "cg00000029" "cg00000109" "cg00000165" "cg00000236" ...
  .. ..$ : chr [1:656] "GSM989827" "GSM989828" "GSM989829" "GSM989830" ...
 $ cov     : chr [1:656, 1:4] "47" "69" "46" "44" ...
  ..- attr(*, "dimnames")=List of 2
  .. ..$ : chr [1:656] "GSM989827" "GSM989828" "GSM989829" "GSM989830" ...
  .. ..$ : chr [1:4] "age" "gender" "plate" "ethnicity"
 $ W       : num [1:656, 1:6] 0.771 0.855 0.793 0.828 0.727 ...
  ..- attr(*, "dimnames")=List of 2
  .. ..$ : chr [1:656] "GSM989827" "GSM989828" "GSM989829" "GSM989830" ...
  .. ..$ : chr [1:6] "Gran" "CD8T" "CD4T" "Mono" ...
 $ ctrl_pcs: num [1:656, 1:20] -62.9 -63.1 -64.7 -83.2 -64.4 ...
  ..- attr(*, "dimnames")=List of 2
  .. ..$ : chr [1:656] "GSM989827" "GSM989828" "GSM989829" "GSM989830" ...
  .. ..$ : chr [1:20] "PC1" "PC2" "PC3" "PC4" ...


In [5]:
# Structure of Reinius exactly as loaded from reinius.rds, before any filtering.
str(Reinius)

List of 4
 $ X  : num [1:459227, 1:6] 0.541 0.964 0.794 0.22 0.829 ...
  ..- attr(*, "dimnames")=List of 2
  .. ..$ : chr [1:459227] "cg00000029" "cg00000108" "cg00000109" "cg00000165" ...
  .. ..$ : chr [1:6] "sample.1" "sample.2" "sample.3" "sample.4" ...
 $ cov: chr [1:6, 1] "Male" "Male" "Male" "Male" ...
  ..- attr(*, "dimnames")=List of 2
  .. ..$ : chr [1:6] "GSM861635" "GSM861636" "GSM861637" "GSM861638" ...
  .. ..$ : chr "gender"
 $ W  : num [1:6, 1:6] 0.532 0.59 0.665 0.617 0.775 ...
  ..- attr(*, "dimnames")=List of 2
  .. ..$ : chr [1:6] "sample.1" "sample.2" "sample.3" "sample.4" ...
  .. ..$ : chr [1:6] "Gran" "CD4T" "CD8T" "Mono" ...
 $ Z  : num [1:6, 1:459227, 1:6] 0.45 0.654 0.701 0.472 0.615 ...
  ..- attr(*, "dimnames")=List of 3
  .. ..$ : chr [1:6] "Gran" "CD4T" "CD8T" "Mono" ...
  .. ..$ : chr [1:459227] "cg00000029" "cg00000108" "cg00000109" "cg00000165" ...
  .. ..$ : chr [1:6] "sample.1" "sample.2" "sample.3" "sample.4" ...


In [6]:
# Restrict both datasets to the CpGs they share, and put the source axis of
# Reinius in the same order as the columns of Hannum$W.
feature.ids = intersect(rownames(Hannum$X), rownames(Reinius$X))
source.ids  = colnames(Hannum$W)

# drop = FALSE keeps every object at its original rank even if a selection
# ever shrinks to a single row/column.
Reinius$X = Reinius$X [feature.ids, , drop = FALSE]
Reinius$W = Reinius$W [, source.ids, drop = FALSE]
Reinius$Z = Reinius$Z [source.ids, feature.ids, , drop = FALSE]
Reinius$params = calc_params_from_Z(Reinius$Z, max_sds = Inf) # only 6 samples, so don't remove any sample

#keep Caucasian samples only
# %in% treats a missing ethnicity as "no match" (== would put NA into the
# index), and indexing rownames() directly still returns ids when exactly
# one sample matches.
is.caucasian    = Hannum$cov[, "ethnicity"] %in% "Caucasian - European"
keep.samples    = rownames(Hannum$cov)[is.caucasian]
Hannum$X        = Hannum$X[feature.ids, keep.samples, drop = FALSE]
Hannum$cov      = Hannum$cov[keep.samples, , drop = FALSE]
Hannum$W        = Hannum$W[keep.samples, source.ids, drop = FALSE]
Hannum$ctrl_pcs = Hannum$ctrl_pcs[keep.samples, , drop = FALSE]

In [7]:
# Hannum after restricting to the shared CpGs and the "Caucasian - European" samples.
str(Hannum)

List of 4
 $ X       : num [1:93086, 1:426] 0.464 0.911 0.132 0.718 0.687 ...
  ..- attr(*, "dimnames")=List of 2
  .. ..$ : chr [1:93086] "cg00000029" "cg00000109" "cg00000165" "cg00000236" ...
  .. ..$ : chr [1:426] "GSM989827" "GSM989828" "GSM989829" "GSM989830" ...
 $ cov     : chr [1:426, 1:4] "47" "69" "46" "44" ...
  ..- attr(*, "dimnames")=List of 2
  .. ..$ : chr [1:426] "GSM989827" "GSM989828" "GSM989829" "GSM989830" ...
  .. ..$ : chr [1:4] "age" "gender" "plate" "ethnicity"
 $ W       : num [1:426, 1:6] 0.771 0.855 0.793 0.828 0.727 ...
  ..- attr(*, "dimnames")=List of 2
  .. ..$ : chr [1:426] "GSM989827" "GSM989828" "GSM989829" "GSM989830" ...
  .. ..$ : chr [1:6] "Gran" "CD8T" "CD4T" "Mono" ...
 $ ctrl_pcs: num [1:426, 1:20] -62.9 -63.1 -64.7 -83.2 -64.4 ...
  ..- attr(*, "dimnames")=List of 2
  .. ..$ : chr [1:426] "GSM989827" "GSM989828" "GSM989829" "GSM989830" ...
  .. ..$ : chr [1:20] "PC1" "PC2" "PC3" "PC4" ...


In [8]:
# Reinius after restricting to the shared CpGs, reordering sources to match
# Hannum$W, and attaching the params element.
str(Reinius)

List of 5
 $ X     : num [1:93086, 1:6] 0.541 0.794 0.22 0.829 0.575 ...
  ..- attr(*, "dimnames")=List of 2
  .. ..$ : chr [1:93086] "cg00000029" "cg00000109" "cg00000165" "cg00000236" ...
  .. ..$ : chr [1:6] "sample.1" "sample.2" "sample.3" "sample.4" ...
 $ cov   : chr [1:6, 1] "Male" "Male" "Male" "Male" ...
  ..- attr(*, "dimnames")=List of 2
  .. ..$ : chr [1:6] "GSM861635" "GSM861636" "GSM861637" "GSM861638" ...
  .. ..$ : chr "gender"
 $ W     : num [1:6, 1:6] 0.532 0.59 0.665 0.617 0.775 ...
  ..- attr(*, "dimnames")=List of 2
  .. ..$ : chr [1:6] "sample.1" "sample.2" "sample.3" "sample.4" ...
  .. ..$ : chr [1:6] "Gran" "CD8T" "CD4T" "Mono" ...
 $ Z     : num [1:6, 1:93086, 1:6] 0.45 0.701 0.654 0.472 0.668 ...
  ..- attr(*, "dimnames")=List of 3
  .. ..$ : chr [1:6] "Gran" "CD8T" "CD4T" "Mono" ...
  .. ..$ : chr [1:93086] "cg00000029" "cg00000109" "cg00000165" "cg00000236" ...
  .. ..$ : chr [1:6] "sample.1" "sample.2" "sample.3" "sample.4" ...
 $ params:List of 4
  ..$ si

# Highly variable features (HVF)

In [9]:
# Per-CpG variability score: for each source, take the row-wise variance of the
# CpG-by-sample slice of Reinius$Z, and accumulate those variances over sources.
source.ids = colnames(Reinius$W)
sum.var = Reduce(
    function(running.total, id) running.total + rowVars(Reinius$Z[id, , ]),
    source.ids,
    matrix(0, nrow = length(feature.ids), ncol = 1)  # same zero seed as a running sum
)
rownames(sum.var) = feature.ids

In [10]:
# Show only the ten CpGs with the largest summed variance instead of handing the
# whole ranked matrix to the display; FALSE (not F) because F can be reassigned.
sum.var[head(order(-sum.var[, 1]), n = 10), , drop = FALSE]

0,1
cg12426467,1.0580003
cg02113055,1.0098040
cg00079898,0.9078819
cg04131969,0.8785469
cg06193597,0.7919495
cg00944631,0.7776597
cg08052546,0.7335217
cg02872767,0.7316883
cg06064954,0.6985999
cg04245305,0.6861550


In [11]:
# Ids of the (up to) 10000 CpGs with the largest summed variance.
# head() truncates when fewer features exist, whereas [1:10000] would pad with NA.
feature.ids[head(order(-sum.var), n = 10000)]

# Subset to 10k CpGs (HVF and random) and save

In [12]:
# Build and save two CpG subsets of the shared feature set:
#   "hvf.10k"    - the CpGs with the largest values in sum.var
#   "random.10k" - CpGs drawn at random with sample()
# For each version the subsetted Hannum / Reinius lists are saved as .rds, and a
# combined X / W pair is written as tab-separated text for CIBERSORTx.
n.keep = 10000  # number of CpGs kept in every version

for (version in c("hvf.10k", "random.10k")){
    print(paste0("working on ", version))

    if (version == "hvf.10k"){
        # head() truncates instead of padding with NA if fewer than n.keep features exist
        keep.cpgs = feature.ids[head(order(-sum.var), n.keep)]
    }else if(version == "random.10k"){
        keep.cpgs = sample(feature.ids, n.keep)
    }else{
        # Fail fast: merely printing would let the code below run with a stale
        # (or undefined) keep.cpgs and save it under the wrong name.
        stop("wrong version: ", version)
    }

    #Reinius
    Reinius.sub   = copy(Reinius)
    Reinius.sub$X = Reinius.sub$X [keep.cpgs,]
    Reinius.sub$Z = Reinius.sub$Z [source.ids,keep.cpgs,]
    Reinius.sub$params = calc_params_from_Z(Reinius.sub$Z, max_sds = Inf) # only 6 samples, so don't remove any sample

    #Hannum
    Hannum.sub   = copy(Hannum)
    Hannum.sub$X = Hannum.sub$X[keep.cpgs, ]

    # saveRDS()/fwrite() do not create missing folders, so make sure the
    # per-version output folder exists before writing into it.
    out.dir = file.path(data.dir, version)
    dir.create(out.dir, recursive = TRUE, showWarnings = FALSE)

    #save subset of data
    Hannum.sub.file   = file.path(out.dir, paste0("hannum.", version, ".rds"))
    Reinius.sub.file  = file.path(out.dir, paste0("reinius.", version, ".rds"))

    saveRDS(Hannum.sub,   Hannum.sub.file)
    saveRDS(Reinius.sub,  Reinius.sub.file)

    #save cibersortx related data
    cibersortx.X.file = file.path(out.dir, paste0("cibersortx.", version, ".X.txt"))
    cibersortx.W.file = file.path(out.dir, paste0("cibersortx.", version, ".W.txt"))
    # scale by 10000 so that cibersortx will not consider this as a log-transformed
    # version of the expression count
    cibersortx.X = cbind(Reinius.sub$X, Hannum.sub$X) * 10000
    cibersortx.W = rbind(Reinius.sub$W, Hannum.sub$W)
    # as.data.frame() keeps the matrix row names so that row.names = TRUE can write them
    fwrite(as.data.frame(cibersortx.X),
                 file = cibersortx.X.file,
                 sep = "\t", quote=FALSE, row.names = TRUE, col.names = TRUE)

    fwrite(as.data.frame(cibersortx.W),
                 file = cibersortx.W.file,
                 sep = "\t", quote=FALSE, row.names = TRUE, col.names = TRUE)

}

[1] "working on hvf.10k"
[1] "working on random.10k"


In [15]:
# Hannum.sub is whatever the final loop iteration left behind, i.e. the
# "random.10k" subset; the "hvf.10k" version only exists on disk at this point.
str(Hannum.sub)

List of 4
 $ X       : num [1:10000, 1:426] 0.73 0.735 0.787 0.81 0.826 ...
  ..- attr(*, "dimnames")=List of 2
  .. ..$ : chr [1:10000] "cg03873392" "cg04605816" "cg14162906" "cg01688293" ...
  .. ..$ : chr [1:426] "GSM989827" "GSM989828" "GSM989829" "GSM989830" ...
 $ cov     : chr [1:426, 1:4] "47" "69" "46" "44" ...
  ..- attr(*, "dimnames")=List of 2
  .. ..$ : chr [1:426] "GSM989827" "GSM989828" "GSM989829" "GSM989830" ...
  .. ..$ : chr [1:4] "age" "gender" "plate" "ethnicity"
 $ W       : num [1:426, 1:6] 0.771 0.855 0.793 0.828 0.727 ...
  ..- attr(*, "dimnames")=List of 2
  .. ..$ : chr [1:426] "GSM989827" "GSM989828" "GSM989829" "GSM989830" ...
  .. ..$ : chr [1:6] "Gran" "CD8T" "CD4T" "Mono" ...
 $ ctrl_pcs: num [1:426, 1:20] -62.9 -63.1 -64.7 -83.2 -64.4 ...
  ..- attr(*, "dimnames")=List of 2
  .. ..$ : chr [1:426] "GSM989827" "GSM989828" "GSM989829" "GSM989830" ...
  .. ..$ : chr [1:20] "PC1" "PC2" "PC3" "PC4" ...


In [14]:
# Reinius.sub is whatever the final loop iteration left behind, i.e. the
# "random.10k" subset; the "hvf.10k" version only exists on disk at this point.
str(Reinius.sub)

List of 5
 $ X     : num [1:10000, 1:6] 0.78 0.646 0.766 0.815 0.792 ...
  ..- attr(*, "dimnames")=List of 2
  .. ..$ : chr [1:10000] "cg03873392" "cg04605816" "cg14162906" "cg01688293" ...
  .. ..$ : chr [1:6] "sample.1" "sample.2" "sample.3" "sample.4" ...
 $ cov   : chr [1:6, 1] "Male" "Male" "Male" "Male" ...
  ..- attr(*, "dimnames")=List of 2
  .. ..$ : chr [1:6] "GSM861635" "GSM861636" "GSM861637" "GSM861638" ...
  .. ..$ : chr "gender"
 $ W     : num [1:6, 1:6] 0.532 0.59 0.665 0.617 0.775 ...
  ..- attr(*, "dimnames")=List of 2
  .. ..$ : chr [1:6] "sample.1" "sample.2" "sample.3" "sample.4" ...
  .. ..$ : chr [1:6] "Gran" "CD8T" "CD4T" "Mono" ...
 $ Z     : num [1:6, 1:10000, 1:6] 0.792 0.674 0.738 0.793 0.682 ...
  ..- attr(*, "dimnames")=List of 3
  .. ..$ : chr [1:6] "Gran" "CD8T" "CD4T" "Mono" ...
  .. ..$ : chr [1:10000] "cg03873392" "cg04605816" "cg14162906" "cg01688293" ...
  .. ..$ : chr [1:6] "sample.1" "sample.2" "sample.3" "sample.4" ...
 $ params:List of 4
  ..$ s