In [1]:
# This notebook reads in the p-values from the EWAS run on all 4 datasets.
# It looks for the set of CpGs that simultaneously show a strong signal (p < sig.thr) in the Liu and the 2 Hannon datasets,
# then subsets these 3 datasets to include only those CpGs.

In [2]:
# Analysis configuration
pheno      = "age"                            # phenotype; also the column name of the joint p-value matrices
data.names = c("liu", "hannon1", "hannon2")   # datasets to compare
slot       = "TCA.parametric.full.joint"      # which entry of each dataset's p-value list to use
sig.thr    = 10^(-4)                          # p-value cutoff for calling a CpG significant
#source.ids = c("Gran", "CD4T", "CD8T", "Mono", "B", "NK")

# Load pvals, coeffs

In [3]:
# Directory holding the saved EWAS results (absolute, machine-specific path)
results_dir = "/u/project/halperin/johnsonc/TCAx/TCAx2023/Result/Methylation/Consistency/"
# Read the p-value list saved for the chosen phenotype ("<pheno>.pval.list")
pval.list = readRDS(file = file.path(results_dir, paste(pheno, "pval.list", sep = ".")))
#coeff.list = readRDS(file.path(results_dir, paste0(pheno,".coeff.list")))

In [4]:
# Inspect the nested structure (dataset -> slot -> p-value matrix) of the loaded list
str(pval.list)

List of 4
 $ liu    :List of 16
  ..$ TCAx.parametric.full.marginal   : num [1:153155, 1:6] 0.667 0.343 0.414 0.108 0.417 ...
  .. ..- attr(*, "dimnames")=List of 2
  .. .. ..$ : chr [1:153155] "cg00001349" "cg00002837" "cg00003287" "cg00008647" ...
  .. .. ..$ : chr [1:6] "Gran.age" "CD4T.age" "CD8T.age" "Mono.age" ...
  ..$ TCAx.parametric.full.joint      : num [1:153155, 1] 0.52031 0.00774 0.0283 0.13981 0.92741 ...
  .. ..- attr(*, "dimnames")=List of 2
  .. .. ..$ : chr [1:153155] "cg00001349" "cg00002837" "cg00003287" "cg00008647" ...
  .. .. ..$ : chr "age"
  ..$ TCAx.parametricX2Q2.marginal    : num [1:153155, 1:6] 0.235 0.542 0.19 0.128 0.479 ...
  .. ..- attr(*, "dimnames")=List of 2
  .. .. ..$ : chr [1:153155] "cg00001349" "cg00002837" "cg00003287" "cg00008647" ...
  .. .. ..$ : chr [1:6] "Gran.age" "CD4T.age" "CD8T.age" "Mono.age" ...
  ..$ TCAx.parametricX2Q2.joint       : num [1:153155, 1] 0.203 0.192 0.212 0.609 0.813 ...
  .. ..- attr(*, "dimnames")=List of 2
  .. .. .

In [5]:
# Collect the p-value matrix stored under `slot` for every dataset in
# `data.names`, then find the feature (CpG) ids present in all of them.
pvals = list()
feature.ids.by.data = list()   # one character vector of row names per dataset
for (data.name in data.names){
    slot.pvals = pval.list[[data.name]][[slot]]
    # Fail loudly on a missing dataset/slot or a matrix without row names:
    # assigning NULL into a list drops the entry, so the dataset would
    # silently be left out of the intersection below.
    if (is.null(slot.pvals) || is.null(rownames(slot.pvals))){
        stop("no row-named p-value matrix for slot '", slot,
             "' in dataset '", data.name, "'")
    }
    pvals[[data.name]] = slot.pvals
    feature.ids.by.data[[data.name]] = rownames(slot.pvals)
}

# Feature ids common to every dataset
shared.feature.ids = Reduce(intersect, feature.ids.by.data)

In [6]:
# Inspect the per-dataset p-value matrices selected by `slot`
str(pvals)

List of 3
 $ liu    : num [1:153155, 1] 0.58626 0.00662 0.02677 0.29886 0.92152 ...
  ..- attr(*, "dimnames")=List of 2
  .. ..$ : chr [1:153155] "cg00001349" "cg00002837" "cg00003287" "cg00008647" ...
  .. ..$ : chr "age"
 $ hannon1: num [1:134250, 1] 0.00692 0.96172 0.45208 0.07036 0.61987 ...
  ..- attr(*, "dimnames")=List of 2
  .. ..$ : chr [1:134250] "cg00000029" "cg00000109" "cg00000165" "cg00000236" ...
  .. ..$ : chr "age"
 $ hannon2: num [1:95360, 1] 1.54e-05 9.27e-01 5.43e-01 4.47e-04 1.73e-01 ...
  ..- attr(*, "dimnames")=List of 2
  .. ..$ : chr [1:95360] "cg00000029" "cg00000109" "cg00000236" "cg00000289" ...
  .. ..$ : chr "age"


In [7]:
# Restrict every dataset to the shared features and record which of them
# fall below the significance threshold.
sig.feature.ids = list()
for (data.name in data.names){
    pvals[[data.name]] = pvals[[data.name]][shared.feature.ids, , drop = FALSE]
    # which() skips NA p-values; a bare logical row index would turn each NA
    # into an all-NA row and hence an NA feature id.
    sig.rows = which(pvals[[data.name]][, 1] < sig.thr)
    sig.feature.ids[[data.name]] = rownames(pvals[[data.name]])[sig.rows]
    print(data.name)
    print(length(sig.feature.ids[[data.name]]))
}

[1] "liu"
[1] 7172
[1] "hannon1"
[1] 3257
[1] "hannon2"
[1] 9377


In [8]:
# Keep only the CpGs that are significant in every dataset. The per-dataset
# list is replaced by a single character vector, so reduce only while it is
# still a list: re-running Reduce(intersect, ...) on the resulting character
# vector would intersect the individual ids with each other and, for more
# than one distinct id, leave nothing.
if (is.list(sig.feature.ids)){
    sig.feature.ids = Reduce(intersect, sig.feature.ids)
}

In [9]:
# Number of CpGs below sig.thr in all datasets
length(sig.feature.ids)

# Check the overlap among the lowest-p-value CpGs in each dataset

In [10]:
# For each dataset, list the shared feature ids ordered by increasing p-value
top.feature.ids = list()
for (data.name in data.names){
    pvals[[data.name]] = pvals[[data.name]][shared.feature.ids, , drop = FALSE]
    top.feature.ids[[data.name]] = rownames(pvals[[data.name]])[order(pvals[[data.name]][, pheno])]
}

In [11]:
# For several cutoffs, count how many CpGs are among the top.t
# lowest-p-value features of every dataset.
for (top.t in c(2500, 5000, 10000)){
    shared.top.features = list()
    for (data.name in data.names){
        # head() stops at the end of the vector; `[1:top.t]` would pad with NA
        # when fewer than top.t features exist, and the NA shared by all
        # datasets would then be counted as an overlapping feature.
        shared.top.features[[data.name]] = head(top.feature.ids[[data.name]], top.t)
    }
    print(length(Reduce(intersect, shared.top.features)))
}

[1] 1373
[1] 2628
[1] 4760


In [12]:
# Top-ranked ids per dataset as left by the last loop iteration above (top.t = 10000)
str(shared.top.features)

List of 3
 $ liu    : chr [1:10000] "cg16867657" "cg21572722" "cg10501210" "cg22454769" ...
 $ hannon1: chr [1:10000] "cg16867657" "cg10501210" "cg21572722" "cg24724428" ...
 $ hannon2: chr [1:10000] "cg16867657" "cg21572722" "cg06639320" "cg24724428" ...


# Back to those that simultaneously cross sig.thr

In [13]:
# Directory holding the processed datasets (absolute, machine-specific path)
data.dir = "/u/home/j/johnsonc/project-halperin/TCAx/TCAx2023/Data/Methylation/Consistency/"

# Named list: dataset id -> path of its "<id>.processed.RData" file
file.paths = sapply(c("liu", "hannum", "hannon1", "hannon2"),
                    function(dataset.id) file.path(data.dir, paste0(dataset.id, ".processed.RData")),
                    simplify = FALSE, USE.NAMES = TRUE)

In [None]:
# load() restores the objects saved in each .RData file into the current
# environment under the names they were saved with.
# NOTE(review): the list below assumes those objects are called `liu`,
# `hannon1` and `hannon2` — confirm against how the files were written.
load(file.paths[["liu"]])
load(file.paths[["hannon1"]])
load(file.paths[["hannon2"]])
# Bundle the three datasets into one list keyed by dataset name
age.consistent.list = list("liu"     = liu, 
                           "hannon1" = hannon1, 
                           "hannon2" = hannon2)

In [None]:
# Keep only the rows of each dataset's X that belong to the consistently
# significant CpGs.
for (data.name in data.names){
    # drop = FALSE keeps X two-dimensional even when a single row (or a single
    # column) remains; the default would collapse it to a plain vector.
    age.consistent.list[[data.name]]$X = age.consistent.list[[data.name]]$X[sig.feature.ids, , drop = FALSE]
}

In [None]:
# Inspect the datasets after subsetting X to sig.feature.ids
str(age.consistent.list)

In [None]:
# NOTE(review): same call as the preceding cell — this duplicate cell can probably be removed
str(age.consistent.list)

In [None]:
# Write the subsetted datasets to "age.consistent.list.rds" inside data.dir
saveRDS(object = age.consistent.list,
        file   = file.path(data.dir, "age.consistent.list.rds"))

In [None]:
# #https://www.statisticshowto.com/benjamini-hochberg-procedure/#:~:text=The%20Benjamini%2DHochberg%20Procedure%20is,reject%20the%20true%20null%20hypotheses.
# hist(p.adjust(c(0.001, 0.008, 0.039, 0.041, 0.042, 
#            0.06, 0.074, 0.205, 0.234, 0.39, 
#            0.4, 0.9, 0.34, 0.67, 0.45,
#            0.32, 0.45, 0.79, 0.89, 0.43,
#            0.34, 0.58, 0.58, 0.97, 0.21), method = "fdr"), breaks = 100)
# ?p.adjust