/
data.R
487 lines (471 loc) · 17.1 KB
/
data.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
#' A GRanges object with genomic coordinates for cytosines
#' from chr1 for the package's built-in DNA methylation data
#'
#' Corresponds to
#' the rows of brcaMethylData1.
#' DNA methlyation data is Illumina 450k
#' microarray data from breast cancer
#' patients from The Cancer Genome Atlas
#' (TCGA-BRCA, https://portal.gdc.cancer.gov/).
#' Coordinates correspond to the hg38 genome version.
#' Only selected cytosines on chr1 are included to keep
#' the example data small.
#'
#'
#' @docType data
#' @keywords datasets
#' @name brcaMCoord1
#' @usage data(brcaMCoord1)
#' @format A GRanges object
NULL
#' A matrix with DNA methylation levels
#' from some CpGs on chromosome 1
#'
#' This object contains methylation levels (0 to 1)
#' for select cytosines in chromosome 1
#' for TCGA breast cancer patients from
#' a DNA methylation microarray (Illumina 450k microarray).
#' Each row corresponds to one cytosine and
#' the coordinates for these cytosines are
#' in the object brcaMCoord1, (data("brcaMCoord1"), hg38 genome).
#' Only select cytosines on chr1 are included to keep
#' the example data small. Columns are patients,
#' with TCGA patient identifiers as column names.
#' 6004 CpGs and 300 patients are included.
#' DNA methlyation data is Illumina 450k
#' microarray data from breast cancer
#' patients from The Cancer Genome Atlas
#' (TCGA-BRCA, https://portal.gdc.cancer.gov/).
#'
#'
#' @docType data
#' @keywords datasets
#' @name brcaMethylData1
#' @usage data(brcaMethylData1)
#' @format A matrix object
NULL
#' A matrix with principal component scores for
#' PCs 1-4 for four breast cancer patients.
#'
#' This object contains PC scores for four patients
#' for PCs 1-4. Columns are PCs. Rows are patients,
#' with TCGA patient identifiers as row names.
#' DNA methlyation data is Illumina 450k
#' microarray data from breast cancer
#' patients from The Cancer Genome Atlas
#' (TCGA-BRCA, https://portal.gdc.cancer.gov/),
#' hg38 genome.
#'
#'
#' @docType data
#' @keywords datasets
#' @name brcaPCScores
#' @usage data(brcaPCScores)
#' @format A matrix object
NULL
#' A data.frame with principal component scores for
#' PCs 1-4 for 657 breast cancer patients as well
#' as a column with estrogen receptor status.
#'
#' This object contains PC scores for 657 patients
#' for PCs 1-4. Columns are PCs as well
#' as a column with estrogen receptor status. Rows are patients,
#' with TCGA patient identifiers as row names. Patients
#' were selected from all BRCA patients in TCGA based on having complete
#' metadata information for estrogen receptor status
#' and progesterone receptor status as well as
#' having 450k microarray data.
#' PCA was done on the Illumina 450k
#' DNA methlyation
#' microarray data (TCGA-BRCA, https://portal.gdc.cancer.gov/),
#' hg38 genome.
#'
#'
#' @docType data
#' @keywords datasets
#' @name brcaPCScores657
#' @usage data(brcaPCScores657)
#' @format A data.frame object
NULL
#' Estrogen receptor alpha binding regions.
#'
#' Binding regions for estrogen receptor alpha (ESR1).
#' hg 38 genome version. Only includes regions in chr1 to keep
#' the example data small.
#'
#' @docType data
#' @keywords datasets
#' @name esr1_chr1
#' @usage data(esr1_chr1)
#' @format A GRanges object
NULL
#' Gata3 binding regions.
#'
#' Binding regions for gata3.
#' hg38 genome version.
#' Only includes regions in chr1 to keep
#' the example data small.
#'
#' @docType data
#' @keywords datasets
#' @name gata3_chr1
#' @usage data(gata3_chr1)
#' @format A GRanges object
NULL
#' Nrf1 binding regions.
#'
#' Binding regions for Nrf1.
#' hg38 genome version.
#' Only includes regions in chr1 to keep
#' the example data small.
#'
#' @docType data
#' @keywords datasets
#' @name nrf1_chr1
#' @usage data(nrf1_chr1)
#' @format A GRanges object
NULL
#' Atf3 binding regions.
#'
#' Binding regions for Atf3.
#' hg38 genome version.
#' Only includes regions in chr1 to keep
#' the example data small.
#'
#' @docType data
#' @keywords datasets
#' @name atf3_chr1
#' @usage data(atf3_chr1)
#' @format A GRanges object
NULL
#' Example COCOA Results (made up)
#'
#' A data.frame with example COCOA results.
#' 5 region sets with names given by rsScores$rsName.
#' Each region set has a score for each PC. Scores
#' for real region sets would normally be orders of
#' magnitude smaller.
#'
#' @docType data
#' @keywords datasets
#' @name rsScores
#' @usage data(rsScores)
#' @format A data.frame object
NULL
#' A GRanges object with coordinates for select
#' BRCA ATAC-seq peak regions from chr1.
#'
#' Corresponds to
#' the rows of brcaATACData1.
#' The ATAC-seq data is from breast cancer
#' patients from The Cancer Genome Atlas
#' (TCGA-BRCA, Corces et. al, 2018, doi: 10.1126/science.aav1898,
#' https://atacseq.xenahubs.net/download/brca/brca_peak_Log2Counts_dedup).
#' Coordinates correspond to the hg38 genome version.
#' Only select regions on chr1 are included to keep
#' the example data small.
#'
#'
#' @docType data
#' @keywords datasets
#' @name brcaATACCoord1
#' @usage data(brcaATACCoord1)
#' @format A GRanges object
NULL
#' A matrix with ATAC-seq counts in select peak regions
#' from chromosome 1 for 37 patients.
#'
#' Each row corresponds to one region and
#' the coordinates for these regions are
#' in the object brcaATACCoord1, (data("brcaATACCoord1"), hg38 genome).
#' Only select regions on chr1 are included to keep
#' the example data small. Columns are patients,
#' with TCGA patient identifiers as column names.
#' 4053 regions are included.
#' ATAC-seq data is from breast cancer
#' patients from The Cancer Genome Atlas
#' (TCGA-BRCA, Corces et. al, 2018, doi: 10.1126/science.aav1898,
#' https://atacseq.xenahubs.net/download/brca/brca_peak_Log2Counts_dedup).
#'
#'
#' @docType data
#' @keywords datasets
#' @name brcaATACData1
#' @usage data(brcaATACData1)
#' @format A matrix object
NULL
#' A data.frame with patient metadata for breast
#' cancer patients.
#'
#' Has metadata for patients for which DNA methylation
#' or chromatin accessibility data was included as package data (329 patients).
#' Rows are patients,
#' with TCGA patient identifiers as row names and the column "subject_ID".
#' Also includes columns: ER_status, ER_percent, age_diagnosis, days_to_death,
#' and days_to_last_follow_up.
#' Metadata is from The Cancer Genome Atlas
#' (TCGA-BRCA, https://portal.gdc.cancer.gov/).
#'
#'
#' @docType data
#' @keywords datasets
#' @name brcaMetadata
#' @usage data(brcaMetadata)
#' @format A data.frame object
NULL
# # script for generating package data
# # restricting data to reduce how much memory the package takes up
# # first run part of 02-brca_PCRSA to get brcaMList and filteredMData
#
# library(simpleCache)
# library(GenomicRanges)
#
# setCacheDir(paste0(Sys.getenv("PROCESSED"), "brca_PCA/RCache/"))
#
# # loading values
# simpleCache("allMPCA_657", assignToVariable = "mPCA")
# loadingMat <- mPCA$rotation
#
# # coordinates
# coordinateDT <- brcaMList[["coordinates"]]
#
# # region sets
# # some of the top ranked region sets for PC1 (in top 5)
# esr1 <- GRList[rsName ==
# "Human_MCF-7_ESR1_E2-45min_Brown.bed"][[1]]
# gata3 <- GRList[rsName ==
# "wgEncodeAwgTfbsSydhMcf7Gata3UcdUniPk.narrowPeak"][[1]]
# # some of the bottom ranked region sets
# # for PC1 (in bottom 1% of >2200 region sets)
# nrf1 <- GRList[rsName ==
# "wgEncodeAwgTfbsSydhHepg2Nrf1IggrabUniPk.narrowPeak"][[1]]
# atf3 <- GRList[rsName ==
# "wgEncodeAwgTfbsSydhK562Atf3UniPk.narrowPeak"][[1]]
#
#
# # restrict to chromosome 22 (reduce file sizes)
# chr22Ind <- coordinateDT$chr == "chr22"
# coord22 <- coordinateDT[chr22Ind,]
# # subset loadings, also restrict PCs
# load22 <- loadingMat[chr22Ind, paste0("PC", 1:4)]
#
# grList <- GRangesList(er, gata, nrf1, atf3)
# ol22 <- lapply(X = grList,
# function(x) findOverlaps(query = x,
# subject = MIRA:::dtToGr(coord22)))
#
#
# # restrict to chromosome 1 (reduce file sizes)
# chr1Ind <- coordinateDT$chr == "chr1"
# brcaMCoord1 <- coordinateDT[chr1Ind,]
# brcaMCoord1 <- as.data.frame(brcaMCoord1)
# # subset loadings, also restrict PCs
# brcaLoadings1 <- loadingMat[chr1Ind, paste0("PC", 1:4)]
#
# # restrict region sets to chr1
# grList <- GRangesList(er, gata3, nrf1, atf3)
# ol1 <- lapply(X = grList,
# function(x) findOverlaps(query = x,
# subject = MIRA:::dtToGr(coord1)))
#
# # to restrict to chr1
# esr1_chr1 <- esr1[ seqnames(esr1) == "chr1"]
# gata3_chr1 <- gata3[ seqnames(gata3) == "chr1"]
# nrf1_chr1 <- nrf1[ seqnames(nrf1) == "chr1"]
# atf3_chr1 <- atf3[ seqnames(atf3) == "chr1"]
# # add sequence info (seqInfo)
# exRSNames <- c("esr1_chr1", "gata3_chr1", "nrf1_chr1", "atf3_chr1")
#
# # for loop is to avoid having to retype expression for each region set
# for (i in seq_along(exRSNames)) {
#
# # restrict seqinfo to only chromosomes that are present (chr1)
# assignString = paste0("seqlevels(", exRSNames[i], ") <- as.character(unique(seqnames(", exRSNames[i], ")))")
# eval(parse(text=assignString))
#
# assignString = paste0("isCircular(", exRSNames[i], ") <- rep(FALSE, length(seqinfo(", exRSNames[i], ")))")
# eval(parse(text=assignString))
#
# # assign reference genome
# assignString = paste0("genome(", exRSNames[i], ") <- rep(\"hg38\", length(seqinfo(", exRSNames[i], ")))")
# eval(parse(text=assignString))
#
# # chr1 has length of 248,956,422 based on below link
# # https://www.ncbi.nlm.nih.gov/grc/human/data
# assignString = paste0("seqlengths(", exRSNames[i], ") <- 248956422")
# eval(parse(text=assignString))
# }
#
# # save("", file = "coord1.RData") # reduce ~4x from in-memory size
# save("brcaLoadings1",
# file = "brcaLoadings1.RData",
# compress = "xz") # reduce ~7x
# save("brcaMCoord1", file = "brcaMCoord1.RData", compress = "xz")
#
# save("esr1_chr1", file = "esr1_chr1.RData", compress = "xz")
# save("gata3_chr1", file = "gata3_chr1.RData", compress = "xz")
# save("nrf1_chr1", file = "nrf1_chr1.RData", compress = "xz")
# save("atf3_chr1", file = "atf3_chr1.RData", compress = "xz")
#
# ### Get DNA methylation level data, filteredMData from another script
# filteredMData <- cbind(brcaMList[["coordinates"]], filteredMData)
# chr1MData <- filteredMData[filteredMData$chr == "chr1", ]
#
# # selecting patients to include based on PC1, two with low PC scores,
# # and two with high PC scores
# pc1Order <- sort(mPCA$x[, "PC1"], decreasing = FALSE)
# lowScore <- names(pc1Order)[1:2]
# highScore <- names(pc1Order)[(length(pc1Order) - 1):length(pc1Order)]
#
# brcaMethylData1 <- chr1MData[, c(lowScore, highScore), with=FALSE]
# brcaMethylData1 <- as.matrix(brcaMethylData1)
# save("brcaMethylData1", file = "brcaMethylData1.RData", compress = "xz")
#
# ## also getting PC scores for PC 1-4 for these patients
# brcaPCScores <- mPCA$x[c(lowScore, highScore), paste0("PC", 1:4)]
# save("brcaPCScores", file="brcaPCScores.RData", compress="xz")
##################
# # signalMat, signalCoord, patientMetadata
# source(paste0(Sys.getenv("CODE"), "COCOA_paper/src/00-init.R"))
# loadBRCADNAm(loadingMat = FALSE, pcScores = FALSE)
# # making second round of BRCA DNAm data
#
# signalCoord <- COCOA:::dtToGr(signalCoord)
# data("brcaMethylData1")
#
# data("esr1_chr1")
# data("gata3_chr1")
# data("atf3_chr1")
# data("nrf1_chr1")
#
# allTfs <- c(esr1_chr1, gata3_chr1, atf3_chr1, nrf1_chr1)
#
# # expand regions so meta region profile can be created
# allTfs <- resize(allTfs, width=12000, fix="center")
#
# # regions that are covered by any TF
# allTfs <- reduce(allTfs)
# # length(unique(queryHits(findOverlaps(query = brcaATACCoord1, subject = allTfs))))
# overlapsRegionSets <- rep(FALSE, length(signalCoord))
# overlapsRegionSets[unique(queryHits(findOverlaps(query = signalCoord,
# subject = allTfs)))] <- TRUE
#
#
# brcaMCoord1 <- signalCoord[overlapsRegionSets]
#
# # select samples, try to get a somewhat even distribution across PC's 1 and 4
# data("brcaPCScores657")
#
# set.seed(1234)
# PC1Vec <- brcaPCScores657[, c("PC1", "PC4")]
# PC1Dist <- as.matrix(dist(PC1Vec))
# neighbor3Dist <- apply(X = PC1Dist, 2, function(x) mean(sort(x)[2:4]))
#
# sampleInd <- sample(1:nrow(brcaPCScores657), size = 300, replace = FALSE, prob = neighbor3Dist)
# plot(brcaPCScores657[sampleInd, c("PC1", "PC4")])
# sampleNames <- row.names(brcaPCScores657)[sampleInd]
# sampleNamesDNAM <- sampleNames
#
# brcaMethylData1 <- signalMat[overlapsRegionSets, sampleNames]
# save(brcaMethylData1, file="brcaMethylData1.RData", compress="xz")
# save(brcaMCoord1, file="brcaMCoord1.RData", compress="xz")
############ data for rsRankingIndex
# setwd("/data")
# PC1 <- c(1, 2, 3, 5, 7)
# PC2 <- c(5, 1, 3, 2, 4)
# rsName <- c("rs1", "rs2", "rs3", "rs4", "rs5")
# rsScores <- data.frame(PC1, PC2, rsName)
# save(rsScores, file = "rsScores.RData", compress = "xz")
# rsRankingIndex(rsScores = rsScores, PCsToAnnotate = c("PC1", "PC2"))
############ brcaPCScores657
# # # first load patient metadata and PCA for the 657 patients
# # brcaPCScores657 <- as.data.frame(allMPCA$x[, c("PC1", "PC2", "PC3", "PC4")])
# # patientMetadata <- as.data.frame(patientMetadata)
# # row.names(patientMetadata) <- patientMetadata$subject_ID
# # brcaPCScores657 <- cbind(brcaPCScores657, ER_Status = patientMetadata[row.names(brcaPCScores657), "ER_status"])
# # save(brcaPCScores657, file="brcaPCScores657.RData", compress = "xz")
#
########### BRCA ATAC-seq data
# # first load BRCA ATAC-seq data from Corces et al. (signalMat, signalCoord)
# loadBRCAatac(signalMat = TRUE, signalCoord = TRUE,
# loadingMat = TRUE, pcScores = TRUE)
#
# # subset to only ATAC regions that are on chr1 and also
# # overlap with the package's existing region sets
# chr1Ind <- as.logical(seqnames(signalCoord) == "chr1")
#
# data("esr1_chr1")
# data("gata3_chr1")
# data("atf3_chr1")
# data("nrf1_chr1")
# allTfs <- c(esr1_chr1, gata3_chr1, atf3_chr1, nrf1_chr1)
# # expand regions so meta region profile can be created
# allTfs <- resize(allTfs, width=12000, fix="center")
# allTfs <- reduce(allTfs)
# # length(unique(queryHits(findOverlaps(query = brcaATACCoord1, subject = allTfs))))
# overlapsRegionSets <- rep(FALSE, length(signalCoord))
# overlapsRegionSets[unique(queryHits(findOverlaps(query = signalCoord,
# subject = allTfs)))] <- TRUE
# keepInd <- chr1Ind & overlapsRegionSets
#
# brcaATACCoord1 <- signalCoord[keepInd]
# brcaATACData1 <- signalMat[keepInd, ]
#
# save("brcaATACCoord1", file = "brcaATACCoord1.RData", compress = "xz")
#
# sampleNames <- colnames(brcaATACData1)
# # pca of all the data
# a = prcomp(t(signalMat))
# plot(a$x[, 1:2])
# patientMetadata <- as.data.frame(patientMetadata)
# row.names(patientMetadata) <- patientMetadata$subject_ID
# annoScores <- cbind(data.frame(a$x),
# ER_status=as.factor(patientMetadata[sampleNames, "ER_status"]))
# ggplot(data = annoScores, mapping = aes(x=PC1, y=PC2)) + geom_point(aes(color=ER_status))
# sampleInd = ((annoScores$PC1 > 12) | (annoScores$PC1 < -15)) & !is.na(annoScores$ER_status)
#
# # take out samples without ER status
# brcaATACData1 <- brcaATACData1[, !is.na(annoScores$ER_status)]
# sampleNames <- colnames(brcaATACData1)
#
# # trying to get a somewhat even distribution across PC1
# set.seed(1234)
# PC1Vec <- a$x[sampleNames, "PC1"]
# PC1Dist <- as.matrix(dist(PC1Vec))
# neighbor3Dist <- apply(X = PC1Dist, 2, function(x) mean(sort(x)[2:4]))
# sampleInd <- sample(1:ncol(brcaATACData1), size = 37, replace = FALSE, prob = neighbor3Dist)
#
# sampleNames <- sampleNames[sampleInd]
# sampleNamesATAC <- sampleNames
# a = prcomp(t(brcaATACData1[, sampleInd]))
# plot(a$x[, 1:2])
# patientMetadata <- as.data.frame(patientMetadata)
# row.names(patientMetadata) <- patientMetadata$subject_ID
# annoScores <- cbind(data.frame(a$x),
# ER_status=as.factor(patientMetadata[sampleNames, "ER_status"]))
# ggplot(data = annoScores, mapping = aes(x=PC1, y=PC2)) + geom_point(aes(color=ER_status))
#
# brcaATACData1 <- brcaATACData1[, sampleInd]
#
# # only keep 3 sig figs
# brcaATACData1 <- round(brcaATACData1, 2)
#
# save("brcaATACData1", file = "brcaATACData1.RData", compress = "xz")
#
# # we no longer need to include the loadings or PC scores because we can
# # do PCA to generate them in the vignette
# # brcaATACLoadings1 <- loadingMat[keepInd, paste0("PC", 1:4)]
# # save("brcaATACLoadings1", file = "brcaATACLoadings1.RData", compress = "xz")
# # brcaATACPCScores <- pcScores[c(), paste0("PC", 1:4)]
# # save("brcaATACPCScores", file = "brcaATACPCScores", compress = "xz")
# #
# ###########################################
# # get brcaMetadata
# sampleNames <- unique(c(sampleNamesDNAM, sampleNamesATAC))
# patientMetadata <- as.data.frame(patientMetadata)
# row.names(patientMetadata) <- patientMetadata$subject_ID
# brcaMetadata <- patientMetadata[sampleNames, c("subject_ID", "ER_status", "ER_percent",
# "age_diagnosis", "days_to_death",
# "days_to_last_follow_up")]
# brcaMetadata$ER_status = as.factor(brcaMetadata$ER_status)
# save(brcaMetadata, file = "brcaMetadata.RData", compress="xz")