/
importBUStools.R
170 lines (152 loc) · 5.96 KB
/
importBUStools.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
# dir <- "genecount"
.constructSCEFromBUStoolsOutputs <- function(dir,
sample,
matrixFileName,
featuresFileName,
barcodesFileName,
gzipped,
class,
delayedArray) {
cb <- .readBarcodes(file.path(dir, barcodesFileName))
fe <- .readFeatures(file.path(dir, featuresFileName))
ma <- .readMatrixMM(file.path(dir, matrixFileName),
gzipped = gzipped,
class = class,
delayedArray = delayedArray)
ma <- t(ma)
coln <- paste(sample, cb[[1]], sep = "_")
rownames(ma) <- fe[[1]]
sce <- SingleCellExperiment::SingleCellExperiment(
assays = list(counts = ma))
SummarizedExperiment::rowData(sce) <- fe
SummarizedExperiment::colData(sce) <- S4Vectors::DataFrame(cb,
column_name = coln,
sample = sample,
row.names = coln)
return(sce)
}
# main function
.importBUStools <- function(
BUStoolsDirs,
samples,
matrixFileNames,
featuresFileNames,
barcodesFileNames,
gzipped,
class,
delayedArray,
rowNamesDedup) {
if (length(BUStoolsDirs) != length(samples)) {
stop("'BUStoolsDirs' and 'samples' have unequal lengths!")
}
res <- vector("list", length = length(samples))
matrixFileNames <- .getVectorized(matrixFileNames, length(samples))
featuresFileNames <- .getVectorized(featuresFileNames, length(samples))
barcodesFileNames <- .getVectorized(barcodesFileNames, length(samples))
gzipped <- .getVectorized(gzipped, length(samples))
for (i in seq_along(samples)) {
dir <- file.path(BUStoolsDirs[i])
scei <- .constructSCEFromBUStoolsOutputs(dir,
sample = samples[i],
matrixFileName = matrixFileNames[i],
featuresFileName = featuresFileNames[i],
barcodesFileName = barcodesFileNames[i],
gzipped = gzipped[i],
class = class,
delayedArray = delayedArray)
res[[i]] <- scei
}
sce <- do.call(SingleCellExperiment::cbind, res)
if (isTRUE(rowNamesDedup)) {
if (any(duplicated(rownames(sce)))) {
message("Duplicated gene names found, adding '-1', '-2', ",
"... suffix to them.")
}
sce <- dedupRowNames(sce)
}
return(sce)
}
#' @name importBUStools
#' @rdname importBUStools
#' @title Construct SCE object from BUStools output
#' @description Read the barcodes, features (genes), and matrix from BUStools
#' output. Import them
#' as one \link[SingleCellExperiment]{SingleCellExperiment} object. Note the
#' cells in the output files for BUStools 0.39.4 are not filtered.
#' @param BUStoolsDirs A vector of paths to BUStools output files. Each sample
#' should have its own path. For example: \code{./genecount}.
#' Must have the same length as \code{samples}.
#' @param samples A vector of user-defined sample names for the samples to be
#' imported. Must have the same length as \code{BUStoolsDirs}.
#' @param matrixFileNames Filenames for the Market Exchange Format (MEX) sparse
#' matrix files (.mtx files). Must have length 1 or the same
#' length as \code{samples}.
#' @param featuresFileNames Filenames for the feature annotation files.
#' Must have length 1 or the same length as \code{samples}.
#' @param barcodesFileNames Filenames for the cell barcode list file.
#' Must have length 1 or the same length as \code{samples}.
#' @param gzipped Boolean. \code{TRUE} if the BUStools output files
#' (barcodes.txt, genes.txt, and genes.mtx) were
#' gzip compressed. \code{FALSE} otherwise. This is \code{FALSE} in BUStools
#' 0.39.4. Default \code{"auto"} which automatically detects if the
#' files are gzip compressed. Must have length 1 or the same length as
#' \code{samples}.
#' @param class Character. The class of the expression matrix stored in the SCE
#' object. Can be one of "Matrix" (as returned by
#' \link{readMM} function), or "matrix" (as returned by
#' \link[base]{matrix} function). Default "Matrix".
#' @param delayedArray Boolean. Whether to read the expression matrix as
#' \link[DelayedArray]{DelayedArray-class} object or not. Default \code{FALSE}.
#' @param rowNamesDedup Boolean. Whether to deduplicate rownames. Default
#' \code{TRUE}.
#' @return A \code{SingleCellExperiment} object containing the count
#' matrix, the gene annotation, and the cell annotation.
#' @examples
#' # Example #1
#' # FASTQ files were downloaded from
#' # https://support.10xgenomics.com/single-cell-gene-expression/datasets/3.0.0
#' # /pbmc_1k_v3
#' # They were concatenated as follows:
#' # cat pbmc_1k_v3_S1_L001_R1_001.fastq.gz pbmc_1k_v3_S1_L002_R1_001.fastq.gz >
#' # pbmc_1k_v3_R1.fastq.gz
#' # cat pbmc_1k_v3_S1_L001_R2_001.fastq.gz pbmc_1k_v3_S1_L002_R2_001.fastq.gz >
#' # pbmc_1k_v3_R2.fastq.gz
#' # The following BUStools command generates the gene, cell, and
#' # matrix files
#'
#' # bustools correct -w ./3M-february-2018.txt -p output.bus | \
#' # bustools sort -T tmp/ -t 4 -p - | \
#' # bustools count -o genecount/genes \
#' # -g ./transcripts_to_genes.txt \
#' # -e matrix.ec \
#' # -t transcripts.txt \
#' # --genecounts -
#'
#' # The top 20 genes and the first 20 cells are included in this example.
#' sce <- importBUStools(
#' BUStoolsDirs = system.file("extdata/BUStools_PBMC_1k_v3_20x20/genecount/",
#' package = "singleCellTK"),
#' samples = "PBMC_1k_v3_20x20")
#' @export
importBUStools <- function(
BUStoolsDirs,
samples,
matrixFileNames = "genes.mtx",
featuresFileNames = "genes.genes.txt",
barcodesFileNames = "genes.barcodes.txt",
gzipped = "auto",
class = c("Matrix", "matrix"),
delayedArray = FALSE,
rowNamesDedup = TRUE) {
class <- match.arg(class)
.importBUStools(
BUStoolsDirs = BUStoolsDirs,
samples = samples,
matrixFileNames = matrixFileNames,
featuresFileNames = featuresFileNames,
barcodesFileNames = barcodesFileNames,
gzipped = gzipped,
class = class,
delayedArray = delayedArray,
rowNamesDedup = rowNamesDedup)
}