-
Notifications
You must be signed in to change notification settings - Fork 6
/
checkAlevinInputFiles.R
162 lines (150 loc) · 6.46 KB
/
checkAlevinInputFiles.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
#' Check that all required input files are available
#'
#' The directory is compared against each known alevin output layout in turn
#' (v0.14 or newer with a whitelist file, with an external whitelist, without
#' a final whitelist, and finally pre-v0.14). The first layout whose check
#' reports no problems is accepted.
#'
#' @param baseDir Path to the output directory from the alevin run (should be
#'   the directory containing the \code{alevin} directory).
#'
#' @author Charlotte Soneson
#'
#' @export
#'
#' @importFrom utils read.delim
#'
#' @return A character string identifying the first matching layout, one of
#'   \code{"v0.14"}, \code{"v0.14extwl"}, \code{"v0.14nowl"} or
#'   \code{"pre0.14"}. Raises an error, listing the missing or malformed
#'   files for every layout, if none of the layouts matches.
#'
#' @examples
#' checkAlevinInputFiles(system.file("extdata/alevin_example_v0.14",
#'                                   package = "alevinQC"))
#'
checkAlevinInputFiles <- function(baseDir) {
    ## Known output layouts, in the order in which they are tried. Each entry
    ## holds the value returned on success ('id'), the opening part of the
    ## failure report ('intro') and a function running the check ('check'),
    ## which returns NULL when the layout matches and the offending files
    ## otherwise.
    layouts <- list(
        ## "standard" v0.14 output, which includes a whitelist.txt file
        list(
            id = "v0.14",
            intro = c(
                "Input directory not compatible with Salmon v0.14 or newer ",
                "(without external whitelist), "
            ),
            check = function() {
                .checkAlevinInputFiles_v0.14(baseDir, type = "standard")
            }
        ),
        ## "extwl" v0.14 output: an external whitelist was provided, so no
        ## whitelist.txt file is required, but a 'whitelist' entry is
        ## (the helper looks it up in cmd_info.json)
        list(
            id = "v0.14extwl",
            intro = c(
                "Input directory not compatible with Salmon v0.14 or newer ",
                "(with external whitelist), "
            ),
            check = function() {
                .checkAlevinInputFiles_v0.14(baseDir, type = "extwl")
            }
        ),
        ## "nowl" v0.14 output: no whitelist.txt file, which can happen if
        ## the whitelisting could not be performed for some reason
        list(
            id = "v0.14nowl",
            intro = c(
                "Input directory not compatible with Salmon v0.14 or newer ",
                "(without final whitelist), "
            ),
            check = function() {
                .checkAlevinInputFiles_v0.14(baseDir, type = "nowl")
            }
        ),
        ## pre-v0.14 output, tried only when all v0.14 variants failed
        list(
            id = "pre0.14",
            intro = "Input directory not compatible with Salmon pre-v0.14, ",
            check = function() {
                .checkAlevinInputFiles_pre0.14(baseDir)
            }
        )
    )

    ## Collect one report per failed layout; stop at the first success
    msg <- NULL
    for (layout in layouts) {
        problems <- layout$check()
        if (is.null(problems)) {
            return(layout$id)
        }
        msg <- c(msg,
                 layout$intro,
                 "the following required file(s) are missing or malformed:\n",
                 paste(problems, collapse = "\n"), "\n\n")
    }

    ## If we are here, the baseDir doesn't match any of the known
    ## output structures.
    stop(msg)
}
# Check baseDir against the pre-v0.14 alevin output layout.
# Only the existence of the files is tested, not their content.
# Returns NULL if every required file exists, otherwise a character vector
# with the paths of the missing files (nothing is raised here).
.checkAlevinInputFiles_pre0.14 <- function(baseDir) {
    ## Required files, relative to baseDir
    relPaths <- c("alevin/raw_cb_frequency.txt",
                  "alevin/filtered_cb_frequency.txt",
                  "alevin/featureDump.txt",
                  "alevin/MappedUmi.txt",
                  "alevin/whitelist.txt",
                  "alevin/quants_mat_rows.txt",
                  "alevin/quants_mat_cols.txt",
                  "alevin/quants_mat.gz",
                  "aux_info/meta_info.json",
                  "cmd_info.json")
    expected <- unlist(lapply(relPaths, function(p) file.path(baseDir, p)))

    ## Keep only those that are not present on disk
    absent <- expected[!file.exists(expected)]
    if (length(absent) == 0) {
        return(NULL)
    }
    absent
}
# Check baseDir against the alevin v0.14 (or newer) output layout.
#
# type is either 'standard', 'extwl', or 'nowl':
#  - 'standard' additionally requires the alevin/whitelist.txt file
#  - 'extwl' additionally requires a 'whitelist' entry in cmd_info.json
#  - 'nowl' (or any other value) only performs the checks shared by all types
#
# Returns NULL if the directory is consistent with the requested type,
# otherwise a character vector naming the missing or malformed file(s).
# Nothing is raised for a missing file or an unreadable featureDump.txt;
# an unparseable cmd_info.json still propagates the parser's error.
.checkAlevinInputFiles_v0.14 <- function(baseDir, type = "standard") {
    featureDumpFile <- file.path(baseDir, "alevin/featureDump.txt")
    cmdInfoFile <- file.path(baseDir, "cmd_info.json")

    ## Files required regardless of 'type'
    reqFiles <- c(file.path(baseDir, "alevin/raw_cb_frequency.txt"),
                  featureDumpFile,
                  file.path(baseDir, "aux_info/meta_info.json"),
                  file.path(baseDir, "aux_info/alevin_meta_info.json"),
                  cmdInfoFile)
    if (type == "standard") {
        ## 'standard' checks should include also the whitelist.txt file
        reqFiles <- c(reqFiles, file.path(baseDir, "alevin/whitelist.txt"))
    }

    ## If any of the files are missing, stop and return these
    missingFiles <- reqFiles[!file.exists(reqFiles)]
    if (length(missingFiles) > 0) {
        return(missingFiles)
    }

    ## All required files exist. Check that they contain the required info.
    if (type == "extwl") {
        ## 'extwl' checks should have a 'whitelist' entry in cmd_info.json.
        ## Report the file that is actually inspected, and use [[ ]] so that
        ## an entry whose name merely starts with "whitelist" is not
        ## accepted through partial matching.
        cmdInfo <- rjson::fromJSON(file = cmdInfoFile)
        if (is.null(cmdInfo[["whitelist"]])) {
            return(paste0(cmdInfoFile, " (whitelist entry)"))
        }
    }

    ## Check that all required columns in featureDump.txt are present.
    ## Only the first line (the header) is read. A file that cannot be
    ## parsed at all (e.g. an empty one) is treated as malformed rather
    ## than letting the read error escape.
    reqCols <- c("CorrectedReads", "MappingRate", "DedupRate",
                 "NumGenesOverMean", "MappedReads", "DeduplicatedReads",
                 "NumGenesExpressed")
    coln <- tryCatch(
        unlist(utils::read.delim(
            featureDumpFile,
            header = FALSE, as.is = TRUE, sep = "\t", nrows = 1)),
        error = function(e) NULL
    )
    if (!all(reqCols %in% coln)) {
        return(featureDumpFile)
    }

    ## If we are here, the input is consistent with the provided
    ## 'type' argument.
    NULL
}