-
Notifications
You must be signed in to change notification settings - Fork 2
/
parse_dataset.R
151 lines (127 loc) · 4.61 KB
/
parse_dataset.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
#' @importFrom fs is_file
#' @importFrom hdf5r H5File h5attr_names
#' @importFrom Matrix sparseMatrix Matrix t
parse_dataset <- function(x, loom_expression_layer = NULL) {
assert_that(
is.character(x),
length(x) == 1,
fs::is_file(x) || fs::is_link(x),
msg = "--dataset should contain a pathname of a .loom or .h5 file. Add a '-h' flag for help."
)
extra_input <- NULL
expression <- NULL
##########################
### LOOM ###
##########################
if (grepl("\\.loom$", x)) {
file_h5 <- H5File$new(x, mode = "r")
on.exit(file_h5$close_all())
assert_that(file_h5 %has_names% c("matrix", "row_attrs", "col_attrs", "layers"))
counts <- file_h5[["matrix"]][,] %>% Matrix(sparse = TRUE)
feature_paths <- paste0("row_attrs/", c("gene_names", "Gene"))
cell_paths <- paste0("col_attrs/", c("cell_names", "CellID"))
feature_exists <- map_lgl(feature_paths, file_h5$exists) %>% which()
cell_exists <- map_lgl(cell_paths, file_h5$exists) %>% which()
feature_ids <-
if (length(feature_exists) == 1) {
file_h5[[feature_paths[[feature_exists]]]][]
} else {
warning("feature IDs could not be found in the loom format!")
paste("Feature", seq_len(ncol(counts)))
}
if (any(duplicated(feature_ids))) {
stop("duplicated feature IDs found!")
}
cell_ids <-
if (length(cell_exists) == 1) {
file_h5[[cell_paths[[cell_exists]]]][]
} else {
warning("cell IDs could not be found in the loom format!")
paste("Cell", seq_len(nrow(counts)))
}
if (any(duplicated(cell_ids))) {
stop("duplicated cell IDs found!")
}
dimnames(counts) <- list(cell_ids, feature_ids)
if (!is.null(loom_expression_layer)) {
expression <- file_h5[[paste0("layers/", loom_expression_layer)]][,] %>% Matrix(sparse = TRUE)
dimnames(expression) <- list(cell_ids, feature_ids)
}
} else if (grepl("\\.h5$", x)) {
file_h5 <- H5File$new(x, mode = "r")
on.exit(file_h5$close_all())
if (file_h5 %has_names% c("data", "names") && "object_class" %in% h5attr_names(file_h5)) {
##########################
### OWN H5 ###
##########################
tmp <- dynutils::read_h5_(file_h5)
counts <- tmp$counts
expression <- tmp$expression
extra_input <- list()
if ("parameters" %in% names(tmp)) {
extra_input$parameters <- tmp$parameters
}
if ("priors" %in% names(tmp)) {
extra_input$priors <- tmp$priors
}
if ("prior_information" %in% names(tmp)) {
extra_input$priors <- tmp$prior_information
}
if ("seed" %in% names(tmp)) {
extra_input$seed <- tmp$seed
}
# add dataset prior if given
if (any(c("milestone_percentages", "divergence_regions", "milestone_network", "progressions") %in% names(tmp))) {
extra_input$priors$dataset <- tmp[c("milestone_network", "progressions", "milestone_percentages", "divergence_regions")]
}
} else if (file_h5 %has_names% "matrix" && file[["matrix"]] %has_names% c("barcodes", "data", "features", "indices", "indptr", "shape")) {
##########################
### CELLRANGER V3 ###
##########################
subfile <- file_h5[["matrix"]]
counts <-
Matrix::sparseMatrix(
i = subfile[["indices"]][],
p = subfile[["indptr"]][],
x = subfile[["data"]][],
dims = subfile[["shape"]][],
dimnames = list(
subfile[["features/id"]][],
subfile[["barcodes"]][]
),
index1 = FALSE
) %>%
Matrix::t()
} else if (length(names(file_h5)) == 1 && file_h5[[names(file_h5)]] %has_names% c("barcodes", "data", "genes", "indices", "indptr", "shape")) {
##########################
### CELLRANGER V2 ###
##########################
subfile <- file_h5[[names(file_h5)]]
counts <-
Matrix::sparseMatrix(
i = subfile[["indices"]][],
p = subfile[["indptr"]][],
x = subfile[["data"]][],
dims = subfile[["shape"]][],
dimnames = list(
subfile[["genes"]][],
subfile[["barcodes"]][]
),
index1 = FALSE
) %>%
Matrix::t()
}
}
if (is.null(expression)) {
expression <- normalise(counts)
}
out <- lst(counts, expression)
c(out, extra_input)
}
normalise <- function(counts) {
# TODO: provide better normalisation :(
# TODO: Also print out warning that better normalisation should be added
expr <- counts
expr@x <- log2(expr@x + 1)
expr
}