/
cell_types_utils.R
executable file
·363 lines (332 loc) · 13.6 KB
/
cell_types_utils.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
#!/usr/bin/env Rscript
# Global configuration shared by all metric functions in this file.
# Vocabulary of labels that mean "no assignment was made".
# NB: keep these relevant to the tools' output; NA and "" cover missing values.
unlabelled = c("unassigned", "unknown", "rand", "node", "ambiguous", NA, "not available", "")
# Common words ignored by the word-matching metric; add terms here to
# improve specificity of the shared-terms method.
trivial_terms = c("cell", "of", "tissue", "entity", "type")
### find number of unassigned cells ###
# Return the fraction of `labels` that fall in the `unlabelled` vocabulary,
# rounded to 3 decimal places. Input is coerced to character so factors work.
get_unlab_rate = function(labels, unlabelled){
  labels = as.character(labels)
  # mean of a logical vector is the proportion of TRUEs
  frac_unassigned = mean(labels %in% unlabelled)
  return(round(frac_unassigned, 3))
}
#### find delta between unlabelled cells in reference dataset and those in predicted dataset ###
# Absolute difference between the two unassigned-cell proportions,
# rounded to 3 decimal places.
get_unlab_rate_delta = function(prop_unlab_reference, prop_unlab_predicted){
  delta = abs(prop_unlab_predicted - prop_unlab_reference)
  return(round(delta, 3))
}
#### find proportion of exact matches between reference cell types and predicted labels ###
# Return the fraction of positions where reference and predicted labels are
# identical, rounded to 3 decimal places.
# NA comparisons count as non-matches: the previous implementation used
# length(x[x == y]), and logical subsetting with NA inserts NA elements,
# so every NA comparison was silently counted as a match.
get_exact_matches = function(reference_labels, predicted_labels){
  matches = sum(reference_labels == predicted_labels, na.rm = TRUE)
  prop_exact_match = round(matches/length(reference_labels), 3)
  return(prop_exact_match)
}
### find mean proportion of shared terms between reference and predicted labels ###
# For each cell, split both labels into words; compute the proportion of the
# prediction's words that also occur in the reference label, ignoring
# trivial_terms; return the mean over all cells, rounded to 3 dp.
# Fix: the previous sapply-based pipeline could simplify the list of per-cell
# intersections into a matrix/vector whenever all intersections had equal
# length, which corrupted the subsequent per-cell indexing. strsplit is
# already vectorised and vapply never simplifies away the list structure.
get_shared_terms_prop = function(reference_labs, pred_labs, trivial_terms){
  # split predicted and provided annotations into words
  split_ref = strsplit(as.character(reference_labs), " ")
  split_prediction = strsplit(as.character(pred_labs), " ")
  # per-cell proportion of predicted words that match the reference label
  ratios = vapply(seq_along(split_ref), function(i){
    shared = intersect(split_ref[[i]], split_prediction[[i]])
    # remove trivial terms from the intersection
    shared = shared[!shared %in% trivial_terms]
    length(shared) / length(split_prediction[[i]])
  }, numeric(1))
  return(round(mean(ratios), 3))
}
### find per-class/median F-1 scores and accuracy ###
# adapted from https://github.com/tabdelaal/scRNAseq_Benchmark/blob/master/evaluate.R
# Builds confusion tables from per-cell reference and predicted labels and
# returns list(MedF1 = median per-class F1 score, Acc = overall accuracy),
# both rounded to 3 decimal places. Labels listed in `unlabelled` are
# excluded from conf_F1 (used for F1/accuracy) but kept in `conf`.
# NOTE(review): the default `unlabelled=unlabelled` is self-referential and
# would error if the argument were omitted; the caller in this file always
# passes it explicitly — confirm before relying on the default.
get_f1 = function(reference_labs, predicted_labs, unlabelled=unlabelled) {
  unique_ref = unique(reference_labs)
  unique_pred = unique(predicted_labs)
  # full confusion table: rows = reference classes, cols = predicted classes
  conf = table(reference_labs, predicted_labs)
  # per-class population sizes from the unfiltered table
  pop_size = rowSums(conf)
  # confusion table for existing cell types, excluding unassigned
  conf_F1 = table(reference_labs,predicted_labs, exclude = unlabelled)
  F1 = c()
  sum_acc = 0
  # iterate across classes and calculate per-class F1-score
  # NB: `i` indexes the (sorted) rows of conf_F1; the loop bound only assumes
  # conf_F1 has as many rows as there are unique reference labels
  for (i in c(1:length(unique_ref))){
    # check if label is in confusion table
    # find_label marks the column whose name equals this row's class name
    find_label = colnames(conf_F1) == row.names(conf_F1)[i]
    if(sum(find_label, na.rm = TRUE)){
      # precision: correct predictions of this class / all predictions of it
      prec = conf_F1[i, find_label] / colSums(conf_F1)[find_label]
      # recall: correct predictions of this class / true class size
      rec = conf_F1[i, find_label] / rowSums(conf_F1)[i]
      if (prec == 0 || rec == 0){
        F1[i] = 0
      } else{
        F1[i] = (2*prec*rec) / (prec + rec)
      }
      # the diagonal cell contributes to overall accuracy
      sum_acc = sum_acc + conf_F1[i, find_label]
    } else {
      # class never predicted under its exact name: F1 is zero
      F1[i] = 0
    }
  }
  # filter out empty classes
  pop_size = pop_size[pop_size > 0]
  # name per-class scores after the non-empty classes
  # NOTE(review): assumes F1 order matches the filtered pop_size order —
  # both follow the sorted row order of the confusion tables; verify when
  # unlabelled classes are present in the reference.
  names(F1) = names(pop_size)
  med_F1 = round(median(F1), 3)
  # calculate overall accuracy
  acc = round(sum_acc/sum(conf_F1), 3)
  out = list(MedF1 = med_F1, Acc = acc)
  return(out)
}
# import CL ontology graph
# Resolve the ontology graph to a local file path. Behaviour:
#   - ont_graph is NA          -> download the default cl-basic.obo (if absent)
#   - ont_graph looks like URL -> download it (if absent)
#   - ont_graph is a path      -> return it directly, or stop if missing
# Downloads go into `tmpdir` (created if needed), or $TMPDIR when tmpdir is NA.
import_ontology_graph = function(tmpdir, ont_graph){
  # check ontology graph; specify download link or use provided
  if(is.na(ont_graph)){
    uri = "https://raw.githubusercontent.com/obophenotype/cell-ontology/master/cl-basic.obo"
  } else if(grepl("www.|http:|https:", ont_graph)){
    uri = ont_graph
  } else {
    # a local file was supplied; use it as-is
    if(!file.exists(ont_graph)){
      stop(paste("File", ont_graph, "does not exist."))
    }
    return(ont_graph)
  }
  # use system variable if no tmpdir specified; create it, if necessary
  if(is.na(tmpdir)){
    tmpdir = Sys.getenv("TMPDIR")
  } else if(!dir.exists(tmpdir)){
    dir.create(tmpdir)
  }
  # download only if the destination file is not already cached
  dest = paste(tmpdir, basename(uri), sep="/")
  if(!file.exists(dest)){
    download.file(uri, destfile=dest)
  }
  return(dest)
}
# calculate similarity between a pair of labels
# Maps both labels to CL terms via lab_cl_mapping (a hash) and computes their
# pairwise semantic similarity with Onassis. Returns:
#   0  - if either label is unassigned, or the similarity computation errors
#   NA - if either label has no CL-term mapping
get_CL_similarity = function(label_1, label_2, lab_cl_mapping, ontology, sim_metric, unlabelled) {
  suppressPackageStartupMessages(require(Onassis)) #NB: keep package import within function, otherwise parallelism is broken
  # initialise and configure Similarity object
  siml_object = new('Similarity')
  ontology(siml_object) = ontology
  # configure similarity measurement metric
  if(! sim_metric %in% listSimilarities()$pairwiseMeasures){
    stop("Incorrect semantic similarity metric provided.")
  }
  idx = which(listSimilarities()$pairwiseMeasures == sim_metric)
  pairwiseConfig(siml_object) = listSimilarities()$pairwiseMeasures[idx]
  # unassigned labels have similarity 0 by definition
  if(label_1 %in% unlabelled || label_2 %in% unlabelled){
    return(0)
  }
  # return NA when EITHER label is missing from the mapping
  # (bug fix: the old condition `! l1 %in% keys(..) && l2 %in% keys(..)` only
  # caught the case where label_1 was unmapped while label_2 was mapped)
  if(!(label_1 %in% keys(lab_cl_mapping)) || !(label_2 %in% keys(lab_cl_mapping))){
    return(NA)
  }
  term_1 = lab_cl_mapping[[as.character(label_1)]]
  term_2 = lab_cl_mapping[[as.character(label_2)]]
  # semantic similarity; any error degrades gracefully to 0
  tryCatch({
    psim = pairsim(siml_object, term_1, term_2)
    return(psim)
  },
  error = function(cond){
    print(cond)
    print("returning 0")
    return(0)})
}
# get a combined score across all metrics
# Average of the [0;1]-bounded metrics (exact match, shared terms, median F1
# and, when enabled, semantic similarity), rounded to 3 decimal places.
# NB: unlab_delta and accuracy are accepted for interface compatibility but
# do not enter the combined score.
get_tool_combined_score = function(unlab_delta,
                                   exact_match_prop,
                                   mean_shared_terms,
                                   median_F1,
                                   accuracy,
                                   siml,
                                   include_sem_siml) {
  # semantic similarity only contributes when explicitly enabled
  if(!include_sem_siml){
    siml = NA
  }
  # score must be in the [0;1] interval; NA components are skipped
  components = c(exact_match_prop, mean_shared_terms, median_F1, siml)
  return(round(mean(components, na.rm=TRUE), 3))
}
#########################################
# combine function calls
#########################################
# Compute every evaluation metric for one tool's predictions and return them
# as a named list (one row of the final metrics table). Arguments mirror the
# individual metric functions above; include_sem_siml controls whether the
# CL semantic similarity enters the combined score.
obtain_metrics_list = function(tool,
                               reference_labs,
                               predicted_labs,
                               prop_unlab_reference,
                               lab_cl_mapping,
                               unlabelled,
                               trivial_terms,
                               ontology,
                               sim_metric,
                               include_sem_siml){
  # proportion of unlabelled cells in predicted labels
  prop_unlab_predicted = get_unlab_rate(predicted_labs, unlabelled)
  # delta between unlabelled-cell proportions
  unlab_delta = get_unlab_rate_delta(prop_unlab_reference, prop_unlab_predicted)
  # proportion of exact matches
  exact_match_prop = get_exact_matches(reference_labs, predicted_labs)
  # match based on shared terms
  mean_shared_terms = get_shared_terms_prop(reference_labs, predicted_labs, trivial_terms)
  # calculate F1 scores and accuracy
  metrics = get_f1(reference_labs, predicted_labs, unlabelled)
  median_F1 = metrics$MedF1
  accuracy = metrics$Acc
  # per-cell cell ontology similarity
  # (vapply preallocates the result; the previous loop grew sim_vec one
  # element at a time via indexed assignment into an empty vector)
  sim_vec = vapply(seq_along(reference_labs), function(idx){
    as.numeric(get_CL_similarity(reference_labs[idx], predicted_labs[idx],
                                 lab_cl_mapping=lab_cl_mapping,
                                 ontology=ontology,
                                 sim_metric=sim_metric,
                                 unlabelled=unlabelled))
  }, numeric(1))
  siml = round(mean(sim_vec, na.rm=TRUE), 3)
  score = get_tool_combined_score(unlab_delta, exact_match_prop, mean_shared_terms,
                                  median_F1, accuracy, siml, include_sem_siml)
  # combine metrics into a single named row
  row = list(Tool = tool,
             Unlab_ref = prop_unlab_reference,
             Unlab_pred = prop_unlab_predicted,
             Unlab_delta = unlab_delta,
             Exact_match_prop = exact_match_prop,
             Mean_partial_match = mean_shared_terms,
             Med_F1 = median_F1,
             Accuracy = accuracy,
             CL_similarity = siml,
             Combined_score = score)
  return(row)
}
################################################################################
# methods for per-cell statistics
################################################################################
# extract list of unique cell ids
# Every table in tables_list must carry the same ordered cell_id column;
# stops with an error otherwise. Returns that shared id vector.
get_unq_cell_ids = function(tables_list){
  # pull the cell_id column out of each table
  id_columns = lapply(tables_list, function(tbl) tbl[ , "cell_id"])
  # all tables must agree on the exact same id vector
  if(length(unique(id_columns)) != 1){
    stop("Inconsistent cell IDs provided")
  }
  return(unlist(unique(id_columns)))
}
# get agreement across predicted labels
# Agreement is 1 / (number of distinct non-NA labels), rounded to 2 dp:
# 1 means all tools agree, smaller values mean more disagreement.
# Returns NA when every prediction is missing.
get_agreement_rate = function(label_list){
  # drop missing predictions before measuring agreement
  known = label_list[!is.na(label_list)]
  n_distinct = length(unique(known))
  if(n_distinct == 0){
    return(NA)
  }
  return(round(1 / n_distinct, 2))
}
# find the most frequent labels across top candidates from each tool; if specified, get weighted scores
# Returns a named vector of 6 elements: label_1..label_3 plus either
# weighted-score_1..3 (when tool_scores given) or frequency_1..3.
# Missing slots are padded with NA when fewer than 3 unique labels exist.
get_top_labels = function(label_list, tool_scores=NULL){
  unq_labs = unique(label_list)
  # determine frequency of each unique label
  .get_freq = function(lab, n_labs){
    freq = as.numeric(table(label_list)[lab] / n_labs)
    return(round(freq, 3))
  }
  freqs = sapply(unq_labs, function(lab) .get_freq(lab, n_labs=length(label_list)))
  # if tool scores provided, find weighted scores
  if(!is.null(tool_scores)){
    # extract tool name for each label and find corresponding score
    # NB: label vector must contrain corresponding tool in its name, e.g. tool-X_1
    source_tools = sapply(names(label_list), function(name) unlist(strsplit(name, "_"))[1])
    label_scores = sapply(source_tools, function(tool) tool_scores[tool])
    # find mean value of scores corresponding to each label
    score_sums = sapply(unq_labs, function(lab) mean(label_scores[which(label_list==lab)], na.rm=TRUE))
    # account for label frequencies and sort
    # NOTE(review): index.return with na.last=TRUE relies on the radix sort
    # path used for numeric vectors — verify if inputs can be non-numeric
    sorted = sort(score_sums * freqs, decreasing=TRUE, index.return=TRUE, na.last=TRUE)
    score_names = paste("weighted-score", c(1:3), sep="_")
  } else{
    # otherwise, simply return top labels by frequency
    sorted = sort(freqs, decreasing=TRUE, index.return=TRUE, na.last=TRUE)
    score_names = paste("frequency", c(1:3), sep="_")
  }
  # sorted$x holds the scores, sorted$ix the permutation into unq_labs
  sorted_scores = sorted$x
  sorted_scores = sapply(sorted_scores, function(s) round(s,2))
  sorted_labs = unq_labs[sorted$ix]
  # select top 3 labels; add NAs if there are fewer than 3
  while(length(sorted_scores) < 3){
    sorted_labs = append(sorted_labs, NA)
    sorted_scores = append(sorted_scores, NA)
  }
  sorted_labs = sorted_labs[1:3]
  names(sorted_labs) = paste("label", c(1:3), sep="_")
  sorted_scores = sorted_scores[1:3]
  names(sorted_scores) = score_names
  return(append(sorted_labs, sorted_scores))
}
# create a mapping between predicted labels and datasets of origin
# it might be that one label comes from more than one dataset
# in this case, combine datasets names into vector.
# takes as input vectors of labels and corresponding datasets + dictionary to fill in
# returns updated dictionary
build_label_dataset_mapping = function(labels, datasets, lab_ds_map){
  for(pos in seq_along(labels)){
    current_lab = labels[pos]
    current_ds = datasets[pos]
    # skip missing or empty labels
    if(is.na(current_lab) || current_lab == ''){
      next
    }
    if(!current_lab %in% keys(lab_ds_map)){
      # first time this label is seen: start a new entry
      lab_ds_map[current_lab] = current_ds
    } else if(!current_ds %in% lab_ds_map[[current_lab]]){
      # known label observed in a new dataset: extend its dataset vector
      lab_ds_map[current_lab] = append(lab_ds_map[[current_lab]], current_ds)
    }
  }
  return(lab_ds_map)
}
# extract datasets of origin by building a string of accession codes
# take as input a vector of labels
# return a vector of corresponding dataset(s); several datasets for one
# label are collapsed into a single "; "-separated string, unknown labels
# (including NA padding) map to NA
extract_datasets = function(label_vec, lab_ds_map){
  # only the top-3 labels are reported
  label_vec = label_vec[1:3]
  out = c()
  for(i in seq_along(label_vec)){
    current = label_vec[i]
    # look the label up in the dictionary; unknown labels have no origin
    if(current %in% keys(lab_ds_map)){
      ds_vec = lab_ds_map[[current]]
    } else{
      ds_vec = NA
    }
    out[i] = if(length(ds_vec) > 1) paste(ds_vec, collapse="; ") else ds_vec
  }
  return(out)
}
# extract metadata from file into a list
# Reads leading comment lines of the form "# <field> <value...>" and returns
# a named list mapping tolower(field) -> character vector of values.
# Reading stops at the first non-comment line or at end of file.
# Fixes: the connection is now always closed (it previously leaked), and a
# file consisting only of comment lines no longer crashes (readLines returns
# character(0) at EOF, and if(!startsWith(character(0), "#")) errored with
# "argument is of length zero").
extract_metadata = function(file){
  con = file(file, "r")
  # guarantee the connection is released on every exit path
  on.exit(close(con), add = TRUE)
  vals = list()
  idx = 1
  repeat{
    line = readLines(con, 1)
    # stop at EOF or at the first non-comment line
    if(length(line) == 0 || !startsWith(line, "#")){
      return(vals)
    }
    data = unlist(strsplit(line, " "))
    # 1st item is '#', 2nd is field name
    n = data[2]
    # extract values
    vals[[idx]] = data[-c(1,2)]
    names(vals)[idx] = tolower(n)
    idx = idx + 1
  }
}
# remove non-alphanumeric characters from metadata/marker gene files
# given a vector of labels, return updated labels with every character not
# matching reg_exp's allowed set replaced by "_"; result keeps the original
# labels as names (as sapply's USE.NAMES did).
# Fix: vapply guarantees a character vector even for empty input, where
# sapply returned list() instead of character(0).
filter_labels = function(labels, reg_exp="[^-A-Za-z0-9>+ ]"){
  labels = vapply(labels, function(l) gsub(reg_exp, "_", l), character(1))
  return(labels)
}