-
Notifications
You must be signed in to change notification settings - Fork 15
/
allele_qc.R
280 lines (244 loc) · 13.2 KB
/
allele_qc.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
#' Match alleles between target_variants and ref_variants
#'
#' Match by ("chrom", "A1", "A2" and "pos"), accounting for possible
#' strand flips and major/minor allele flips (opposite effects and zscores).
#'
#' @param target_variants A data frame with columns "chrom", "pos", "A1", "A2" or strings in the format of "chr:pos:A2:A1"/"chr:pos_A2_A1".
#' @param ref_variants A data frame with columns "chrom", "pos", "A1", "A2" or strings in the format of "chr:pos:A2:A1"/"chr:pos_A2_A1".
#' @param target_data A data frame on which QC procedures will be applied.
#' @param col_to_flip The name of the column in target_data where flips are to be applied.
#' @param match_min_prop Minimum proportion of variants in the smallest data
#' to be matched, otherwise stops with an error. Default is 20%.
#' @param remove_dups Whether to remove duplicates, default is TRUE.
#' @param remove_indels Whether to remove INDELs, default is FALSE.
#' @param flip Whether the alleles must be flipped: A <--> T & C <--> G, in which case
#' corresponding `col_to_flip` are multiplied by -1. Default is `TRUE`.
#' @param remove_strand_ambiguous Whether to remove strand SNPs (if any). Default is `TRUE`.
#' @param flip_strand Whether to output the variants after strand flip. Default is `FALSE`.
#' @param remove_unmatched Whether to remove unmatched variants. Default is `TRUE`.
#' @return A single data frame with matched variants.
#' @importFrom magrittr %>%
#' @importFrom dplyr mutate inner_join filter pull select everything row_number if_else
#' @importFrom vctrs vec_duplicate_detect
#' @importFrom dplyr if_else
#' @importFrom tidyr separate
#' @export
allele_qc <- function(target_variants, ref_variants, target_data, col_to_flip = NULL,
match_min_prop = 0.2, remove_dups = TRUE,
remove_indels = FALSE, remove_strand_ambiguous = TRUE,
flip_strand = FALSE, remove_unmatched = TRUE, remove_same_vars = FALSE, target_gwas = TRUE) {
strand_flip <- function(ref) {
# Define a mapping for complementary bases
base_mapping <- c("A" = "T", "T" = "A", "G" = "C", "C" = "G")
# Function to complement a single base
complement_base <- function(base) {
complement <- base_mapping[base]
return(complement)
}
# Function to reverse and complement a DNA sequence
reverse_complement <- function(sequence) {
reversed <- rev(strsplit(sequence, NULL)[[1]])
complemented <- sapply(reversed, complement_base, USE.NAMES = FALSE)
return(paste(complemented, collapse = ""))
}
complemented_sequence <- sapply(ref, reverse_complement, USE.NAMES = FALSE)
return(complemented_sequence)
}
# check if the pattern is ATCG/DI
check_ATCG <- function(vec) {
pattern <- "^[ATCGDI]+$"
# Function to check if a single element matches the pattern
check_element <- function(element) {
grepl(pattern, element)
}
result <- sapply(vec, check_element, USE.NAMES = FALSE)
return(result)
}
# transform all inputs to dataframe
target_variants <- variant_id_to_df(target_variants)
ref_variants <- variant_id_to_df(ref_variants)
if (isTRUE(target_gwas)) target_data <- variant_id_to_df(target_data)
target_data <- target_data %>% mutate(target_variants)
columns_to_remove <- c("chromosome", "position", "ref", "alt", "variant_id")
# Check if any of the specified columns are present
if (any(columns_to_remove %in% colnames(target_data))) {
target_data <- select(target_data, -any_of(columns_to_remove))
}
match_result <- merge(target_data, ref_variants, by = c("chrom", "pos"), all = FALSE, suffixes = c(".target", ".ref")) %>%
# match target & ref by chrom and position
as.data.frame() %>%
mutate(variants_id_original = paste(chrom, pos, A2.target, A1.target, sep = ":")) %>%
mutate(variants_id_qced = paste(chrom, pos, A2.ref, A1.ref, sep = ":")) %>%
# filter out totally same rows.
filter(duplicated(.) | !duplicated(.)) %>%
# upper case target/reference A1 A2
mutate(across(c(A1.target, A2.target, A1.ref, A2.ref), toupper)) %>%
mutate(flip1.ref = strand_flip(A1.ref), flip2.ref = strand_flip(A2.ref)) %>%
# these pairings are ambiguous: because we cannot tell it's an sign flip / strand flip
mutate(strand_unambiguous = if_else((A1.target == "A" & A2.target == "T") | (A1.target == "T" & A2.target == "A") |
(A1.target == "C" & A2.target == "G") | (A1.target == "G" & A2.target == "C"), FALSE, TRUE)) %>%
# filter out non-ATCG coded alleles
mutate(non_ATCG = !(check_ATCG(A1.target) & check_ATCG(A2.target))) %>%
# exact match should be kept all the time
mutate(exact_match = A1.target == A1.ref & A2.target == A2.ref) %>%
mutate(sign_flip = ((A1.target == A2.ref & A2.target == A1.ref) | (A1.target == flip2.ref & A2.target == flip1.ref)) & (A1.target != A1.ref & A2.target != A2.ref)) %>%
mutate(strand_flip = ((A1.target == flip1.ref & A2.target == flip2.ref) | (A1.target == flip2.ref & A2.target == flip1.ref)) & (A1.target != A1.ref & A2.target != A2.ref)) %>%
mutate(INDEL = (A2.target == "I" | A2.target == "D" | nchar(A2.target) > 1 | nchar(A1.target) > 1)) %>%
mutate(ID_match = ((A2.target == "D" | A2.target == "I") & (nchar(A1.ref) > 1 | nchar(A2.ref) > 1)))
# if not remove, then this should'nt be a condition to filter out any variants
if (!remove_strand_ambiguous) {
match_result <- match_result %>% mutate(strand_unambiguous = TRUE)
}
# if all strand flip is un-ambigous, then we know ambigous cases are indeed a strand flip
# not a sign flip, then we infer there is no ambigous in the whole dataset, and keep those ambigous ones
if (nrow(match_result %>% filter(strand_flip == TRUE) %>% filter(strand_unambiguous == TRUE)) == 0) {
match_result <- match_result %>% mutate(strand_unambiguous = TRUE)
}
# To keep variants: if it's a strand flip, we will keep those unambigous (because if ambigous, cannot know it's trand / sign flip, so discard all)
# or exact match or indel match (ID_match)
# If not a strand flip, then we will keep those that are exact match / those are sign flip / INDEL matched
match_result <- match_result %>% mutate(keep = if_else(strand_flip, true = (strand_unambiguous | exact_match | ID_match), false =
(exact_match | sign_flip | ID_match)
))
if (remove_indels) {
match_result <- match_result %>% mutate(keep = if_else(INDEL == FALSE, FALSE, TRUE))
}
# flip the signs of the column col_to_flip if there is a sign flip
if (!is.null(col_to_flip)) {
if (!is.null(match_result[, col_to_flip])) {
match_result[match_result$sign_flip, col_to_flip] <- -1 * match_result[match_result$sign_flip, col_to_flip]
} else {
stop("Column '", col_to_flip, "' not found in target_data.")
}
}
# flip the strands if there is a strand flip
if (flip_strand) {
strand_flipped_indices <- which(match_result$strand_flip)
match_result[strand_flipped_indices, "A1.target"] <- strand_flip(match_result[strand_flipped_indices, "A1.target"])
match_result[strand_flipped_indices, "A2.target"] <- strand_flip(match_result[strand_flipped_indices, "A2.target"])
}
# FIXME: I think this parameter is confusing. I inheritated directly from our function, whose default setting is TRUE.
# It is removing all multi-allelic alleles which is unnecessary. I suggest remove this parameter directly.
# What we are trying to avoid is the SAME allele having diferent z score. I defined one parameter remove_same_vars later, but I can re-use this
# remove_dup name
if (remove_dups) {
dups <- vec_duplicate_detect(match_result[, c("chrom", "pos", "variants_id_qced")])
if (any(dups)) {
match_result <- match_result[!dups, , drop = FALSE]
message("Some duplicates were removed.")
}
}
# Remove all unnecessary columns used to determine qc status
# Finally keep those variants with FLAG keep = TRUE
result <- match_result[match_result$keep, , drop = FALSE]
result <- result %>%
select(-(flip1.ref:keep)) %>%
select(-variants_id_original, -A1.target, -A2.target) %>%
rename(A1 = A1.ref, A2 = A2.ref, variant_id = variants_id_qced)
# default FALSE, but if want to remove same variants having different z score, then set as TRUE
if (remove_same_vars) {
same_vars <- vec_duplicate_detect(result[, c("chrom", "pos", "variant_id")])
if (any(same_vars)) {
result <- result[!same_vars, , drop = FALSE]
message("Same variants with different z scores are removed.")
}
}
if (!remove_unmatched) {
match_variant <- match_result %>% pull(variants_id_original)
match_result <- select(match_result, -(flip1.ref:keep)) %>%
select(-variants_id_original, -A1.target, -A2.target) %>%
rename(A1 = A1.ref, A2 = A2.ref, variant_id = variants_id_qced)
target_data <- target_data %>% mutate(variant_id = paste(chrom, pos, A2, A1, sep = ":"))
if (length(setdiff(target_data %>% pull(variant_id), match_variant)) > 0) {
unmatch_data <- target_data %>% filter(!variant_id %in% match_variant)
result <- rbind(result, unmatch_data)
}
}
min_match <- match_min_prop * nrow(ref_variants)
if (nrow(result) < min_match) {
stop("Not enough variants have been matched.")
}
# throw an error if there are any variant_id that are duplicated (meaning that same variant having different other infos for example z score)
if (!remove_same_vars & any(duplicated(result$variant_id))) {
stop("In the input, there are duplicated variants with different z scores. Please check the data and determine which to keep.")
}
return(list(target_data_qced = result, qc_summary = match_result))
}
#' Align Variant Names
#'
#' This function aligns variant names from two strings containing variant names in the format of
#' "chr:pos:A1:A2" or "chr:pos_A1_A2". The first string should be the "source" and the second
#' should be the "reference".
#'
#' @param source A character vector of variant names in the format "chr:pos:A2:A1" or "chr:pos_A2_A1".
#' @param reference A character vector of variant names in the format "chr:pos:A2:A1" or "chr:pos_A2_A1".
#'
#' @return A list with two elements:
#' - aligned_variants: A character vector of aligned variant names.
#' - unmatched_indices: A vector of indices for the variants in the source that could not be matched.
#'
#' @examples
#' source <- c("1:123:A:C", "2:456:G:T", "3:789:C:A")
#' reference <- c("1:123:A:C", "2:456:T:G", "4:101:G:C")
#' align_variant_names(source, reference)
#'
#' @export
align_variant_names <- function(source, reference, remove_indels = FALSE) {
# Check if source and reference follow the expected pattern
source_pattern <- grepl("^(chr)?[0-9]+:[0-9]+:[ATCG*]+:[ATCG*]+$|^(chr)?[0-9]+:[0-9]+_[ATCG*]+_[ATCG*]+$", source)
reference_pattern <- grepl("^(chr)?[0-9]+:[0-9]+:[ATCG*]+:[ATCG*]+$|^(chr)?[0-9]+:[0-9]+_[ATCG*]+_[ATCG*]+$", reference)
if (!all(source_pattern) && !all(reference_pattern)) {
# Both source and reference do not follow the expected pattern
warning("Cannot unify variant names because they do not follow the expected variant naming convention chr:pos:A2:A1 or chr:pos_A2_A1.")
return(list(aligned_variants = source, unmatched_indices = integer(0)))
}
if ((!all(source_pattern) && all(reference_pattern)) || (all(source_pattern) && !all(reference_pattern))) {
# One of source or reference follows the pattern, while the other does not
stop("Source and reference have different variant naming conventions. They cannot be aligned.")
}
source_has_chr_prefix <- grepl("^chr", source[1])
reference_has_chr_prefix <- grepl("^chr", reference[1])
source_df <- variant_id_to_df(source)
reference_df <- variant_id_to_df(reference)
qc_result <- allele_qc(
target_variants = source_df,
ref_variants = reference_df,
target_data = source_df,
col_to_flip = NULL,
match_min_prop = 0,
remove_dups = TRUE,
flip_strand = TRUE,
remove_indels = remove_indels,
remove_strand_ambiguous = FALSE,
remove_unmatched = FALSE
)
aligned_df <- qc_result$target_data_qced
# Determine the output format based on the reference convention
if (!grepl("_", reference[1])) {
# Reference follows "chr:pos:A2:A1" convention
output_format <- "colon"
} else {
# Reference follows "chr:pos_A2_A1" convention
output_format <- "colon_underscore"
}
# Format the aligned variants based on the reference convention
if (output_format == "colon") {
aligned_variants <- apply(aligned_df[, c("chrom", "pos", "A2", "A1")], 1, paste, collapse = ":")
} else {
aligned_variants <- apply(aligned_df[, c("chrom", "pos", "A2", "A1")], 1, paste, collapse = ":")
aligned_variants <- gsub(":(\\d+):(\\w):(\\w)", ":\\1_\\2_\\3", aligned_variants)
}
names(aligned_variants) <- NULL
# Adjust the chr prefix in aligned_variants based on the reference convention
aligned_variants_has_chr_prefix <- grepl("^chr", aligned_variants[1])
if (reference_has_chr_prefix && !aligned_variants_has_chr_prefix) {
aligned_variants <- paste0("chr", aligned_variants)
} else if (!reference_has_chr_prefix && aligned_variants_has_chr_prefix) {
aligned_variants <- sub("^chr", "", aligned_variants)
}
unmatched_indices <- which(match(aligned_variants, reference, nomatch = 0) == 0)
list(
aligned_variants = aligned_variants,
unmatched_indices = unmatched_indices
)
}