-
Notifications
You must be signed in to change notification settings - Fork 8
/
blast_best_reciprocal_hit.R
334 lines (307 loc) · 12.8 KB
/
blast_best_reciprocal_hit.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
#' @title Retrieve only the best reciprocal BLAST hit for each query
#' @description This function performs a BLAST search between query and subject sequences reciprocally and
#' returns (for each direction) only the best hit based on the following criteria.
#'
#' A best blast hit is defined as:
#' \itemize{
#' \item the hit with the smallest e-value
#' \item if e-values are identical then the hit with the longest alignment length is chosen
#' }
#'
#' A best reciprocal BLAST hit is defined as a hit that fulfills the following symmetry criteria:
#'
#' \code{blast_best_hit(A,B) = blast_best_hit(B,A)}
#'
#' @param query path to input file in fasta format.
#' @param subject path to subject file in fasta format or blast-able database.
#' @param search_type type of query and subject sequences that will be compared via BLAST search.
#' Options are:
#' \itemize{
#' \item \code{search_type = "nucleotide_to_nucleotide"}
#' \item \code{search_type = "nucleotide_to_protein"}
#' \item \code{search_type = "protein_to_nucleotide"}
#' \item \code{search_type = "protein_to_protein"}
#' }
#' @param strand Query DNA strand(s) to search against database/subject.
#' Options are:
#' \itemize{
#' \item \code{strand = "both"} (Default): query against both DNA strands.
#' \item \code{strand = "minus"} : query against minus DNA strand.
#' \item \code{strand = "plus"} : query against plus DNA strand.
#' }
#' @param output.path path to folder at which BLAST output table shall be stored.
#' Default is \code{output.path = NULL} (hence \code{getwd()} is used).
#' @param is.subject.db logical specifying whether or not the \code{subject} file is a file in fasta format (\code{is.subject.db = FALSE}; default)
#' or a blast-able database that was formatted with \code{makeblastdb} (\code{is.subject.db = TRUE}).
#' @param task BLAST search task option (depending on the selected \code{search_type}). Options are:
#' \itemize{
#' \item \code{search_type = "nucleotide_to_nucleotide"}
#' \itemize{
#' \item \code{task = "blastn"} : Standard nucleotide-nucleotide comparisons (default) - Traditional BLASTN requiring an exact match of 11.
#' \item \code{task = "blastn-short"} : Optimized nucleotide-nucleotide comparisons for query sequences shorter than 50 nucleotides.
#' \item \code{task = "dc-megablast"} : Discontiguous megablast used to find somewhat distant sequences.
#' \item \code{task = "megablast"} : Traditional megablast used to find very similar (e.g., intraspecies or closely related species) sequences.
#' \item \code{task = "rmblastn"}
#' }
#' \item \code{search_type = "nucleotide_to_protein"}
#' \itemize{
#' \item \code{task = c("blastx", "tblastn")} : Standard nucleotide-protein comparisons (default) from A -> B and standard protein-nucleotide comparisons (default) from B -> A.
#' \item \code{task = c("blastx-fast", "tblastn-fast")} : Optimized nucleotide-protein comparisons from A -> B and aptimized protein-nucleotide comparisons from B -> A.
#' }
#' \item \code{search_type = "protein_to_nucleotide"}
#' \itemize{
#' \item \code{task = c("tblastn", "blastx")} : Standard protein-nucleotide comparisons (default) from A -> B and Standard nucleotide-protein comparisons (default) from B -> A.
#' \item \code{task = c("tblastn-fast", "blastx-fast")} : Optimized protein-nucleotide comparisons from A -> B and optimized nucleotide-protein comparisons from B -> A.
#' }
#' \item \code{search_type = "protein_to_protein"}
#' \itemize{
#' \item \code{task = "blastp"} : Standard protein-protein comparisons (default).
#' \item \code{task = "blast-fast"} : Improved BLAST searches using longer words for protein seeding.
#' \item \code{task = "blastp-short"} : Optimized protein-protein comparisons for query sequences shorter than 30 residues.
#' }
#' }
#' @param db.import shall the BLAST output be stored in a PostgresSQL database and shall a connection be established to this database? Default is \code{db.import = FALSE}.
#' In case users wish to to only generate a BLAST output file without importing it to the current R session they can specify \code{db.import = NULL}.
#' @param postgres.user when \code{db.import = TRUE} and \code{out.format = "postgres"} is selected, the BLAST output is imported and stored in a
#' PostgresSQL database. In that case, users need to have PostgresSQL installed and initialized on their system.
#' Please consult the Installation Vignette for details.
#' @param evalue Expectation value (E) threshold for saving hits (default: \code{evalue = 0.001}).
#' @param out.format a character string specifying the format of the file in which the BLAST results shall be stored.
#' Available options are:
#' \itemize{
#' \item \code{out.format = "pair"} : Pairwise
#' \item \code{out.format = "qa.ident"} : Query-anchored showing identities
#' \item \code{out.format = "qa.nonident"} : Query-anchored no identities
#' \item \code{out.format = "fq.ident"} : Flat query-anchored showing identities
#' \item \code{out.format = "fq.nonident"} : Flat query-anchored no identities
#' \item \code{out.format = "xml"} : XML
#' \item \code{out.format = "tab"} : Tabular separated file
#' \item \code{out.format = "tab.comment"} : Tabular separated file with comment lines
#' \item \code{out.format = "ASN.1.text"} : Seqalign (Text ASN.1)
#' \item \code{out.format = "ASN.1.binary"} : Seqalign (Binary ASN.1)
#' \item \code{out.format = "csv"} : Comma-separated values
#' \item \code{out.format = "ASN.1"} : BLAST archive (ASN.1)
#' \item \code{out.format = "json.seq.aln"} : Seqalign (JSON)
#' \item \code{out.format = "json.blast.multi"} : Multiple-file BLAST JSON
#' \item \code{out.format = "xml2.blast.multi"} : Multiple-file BLAST XML2
#' \item \code{out.format = "json.blast.single"} : Single-file BLAST JSON
#' \item \code{out.format = "xml2.blast.single"} : Single-file BLAST XML2
#' \item \code{out.format = "SAM"} : Sequence Alignment/Map (SAM)
#' \item \code{out.format = "report"} : Organism Report
#' }
#' @param cores number of cores for parallel BLAST searches.
#' @param max.target.seqs maximum number of aligned sequences that shall be retained. Please be aware that \code{max.target.seqs} selects best hits based on the database entry and not by the best e-value. See details here: https://academic.oup.com/bioinformatics/advance-article/doi/10.1093/bioinformatics/bty833/5106166 .
#' @param db.soft.mask shall low complexity regions be soft masked? Default is \code{db.soft.mask = FALSE}.
#' @param db.hard.mask shall low complexity regions be hard masked? Default is \code{db.hard.mask = FALSE}.
#' @param blast.path path to BLAST executables.
#' @author Hajk-Georg Drost
#' @examples
#' \dontrun{
#' test_best_reciprocal_hit <- blast_best_reciprocal_hit(
#' query = system.file('seqs/qry_nn.fa', package = 'metablastr'),
#' subject = system.file('seqs/sbj_nn_best_hit.fa', package = 'metablastr'),
#' search_type = "nucleotide_to_nucleotide",
#' output.path = tempdir(),
#' db.import = FALSE)
#'
#' # look at results
#' test_best_reciprocal_hit
#' }
#'
#' @seealso \code{\link{blast_nucleotide_to_nucleotide}}, \code{\link{blast_protein_to_protein}},
#' \code{\link{blast_nucleotide_to_protein}}, \code{\link{blast_protein_to_nucleotide}}, \code{\link{blast_best_hit}}
#' @export
blast_best_reciprocal_hit <-
function(query,
subject,
search_type = "nucleotide_to_nucleotide",
strand = "both",
output.path = NULL,
is.subject.db = FALSE,
task = "blastn",
db.import = FALSE,
postgres.user = NULL,
evalue = 0.001,
out.format = "csv",
cores = 1,
max.target.seqs = 10000,
db.soft.mask = FALSE,
db.hard.mask = FALSE,
blast.path = NULL) {
if (search_type == "nucleotide_to_nucleotide") {
orthoA <-
blast_best_hit(
query = query,
subject = subject,
search_type = "nucleotide_to_nucleotide",
strand = strand,
output.path = output.path,
is.subject.db = is.subject.db,
task = task,
db.import = db.import,
postgres.user = postgres.user,
evalue = evalue,
out.format = out.format,
cores = cores,
max.target.seqs = max.target.seqs,
db.soft.mask = db.soft.mask,
db.hard.mask = db.hard.mask,
blast.path = blast.path
)
orthoB <-
blast_best_hit(
query = subject,
subject = query,
search_type = "nucleotide_to_nucleotide",
strand = strand,
output.path = output.path,
is.subject.db = is.subject.db,
task = task,
db.import = db.import,
postgres.user = postgres.user,
evalue = evalue,
out.format = out.format,
cores = cores,
max.target.seqs = max.target.seqs,
db.soft.mask = db.soft.mask,
db.hard.mask = db.hard.mask,
blast.path = blast.path
)
}
if (search_type == "nucleotide_to_protein") {
orthoA <-
blast_best_hit(
query = query,
subject = subject,
search_type = "nucleotide_to_protein",
strand = strand,
output.path = output.path,
is.subject.db = is.subject.db,
task = task[1],
db.import = db.import,
postgres.user = postgres.user,
evalue = evalue,
out.format = out.format,
cores = cores,
max.target.seqs = max.target.seqs,
db.soft.mask = db.soft.mask,
db.hard.mask = db.hard.mask,
blast.path = blast.path
)
orthoB <-
blast_best_hit(
query = subject,
subject = query,
search_type = "protein_to_nucleotide",
strand = strand,
output.path = output.path,
is.subject.db = is.subject.db,
task = task[2],
db.import = db.import,
postgres.user = postgres.user,
evalue = evalue,
out.format = out.format,
cores = cores,
max.target.seqs = max.target.seqs,
db.soft.mask = db.soft.mask,
db.hard.mask = db.hard.mask,
blast.path = blast.path
)
}
if (search_type == "protein_to_nucleotide") {
orthoA <-
blast_best_hit(
query = query,
subject = subject,
search_type = "protein_to_nucleotide",
strand = strand,
output.path = output.path,
is.subject.db = is.subject.db,
task = task[1],
db.import = db.import,
postgres.user = postgres.user,
evalue = evalue,
out.format = out.format,
cores = cores,
max.target.seqs = max.target.seqs,
db.soft.mask = db.soft.mask,
db.hard.mask = db.hard.mask,
blast.path = blast.path
)
orthoB <-
blast_best_hit(
query = subject,
subject = query,
search_type = "nucleotide_to_protein",
strand = strand,
output.path = output.path,
is.subject.db = is.subject.db,
task = task[2],
db.import = db.import,
postgres.user = postgres.user,
evalue = evalue,
out.format = out.format,
cores = cores,
max.target.seqs = max.target.seqs,
db.soft.mask = db.soft.mask,
db.hard.mask = db.hard.mask,
blast.path = blast.path
)
}
if (search_type == "protein_to_protein") {
orthoA <-
blast_best_hit(
query = query,
subject = subject,
search_type = "protein_to_protein",
strand = strand,
output.path = output.path,
is.subject.db = is.subject.db,
task = task,
db.import = db.import,
postgres.user = postgres.user,
evalue = evalue,
out.format = out.format,
cores = cores,
max.target.seqs = max.target.seqs,
db.soft.mask = db.soft.mask,
db.hard.mask = db.hard.mask,
blast.path = blast.path
)
orthoB <-
blast_best_hit(
query = subject,
subject = query,
search_type = "protein_to_protein",
strand = strand,
output.path = output.path,
is.subject.db = is.subject.db,
task = task,
db.import = db.import,
postgres.user = postgres.user,
evalue = evalue,
out.format = out.format,
cores = cores,
max.target.seqs = max.target.seqs,
db.soft.mask = db.soft.mask,
db.hard.mask = db.hard.mask,
blast.path = blast.path
)
}
colnames(orthoB)[1:2] <- c("subject_id", "query_id")
tryCatch({
return(dplyr::semi_join(
orthoA,
orthoB,
by = c("query_id", "subject_id")
))
}, error = function(e) {
stop(
"The BLAST tables resulting from ",
query,
" and ",
subject,
" could not be joined properly to select only the reciprocal best hits."
)
})
}