-
Notifications
You must be signed in to change notification settings - Fork 1
/
FastaDbParser.java
399 lines (374 loc) · 18.3 KB
/
FastaDbParser.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
package com.compomics.colims.core.io.fasta;
import com.compomics.colims.model.FastaDb;
import com.compomics.colims.model.Protein;
import com.compomics.colims.model.enums.SearchEngineType;
import com.compomics.util.protein.Header;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.stereotype.Component;
import java.io.BufferedReader;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* This class parses FASTA files (protein accession and sequence).
* <p>
* Created by Niels Hulstaert on 7/10/16.
*/
@Component("fastaDbParser")
public class FastaDbParser {
/**
* Logger instance.
*/
private static final Logger LOGGER = LoggerFactory.getLogger(FastaDbParser.class);
private static final String BLOCK_SEPARATOR = ">";
private static final String SPLITTER = " ";
private static final String PARSE_RULE_SPLITTER = ";";
/**
 * Parse the given FASTA files into a map of protein accession -> {@link Protein} pairs. This method takes a {@link
 * LinkedHashMap} of {@link FastaDb} instances as keys as an argument to consistently handle possible duplicate
 * accessions between different FASTA DB files. In case of a duplicate accession, the protein of the first entry is
 * kept in the map.
 *
 * @param fastaDbs the FASTA files to parse and their associated (absolute) path
 * @return the proteins map (key: protein accession; value: the {@link Protein} holding the parsed sequence)
 * @throws IOException           thrown in case of an input/output related problem; the underlying exception is
 *                               attached as cause
 * @throws IllegalStateException if the proteins map is still empty after a FASTA DB file has been processed
 */
public Map<String, Protein> parse(LinkedHashMap<FastaDb, Path> fastaDbs) throws IOException {
    Map<String, Protein> proteinSequences = new HashMap<>();
    try {
        for (Map.Entry<FastaDb, Path> entry : fastaDbs.entrySet()) {
            FastaDb fastaDb = entry.getKey();
            Path fastaPath = entry.getValue();
            String headerParseRule = fastaDb.getHeaderParseRule();
            //check if the FASTA has an associated header parse rule and parse accordingly,
            //otherwise the first space delimited token of the header is used as accession
            if (headerParseRule == null || headerParseRule.equals("") || headerParseRule.equals("none")) {
                parseWithoutRule(proteinSequences, fastaPath);
            } else {
                parseWithRule(proteinSequences, fastaDb, fastaPath);
            }
            //the map accumulates the proteins of all files processed so far
            if (proteinSequences.isEmpty()) {
                throw new IllegalStateException("No accessions could be parsed from the FASTA DB file(s). Are you using the correct parse rule?");
            }
        }
    } catch (IOException e) {
        LOGGER.error(e.getMessage(), e);
        //keep the original exception as cause so the root problem is not lost for the caller
        throw new IOException("Error parsing FASTA file, please check that it contains valid data", e);
    }
    return proteinSequences;
}
/**
 * Parse the protein accessions from the given FASTA files into a map (key: the {@link FastaDb} instance; value: the
 * set of protein accessions). The argument is a {@link LinkedHashMap} to be able to return the parsed accessions in
 * the same order as they were passed.
 * <p>
 * For {@code MAXQUANT}, the header parse rule of the FASTA DB is used if present. For {@code PEPTIDESHAKER}, the
 * headers are parsed with the Compomics Utilities library. Any other search engine type yields an empty accession
 * set and therefore ends in an {@link IllegalStateException}.
 *
 * @param fastaDbs         the FASTA files to parse and their associated (absolute) path
 * @param searchEngineType the search engine type
 * @return the map of parsed accessions (key: the {@link FastaDb} instance; value: the set of protein accessions).
 * @throws IOException           thrown in case of an input/output related problem; the underlying exception is
 *                               attached as cause
 * @throws IllegalStateException if the set of parsed accessions of one of the FASTA DB files is empty
 */
public LinkedHashMap<FastaDb, Set<String>> parseAccessions(LinkedHashMap<FastaDb, Path> fastaDbs, SearchEngineType searchEngineType) throws IOException {
    LinkedHashMap<FastaDb, Set<String>> parsedFastas = new LinkedHashMap<>();
    try {
        for (Map.Entry<FastaDb, Path> entry : fastaDbs.entrySet()) {
            FastaDb fastaDb = entry.getKey();
            Path fastaPath = entry.getValue();
            Set<String> accessions;
            switch (searchEngineType) {
                case MAXQUANT:
                    //check if the FASTA has an associated header parse rule and parse accordingly
                    String headerParseRule = fastaDb.getHeaderParseRule();
                    if (headerParseRule == null || headerParseRule.equals("") || headerParseRule.equals("none")) {
                        accessions = parseAccessionsWithoutRule(fastaPath);
                    } else {
                        accessions = parseAccessionsWithRule(fastaDb, fastaPath);
                    }
                    break;
                case PEPTIDESHAKER:
                    accessions = parseAccessionsWithUtilities(fastaPath);
                    break;
                default:
                    //unsupported search engine: nothing is parsed, which triggers the empty check below
                    accessions = new HashSet<>();
                    break;
            }
            if (accessions.isEmpty()) {
                throw new IllegalStateException("No accessions could be parsed from FASTA DB file " + fastaPath.toString() + ". Are you using the correct parse rule?");
            }
            parsedFastas.put(fastaDb, accessions);
        }
    } catch (IOException e) {
        LOGGER.error(e.getMessage(), e);
        //keep the original exception as cause so the root problem is not lost for the caller
        throw new IOException("Error parsing FASTA file, please check that it contains valid data", e);
    }
    return parsedFastas;
}
/**
 * Test the header parse rule for the given FASTA DB file.
 *
 * @param fastaPath       the FASTA DB file path
 * @param parseRule       the header parse rule; {@code null}, an empty string or "none" means that no rule is
 *                        present
 * @param numberOfHeaders the number of headers that will be returned
 * @return the map of parsed headers (key: the parsed accession; value: the original header)
 * @throws IOException           thrown in case of an input/output related problem; the underlying exception is
 *                               attached as cause
 * @throws IllegalStateException if no header could be parsed from the FASTA DB file
 */
public LinkedHashMap<String, String> testParseRule(Path fastaPath, String parseRule, int numberOfHeaders) throws IOException {
    LinkedHashMap<String, String> headers;
    try {
        //check if the FASTA has an associated header parse rule and parse accordingly
        //otherwise, use the Compomics Utilities library
        if (parseRule == null || parseRule.equals("") || parseRule.equals("none")) {
            headers = testParseWithoutRule(fastaPath, numberOfHeaders);
        } else {
            headers = testParseWithRule(fastaPath, parseRule, numberOfHeaders);
        }
        if (headers.isEmpty()) {
            throw new IllegalStateException("No accessions could be parsed from FASTA DB file " + fastaPath + ".");
        }
    } catch (IOException e) {
        LOGGER.error(e.getMessage(), e);
        //keep the original exception as cause so the root problem is not lost for the caller
        throw new IOException("Error parsing FASTA file, please check that it contains valid data", e);
    }
    return headers;
}
/**
 * Pattern matching a pipe character that is not preceded by a backslash; used to split a FASTA header into its
 * pipe delimited fields.
 */
private static final Pattern UNESCAPED_PIPE = Pattern.compile("(?<!\\\\)" + Pattern.quote("|"));
/**
 * Marker in the header that ends the protein description (presumably the UniProt organism tag).
 */
private static final String DESCRIPTION_END_MARKER = " OS=";

/**
 * Parse the given FASTA file in case a header parse rule is present.
 * <p>
 * If the rule contains the parse rule splitter, its second splitter delimited element is compiled as regular
 * expression, otherwise the whole rule is. Entries without sequence data are skipped. In case of a duplicate
 * accession, the protein that was put in the map first is kept.
 *
 * @param proteinSequences the protein sequences map
 * @param fastaDb          the {@link FastaDb} instance
 * @param fastaPath        the FASTA path
 * @throws IOException in case of file reading related problem or if sequence data precedes the first header
 */
private void parseWithRule(Map<String, Protein> proteinSequences, FastaDb fastaDb, Path fastaPath) throws IOException {
    try (BufferedReader bufferedReader = Files.newBufferedReader(fastaPath)) {
        //compile the pattern
        String headerParseRule = fastaDb.getHeaderParseRule();
        Pattern pattern;
        if (headerParseRule.contains(PARSE_RULE_SPLITTER)) {
            pattern = Pattern.compile(headerParseRule.split(PARSE_RULE_SPLITTER)[1]);
        } else {
            pattern = Pattern.compile(headerParseRule);
        }
        //start reading the file
        final StringBuilder sequenceBuilder = new StringBuilder();
        String fastaHeader = "";
        String line;
        while ((line = bufferedReader.readLine()) != null) {
            if (line.startsWith(BLOCK_SEPARATOR)) {
                //a new header closes the previous entry, store it if it has a sequence
                if (sequenceBuilder.length() > 0) {
                    storeRuleParsedProtein(proteinSequences, pattern, fastaHeader, sequenceBuilder.toString(), fastaPath);
                    sequenceBuilder.setLength(0);
                }
                fastaHeader = line;
            } else {
                sequenceBuilder.append(line);
            }
        }
        //the last entry is not closed by a following header, so store it here in exactly the same way
        if (sequenceBuilder.length() > 0) {
            storeRuleParsedProtein(proteinSequences, pattern, fastaHeader, sequenceBuilder.toString(), fastaPath);
            sequenceBuilder.setLength(0);
        }
    }
}

/**
 * Build a {@link Protein} from one FASTA entry and put it in the map if its accession is not present yet.
 * <p>
 * The accession is the first capturing group of the first match of the pattern in the header (without the leading
 * block separator); if the pattern doesn't match, the first space delimited header token is used. The description
 * is taken from the third pipe delimited header field: everything after its first space delimited token up to
 * the description end marker. Headers without that field or marker get no description.
 *
 * @param proteinSequences the protein sequences map
 * @param pattern          the compiled header parse rule
 * @param fastaHeader      the complete header line, including the leading block separator
 * @param sequence         the concatenated sequence lines of the entry
 * @param fastaPath        the FASTA path, only used for error reporting
 * @throws IOException if the sequence data is not preceded by a header
 */
private static void storeRuleParsedProtein(Map<String, Protein> proteinSequences, Pattern pattern, String fastaHeader, String sequence, Path fastaPath) throws IOException {
    if (fastaHeader.isEmpty()) {
        //without this check, substring(1) on the empty header fails with a StringIndexOutOfBoundsException
        throw new IOException("Sequence data found before the first FASTA header in " + fastaPath);
    }
    //strip the leading block separator
    String header = fastaHeader.substring(1);

    Protein protein = new Protein();
    protein.setSequence(sequence.trim());

    String[] headerFields = UNESCAPED_PIPE.split(header);
    if (headerFields.length > 2) {
        String[] words = headerFields[2].split(SPLITTER);
        if (words.length > 1) {
            String description = String.join(SPLITTER, Arrays.copyOfRange(words, 1, words.length));
            int markerIndex = description.indexOf(DESCRIPTION_END_MARKER);
            if (markerIndex >= 0) {
                protein.setDescription(description.substring(0, markerIndex));
            }
        }
    }

    //match against the whole header, consistent with the accession only parsing
    Matcher matcher = pattern.matcher(header);
    if (matcher.find()) {
        proteinSequences.putIfAbsent(matcher.group(1), protein);
    } else {
        proteinSequences.putIfAbsent(header.split(SPLITTER)[0], protein);
    }
}
/**
 * Parse the given FASTA file in case no header parse rule is present.
 * <p>
 * The first space delimited token of each header (without the leading block separator) is used as accession.
 * Entries without sequence data are skipped. In case of a duplicate accession, the protein that was put in the map
 * first is kept.
 *
 * @param proteinSequences the protein sequences map
 * @param fastaPath        the FASTA path
 * @throws IOException in case of file reading related problem or if sequence data precedes the first header
 */
private void parseWithoutRule(Map<String, Protein> proteinSequences, Path fastaPath) throws IOException {
    try (BufferedReader bufferedReader = Files.newBufferedReader(fastaPath)) {
        //start reading the file
        final StringBuilder sequenceBuilder = new StringBuilder();
        String fastaHeader = "";
        String line;
        while ((line = bufferedReader.readLine()) != null) {
            if (line.startsWith(BLOCK_SEPARATOR)) {
                //a new header closes the previous entry, store it if it has a sequence
                if (sequenceBuilder.length() > 0) {
                    storeFirstTokenProtein(proteinSequences, fastaHeader, sequenceBuilder.toString(), fastaPath);
                    sequenceBuilder.setLength(0);
                }
                fastaHeader = line;
            } else {
                sequenceBuilder.append(line);
            }
        }
        //the last entry is not closed by a following header, so store it here
        if (sequenceBuilder.length() > 0) {
            storeFirstTokenProtein(proteinSequences, fastaHeader, sequenceBuilder.toString(), fastaPath);
            sequenceBuilder.setLength(0);
        }
    }
}

/**
 * Build a {@link Protein} from one FASTA entry and put it in the map, keyed by the first space delimited header
 * token, if that accession is not present yet.
 *
 * @param proteinSequences the protein sequences map
 * @param fastaHeader      the complete header line, including the leading block separator
 * @param sequence         the concatenated sequence lines of the entry
 * @param fastaPath        the FASTA path, only used for error reporting
 * @throws IOException if the sequence data is not preceded by a header
 */
private static void storeFirstTokenProtein(Map<String, Protein> proteinSequences, String fastaHeader, String sequence, Path fastaPath) throws IOException {
    if (fastaHeader.isEmpty()) {
        //without this check, substring(1) on the empty header fails with a StringIndexOutOfBoundsException
        throw new IOException("Sequence data found before the first FASTA header in " + fastaPath);
    }
    Protein protein = new Protein();
    protein.setSequence(sequence.trim());
    proteinSequences.putIfAbsent(fastaHeader.substring(1).split(SPLITTER)[0], protein);
}
/**
 * Collect the protein accessions of the given FASTA file by applying the header parse rule of the FASTA DB.
 * <p>
 * If the rule contains the parse rule splitter, its second splitter delimited element is compiled as regular
 * expression, otherwise the whole rule is. For each header line (without the leading block separator), the
 * accession is the first capturing group of the first match; headers that don't match fall back to their first
 * space delimited token.
 *
 * @param fastaDb   the {@link FastaDb} instance providing the header parse rule
 * @param fastaPath the FASTA path
 * @return the set of parsed protein accessions
 * @throws IOException in case of file reading related problem
 */
private Set<String> parseAccessionsWithRule(FastaDb fastaDb, Path fastaPath) throws IOException {
    final Set<String> parsedAccessions = new HashSet<>();
    try (BufferedReader reader = Files.newBufferedReader(fastaPath)) {
        //pick the regular expression part of the rule and compile it
        final String regex = fastaDb.getHeaderParseRule().contains(PARSE_RULE_SPLITTER)
                ? fastaDb.getHeaderParseRule().split(PARSE_RULE_SPLITTER)[1]
                : fastaDb.getHeaderParseRule();
        final Pattern rulePattern = Pattern.compile(regex);

        //walk over the file, only header lines are of interest
        for (String row = reader.readLine(); row != null; row = reader.readLine()) {
            if (!row.startsWith(BLOCK_SEPARATOR)) {
                continue;
            }
            final String headerContent = row.substring(1);
            final Matcher ruleMatcher = rulePattern.matcher(headerContent);
            final String accession = ruleMatcher.find()
                    ? ruleMatcher.group(1)
                    : headerContent.split(SPLITTER)[0];
            parsedAccessions.add(accession);
        }
    }
    return parsedAccessions;
}
/**
 * Collect the protein accessions of the given FASTA file when no header parse rule is available. The accession of
 * a header line is its first space delimited token, without the leading block separator.
 *
 * @param fastaPath the FASTA path
 * @return the set of parsed protein accessions
 * @throws IOException in case of file reading related problem
 */
private Set<String> parseAccessionsWithoutRule(Path fastaPath) throws IOException {
    final Set<String> parsedAccessions = new HashSet<>();
    try (BufferedReader reader = Files.newBufferedReader(fastaPath)) {
        //walk over the file, only header lines are of interest
        for (String row = reader.readLine(); row != null; row = reader.readLine()) {
            if (!row.startsWith(BLOCK_SEPARATOR)) {
                continue;
            }
            final String[] headerTokens = row.substring(1).split(SPLITTER);
            parsedAccessions.add(headerTokens[0]);
        }
    }
    return parsedAccessions;
}
/**
 * Collect the protein accessions of the given FASTA file by handing each header line to the Compomics Utilities
 * {@link Header} parser and taking its "accession or rest" value.
 *
 * @param fastaPath the FASTA path
 * @return the set of parsed protein accessions
 * @throws IOException in case of file reading related problem
 */
private Set<String> parseAccessionsWithUtilities(Path fastaPath) throws IOException {
    final Set<String> parsedAccessions = new HashSet<>();
    try (BufferedReader reader = Files.newBufferedReader(fastaPath)) {
        //walk over the file, only header lines are of interest
        for (String row = reader.readLine(); row != null; row = reader.readLine()) {
            if (row.startsWith(BLOCK_SEPARATOR)) {
                parsedAccessions.add(Header.parseFromFASTA(row).getAccessionOrRest());
            }
        }
    }
    return parsedAccessions;
}
/**
 * Apply the given header parse rule to the first headers of the FASTA file.
 * <p>
 * If the rule contains the parse rule splitter, its second splitter delimited element is compiled as regular
 * expression, otherwise the whole rule is. Reading stops as soon as the requested number of distinct accessions is
 * collected or the end of the file is reached.
 *
 * @param fastaPath       the FASTA DB path
 * @param parseRule       the header parse rule
 * @param numberOfHeaders the number of headers that will be parsed
 * @return the parsed headers in file order (key: the parsed accession; value: the original header line)
 * @throws IOException in case of file reading related problem
 */
private LinkedHashMap<String, String> testParseWithRule(Path fastaPath, String parseRule, int numberOfHeaders) throws IOException {
    final LinkedHashMap<String, String> parsedHeaders = new LinkedHashMap<>();
    try (BufferedReader reader = Files.newBufferedReader(fastaPath)) {
        //pick the regular expression part of the rule and compile it
        final String regex = parseRule.contains(PARSE_RULE_SPLITTER)
                ? parseRule.split(PARSE_RULE_SPLITTER)[1]
                : parseRule;
        final Pattern rulePattern = Pattern.compile(regex);

        //walk over the file until enough headers are collected
        for (String row = reader.readLine(); row != null && parsedHeaders.size() < numberOfHeaders; row = reader.readLine()) {
            if (!row.startsWith(BLOCK_SEPARATOR)) {
                continue;
            }
            final String headerContent = row.substring(1);
            final Matcher ruleMatcher = rulePattern.matcher(headerContent);
            final String accession = ruleMatcher.find()
                    ? ruleMatcher.group(1)
                    : headerContent.split(SPLITTER)[0];
            parsedHeaders.put(accession, row);
        }
    }
    return parsedHeaders;
}
/**
 * Parse the first headers of the FASTA file with the Compomics Utilities {@link Header} parser, for the case where
 * no header parse rule is present. Reading stops as soon as the requested number of distinct accessions is
 * collected or the end of the file is reached.
 *
 * @param fastaPath       the FASTA DB path
 * @param numberOfHeaders the number of headers that will be parsed
 * @return the parsed headers in file order (key: the parsed accession; value: the original header line)
 * @throws IOException in case of file reading related problem
 */
private LinkedHashMap<String, String> testParseWithoutRule(Path fastaPath, int numberOfHeaders) throws IOException {
    final LinkedHashMap<String, String> parsedHeaders = new LinkedHashMap<>();
    try (BufferedReader reader = Files.newBufferedReader(fastaPath)) {
        //walk over the file until enough headers are collected
        for (String row = reader.readLine(); row != null && parsedHeaders.size() < numberOfHeaders; row = reader.readLine()) {
            if (!row.startsWith(BLOCK_SEPARATOR)) {
                continue;
            }
            //@TODO return the unparsed header or let compomics utilities try to parse it?
            final Header parsedHeader = Header.parseFromFASTA(row);
            parsedHeaders.put(parsedHeader.getAccessionOrRest(), row);
        }
    }
    return parsedHeaders;
}
}