-
Notifications
You must be signed in to change notification settings - Fork 1
/
MatchTrie.cpp
127 lines (109 loc) · 3.88 KB
/
MatchTrie.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
/*******************************************************************************
* This file is part of SAP.
*
* SAP is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* SAP is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with SAP. If not, see <http://www.gnu.org/licenses/>.
******************************************************************************/
#include "MatchTrie.h"
#include "MatchStructures.h"
static inline int dnaToInt(char c){
if (c >= 'A' && c <= 'Z') c = c - 'A' + 'a';
switch (c){
case 'a': return 0;
case 't': return 1;
case 'g': return 2;
case 'c': return 3;
default: return -1;
}
}
/*
* MatchTrie
*/
MatchTrie::MatchTrie(){
_root = new TrieNode;
}
MatchTrie::~MatchTrie(){
eraseNode(_root);
}
std::vector <MatchTrie::Result> MatchTrie::find(const char *s, int len, int maxErrorCount){
std::vector <Result> ret;
find_p(_root, 0, s, len, maxErrorCount, ret);
return ret;
}
std::vector <MatchTrie::Result> MatchTrie::segmentFind(const char *s, int len, int segmentSize, int maxErrorCount){
std::map <std::pair <Dna *, char *>, int> m;
int count = 1;
for (int i = 0; i < len; i += segmentSize, count ++){
std::vector <Result> r;
int startLoc = i;
if (i + segmentSize > len){
if (len < segmentSize) r = find(s + i, len, maxErrorCount);
else r = find(s + len - segmentSize, segmentSize, maxErrorCount), startLoc = len - segmentSize;
} else r = find(s + i, segmentSize, maxErrorCount);
for (int j = 0; j < r.size(); j ++)
m[std::make_pair(r[j].seq, r[j].start - startLoc)] ++;
for (std::map <std::pair <Dna *, char *>, int>::iterator it = m.begin(); it != m.end(); it ++)
if (it -> second != count) m.erase(it);
}
std::vector <Result> ret;
for (std::map <std::pair <Dna *, char *>, int>::iterator it = m.begin(); it != m.end(); it ++)
ret.push_back(Result(it -> first.first, it -> first.second));
return ret;
}
void MatchTrie::insert(Dna *seq, char* start, char* end){
TrieNode *p = _root;
for (char *c = start; c < end; c ++){
int id = dnaToInt(*c);
if (!(p -> next[id]))
p -> next[id] = new TrieNode;
p = p -> next[id];
if (c - start >= MIN_INSERTING_LENGTH) p -> data.insert(Result(seq, start));
}
}
void MatchTrie::eraseNode(MatchTrie::TrieNode *&node){
if (!node) return;
for (int i = 0; i < 4; i ++) eraseNode(node -> next[i]);
delete node;
node = 0;
}
void MatchTrie::remove(Dna* seq, char *start, char *end){
TrieNode *p = _root;
for (char *c = start; c < end; c ++){
int id = dnaToInt(*c);
if (!(p -> next[id])) return;
p = p -> next[id];
if (c - start >= MIN_INSERTING_LENGTH) p -> data.erase(Result(seq, start));
}
}
void MatchTrie::find_p(MatchTrie::TrieNode *n, int depth, const char *s, int len, int errorCount, std::vector <MatchTrie::Result> &result){
if (!n || errorCount < 0) return;
if (depth == len){
for (std::set <Result>::iterator it = n -> data.begin(); it != n -> data.end(); it ++)
result.push_back(*it);
} else {
for (int i = 0; i < 4; i ++)
find_p(n -> next[i], depth + 1, s, len, errorCount - (dnaToInt(s[depth]) != i), result);
}
}
MatchTrie::Result::Result(const MatchTrie::Result& res) : start(res.start), seq(res.seq){
}
MatchTrie::Result::Result(Dna* seq, char* start) : start(start), seq(seq){
}
bool MatchTrie::Result::operator <(const MatchTrie::Result &res) const{
if (seq < res.seq) return 1;
else if (seq > res.seq) return 0;
else return start < res.start;
}
MatchTrie::TrieNode::TrieNode(){
for (int i = 0; i < 4; i ++) next[i] = 0;
}