-
Notifications
You must be signed in to change notification settings - Fork 1
/
FastqReader.cpp
169 lines (151 loc) · 4.25 KB
/
FastqReader.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
#include "FastqReader.hpp"
#include <seqan/seq_io.h>
#include <vector>
#include <deque>
#include <utility>
// Opens the sequence file at `path` through SeqAn and sets the default batch
// size (10000) that readNSeq() falls back to when it is called with N == 0.
FastqReader::FastqReader(string path){
    const char* fileName = path.c_str();
    chunkSize = 10000;
    seqIn = new seqan::SeqFileIn(fileName);
}
// Placeholder: reading a single record is not implemented. Nothing is read
// from the input; the result is always a pair of one-space strings.
pair<string,string> FastqReader::readSeq(){
    const char* placeholder = " ";
    return pair<string,string>(placeholder, placeholder);
}
// Reads the next batch of records from the SeqAn input into `res`.
//
// res : output container; it is cleared first, then receives one
//       (sequence, "") pair per record read. The second pair member is
//       always the empty string; record ids are read but discarded.
// N   : number of records requested from seqan::readRecords; 0 means
//       "use chunkSize".
void FastqReader::readNSeq(deque<pair<string,string> >* res, uint64_t N){
    if(N == 0)
        N = chunkSize;

    seqan::StringSet<seqan::CharString> ids;
    seqan::StringSet<seqan::CharString> reads;
    seqan::readRecords(ids, reads, *seqIn, N);

    res->clear();
    // Bind by reference: iterating by value deep-copied every sequence just
    // to read its C string.
    for(auto&& read : reads){
        res->push_back(make_pair(string(toCString(read)), ""));
    }
}
// Reports whether the SeqAn input has been consumed (delegates to atEnd()).
bool FastqReader::isEOF(){
    auto& input = *seqIn;
    return atEnd(input);
}
// Opens `path` for raw binary reading and allocates the two buffers used by
// readNSeq():
//   - part           : the working chunk, 8 MiB (1 << 23) plus OVERHEAD_SIZE
//                      bytes of room for data carried over from the
//                      previous chunk;
//   - fp->part_buffer: OVERHEAD_SIZE bytes holding that carried-over tail.
// Terminates the process with a message if the file cannot be opened or the
// chunk buffer cannot be allocated, instead of crashing later on a NULL
// FILE* / buffer.
FastqReaderSqueker::FastqReaderSqueker(string path){
    const uint32_t OVERHEAD_SIZE = 65535;
    const uint64_t part_size = 1ULL << 23;

    chunkSize = 10000;

    seqIn = fopen(path.c_str(), "rb");
    if(seqIn == NULL)
    {
        cout << "Error: Cannot open input file " << path << "\n";
        exit(EXIT_FAILURE);
    }

    fp = new file_pointer;
    // readNSeq() reads these fields before ever writing them, so give them a
    // defined starting state here.
    // NOTE(review): file_pointer is declared elsewhere; if it already
    // default-initializes these members the assignments are redundant but
    // harmless.
    fp->part = NULL;
    fp->size = 0;
    fp->part_filled = 0;
    fp->part_buffer = new char[OVERHEAD_SIZE];

    part = (char *)malloc((part_size + OVERHEAD_SIZE)*sizeof(char));
    if(part == NULL)
    {
        cout << "Error: Cannot allocate input buffer\n";
        exit(EXIT_FAILURE);
    }
}
// Placeholder: reading a single record is not implemented for this reader.
// The file is not touched; the result is always a pair of one-space strings.
pair<string,string> FastqReaderSqueker::readSeq(){
    const char* placeholder = " ";
    return pair<string,string>(placeholder, placeholder);
}
/*
 * Moves `pos` to the first character after the next end-of-line sequence.
 *
 * Scans part[pos .. max_pos-3] for an EOL character ('\n' or '\r') that is
 * immediately followed by a non-EOL character, so a "\r\n" pair (or any run
 * of EOL characters) is skipped as a whole.
 *
 * Returns true and sets `pos` to the index just past that EOL character.
 * Returns false, leaving `pos` unchanged, when no such position exists
 * within the scanned range.
 */
bool skip_next_eol(char *part, int64_t &pos, int64_t max_pos)
{
    const int64_t limit = max_pos - 2;
    int64_t cursor = pos;
    while(cursor < limit)
    {
        const bool atEol = (part[cursor] == '\n' || part[cursor] == '\r');
        if(atEol)
        {
            const char following = part[cursor + 1];
            if(following != '\n' && following != '\r')
            {
                pos = cursor + 1;
                return true;
            }
        }
        ++cursor;
    }
    return false;
}
// Extracts the sequence line of every 4-line FASTQ record found in the
// current chunk (fp->part, fp->size bytes) and appends it to `res` as a
// (sequence, "") pair. `res` is not cleared here.
//
// Lines are split on '\n' only. Every memchr() result is checked before it
// is used: parsing stops cleanly when the chunk ends in the middle of a
// record or when its last line has no trailing newline. A record is emitted
// as soon as its sequence line is newline-terminated; a record whose
// sequence line is cut off is dropped.
// NOTE(review): a '\r' preceding '\n' is kept as part of the sequence, as
// before — confirm whether CRLF input needs to be supported.
void FastqReaderSqueker::parseReads(deque<pair<string,string> >* res)
{
    char* cursor = fp->part;
    if(cursor == NULL)
        return;
    char* const end = cursor + fp->size;

    // Start of the line following the one that contains `from`, or nullptr
    // when no '\n' remains in [from, end).
    auto nextLine = [end](char* from) -> char* {
        if(from >= end)
            return nullptr;
        char* newline = static_cast<char*>(memchr(from, '\n', end - from));
        return newline ? newline + 1 : nullptr;
    };

    while(cursor < end) {
        char* seqBegin = nextLine(cursor);            // skip the header line
        if(seqBegin == nullptr)
            break;
        char* seqEnd = static_cast<char*>(memchr(seqBegin, '\n', end - seqBegin));
        if(seqEnd == nullptr)
            break;                                    // sequence line is cut off
        res->push_back(make_pair(string(seqBegin, seqEnd - seqBegin), ""));

        cursor = nextLine(seqEnd + 1);                // skip the separator line
        if(cursor == nullptr)
            break;
        cursor = nextLine(cursor);                    // skip the quality line
        if(cursor == nullptr)
            break;
    }
}
// Reads the next chunk of the FASTQ file and appends the sequences of all
// complete records in it to `res` (which is cleared first).
//
// res : output container of (sequence, "") pairs.
// N   : accepted for interface compatibility only; the amount read per call
//       is a fixed 8 MiB byte chunk, not a record count.
//
// Each call prepends the tail carried over from the previous call
// (fp->part_buffer, fp->part_filled bytes), reads up to part_size new bytes,
// and then either
//   - at end of file: parses everything that is buffered, or
//   - otherwise: looks for the start of a FASTQ record in the last
//     OVERHEAD_SIZE/2 bytes, parses everything before it, and saves the rest
//     as the tail for the next call.
// Calls made once the file is at EOF return immediately with `res` empty.
// Terminates the process on a read error or when no record boundary can be
// found (tail would not fit in the carry-over buffer).
void FastqReaderSqueker::readNSeq(deque<pair<string,string> >* res, uint64_t N){
    (void)N;
    const uint32_t OVERHEAD_SIZE = 65535;
    const uint64_t part_size = 1ULL << 23;
    char*& _part = fp->part;
    uint64_t& _size = fp->size;
    char*& part_buffer = fp->part_buffer;
    uint64_t& part_filled = fp->part_filled;

    res->clear();
    // Check EOF before touching `part`: the EOF branch below sets it to NULL,
    // so a repeated call must not memcpy into it.
    if(isEOF())
        return;
    // Validate the carry-over size before it is used as a copy length.
    if(part_filled >= OVERHEAD_SIZE)
    {
        cout << "Error: Wrong input file!\n";
        exit(EXIT_FAILURE);
    }

    memcpy(part, part_buffer, part_filled);
    const uint64_t readed = fread(part+part_filled, 1, part_size, seqIn);
    if(readed < part_size && ferror(seqIn))
    {
        // A short read that is not EOF would otherwise make the boundary
        // search below start at a negative index.
        cout << "Error: Failed to read input file!\n";
        exit(EXIT_FAILURE);
    }
    const int64_t total_filled = part_filled + readed;

    if(isEOF())
    {
        // Last chunk: everything buffered belongs to it, nothing is carried over.
        _part = part;
        _size = total_filled;
        part_filled = 0;
        part = NULL;
        parseReads(res);
        return;
    }

    // Looking for a FASTQ record at the end of the area
    int64_t line_start[9];
    int32_t j;
    int64_t i = total_filled - OVERHEAD_SIZE / 2;
    for(j = 0; j < 9; ++j)
    {
        if(!skip_next_eol(part, i, total_filled))
            break;
        line_start[j] = i;
    }

    _part = part;
    _size = 0;      // stays 0 when no record start is identified
    if(j == 9)
    {
        // Candidate k is a record start when line k begins with '@', line
        // k+2 begins with '+', and the '+' line is either bare or repeats
        // the header text of line k.
        int k;
        for(k = 0; k < 4; ++k)
        {
            if(part[line_start[k]+0] == '@' && part[line_start[k+2]+0] == '+')
            {
                if(part[line_start[k+2]+1] == '\n' || part[line_start[k+2]+1] == '\r')
                    break;
                if(line_start[k+1]-line_start[k] == line_start[k+3]-line_start[k+2] &&
                   memcmp(part+line_start[k]+1, part+line_start[k+2]+1,
                          line_start[k+3]-line_start[k+2]-1) == 0)
                    break;
            }
        }
        if(k < 4)
            _size = line_start[k];
    }

    // The tail must fit in part_buffer (OVERHEAD_SIZE bytes). It only fails
    // to when no boundary was found; report that before copying rather than
    // after overrunning the buffer.
    const uint64_t tail = total_filled - _size;
    if(tail >= OVERHEAD_SIZE)
    {
        cout << "Error: Wrong input file!\n";
        exit(EXIT_FAILURE);
    }
    copy(_part+_size, _part+total_filled, part_buffer);
    part_filled = tail;
    parseReads(res);
}
// Reports whether the end-of-file indicator is set on the input stream,
// i.e. whether a previous read has hit the end of the file.
bool FastqReaderSqueker::isEOF(){
    const int eofFlag = feof(seqIn);
    return eofFlag != 0;
}