-
Notifications
You must be signed in to change notification settings - Fork 1
/
bwtindex.c
1629 lines (1568 loc) · 65.4 KB
/
bwtindex.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#include <stdio.h>
#include <stdlib.h>
#include <limits.h>
#include "bwtindex.h"
#include "tools.h"
#ifdef _MSC_VER
#pragma warning(disable:4996)
#endif
#define DEBUG_BWTIS 1
#define FILEHEADER "IDX0"
typedef struct _IndexBlock {
unsigned int bwtLowBits;
unsigned int bwtHighBits;
unsigned int specialLettersMask;
unsigned int letterJumpsSample[5];
unsigned int textPositionSample;
} IndexBlock;
unsigned int offsetMasks[32] = { // = (1UL<<offset)
0x00000001, // 1st bit
0x00000002, // 2nd bit
0x00000004, // 3rd bit
0x00000008, // 4th bit
0x00000010, // 5th bit
0x00000020, // 6th bit
0x00000040, // 7th bit
0x00000080, // 8th bit
0x00000100, // 9th bit
0x00000200, // 10th bit
0x00000400, // 11th bit
0x00000800, // 12th bit
0x00001000, // 13th bit
0x00002000, // 14th bit
0x00004000, // 15th bit
0x00008000, // 16th bit
0x00010000, // 17th bit
0x00020000, // 18th bit
0x00040000, // 19th bit
0x00080000, // 20th bit
0x00100000, // 21st bit
0x00200000, // 22nd bit
0x00400000, // 23rd bit
0x00800000, // 24th bit
0x01000000, // 25th bit
0x02000000, // 26th bit
0x04000000, // 27th bit
0x08000000, // 28th bit
0x10000000, // 29th bit
0x20000000, // 30th bit
0x40000000, // 31st bit
0x80000000 // 32nd bit
};
unsigned int inverseOffsetMasks[32] = { // = ~(1UL<<offset)
0xFFFFFFFE, // 1st bit
0xFFFFFFFD, // 2nd bit
0xFFFFFFFB, // 3rd bit
0xFFFFFFF7, // 4th bit
0xFFFFFFEF, // 5th bit
0xFFFFFFDF, // 6th bit
0xFFFFFFBF, // 7th bit
0xFFFFFF7F, // 8th bit
0xFFFFFEFF, // 9th bit
0xFFFFFDFF, // 10th bit
0xFFFFFBFF, // 11th bit
0xFFFFF7FF, // 12th bit
0xFFFFEFFF, // 13th bit
0xFFFFDFFF, // 14th bit
0xFFFFBFFF, // 15th bit
0xFFFF7FFF, // 16th bit
0xFFFEFFFF, // 17th bit
0xFFFDFFFF, // 18th bit
0xFFFBFFFF, // 19th bit
0xFFF7FFFF, // 20th bit
0xFFEFFFFF, // 21st bit
0xFFDFFFFF, // 22nd bit
0xFFBFFFFF, // 23rd bit
0xFF7FFFFF, // 24th bit
0xFEFFFFFF, // 25th bit
0xFDFFFFFF, // 26th bit
0xFBFFFFFF, // 27th bit
0xF7FFFFFF, // 28th bit
0xEFFFFFFF, // 29th bit
0xDFFFFFFF, // 30th bit
0xBFFFFFFF, // 31st bit
0x7FFFFFFF // 32nd bit
};
unsigned int firstLettersMasks[33] = { // all bits before the offset = ((1UL<<offset)-1UL)
0x00000000, // lower 0 letters
0x00000001, // lower 1 letters
0x00000003, // lower 2 letters
0x00000007, // lower 3 letters
0x0000000F, // lower 4 letters
0x0000001F, // lower 5 letters
0x0000003F, // lower 6 letters
0x0000007F, // lower 7 letters
0x000000FF, // lower 8 letters
0x000001FF, // lower 9 letters
0x000003FF, // lower 10 letters
0x000007FF, // lower 11 letters
0x00000FFF, // lower 12 letters
0x00001FFF, // lower 13 letters
0x00003FFF, // lower 14 letters
0x00007FFF, // lower 15 letters
0x0000FFFF, // lower 16 letters
0x0001FFFF, // lower 17 letters
0x0003FFFF, // lower 18 letters
0x0007FFFF, // lower 19 letters
0x000FFFFF, // lower 20 letters
0x001FFFFF, // lower 21 letters
0x003FFFFF, // lower 22 letters
0x007FFFFF, // lower 23 letters
0x00FFFFFF, // lower 24 letters
0x01FFFFFF, // lower 25 letters
0x03FFFFFF, // lower 26 letters
0x07FFFFFF, // lower 27 letters
0x0FFFFFFF, // lower 28 letters
0x1FFFFFFF, // lower 29 letters
0x3FFFFFFF, // lower 30 letters
0x7FFFFFFF, // lower 31 letters
0xFFFFFFFF // lower 32 letters
};
unsigned int lastLettersMasks[33] = { // all bits at or after offset = ~((1UL<<offset)-1UL)
0xFFFFFFFF, // higher 32 letters
0xFFFFFFFE, // higher 31 letters
0xFFFFFFFC, // higher 30 letters
0xFFFFFFF8, // higher 29 letters
0xFFFFFFF0, // higher 28 letters
0xFFFFFFE0, // higher 27 letters
0xFFFFFFC0, // higher 26 letters
0xFFFFFF80, // higher 25 letters
0xFFFFFF00, // higher 24 letters
0xFFFFFE00, // higher 23 letters
0xFFFFFC00, // higher 22 letters
0xFFFFF800, // higher 21 letters
0xFFFFF000, // higher 20 letters
0xFFFFE000, // higher 19 letters
0xFFFFC000, // higher 18 letters
0xFFFF8000, // higher 17 letters
0xFFFF0000, // higher 16 letters
0xFFFE0000, // higher 15 letters
0xFFFC0000, // higher 14 letters
0xFFF80000, // higher 13 letters
0xFFF00000, // higher 12 letters
0xFFE00000, // higher 11 letters
0xFFC00000, // higher 10 letters
0xFF800000, // higher 9 letters
0xFF000000, // higher 8 letters
0xFE000000, // higher 7 letters
0xFC000000, // higher 6 letters
0xF8000000, // higher 5 letters
0xF0000000, // higher 4 letters
0xE0000000, // higher 3 letters
0xC0000000, // higher 2 letters
0x80000000, // higher 1 letters
0x00000000 // higher 0 letters
};
unsigned int searchOffsetMasks[32] = { // all bits before or at the offset, except 1st one = (((1UL<<(offset+1))-1UL)&(~1UL))
0x00000000, // lower 1 letters
0x00000002, // lower 2 letters
0x00000006, // lower 3 letters
0x0000000E, // lower 4 letters
0x0000001E, // lower 5 letters
0x0000003E, // lower 6 letters
0x0000007E, // lower 7 letters
0x000000FE, // lower 8 letters
0x000001FE, // lower 9 letters
0x000003FE, // lower 10 letters
0x000007FE, // lower 11 letters
0x00000FFE, // lower 12 letters
0x00001FFE, // lower 13 letters
0x00003FFE, // lower 14 letters
0x00007FFE, // lower 15 letters
0x0000FFFE, // lower 16 letters
0x0001FFFE, // lower 17 letters
0x0003FFFE, // lower 18 letters
0x0007FFFE, // lower 19 letters
0x000FFFFE, // lower 20 letters
0x001FFFFE, // lower 21 letters
0x003FFFFE, // lower 22 letters
0x007FFFFE, // lower 23 letters
0x00FFFFFE, // lower 24 letters
0x01FFFFFE, // lower 25 letters
0x03FFFFFE, // lower 26 letters
0x07FFFFFE, // lower 27 letters
0x0FFFFFFE, // lower 28 letters
0x1FFFFFFE, // lower 29 letters
0x3FFFFFFE, // lower 30 letters
0x7FFFFFFE, // lower 31 letters
0xFFFFFFFE // lower 32 letters
};
unsigned int firstLetterMask = 0x00000001; // lowest bit
unsigned int lastLetterMask = 0x80000000; // highest bit
unsigned int secondLetterMask = 0x00000002; // 2nd lowest bit
unsigned int firstHalfLettersMask = 0x0000FFFF; // lowest 16 bits
unsigned int lastHalfLettersMask = 0xFFFF0000; // highest 16 bits
unsigned int letterLowBitMasks[6] = {
0x00000000, // unused (for '$' (00))
0xFFFFFFFF, // unused (for 'N' (01))
0x00000000, // 1st bit mask for 'A' (00): 0...0
0xFFFFFFFF, // 1st bit mask for 'C' (01): 1...1
0x00000000, // 1st bit mask for 'G' (10): 0...0
0xFFFFFFFF // 1st bit mask for 'T' (11): 1...1
};
unsigned int letterHighBitMasks[6] = {
0x00000000, // unused (for '$' (00))
0x00000000, // unused (for 'N' (01))
0x00000000, // 2nd bit mask for 'A' (00): 0...0
0x00000000, // 2nd bit mask for 'C' (01): 0...0
0xFFFFFFFF, // 2nd bit mask for 'G' (10): 1...1
0xFFFFFFFF // 2nd bit mask for 'T' (11): 1...1
};
unsigned int inverseLetterLowBitMasks[6] = {
0xFFFFFFFF, // unused (for '$' (00))
0x00000000, // unused (for 'N' (01))
0xFFFFFFFF, // 1st bit mask for 'A' (00): ~0...0 = 1...1
0x00000000, // 1st bit mask for 'C' (01): ~1...1 = 0...0
0xFFFFFFFF, // 1st bit mask for 'G' (10): ~0...0 = 1...1
0x00000000 // 1st bit mask for 'T' (11): ~1...1 = 0...0
};
unsigned int inverseLetterHighBitMasks[6] = {
0xFFFFFFFF, // unused (for '$' (00))
0xFFFFFFFF, // unused (for 'N' (01))
0xFFFFFFFF, // 2nd bit mask for 'A' (00): ~0...0 = 1...1
0xFFFFFFFF, // 2nd bit mask for 'C' (01): ~0...0 = 1...1
0x00000000, // 2nd bit mask for 'G' (10): ~1...1 = 0...0
0x00000000 // 2nd bit mask for 'T' (11): ~1...1 = 0...0
};
char letterChars[6] = { '$' , 'N' , 'A' , 'C' , 'G' , 'T' }; // get letter char from letter id
unsigned int sampleIntervalShift = 5; // sample interval of 32 positions (2^5=32)
unsigned int sampleIntervalMask = 0x0000001F; // = ((1<<sampleIntervalShift)-1) = (32-1)
unsigned int sampleIntervalSize = 32; // = (1<<sampleIntervalShift) = 32
unsigned int sampleIntervalHalfSize = 16; // = (1<<(sampleIntervalShift-1)) = 16
// varibles needed to load/store index
struct _IndexBlock *Index = NULL;
unsigned int textSize = 0; // inside the index functions, textSize always counts with the terminator char
unsigned int numSamples = 0;
unsigned int lastBwtPos = 0; // position in the BWT of the 0-th entry of the last sample (multiple of 32 to speed up search of 1st char of pattern)
char *textFilename = NULL;
unsigned char *letterIds = NULL;
void SetCharAtBWTPos(unsigned int charid, unsigned int bwtpos){
unsigned int sample = ( bwtpos >> sampleIntervalShift );
IndexBlock *block = &(Index[sample]); // get sample block
unsigned int offset = ( bwtpos & sampleIntervalMask );
unsigned int mask = inverseOffsetMasks[offset]; // get all bits except the one at the offset
(block->specialLettersMask) &= mask; // reset special letter mark
(block->bwtLowBits) &= mask; // reset low bit
(block->bwtHighBits) &= mask; // reset high bit
mask = offsetMasks[offset]; // get only the bit at the offset
if( charid < 2 ) (block->specialLettersMask) |= mask; // set special letter mark if needed (for ids 0 and 1)
(block->bwtLowBits) |= ( letterLowBitMasks[charid] & mask ); // set low bit
(block->bwtHighBits) |= ( letterHighBitMasks[charid] & mask ); // set high bit
}
unsigned int GetCharIdAtBWTPos(unsigned int bwtpos){
unsigned int sample, offset, mask, charid;
IndexBlock *block;
sample = ( bwtpos >> sampleIntervalShift );
offset = ( bwtpos & sampleIntervalMask );
block = &(Index[sample]); // get sample block
mask = offsetMasks[offset]; // get only the bit at the offset
charid = ( ( (block->bwtLowBits) >> offset ) & firstLetterMask ); // get low bit
charid |= ( ( ( (block->bwtHighBits) >> offset ) & firstLetterMask ) << 1 ); // get high bit
if( !( (block->specialLettersMask) & mask ) ) charid += 2; // check if special letter bit is set, because regular ids start at id=2
return charid;
}
char GetCharAtBWTPos(unsigned int bwtpos){
return letterChars[ GetCharIdAtBWTPos(bwtpos) ]; // get letter char
}
/*
unsigned int GetRightCharIdAtBWTPos(unsigned int bwtpos){
unsigned int charid;
charid = 5;
while( (charid != 0) && (bwtpos < letterStartPos[charid]) ) charid--;
return charid;
}
*/
void InitializeLetterIdsArray(){
int i;
letterIds=(unsigned char *)malloc(256*sizeof(unsigned char));
for(i=0;i<256;i++) letterIds[i]=(unsigned char)1; // ACGT, N and $
letterIds[(int)'$']=(unsigned char)0;
letterIds[(int)'N']=(unsigned char)1;
letterIds[(int)'A']=(unsigned char)2;
letterIds[(int)'C']=(unsigned char)3;
letterIds[(int)'G']=(unsigned char)4;
letterIds[(int)'T']=(unsigned char)5;
letterIds[(int)'n']=(unsigned char)1;
letterIds[(int)'a']=(unsigned char)2;
letterIds[(int)'c']=(unsigned char)3;
letterIds[(int)'g']=(unsigned char)4;
letterIds[(int)'t']=(unsigned char)5;
}
unsigned int FMI_GetTextSize(){
return (textSize-1); // inside the index functions, the textSize variable counts the terminator char too
}
unsigned int FMI_GetBWTSize(){
return lastBwtPos; // last valid filled position of the BWT (aligned to a multiple of the sample interval)
}
char *FMI_GetTextFilename(){
return textFilename;
}
// TODO: find way to remove the decrement in letterJumpsSample[(letterId-1)]
// TODO: move code with the check for special letters to another function, because on normal search we never use jumps by $'s or N's (but on position serch, we do by N's)
unsigned int FMI_LetterJump( unsigned int letterId , unsigned int bwtPos ){
unsigned int offset, bitArray, letterJump;
IndexBlock *block;
block = &(Index[( bwtPos >> sampleIntervalShift )]);
offset = ( bwtPos & sampleIntervalMask );
bitArray = searchOffsetMasks[offset]; // all bits bellow offset, except 1st one
if( letterId < 2 ) bitArray &= (block->specialLettersMask); // if we are looking for '$' or 'N', keep only special chars
else bitArray &= ( ~ (block->specialLettersMask) ); // remove special chars
bitArray &= ( (block->bwtLowBits) ^ inverseLetterLowBitMasks[letterId] ); // keep only ones with the same low bit
bitArray &= ( (block->bwtHighBits) ^ inverseLetterHighBitMasks[letterId] ); // keep only ones with the same high bit
letterJump = (block->letterJumpsSample[(letterId-1)]); // get top letter jump
while( bitArray ){
letterJump++; // of other equal letter exist, increase letter jump
bitArray &= ( bitArray - 1U ); // clear lower occurrence bit
}
return letterJump;
}
// TODO: find better way to deal with original toppointer value
// TODO: move code of called functions inside and reuse variables for speed up
// NOTE: returns the size of the BWT interval if a match exists, and 0 otherwise
unsigned int FMI_FollowLetter( char c , unsigned int *topPointer , unsigned int *bottomPointer ){
unsigned int charid, originaltoppointer;
originaltoppointer = (*topPointer);
charid = letterIds[(int)c];
(*topPointer) = FMI_LetterJump( charid , originaltoppointer );
if( GetCharIdAtBWTPos(originaltoppointer) != charid ) (*topPointer)++; // if the letter is not in the topPointer position (the initial, not the updated one), its next occurrence is after that
(*bottomPointer) = FMI_LetterJump( charid , (*bottomPointer) );
if( (*topPointer) > (*bottomPointer) ) return 0;
return ( (*bottomPointer) - (*topPointer) + 1 );
/*
unsigned int bwtPos, letterId, offset, bitArray, letterJump;
IndexBlock *block;
letterId = letterIds[c];
bwtPos = (*topPointer); // process top pointer
block = &(Index[( bwtPos >> sampleIntervalShift )]); // get sample block
bitArray = ( ~ (block->specialLettersMask) ); // remove special chars
bitArray &= ( (block->bwtLowBits) ^ inverseLetterLowBitMasks[letterId] ); // keep only ones with the same low bit
bitArray &= ( (block->bwtHighBits) ^ inverseLetterHighBitMasks[letterId] ); // keep only ones with the same high bit
letterJump = (block->letterJumpsSample[letterId]); // get top letter jump
offset = ( bwtPos & sampleIntervalMask ); // get offset inside sample
if( !( bitArray & offsetMasks[offset] ) ) letterJump++; // if the letter is not in the topPointer position, its next occurrence is bellow that
bitArray &= searchOffsetMasks[offset]; // keep all bits bellow offset, except 1st one
while( bitArray ){ // while there are occurrences of this letter in the interval
letterJump++; // if other equal letter exist, increase letter jump
bitArray &= ( bitArray - 1U ); // clear lower occurrence bit
}
(*topPointer) = letterJump;
bwtPos = (*bottomPointer); // process bottom pointer
block = &(Index[( bwtPos >> sampleIntervalShift )]);
offset = ( bwtPos & sampleIntervalMask );
bitArray = searchOffsetMasks[offset]; // all bits bellow offset, except 1st one
bitArray &= ( ~ (block->specialLettersMask) ); // remove special chars
bitArray &= ( (block->bwtLowBits) ^ inverseLetterLowBitMasks[letterId] ); // keep only ones with the same low bit
bitArray &= ( (block->bwtHighBits) ^ inverseLetterHighBitMasks[letterId] ); // keep only ones with the same high bit
letterJump = (block->letterJumpsSample[letterId]); // get top letter jump
while( bitArray ){
letterJump++; // if other equal letter exist, increase letter jump
bitArray &= ( bitArray - 1U ); // clear lower occurrence bit
}
(*bottomPointer) = letterJump;
if( (*topPointer) > (*bottomPointer) ) return 0;
return ( (*bottomPointer) - (*topPointer) + 1 );
*/
}
// TODO: move code of called functions inside and reuse variables for speed up
unsigned int FMI_PositionInText( unsigned int bwtpos ){
unsigned int charid, addpos;
addpos = 0;
while( bwtpos & sampleIntervalMask ){ // move backwards until we land on a position with a sample
charid = GetCharIdAtBWTPos(bwtpos);
if( charid == 0 ) return addpos; // check if this is the terminator char
bwtpos = FMI_LetterJump( charid , bwtpos ); // follow the left letter backwards
addpos++; // one more position away from our original position
}
return ( (Index[( bwtpos >> sampleIntervalShift )].textPositionSample) + addpos );
}
void FMI_LoadIndex(char *indexfilename){
FILE *indexfile;
size_t readcount;
fpos_t filepos;
unsigned int i;
long long int seqstart, seqend, seqsize;
char c;
char fileHeader[5] = FILEHEADER;
size_t ncount = 0; ncount = (size_t)ncount;
printf("> Loading index from file <%s> ... ",indexfilename);
fflush(stdout);
indexfile = fopen(indexfilename,"rb");
if( indexfile == NULL ){
printf("\n> ERROR: Cannot open file\n");
exit(0);
}
seqstart=(long long int)ftell(indexfile);
fseek(indexfile,0L,SEEK_END);
seqend=(long long int)ftell(indexfile);
seqsize=(seqend-seqstart);
rewind(indexfile);
printf("(");PrintNumber(seqsize);printf(" bytes)");
fflush(stdout);
for(i=0;i<4;i++){ // check if header is "IDX0"
ncount = fread( &c , sizeof(char) , (size_t)1 , indexfile );
if( c != fileHeader[i] ) break;
}
if( i != 4 ){
printf("\n> ERROR: Invalid index file\n");
exit(0);
}
fgetpos(indexfile,&filepos);
i=0;
while( c!='\0' && c!=EOF ){ // get text filename size
ncount = fread( &c , sizeof(char) , (size_t)1 , indexfile );
i++;
}
textFilename=(char *)calloc((i),sizeof(char));
fsetpos(indexfile,&filepos);
ncount = fread( textFilename , sizeof(char) , (size_t)i , indexfile ); // get text filename chars
ncount = fread( &textSize , sizeof(unsigned int) , (size_t)1 , indexfile ); // counts the terminator char too, so it is actually the BWT size
ncount = fread( &numSamples , sizeof(unsigned int) , (size_t)1 , indexfile );
#ifdef DEBUG
printf("\n [textFilename=\"%s\";textSize=%u;numSamples=%u]",textFilename,textSize,numSamples);
fflush(stdout);
#endif
i = ( ( ( textSize-1 ) >> sampleIntervalShift ) + 1 );
if( ((textSize-1) & sampleIntervalMask) != 0 ) i++; // extra sample
if( numSamples != i ){ // check if number of samples is correct based on text size
printf("\n> ERROR: Invalid index data\n");
exit(0);
}
Index = (IndexBlock *)malloc( numSamples * sizeof(IndexBlock) ); // allocate memory for all index blocks
if( Index == NULL ){
printf("\n> ERROR: Not enough memory\n");
exit(0);
}
readcount = fread( Index , sizeof(IndexBlock) , (size_t)(numSamples) , indexfile ); // read all index blocks
if( readcount != (size_t)numSamples ){
printf("\n> ERROR: Incomplete index data\n");
exit(0);
}
fclose(indexfile);
InitializeLetterIdsArray(); // initialize arrays used by index functions
lastBwtPos = ((numSamples-1) << sampleIntervalShift); // set last BWT pos for pattern matching initialization
printf(" OK\n");
fflush(stdout);
}
// TODO: use indexBlocksStartInFile, distanceToNextBlockBits
void FMI_SaveIndex(char *indexfilename){
FILE *indexfile;
size_t writecount;
long long int filesize;
int i;
printf("> Saving index to file <%s> ... ",indexfilename);
fflush(stdout);
indexfile = fopen(indexfilename,"wb");
if( indexfile == NULL ){
printf("\n> ERROR: Cannot create file\n");
exit(0);
}
i=0;
while(textFilename[i]!='\0') i++; // get text filename size
i++;
fwrite( (FILEHEADER) , sizeof(char) , (size_t)4 , indexfile );
fwrite( textFilename , sizeof(char) , (size_t)i , indexfile );
fwrite( &textSize , sizeof(unsigned int) , (size_t)1 , indexfile );
fwrite( &numSamples , sizeof(unsigned int) , (size_t)1 , indexfile );
writecount = fwrite( Index , sizeof(IndexBlock) , (size_t)(numSamples) , indexfile );
if( writecount != (size_t)(numSamples) ){
printf("\n> ERROR: Cannot write file\n");
exit(0);
}
filesize=(long long int)ftell(indexfile);
fclose(indexfile);
printf("(");PrintNumber(filesize);printf(" bytes) OK\n");
fflush(stdout);
}
void FMI_FreeIndex(){
if(textFilename!=NULL){
free(textFilename);
textFilename=NULL;
}
if(Index!=NULL){
free(Index);
Index=NULL;
}
if(letterIds!=NULL){
free(letterIds);
letterIds=NULL;
}
}
void PrintBWT(char *text, unsigned int *letterStartPos){
unsigned int i, n, p;
printf("%u {", textSize );
for( n = 1 ; n < 6 ; n++ ){
printf(" %c [%02u-%02u] %c", letterChars[n] , letterStartPos[n] , (n==5)?(textSize-1):(letterStartPos[(n+1)]-1) , (n==5)?'}':',' );
}
printf(" 2^%u=%u %u %#.8X\n", sampleIntervalShift , sampleIntervalSize , numSamples , sampleIntervalMask );
printf("[ ] ( ) {");
for( n = 1 ; n < 6 ; n++ ){
printf(" %c%c", letterChars[n] , (n==5)?'}':',' );
}
printf("\n");
for( i = 0 ; i < textSize ; i++ ){ // position in BWT
p = FMI_PositionInText( i );
printf("[%02u]%c(%2u) {", i , (i & sampleIntervalMask)?' ':'*' , p );
for( n = 1 ; n < 6 ; n++ ){
printf("%02u%c", FMI_LetterJump( n , i ) , (n==5)?'}':',' );
}
printf(" %c ", GetCharAtBWTPos(i) );
n = 0;
while( ( n < 5 ) && ( i >= letterStartPos[(n+1)] ) ) n++;
printf(" %c ", letterChars[n] );
if( text != NULL ) printf("%s", (char *)(text+p+1) );
printf("\n");
}
}
typedef struct _PackedNumberArray {
unsigned long long *bitsArray;
unsigned char bitsPerInt;
unsigned int numWords;
//unsigned char bitsPerWord; // = 64
} PackedNumberArray;
PackedNumberArray *NewPackedNumberArray(unsigned int numInts, unsigned int maxInt){
PackedNumberArray *intArray;
unsigned long long numBits;
unsigned int n;
intArray = (PackedNumberArray *)malloc(sizeof(PackedNumberArray));
//(intArray->bitsPerWord) = (unsigned char)(sizeof(unsigned long long)*8); // use 64 bit words (8 bytes * 8 bits/byte)
n = 1; // number of bits needed to store one number
while( ( (1UL << n) - 1 ) < maxInt ) n++; // n bits per number (stores 2^n numbers, but the last one is (2^n-1))
(intArray->bitsPerInt) = (unsigned char)n;
numBits = ( ((unsigned long long)numInts) * ((unsigned long long)n) ); // total number of bits occupied by all the numbers
if( numBits != 0) numBits--; // if it was a multiple of 64 , it would create an extra unused word
n = (unsigned int)( (numBits/64ULL) + 1ULL); // number of 64 bit words required to store (numInts) numbers of (bitsPerInt) bits each
(intArray->numWords) = n;
(intArray->bitsArray) = (unsigned long long *)calloc(n,sizeof(unsigned long long)); // bit array that will store the numbers
return intArray;
}
void FreePackedNumberArray(PackedNumberArray *intArray){
free(intArray->bitsArray);
free(intArray);
}
void ResetPackedNumberArray(PackedNumberArray *intArray){
unsigned int n;
n = (intArray->numWords);
while( n != 0 ) (intArray->bitsArray)[(--n)] = 0ULL;
}
unsigned int GetPackedNumber(PackedNumberArray *intArray, unsigned int pos){
unsigned char offset, bitsPerInt;
unsigned long long temp;
unsigned int number;
bitsPerInt = (intArray->bitsPerInt);
temp = ((unsigned long long)(pos)) * ((unsigned long long)(bitsPerInt)); // number of bits
offset = (unsigned char)( temp & 63ULL ); // mask by (64-1) to get position of 1st bit inside the word
pos = (unsigned int)( temp >> 6 ); // divide by 64 to get word position inside bit array
temp = ( ( 1ULL << bitsPerInt ) - 1ULL ); // mask for the n bits of each number
number = (unsigned int)( ( (intArray->bitsArray)[pos] >> offset ) & temp );
offset = ( 64 - offset ); // check if the bits of the number extend to the next word (get number of bits left until the end of this word)
if( offset < bitsPerInt ) number |= (unsigned int)( ( (intArray->bitsArray)[(++pos)] << offset ) & temp );
return number;
}
void SetPackedNumber(PackedNumberArray *intArray, unsigned int pos, unsigned int num){
unsigned char offset, bitsPerInt;
unsigned long long temp;
bitsPerInt = (intArray->bitsPerInt);
temp = ((unsigned long long)(pos)) * ((unsigned long long)(bitsPerInt)); // number of bits
offset = (unsigned char)( temp & 63ULL ); // mask by (64-1) to get position of 1st bit inside the word
pos = (unsigned int)( temp >> 6 ); // divide by 64 to get word position inside bit array
(intArray->bitsArray)[pos] |= ( ((unsigned long long)num) << offset );
offset = ( 64 - offset ); // check if the bits of the number extend to the next word (get number of bits left until the end of this word)
if( offset < bitsPerInt ) (intArray->bitsArray)[(++pos)] |= ( ((unsigned long long)num) >> offset );
}
void ReplacePackedNumber(PackedNumberArray *intArray, unsigned int pos, unsigned int num){
unsigned char offset, bitsPerInt;
unsigned long long temp;
bitsPerInt = (intArray->bitsPerInt);
temp = ((unsigned long long)(pos)) * ((unsigned long long)(bitsPerInt)); // number of bits
offset = (unsigned char)( temp & 63ULL ); // mask by (64-1) to get position of 1st bit inside the word
pos = (unsigned int)( temp >> 6 ); // divide by 64 to get word position inside bit array
temp = ( ((unsigned long long)num) << offset ); // rightmost bits to set
(intArray->bitsArray)[pos] &= temp; // only keep common 1 bits
(intArray->bitsArray)[pos] |= temp; // set the missing 1 bits (same as reset and then set)
offset = ( 64 - offset ); // check if the bits of the number extend to the next word (get number of bits left until the end of this word)
if( offset < bitsPerInt ){
pos++; // next word
temp = ( ((unsigned long long)num) >> offset ); // leftmost bits to set
(intArray->bitsArray)[pos] &= temp; // only keep common 1 bits
(intArray->bitsArray)[pos] |= temp; // set the missing 1 bits (same as reset and then set)
}
}
typedef struct _PackedIncreasingNumberArray {
unsigned char numTotalBits;
unsigned char numHighBits;
unsigned char numLowBits;
unsigned int highBitsMask;
unsigned int lowBitsMask;
unsigned int *highLimits;
PackedNumberArray *lowBits;
} PackedIncreasingNumberArray;
PackedIncreasingNumberArray *NewPackedIncreasingNumberArray(unsigned int numInts, unsigned int maxInt){
PackedIncreasingNumberArray *incIntArray;
unsigned long long numBits;
unsigned int k;
unsigned char n;
incIntArray = (PackedIncreasingNumberArray *)malloc(sizeof(PackedIncreasingNumberArray));
//k = (unsigned int)(sizeof(unsigned int)*8); // use 32 bits words
numBits = (unsigned long long)(numInts*32); // total number of bits to store all the words regularly
n = 0; // get best number of high bits to compact
while( (((1ULL << n)-1ULL)*32ULL) < numBits ) n++; // for compacting n high bits we need (2^n-1) extra numbers
if( n != 0 ) n--; // the current value already exceeded the regular size
(incIntArray->numHighBits) = n;
n = 0; // number of bits per number
k = 1; // power of two (2^n)
while( k && ( (k-1) < maxInt ) ){ // (2^n-1) is the largest number that can be represented by n bits
n++;
k <<= 1;
}
if( k != 0 ) k--; // mask for all bits
else k = (~k); // n = 32, whole word (~0UL)
(incIntArray->numTotalBits) = n; // total bits of each number
n -= (incIntArray->numHighBits); // number of lower bits
(incIntArray->numLowBits) = n;
(incIntArray->lowBitsMask) = ( (1UL << n) - 1UL ); // mask for lower bits
(incIntArray->highBitsMask) = ( k ^ (incIntArray->lowBitsMask) ); // all bits of number except the low ones
k = ( ( 1UL << (incIntArray->numHighBits) ) - 1UL ); // number of limits needed for compacting the high bits
(incIntArray->highLimits) = (unsigned int *)calloc(k,sizeof(unsigned int));
k = ( ( 1UL << (incIntArray->numLowBits) ) - 1UL ); // highest number representable by the low bits
(incIntArray->lowBits) = NewPackedNumberArray(numInts,k);
return incIntArray;
}
void FreePackedIncreasingNumberArray(PackedIncreasingNumberArray *incIntArray){
FreePackedNumberArray(incIntArray->lowBits);
free(incIntArray->highLimits);
free(incIntArray);
}
void ResetPackedIncreasingNumberArray(PackedIncreasingNumberArray *incIntArray){
unsigned int n;
n = ( ( 1UL << (incIntArray->numHighBits) ) - 1UL ); // number of limits used
while( n != 0 ) (incIntArray->highLimits)[(--n)] = 0UL;
ResetPackedNumberArray(incIntArray->lowBits);
}
unsigned int GetPackedIncreasingNumber(PackedIncreasingNumberArray *incIntArray, unsigned int pos){
unsigned char n;
unsigned int number, limit, i, k;
number = 0UL;
i = pos; // position in the increasing numbers array
k = 0; // current position in the limits array
n = (incIntArray->numHighBits); // current high bit
while( n && i ){
limit = (incIntArray->highLimits)[k];
if( i < limit ){ // bit 0, and advance to limit (2*k+1)
k <<= 1;
k++;
} else { // bit 1, and advance to limit (2*(k+1))
k++;
k <<= 1;
i -= limit; // fix array position relative to the current limit
number++; // the number has a bit 1 at this position
}
n--;
number <<= 1; // go to next bit of number
}
number <<= n; // if we didn't check all high bits, shift the number correctly
return ( ( number << (incIntArray->numLowBits) ) | GetPackedNumber((incIntArray->lowBits),pos) );
}
void SetPackedIncreasingNumber(PackedIncreasingNumberArray *incIntArray, unsigned int pos, unsigned int num){
unsigned char n;
unsigned int mask, k;
mask = (1UL << ((incIntArray->numTotalBits)-1)); // mask for the highest/leftmost bit of the number
k = 0; // current position in the limits array
n = (incIntArray->numHighBits); // current high bits count
while( n > 0 ){
if( num & mask ){ // bit 1, and advance to limit (2*(k+1))
k++;
k <<= 1;
} else { // bit 0, and advance to limit (2*k+1)
((incIntArray->highLimits)[k])++; // increase limit count
k <<= 1;
k++;
}
n--;
mask >>= 1; // process next bit at the right
}
SetPackedNumber((incIntArray->lowBits),pos,(num & (incIntArray->lowBitsMask)));
}
// variables used by induced sort functions
int *nextId;
int *topLMS, *bottomLML;
int *firstId, *lastId;
unsigned int *suffixArray;
int maxAlphabetSize;
// variable used to store final BWT of the initial text
PackedNumberArray *bwtArray;
//#define GETCHAR(text,n,level) ( (level) ? ( ((int *)text)[n] ) : ( (int)letterIds[ (int)((char *)text)[n] ] ) )
#define SETBIT(bitarray,n) ( bitarray[(n>>5)] |= ( 1UL << (n&31) ) )
#define GETBIT(bitarray,n) ( bitarray[(n>>5)] & ( 1UL << (n&31) ) )
#define GETCHARTYPE(bitarray,n) ( (n==0) ? ( GETBIT(bitarray,n)?'s':'L' ) : ( GETBIT(bitarray,n) ? (GETBIT(bitarray,(n-1))?'s':'S') : (GETBIT(bitarray,(n-1))?'L':'l') ) )
// S-type: bit 1 at pos not 0 and with bit 0 at pos (n-1)
#define IS_S_TYPE(bitarray,n) ( GETBIT(bitarray,n) && ( (n!=0) && !GETBIT(bitarray,(n-1)) ) )
/*
// s-type: bit 1 at pos 0 or with bit 1 at pos (n-1)
#define IS_s_TYPE(bitarray,n) ( GETBIT(bitarray,n) && ( (n==0) || GETBIT(bitarray,(n-1)) ) )
// L-type: bit 0 at pos 0 or with bit 1 at pos (n-1)
#define IS_L_TYPE(bitarray,n) ( !GETBIT(bitarray,n) && ( (n==0) || GETBIT(bitarray,(n-1)) ) )
// l-type: bit 0 at pos not 0 and with bit 0 at pos (n-1)
#define IS_l_TYPE(bitarray,n) ( !GETBIT(bitarray,n) && ( (n!=0) && !GETBIT(bitarray,(n-1)) ) )
*/
void PrintISState( char *header, PackedNumberArray *text , unsigned int *posLMS , unsigned int *charTypes , unsigned int textSize , int alphabetSize , unsigned int *usedLMS, int level ){
unsigned int n;
int i,j,k;
if(topLMS[0]!=(-1)){ // S-type suffixes
printf("\n%s\n",header);
for(i=0;i<16;i++){ putchar('-'); } putchar('\n');
k=0;
for(j=0;j<alphabetSize;j++){
i=topLMS[j];
while(i!=(-1)){
n=posLMS[i];
printf("[%02d]{%02d} %c %02u %c",k,i,GETCHARTYPE(charTypes,n),n,((usedLMS && GETBIT(usedLMS,i))?'*':' '));
if(level){
printf("(%02u)",GetPackedNumber(text,n)); n++;
if(n==textSize) n=0;
while(GETCHARTYPE(charTypes,n)!='S'){ printf("(%02u)",GetPackedNumber(text,n)); n++; }
printf("(%02u)",GetPackedNumber(text,n));
} else {
printf("%c",letterChars[GetPackedNumber(text,n)]); n++;
if(n==textSize) n=0;
while(GETCHARTYPE(charTypes,n)!='S'){ printf("%c",letterChars[GetPackedNumber(text,n)]); n++; }
printf("%c",letterChars[GetPackedNumber(text,n)]);
}
putchar('\n');
i=nextId[i];
k++;
}
for(i=0;i<16;i++){ putchar('-'); } putchar('\n');
}
} else { // L-type suffixes
printf("\n%s\n",header);
for(i=0;i<16;i++){ putchar('-'); } putchar('\n');
k=0;
for(j=alphabetSize;j!=0;){
j--;
i=bottomLML[j];
while(i!=(-1)){
n=posLMS[i];
printf("[%02d]{%02d} %c %02u %c",k,i,GETCHARTYPE(charTypes,n),n,((usedLMS && GETBIT(usedLMS,i))?'*':' '));
if(level){
printf("(%02u)",GetPackedNumber(text,n)); n++;
if(n==textSize) n=0;
while(GETCHARTYPE(charTypes,n)!='S'){ printf("(%02u)",GetPackedNumber(text,n)); n++; }
printf("(%02u)",GetPackedNumber(text,n));
} else {
printf("%c",letterChars[GetPackedNumber(text,n)]); n++;
if(n==textSize) n=0;
while(GETCHARTYPE(charTypes,n)!='S'){ printf("%c",letterChars[GetPackedNumber(text,n)]); n++; }
printf("%c",letterChars[GetPackedNumber(text,n)]);
}
putchar('\n');
i=nextId[i];
k++;
}
for(i=0;i<16;i++){ putchar('-'); } putchar('\n');
}
}
}
// NOTE: if fillSA is set, the suffixArray variable is filled too
// NOTE: if fillSA is set and the level is 0, the BWT (of the initial text) is created instead
void InducedSort( PackedNumberArray *text , unsigned int *posLMS , unsigned int *charTypes , unsigned int textSize , int alphabetSize , int fillSA, int level ){
int i, j, k, m;
unsigned int n;
unsigned int *bucketSize, *bucketPointer;
char processingType, currentType;
k=0; // just so compiler does not complain
bucketSize=NULL;
bucketPointer=NULL;
if( fillSA ){ // if we want to fill the suffix array while doing the induced sorting, set pointers to the beginning of the buckets
bucketSize = (unsigned int *)malloc(alphabetSize*sizeof(unsigned int));
for( j = 0 ; j < alphabetSize ; j++ ) bucketSize[j] = 0; // reset bucket size
for( n = 0 ; n < textSize ; n++ ) bucketSize[ GetPackedNumber(text,n) ]++; // count number of each alphabet letter in the text (size of each bucket)
bucketPointer = (unsigned int *)malloc(alphabetSize*sizeof(unsigned int)); // pointer to the location in the suffixArray that will be filled up next
bucketPointer[0] = 0;
for( j = 1 ; j < alphabetSize ; j++ ) bucketPointer[j] = ( bucketPointer[(j-1)] + bucketSize[(j-1)] ); // points to the first position in each bucket
}
for( j = 0 ; j < alphabetSize ; j++ ){ // reset pointers that will be used
bottomLML[j] = (-1);
firstId[j] = (-1);
lastId[j] = (-1);
}
#ifdef DEBUG_BWTIS
if( textSize<100 && level<5 && !fillSA ) PrintISState("[->S]",text,posLMS,charTypes,textSize,alphabetSize,NULL,level);
#endif
for( j = 0 ; j < alphabetSize ; j++ ){ // induce sort L-type chars from S-type chars
processingType = 'l';
i = firstId[j]; // process l-type chars at the top of the bucket
L_from_S:
while( i != (-1) ){
posLMS[i]--; // go backwards
n = posLMS[i]; // position in the text
currentType = GETCHARTYPE(charTypes,n); // char type at this position
m = GetPackedNumber(text,n); // char at this position
if( fillSA ){ // set this suffix position at the correct position in the top of its bucket inside the suffix array
if(level) suffixArray[ bucketPointer[m] ] = n;
else { // at level 0, fill the BWT array
if( n == 0 ) n = textSize;
n--; // position behind the current one
n = GetPackedNumber(text,n);
SetPackedNumber(bwtArray,(bucketPointer[m]),n);
}
bucketPointer[m]++;
}
if( currentType == 'l' ){ // add to bottom of l-type list at the top of the bucket
if( firstId[m] == (-1) ) firstId[m] = i;
else nextId[ lastId[m] ] = i;
k = nextId[i]; // save next id here because if n==j and it was the last id, the next ptr was updated in the line above
nextId[i] = (-1);
lastId[m] = i;
} else if( currentType == 'L' ){ // add to bottom of L-type list at the top of the bucket
k = nextId[i]; // save next id
nextId[i] = bottomLML[m];
bottomLML[m] = i;
}
i = k; // next id
}
if( processingType == 'l' ){
processingType = 'S';
i = topLMS[j]; // process S-type chars at the bottom of the bucket
goto L_from_S;
}
}
if( fillSA ){ // if we want to fill the suffix array while doing the induced sorting, set pointers to the end of the buckets
bucketPointer[0] = bucketSize[0];
for( j = 1 ; j < alphabetSize ; j++ ) bucketPointer[j] = ( bucketPointer[(j-1)] + bucketSize[j] ); // total number of positions at and before this bucket
for( j = 0 ; j < alphabetSize ; j++ ) bucketPointer[j]--; // points to the last position in each bucket
}
for( j = 0 ; j < alphabetSize ; j++ ){ // reset pointers that will be used
topLMS[j] = (-1);
firstId[j] = (-1);
lastId[j] = (-1);
}
#ifdef DEBUG_BWTIS
if( textSize<100 && level<5 && !fillSA ) PrintISState("[S->L]",text,posLMS,charTypes,textSize,alphabetSize,NULL,level);
#endif
for( j = alphabetSize ; j != 0 ; ){ // induce sort S-type chars from L-type chars
j--; // process from bottom to top
processingType = 's';
i = firstId[j]; // process s-type chars at the bottom of the bucket
S_from_L:
while( i != (-1) ){
if( posLMS[i] == 0 ){ // first position in text
posLMS[i] = (textSize-1); // last position in text (it is the '$' symbol, which is an S-type char)
if( fillSA ){ // it corresponds to the position of the terminal symbol
if(level) suffixArray[ bucketPointer[0] ] = posLMS[i];
else { // at level 0, fill the BWT array
n = ( posLMS[i] - 1 ); // position (textSize-2)
n = GetPackedNumber(text,n);
SetPackedNumber(bwtArray,(bucketPointer[0]),n);
}
}
topLMS[0] = i; // it goes to the first and only position in the bucket of the 0-th alphabet char
k = nextId[i]; // save next id
nextId[i] = (-1);
i = k;
continue;
}
posLMS[i]--; // go backwards
n = posLMS[i]; // position in the text
currentType = GETCHARTYPE(charTypes,n); // char type at this position
m = GetPackedNumber(text,n); // char at this position
if( fillSA ){ // set this suffix position at the correct position in the bottom of its bucket inside the suffix array
if(level) suffixArray[ bucketPointer[m] ] = n;
else { // at level 0, fill the BWT array
if( n == 0 ) n = textSize;
n--; // position behind the current one
n = GetPackedNumber(text,n);
SetPackedNumber(bwtArray,(bucketPointer[m]),n);
}
bucketPointer[m]--;
}
if( currentType == 's' ){ // add to top of s-type list at the bottom of the bucket
if( firstId[m] == (-1) ) firstId[m] = i;
else nextId[ lastId[m] ] = i;
k = nextId[i]; // save next id here because if n==j and it was the last id, the next ptr was updated in the line above
nextId[i] = (-1);
lastId[m] = i;
} else if( currentType == 'S' ){ // add to top of S-type list at the bottom of the bucket
k = nextId[i]; // save next id
nextId[i] = topLMS[m];
topLMS[m] = i;
}
i = k; // next id
}
if( processingType == 's' ){
processingType = 'L';
i = bottomLML[j]; // process L-type chars at the top of the bucket
goto S_from_L;
}
}
if( fillSA ){
free(bucketSize);
free(bucketPointer);
}
/*
#ifdef DEBUG_BWTIS
if( textSize<100 && level<5 && !fillSA ) PrintISState("[L->S]",text,posLMS,charType,textSize,alphabetSize,NULL,level);
#endif
*/
}
// TODO: re-enable check until next S-type char inclusive
// TODO: call recursion with used LMS's only
// TODO: use suffixArray of size (numLMS) instead of keeping pointers and directly update as the recursion output
void BWTIS( PackedNumberArray *text , unsigned int textSize , int alphabetSize , int level ){
unsigned int *charTypes;
unsigned int *posLMS;
PackedNumberArray *orderArray;
int numLMS;
int numUniqueLMS;
int i, j, k, m;
unsigned int n;
unsigned int posOne, posTwo;
unsigned int mask;
/*
int numUsedLMS;
unsigned int *repeatedLMS, *usedLMS, *sortedUsedLMS;
*/
printf("\n(%u:%d)",textSize,alphabetSize);fflush(stdout);
n = ( (textSize >> 5) + 1); // number of 32bit words that fit inside the textSize
charTypes = (unsigned int *)calloc(n,sizeof(unsigned int)); // bit array with 0 for L-type chars and 1 for S-type chars
n = ( textSize - 1); // last position of the text
m = ( n >> 5 ); // word position inside charTypes array for current text position
mask = ( 1UL << ( n & 31 ) ); // bit mask for current position inside word
charTypes[m] = mask; // last char of the text is S-type
j = GetPackedNumber(text,n); // last char
k = 1; // type of last char (1 for S-type and 0 for L-type)
numLMS = 0; // current number of LMS (last char is S*-type but it will only be counted next)
while( n != 0 ){ // process text in reverse from end to start
n--; // previous position
mask >>= 1; // shift mask
if( mask == 0 ){ // multiple of 32
m--; // previous word
//charTypes[m] = 0UL; // reset word bits
mask = ( 1UL << 31 ); // re-initialize mask
}
i = GetPackedNumber(text,n); // current char
if( i < j ) k = 1; // lower suffix, S-type char
else if ( i > j ){ // higher suffix, L-type char
if( k ) numLMS++; // if last char was S-type and this one is L-type, last one was S*-type
k = 0;
} // else (i==j), so use last used type
if( k ) charTypes[m] |= mask; // set bit for S-type char
j = i; // current char will be previous char on next step
}
/*
charType = (char *)malloc(textSize*sizeof(char));
n = (textSize-1);