-
Notifications
You must be signed in to change notification settings - Fork 3
/
bigbwt
executable file
·192 lines (168 loc) · 8.02 KB
/
bigbwt
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
#!/usr/bin/env python3
import sys, time, argparse, subprocess, os.path
Description = """
Tool to build the BWT for higly repetive files using the approach
described in
"Prefix-Free Parsing for Building Big BWTs"
by Christina Boucher, Travis Gagie, Alan Kuhnle and Giovanni Manzini
Proc. WABI '18 (http://drops.dagstuhl.de/opus/volltexte/2018/9304/)
The input file cannot contain the characters 0, 1 or 2 which are
used internally by the algorithm. The character 0 is used as the EOF
in the output BWT. The dictionary and the parse should not be larger than 2GB.
Input files larger than 2GB are ok, but computing the BWT in the traditional way
takes 9n bytes of RAM. If this is a problem, just don't use the option -c and
check the correctness of the BWT by some other means (for example inverting it)
"""
# pf bwt commands
dir_bigbwt = os.path.dirname(os.path.abspath(__file__))
parseNT_exe = os.path.join(dir_bigbwt, "newscanNT.x")
parsetunneling = os.path.join(dir_bigbwt, "tfm_index_construct.x")
pfwg_exe = os.path.join(dir_bigbwt, "pfwg.x")
tunnelinginvert = os.path.join(dir_bigbwt, "tfm_index_invert.x")
parse_exe = ""
parsebwt_exe = ""
parsebwt_exe64 = ""
pfbwt_exe = ""
pfbwtNT_exe = ""
pfbwt_exe64 = ""
pfbwtNT_exe64 = ""
bwt_exe = ""
bwt_exe64 = ""
shasum_exe = "sha256sum"
def main():
parser = argparse.ArgumentParser(description=Description, formatter_class=argparse.RawTextHelpFormatter)
parser.add_argument('input', help='input file name', type=str)
parser.add_argument('-w', '--wsize', help='sliding window size (def. 10)', default=10, type=int)
parser.add_argument('-p', '--mod', help='hash modulus (def. 100)', default=100, type=int)
parser.add_argument('-t', help='number of helper threads (def. None)', default=0, type=int)
parser.add_argument('-k', help='keep temporary files',action='store_true')
parser.add_argument('-v', help='verbose',action='store_true')
parser.add_argument('-f', help='read fasta',action='store_true')
parser.add_argument('-i', help='invert the tunneled WG', action='store_true')
parser.add_argument('--sum', help='compute output files sha256sum',action='store_true')
parser.add_argument('--parsing', help='stop after the parsing phase (debug only)',action='store_true')
parser.add_argument('--compress', help='compress output of the parsing phase (debug only)',action='store_true')
args = parser.parse_args()
if args.f and args.t > 0 and (".fq" in args.input or ".fastq" in args.input or ".fnq" in args.input):
print("bigbwt does not current support FASTQ format! Exiting...")
return
logfile_name = args.input + ".log"
# get main bigbwt directory
args.bigbwt_dir = os.path.split(sys.argv[0])[0]
print("Sending logging messages to file:", logfile_name)
with open(logfile_name,"a") as logfile:
# ---------- parsing of the input file
start0 = start = time.time()
if args.t>0:
command = "{exe} {file} -w {wsize} -p {modulus} -t {th}".format(
exe = os.path.join(args.bigbwt_dir,parse_exe),
wsize=args.wsize, modulus = args.mod, th=args.t, file=args.input)
else:
command = "{exe} {file} -w {wsize} -p {modulus}".format(
exe = os.path.join(args.bigbwt_dir,parseNT_exe),
wsize=args.wsize, modulus = args.mod, file=args.input)
if args.v: command += " -v"
if args.parsing or args.compress: command += " -c"
if args.f: command += " -f"
print("==== Parsing. \nCommand:", command)
if(execute_command(command,logfile,logfile_name)!=True):
return
print("Elapsed time: {0:.4f}".format(time.time()-start))
if args.parsing:
# delete temporary parsing files
command = "rm -f {file}.parse_old {file}.last {file}.occ".format(file=args.input)
if(execute_command(command,logfile,logfile_name)!=True):
return
print("==== Stopping after the parsing phase as requested")
return
elif args.compress:
# save parsing files
start = time.time()
command = "tar -cJf {file}.parse.txz {file}.parse {file}.dicz".format(file=args.input)
print("==== Compressing. \nCommand:", command)
if(execute_command(command,logfile,logfile_name,env={"XZ_OPT":"-9"})!=True):
return
print("Elapsed time: {0:.4f}".format(time.time()-start))
delete_temp_files(args,logfile,logfile_name)
print("==== Done: Parsing output xz-compressed as requested")
return
# ----------- computation of the tunneled WG of the parsing
command = "{exe} {file}.parse {file}.tunnel".format(
exe=os.path.join(dir_bigbwt, parsetunneling), file=args.input
)
print("==== WG Tunneling. \nCommand:", command)
start = time.time()
if(execute_command(command,logfile,logfile_name)!=True):
return
print("Elapsed time: {0:.4f}".format(time.time()-start));
# ----------- compute final tunneled WG using dictionary and a tunneled WG of parse
start = time.time()
if(os.path.getsize(args.input+".dict") >= (2**31-4) ):
print("The 64-bit version not yet supported, sorry!")
exit()
else: # 32 bit version
command = "{exe} -w {wsize} {file}".format(
exe = os.path.join(args.bigbwt_dir,pfwg_exe),
wsize=args.wsize, file=args.input)
print("==== Computing final tunneled WG. \nCommand:", command)
if(execute_command(command,logfile,logfile_name)!=True):
return
print("Elapsed time: {0:.4f}".format(time.time()-start))
print("Total construction time: {0:.4f}".format(time.time()-start0))
# ---- compute sha256sum
if args.sum:
digest = file_digest(args.input +".L",logfile)
print("L {exe}: {digest}".format(exe=shasum_exe, digest=digest))
digest = file_digest(args.input +".in",logfile)
print("Din {exe}: {digest}".format(exe=shasum_exe, digest=digest))
digest = file_digest(args.input +".dout",logfile)
print("Dout {exe}: {digest}".format(exe=shasum_exe, digest=digest))
# ---- delete intermediate files
delete_temp_files(args,logfile,logfile_name)
# --- start checking ---
if args.i:
start = time.time()
command = "{exe} {file}".format(
exe = os.path.join(args.bigbwt_dir, tunnelinginvert), file=args.input)
print("==== Inverting the tunneled WG. \nCommand:", command)
if(execute_command(command,logfile,logfile_name)!=True):
return
print("Elapsed time: {0:.4f}".format(time.time()-start));
command = "cmp {file} {file}.untunneled".format(file=args.input);
print("==== Comparing the file with the inverted WG. \nCommand:", command);
if(execute_command(command,logfile,logfile_name)):
print("OK");
else:
print("Inverted file differs");
return 1
# --- end checking ---
print("==== Done")
# delete intermediate files
def delete_temp_files(args,logfile,logfile_name):
if args.k==False:
print("==== Deleting temporary files.") # no need to show the command
command = "rm -f {file}.parse {file}.parse_old {file}.dict {file}.ilist {file}.occ {file}.tunnel".format(file=args.input)
if(execute_command(command,logfile,logfile_name)!=True):
return
# compute hash digest for a file
def file_digest(name,logfile):
try:
hash_command = "{exe} {infile}".format(exe=shasum_exe, infile=name)
hashsum = subprocess.check_output(hash_command.split(),stderr=logfile)
hashsum = hashsum.decode("utf-8").split()[0]
except:
hashsum = "Error!"
return hashsum
# execute command: return True is everything OK, False otherwise
def execute_command(command,logfile,logfile_name,env=None):
try:
#subprocess.run(command.split(),stdout=logfile,stderr=logfile,check=True,env=env)
subprocess.check_call(command.split(),stdout=logfile,stderr=logfile,env=env)
except subprocess.CalledProcessError:
print("Error executing command line:")
print("\t"+ command)
print("Check log file: " + logfile_name)
return False
return True
if __name__ == '__main__':
main()