/
sam2blast
executable file
·96 lines (93 loc) · 3.27 KB
/
sam2blast
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
#!/usr/bin/env python3
#@(#)sam2blast 2022-08-17 last modified by A.J.Travis
"""
note on sam flags:
0 = mapped to forward strand
16 = mapped to reverse strand
4 = unmapped
256 = secondary alignment to forward strand
272 = secondary alignment to reverse strand
2048 = ? bwa forward strand
2064 = ? bwa reverse strand
"""
import sys
import re
from math import exp, log
def main(infile):
dblen = 0
for line in open(infile):
li = line.strip()
if not li.startswith("@"):
arr = li.split()
if (int(arr[1]) in [0,256]):
print_line(arr, 'f', dblen)
elif (int(arr[1]) in [16,272]):
print_line(arr, 'r', dblen)
elif (int(arr[1]) == 4):
pass
else:
raise Exception("unknown flag %s" % arr[1])
elif li.startswith("@SQ"):
dblen += int(li.split("LN:")[1])
def print_line(arr, flag, dblen):
cigar, mismatches, gaps, identity = [], 0, 0, 0
cigar_num = re.findall('\d+',arr[5])
cigar_op = re.findall('\D',arr[5])
ref_len = 0
for i in range(len(cigar_op)):
if cigar_op[i] == "M":
[cigar.append(1) for j in range(int(cigar_num[i]))]
ref_len += int(cigar_num[i])
elif cigar_op[i] == "I":
[cigar.append(1) for j in range(int(cigar_num[i]))]
gaps += 1
elif cigar_op[i] == "S":
[cigar.append(0) for j in range(int(cigar_num[i]))]
elif cigar_op[i] == "H":
[cigar.append(0) for j in range(int(cigar_num[i]))]
elif cigar_op[i] == "D":
ref_len += int(cigar_num[i])
gaps += 1
else:
raise Exception("unknown cigar operator %s" % cigar_op[i])
align_start, align_end = 0, len(cigar)
len_read = len(cigar)
for x in cigar:
if x == 1:
break
align_start += 1
for x in reversed(cigar):
if x == 1:
break
align_end -= 1
len_align = align_end - align_start
ref_start = int(arr[3])
if (flag == 'f'):
ref_end = ref_start + ref_len -1
elif (flag == 'r'):
ref_end = ref_start
ref_start = ref_start + ref_len -1
align_start, align_end = len_read - align_end, len_read - align_start
mismatches = int([x for x in arr if x.startswith("NM:i:")]
[0].split("NM:i:")[1])
identity = ((abs(align_end - align_start) - mismatches )/
float(len_align) ) * 100
align_score = int(re.split(r'AS:i:',
[x for x in arr if re.search(r'^AS:i:', x)][0])[1])
#ungapped l, k, h, n, m = 1.33, 0.621, 1.12, dblen, len_read
l, k, h, n, m = 1.28, 0.46, 0.85, dblen, len_read
np = n-log(k*n*m)/h
mp = m-log(k*n*m)/h
if mp < 0:
mp = 0.1
bit_score = ((l*float(align_score))-log(k))/log(2)
# alternative evalue formula, gives 0's for small numbers
# u = (log(k*mp*np))/l
# evalue = 1-exp(-exp(-l*(float(align_score)-u)))
evalue = mp*np*2**-bit_score
print ('%s\t%s\t%.2f\t%i\t%i\t%i\t%i\t%i\t%i\t%i\t%.2g\t%.1f'%(
arr[0], arr[2], identity, len_align,
mismatches, gaps, align_start+1, align_end, ref_start, ref_end,
evalue, bit_score))
if __name__ == "__main__":
main(sys.argv[1])