/
transtermhp.py
75 lines (66 loc) · 2.72 KB
/
transtermhp.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
#!/usr/bin/env python
import re
import subprocess
import sys
from BCBio import GFF
from Bio import SeqIO
from Bio.SeqFeature import (
FeatureLocation,
SeqFeature
)
def main(expterm, fasta, gff3):
with open(fasta, 'r') as handle:
seq_dict = SeqIO.to_dict(SeqIO.parse(handle, "fasta"))
# Build coords file
with open(gff3, 'r') as handle:
for rec in GFF.parse(handle, base_dict=seq_dict):
with open('tmp.coords', 'w') as coords:
for feat in rec.features:
if feat.type == 'gene':
coords.write('\t'.join([
feat.id,
str(feat.location.start + 1),
str(feat.location.end),
rec.id,
]) + '\n')
with open('tmp.fasta', 'w') as fasta_handle:
SeqIO.write(rec, fasta_handle, 'fasta')
cmd = ['transterm', '-p', expterm, fasta, 'tmp.coords']
output = subprocess.check_output(cmd)
# TERM 1 4342 - 4366 + F 93 -11.5 -3.22878 | opp_overlap 4342, overlap 4340 4357
ttre = re.compile(
r'^ (?P<name>.*) (?P<start>\d+) - (?P<end>\d+)\s+'
r'(?P<strand>[-+])\s+(?P<loc>[GFRTHNgfr]+)\s+'
r'(?P<conf>\d+)\s+(?P<hp>[0-9.-]+)\s+(?P<tail>[0-9.-]+)'
)
rec.features = []
batches = output.split('SEQUENCE ')
for batch in batches[1:]:
batch_lines = batch.split('\n')
# Strip the header
interesting = batch_lines[2:]
unformatted = [x for x in interesting if x.startswith(' ')][0::2]
for terminator in unformatted:
m = ttre.match(terminator)
if m:
start = int(m.group('start')) - 1
end = int(m.group('end'))
if m.group('strand') == '+':
strand = 1
else:
strand = 0
feature = SeqFeature(
FeatureLocation(start, end),
type="terminator",
strand=strand,
qualifiers={
"source": "TransTermHP_2.09",
"score": m.group('conf'),
"ID": m.group('name'),
}
)
rec.features.append(feature)
yield rec
if __name__ == '__main__':
for record in main(*sys.argv[1:4]):
GFF.write([record], sys.stdout)