-
Notifications
You must be signed in to change notification settings - Fork 0
/
intron-exon_from-blastx.py
executable file
·159 lines (115 loc) · 5.96 KB
/
intron-exon_from-blastx.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
User Commands
\033[1mSYNOPSIS\033[0m
Parses an output file from a blast on NCBI's website (result file : txt)
to find the exon-intron breaks. The sequences blasted against NCBI can
be of any type (e.g contigs from a de novo assembly) and they can be from
gDNA or cDNA. The script verifies the positions on each contig that
blast for an NCBI sequence and writes it down in an output file. The pos-
itions that blast for a protein in genbank (swissprot or nr) are necessa-
rily coding regions and this is the premiss on which the program is based.
\033[1mUSAGE\033[0m
%program <blast_output> <output>
\033[1mINPUT FILE - FORMAT\033[0m
Blast output
************
It has to be either the blast result file (.txt) from \033[1mNCBI's website\033[0m OR
the blast result file from \033[1mBLAST+\033[0m (e.g local blast, swissprot, nr, nt
or custom database) in FORMAT 0 (default).
The program will look for the best blast result to output the exon-intron
breaks. It looks for the best e-value of all the hits for a given query
and uses its blasting positions for the final output.
\033[1mOUTPUT FILE - FORMAT\033[0m
Contig Exons
contig_455 45-267; 897-1203
contig_1434 670-830;
... ...
\033[1mCREDITS\033[0m
Doctor Pants 2012 \m/
"""
import sys
import re
try:
blast_in = sys.argv[1]
out_file = sys.argv[2]
except:
print __doc__
sys.exit(2)
queries = {}
query = ""
started = False
with open(blast_in, "rU") as in_f:
for line in in_f:
line = line.strip()
if line.startswith("Query="):
if query != "":
to_discard = set()
if len(queries[query]) > 1:
for hit in list(sorted(queries[query])):
if hit not in to_discard:
current_evalue = queries[query][hit]["evalue"]
for compared_hit in list(sorted(queries[query])):
if compared_hit != hit:
if compared_hit not in to_discard:
compared_evalue = queries[query][compared_hit]["evalue"]
if current_evalue < compared_evalue:
to_discard.add(compared_hit)
elif current_evalue > compared_evalue:
to_discard.add(hit)
elif current_evalue == compared_evalue:
if len(queries[query][hit]["positions"]) < len(queries[query][compared_hit]["positions"]):
to_discard.add(compared_hit)
elif len(queries[query][hit]["positions"]) > len(queries[query][compared_hit]["positions"]):
to_discard.add(hit)
else:
to_discard.add(hit)
for hit_to_discard in to_discard:
queries[query].pop(hit_to_discard)
elif len(queries[query]) == 1:
pass
query = line.split("Query= ")[1]
queries[query] = {}
elif line.startswith("> "):
hit = line.split("> ")[1]
started = False
queries[query][hit] = {}
elif re.findall("Expect", line) != []:
if started == False:
evalue = float(line.split()[-1])
queries[query][hit]["evalue"] = evalue
queries[query][hit]["positions"] = set()
started = True
elif started:
pass
elif re.findall("Query [0-9]*", line) != []:
pos_sorting = []
pos_sorting.append(int(line.split()[1]))
pos_sorting.append(int(line.split()[-1]))
pos1 = sorted(pos_sorting)[0]
pos2 = sorted(pos_sorting)[1]
for pos in range(pos1,pos2+1):
queries[query][hit]["positions"].add(pos)
exon = set()
with open(out_file, "w") as out_f:
out_f.write("Gene" + "\t" + "Exon positions")
for query in list(sorted(queries)):
out_f.write("\n" + query + "\t")
if queries[query] != {}:
for hit in queries[query]:
lgth = len(queries[query][hit]["positions"])
first_pos = 0
for i in range(0,lgth):
if i == 0:
first_pos = sorted(queries[query][hit]["positions"])[i]
elif sorted(queries[query][hit]["positions"])[i] == sorted(queries[query][hit]["positions"])[i-1] + 1:
pass
elif sorted(queries[query][hit]["positions"])[i] > sorted(queries[query][hit]["positions"])[i-1] + 1:
out_f.write(str(first_pos) + "-" + str(sorted(queries[query][hit]["positions"])[i-1]) + ";")
first_pos = sorted(queries[query][hit]["positions"])[i]
if i == (lgth - 1):
out_f.write(str(first_pos) + "-" + str(sorted(queries[query][hit]["positions"])[i]) + ";")
elif queries[query] == {}:
out_f.write("no exons found")
print "\nFile processed\n"