-
Notifications
You must be signed in to change notification settings - Fork 0
/
intron-exon_from-blastplus.py
executable file
·118 lines (89 loc) · 3.85 KB
/
intron-exon_from-blastplus.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
User Commands
\033[1mSYNOPSIS\033[0m
Parses an output file from a blast on NCBI's website (result file : txt)
to find the exon-intron breaks. The sequences blasted against NCBI can
be of any type (e.g contigs from a de novo assembly) and they can be from
gDNA or cDNA. The script verifies the positions on each contig that
blast for an NCBI sequence and writes it down in an output file. The pos-
itions that blast for a protein in genbank (swissprot or nr) are necessa-
rily coding regions and this is the premiss on which the program is based.
\033[1mUSAGE\033[0m
%program <blast_output> <e-value threshold (int)> <output>
\033[1mINPUT FILE - FORMAT\033[0m
Blast output
************
It has to be either the blast result file (.txt) from NCBI's website OR
the blast result file from BLAST+ (e.g local blast, swissprot, nr, nt
or custom database) in FORMAT 0 (default).
The program will look for the best blast result to output the exon-intron
breaks, though it is possible to give it an input file in which ALL the
blast results are good and must be kept (e.g if a set of contigs or se-
quences from an exon capture chip have been blasted against the target se-
quences). The e-value to give as input information must follow the format:
[0-9]*e-[0-9]*
e.g : 6e-38
\033[1mOUTPUT FILE - FORMAT\033[0m
Contig Exons
contig_455 45-267; 897-1203
contig_1434 670-830;
... ...
\033[1mCREDITS\033[0m
Doctor Pants 2012 \m/
"""
import sys
import re
try:
blast_in = sys.argv[1]
evalue = float(sys.argv[2])
out_file = sys.argv[3]
except:
print __doc__
sys.exit(2)
contigs = {}
contig = ""
target = False
started = False
with open(blast_in, "rU") as in_f:
for line in in_f:
line = line.strip()
if line.startswith("Query="):
contig = line.split("Query= ")[1].split("_")[0] + "_" + str("%05i" % \
int(line.split("Query= ")[1].split("_")[1]))
contigs[contig] = set()
started = False
target = False
elif line.startswith("> "):
if target:
started = False
elif target == False:
target = True
elif re.findall("Expect =", line) != [] and target:
current_eval = float(line.split()[-1])
if current_eval <= evalue:
started = True
elif re.findall("Query [0-9]*", line) != [] and started:
pos1 = int(re.split(" [0-9]* ", line)[1])
pos2 = int(re.split(" [0-9]* ", line)[-1])
for pos in range(pos1,pos2+1):
contigs[contig].add(pos)
exon = set()
with open(out_file, "w") as out_f:
out_f.write("Contig" + "\t" + "Exon positions")
for contig in list(sorted(contigs)):
out_f.write("\n" + contig + "\t")
lgth = len(contigs[contig])
first_pos = 0
for i in range(0,lgth):
if i == 0:
first_pos = sorted(contigs[contig])[i]
elif sorted(contigs[contig])[i] == sorted(contigs[contig])[i-1] + 1:
pass
elif sorted(contigs[contig])[i] > sorted(contigs[contig])[i-1] + 1:
out_f.write(str(first_pos) + "-" + str(sorted(contigs[contig])[i-1]) + ";")
first_pos = sorted(contigs[contig])[i]
if i == (lgth - 1):
out_f.write(str(first_pos) + "-" + str(sorted(contigs[contig])[i]) + ";")
print "\nFile processed\n"