-
Notifications
You must be signed in to change notification settings - Fork 6
/
gtf.py
executable file
·134 lines (112 loc) · 4.97 KB
/
gtf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
#!/usr/bin/env python
# Copyright (C) 2012-2013 Collin Tokheim
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import csv
import re
import argparse
from itertools import tee, imap
class Gtf(object):
"""
Separates out gtf parsing from iterating over records. Perhaps a flimsy
use of a class.
"""
def __init__(self, gtf_line):
self.gtf_list = gtf_line
self.seqname, self.source, self.feature, self.start, self.end, self.score, self.strand, self.frame, self.attribute = gtf_line # These indexes are defined by the GTF spec
tmp = map(lambda x: re.split('\s+', x.replace('"', '')),
re.split('\s*;\s*', self.attribute.strip().strip(';')))
self.attribute = dict([x for x in tmp if len(x)==2]) # convert attrs to dict
self.start, self.end = int(self.start) - 1, int(self.end)
def __str__(self):
return '\t'.join(self.gtf_list) + '\n'
def gtf_reader(fileObject, delim):
"""
Iterate over a file to extract 'exon' features of tx
"""
for gtf_line in csv.reader(fileObject, delimiter=delim):
gtf = Gtf(gtf_line)
# only use exon features
if gtf.feature.lower() == 'exon':
yield Gtf(gtf_line)
def sort_gtf(file_name, output):
gtf_list = []
with open(file_name) as handle:
for gtf_line in csv.reader(handle, delimiter='\t'):
gtf = Gtf(gtf_line)
if gtf.feature.lower() == 'exon':
gtf_list.append(gtf)
gtf_list.sort(key=lambda x: (x.seqname, x.attribute['gene_id'], x.attribute['transcript_id'], x.start, x.end))
# write the contents back to a file
with open(output, 'wb') as write_handle:
for gtf_obj in gtf_list:
write_handle.write(str(gtf_obj))
def is_sorted(iterable, compare):
"""Returns if iterable is sorted given the definition of
a compare function"""
a, b = tee(iterable)
next(b, None)
return all(imap(compare, a, b))
def gtf_compare(a, b):
"""compare two lines of a GTF file to see if they are
correctly sorted as "a" before "b"."""
tmp_list = [a, b] # proposed sorted order
srt_list = sorted(tmp_list, key=lambda x: (x.seqname,
x.attribute['gene_id'],
x.attribute['transcript_id'],
x.start, x.end)) # actual sorted order
# return flag for whether the two entries were in sorted order
flag = (tmp_list == srt_list)
return flag
def gtf_iter_reader(handle):
"""Iterator yielding GTF objects."""
for gtf_line in csv.reader(handle, delimiter='\t'):
yield Gtf(gtf_line)
def is_gtf_sorted(file_name):
"""Returns Boolean for if gtf is sorted."""
with open(file_name) as handle:
mygtf_reader = gtf_iter_reader(handle)
return is_sorted(mygtf_reader, gtf_compare)
if __name__ == '__main__':
parser = argparse.ArgumentParser(description="""Either performs proper sorting of GTF for PrimerSeq or checks if GTF is sorted.
For Sorting GTF:\npython gtf.py -i unsorted.gtf -o sorted.gtf,
For checking if GTF is sorted:\npython gtf.py -c not_sure_if_sorted.gtf""")
parser.add_argument('-i',
type=str,
default='',
dest='gtf',
action='store',
help='path to gtf file to sort')
parser.add_argument('-o',
type=str,
default='',
dest='output',
action='store',
help='path name of properly sorted gtf')
parser.add_argument('-c',
type=str,
default='',
dest='is_sorted',
action='store',
help='path to gtf file to check if sorted correctly')
args = parser.parse_args()
if args.gtf and args.output:
sort_gtf(args.gtf, args.output) # do the work of sorting
elif args.is_sorted:
if is_gtf_sorted(args.is_sorted):
print '%s is correctly sorted' % (args.is_sorted)
else:
print '%s is not correctly sorted. please sort before use.' % (args.is_sorted)
else:
print 'You must enter either both the -i and -o options or just the -c option.'