/
select_intervals_in_callsTab.py
130 lines (102 loc) · 3.77 KB
/
select_intervals_in_callsTab.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
#!/usr/bin/env python
'''
This script extracts lines from a calls file according to scaffold name, start and end positions.
Every interval can be output into an separate file (see the option -t).
calls.file:
#CHROM POS sample1 sample2 sample3 sample4
scaffold_1 113 G G G N
scaffold_1 117 C C N C
scaffold_1 122 C C N C
scaffold_1 137 A A T N
scaffold_1 139 T T T N
scaffold_1 148 A A T N
scaffold_1 161 C C C T
scaffold_1 170 C T C N
scaffold_1 174 A A A N
scaffold_2 13 G G G T
scaffold_2 17 C C C C
scaffold_2 22 C C G C
scaffold_2 27 A T T N
scaffold_2 29 T C T C
scaffold_2 38 A A T T
scaffold_2 111 C C C T
scaffold_2 140 C T C N
scaffold_2 178 A A A N
list.bed:
#CHROM start end
scaffold_1 130 150
scaffold_2 10 80
output.file0:
#CHROM POS sample1 sample2 sample3 sample4
scaffold_1 137 A A T N
scaffold_1 139 T T T N
scaffold_1 148 A A T N
output.file1:
#CHROM POS sample1 sample2 sample3 sample4
scaffold_2 13 G G G T
scaffold_2 17 C C C C
scaffold_2 22 C C G C
scaffold_2 27 A T T N
scaffold_2 29 T C T C
scaffold_2 38 A A T T
command:
python select_intervals.py -i calls.file -l list.bed -o output.file -t separate
contact:
Dmytro Kryvokhyzha dmytro.kryvokhyzha@evobio.eu
'''
############################# modules #############################
import calls # my custom module
############################# options #############################
parser = calls.CommandLineParser()
parser.add_argument('-i', '--input', help = 'name of the input file', type=str, required=True)
parser.add_argument('-o', '--output', help = 'name of the output file', type=str, required=True)
parser.add_argument('-l', '--list_names', help = 'file containing list of scaffolds and start-end positions', type=str, required=True)
parser.add_argument('-t', '--output_type', help = 'whether output should be in one file (one) or separate output for every interval (separate)', type=str, required=False)
args = parser.parse_args()
############################# program #############################
if not args.output_type:
output_type = "one"
elif args.output_type in ["one", "separate"]:
output_type = args.output_type
else:
raise IOError('The output type should be either "one" or "separate". You specified %s' % args.output_type)
scaflist = open(args.list_names, "r")
header_scaf = scaflist.readline()
IntervalWords = scaflist.readline().split()
IntervalScaf = int(IntervalWords[0].split('chr')[-1])
IntervalStart = int(IntervalWords[1])
IntervalEnd = int(IntervalWords[2])
if output_type == "separate":
interval_count = 0
else:
interval_count = ""
output = open(args.output+str(interval_count), 'w')
counter = 0
with open(args.input) as datafile:
header = datafile.readline()
output.write("%s" % header)
for line in datafile:
words = line.split()
scafCalls = int(words[0].split('chr')[-1])
posCalls = int(words[1])
while (scafCalls > IntervalScaf) or (scafCalls == IntervalScaf and posCalls > IntervalEnd):
IntervalWords = scaflist.readline().split()
if IntervalWords == []:
break
else:
if output_type == "separate":
interval_count+=1
output.close()
output = open(args.output+str(interval_count), 'w')
output.write("%s" % header)
IntervalScaf = int(IntervalWords[0].split('chr')[-1])
IntervalStart = int(IntervalWords[1])
IntervalEnd = int(IntervalWords[2])
if scafCalls == IntervalScaf and posCalls >= IntervalStart and posCalls <= IntervalEnd:
output.write(line)
counter += 1
if counter % 1000000 == 0:
print str(counter), "lines processed"
datafile.close()
output.close()
print('Done!')