-
Notifications
You must be signed in to change notification settings - Fork 0
/
VCF_parser.py
executable file
·314 lines (240 loc) · 14 KB
/
VCF_parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
#!/usr/bin/env python
"""
User Commands
\033[1mSYNPOPSIS\033[0m:
Parses a .VCF file to extract the genotype of all individuals based on a
per-individual coverage threshold.
It produces another .VCF file with the locus that passed the per-individual
coverage threshold and it also produces a text file with the genotype of
each individual for these loci.
\033[1mUSAGE\033[0m:
%program <in_vcf> <per-sample_cov_threshold> <num_samples> <samp_compromise>
<output_parsed_vcf> <output_genotype_file>
samp_compromise = number of samples that will be taken off the total number
of samples in which the coverage threshold has to be respected.
\033[1mOUTPUT GENOTYPE FORMAT\033[0m:
------------------------------------------------------------------------------
Contig Pos Pop1 Pop2 Ind1 Ind2 Ind3 Ind4 ...
contig_12 356 8/16 24/0 A/T A/A A/T T/T ...
...
NB : The popualtion counts are in numbers of alleles, considering DIPLOID samples.
It means that if there is 12 samples/pop, the total count per population, if all
the samples have been successfully sequenced, is 24.
\033[1mDESCRIPTION OF THE FILTERS\033[0m:
In order to keep a SNP and output it in the .VCF file, it has to pass the
coverage and quality filters, i.e it has to be present in at least <num_samples>
minus <samp_compromise> individuals with a per-individual coverage chosen by the
user (<cov_threshold>). Plus, SNPs with a quality of <= 20 are discarded.
\033[1mCREDITS\033[0m:
Doctor Pants 2012 \m/
"""
import sys
try:
in_vcf = sys.argv[1]
cov_threshold = int(sys.argv[2])
num_samples = int(sys.argv[3])
samp_compromise = int(sys.argv[4])
output_vcf = sys.argv[5]
output_genotypes = sys.argv[6]
except:
print __doc__
sys.exit(2)
# Parses the input VCF file and outputs the loci that pass the per-individual cov
# threshold in another VCF file specified in the command line (<output_parsed_vcf>).
samples_good_cov = set()
with open(in_vcf, "rU") as vcf_i:
with open(output_vcf, "w") as vcf_o:
for line in vcf_i:
# Copies the vcf header in the vcf parsed output
if line.startswith("#"):
line = line.strip()
vcf_o.write(line + "\n")
if line.startswith("k24"):
line = line.strip()
# Discards the loci with a SNP quality < 20
if float(line.split("\t")[5]) < 20:
pass
# Discards the major INDELs but keeps the micro-INDELs
elif line.split()[7].split(";")[0] == "INDEL":
if len(line.split()[3]) or len(line.split()[4]) > 2:
pass
elif len(line.split()[3]) and len(line.split()[4]) <= 2:
if int(line.split()[7].split(";")[1].split("=")[1]) >= (cov_threshold*(num_samples-samp_compromise)):
sample_count = num_samples
ind_column = 9
smp_good_cov = 0
# Verifies if every sample respects the coverage threshold
while sample_count > 0:
if int(line.split("\t")[ind_column].split(":")[2]) >= cov_threshold:
smp_good_cov += 1
ind_column += 1
sample_count -= 1
# If there is enough samples that respects the cov. threshold the locus
# is kept and written down in the output file.
if smp_good_cov == (num_samples - samp_compromise):
vcf_o.write(line + "\n")
# Discards complex SNPs (tri-nucleotide)
elif len(line.split("\t")[3]) and len(line.split("\t")[4]) > 2:
pass
# Keeps the loci with a depth of coverage (DP) >= to the coverage threshold
# chosen by the user times the number of samples that have to have that
# coverage.
elif int(line.split("\t")[7].split(";")[0].split("=")[1]) >= (cov_threshold*(num_samples-samp_compromise)):
sample_count = num_samples
ind_column = 9
smp_good_cov = 0
# Verifies if every sample respects the coverage threshold
while sample_count > 0:
if int(line.split("\t")[ind_column].split(":")[2]) >= cov_threshold:
smp_good_cov += 1
ind_column += 1
sample_count -= 1
# If there is enough samples that respects the cov. threshold the locus
# is kept and written down in the output file.
if smp_good_cov >= (num_samples - samp_compromise):
vcf_o.write(line + "\n")
print "\nParsed vcf file created\nNow genotyping..."
loci = {}
smp_col = {}
# Extracts the genotype of the samples in the vcf parsed file
with open(output_vcf, "rU") as parsed_vcf:
for line in parsed_vcf:
if line.startswith("##"):
pass
# Associates the name of the sample to its corresponding column position in a dictionnary
elif line.startswith("#CHROM"):
line = line.strip()
samples = num_samples
ind_column = 9
col_num = 1
while samples > 0:
col = "col" + str("%02i" % col_num)
sp = "C-" + line.split()[ind_column].split(".")[2]
smp_col[col] = sp
ind_column += 1
samples -= 1
col_num += 1
elif line.startswith("k24"):
# Creates the dictionnary containing all the information for every contig
# (SNP position, REF allele, ALT allele, genotype per individual, allele counts per pop.)
contig = line.split()[0].split("_")[2] + "_" + str("%05i" % int(line.split()[0].split("_")[3]))
pos = int(line.split()[1])
loci[contig] = {}
loci[contig][pos] = {}
loci[contig][pos]["alleles"] = {}
loci[contig][pos]["genotypes"] = {}
loci[contig][pos]["normals"] = {}
loci[contig][pos]["normals"]["REF"] = 0
loci[contig][pos]["normals"]["ALT"] = 0
loci[contig][pos]["num_normals"] = 0
loci[contig][pos]["dwarfs"] = {}
loci[contig][pos]["dwarfs"]["REF"] = 0
loci[contig][pos]["dwarfs"]["ALT"] = 0
loci[contig][pos]["num_dwarfs"] = 0
# Writes the REF and ALT alleles in the dictionnary for the locus
loci[contig][pos]["alleles"]["REF"] = line.split()[3]
loci[contig][pos]["alleles"]["ALT"] = line.split()[4]
samples = num_samples
ind_column = 9
col_num = 1
# Goes through every sample column and
while samples > 0:
col = "col" + str("%02i" % col_num)
# uses the dictionnary containing the sample names to associate the genotype
# to the good sample according to the column number.
sp = smp_col[col]
loci[contig][pos]["genotypes"][sp] = []
# If the sample respects the coverage threshold, the genotype is written in the
# dictionnary
if int(line.split()[ind_column].split(":")[2]) >= cov_threshold:
# If the cov. threshold is respected, it adds 1 to the number of individuals
# that passed the threshold. In other words, it counts the number of samples
# that have a sufficient coverage to be genotyped.
if sp.startswith("C-N"):
loci[contig][pos]["num_normals"] += 1
if sp.startswith("C-D"):
loci[contig][pos]["num_dwarfs"] += 1
# First allele is REF
if line.split()[ind_column].split(":")[0].split("/")[0] == "0":
loci[contig][pos]["genotypes"][sp].append(loci[contig][pos]["alleles"]["REF"])
if sp.startswith("C-N"):
loci[contig][pos]["normals"]["REF"] += 1
if sp.startswith("C-D"):
loci[contig][pos]["dwarfs"]["REF"] += 1
# Second allele is ALT
if line.split()[ind_column].split(":")[0].split("/")[0] == "1":
loci[contig][pos]["genotypes"][sp].append(loci[contig][pos]["alleles"]["ALT"])
if sp.startswith("C-N"):
loci[contig][pos]["normals"]["ALT"] += 1
if sp.startswith("C-D"):
loci[contig][pos]["dwarfs"]["ALT"] += 1
# First allele is REF
if line.split()[ind_column].split(":")[0].split("/")[1] == "0":
loci[contig][pos]["genotypes"][sp].append(loci[contig][pos]["alleles"]["REF"])
if sp.startswith("C-N"):
loci[contig][pos]["normals"]["REF"] += 1
if sp.startswith("C-D"):
loci[contig][pos]["dwarfs"]["REF"] += 1
# Second allele is ALT
if line.split()[ind_column].split(":")[0].split("/")[1] == "1":
loci[contig][pos]["genotypes"][sp].append(loci[contig][pos]["alleles"]["ALT"])
if sp.startswith("C-N"):
loci[contig][pos]["normals"]["ALT"] += 1
if sp.startswith("C-D"):
loci[contig][pos]["dwarfs"]["ALT"] += 1
# If the sample coverage is below threshold, the genotype is "0"
if int(line.split()[ind_column].split(":")[2]) < cov_threshold:
loci[contig][pos]["genotypes"][sp].append("0")
ind_column += 1
samples -= 1
col_num += 1
print "\nGenotypes extracted\nCreating output file..."
# Writes down the infprmation for each contig in the genotype output file
with open(output_genotypes, "w") as out_geno:
# Creates the first part of the file's header
out_geno.write("Contig" + "\t" + "POS" + "\t" + "REF" + "/" + "ALT" + "\t" + "Dwarfs" + "\t" + "Normals")
# Outputs the samples IN ORDER in the file's header
order = []
for col in smp_col:
order.append(smp_col[col])
for sample in sorted(order):
out_geno.write("\t" + sample)
# Goes through every contig in the mega-dictionnary and outputs the information per sample and per pop.
for contig in list(sorted(loci)):
for pos in list(sorted(loci[contig])):
loci[contig][pos]["normals"]["REF"]
num_a1_D = loci[contig][pos]["dwarfs"]["REF"]
num_a2_D = loci[contig][pos]["dwarfs"]["ALT"]
num_a1_N = loci[contig][pos]["normals"]["REF"]
num_a2_N = loci[contig][pos]["normals"]["ALT"]
num_dwarfs = loci[contig][pos]["num_dwarfs"]
num_normals = loci[contig][pos]["num_normals"]
#If an allele is very rare, it's discarded
if num_a1_D < 3 and num_a1_N < 3:
pass
elif num_a2_D < 3 and num_a2_N < 3:
pass
# If there is less than samp_compromise/2 per population that is successfully genotyped
# the locus is discarded
elif num_dwarfs < (num_samples - samp_compromise)/2 or num_normals \
< (num_samples - samp_compromise)/2:
pass
else:
# Outputs the contig and the position on the contig
out_geno.write("\n" + contig + "\t" + str(pos))
# Outputs the genotypes for that position
out_geno.write("\t" + loci[contig][pos]["alleles"]["REF"] + "/" +
loci[contig][pos]["alleles"]["ALT"])
# Outputs the population information (allele count/pop)
out_geno.write("\t" + str(loci[contig][pos]["dwarfs"]["REF"]) + "/" +
str(loci[contig][pos]["dwarfs"]["ALT"]))
out_geno.write("\t" + str(loci[contig][pos]["normals"]["REF"]) + "/" +
str(loci[contig][pos]["normals"]["ALT"]))
# Outputs the genotype for each sample
for sample in list(sorted(loci[contig][pos]["genotypes"])):
if loci[contig][pos]["genotypes"][sample][0] != "0":
out_geno.write("\t" + loci[contig][pos]["genotypes"][sample][0] + "/" +
loci[contig][pos]["genotypes"][sample][1])
if loci[contig][pos]["genotypes"][sample][0] == "0":
out_geno.write("\t" + loci[contig][pos]["genotypes"][sample][0])
print "Genotype file successfully created\n"