/
SNVer2SNPv2.py
167 lines (123 loc) · 4.83 KB
/
SNVer2SNPv2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
#!/usr/bin/python
# script used to convert the output of SNVer, which is in its own vcf like format, to a tab delimited txt format, which is easier to handle
import sys
import os
import re
import csv
from string import translate,maketrans,punctuation
def calcAF(altBases,allBases): # calculates the allele frequency
refBases = int(allBases)-int(altBases) # number of reads concordant to the reference
if refBases == 0:
allelefrequency = 1
elif altBases == 0:
allelefrequency = 0
else:
allelefrequency = float(altBases)/int(allBases)
# print 'Calc Freq: '
# print "Alt: " + str(altBases)
# print "Alt: " + str(float(altBases))
# print "All: " + str(allBases)
# print "All: " + str(float(allBases))
# print allelefrequency
return allelefrequency
def GATKprocessPoolField(poolField):
# Takes one Poolfield; splits it at the ':' delimiter into in a list of five fields, takes the second field, which is a tuple of two, and gives its two values to the calcAF Funktion
# define Transitionmatrix, to exchange delimiters by whitespace
delims = ':'
T = maketrans(delims, ' '*len(delims))
poolField = translate(poolField.strip(), T).split()
if len(poolField) == 5:
Bases = poolField[1].split(',')
refBases = int(Bases[0])
altBases = int(Bases[1])
allBases = refBases + altBases
allelefrequency = calcAF(altBases, allBases)
return altBases, allBases, allelefrequency
else:
return 0
def parseVCF(mode, infile, out):
if os.path.isfile(out):
print "converted variationfile already exists: "
print out
return out
filehandle = open(infile, 'r')
outfile = open(out, 'w+')
## beginn parsing parse SNVer vcf file
if mode == 'SNVer':
## parse SNVer vcf file
for line in filehandle:
if 'NA' not in line :
line = translate(line.strip(), T).split()
if not line[0].startswith("#"): # skip header and comments
#print line
formattedline = line[0:2]
formattedline += line[3:5]
# likelihood_allelefrequency = re.search(r'AF=(\d\.*\d*)', line[7])
# formattedline.append(likelihood_allelefrequency.group(1))
formattedline.append(0)
remainingFields = len(line[9:])
i = 10
while i < len(line):
formattedline.append(line[i])
formattedline.append(line[i+1])
formattedline.append(calcAF(line[i],line[i+1]))
i = i+2
#print('\t'.join(map(str,formattedline)))
outfile.writelines('\t'.join(map(str,formattedline))+'\n')
## parse GATK vcf file
elif mode == 'GATK':
for line in filehandle:
line = line.strip().split()
if not line[0].startswith('#') and './.' not in line: # skip header and comments and NA values
formattedline = line[0:2]
formattedline += line[3:5]
# likelihood_allelefrequency = re.search(r';AF=(\d\.*\d*)', line[7])
# formattedline.append(likelihood_allelefrequency.group(1))
formattedline.append(0)
i = 9
printingFlag = True
while i< len(line):
if ':.:' not in line[i]:
basevalues = GATKprocessPoolField(line[i])
if basevalues != 0:
altBases, allBases, allelefrequency = basevalues
formattedline.append(altBases)
formattedline.append(allBases)
formattedline.append(allelefrequency)
i += 1
else:
printingFlag = False
i += 1
else:
printingFlag = False
i += 1
sys.stderr.write("skipped ")
sys.stderr.write('\t'.join(map(str,line)))
sys.stderr.write('\n')
if printingFlag :
#print('\t'.join(map(str,formattedline)))
outfile.writelines('\t'.join(map(str,formattedline))+'\n')
else:
sys.stderr.write("skipped ")
sys.stderr.write('\t'.join(map(str,line)))
sys.stderr.write('\n')
filehandle.close()
outfile.close()
print "SNP file written"
return out
##
##with open('some.csv', 'wb') as f:
## writer = csv.writer(f)
## writer.writerows(someiterable)
##
## if not line[0].startswith("#"): # skip header and comments
## formatfield = line[7].split(';')
## if formatfield[0] == 'INDEL':
## del formatfield[0]
### print formatfield
## formattedline = line[0:2]
## formattedline += line[3:5]
## allelefrequency = re.search(r'AF1=(\d\.*\d*)', formatfield[2])
## formattedline.append(allelefrequency.group(1))
## print formattedline
##