-
Notifications
You must be signed in to change notification settings - Fork 0
/
get23S.py
102 lines (83 loc) · 3.15 KB
/
get23S.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
#!/usr/bin/env python
import os, pysam, sys
def get23S(bwaPath, inputStem, outputStem, geneLibPath, sampleName, outFile):
#bwaPath - path to bwa mem
#inputStem - prefix for input fastq files which are assumed to end R1.fastq and R2.fastq for forward and reverse reads
#outputStem - prefix for intermediate output file
#geneLibPath - location of reference_23s.fa
#sampleName - human readable sample name
#outfile - name for output file
o = open(outFile, 'w')
header = 'sample_name\tantibiotic\tgene\tsite\taa_snp_presence\n'
o.write(header)
ref = '%s/reference_23s.fa'%geneLibPath
fq1 = inputStem+'.R1.fastq'
fq2 = inputStem+'.R2.fastq'
out_sam = inputStem+'.sam'
out_bam = inputStem+'.bam'
#map files using bwa mem
cmd = '%sbwa mem -t 4 %s %s %s > %s'%(bwaPath, ref, fq1, fq2, out_sam)
print cmd
os.system(cmd)
# put the mapped reads (-F 4) into sorted Bam
cmd = 'samtools view -uS -F 4 %s | samtools sort - -o %s'%(out_sam, out_bam)
print cmd
os.system(cmd)
cmd = 'samtools index %s'%out_bam
print cmd
os.system(cmd)
#get A2059G == 2045 numbered starting from 1
#get C2611T == 2597 numbered starting from 1
base2045 = []
base2597 = []
samfile = pysam.Samfile(out_bam, "rb")
for pileupcolumn in samfile.pileup( '23s_rna', 2000, 2600):
for pileupread in pileupcolumn.pileups:
if pileupcolumn.pos == 2044 and pileupread.query_position != None:
base = pileupread.alignment.query_sequence[pileupread.query_position]
quality = ord(pileupread.alignment.qual[pileupread.query_position])
if quality-33>=30:
base2045.append(base)
elif pileupcolumn.pos == 2596 and pileupread.query_position != None:
base = pileupread.alignment.query_sequence[pileupread.query_position]
quality = ord(pileupread.alignment.qual[pileupread.query_position])
if quality-33>=30:
base2597.append(base)
outputLog = outputStem+'_azt_bwa_log.txt'
ol = open(outputLog, 'w')
header = 'guid\tsite\tA\tC\tG\tT\n'
ol.write(header)
a = len([b for b in base2045 if b=='A'])
c = len([b for b in base2045 if b=='C'])
g = len([b for b in base2045 if b=='G'])
t = len([b for b in base2045 if b=='T'])
ol.write('%s\t2045\t%s\t%s\t%s\t%s\n'%(sampleName, a, c, g, t))
a = len([b for b in base2597 if b=='A'])
c = len([b for b in base2597 if b=='C'])
g = len([b for b in base2597 if b=='G'])
t = len([b for b in base2597 if b=='T'])
ol.write('%s\t2597\t%s\t%s\t%s\t%s\n'%(sampleName, a, c, g, t))
ol.close()
#read in copy number for A2059G and C2611T, aka 2045 and 2597
f = open(outputLog, 'r')
header = f.next() #guid site A C G T
baseCountDict = dict()
for l in f:
l = l.strip().split()
base = int(l[1])
if base == 2045:
snp_column = 4
elif base == 2597:
snp_column = 5
propSnp = float(int(l[snp_column])) / sum(int(i) for i in l[2:]) #check what proportion T - the SNP for both
copyNumber = int(round(propSnp * 4))
baseCountDict[base] = copyNumber
f.close()
for base in baseCountDict.keys():
if base==2045:
mut = 'A2059G'
else:
mut = 'C2611T'
outputString = '%s\t%s\t%s\t%s\t%s\n'%(sampleName, 'azt', 'rrna_mutation_copy_number', mut, baseCountDict[base])
sys.stdout.write(outputString)
o.write(outputString)