-
Notifications
You must be signed in to change notification settings - Fork 0
/
realign_BWA_MEM.py
131 lines (102 loc) · 4.03 KB
/
realign_BWA_MEM.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
import sys
import re
import os
from Bio import SeqIO
bamIn = "veritas_wgs.bam"
bamOut = "BWA_MEM_Alignment/hg19.gatk.bam"
gatkRefId = "hg19"
bwaThreads = "2"
javaMem = "4g"
for arg in sys.argv:
bamInResult = re.search("^--bam_in=(.*)",arg)
bamOutResult = re.search("^--bam_out=(.*)",arg)
refResult = re.search("^--ref=(.*)",arg)
threadsBwaResult = re.search("^--bwa_threads=(.*)",arg)
memResult = re.search("^--java_mem=(.*)",arg)
helpResult = re.search("^--help",arg)
if bamInResult:
bamIn = bamInResult.group(1)
if bamOutResult:
bamOut = bamOutResult.group(1)
if refResult:
gatkRefId = refResult.group(1)
if threadsBwaResult:
bwaThreads = threadsBwaResult.group(1)
if memResult:
javaMem = memResult.group(1)
if helpResult:
print "Usage: python run_BWA_MEM.py --bam_in=veritas_wgs.bam --bam_out=BWA_MEM_Alignment/hg19.gatk.bam --ref=hg19\n"
print "--bam_in : Previous alignment file\n"
print "--bam_out : BWA-MEM alignment file\n"
print "--ref : Reference name (GATK fasta will be downloaded)\n"
print "--bwa_threads : Threads to be used by BWA\n"
print "--java_mem : Memory Allocation for Picard\n"
sys.exit()
if gatkRefId == "hg19":
command = "wget ftp://gsapubftp-anonymous@ftp.broadinstitute.org/bundle/2.8/hg19/ucsc.hg19.fasta.gz"
os.system(command)
refFa = "hg19.gatk.fasta"
command = "gunzip ucsc.hg19.fasta.gz -c > " + refFa
os.system(command)
command = "rm ucsc.hg19.fasta.gz"
os.system(command)
else:
print "Need to provide mapping to download file for " + gatkRefId
sys.exit()
print "\n\nSort by Read Name and Create Unique,Paired FASTQ Files\n\n"
folderResult = re.search("(.*)/",bamOut)
if folderResult:
subfolder = folderResult.group(1)
command = "mkdir " + subfolder
os.system(command)
nameBam = re.sub(".bam",".sort.bam",bamOut)
command = "/opt/samtools-1.3/samtools sort -n " + bamIn+ " -o " + nameBam
os.system(command)
read1 = re.sub(".bam","_R1.fastq",bamIn)
read2 = re.sub(".bam","_R2.fastq",bamIn)
unpaired = "unpaired.fastq"
command = "java -jar -Xmx" + javaMem + " /opt/picard-tools-2.5.0/picard.jar SamToFastq INPUT=" + nameBam + " FASTQ=" + read1 + " SECOND_END_FASTQ=" + read2 + " UNPAIRED_FASTQ=" + unpaired
os.system(command)
command = "rm " + nameBam
os.system(command)
command = "rm " + unpaired
os.system(command)
command = "gzip " + read1
os.system(command)
command = "gzip " + read2
os.system(command)
read1 = read1 + ".gz"
read2 = read2 + ".gz"
#better to run in parallel with GUI
#each thread only allocated 250 MB
#fastqcThreads = "16"
#command = "/opt/FastQC/fastqc " + read1 + " -t " + fastqcThreads
#os.system(command)
#command = "/opt/FastQC/fastqc " + read2 + " -t " + fastqcThreads
#os.system(command)
print "\n\nRun BWA-MEM\n\n"
command = "/opt/bwa/bwa index -a bwtsw " + refFa
os.system(command)
alnSam = re.sub(".bam",".sam",bamOut)
command = "/opt/bwa/bwa mem -t " + bwaThreads + " " + refFa + " " + read1 + " " + read2 + " > " + alnSam
os.system(command)
print "\n\nSam-to-Bam (to save space),Sort, and Index Alignment\n\n"
tempBam = "temp.bam"
command = "/opt/samtools-1.3/samtools view -b " + alnSam+ " > " + tempBam
os.system(command)
command = "rm " + alnSam
os.system(command)
rgBam = re.sub(".bam$",".rg.bam",bamOut)
command = "java -Xmx" + javaMem + " -jar /opt/picard-tools-2.5.0/picard.jar AddOrReplaceReadGroups I=" + tempBam + " O=" + rgBam + " SO=coordinate RGID=1 RGLB=WGA RGPL=Illumina RGPU=Veritas RGSM=realign"
os.system(command)
command = "rm " + tempBam
os.system(command)
tempDir = "tmp"
os.system("mkdir " + tempDir)
duplicateMetrics = "MarkDuplicates_BWA_MEM_metrics.txt"
command = "java -jar -Xmx" + javaMem + " -Djava.io.tmpdir="+tempDir+" /opt/picard-tools-2.5.0/picard.jar MarkDuplicates INPUT=" + rgBam + " OUTPUT=" + bamOut + " METRICS_FILE=" + duplicateMetrics + " REMOVE_DUPLICATES=true CREATE_INDEX=True TMP_DIR="+tempDir
os.system(command)
command = "rm " + rgBam
os.system(command)
tempDir = "tmp"
os.system("rm -R " + tempDir)