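#### This notebook submits two sets of SLURM jobs for all of the Reference Genomes that have corresponding FeatureCounts tables and have had variants introduced: [1] a MUMmer (nucmer) alignment of each un-altered Reference Genome against H37Rv, and [2] a SmallPipe run (read trimming, alignment to H37Rv & Pilon variant calling) on the reads simulated from each altered Reference Genome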

In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
%matplotlib inline

In [2]:
import os
import pandas as pd
import numpy as np
import subprocess
import vcf
import shutil
from slurmpy import Slurm

#### Get the number of Reference Genomes we want to submit MUMmer and SmallPipe jobs for

In [3]:
#count the entries of the simulated-reads directory ; every entry is expected to be a Reference Genome folder
#holding simulated fastq files for both the Unaltered and the Altered version of that Reference Genome
Number_of_Reference_Genomes = len(
    os.listdir(
        '/n/data1/hms/dbmi/farhat/Roger/inhost_TB_dynamics_project/simulated_reads/reference_sequence_info_and_calls_against_H37Rv/'
    )
)

### [1] UN-ALTERED RefGenome

In [4]:
def Launch_MUMmer_Unaltered_RefGenome(RefGenome_i):
    """
    Build and submit one SLURM job that aligns an un-altered Reference Genome against H37Rv
    with MUMmer (nucmer -> delta-filter -> show-snps).

    Parameters
    ----------
    RefGenome_i : int
        Index into the alphabetically sorted listing of the Reference Genome directory ;
        selects the Reference Genome folder to process.

    Returns
    -------
    The value returned by Slurm.run() for the submitted job (the same value that is printed
    next to the job name).

    Side effects
    ------------
    - deletes and re-creates '<Reference Genome folder>/MUMmer_output_RefGenome' together with
      its 'O2_SLURM_logs' sub-directory
    - leaves the working directory of the calling process set to that 'O2_SLURM_logs' directory
    - submits a job through slurmpy
    """

    ###############################################################################################################
    ###################################### Create Directories & Get File Paths ####################################
    ###############################################################################################################

    RefGenome_Directory = '/n/data1/hms/dbmi/farhat/Roger/inhost_TB_dynamics_project/simulated_reads/reference_sequence_info_and_calls_against_H37Rv/'

    #every path below is absolute ; moving here first makes sure the process is not sitting inside
    #a directory (e.g. a log directory from an earlier run) that is about to be deleted
    os.chdir(RefGenome_Directory)

    #Good Reference Genomes (determined from gene-length sliding window analyses)
    #os.listdir returns entries in arbitrary order, sort them so an index always maps to the same folder
    RefGenome_folders = sorted(os.listdir(RefGenome_Directory))
    RefGenome_folder = RefGenome_folders[RefGenome_i]

    #SPECIFY REFERENCE GENOME
    RefGenome_Directory_with_files = RefGenome_Directory + RefGenome_folder

    #path & name for complete genome
    RefGenome_fasta_file = RefGenome_Directory_with_files + '/' + RefGenome_folder + '_genomic.fasta'

    #tag ID for use in naming files
    tag = RefGenome_folder

    #H37Rv Reference Genome Path
    H37Rv_fasta_file = '/n/data1/hms/dbmi/farhat/bin/work-horse/bin/h37rv.fasta' #H37Rv reference

    #directory to store MUMmer output files
    out_dir = RefGenome_Directory_with_files + '/' + 'MUMmer_output_RefGenome'

    #directory to store O2 SLURM logs (lives inside the output directory)
    SLURM_log_dir = out_dir + '/' + 'O2_SLURM_logs'

    #start from empty directories : wipe whatever a previous run left behind, then (re-)create
    #the output directory has to be handled before the log directory nested inside of it
    for directory in (out_dir, SLURM_log_dir):
        if os.path.exists(directory):
            shutil.rmtree(directory)
        os.makedirs(directory)

    ###############################################################################################################
    ######################################## Construct job to submit to O2 ########################################
    ###############################################################################################################

    #store all commands in a list (executed in this order by the job)
    commands_list = [

        #load perl
        'module load perl/5.24.0',

        #change working directory to output directory so that all MUMmer files are written there
        'cd {}'.format(out_dir),

        #run Nucmer (H37Rv is the reference sequence, the Reference Genome is the query)
        'nucmer -mum --prefix=H37Rv_{0} {1} {2}'.format(tag, H37Rv_fasta_file, RefGenome_fasta_file),

        #attempt to determine 'correct' repeat copies in repeat regions, eliminates conflicting repeat copies
        'delta-filter -r -q H37Rv_{0}.delta > H37Rv_{0}.filter'.format(tag),

        #call SNPs, -C option assures only SNPs in uniquely aligned sequence will be reported (excludes SNPs in repeats)
        'show-snps -Clr -T H37Rv_{0}.filter > H37Rv_{0}.snps'.format(tag),
    ]

    ###############################################################################################################
    ######################################## SUBMIT the job to O2 #################################################
    ###############################################################################################################

    #combine all commands into a single string to be submitted as a job (each command on its own line)
    MUMmer_job = ''.join('\n' + command_i for command_i in commands_list)

    #directory where you want output + error files
    os.chdir(SLURM_log_dir)

    job_name = 'U_RG_' + str(RefGenome_i)

    #submit MUMmer/nucmer job via SLURM
    s = Slurm(job_name , {'partition':'short' , 'n':'1' , 't':'0-0:20:00' , 'mem-per-cpu':'8G' , 'mail-type':'FAIL' , 'mail-user':'roger_vargas@g.harvard.edu'})

    #submits the job
    job_id = s.run(MUMmer_job)

    #parenthesised single-argument form prints the same text under Python 2 and Python 3
    print(job_name  + ' : ' +  str(job_id))

    return job_id

In [5]:
#launch one MUMmer job per Reference Genome (those with a feature annotation file & a good H37Rv gene - RefGenome CDS mapping)
#indices ran from 0 to 53 in the recorded run
for RefGenome_i in range(Number_of_Reference_Genomes):
    Launch_MUMmer_Unaltered_RefGenome(RefGenome_i)

submitted: Submitted batch job 2971854


U_RG_0 : 2971854


submitted: Submitted batch job 2971855


U_RG_1 : 2971855


submitted: Submitted batch job 2971856
submitted: Submitted batch job 2971857
submitted: Submitted batch job 2971858


U_RG_2 : 2971856
U_RG_3 : 2971857
U_RG_4 : 2971858


submitted: Submitted batch job 2971859
submitted: Submitted batch job 2971860
submitted: Submitted batch job 2971861
submitted: Submitted batch job 2971862


U_RG_5 : 2971859
U_RG_6 : 2971860
U_RG_7 : 2971861
U_RG_8 : 2971862


submitted: Submitted batch job 2971863
submitted: Submitted batch job 2971864
submitted: Submitted batch job 2971865


U_RG_9 : 2971863
U_RG_10 : 2971864
U_RG_11 : 2971865


submitted: Submitted batch job 2971866


U_RG_12 : 2971866


submitted: Submitted batch job 2971867
submitted: Submitted batch job 2971868
submitted: Submitted batch job 2971869
submitted: Submitted batch job 2971870


U_RG_13 : 2971867
U_RG_14 : 2971868
U_RG_15 : 2971869
U_RG_16 : 2971870


submitted: Submitted batch job 2971871
submitted: Submitted batch job 2971872
submitted: Submitted batch job 2971873
submitted: Submitted batch job 2971874


U_RG_17 : 2971871
U_RG_18 : 2971872
U_RG_19 : 2971873
U_RG_20 : 2971874


submitted: Submitted batch job 2971875
submitted: Submitted batch job 2971876


U_RG_21 : 2971875
U_RG_22 : 2971876


submitted: Submitted batch job 2971877
submitted: Submitted batch job 2971878
submitted: Submitted batch job 2971879
submitted: Submitted batch job 2971880


U_RG_23 : 2971877
U_RG_24 : 2971878
U_RG_25 : 2971879
U_RG_26 : 2971880


submitted: Submitted batch job 2971881
submitted: Submitted batch job 2971882
submitted: Submitted batch job 2971883
submitted: Submitted batch job 2971884


U_RG_27 : 2971881
U_RG_28 : 2971882
U_RG_29 : 2971883
U_RG_30 : 2971884


submitted: Submitted batch job 2971885
submitted: Submitted batch job 2971886
submitted: Submitted batch job 2971887
submitted: Submitted batch job 2971888


U_RG_31 : 2971885
U_RG_32 : 2971886
U_RG_33 : 2971887
U_RG_34 : 2971888


submitted: Submitted batch job 2971889


U_RG_35 : 2971889


submitted: Submitted batch job 2971890


U_RG_36 : 2971890


submitted: Submitted batch job 2971891
submitted: Submitted batch job 2971892
submitted: Submitted batch job 2971893
submitted: Submitted batch job 2971894


U_RG_37 : 2971891
U_RG_38 : 2971892
U_RG_39 : 2971893
U_RG_40 : 2971894


submitted: Submitted batch job 2971895
submitted: Submitted batch job 2971896


U_RG_41 : 2971895
U_RG_42 : 2971896


submitted: Submitted batch job 2971897
submitted: Submitted batch job 2971898
submitted: Submitted batch job 2971899


U_RG_43 : 2971897
U_RG_44 : 2971898
U_RG_45 : 2971899


submitted: Submitted batch job 2971900
submitted: Submitted batch job 2971901
submitted: Submitted batch job 2971902
submitted: Submitted batch job 2971903


U_RG_46 : 2971900
U_RG_47 : 2971901
U_RG_48 : 2971902
U_RG_49 : 2971903
U_RG_50 : 2971904
U_RG_51 : 2971905
U_RG_52 : 2971906
U_RG_53 : 2971907


submitted: Submitted batch job 2971904
submitted: Submitted batch job 2971905
submitted: Submitted batch job 2971906
submitted: Submitted batch job 2971907


### [2] ALTERED RefGenome

In [6]:
def Launch_SmallPipe_Altered_RefGenome(RefGenome_i):
    """
    Build and submit one SLURM job that runs the SmallPipe steps on the fastq files simulated
    from an altered Reference Genome : read trimming (prinseq), alignment to H37Rv (bwa mem),
    sorting & duplicate removal (picard), BAM indexing (samtools), BAM quality report (QualiMap),
    variant calling (pilon), clean-up of intermediary files and reduction of the pilon VCF.

    Parameters
    ----------
    RefGenome_i : int
        Index into the alphabetically sorted listing of the Reference Genome directory ;
        selects the Reference Genome folder to process.

    Returns
    -------
    The value returned by Slurm.run() for the submitted job (the same value that is printed
    next to the job name).

    Side effects
    ------------
    - deletes and re-creates the 'RefGen', 'SmPipe_output_ALT_RefGenome' (incl. 'O2_SLURM_logs')
      and 'temp_for_intermediary_steps' directories inside the Reference Genome folder
    - leaves the working directory of the calling process set to the 'O2_SLURM_logs' directory
    - submits a job through slurmpy ; the job itself removes the temp directory, the RefGen
      directory and the simulated fastq files as its last steps
    """

    ###############################################################################################################
    ###################################### Create Directories & Get File Paths ####################################
    ###############################################################################################################

    RefGenome_Directory = '/n/data1/hms/dbmi/farhat/Roger/inhost_TB_dynamics_project/simulated_reads/reference_sequence_info_and_calls_against_H37Rv/'

    #every path below is absolute ; moving here first makes sure the process is not sitting inside
    #a directory (e.g. a log directory from an earlier run) that is about to be deleted
    os.chdir(RefGenome_Directory)

    #Good Reference Genomes (determined from gene-length sliding window analyses)
    #os.listdir returns entries in arbitrary order, sort them so an index always maps to the same folder
    RefGenome_folders = sorted(os.listdir(RefGenome_Directory))
    RefGenome_folder = RefGenome_folders[RefGenome_i]

    #SPECIFY REFERENCE GENOME
    RefGenome_Directory_with_files = RefGenome_Directory + RefGenome_folder

    #paths & names for simulated fastq files
    fastq_dir = RefGenome_Directory_with_files + '/' + 'fastq_files_from_ART_for_ALT_RefGenome/'
    fqf1 = fastq_dir + RefGenome_folder + '_ART_1.fq'
    fqf2 = fastq_dir + RefGenome_folder + '_ART_2.fq'

    #tag ID for use in naming files
    tag = RefGenome_folder

    #directory to store Reference Genome and corresponding Index Files
    RefGen_dir = RefGenome_Directory_with_files + '/' + 'RefGen'

    #H37Rv Reference Genome Path (the job copies h37rv.fasta to this location before indexing it)
    RefGen = RefGenome_Directory_with_files + '/RefGen/TBRefGen.fasta' #H37Rv reference

    #directory to store pilon VCF, prinseq & Qualimap logs
    out_dir = RefGenome_Directory_with_files + '/' + 'SmPipe_output_ALT_RefGenome'

    #directory to store intermediary files (trimmed fastq, SAM, sorted BAM, sorted BAM w/ duplicates removed)
    temp_dir = RefGenome_Directory_with_files + '/' + 'temp_for_intermediary_steps'

    #directory to store O2 SLURM logs (lives inside the output directory)
    SLURM_log_dir = out_dir + '/' + 'O2_SLURM_logs'

    #start from empty directories : wipe whatever a previous run left behind, then (re-)create
    #the output directory has to be handled before the log directory nested inside of it
    for directory in (RefGen_dir, out_dir, temp_dir, SLURM_log_dir):
        if os.path.exists(directory):
            shutil.rmtree(directory)
        os.makedirs(directory)

    ###############################################################################################################
    ######################################## Construct job to submit to O2 ########################################
    ###############################################################################################################

    #store all commands in a list (executed in this order by the job)
    commands_list = []

    #change directory to the Reference Genome folder (relative paths below are resolved from here)
    commands_list.append( 'cd ' + RefGenome_Directory_with_files )

    ###################################
    ### Load Necessary Modules ########
    ###################################

    #perl, java, BWA, Samtools, BCFtools & Picard
    for module_i in ('perl/5.24.0', 'java/jdk-1.8u112', 'bwa/0.7.15', 'samtools/1.3.1', 'bcftools/1.3.1', 'picard/2.8.0'):
        commands_list.append( 'module load ' + module_i )

    #copy reference genome over to RefGen folder
    commands_list.append( 'cp /n/data1/hms/dbmi/farhat/bin/work-horse/bin/h37rv.fasta RefGen/TBRefGen.fasta' )

    #change directory to RefGen folder
    commands_list.append( 'cd RefGen' )

    ###################################
    ### Create Index Files for H37Rv ##
    ###################################
    commands_list.append( 'samtools faidx TBRefGen.fasta' )
    commands_list.append( 'bwa index TBRefGen.fasta' )

    #Go back up a directory
    commands_list.append( 'cd ..' )

    ###################################
    ### PRINSEQ (trim reads) ##########
    ###################################

    #create directory for prinseq in output directory
    prinseq_dir = out_dir + '/prinseq'
    commands_list.append( 'mkdir ' + prinseq_dir )

    commands_list.append( 'perl /n/data1/hms/dbmi/farhat/bin/prinseq-lite-0.20.4/prinseq-lite.pl -fastq {0} -fastq2 {1} -out_format 3 -out_good {2}/{3}-trimmed -out_bad null -log {4}/{3}-trimmed.log -min_qual_mean 20 -verbose'.format(fqf1, fqf2, temp_dir, tag , prinseq_dir) )

    #all downstream steps use the newly trimmed fastq files
    trimmed_fqf1 = temp_dir + '/{}'.format(tag) + '-trimmed_1.fastq'
    trimmed_fqf2 = temp_dir + '/{}'.format(tag) + '-trimmed_2.fastq'

    ######################################
    ### BWA (align reads to reference) ###
    ######################################

    #SAM file produced by the alignment
    samfile = temp_dir + '/{}.sam'.format(tag)

    #run BWA
    commands_list.append( 'bwa mem -M {3} {0} {1} > {2}'.format(trimmed_fqf1 , trimmed_fqf2 , samfile , RefGen) )

    #####################################
    ### PICARD (sort & convert to BAM) ##
    #####################################

    #coordinate-sorted BAM file
    bamfile = temp_dir + '/{0}.sorted.bam'.format(tag)

    commands_list.append( 'java -Xmx16G -jar /n/data1/hms/dbmi/farhat/bin/picard/picard/build/libs/picard.jar SortSam INPUT={0} OUTPUT={1} SORT_ORDER=coordinate'.format(samfile, bamfile) )

    ###################################
    ### PICARD (remove duplicates) ####
    ###################################

    #BAM file with removed duplicates & the metrics file written alongside it
    #(paths are built directly rather than by text substitution on the full path, so a '.bam'
    # occurring elsewhere in the path can never be rewritten)
    drbamfile = temp_dir + '/{0}.sorted.duprem.bam'.format(tag)
    drmetricsfile = temp_dir + '/{0}.sorted.duprem.metrics'.format(tag)

    #remove duplicates from BAM file
    #NOTE(review): the JVM heap limit (-Xmx32G, also used for pilon below) equals the whole 32G memory
    #request of the job ; confirm the job keeps enough headroom for JVM overhead
    commands_list.append( "java -Xmx32G -jar /n/data1/hms/dbmi/farhat/bin/picard/picard/build/libs/picard.jar MarkDuplicates I={0} O={1} REMOVE_DUPLICATES=true M={2} ASSUME_SORT_ORDER=coordinate".format(bamfile, drbamfile, drmetricsfile) )

    ####################################
    ### SAMTOOLS (to index BAM file) ###
    ####################################

    commands_list.append( "samtools index {0}".format(drbamfile) )

    ######################################
    ### QUALIMAP (quality of BAM file) ###
    ######################################

    #store quality report & pilon VCF in Output directory
    commands_list.append( 'cd ' + out_dir )
    commands_list.append( 'mkdir QualiMap' ) #make a folder for QualiMap output in output directory
    commands_list.append( 'unset DISPLAY' ) #unset JAVA virtual machine variable [http://qualimap.bioinfo.cipf.es/doc_html/faq.html]
    commands_list.append( "/n/data1/hms/dbmi/farhat/bin/qualimap_v2.2.1/qualimap bamqc -bam {0} --outdir {1} --outfile {2}.pdf --outformat PDF".format(drbamfile, out_dir+'/QualiMap', tag+'_stats') )

    ###################################
    ### PILON (call variants) #########
    ###################################

    commands_list.append( 'mkdir pilon' ) #make a folder for pilon output in output directory
    out_pilon_dir = out_dir + '/pilon/' #variable for pilon output path

    commands_list.append( 'java -Xmx32G -jar /n/data1/hms/dbmi/farhat/bin/pilon/pilon-1.22.jar --genome {0} --bam {1} --output {2} --outdir {3} --variant'.format(RefGen, drbamfile, tag, out_pilon_dir) )

    ###################################
    ### DELETE UNNECESSARY FILES ######
    ###################################

    #delete directory containing following files
    # trimmed fastq files
    # SAM file produced by running BWA on trimmed fastq
    # sorted BAM file
    # sorted BAM file with removed duplicates
    # output text from picard after discarding duplicates
    # index file corresponding to sorted BAM file with removed duplicates
    commands_list.append( 'rm -rf {}'.format(temp_dir) )

    #delete RefGen
    commands_list.append( 'rm -rf {}'.format(RefGen_dir) )

    #delete fastq files simulated off of Altered Reference Genome
    #(the inputs of this job are gone afterwards, they have to be re-simulated before a re-run)
    commands_list.append( 'rm -rf {}'.format(fastq_dir) )

    #delete PASS CALLS (supporting Reference) and AMBIGUOUS CALLS from VCF file that came from Pilon variant calling
    commands_list.append( 'python /n/data1/hms/dbmi/farhat/Roger/inhost_TB_dynamics_project/python_scripts/reduce-pilon-vcf-size.py ' + out_pilon_dir + tag + '.vcf' )

    ###############################################################################################################
    ######################################## SUBMIT the job to O2 #################################################
    ###############################################################################################################

    #combine all commands into a single string to be submitted as a job (each command on its own line)
    SmallPipe_job = ''.join('\n' + command_i for command_i in commands_list)

    #directory where you want output + error files
    os.chdir(SLURM_log_dir)

    job_name = 'A_RG_' + str(RefGenome_i)

    #make sure to set run-time much shorter since we're only dealing with 80x coverage
    s = Slurm(job_name , {'partition':'short' , 'n':'1' , 't':'0-3:00:00' , 'mem-per-cpu':'32G' , 'mail-type':'FAIL' , 'mail-user':'roger_vargas@g.harvard.edu'})

    #submits the job
    job_id = s.run(SmallPipe_job)

    #parenthesised single-argument form prints the same text under Python 2 and Python 3
    print(job_name  + ' : ' +  str(job_id))

    return job_id

In [7]:
#launch one SmallPipe job per Reference Genome (those with a feature annotation file & a good H37Rv gene - RefGenome CDS mapping)
#indices ran from 0 to 53 in the recorded run
for RefGenome_i in range(Number_of_Reference_Genomes):
    Launch_SmallPipe_Altered_RefGenome(RefGenome_i)

submitted: Submitted batch job 2971908


A_RG_0 : 2971908


submitted: Submitted batch job 2971909


A_RG_1 : 2971909


submitted: Submitted batch job 2971919


A_RG_2 : 2971919


submitted: Submitted batch job 2971920


A_RG_3 : 2971920


submitted: Submitted batch job 2971922


A_RG_4 : 2971922


submitted: Submitted batch job 2971928


A_RG_5 : 2971928


submitted: Submitted batch job 2971929


A_RG_6 : 2971929


submitted: Submitted batch job 2971930


A_RG_7 : 2971930


submitted: Submitted batch job 2971931


A_RG_8 : 2971931


submitted: Submitted batch job 2971932


A_RG_9 : 2971932


submitted: Submitted batch job 2971933
submitted: Submitted batch job 2971934
submitted: Submitted batch job 2971935


A_RG_10 : 2971933
A_RG_11 : 2971934
A_RG_12 : 2971935


submitted: Submitted batch job 2971936


A_RG_13 : 2971936


submitted: Submitted batch job 2971937


A_RG_14 : 2971937


submitted: Submitted batch job 2971938


A_RG_15 : 2971938


submitted: Submitted batch job 2971939


A_RG_16 : 2971939


submitted: Submitted batch job 2971940
submitted: Submitted batch job 2971941
submitted: Submitted batch job 2971942


A_RG_17 : 2971940
A_RG_18 : 2971941
A_RG_19 : 2971942


submitted: Submitted batch job 2971943


A_RG_20 : 2971943


submitted: Submitted batch job 2971944
submitted: Submitted batch job 2971945
submitted: Submitted batch job 2971946


A_RG_21 : 2971944
A_RG_22 : 2971945
A_RG_23 : 2971946


submitted: Submitted batch job 2971947


A_RG_24 : 2971947


submitted: Submitted batch job 2971948
submitted: Submitted batch job 2971949


A_RG_25 : 2971948
A_RG_26 : 2971949


submitted: Submitted batch job 2971950


A_RG_27 : 2971950


submitted: Submitted batch job 2971951


A_RG_28 : 2971951


submitted: Submitted batch job 2971952


A_RG_29 : 2971952


submitted: Submitted batch job 2971953


A_RG_30 : 2971953


submitted: Submitted batch job 2971954


A_RG_31 : 2971954


submitted: Submitted batch job 2971956
submitted: Submitted batch job 2971957
submitted: Submitted batch job 2971958


A_RG_32 : 2971956
A_RG_33 : 2971957
A_RG_34 : 2971958


submitted: Submitted batch job 2971959


A_RG_35 : 2971959


submitted: Submitted batch job 2971960


A_RG_36 : 2971960


submitted: Submitted batch job 2971961


A_RG_37 : 2971961


submitted: Submitted batch job 2971962


A_RG_38 : 2971962


submitted: Submitted batch job 2971963


A_RG_39 : 2971963


submitted: Submitted batch job 2971965
submitted: Submitted batch job 2971966
submitted: Submitted batch job 2971967


A_RG_40 : 2971965
A_RG_41 : 2971966
A_RG_42 : 2971967


submitted: Submitted batch job 2971968
submitted: Submitted batch job 2971969
submitted: Submitted batch job 2971970


A_RG_43 : 2971968
A_RG_44 : 2971969
A_RG_45 : 2971970


submitted: Submitted batch job 2971971
submitted: Submitted batch job 2971972
submitted: Submitted batch job 2971973


A_RG_46 : 2971971
A_RG_47 : 2971972
A_RG_48 : 2971973


submitted: Submitted batch job 2971974
submitted: Submitted batch job 2971975


A_RG_49 : 2971974
A_RG_50 : 2971975
A_RG_51 : 2971976
A_RG_52 : 2971977
A_RG_53 : 2971978


submitted: Submitted batch job 2971976
submitted: Submitted batch job 2971977
submitted: Submitted batch job 2971978
