In [1]:
from IPython.core.display import display, HTML
# Inject a CSS override so the notebook container spans the full browser width.
display(HTML("<style>.container { width:100% !important; }</style>"))
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

In [2]:
import os
import pandas as pd
import numpy as np
import subprocess
import vcf
import shutil
from slurmpy import Slurm

### [1] Process the simulated reads and run *Pilon* to call variants

In [3]:
def _recreate_directory(path):
    """Make ``path`` an empty directory, discarding anything that was there before.

    Returns ``path`` unchanged so the call can be used inline in an assignment.
    """
    if os.path.exists(path):
        shutil.rmtree(path)
    os.makedirs(path)
    return path


def _build_smallpipe_commands(Mtb_genome_tag, fqf1, fqf2, assembly_dir_with_files, RefGen_dir, out_dir, temp_dir):
    """Return the ordered shell commands that make up one pipeline job.

    Parameters
    ----------
    Mtb_genome_tag : str
        Sample tag; used as the prefix of every per-sample file name.
    fqf1, fqf2 : str
        Paths of the paired (R1 / R2) simulated fastq files.
    assembly_dir_with_files : str
        Per-sample working directory; the job starts by ``cd``-ing into it.
    RefGen_dir : str
        Directory that receives the copied reference fasta and its index files.
    out_dir : str
        Directory for the outputs that are kept (prinseq log, QualiMap, pilon).
    temp_dir : str
        Directory for intermediary files (trimmed fastq, SAM, BAM).

    Returns
    -------
    list of str
        One shell command per element, in execution order.
    """
    #H37Rv reference genome, as copied into the per-sample RefGen folder
    RefGen = RefGen_dir + '/TBRefGen.fasta'

    #trimmed fastq files written by prinseq (used for alignment instead of the raw reads)
    trimmed_fqf1 = temp_dir + '/{0}'.format(Mtb_genome_tag) + '-trimmed_1.fastq'
    trimmed_fqf2 = temp_dir + '/{0}'.format(Mtb_genome_tag) + '-trimmed_2.fastq'

    #alignment files
    samfile = temp_dir + '/{0}.sam'.format(Mtb_genome_tag)
    bamfile = temp_dir + '/{0}.sorted.bam'.format(Mtb_genome_tag)
    #built explicitly rather than with str.replace(".bam", ...), which would also
    #rewrite any ".bam" that happens to occur earlier in the path or in the tag
    drbamfile = temp_dir + '/{0}.sorted.duprem.bam'.format(Mtb_genome_tag)
    dup_metrics = drbamfile[:-4] + '.metrics'

    #variable for pilon output path
    out_pilon_dir = out_dir + '/pilon/'

    return [
        #change directory to the per-sample working directory
        'cd ' + assembly_dir_with_files,

        ### Load Necessary Modules ###
        'module load perl/5.24.0',
        'module load java/jdk-1.8u112',
        'module load bwa/0.7.15',
        'module load samtools/1.3.1',
        'module load bcftools/1.3.1',
        'module load picard/2.8.0',

        ### Copy H37Rv into the RefGen folder and create its index files ###
        'cp /n/data1/hms/dbmi/farhat/bin/work-horse/bin/h37rv.fasta RefGen/TBRefGen.fasta',
        'cd RefGen',
        'samtools faidx TBRefGen.fasta',
        'bwa index TBRefGen.fasta',
        'cd ..',

        ### PRINSEQ (trim reads) ###
        'mkdir ' + out_dir + '/prinseq',
        'perl /n/data1/hms/dbmi/farhat/bin/prinseq-lite-0.20.4/prinseq-lite.pl -fastq {0} -fastq2 {1} -out_format 3 -out_good {2}/{3}-trimmed -out_bad null -log {4}/{3}-trimmed.log -min_qual_mean 20 -verbose'.format(fqf1, fqf2, temp_dir, Mtb_genome_tag, out_dir + '/prinseq'),

        ### BWA (align trimmed reads to reference) ###
        'bwa mem -M {3} {0} {1} > {2}'.format(trimmed_fqf1, trimmed_fqf2, samfile, RefGen),

        ### PICARD (sort & convert to BAM) ###
        'java -Xmx16G -jar /n/data1/hms/dbmi/farhat/bin/picard/picard/build/libs/picard.jar SortSam INPUT={0} OUTPUT={1} SORT_ORDER=coordinate'.format(samfile, bamfile),

        ### PICARD (remove duplicates) ###
        "java -Xmx32G -jar /n/data1/hms/dbmi/farhat/bin/picard/picard/build/libs/picard.jar MarkDuplicates I={0} O={1} REMOVE_DUPLICATES=true M={2} ASSUME_SORT_ORDER=coordinate".format(bamfile, drbamfile, dup_metrics),

        ### SAMTOOLS (index the duplicate-removed BAM file) ###
        "samtools index {0}".format(drbamfile),

        ### QUALIMAP (quality of BAM file) ###
        'cd ' + out_dir,
        'mkdir QualiMap', #make a folder for the QualiMap report in output directory
        'unset DISPLAY', #unset JAVA virtual machine variable [http://qualimap.bioinfo.cipf.es/doc_html/faq.html]
        "/n/data1/hms/dbmi/farhat/bin/qualimap_v2.2.1/qualimap bamqc -bam {0} --outdir {1} --outfile {2}.pdf --outformat PDF".format(drbamfile, out_dir + '/QualiMap', Mtb_genome_tag + '_stats'),

        ### PILON (call variants) ###
        'mkdir pilon', #make a folder for pilon output in output directory (cwd is out_dir here)
        'java -Xmx32G -jar /n/data1/hms/dbmi/farhat/bin/pilon/pilon-1.22.jar --genome {0} --bam {1} --output {2} --outdir {3} --variant'.format(RefGen, drbamfile, Mtb_genome_tag, out_pilon_dir),

        ### DELETE UNNECESSARY FILES ###
        #only the copied reference (and its index files) is removed; the directory of
        #intermediary files (temp_dir) and the simulated fastq files are left in place
        'rm -rf {}'.format(RefGen_dir),

        #delete PASS CALLS (supporting Reference) and AMBIGUOUS CALLS from VCF file that came from Pilon variant calling
        'python /n/data1/hms/dbmi/farhat/Roger/inhost_TB_dynamics_project/python_scripts/reduce-pilon-vcf-size.py ' + out_pilon_dir + Mtb_genome_tag + '.vcf',
    ]


def Launch_SmallPipe_Altered_assembly(Mtb_genome_tag):
    """Submit one SLURM job that trims, aligns and variant-calls the simulated reads of a sample.

    The function (re)creates the per-sample directory tree under the
    ``process_sim_reads_and_call_variants`` folder (any previous contents for this
    tag are deleted), assembles the shell script for the job
    (prinseq -> bwa mem -> picard SortSam / MarkDuplicates -> samtools index ->
    qualimap -> pilon -> VCF size reduction) and submits it through ``slurmpy``.

    Side effects: changes the current working directory of the calling process
    (it ends up in the sample's ``O2_SLURM_logs`` directory) and prints
    ``"<job name> : <job id>"``.

    Parameters
    ----------
    Mtb_genome_tag : str
        Sample tag. Selects the simulated fastq pair
        ``<tag>/<tag>_R1.fastq`` / ``<tag>/<tag>_R2.fastq`` and names every output.

    Returns
    -------
    The value returned by ``Slurm.run`` for the submitted job (the same value
    that is printed), so that callers can keep track of their submissions.
    """
    ###############################################################################################################
    ###################################### Create Directories & Get File Paths ####################################
    ###############################################################################################################

    #paths & names for simulated fastq files
    sim_fastq_dir = '/n/data1/hms/dbmi/farhat/Roger/homoplasy_project/HT_SSR_recall_sims/simulated_fastq_files_from_InSilicoSeq_for_altered_seq/' + Mtb_genome_tag + '/'
    fqf1 = sim_fastq_dir + Mtb_genome_tag + '_R1.fastq'
    fqf2 = sim_fastq_dir + Mtb_genome_tag + '_R2.fastq'

    assembly_dir = '/n/data1/hms/dbmi/farhat/Roger/homoplasy_project/HT_SSR_recall_sims/process_sim_reads_and_call_variants/'
    #NOTE(review): presumably this keeps the process from sitting inside a
    #directory that is about to be deleted when a tag is re-run - confirm
    os.chdir(assembly_dir)

    #directory where intermediary files and output will be stored
    assembly_dir_with_files = _recreate_directory(assembly_dir + Mtb_genome_tag)

    #directory to store Reference Genome and corresponding Index Files
    RefGen_dir = _recreate_directory(assembly_dir_with_files + '/' + 'RefGen')

    #directory to store pilon VCF, prinseq & Qualimap logs
    out_dir = _recreate_directory(assembly_dir_with_files + '/' + 'SmPipe_output_ALT_assembly')

    #directory to store intermediary files (trimmed fastq, SAM, sorted BAM, sorted BAM w/ duplicates removed)
    temp_dir = _recreate_directory(assembly_dir_with_files + '/' + 'temp_for_intermediary_steps')

    #directory to store O2 SLURM logs
    SLURM_log_dir = _recreate_directory(out_dir + '/' + 'O2_SLURM_logs')

    ###############################################################################################################
    ######################################## Construct job to submit to O2 ########################################
    ###############################################################################################################

    commands_list = _build_smallpipe_commands(Mtb_genome_tag, fqf1, fqf2, assembly_dir_with_files, RefGen_dir, out_dir, temp_dir)

    #all commands in a single string to be submitted as a job (every command is preceded by a newline)
    SmallPipe_job = ''.join('\n' + command_i for command_i in commands_list)

    ###############################################################################################################
    ######################################## SUBMIT the job to O2 #################################################
    ###############################################################################################################

    #directory where you want output + error files
    os.chdir(SLURM_log_dir)

    job_name = 'alt_' + str(Mtb_genome_tag)

    #make sure to set run-time much shorter since we're only dealing with 80x coverage
    s = Slurm(job_name , {'partition':'short', 'account':'farhat', 'N':'1' , 'n':'1' , 't':'0-3:00:00' , 'mem':'32G' , 'mail-type':'FAIL' , 'mail-user':'roger_vargas@g.harvard.edu'})

    #submits the job
    job_id = s.run(SmallPipe_job)

    #single parenthesised argument: prints identically under Python 2 and Python 3
    print(job_name + ' : ' + str(job_id))

    return job_id

In [6]:
# Tags of the assemblies to process, one pipeline job each.
# Kept as a whitespace-separated block so tags can be added or removed one per token.
genome_assemblies = """
    RW-TB008
    N1274 N1272 N1202 N1177 N1176 N0155 N0153 N0145 N0091 N0072 N0054 N0004
    M0017522_5 M0016737_0 M0016395_7 M0014888_3 M0011368_9 M0010874_7 M0003941_3
    DNA120 DNA091 DNA086 DNA075 DNA044 DNA020 DNA019_Rose
    AZE_02_042
    02_R1896 02_R1708 02_R1179 02_R0894 01_R1430
""".split()

In [7]:
# Sanity check: how many assemblies (and therefore jobs) are about to be submitted.
len(genome_assemblies)

33

In [8]:
# Submit one SLURM job per assembly; each call prints "<job name> : <job id>".
for Mtb_genome_tag in genome_assemblies:
    
    Launch_SmallPipe_Altered_assembly(Mtb_genome_tag)

submitted: Submitted batch job 61896882
submitted: Submitted batch job 61896883


alt_RW-TB008 : 61896882
alt_N1274 : 61896883


submitted: Submitted batch job 61896884
submitted: Submitted batch job 61896885


alt_N1272 : 61896884
alt_N1202 : 61896885


submitted: Submitted batch job 61896886
submitted: Submitted batch job 61896887


alt_N1177 : 61896886
alt_N1176 : 61896887


submitted: Submitted batch job 61896888
submitted: Submitted batch job 61896889


alt_N0155 : 61896888
alt_N0153 : 61896889


submitted: Submitted batch job 61896890
submitted: Submitted batch job 61896891


alt_N0145 : 61896890
alt_N0091 : 61896891


submitted: Submitted batch job 61896892
submitted: Submitted batch job 61896893


alt_N0072 : 61896892
alt_N0054 : 61896893


submitted: Submitted batch job 61896894
submitted: Submitted batch job 61896895
submitted: Submitted batch job 61896896


alt_N0004 : 61896894
alt_M0017522_5 : 61896895
alt_M0016737_0 : 61896896


submitted: Submitted batch job 61896897
submitted: Submitted batch job 61896898
submitted: Submitted batch job 61896899


alt_M0016395_7 : 61896897
alt_M0014888_3 : 61896898
alt_M0011368_9 : 61896899


submitted: Submitted batch job 61896900
submitted: Submitted batch job 61896901


alt_M0010874_7 : 61896900
alt_M0003941_3 : 61896901


submitted: Submitted batch job 61896902
submitted: Submitted batch job 61896903


alt_DNA120 : 61896902
alt_DNA091 : 61896903


submitted: Submitted batch job 61896904
submitted: Submitted batch job 61896905


alt_DNA086 : 61896904
alt_DNA075 : 61896905


submitted: Submitted batch job 61896906
submitted: Submitted batch job 61896907


alt_DNA044 : 61896906
alt_DNA020 : 61896907


submitted: Submitted batch job 61896908


alt_DNA019_Rose : 61896908


submitted: Submitted batch job 61896909
submitted: Submitted batch job 61896910


alt_AZE_02_042 : 61896909
alt_02_R1896 : 61896910


submitted: Submitted batch job 61896911


alt_02_R1708 : 61896911


submitted: Submitted batch job 61896912
submitted: Submitted batch job 61896914


alt_02_R1179 : 61896912
alt_02_R0894 : 61896914
alt_01_R1430 : 61896915


submitted: Submitted batch job 61896915
