#### This notebook builds (and submits jobs for) JankyPipe, a pipeline that takes the fastq files of *Mycobacterium tuberculosis* isolates as input, aligns the reads to H37Rv and calls variants. The output is a VCF file, a lineage call and a Qualimap report. This notebook also submits jobs that run JankyPipe on all of the *Longitudinal* isolates in our study.

In [2]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
%matplotlib inline

In [2]:
import os
import pandas as pd
import numpy as np
from slurmpy import Slurm
import vcf
import shutil

### *Function* to launch JankyPipe as a Job

In [3]:
def Launch_JankyPipe(fqf1 , fqf2 , tag , output_dir , scratch_dir , O2_SLURM_logs_dir):
    
    '''
    Build the JankyPipe shell script for one isolate and submit it as a SLURM job.

    The generated script calls variants for the input fastq files against H37Rv
    using a number of packages. The important output (VCF, lineage info files, quality report,
    trimming log) is stored in the output directory while the intermediary files
    (unzipped/trimmed fastqs, SAM, BAMs, reference index, VRT) are stored in the scratch directory.

    Parameters
    ----------
    fqf1 , fqf2 : str
        Paths to the two gzipped fastq files of the isolate. The last 9 characters of each
        file name are dropped to name the unzipped copy, i.e. a '.fastq.gz' suffix is assumed.
    tag : str
        Isolate ID; used to name the intermediate/output files and as the SLURM job name.
    output_dir : str
        Directory for the final results. The job script only creates sub-folders in it
        (prinseq, QualiMap, pilon, fast-lineage-caller), so it should exist beforehand.
    scratch_dir : str
        Directory for intermediary files. The job script starts by cd-ing into it,
        so it should exist beforehand.
    O2_SLURM_logs_dir : str
        Directory this process changes into right before the job is submitted.

    Returns
    -------
    None. The tag and the job id returned by the submission are printed.

    Side effects
    ------------
    Changes the current working directory of the calling process to O2_SLURM_logs_dir
    (it is not restored) and submits one SLURM job.
    '''
    
    #store all commands in a list
    commands_list = []
    
    #change directory to scratch
    commands_list.append( 'cd ' + scratch_dir )

    ###################################
    ### Load Necessary Modules ########
    ###################################

    #load perl
    commands_list.append( 'module load perl/5.24.0' )

    #load java
    commands_list.append( 'module load java/jdk-1.8u112' )

    #load BWA
    commands_list.append( 'module load bwa/0.7.15' )

    #load Samtools
    commands_list.append( 'module load samtools/1.3.1' )

    #load BCFtools
    commands_list.append( 'module load bcftools/1.3.1' )

    #load Picard
    commands_list.append( 'module load picard/2.8.0' )

    #create a folder (inside scratch) that will hold the reference genome and its index files
    commands_list.append( 'mkdir RefGen' )

    #copy reference genome over to RefGen folder
    commands_list.append( 'cp /home/rv76/Farhat_Lab/Reference_Seqs/H37Rv/h37rv.fasta RefGen/TBRefGen.fasta' )

    #change directory to RefGen folder
    commands_list.append( 'cd RefGen' )

    ###################################
    ### Create Index Files for H37Rv ##
    ###################################
    commands_list.append( 'samtools faidx TBRefGen.fasta' )
    commands_list.append( 'bwa index TBRefGen.fasta' )

    RefGen = scratch_dir + '/RefGen/TBRefGen.fasta' #H37Rv reference

    #go back to parent directory
    commands_list.append( 'cd ..' )

    ###################################
    ### UnZip FastQ files #############
    ###################################

    #file name without its directory and without the trailing 9 characters ('.fastq.gz')
    fqf1_base_name = fqf1.split('/')[-1][0:-9]
    fqf2_base_name = fqf2.split('/')[-1][0:-9]

    #work with the unzipped files for the rest of the pipeline (after unzipping them)
    fqf1_unzipped = scratch_dir + '/{}'.format(fqf1_base_name) + '.fastq'
    fqf2_unzipped = scratch_dir + '/{}'.format(fqf2_base_name) + '.fastq'

    commands_list.append( 'zcat {0} > {1}'.format(fqf1, fqf1_unzipped) )
    commands_list.append( 'zcat {0} > {1}'.format(fqf2, fqf2_unzipped) )

    #use the unzipped fastq files now
    fqf1 = fqf1_unzipped
    fqf2 = fqf2_unzipped

    ####################################
    ### PRINSEQ (trim reads) ##########
    ###################################

    #create directory for prinseq (trimming log) in output directory
    commands_list.append( 'mkdir ' + output_dir + '/prinseq' )

    commands_list.append( 'perl /n/data1/hms/dbmi/farhat/bin/prinseq-lite-0.20.4/prinseq-lite.pl -fastq {0} -fastq2 {1} -out_format 3 -out_good {2}/{3}-trimmed -out_bad null -log {4}/{3}-trimmed.log -min_qual_mean 20 -verbose'.format(fqf1, fqf2, scratch_dir, tag , output_dir+'/prinseq') )

    #use newly trimmed fastq files now
    fqf1 = scratch_dir + '/{}'.format(tag) + '-trimmed_1.fastq'
    fqf2 = scratch_dir + '/{}'.format(tag) + '-trimmed_2.fastq'

    ######################################
    ### BWA (align reads to reference) ###
    ######################################

    #create SAM file
    samfile = scratch_dir + '/{}.sam'.format(tag)

    #run BWA
    commands_list.append( 'bwa mem -M {3} {0} {1} > {2}'.format(fqf1 , fqf2 , samfile , RefGen) )

    #####################################
    ### PICARD (sort & convert to BAM) ##
    #####################################

    #create BAM file
    bamfile = scratch_dir + '/{0}.sorted.bam'.format(tag)

    commands_list.append( 'java -Xmx16G -jar /n/data1/hms/dbmi/farhat/bin/picard/picard/build/libs/picard.jar SortSam INPUT={0} OUTPUT={1} SORT_ORDER=coordinate'.format(samfile, bamfile) )

    ####################################
    ### PICARD (remove duplicates) ####
    ###################################

    #create BAM file with removed duplicates
    #(only the trailing '.bam' extension is swapped; str.replace would also rewrite any
    # other '.bam' occurring earlier in the scratch path or in the tag)
    drbamfile = bamfile[:-len('.bam')] + '.duprem.bam'

    #remove duplicates from BAM file
    commands_list.append( "java -Xmx32G -jar /n/data1/hms/dbmi/farhat/bin/picard/picard/build/libs/picard.jar MarkDuplicates I={0} O={1} REMOVE_DUPLICATES=true M={2} ASSUME_SORT_ORDER=coordinate".format(bamfile, drbamfile, drbamfile[:-4]+'.metrics') )

    ####################################
    ### SAMTOOLS (to index BAM file) ###
    ####################################
    
    commands_list.append( "samtools index {0}".format(drbamfile) )
    
    ######################################
    ### QUALIMAP (quality of BAM file) ###
    ######################################
    
    #store quality report, pilon VCF & lineage call information all in Output directory
    commands_list.append( 'cd ' + output_dir )
    commands_list.append( 'mkdir QualiMap' ) #make a folder for QualiMap output in output directory
    commands_list.append( 'unset DISPLAY' ) #unset JAVA virtual machine variable [http://qualimap.bioinfo.cipf.es/doc_html/faq.html]
    commands_list.append( "/n/data1/hms/dbmi/farhat/bin/qualimap_v2.2.1/qualimap bamqc -bam {0} --outdir {1} --outfile {2}.pdf --outformat PDF".format(drbamfile, output_dir+'/QualiMap', tag+'_stats') )

    ###################################
    ### PILON (call variants) #########
    ###################################
    
    #the job is still inside the output directory at this point, so 'pilon' is created there
    commands_list.append( 'mkdir pilon' ) #make a folder for pilon output in output directory
    out_pilon_dir = output_dir + '/pilon/' #variable for pilon output path

    commands_list.append( 'java -Xmx32G -jar /n/data1/hms/dbmi/farhat/bin/pilon/pilon-1.22.jar --genome {0} --bam {1} --output {2} --outdir {3} --variant'.format(RefGen, drbamfile, tag, out_pilon_dir) )

    #####################################
    ### Luca's LINEAGE CALLING script ###
    #####################################

    #create directories 
    commands_list.append( 'mkdir ' + scratch_dir + '/fast-lineage-caller/' )#make a folder for lineage call in scratch directory
    commands_list.append( 'mkdir ' + output_dir + '/fast-lineage-caller/' )#make a folder for lineage call in output directory

    #create VRT file
    vrtfile = scratch_dir + '/fast-lineage-caller/{}.vrt'.format(tag)

    commands_list.append( 'cd ' + scratch_dir + '/fast-lineage-caller' )#change directory to store output in scratch

    #convert VCF to VRT
    commands_list.append( 'vrtTools-vcf2vrt.py {0} {1} 1'.format(out_pilon_dir+tag+'.vcf', vrtfile) )

    #call lineage with SNP database and VRT file
    commands_list.append( 'cd ' + output_dir + '/fast-lineage-caller' )#change directory so lineage_call.txt lands in the output directory

    commands_list.append( 'FastLineageCaller-assign2lineage.py /home/rv76/Bio_Pipelines/fast-lineage-caller-master/example/db_snps.tsv ' + vrtfile + ' &> ' + 'lineage_call.txt' )

    ###############################################################################################################
    ######################################## SUBMIT as a job to O2 ################################################
    ###############################################################################################################
    
    #append all commands in a single string to be submitted as a job
    #(every command, including the first one, is preceded by a newline)
    JankyPipe_job = ''.join( '\n' + command_i for command_i in commands_list )
    
    #directory where you want output + error files
    #NOTE(review): presumably slurmpy writes its script/log files relative to the working directory - confirm
    os.chdir(O2_SLURM_logs_dir)

    job_name = tag

    s = Slurm(job_name , {'partition':'short' , 'n':'1' , 't':'0-6:00:00' , 'mem-per-cpu':'36G' , 'mail-type':'FAIL' , 'mail-user':'roger_vargas@g.harvard.edu'})

    #submits the job
    job_id = s.run(JankyPipe_job)

    #parenthesised single-argument print behaves the same on Python 2 and Python 3
    print(job_name  + ' : ' +  str(job_id))

### Longitudinal Samples

Pull all relevant sequenced isolates and their corresponding FastQ file paths

In [4]:
#read the isolate annotation table, then index its rows by patient
sample_annotation = pd.read_csv(
    '/n/data1/hms/dbmi/farhat/Roger/inhost_TB_dynamics_project/CSV_files/sample_annotation_files/cetr_casali_walker_trauner_witney_xu_guerra_bryant_fastq_path_names.csv',
    sep=',',
)
sample_annotation = sample_annotation.set_index('patient_id')

In [5]:
#peek at the first two annotated isolates
sample_annotation.head(2)

Unnamed: 0_level_0,fastq_files,population,run_ID,sample_ID,sample_order
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
I0005973-8,/n/data1/hms/dbmi/farhat/cetr_strains/good_wgs...,CETR,,Peru3062,1
I0005973-8,/n/data1/hms/dbmi/farhat/cetr_strains/good_wgs...,CETR,,Peru3315,2


In [6]:
#(number of rows, number of columns) of the annotation table
sample_annotation.shape

(614, 5)

### Create directories for each isolate and launch JankyPipe

IMPORTANT PARENT DIRECTORIES 

-   /n/scratch2/rv76/inhost_TB_dynamics_project/JankyPipe/intermediary_files/

    [to store intermediate files (unzipped fastq, trimmed fastq, SAM, sorted BAM, etc)]


-   /n/data1/hms/dbmi/farhat/Roger/inhost_TB_dynamics_project/JankyPipe/output/

    [to store final files (pilon VCF, lineage, QualiMap, trim logs)]


-   /n/data1/hms/dbmi/farhat/Roger/inhost_TB_dynamics_project/JankyPipe/O2_SLURM_logs/

    [to store submitted SLURM script, and SLURM error & verbose logs]

In [None]:
#WARNING: for every isolate this wipes any existing output / scratch / log folder before submitting a new job
for isolate_i in range(len(sample_annotation)):

    #first column holds the two fastq paths of the isolate, separated by ';'
    isolate_fastq_paths = sample_annotation.iloc[isolate_i , 0]
    paired_paths = isolate_fastq_paths.split(';')
    fqf1 = paired_paths[0]
    fqf2 = paired_paths[1]

    #tag ID = fastq file name up to its first underscore (same as ID for fastq files)
    tag = fqf1.rsplit('/' , 1)[-1].split('_' , 1)[0]

    #where pilon VCF and lineage information will be stored [LAB FOLDER]
    output_dir = '/n/data1/hms/dbmi/farhat/Roger/inhost_TB_dynamics_project/JankyPipe/output/' + tag

    #where everything else happens (trimming, aligning, etc.) [SCRATCH FOLDER]
    scratch_dir = '/n/scratch2/rv76/inhost_TB_dynamics_project/JankyPipe/intermediary_files/' + tag

    #store O2 job log files [LAB FOLDER]
    O2_SLURM_logs_dir = '/n/data1/hms/dbmi/farhat/Roger/inhost_TB_dynamics_project/JankyPipe/O2_SLURM_logs/' + tag

    #start every folder from scratch: drop whatever is there already, then (re)create it empty
    for isolate_dir in (output_dir , scratch_dir , O2_SLURM_logs_dir):
        if os.path.exists(isolate_dir):
            shutil.rmtree(isolate_dir)
        os.makedirs(isolate_dir)

    #Launch JankyPipe after making necessary directories!!!
    Launch_JankyPipe(fqf1 , fqf2 , tag , output_dir , scratch_dir , O2_SLURM_logs_dir)

#### Save tags (each tag corresponds to a folder name)

In [7]:
#tag of each sample = name of its first fastq file (first ';'-separated path in column 0) up to the first underscore
tag_list = [
    fastq_paths.split(';')[0].split('/')[-1].split('_')[0]
    for fastq_paths in sample_annotation.iloc[: , 0]
]

sample_annotation['tag'] = tag_list

#store as CSV
sample_annotation.to_csv(
    '/n/data1/hms/dbmi/farhat/Roger/inhost_TB_dynamics_project/CSV_files/sample_annotation_files/cetr_casali_walker_trauner_witney_xu_guerra_bryant_fastq_path_names_and_JankyPipe_tags.csv',
    sep=',',
)

In [8]:
#peek at the first two isolates, now with their tag column
sample_annotation.head(2)

Unnamed: 0_level_0,fastq_files,population,run_ID,sample_ID,sample_order,tag
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
I0005973-8,/n/data1/hms/dbmi/farhat/cetr_strains/good_wgs...,CETR,,Peru3062,1,Peru3062
I0005973-8,/n/data1/hms/dbmi/farhat/cetr_strains/good_wgs...,CETR,,Peru3315,2,Peru3315


### Determine if jobs ran successfully or not

In [9]:
successful_run = []

#walk over the tag values themselves (in row order); looking them up with an integer key
#(sample_annotation.tag[i]) on the string-labelled patient_id index relies on a positional
#fallback that pandas has deprecated
for tag in sample_annotation['tag']:

    #where pilon VCF and lineage information will be stored [LAB FOLDER]
    output_dir = '/n/data1/hms/dbmi/farhat/Roger/inhost_TB_dynamics_project/JankyPipe/output/' + tag

    #check to see 'Lineage Call' folder exists in the output directory (made by one of the last commands in JankyPipe)
    #NOTE(review): the folder is created before the lineage caller itself runs, so its presence shows the job
    #reached that step but does not prove that lineage_call.txt was written
    if os.path.exists(output_dir + '/fast-lineage-caller/'):
        successful_run.append('yes')
    else:
        successful_run.append('no')

sample_annotation['successful_run'] = successful_run

In [10]:
#isolates whose job did not get as far as the lineage-call folder
sample_annotation.loc[sample_annotation['successful_run'] == 'no']

Unnamed: 0_level_0,fastq_files,population,run_ID,sample_ID,sample_order,tag,successful_run
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
KPS_1,/n/data1/hms/dbmi/farhat/fastq_db/guerra/ERR18...,GUERRA,ERR182021,,1,ERR182021,no
KPS_1,/n/data1/hms/dbmi/farhat/fastq_db/guerra/ERR18...,GUERRA,ERR181711,,2,ERR181711,no
KPS_2,/n/data1/hms/dbmi/farhat/fastq_db/guerra/ERR21...,GUERRA,ERR212067,,1,ERR212067,no
KPS_2,/n/data1/hms/dbmi/farhat/fastq_db/guerra/ERR77...,GUERRA,ERR773785,,2,ERR773785,no
KPS_3,/n/data1/hms/dbmi/farhat/fastq_db/guerra/ERR21...,GUERRA,ERR212061,,1,ERR212061,no
KPS_3,/n/data1/hms/dbmi/farhat/fastq_db/guerra/ERR21...,GUERRA,ERR212068,,2,ERR212068,no
KPS_4,/n/data1/hms/dbmi/farhat/fastq_db/guerra/ERR19...,GUERRA,ERR190350,,1,ERR190350,no
KPS_4,/n/data1/hms/dbmi/farhat/fastq_db/guerra/ERR16...,GUERRA,ERR163987,,2,ERR163987,no
KPS_5,/n/data1/hms/dbmi/farhat/fastq_db/guerra/ERR03...,GUERRA,ERR037522,,1,ERR037522,no
KPS_5,/n/data1/hms/dbmi/farhat/fastq_db/guerra/ERR21...,GUERRA,ERR212078,,2,ERR212078,no


### Re-run the isolates that hit the run-time limit through the pipeline

In [11]:
#isolates lacking a lineage-call directory did not make it through the whole pipeline
sample_annotation_ReRun = sample_annotation.loc[sample_annotation['successful_run'] == 'no']

for isolate_i in range(len(sample_annotation_ReRun)):

    #first column holds the two fastq paths of the isolate, separated by ';'
    isolate_fastq_paths = sample_annotation_ReRun.iloc[isolate_i , 0]
    paired_paths = isolate_fastq_paths.split(';')
    fqf1 = paired_paths[0]
    fqf2 = paired_paths[1]

    #tag ID = fastq file name up to its first underscore (same as ID for fastq files)
    tag = fqf1.rsplit('/' , 1)[-1].split('_' , 1)[0]

    #where pilon VCF and lineage information will be stored [LAB FOLDER]
    output_dir = '/n/data1/hms/dbmi/farhat/Roger/inhost_TB_dynamics_project/JankyPipe/output/' + tag

    #where everything else happens (trimming, aligning, etc.) [SCRATCH FOLDER]
    scratch_dir = '/n/scratch2/rv76/inhost_TB_dynamics_project/JankyPipe/intermediary_files/' + tag

    #store O2 job log files [LAB FOLDER]
    O2_SLURM_logs_dir = '/n/data1/hms/dbmi/farhat/Roger/inhost_TB_dynamics_project/JankyPipe/O2_SLURM_logs/' + tag

    #a folder left over from the earlier attempt is emptied by deleting and recreating it;
    #a folder that is missing is simply created
    for isolate_dir in (output_dir , scratch_dir , O2_SLURM_logs_dir):
        if os.path.exists(isolate_dir):
            shutil.rmtree(isolate_dir)
        os.makedirs(isolate_dir)

    #Launch JankyPipe after making necessary directories!!!
    Launch_JankyPipe(fqf1 , fqf2 , tag , output_dir , scratch_dir , O2_SLURM_logs_dir)

submitted: Submitted batch job 31784984
submitted: Submitted batch job 31784986
submitted: Submitted batch job 31784988
submitted: Submitted batch job 31784990


ERR182021 : 31784984
ERR181711 : 31784986
ERR212067 : 31784988
ERR773785 : 31784990


submitted: Submitted batch job 31784992
submitted: Submitted batch job 31784994
submitted: Submitted batch job 31784996
submitted: Submitted batch job 31784998


ERR212061 : 31784992
ERR212068 : 31784994
ERR190350 : 31784996
ERR163987 : 31784998


submitted: Submitted batch job 31785000


ERR037522 : 31785000


submitted: Submitted batch job 31785002
submitted: Submitted batch job 31785004


ERR212078 : 31785002
ERR036217 : 31785004


submitted: Submitted batch job 31785006


ERR176580 : 31785006


submitted: Submitted batch job 31785008


ERR161165 : 31785008


submitted: Submitted batch job 31785013


ERR161075 : 31785013


submitted: Submitted batch job 31785016


ERR245837 : 31785016


submitted: Submitted batch job 31785018


ERR181941 : 31785018


submitted: Submitted batch job 31785021
submitted: Submitted batch job 31785023
submitted: Submitted batch job 31785025
submitted: Submitted batch job 31785027


ERR245836 : 31785021
ERR212054 : 31785023
ERR036189 : 31785025
ERR163985 : 31785027


submitted: Submitted batch job 31785029
submitted: Submitted batch job 31785031
submitted: Submitted batch job 31785033
submitted: Submitted batch job 31785035


ERR216977 : 31785029
ERR473347 : 31785031
ERR036219 : 31785033
ERR245847 : 31785035


submitted: Submitted batch job 31785037
submitted: Submitted batch job 31785039


ERR212075 : 31785037
ERR221577 : 31785039


submitted: Submitted batch job 31785041
submitted: Submitted batch job 31785043
submitted: Submitted batch job 31785045
submitted: Submitted batch job 31785047


ERR190357 : 31785041
ERR176704 : 31785043
ERR221552 : 31785045
ERR161092 : 31785047


submitted: Submitted batch job 31785048
submitted: Submitted batch job 31785050
submitted: Submitted batch job 31785052
submitted: Submitted batch job 31785054


ERR212065 : 31785048
ERR212070 : 31785050
ERR245671 : 31785052
ERR181736 : 31785054


submitted: Submitted batch job 31785056
submitted: Submitted batch job 31785058


ERR037517 : 31785056
ERR181818 : 31785058


submitted: Submitted batch job 31785062
submitted: Submitted batch job 31785064
submitted: Submitted batch job 31785065
submitted: Submitted batch job 31785067
submitted:

ERR176552 : 31785062
ERR212003 : 31785064
ERR163986 : 31785065
ERR182014 : 31785067


 Submitted batch job 31785069
submitted: Submitted batch job 31785071
submitted: Submitted batch job 31785073
submitted: Submitted batch job 31785075
submitted: Submitted batch job 31785077


ERR181993 : 31785069
ERR212062 : 31785071
ERR163961 : 31785073
ERR181751 : 31785075
ERR212017 : 31785077


submitted: Submitted batch job 31785079
submitted: Submitted batch job 31785081
submitted: Submitted batch job 31785083


ERR245805 : 31785079
ERR245678 : 31785081
ERR216939 : 31785083


submitted: Submitted batch job 31785085
submitted: Submitted batch job 31785087
submitted: Submitted batch job 31785089
submitted: Submitted batch job 31785091


ERR181935 : 31785085
ERR245725 : 31785087
ERR126604 : 31785089
ERR212142 : 31785091


submitted: Submitted batch job 31785093
submitted: Submitted batch job 31785095
submitted: Submitted batch job 31785097
submitted: Submitted batch job 31785099


ERR245676 : 31785093
ERR181800 : 31785095
ERR161063 : 31785097
ERR181910 : 31785099


submitted: Submitted batch job 31785101
submitted: Submitted batch job 31785103
submitted: Submitted batch job 31785105
submitted: Submitted batch job 31785107


ERR176461 : 31785101
ERR163962 : 31785103
ERR176598 : 31785105
ERR126598 : 31785107


submitted: Submitted batch job 31785109
submitted: Submitted batch job 31785111


ERR176735 : 31785109
ERR773799 : 31785111


submitted: Submitted batch job 31785113
submitted: Submitted batch job 31785114
submitted: Submitted batch job 31785115
submitted: Submitted batch job 31785116


ERR212098 : 31785113
ERR163989 : 31785114
ERR176530 : 31785115
ERR323034 : 31785116


submitted: Submitted batch job 31785117
submitted: Submitted batch job 31785118
submitted: Submitted batch job 31785119
submitted: Submitted batch job 31785120
submitted: Submitted batch job 31785121


ERR212063 : 31785117
ERR245729 : 31785118
ERR036195 : 31785119
ERR037482 : 31785120
ERR176575 : 31785121


submitted: Submitted batch job 31785122
submitted: Submitted batch job 31785123
submitted: Submitted batch job 31785124
submitted: Submitted batch job 31785125
submitted: Submitted batch job 31785126


ERR176646 : 31785122
ERR161197 : 31785123
ERR221556 : 31785124
ERR245652 : 31785125
ERR182054 : 31785126


submitted: Submitted batch job 31785127
submitted: Submitted batch job 31785128
submitted: Submitted batch job 31785129
submitted: Submitted batch job 31785130


ERR037467 : 31785127
ERR473284 : 31785128
ERR245804 : 31785129
ERR181742 : 31785130


submitted: Submitted batch job 31785131
submitted: Submitted batch job 31785132
submitted: Submitted batch job 31785133


ERR176764 : 31785131
ERR176535 : 31785132
ERR036248 : 31785133


submitted: Submitted batch job 31785134
submitted: Submitted batch job 31785135
submitted: Submitted batch job 31785136
submitted: Submitted batch job 31785137
submitted: Submitted batch job 31785138


ERR221535 : 31785134
ERR211990 : 31785135
ERR181932 : 31785136
ERR036191 : 31785137
ERR176761 : 31785138


submitted: Submitted batch job 31785139
submitted: Submitted batch job 31785140
submitted: Submitted batch job 31785141
submitted: Submitted batch job 31785142
submitted: Submitted batch job 31785143


ERR212025 : 31785139
ERR212053 : 31785140
ERR176649 : 31785141
ERR190334 : 31785142
ERR736807 : 31785143


submitted: Submitted batch job 31785144
submitted: Submitted batch job 31785145
submitted: Submitted batch job 31785146
submitted: Submitted batch job 31785147
submitted: Submitted batch job 31785148


ERR736820 : 31785144
ERR245667 : 31785145
ERR161074 : 31785146
ERR176731 : 31785147
ERR176657 : 31785148


submitted: Submitted batch job 31785149
submitted: Submitted batch job 31785150
submitted: Submitted batch job 31785151


ERR181877 : 31785149
ERR245654 : 31785150
ERR176605 : 31785151


submitted: Submitted batch job 31785152
submitted: Submitted batch job 31785153
submitted: Submitted batch job 31785154
submitted: Submitted batch job 31785155


ERR176513 : 31785152
ERR037483 : 31785153
ERR212030 : 31785154
ERR221586 : 31785155


submitted: Submitted batch job 31785156
submitted: Submitted batch job 31785157
submitted: Submitted batch job 31785159
submitted: Submitted batch job 31785160
submitted: Submitted batch job 31785161


ERR221590 : 31785156
ERR245796 : 31785157
ERR181864 : 31785159
ERR323044 : 31785160
ERR181976 : 31785161


submitted: Submitted batch job 31785162
submitted: Submitted batch job 31785163
submitted: Submitted batch job 31785164
submitted: Submitted batch job 31785165
submitted: Submitted batch job 31785166


ERR245774 : 31785162
ERR181868 : 31785163
ERR245698 : 31785164
ERR126602 : 31785165
ERR182020 : 31785166


submitted: Submitted batch job 31785167
submitted: Submitted batch job 31785168
submitted: Submitted batch job 31785169


ERR181735 : 31785167
ERR164014 : 31785168
ERR473286 : 31785169


submitted: Submitted batch job 31785170
submitted: Submitted batch job 31785171
submitted: Submitted batch job 31785172
submitted: Submitted batch job 31785173


ERR176585 : 31785170
ERR190380 : 31785171
ERR036233 : 31785172
ERR176757 : 31785173


submitted: Submitted batch job 31785174
submitted: Submitted batch job 31785175
submitted: Submitted batch job 31785176
submitted: Submitted batch job 31785177
submitted: Submitted batch job 31785178


ERR245702 : 31785174
ERR181854 : 31785175
ERR176729 : 31785176
ERR181857 : 31785177
ERR176638 : 31785178


submitted: Submitted batch job 31785179
submitted: Submitted batch job 31785180
submitted: Submitted batch job 31785181
submitted: Submitted batch job 31785182
submitted: Submitted batch job 31785183


ERR181965 : 31785179
ERR161198 : 31785180
ERR161099 : 31785181
ERR161106 : 31785182
ERR181971 : 31785183


submitted: Submitted batch job 31785184
submitted: Submitted batch job 31785185
submitted: Submitted batch job 31785186


ERR181680 : 31785184
ERR216962 : 31785185
ERR176472 : 31785186


submitted: Submitted batch job 31785187
submitted: Submitted batch job 31785188


ERR181875 : 31785187
ERR181867 : 31785188


submitted: Submitted batch job 31785189
submitted: Submitted batch job 31785190
submitted: Submitted batch job 31785191
submitted: Submitted batch job 31785192
submitted:

ERR176717 : 31785189
ERR176623 : 31785190
ERR176794 : 31785191
ERR176471 : 31785192


 Submitted batch job 31785193
submitted: Submitted batch job 31785194
submitted: Submitted batch job 31785195
submitted: Submitted batch job 31785196
submitted: Submitted batch job 31785197
submitted: Submitted batch job 31785198


ERR212007 : 31785193
ERR176787 : 31785194
ERR161155 : 31785195
ERR212090 : 31785196
ERR323049 : 31785197
ERR181956 : 31785198


submitted: Submitted batch job 31785199
submitted: Submitted batch job 31785200
submitted: Submitted batch job 31785201
submitted: Submitted batch job 31785202
submitted: Submitted batch job 31785203


ERR161043 : 31785199
ERR221543 : 31785200
ERR161117 : 31785201
ERR216899 : 31785202
ERR181940 : 31785203


submitted: Submitted batch job 31785204
submitted: Submitted batch job 31785205
submitted: Submitted batch job 31785206
submitted: Submitted batch job 31785207


ERR221551 : 31785204
ERR182011 : 31785205
ERR161069 : 31785206
ERR126634 : 31785207


submitted: Submitted batch job 31785208
submitted: Submitted batch job 31785209
submitted: Submitted batch job 31785210
submitted: Submitted batch job 31785211


ERR176457 : 31785208
ERR181776 : 31785209
ERR182056 : 31785210
ERR124649 : 31785211


submitted: Submitted batch job 31785212
submitted: Submitted batch job 31785213
submitted: Submitted batch job 31785214
submitted: Submitted batch job 31785215
submitted: Submitted batch job 31785216


ERR182044 : 31785212
ERR181715 : 31785213
ERR181713 : 31785214
ERR181852 : 31785215
ERR181827 : 31785216


submitted: Submitted batch job 31785218
submitted: Submitted batch job 31785219
submitted: Submitted batch job 31785220
submitted: Submitted batch job 31785221
submitted: Submitted batch job 31785222


ERR212156 : 31785218
ERR473307 : 31785219
ERR736847 : 31785220
ERR108127 : 31785221
ERR108128 : 31785222


submitted: Submitted batch job 31785223
submitted: Submitted batch job 31785224
submitted: Submitted batch job 31785225


ERR108155 : 31785223
ERR108156 : 31785224
ERR108157 : 31785225


submitted: Submitted batch job 31785226
submitted: Submitted batch job 31785227
submitted: Submitted batch job 31785228
submitted: Submitted batch job 31785229
submitted: Submitted batch job 31785230


ERR108158 : 31785226
ERR966620 : 31785227
ERR108168 : 31785228
ERR108171 : 31785229
ERR108172 : 31785230


submitted: Submitted batch job 31785231
submitted: Submitted batch job 31785232
submitted: Submitted batch job 31785233
submitted: Submitted batch job 31785234
submitted: Submitted batch job 31785235


ERR108173 : 31785231
ERR108174 : 31785232
ERR108159 : 31785233
ERR108160 : 31785234
ERR108161 : 31785235


submitted: Submitted batch job 31785236
submitted: Submitted batch job 31785237
submitted: Submitted batch job 31785238
submitted: Submitted batch job 31785239
submitted: Submitted batch job 31785240


ERR108162 : 31785236
ERR108143 : 31785237
ERR108144 : 31785238
ERR108175 : 31785239
ERR108176 : 31785240


submitted: Submitted batch job 31785241
submitted: Submitted batch job 31785242
submitted: Submitted batch job 31785243


ERR108179 : 31785241
ERR108180 : 31785242
ERR108185 : 31785243


submitted: Submitted batch job 31785244
submitted: Submitted batch job 31785245
submitted: Submitted batch job 31785246
submitted: Submitted batch job 31785247


ERR108186 : 31785244
ERR108133 : 31785245
ERR108134 : 31785246
ERR966621 : 31785247


submitted: Submitted batch job 31785248
submitted: Submitted batch job 31785249
submitted: Submitted batch job 31785250
submitted: Submitted batch job 31785251
submitted: Submitted batch job 31785252


ERR108140 : 31785248
ERR108164 : 31785249
ERR108163 : 31785250
ERR108177 : 31785251
ERR108178 : 31785252


submitted: Submitted batch job 31785253
submitted: Submitted batch job 31785254
submitted: Submitted batch job 31785255
submitted: Submitted batch job 31785256


ERR171166 : 31785253
ERR171167 : 31785254
ERR108149 : 31785255
ERR108150 : 31785256


submitted: Submitted batch job 31785257
submitted: Submitted batch job 31785258
submitted: Submitted batch job 31785259
submitted: Submitted batch job 31785260
submitted: Submitted batch job 31785261


ERR108137 : 31785257
ERR108138 : 31785258
ERR108181 : 31785259
ERR108182 : 31785260
ERR171128 : 31785261


submitted: Submitted batch job 31785262
submitted: Submitted batch job 31785263
submitted: Submitted batch job 31785264
submitted: Submitted batch job 31785265


ERR171129 : 31785262
ERR171130 : 31785263
ERR171131 : 31785264
ERR171132 : 31785265


submitted: Submitted batch job 31785266
submitted: Submitted batch job 31785267
submitted: Submitted batch job 31785268
submitted: Submitted batch job 31785269


ERR171133 : 31785266
ERR108131 : 31785267
ERR108132 : 31785268
ERR108165 : 31785269


submitted: Submitted batch job 31785270
submitted: Submitted batch job 31785271
submitted: Submitted batch job 31785272
submitted: Submitted batch job 31785273
submitted: Submitted batch job 31785274


ERR108166 : 31785270
ERR108183 : 31785271
ERR108184 : 31785272
ERR108135 : 31785273
ERR108136 : 31785274


submitted: Submitted batch job 31785275
submitted: Submitted batch job 31785276
submitted: Submitted batch job 31785277
submitted: Submitted batch job 31785278
submitted: Submitted batch job 31785279


ERR108141 : 31785275
ERR108142 : 31785276
ERR171134 : 31785277
ERR171135 : 31785278
ERR171136 : 31785279


submitted: Submitted batch job 31785280
submitted: Submitted batch job 31785282
submitted: Submitted batch job 31785283


ERR171137 : 31785280
ERR171138 : 31785282
ERR171139 : 31785283


submitted: Submitted batch job 31785284
submitted: Submitted batch job 31785285
submitted: Submitted batch job 31785286
submitted: Submitted batch job 31785287
submitted: Submitted batch job 31785288


ERR966619 : 31785284
ERR171141 : 31785285
ERR171142 : 31785286
ERR171143 : 31785287
ERR108152 : 31785288


submitted: Submitted batch job 31785289
submitted: Submitted batch job 31785290
submitted: Submitted batch job 31785291
submitted: Submitted batch job 31785292
submitted: Submitted batch job 31785293


ERR108151 : 31785289
ERR108130 : 31785290
ERR108129 : 31785291
ERR108146 : 31785292
ERR108145 : 31785293


submitted: Submitted batch job 31785294
submitted: Submitted batch job 31785295
submitted: Submitted batch job 31785296
submitted: Submitted batch job 31785297
submitted: Submitted batch job 31785298


ERR108154 : 31785294
ERR108153 : 31785295
ERR108148 : 31785296
ERR108147 : 31785297
ERR108126 : 31785298


submitted: Submitted batch job 31785299
submitted: Submitted batch job 31785300


ERR108125 : 31785299
ERR171144 : 31785300


submitted: Submitted batch job 31785301
submitted: Submitted batch job 31785302
submitted: Submitted batch job 31785303
submitted: Submitted batch job 31785304


ERR171145 : 31785301
ERR171154 : 31785302
ERR171155 : 31785303
ERR171156 : 31785304
ERR171157 : 31785305
ERR171158 : 31785306
ERR171159 : 31785307


submitted: Submitted batch job 31785305
submitted: Submitted batch job 31785306
submitted: Submitted batch job 31785307


## Scrape and Analyze Mean Coverage

#### Import Sample Annotation file for filtered *longitudinal* isolates pairs

In [6]:
#read the filtered longitudinal annotation table, then index its rows by patient
sample_annotation = pd.read_csv(
    '/n/data1/hms/dbmi/farhat/Roger/inhost_TB_dynamics_project/CSV_files/sample_annotation_files/Longitudinal_fastq_path_names_and_JankyPipe_tags_filtered_final.csv',
    sep=',',
)
sample_annotation = sample_annotation.set_index('patient_id')

In [7]:
#peek at the first five rows (the default of head)
sample_annotation.head(n=5)

Unnamed: 0_level_0,fastq_files,population,run_ID,sample_ID,sample_order,tag,isolate_type
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
P000183,/n/data1/hms/dbmi/farhat/fastq_db/walker/ERR03...,WALKER,ERR039337,,1,ERR039337,longitudinal
P000183,/n/data1/hms/dbmi/farhat/fastq_db/walker/ERR03...,WALKER,ERR039338,,2,ERR039338,longitudinal
1960,/n/data1/hms/dbmi/farhat/cetr_strains/good_wgs...,CETR,MMJA00000000,Peru5115,1,Peru5115,longitudinal
1960,/n/data1/hms/dbmi/farhat/cetr_strains/good_wgs...,CETR,MMPC00000000,Peru4668,2,Peru4668,longitudinal
2491,/n/data1/hms/dbmi/farhat/cetr_strains/good_wgs...,CETR,MMZT00000000,Peru4903,1,Peru4903,longitudinal


In [8]:
#(number of rows, number of columns) of the filtered annotation table
sample_annotation.shape

(400, 7)

In [14]:
from itertools import compress
import time
import sys
import pickle
import matplotlib.pyplot as plt
import matplotlib as mpl
import matplotlib.ticker as ticker
from pylab import plot, show, savefig, xlim, figure, hold, ylim, legend, boxplot, setp, axes
from itertools import compress
from pylab import MaxNLocator
import seaborn as sns; sns.set()
from matplotlib.colors import LogNorm
from matplotlib import gridspec
import ast
import itertools
import seaborn as sns
from sklearn.preprocessing import StandardScaler

In [27]:
#genomic data directory
rolling_DB_dir = '/n/data1/hms/dbmi/farhat/Roger/inhost_TB_dynamics_project/JankyPipe/output/'

#get all folders (each folder corresponds to a different sequenced isolate)
isolate_directories = list(sample_annotation.tag)

#dictionary that stores the mean coverage for each isolate (that successfully ran through megapipe2.0) from QUALIMAP
mean_coverage_dict = {} #key: isolate_ID , value: mean coverage

#iterate through each sequenced isolate
isolate_i = 0

for isolate_ID in isolate_directories:

    #directory that stores files for each sequenced isolate
    directory_for_sequenced_isolate = rolling_DB_dir + isolate_ID

    #check to see if megapipe successfully ran on sequenced isolate
    try:
        #existence of a PILON and QUALIMAP directories and corresponding VCF file [there's also an option for FAST-LINEAGE-CALLER]
        if ( 'pilon' in os.listdir(directory_for_sequenced_isolate) ) and ( 'QualiMap' in os.listdir(directory_for_sequenced_isolate) ):
            
            #existence of a VCF and GENOME-QUALITY files in relevent directories [there's also an option for LINEAGE]
            if ( 'vcf' in list( itertools.chain.from_iterable( [filename.split('.') for filename in os.listdir(directory_for_sequenced_isolate + '/pilon/')] ) ) ) and ( 'genome_results.txt' in os.listdir(directory_for_sequenced_isolate + '/QualiMap/') ):
                
                #we have a valid VCF and Quality-Map (and Lineage file?) file so megapipe ran successfully, let's keep the variant call information for this sequenced isolate and look for qualimap, lineage call data as well
                
                #QUALIMAP DATA
                ########################################################################################################################
                #look for qualimap output txt file that has mean coverage & mean read length
                qualimap_BAM_file_stats = directory_for_sequenced_isolate + '/QualiMap/' + 'genome_results.txt'

                #parse qualimap txt file and store the mean coverage for the BWA mapping (BAM file) & mean read length
                with open(qualimap_BAM_file_stats ,'r') as f:

                    #iterate through lines in text file
                    for stat_per_line in f:

                        #find the mean coverage for mapping
                        if 'mean coverageData' in stat_per_line:
                            
                            mean_coverage = float( stat_per_line.split('=')[-1][:-2].replace(',' , '') )
                            mean_coverage_dict[isolate_ID] = mean_coverage
                            
                            break #once we have mean coverage
                ########################################################################################################################
                
                #keep track of progress   
                isolate_i += 1
                if isolate_i % np.ceil(0.05*len(isolate_directories)) == 0:
                    print float(isolate_i) / float(len(isolate_directories))
        
    except OSError: #hit some file that is not another directory with genomic data 
        continue

0.05
0.1
0.15
0.2
0.25
0.3
0.35
0.4
0.45
0.5
0.55
0.6
0.65
0.7
0.75
0.8
0.85
0.9
0.95


In [28]:
#turn the {isolate_ID : mean coverage} dictionary into a Series indexed by isolate_ID
mean_coverage_series = pd.Series(mean_coverage_dict)

#one row per isolate: the coverage column comes first, then the isolate ID repeated as a regular column
mean_coverage_DF = pd.DataFrame({'mean_coverage' : mean_coverage_series})
mean_coverage_DF['isolate_ID'] = mean_coverage_DF.index

In [29]:
#preview the first five rows of the coverage table
mean_coverage_DF.head(n = 5)

Unnamed: 0,mean_coverage,isolate_ID
ERR025846,144.2826,ERR025846
ERR025847,94.7828,ERR025847
ERR036189,281.9261,ERR036189
ERR036195,139.9493,ERR036195
ERR037467,244.1663,ERR037467


In [30]:
#(number of isolates with a scraped mean coverage , number of columns)
mean_coverage_DF.shape

(397, 2)

Average Coverage across isolates

In [32]:
#average of the per-isolate mean coverage values
mean_coverage_DF['mean_coverage'].mean()

185.900731234257