#### This notebook creates JankyPipe (and submits jobs for it), a pipeline that takes FASTQ files from Mycobacterium tuberculosis isolates as input, aligns the reads to H37Rv and calls variants. The outputs are a VCF file, a lineage call and a Qualimap report. This notebook also submits one job per isolate to run JankyPipe on all of the *Replicate* isolates in our study.

In [1]:
from IPython.core.display import display, HTML
#inject CSS so the notebook container uses the full width of the browser window
display(HTML("<style>.container { width:100% !important; }</style>"))
#IPython magic: show matplotlib figures inline in the notebook
%matplotlib inline

In [2]:
import os
import pandas as pd
import numpy as np
from slurmpy import Slurm
import vcf
import shutil

### *Function* to launch JankyPipe as a Job

In [3]:
def _JankyPipe_commands(fqf1 , fqf2 , tag , output_dir , scratch_dir):

    '''
    Return the ordered list of shell commands that make up one JankyPipe run.

    Nothing is executed here, the command strings are only assembled.
    The arguments have the same meaning as in Launch_JankyPipe.
    '''

    #store all commands in a list
    commands_list = []

    #change directory to scratch
    commands_list.append( 'cd ' + scratch_dir )

    ###################################
    ### Load Necessary Modules ########
    ###################################

    #load perl, java, BWA, Samtools, BCFtools & Picard
    commands_list.append( 'module load perl/5.24.0' )
    commands_list.append( 'module load java/jdk-1.8u112' )
    commands_list.append( 'module load bwa/0.7.15' )
    commands_list.append( 'module load samtools/1.3.1' )
    commands_list.append( 'module load bcftools/1.3.1' )
    commands_list.append( 'module load picard/2.8.0' )

    #make a RefGen folder (relative to scratch, the working directory at this point)
    commands_list.append( 'mkdir RefGen' )

    #copy reference genome over to RefGen folder
    commands_list.append( 'cp /home/rv76/Farhat_Lab/Reference_Seqs/H37Rv/h37rv.fasta RefGen/TBRefGen.fasta' )

    #change directory to RefGen folder
    commands_list.append( 'cd RefGen' )

    ###################################
    ### Create Index Files for H37Rv ##
    ###################################
    commands_list.append( 'samtools faidx TBRefGen.fasta' )
    commands_list.append( 'bwa index TBRefGen.fasta' )

    RefGen = scratch_dir + '/RefGen/TBRefGen.fasta' #H37Rv reference

    #go back to parent directory
    commands_list.append( 'cd ..' )

    ###################################
    ### UnZip FastQ files #############
    ###################################

    #drop the last 9 characters of each file name (the length of '.fastq.gz')
    fqf1_base_name = fqf1.split('/')[-1][0:-9]
    fqf2_base_name = fqf2.split('/')[-1][0:-9]

    #work with the unzipped files for the rest of the pipeline (after unzipping them)
    fqf1_unzipped = scratch_dir + '/{}'.format(fqf1_base_name) + '.fastq'
    fqf2_unzipped = scratch_dir + '/{}'.format(fqf2_base_name) + '.fastq'

    commands_list.append( 'zcat {0} > {1}'.format(fqf1, fqf1_unzipped) )
    commands_list.append( 'zcat {0} > {1}'.format(fqf2, fqf2_unzipped) )

    #use the unzipped fastq files now
    fqf1 = fqf1_unzipped
    fqf2 = fqf2_unzipped

    ###################################
    ### Clean FastQ read names ########
    ###################################

    #delete any weird characters from the read names in the FastQ files
    commands_list.append( 'python /n/data1/hms/dbmi/farhat/Roger/inhost_TB_dynamics_project/python_scripts/megapipe-correct-names-reads.py {}'.format(fqf1) )
    commands_list.append( 'python /n/data1/hms/dbmi/farhat/Roger/inhost_TB_dynamics_project/python_scripts/megapipe-correct-names-reads.py {}'.format(fqf2) )

    ###################################
    ### PRINSEQ (trim reads) ##########
    ###################################

    #create directory for prinseq in output directory (only the trimming log goes there)
    commands_list.append( 'mkdir ' + output_dir + '/prinseq' )

    commands_list.append( 'perl /n/data1/hms/dbmi/farhat/bin/prinseq-lite-0.20.4/prinseq-lite.pl -fastq {0} -fastq2 {1} -out_format 3 -out_good {2}/{3}-trimmed -out_bad null -log {4}/{3}-trimmed.log -min_qual_mean 20 -verbose'.format(fqf1, fqf2, scratch_dir, tag , output_dir+'/prinseq') )

    #use newly trimmed fastq files now
    fqf1 = scratch_dir + '/{}'.format(tag) + '-trimmed_1.fastq'
    fqf2 = scratch_dir + '/{}'.format(tag) + '-trimmed_2.fastq'

    ######################################
    ### BWA (align reads to reference) ###
    ######################################

    #create SAM file
    samfile = scratch_dir + '/{}.sam'.format(tag)

    #run BWA
    commands_list.append( 'bwa mem -M {3} {0} {1} > {2}'.format(fqf1 , fqf2 , samfile , RefGen) )

    #####################################
    ### PICARD (sort & convert to BAM) ##
    #####################################

    #create BAM file
    bamfile = scratch_dir + '/{0}.sorted.bam'.format(tag)

    commands_list.append( 'java -Xmx16G -jar /n/data1/hms/dbmi/farhat/bin/picard/picard/build/libs/picard.jar SortSam INPUT={0} OUTPUT={1} SORT_ORDER=coordinate'.format(samfile, bamfile) )

    ###################################
    ### PICARD (remove duplicates) ####
    ###################################

    #create BAM file with removed duplicates
    #(swap only the trailing '.bam'; str.replace would also rewrite any '.bam' inside the directory or tag)
    drbamfile = bamfile[:-4] + '.duprem.bam'

    #remove duplicates from BAM file
    commands_list.append( "java -Xmx32G -jar /n/data1/hms/dbmi/farhat/bin/picard/picard/build/libs/picard.jar MarkDuplicates I={0} O={1} REMOVE_DUPLICATES=true M={2} ASSUME_SORT_ORDER=coordinate".format(bamfile, drbamfile, drbamfile[:-4]+'.metrics') )

    ####################################
    ### SAMTOOLS (to index BAM file) ###
    ####################################

    commands_list.append( "samtools index {0}".format(drbamfile) )

    ######################################
    ### QUALIMAP (quality of BAM file) ###
    ######################################

    #store quality report, pilon VCF & lineage call information all in Output directory
    commands_list.append( 'cd ' + output_dir )
    commands_list.append( 'mkdir QualiMap' ) #make a folder for QualiMap output in output directory
    commands_list.append( 'unset DISPLAY' ) #unset JAVA virtual machine variable [http://qualimap.bioinfo.cipf.es/doc_html/faq.html]
    commands_list.append( "/n/data1/hms/dbmi/farhat/bin/qualimap_v2.2.1/qualimap bamqc -bam {0} --outdir {1} --outfile {2}.pdf --outformat PDF".format(drbamfile, output_dir+'/QualiMap', tag+'_stats') )

    ###################################
    ### PILON (call variants) #########
    ###################################

    #the working directory is still the output directory here, so the relative mkdir lands in it
    commands_list.append( 'mkdir pilon' ) #make a folder for pilon output in output directory
    out_pilon_dir = output_dir + '/pilon/' #variable for pilon output path

    commands_list.append( 'java -Xmx32G -jar /n/data1/hms/dbmi/farhat/bin/pilon/pilon-1.22.jar --genome {0} --bam {1} --output {2} --outdir {3} --variant'.format(RefGen, drbamfile, tag, out_pilon_dir) )

    #####################################
    ### Luca's LINEAGE CALLING script ###
    #####################################

    #create directories
    commands_list.append( 'mkdir ' + scratch_dir + '/fast-lineage-caller/' )#make a folder for lineage call in scratch directory
    commands_list.append( 'mkdir ' + output_dir + '/fast-lineage-caller/' )#make a folder for lineage call in output directory

    #create VRT file
    vrtfile = scratch_dir + '/fast-lineage-caller/{}.vrt'.format(tag)

    commands_list.append( 'cd ' + scratch_dir + '/fast-lineage-caller' )#change directory to store output in scratch

    #convert VCF to VRT
    commands_list.append( 'vrtTools-vcf2vrt.py {0} {1} 1'.format(out_pilon_dir+tag+'.vcf', vrtfile) )

    #call lineage with SNP database and VRT file
    commands_list.append( 'cd ' + output_dir + '/fast-lineage-caller' )#change directory so lineage_call.txt is written next to the other final output

    commands_list.append( 'FastLineageCaller-assign2lineage.py /home/rv76/Bio_Pipelines/fast-lineage-caller-master/example/db_snps.tsv ' + vrtfile + ' &> ' + 'lineage_call.txt' )

    return commands_list


def Launch_JankyPipe(fqf1 , fqf2 , tag , output_dir , scratch_dir , O2_SLURM_logs_dir):

    '''
    This script launches a job to call variants for the input fastq files against H37Rv
    using a number of packages. The important output (VCF, lineage info files, quality report)
    is stored in the output directory while the intermediary files (SAMs, trimmed fastqs, BAM, etc)
    are stored in a scratch directory.

    Arguments
    ---------
    fqf1 , fqf2       : paths to the two gzipped fastq files of one isolate
                        (the last 9 characters of each file name are dropped to name the unzipped copy)
    tag               : isolate ID; used to name the intermediate/output files and as the job name
    output_dir        : directory that receives the prinseq log, QualiMap report, pilon VCF & lineage call
    scratch_dir       : directory that receives the intermediate files
    O2_SLURM_logs_dir : directory the process changes into before submitting the job

    Returns
    -------
    Whatever Slurm.run returns for the submitted job (printed next to the tag as well).

    Side effects: prints every command, changes the current working directory of the
    calling process to O2_SLURM_logs_dir (it is not restored), and submits one job.
    The directories are not created here; the caller is expected to have made them.
    '''

    commands_list = _JankyPipe_commands(fqf1 , fqf2 , tag , output_dir , scratch_dir)

    ###############################################################################################################
    ######################################## SUBMIT as a job to O2 ################################################
    ###############################################################################################################

    #echo each command followed by a spacer line, so the notebook records what was submitted
    #(single-argument parenthesised print behaves the same under Python 2 and Python 3)
    for command_i in commands_list:
        print(command_i)
        print(' ')

    #join all commands into a single string to be submitted as a job (each command preceded by a newline)
    JankyPipe_job = ''.join('\n' + command_i for command_i in commands_list)

    #directory where you want output + error files
    #NOTE(review): presumably slurmpy writes its script & log files relative to the current directory - confirm
    os.chdir(O2_SLURM_logs_dir)

    job_name = tag

    s = Slurm(job_name , {'partition':'short' , 'n':'1' , 't':'0-10:00:00' , 'mem-per-cpu':'48G' , 'mail-type':'FAIL' , 'mail-user':'roger_vargas@g.harvard.edu'})

    #submits the job
    job_id = s.run(JankyPipe_job)

    print(job_name  + ' : ' +  str(job_id))

    #hand the job id back so callers can keep track of it (nothing was returned before)
    return job_id
    

### Load Replicate Samples

Pull all relevant sequenced isolates and their corresponding FastQ file paths

In [4]:
#CSV with one row per Replicate isolate; rows are indexed by patient_id after loading
replicate_fastq_csv = '/n/data1/hms/dbmi/farhat/Roger/inhost_TB_dynamics_project/CSV_files/sample_annotation_files/REPLICATE_fastq_path_names.csv'
sample_annotation = pd.read_csv(replicate_fastq_csv , sep = ',')
sample_annotation = sample_annotation.set_index('patient_id')

In [5]:
#peek at the first two rows
sample_annotation.head(2)

Unnamed: 0_level_0,fastq_files,population,run_ID,sample_ID,sample_order
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
I0002918-6,/n/data1/hms/dbmi/farhat/cetr_strains/good_wgs...,LC_REP,,Peru4092,0
I0002918-6,/n/data1/hms/dbmi/farhat/cetr_strains/good_wgs...,LC_REP,,Peru3380,0


In [6]:
#(number of isolates , number of columns)
sample_annotation.shape

(163, 5)

### Create directories for each isolate and launch JankyPipe

IMPORTANT PARENT DIRECTORIES 

-   /n/scratch2/rv76/inhost_TB_dynamics_project/JankyPipe_REPLICATES/intermediary_files/

    [to store intermediate files (unzipped fastq, trimmed fastq, SAM, sorted BAM, etc)]


-   /n/data1/hms/dbmi/farhat/Roger/inhost_TB_dynamics_project/JankyPipe/output_REPLICATES/

    [to store final files (pilon VCF, lineage, QualiMap, trim logs)]


-   /n/data1/hms/dbmi/farhat/Roger/inhost_TB_dynamics_project/JankyPipe/O2_SLURM_logs_REPLICATES/

    [to store submitted SLURM script, and SLURM error & verbose logs]

In [47]:
def _reset_directory(path):
    '''Delete path and everything in it if it already exists, then create it empty.'''
    if os.path.exists(path):
        shutil.rmtree(path)
    os.makedirs(path)


#parent directories (one sub-folder per tag is created inside each)
output_parent_dir = '/n/data1/hms/dbmi/farhat/Roger/inhost_TB_dynamics_project/JankyPipe/output_REPLICATES/'
scratch_parent_dir = '/n/scratch2/rv76/inhost_TB_dynamics_project/JankyPipe_REPLICATES/intermediary_files/'
O2_SLURM_logs_parent_dir = '/n/data1/hms/dbmi/farhat/Roger/inhost_TB_dynamics_project/JankyPipe/O2_SLURM_logs_REPLICATES/'

#the first column of sample_annotation holds both fastq paths of an isolate, separated by ';'
for isolate_fastq_paths in sample_annotation.iloc[: , 0]:

    #paths & names for fastq files
    fastq_paths = isolate_fastq_paths.split(';')
    fqf1 = fastq_paths[0]
    fqf2 = fastq_paths[1]

    #get the tag ID for the fastq files (same as ID for fastq files)
    fqf1_file_name = fqf1.split('/')[-1]

    #check to see if 'CETR' , 'POOLS' or 'TRAUNER'
    if fqf1_file_name.startswith(('Per' , 'ERR')): #CETR or TRAUNER sample
        tag = fqf1_file_name.split('_')[0]

    else: #POOLS sample
        tag = fqf1_file_name.split('.1')[0]

    #where pilon VCF and lineage information will be stored [LAB FOLDER]
    output_dir = output_parent_dir + tag
    _reset_directory(output_dir)

    #where everything else happens (trimming, aligning, etc.) [SCRATCH FOLDER]
    scratch_dir = scratch_parent_dir + tag
    _reset_directory(scratch_dir)

    #store O2 job log files [LAB FOLDER]
    O2_SLURM_logs_dir = O2_SLURM_logs_parent_dir + tag
    _reset_directory(O2_SLURM_logs_dir)

    #Launch JankyPipe after making necessary directories!!!
    Launch_JankyPipe(fqf1 , fqf2 , tag , output_dir , scratch_dir , O2_SLURM_logs_dir)

submitted: Submitted batch job 23891126


Peru4092 : 23891126


submitted: Submitted batch job 23891127
submitted: Submitted batch job 23891128


Peru3380 : 23891127
Peru2905 : 23891128


submitted: Submitted batch job 23891129
submitted: Submitted batch job 23891130
submitted: Submitted batch job 23891131


Peru4104 : 23891129
Peru3016 : 23891130
Peru4110 : 23891131


submitted: Submitted batch job 23891132
submitted: Submitted batch job 23891133
submitted: Submitted batch job 23891134
submitted: Submitted batch job 23891135


Peru4133 : 23891132
Peru3343 : 23891133
Peru3324 : 23891134
Peru4480 : 23891135


submitted: Submitted batch job 23891136
submitted: Submitted batch job 23891137
submitted: Submitted batch job 23891138
submitted: Submitted batch job 23891139


Peru5110 : 23891136
00-R0025 : 23891137
Peru4686 : 23891138
00-R0308 : 23891139


submitted: Submitted batch job 23891140
submitted: Submitted batch job 23891141
submitted: Submitted batch job 23891142


Peru4690 : 23891140
00-R1156 : 23891141
Peru4689 : 23891142


submitted: Submitted batch job 23891143
submitted: Submitted batch job 23891144
submitted: Submitted batch job 23891145
submitted: Submitted batch job 23891146


00-R1405 : 23891143
Peru4683 : 23891144
00-R1547 : 23891145
Peru4904 : 23891146


submitted: Submitted batch job 23891147
submitted: Submitted batch job 23891148
submitted: Submitted batch job 23891149
submitted: Submitted batch job 23891150
submitted: Submitted batch job 23891151


00-R1549 : 23891147
Peru5426 : 23891148
00-R1562 : 23891149
Peru5120 : 23891150
01-R0153 : 23891151


submitted: Submitted batch job 23891152
submitted: Submitted batch job 23891153
submitted: Submitted batch job 23891154


Peru4671 : 23891152
01-R0185 : 23891153
Peru4519 : 23891154


submitted: Submitted batch job 23891155
submitted: Submitted batch job 23891156
submitted: Submitted batch job 23891157


01-R0238 : 23891155
Peru5112 : 23891156
01-R0239 : 23891157


submitted: Submitted batch job 23891158
submitted: Submitted batch job 23891159
submitted: Submitted batch job 23891160
submitted: Submitted batch job 23891161


Peru4679 : 23891158
01-R0265 : 23891159
Peru4670 : 23891160
01-R0272 : 23891161


submitted: Submitted batch job 23891162
submitted: Submitted batch job 23891163
submitted: Submitted batch job 23891164


Peru4908 : 23891162
01-R0276 : 23891163
Peru4664 : 23891164


submitted: Submitted batch job 23891165
submitted: Submitted batch job 23891166
submitted: Submitted batch job 23891167
submitted: Submitted batch job 23891168


01-R0290 : 23891165
Peru4675 : 23891166
01-R0420 : 23891167
Peru4698 : 23891168


submitted: Submitted batch job 23891169
submitted: Submitted batch job 23891170
submitted: Submitted batch job 23891171


01-R0685 : 23891169
Peru4672 : 23891170
01-R0737 : 23891171


submitted: Submitted batch job 23891172
submitted: Submitted batch job 23891173
submitted: Submitted batch job 23891174


Peru4938 : 23891172
01-R0774 : 23891173
Peru5074 : 23891174


submitted: Submitted batch job 23891175
submitted: Submitted batch job 23891176
submitted: Submitted batch job 23891177


01-R0878 : 23891175
Peru4714 : 23891176
01-R0880 : 23891177


submitted: Submitted batch job 23891178
submitted: Submitted batch job 23891179
submitted: Submitted batch job 23891180


Peru4922 : 23891178
01-R0897 : 23891179
Peru4707 : 23891180


submitted: Submitted batch job 23891181
submitted: Submitted batch job 23891182
submitted: Submitted batch job 23891183
submitted: Submitted batch job 23891184


01-R0899 : 23891181
Peru4694 : 23891182
01-R0902 : 23891183
Peru4700 : 23891184


submitted: Submitted batch job 23891185
submitted: Submitted batch job 23891186
submitted: Submitted batch job 23891187
submitted: Submitted batch job 23891188


01-R0904 : 23891185
Peru4716 : 23891186
01-R0908 : 23891187
Peru4719 : 23891188


submitted: Submitted batch job 23891189
submitted: Submitted batch job 23891190
submitted: Submitted batch job 23891191


01-R0909 : 23891189
Peru4911 : 23891190
01-R1018 : 23891191


submitted: Submitted batch job 23891193
submitted: Submitted batch job 23891194
submitted: Submitted batch job 23891195
submitted: Submitted batch job 23891196


Peru4722 : 23891193
01-R1305 : 23891194
Peru4703 : 23891195
01-R1309 : 23891196


submitted: Submitted batch job 23891197
submitted: Submitted batch job 23891198
submitted: Submitted batch job 23891199
submitted: Submitted batch job 23891200


Peru4709 : 23891197
01-R1321 : 23891198
Peru4947 : 23891199
01-R1386 : 23891200


submitted: Submitted batch job 23891201
submitted: Submitted batch job 23891202


Peru5410 : 23891201
01-R1387 : 23891202


submitted: Submitted batch job 23891203
submitted: Submitted batch job 23891204
submitted: Submitted batch job 23891205
submitted: Submitted batch job 23891206


Peru4914 : 23891203
01-R1505 : 23891204
Peru4711 : 23891205
01-R1559 : 23891206


submitted: Submitted batch job 23891207
submitted: Submitted batch job 23891208
submitted: Submitted batch job 23891209
submitted: Submitted batch job 23891210


Peru4953 : 23891207
01-R1599 : 23891208
Peru4919 : 23891209
02-R0099 : 23891210


submitted: Submitted batch job 23891211
submitted: Submitted batch job 23891212
submitted: Submitted batch job 23891213
submitted: Submitted batch job 23891214


Peru4957 : 23891211
02-R0119 : 23891212
Peru5418 : 23891213
02-R0236 : 23891214


submitted: Submitted batch job 23891215
submitted: Submitted batch job 23891216
submitted: Submitted batch job 23891217


Peru4967 : 23891215
02-R0328 : 23891216
Peru5132 : 23891217


submitted: Submitted batch job 23891218


02-R0417 : 23891218


submitted: Submitted batch job 23891219
submitted: Submitted batch job 23891220
submitted: Submitted batch job 23891221


Peru5442 : 23891219
02-R0812 : 23891220
Peru5124 : 23891221


submitted: Submitted batch job 23891222
submitted: Submitted batch job 23891223


02-R0848 : 23891222
Peru5443 : 23891223


submitted: Submitted batch job 23891224
submitted: Submitted batch job 23891225
submitted: Submitted batch job 23891226


02-R1444 : 23891224
Peru4972 : 23891225
02-R1589 : 23891226


submitted: Submitted batch job 23891227
submitted: Submitted batch job 23891228


Peru5143 : 23891227
02-R1641 : 23891228


submitted: Submitted batch job 23891229
submitted: Submitted batch job 23891230


Peru5090 : 23891229
02-R1645 : 23891230


submitted: Submitted batch job 23891231
submitted: Submitted batch job 23891232
submitted: Submitted batch job 23891233
submitted: Submitted batch job 23891234


Peru5088 : 23891231
02-R1742 : 23891232
Peru4963 : 23891233
02-R1825 : 23891234


submitted: Submitted batch job 23891235
submitted: Submitted batch job 23891236
submitted: Submitted batch job 23891237


Peru4720 : 23891235
02-R1945 : 23891236
Peru4653 : 23891237


submitted: Submitted batch job 23891238
submitted: Submitted batch job 23891239
submitted: Submitted batch job 23891240
submitted: Submitted batch job 23891241


03-R0061 : 23891238
Peru4994 : 23891239
03-R0221 : 23891240
Peru4974 : 23891241


submitted: Submitted batch job 23891242
submitted: Submitted batch job 23891243


03-R0319 : 23891242
Peru5137 : 23891243


submitted: Submitted batch job 23891244
submitted: Submitted batch job 23891245
submitted: Submitted batch job 23891246
submitted: Submitted batch job 23891247


03-R0915 : 23891244
Peru4574 : 23891245
04-R0266 : 23891246
Peru5025 : 23891247


submitted: Submitted batch job 23891248
submitted: Submitted batch job 23891249
submitted: Submitted batch job 23891250
submitted: Submitted batch job 23891251


98-R454 : 23891248
Peru5119 : 23891249
98-R660 : 23891250
Peru5037 : 23891251


submitted: Submitted batch job 23891252
submitted: Submitted batch job 23891253
submitted: Submitted batch job 23891254
submitted: Submitted batch job 23891255


98-R790 : 23891252
Peru5029 : 23891253
99-10364 : 23891254
Peru4659 : 23891255


submitted: Submitted batch job 23891256
submitted: Submitted batch job 23891257


99-R545 : 23891256
Peru4662 : 23891257


submitted: Submitted batch job 23891258
submitted: Submitted batch job 23891259
submitted: Submitted batch job 23891260
submitted: Submitted batch job 23891261


99-R719 : 23891258
Peru4688 : 23891259
99-R862 : 23891260
Peru5420 : 23891261


submitted: Submitted batch job 23891262
submitted: Submitted batch job 23891263
submitted: Submitted batch job 23891264


99-R887 : 23891262
Peru4685 : 23891263
99-R893 : 23891264


submitted: Submitted batch job 23891265
submitted: Submitted batch job 23891266


Peru5146 : 23891265
99-R995 : 23891266


submitted: Submitted batch job 23891267
submitted: Submitted batch job 23891268


Peru4702 : 23891267
02-R1447 : 23891268


submitted: Submitted batch job 23891269
submitted: Submitted batch job 23891270
submitted: Submitted batch job 23891271
submitted: Submitted batch job 23891272


Peru4948 : 23891269
02-R1683 : 23891270
Peru5434 : 23891271
03-R0419 : 23891272


submitted: Submitted batch job 23891273
submitted: Submitted batch job 23891274
submitted: Submitted batch job 23891275


Peru4726 : 23891273
03-R0979 : 23891274
Peru5414 : 23891275


submitted: Submitted batch job 23891276
submitted: Submitted batch job 23891277
submitted: Submitted batch job 23891278
submitted: Submitted batch job 23891279


02-R0191 : 23891276
Peru4706 : 23891277
02-R0325 : 23891278
Peru4977 : 23891279


submitted: Submitted batch job 23891280
submitted: Submitted batch job 23891281
submitted: Submitted batch job 23891282
submitted: Submitted batch job 23891283


02-R0861 : 23891280
Peru4668 : 23891281
02-R0948 : 23891282
Peru5117 : 23891283


submitted: Submitted batch job 23891284


02-R1527 : 23891284


submitted: Submitted batch job 23891285
submitted: Submitted batch job 23891286
submitted: Submitted batch job 23891287


Peru4665 : 23891285
03-R0324 : 23891286
ERR1352350 : 23891287
ERR1352351 : 23891288
ERR1352352 : 23891289


submitted: Submitted batch job 23891288
submitted: Submitted batch job 23891289


#### Save tags (each tag corresponds to a folder name)

In [7]:
#file name of the first fastq file of every isolate (first column holds 'path1;path2')
fqf1_file_names = [fastq_paths.split(';')[0].split('/')[-1] for fastq_paths in sample_annotation.iloc[: , 0]]

#tag for each sample (same as ID for fastq files):
#  CETR ('Per...') or TRAUNER ('ERR...') sample -> text before the first '_'
#  POOLS sample                                 -> text before the first '.1'
tag_list = [
    file_name.split('_')[0] if file_name[:3] in ('Per' , 'ERR') else file_name.split('.1')[0]
    for file_name in fqf1_file_names
]

sample_annotation['tag'] = tag_list

#store as CSV
tags_csv_path = '/n/data1/hms/dbmi/farhat/Roger/inhost_TB_dynamics_project/CSV_files/sample_annotation_files/REPLICATE_fastq_path_names_and_JankyPipe_tags.csv'
sample_annotation.to_csv(tags_csv_path , sep = ',')

In [8]:
#first five rows, now including the tag column
sample_annotation.head(5)

Unnamed: 0_level_0,fastq_files,population,run_ID,sample_ID,sample_order,tag
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
I0002918-6,/n/data1/hms/dbmi/farhat/cetr_strains/good_wgs...,LC_REP,,Peru4092,0,Peru4092
I0002918-6,/n/data1/hms/dbmi/farhat/cetr_strains/good_wgs...,LC_REP,,Peru3380,0,Peru3380
I0003710-6,/n/data1/hms/dbmi/farhat/cetr_strains/good_wgs...,LC_REP,,Peru2905,0,Peru2905
I0003710-6,/n/data1/hms/dbmi/farhat/cetr_strains/good_wgs...,LC_REP,,Peru4104,0,Peru4104
I0003922-7,/n/data1/hms/dbmi/farhat/cetr_strains/good_wgs...,LC_REP,,Peru3016,0,Peru3016


In [9]:
#last five rows, now including the tag column
sample_annotation.tail(5)

Unnamed: 0_level_0,fastq_files,population,run_ID,sample_ID,sample_order,tag
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
replicate_pair_74,/n/data1/hms/dbmi/farhat/cetr_strains/good_wgs...,CP_REP,,Peru4665,0,Peru4665
replicate_pair_74,/n/data1/hms/dbmi/farhat/fastq_db/pools/03-R03...,CP_REP,,03-R0324,0,03-R0324
P12,/n/data1/hms/dbmi/farhat/fastq_db/trauner/ERR1...,TR_REP,ERR1352350,SAMEA3921015,0,ERR1352350
P12,/n/data1/hms/dbmi/farhat/fastq_db/trauner/ERR1...,TR_REP,ERR1352351,SAMEA3921016,0,ERR1352351
P12,/n/data1/hms/dbmi/farhat/fastq_db/trauner/ERR1...,TR_REP,ERR1352352,SAMEA3921017,0,ERR1352352


### Determine if jobs ran successfully or not

In [10]:
#where pilon VCF and lineage information is stored [LAB FOLDER], one sub-folder per tag
output_parent_dir = '/n/data1/hms/dbmi/farhat/Roger/inhost_TB_dynamics_project/JankyPipe/output_REPLICATES/'

successful_run = []

#iterate over the tag values themselves: the index holds patient_id strings,
#so sample_annotation.tag[integer] would depend on pandas falling back to positional lookup
for tag in sample_annotation['tag']:

    #lineage_call.txt is opened by the very last JankyPipe command; the fast-lineage-caller
    #folder alone is created earlier (before the VCF is converted and the lineage is called),
    #so the file is the stricter sign that the job reached its final step
    lineage_call_file = os.path.join(output_parent_dir + tag , 'fast-lineage-caller' , 'lineage_call.txt')

    successful_run.append('yes' if os.path.isfile(lineage_call_file) else 'no')

sample_annotation['successful_run'] = successful_run

In [11]:
#isolates whose run was flagged as unsuccessful
sample_annotation[sample_annotation['successful_run'] == 'no']

Unnamed: 0_level_0,fastq_files,population,run_ID,sample_ID,sample_order,tag,successful_run
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
replicate_pair_63,/n/data1/hms/dbmi/farhat/fastq_db/pools/99-R89...,CP_REP,,99-R893,0,99-R893,no


In [12]:
#(number of unsuccessful isolates , number of columns)
sample_annotation[sample_annotation['successful_run'] == 'no'].shape

(1, 7)

In [13]:
#Drop the isolate that DID NOT have a successful run in JankyPipe; dropping by index label
#removes every row carrying that label, so the corresponding paired isolate goes too
failed_pair_id = 'replicate_pair_63'
sample_annotation.drop(failed_pair_id , inplace = True)

#store as CSV (same file the tags were saved to)
tags_csv_path = '/n/data1/hms/dbmi/farhat/Roger/inhost_TB_dynamics_project/CSV_files/sample_annotation_files/REPLICATE_fastq_path_names_and_JankyPipe_tags.csv'
sample_annotation.to_csv(tags_csv_path , sep = ',')

### Re-run isolates that hit the run-time limit through the pipeline

In [26]:
def _reset_directory(path):
    '''Delete path and everything in it if it already exists, then create it empty.'''
    #if path already exists, remove current contents, then recreate empty directory
    #if path doesn't exist, create new directory
    if os.path.exists(path):
        shutil.rmtree(path)
    os.makedirs(path)


#isolates flagged 'no' in successful_run didn't finish running through pipeline
sample_annotation_ReRun = sample_annotation[sample_annotation.successful_run == 'no']

#parent directories (one sub-folder per tag inside each)
output_parent_dir = '/n/data1/hms/dbmi/farhat/Roger/inhost_TB_dynamics_project/JankyPipe/output_REPLICATES/'
scratch_parent_dir = '/n/scratch2/rv76/inhost_TB_dynamics_project/JankyPipe_REPLICATES/intermediary_files/'
O2_SLURM_logs_parent_dir = '/n/data1/hms/dbmi/farhat/Roger/inhost_TB_dynamics_project/JankyPipe/O2_SLURM_logs_REPLICATES/'

#take the tag from the stored 'tag' column rather than re-deriving it with split('_'):
#that rule only fits CETR/TRAUNER file names, so a POOLS isolate would be re-run under
#different folder names than the ones its first run (and the success check) used
for isolate_fastq_paths , tag in zip(sample_annotation_ReRun.iloc[: , 0] , sample_annotation_ReRun['tag']):

    #paths & names for fastq files
    fastq_paths = isolate_fastq_paths.split(';')
    fqf1 = fastq_paths[0]
    fqf2 = fastq_paths[1]

    #where pilon VCF and lineage information will be stored [LAB FOLDER]
    output_dir = output_parent_dir + tag
    _reset_directory(output_dir)

    #where everything else happens (trimming, aligning, etc.) [SCRATCH FOLDER]
    scratch_dir = scratch_parent_dir + tag
    _reset_directory(scratch_dir)

    #store O2 job log files [LAB FOLDER]
    O2_SLURM_logs_dir = O2_SLURM_logs_parent_dir + tag
    _reset_directory(O2_SLURM_logs_dir)

    #Launch JankyPipe after making necessary directories!!!
    Launch_JankyPipe(fqf1 , fqf2 , tag , output_dir , scratch_dir , O2_SLURM_logs_dir)

ERR1352352 : 23966782


submitted: Submitted batch job 23966782
