In [1]:
#Widen the notebook container to the full browser width so wide tables and long paths are not clipped.
#`display` & `HTML` are imported from the public `IPython.display` module
#(importing them from `IPython.core.display` is deprecated in newer IPython releases).
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [3]:
import os
import pandas as pd
import numpy as np
from slurmpy import Slurm
import vcf
import shutil

### [1] Run Kraken2 on sequenced isolates

#### Get sample annotation & (trimmed) fastq paths for all sequenced samples

In [4]:
#read the sample annotation CSV and index it by patient_id
sample_annotation = pd.read_csv(
    '/n/data1/hms/dbmi/farhat/Roger/inhost_TB_dynamics_project/CSV_files/sample_annotation_files/REPLICATE_fastq_path_names_and_JankyPipe_tags.csv' ,
    sep  = ','
).set_index('patient_id')

#keep only the first row seen for each isolate tag; any later row that repeats a tag is dropped
#(duplicate isolates may have arisen from Trauner fudging of annotation file)
drop_duplicate_isolate_filter = ~sample_annotation.duplicated('tag').values
sample_annotation = sample_annotation[drop_duplicate_isolate_filter]

In [5]:
#(number of rows , number of columns) of the annotation table after dropping duplicate tags
sample_annotation.shape

(161, 7)

In [6]:
#preview the first rows of the annotation table (indexed by patient_id)
sample_annotation.head()

Unnamed: 0_level_0,fastq_files,population,run_ID,sample_ID,sample_order,tag,successful_run
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
I0002918-6,/n/data1/hms/dbmi/farhat/cetr_strains/good_wgs...,LC_REP,,Peru4092,0,Peru4092,yes
I0002918-6,/n/data1/hms/dbmi/farhat/cetr_strains/good_wgs...,LC_REP,,Peru3380,0,Peru3380,yes
I0003710-6,/n/data1/hms/dbmi/farhat/cetr_strains/good_wgs...,LC_REP,,Peru2905,0,Peru2905,yes
I0003710-6,/n/data1/hms/dbmi/farhat/cetr_strains/good_wgs...,LC_REP,,Peru4104,0,Peru4104,yes
I0003922-7,/n/data1/hms/dbmi/farhat/cetr_strains/good_wgs...,LC_REP,,Peru3016,0,Peru3016,yes


### *Function* to submit a job that runs Kraken2 on a sequenced isolate (one job per isolate)

In [7]:
def run_kraken2_on_isolate(isolate_tag):
    
    '''
    This function launches a Slurm job that runs a sequenced isolate through Kraken2.
    The output is an output text file with information about the classification 
    of each (paired) read from the sequenced output and a report text file with
    aggregate level information on how many reads were assigned to each taxon.

    A job is submitted when the isolate's kraken2 directory does not exist yet, or
    when it exists but the report and/or the output text file is missing from it
    (in that case the directory is removed and re-created first). When both files
    are already present, nothing is done.

    isolate_tag : isolate identifier (string) used to build the fastq paths,
                  the Kraken file names and the Slurm job name

    Returns None. Prints '<job name> : <job id>' whenever a job is submitted.

    Side effect: the working directory of the calling process is changed to the
    isolate's kraken2 directory when a job is submitted and is not changed back.
    '''

    #directory where trimmed-fastq files are stored
    scratch_dir = '/n/scratch2/rv76/inhost_TB_dynamics_project/JankyPipe_REPLICATES/intermediary_files/'

    #directory where we will store Kraken output
    output_dir = '/n/data1/hms/dbmi/farhat/Roger/inhost_TB_dynamics_project/JankyPipe/output_REPLICATES/'

    #Kraken database passed to --db
    kraken_db = '/n/data1/hms/dbmi/farhat/Roger/kraken_db'

    #paths & names for (trimmed) fastq files
    fqf1_trimmed = scratch_dir + isolate_tag + '/' + isolate_tag + '-trimmed_1.fastq'
    fqf2_trimmed = scratch_dir + isolate_tag + '/' + isolate_tag + '-trimmed_2.fastq'

    #directory & file names for the kraken2 output of this isolate
    kraken2_dir = output_dir + isolate_tag + '/kraken2/'
    report_name = isolate_tag + '_trimmed_report.txt'
    output_name = isolate_tag + '_trimmed_output.txt'

    if os.path.exists(kraken2_dir):

        #list the directory once and check for both Kraken files
        existing_files = os.listdir(kraken2_dir)
        if ( report_name in existing_files ) and ( output_name in existing_files ):
            return #both files already exist, nothing to (re-)run

        #NOTE(review): file presence is the only completeness check. Calling this again while a
        #previously submitted job has not yet written both files removes that job's directory
        #and submits a second job - confirm earlier jobs have finished before re-running.
        shutil.rmtree(kraken2_dir) #remove all current files

    os.makedirs(kraken2_dir) #(re-)make directory to store Kraken output

    #construct job
    kraken_job = ' '.join([ 'kraken2' , '--paired' , '--use-names' ,
                            '--report' , kraken2_dir + report_name ,
                            '--output' , kraken2_dir + output_name ,
                            fqf1_trimmed , fqf2_trimmed ,
                            '--db' , kraken_db ])

    #submit job
    #directory where you want output + error files
    os.chdir(kraken2_dir)

    job_name = isolate_tag
    s = Slurm(job_name 
              , {'partition':'short' , 'n':'1' , 't':'0-3:00:00' , 'mem-per-cpu':'64G' , 'mail-type':'FAIL' , 'mail-user':'roger_vargas@g.harvard.edu'})

    #submits the job
    job_id = s.run(kraken_job)

    #parenthesised so the same line prints identically under Python 2 and Python 3
    print(job_name  + ' : ' +  str(job_id))

In [8]:
#submit (at most) one Kraken2 job per isolate
#tags are read from the 'tag' column by name, so this does not depend on the column order of the CSV
for isolate_tag in sample_annotation['tag']:
    
    #run Kraken2 on sequenced isolate
    run_kraken2_on_isolate(isolate_tag)

### [2] Parse through Kraken results & calculate proportion of reads assigned to MTBC

Legend for Kraken Report output

A - Percentage of fragments covered by the clade rooted at this taxon

B - Number of fragments covered by the clade rooted at this taxon

C - Number of fragments assigned directly to this taxon

D - A rank code, indicating (U)nclassified, (R)oot, (D)omain, (K)ingdom,
   (P)hylum, (C)lass, (O)rder, (F)amily, (G)enus, or (S)pecies.
   Taxa that are not at any of these 10 ranks have a rank code that is
   formed by using the rank code of the closest ancestor rank with
   a number indicating the distance from that rank.  E.g., "G2" is a
   rank code indicating a taxon is between genus and species and the
   grandparent taxon is at the genus rank.

E - NCBI taxonomic ID number

F - Indented scientific name

### *Function* that parses through a Kraken report and returns the proportion of reads (from the sequencing run) that were identified as matching MTBC

In [7]:
def get_prop_reads_MTBC(kraken_report_path , kraken_MTBC_prop_output_path):
    
    '''
    This function parses through the kraken report for a sequenced isolate
    and writes a text file with the proportion of reads that were identified
    as MTBC from that sequenced isolate.

    The proportion is computed as

        ( reads assigned directly to each taxon on the path from 'root' down to 'Mycobacterium'
          + reads covered by the clade rooted at 'Mycobacterium tuberculosis complex' )
        / ( classified reads + unclassified reads )

    A taxon that has no row in the report is counted as 0 reads.

    kraken_report_path : path to the Kraken report (tab-separated, 6 columns, no header row)
    kraken_MTBC_prop_output_path : path of the text file that the proportion is written to

    Returns the proportion (float) that was written to the text file.
    Raises ValueError if the report holds no classified and no unclassified reads.
    '''
    
    #load Report spit out by Kraken
    kraken_report = pd.read_csv(kraken_report_path , sep = '\t' , names = ['A' , 'B' , 'C' , 'D' , 'E' , 'F'])

    def read_count(taxon_id , column):
        #value of 'column' in the first row whose taxon ID (column E) matches, 0 if there is no such row
        matching_counts = kraken_report.loc[kraken_report.E == taxon_id , column]
        if len(matching_counts) == 0:
            return 0
        return matching_counts.iloc[0]

    #(1)
    #get total number of reads that were run through Kraken (Classified + Unclassified)
    CLASSIFIED_read_count = read_count(1 , 'B')
    UNCLASSIFIED_read_count = read_count(0 , 'B')
    TOTAL_read_count = CLASSIFIED_read_count + UNCLASSIFIED_read_count

    if TOTAL_read_count == 0:
        raise ValueError('no classified or unclassified reads found in Kraken report: ' + str(kraken_report_path))

    #(2)
    #Scientific IDs of Interest - taxon IDs for which we want read count for, reads assigned to every taxon between ROOT & MYCOBACTERIUM
    ROOT_to_MYCOBACTERIUM_taxon_IDs = [1 , 131567 , 2 , 1783272 , 201174 , 1760 , 85007 , 1762 , 1763]

    #get total number of reads assigned directly (column C) to taxon between 'root' & 'Mycobacterium'
    ROOT_to_MYCOBACTERIUM_read_count = sum( read_count(taxon_id , 'C') for taxon_id in ROOT_to_MYCOBACTERIUM_taxon_IDs )

    #(3)
    #get total number of reads assigned to 'Mycobacterium tuberculosis complex' & every taxon below (column B)
    MTBC_and_BELOW_read_count = read_count(77643 , 'B')

    #calculate proportion of reads assigned to MTBC
    prop_reads_MTBC = float(ROOT_to_MYCOBACTERIUM_read_count + MTBC_and_BELOW_read_count) / float(TOTAL_read_count)
    
    #write the proportion of reads that were assigned to MTBC to a text file
    #('with' closes the file even if the write raises)
    with open(kraken_MTBC_prop_output_path , 'w') as f:
        f.write( str(prop_reads_MTBC) )

    return prop_reads_MTBC

In [8]:
#directory where Kraken output is stored (identical for every isolate, so it is set once before the loop)
output_dir = '/n/data1/hms/dbmi/farhat/Roger/inhost_TB_dynamics_project/JankyPipe/output_REPLICATES/'

#tags are read from the 'tag' column by name, so this does not depend on the column order of the CSV
for isolate_tag in sample_annotation['tag']:
    
    #directory for kraken2 output
    kraken2_dir = output_dir + isolate_tag + '/kraken2/'

    #parse through kraken output and calculate % of reads in MTBC
    get_prop_reads_MTBC( kraken2_dir + isolate_tag + '_trimmed_report.txt' , kraken2_dir + isolate_tag + '_MTBC_prop_from_Kraken.txt' )