In [1]:
#stretch the notebook's `.container` element to 100% width by injecting a CSS override into the page
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

#### This notebook was created to classify each sample from the 307 patients (N=614) as *Susceptible* or *Resistant* to several antibiotics, using SNPs detected in VCF files and the Predictive Resistance Mutations from Farhat et al. 2016

In [2]:
import vcf

%matplotlib inline
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import matplotlib.ticker as ticker

from pylab import plot, show, savefig, xlim, figure, hold, ylim, legend, boxplot, setp, axes
from itertools import compress
from pylab import MaxNLocator
import seaborn as sns; sns.set()
from matplotlib.colors import LogNorm
from matplotlib import gridspec
import ast
import itertools
import seaborn as sns
from sklearn.preprocessing import StandardScaler

import fastcluster
from sklearn import cluster, datasets
import scipy.cluster.hierarchy as hier
from sklearn.cluster import KMeans
import time
import sys
import math

import Bio
from Bio.Alphabet import IUPAC
from Bio.Blast.Applications import NcbiblastnCommandline
from Bio.Blast import NCBIXML
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.SeqFeature import SeqFeature, FeatureLocation
from Bio import pairwise2
from Bio import SeqIO
from Bio.Graphics import GenomeDiagram
from Bio.SeqUtils import GC

from Bio.Align.Applications import MuscleCommandline
from StringIO import StringIO
from Bio import AlignIO
from Bio.Align import AlignInfo
from Bio.Seq import MutableSeq
import itertools

import networkx as nx
import scipy

import datetime as dt
import statsmodels.api as sm
import scipy.stats as stats
import decimal

#for exporting to Adobe Illustrator
mpl.rcParams['pdf.fonttype'] = 42
mpl.rcParams['ps.fonttype'] = 42

### Import Sample Annotation file for all *longitudinal* isolate pairs

In [5]:
#load the sample annotation table (one row per sample) and index it by 'Patient ID'
sample_annotation = pd.read_csv('/n/data1/hms/dbmi/farhat/Roger/inhost_TB_dynamics_project/CSV_files/sample_annotation_files/Table S2B for DR genotyping.csv' , sep  = ',').set_index('Patient ID')

In [7]:
sample_annotation.head()

Unnamed: 0_level_0,Study Source,Run ID,Sample ID,Sample Order,tag,Isolate Type,Dates,Filtered
Patient ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
I0005973-8,Farhat et. al. 2019,MQFF00000000,Peru3062,1,Peru3062,Longitudinal Isolate,6/27/12,
I0005973-8,Farhat et. al. 2019,MQKH00000000,Peru3315,2,Peru3315,Longitudinal Isolate,8/20/12,
I0005229-5,Farhat et. al. 2019,MQAO00000000,Peru2908,1,Peru2908,Longitudinal Isolate,3/30/12,
I0005229-5,Farhat et. al. 2019,MQKD00000000,Peru3278,2,Peru3278,Longitudinal Isolate,8/6/12,
I0005235-2,Farhat et. al. 2019,MQBA00000000,Peru2921,1,Peru2921,Longitudinal Isolate,4/21/12,


In [8]:
np.shape(sample_annotation)

(614, 8)

### Load SNPs associated with drug resistance from Farhat et al. 2016

In [27]:
def load_drug_res_variants(filepath):
    '''
    Load a list of drug-resistance variants and return the SNPs as a DataFrame.

    Every non-blank line of the file is split on '_' and the first four fields are used:
        field 0 -> variant_type
        field 1 -> region_type
        field 2 -> ref_position (converted to int)
        field 3 -> its 1st character is the ref_allele, its 2nd character is the alt_allele
    Any further fields on the line are ignored.

    Parameters
    ----------
    filepath : str
        Path to a text file holding one variant name per line.

    Returns
    -------
    pandas.DataFrame
        Columns 'variant_type', 'region_type', 'ref_position', 'ref_allele', 'alt_allele',
        sorted by 'ref_position', with a single row per reference position, restricted
        to rows whose variant_type is 'SNP', and re-indexed 0..n-1.

    Raises
    ------
    IndexError / ValueError
        If a non-blank line has fewer than four '_'-separated fields, a fourth field
        shorter than two characters, or a non-integer third field.
    '''
    columns = ['variant_type' , 'region_type' , 'ref_position' , 'ref_allele' , 'alt_allele']

    #parse every line once and build the table in a single step (no row-by-row DataFrame growth)
    records = []
    with open(filepath) as fp:
        for line in fp:

            #skip blank lines (e.g. an empty line at the end of the file)
            if not line.strip():
                continue

            fields = line.split('_')
            alleles = fields[3]
            records.append([fields[0] , fields[1] , int(fields[2]) , alleles[0] , alleles[1]])

    AR_variants_DF = pd.DataFrame(records , columns = columns)

    #sort values by Reference Position; a stable sort keeps variants that share a position in file order
    AR_variants_DF = AR_variants_DF.sort_values(by = 'ref_position' , ascending = True , kind = 'mergesort')

    #drop duplicate reference positions (the variant listed first in the file is the one kept)
    #NOTE: this happens before the SNP filter below, so a non-SNP listed first at a position removes a later SNP at that position
    AR_variants_DF = AR_variants_DF.drop_duplicates(subset = 'ref_position' , keep = 'first')

    #drop any variant that isn't a SNP and re-index
    AR_variants_DF = AR_variants_DF[AR_variants_DF.variant_type == 'SNP']
    AR_variants_DF = AR_variants_DF.reset_index(drop = True)

    return AR_variants_DF

In [28]:
#directory holding the predictive resistance mutation lists (one combined file + one file per drug)
AR_MUTATIONS_DIR = '/n/data1/hms/dbmi/farhat/Roger/inhost_TB_dynamics_project/CSV_files/predictive_AR_mutations/'

def _AR_mutations_path(drug = None):
    '''
    Return the path of the mutation list for a drug abbreviation (e.g. 'RIF'),
    or the path of the combined list when drug is None.
    '''
    suffix = '' if drug is None else '_' + drug
    return AR_MUTATIONS_DIR + 'Table_S17_238_DR_mutations' + suffix + '.txt'

#all drugs combined
AR_ALL_variants_DF = load_drug_res_variants(_AR_mutations_path())

#one DataFrame per drug
AR_RIF_variants_DF = load_drug_res_variants(_AR_mutations_path('RIF'))
AR_INH_variants_DF = load_drug_res_variants(_AR_mutations_path('INH'))
AR_AMK_variants_DF = load_drug_res_variants(_AR_mutations_path('AMK'))
AR_CAP_variants_DF = load_drug_res_variants(_AR_mutations_path('CAP'))
AR_CIP_variants_DF = load_drug_res_variants(_AR_mutations_path('CIP'))
AR_EMB_variants_DF = load_drug_res_variants(_AR_mutations_path('EMB'))
AR_ETH_variants_DF = load_drug_res_variants(_AR_mutations_path('ETH'))
AR_KAN_variants_DF = load_drug_res_variants(_AR_mutations_path('KAN'))
AR_LEVO_variants_DF = load_drug_res_variants(_AR_mutations_path('LEVO'))
AR_OFLX_variants_DF = load_drug_res_variants(_AR_mutations_path('OFLX'))
AR_PAS_variants_DF = load_drug_res_variants(_AR_mutations_path('PAS'))
AR_PZA_variants_DF = load_drug_res_variants(_AR_mutations_path('PZA'))
AR_STR_variants_DF = load_drug_res_variants(_AR_mutations_path('STR'))

In [29]:
AR_ALL_variants_DF.head(n = 3)

Unnamed: 0,variant_type,region_type,ref_position,ref_allele,alt_allele
0,SNP,CN,6735,A,C
1,SNP,CN,7563,G,T
2,SNP,CN,7566,G,A


In [30]:
np.shape(AR_ALL_variants_DF)

(178, 5)

In [108]:
#reference positions of all predictive resistance SNPs; SNPs_from_VCF reads this module-level list to decide which calls to keep
H37Rv_positions_of_interest = list( AR_ALL_variants_DF.ref_position ) 

Create list of Resistance mutations from DataFrames

In [31]:
def _variant_labels(variants_DF):
    '''
    Return one '<ref_position>_<alt_allele>' string per row of variants_DF
    (a DataFrame with 'ref_position' and 'alt_allele' columns), in row order.
    '''
    return [str(ref_pos) + '_' + alt_allele for ref_pos, alt_allele in zip(variants_DF.ref_position , variants_DF.alt_allele)]

AR_RIF_variants = _variant_labels(AR_RIF_variants_DF)
AR_INH_variants = _variant_labels(AR_INH_variants_DF)
AR_AMK_variants = _variant_labels(AR_AMK_variants_DF)
AR_CAP_variants = _variant_labels(AR_CAP_variants_DF)
AR_CIP_variants = _variant_labels(AR_CIP_variants_DF)
AR_EMB_variants = _variant_labels(AR_EMB_variants_DF)
AR_ETH_variants = _variant_labels(AR_ETH_variants_DF)
AR_KAN_variants = _variant_labels(AR_KAN_variants_DF)
AR_LEVO_variants = _variant_labels(AR_LEVO_variants_DF)
AR_OFLX_variants = _variant_labels(AR_OFLX_variants_DF)
AR_PAS_variants = _variant_labels(AR_PAS_variants_DF)
AR_PZA_variants = _variant_labels(AR_PZA_variants_DF)
AR_STR_variants = _variant_labels(AR_STR_variants_DF)

### *Function* to Extract SNPs from VCF file

In [118]:
def SNPs_from_VCF(VCF_file):
    '''
    Collect the high-quality SNP calls of a single isolate that fall on a
    reference position listed in the module-level H37Rv_positions_of_interest.

    A record is kept only if ALL of the following hold:
        - its FILTER field is empty (record.FILTER == [])
        - its position is in H37Rv_positions_of_interest
        - it is a SNP: 1-base REF, exactly one ALT allele, 1-base ALT
        - INFO['BQ'] > 20, INFO['MQ'] > 30, INFO['DC'] == 0, INFO['IC'] == 0 and INFO['DP'] >= 25

    No allele-frequency threshold is applied in this function.

    Parameters
    ----------
    VCF_file : str
        Path to the VCF file of one isolate.

    Returns
    -------
    pandas.DataFrame
        Columns 'ref_base', 'alt_base', 'ref_position'; one row per retained call,
        indexed 0..n-1 in the order the calls appear in the VCF (no rows if nothing passes).
    '''
    #a set makes the per-record membership test constant time
    positions_of_interest = set(H37Rv_positions_of_interest)

    #one (ref_base, alt_base, ref_position) tuple per retained call
    retained_calls = []

    #the context manager guarantees the VCF handle is closed once parsing is done
    with open(VCF_file , 'r') as vcf_handle:

        vcf_reader = vcf.Reader(vcf_handle)

        #iterate through each Variant Call
        for record in vcf_reader:

            #skip calls whose FILTER field is not empty (not a PASS)
            if record.FILTER != []:
                continue

            #skip calls that are not located on a position associated with Antibiotic Resistance
            if record.POS not in positions_of_interest:
                continue

            #skip anything that is not a SNP; length of reference allele is 1, there is only 1 alternate allele, length of alternate allele is 1
            is_SNP = (len(record.REF) == 1) and (len(record.ALT) == 1) and (len(str( record.ALT[0] )) == 1)
            if not is_SNP:
                continue

            ##### Retrieve Relevant information for filtering quality of Base Call #####
            # Mean Base Quality @ locus
            BQ = record.INFO['BQ']
            # Mean Mapping Quality @ locus
            MQ = record.INFO['MQ']
            # Number of Reads w/ Deletion
            DC = record.INFO['DC']
            # Number of Reads w/ Insertion
            IC = record.INFO['IC']
            # Depth of Valid Reads in Pileup
            VD = record.INFO['DP']

            ### Filtering Criteria
            #---> Mean Base Quality > 20
            #---> Mean Mapping Quality > 30
            #---> No Reads Supporting Insertions
            #---> No Reads Supporting Deletions
            #---> Number of High Quality Reads >= 25
            if (BQ > 20) and (MQ > 30) and (DC == 0) and (IC == 0) and (VD >= 25): #SNP passed all filtering criteria!
                retained_calls.append((str(record.REF) , str(record.ALT[0]) , record.POS))

    #create DataFrame to hold all base calls for a given sample
    Variant_Call_DF = pd.DataFrame(retained_calls , columns = ['ref_base' , 'alt_base' , 'ref_position'])

    return Variant_Call_DF #DataFrame for base calls for a single isolate

### *Functions* to Annotate SNPs

In [33]:
# Important Packages
################################################################################################################################################################################################
import os
import pandas as pd
import numpy as np
import sys
import pickle

import Bio
from Bio.Alphabet import IUPAC
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.SeqFeature import SeqFeature, FeatureLocation
from Bio import SeqIO
from StringIO import StringIO
from Bio import AlignIO
from Bio.Align import AlignInfo
from Bio.Seq import MutableSeq
################################################################################################################################################################################################


# Relevant Information for H37Rv sequence SNP functional annotation
################################################################################################################################################################################################
####### Collect all DNA and Amino Acid sequences corresponding to genes on H37Rv #######
#load reference genome and reference annotation
reference_genome_path = '/n/data1/hms/dbmi/farhat/bin/work-horse/bin/h37rv.fasta'

#tag every record's sequence as unambiguous DNA; after the loop `reference_genome` is the last record parsed from the FASTA file
for reference_genome in SeqIO.parse(reference_genome_path, "fasta"):
    reference_genome.seq.alphabet = IUPAC.unambiguous_dna

#tab-separated annotation table indexed by the 'name' column; `sep` is passed by keyword because newer pandas no longer accepts it positionally
reference_genome_annotation = pd.read_csv('/n/data1/hms/dbmi/farhat/Roger/inhost_TB_dynamics_project/H37Rv/h37rv_genome_summary.txt', sep = '\t').set_index('name')

####### Function to translate coding DNA sequences ####### 
def translate(gene_id, sequence):
    '''
    Translate the DNA sequence of a gene into a protein sequence using the "Bacterial" codon table.

    The strand of the gene is looked up in reference_genome_annotation; the sequence
    is translated as given for '+' strand genes and reverse-complemented first for
    '-' strand genes.

    Parameters
    ----------
    gene_id : label of the gene in the index of reference_genome_annotation
    sequence : sequence object providing .translate() and .reverse_complement()

    Returns
    -------
    The translated sequence (translated with cds=False).

    Raises
    ------
    ValueError
        If the annotated strand is neither '+' nor '-'.
    '''
    #find which strand the gene is located on
    strand = reference_genome_annotation.loc[gene_id, 'strand']

    if strand == '+':
        coding_sequence = sequence
    elif strand == '-':
        coding_sequence = sequence.reverse_complement()
    else:
        #fail with a clear message instead of an UnboundLocalError further down
        raise ValueError("unexpected strand %r for gene %r (expected '+' or '-')" % (strand, gene_id))

    return coding_sequence.translate(table="Bacterial", cds=False)

####### Load in dictionaries for SNP annotation #######
def _unpickle(pickle_path):
    '''Open pickle_path in binary mode and return the object stored in it.'''
    #NOTE: un-pickling can execute arbitrary code, only point this at files you trust
    with open(pickle_path, 'rb') as handle:
        return pickle.load(handle)

#gene id -> record whose .seq is the DNA sequence of the gene
ref_gene_sequences_records = _unpickle('/n/data1/hms/dbmi/farhat/Roger/inhost_TB_dynamics_project/pickled_files/dicts_for_SNP_annotation/H37Rv_gene_seq_records.pickle')

#gene id -> record whose .seq is the protein sequence of the gene
ref_protein_sequences_records = _unpickle('/n/data1/hms/dbmi/farhat/Roger/inhost_TB_dynamics_project/pickled_files/dicts_for_SNP_annotation/H37Rv_protein_seq_records.pickle')

#H37Rv reference position -> list of the genes covering that position (empty list if none)
ReferencePosition_Gene_mapping = _unpickle('/n/data1/hms/dbmi/farhat/Roger/inhost_TB_dynamics_project/pickled_files/dicts_for_SNP_annotation/H37Rv_coord_gene_mapping.pickle')
    
####### get Gene Categories #######
gene_categories = pd.read_csv('/n/data1/hms/dbmi/farhat/Roger/inhost_TB_dynamics_project/CSV_files/gene_categories/gene_categories.csv').set_index('name')

#gene id -> gene category
gene_categories_dict = dict(zip(gene_categories.gene_id , gene_categories.Gene_Category))

####### get Gene Symbols #######
#gene id (index of the reference annotation) -> gene symbol
gene_symbol_dict = dict(zip(reference_genome_annotation.index , reference_genome_annotation.symbol))
################################################################################################################################################################################################


# Function to annotate Intergenic SNPs
################################################################################################################################################################################################
def find_flanking_genes_for_intergenic_region(intergenic_ref_pos): 
    '''
    Find the two genes flanking an intergenic reference position.

    ReferencePosition_Gene_mapping is scanned outward from intergenic_ref_pos, one
    position at a time (up to 100000 positions per direction), until a position
    mapped to at least one gene is found on each side; the first gene listed at
    that position is used.

    Returns the string '<gene_to_left>_<gene_to_right>'.
    '''
    search_offsets = range(0 , 100000)

    #walk toward the 5' direction until a position covered by a gene is reached
    for offset in search_offsets:

        genes_upstream = ReferencePosition_Gene_mapping[intergenic_ref_pos - offset]
        if genes_upstream != []:
            gene_to_left = genes_upstream[0]
            break

    #walk toward the 3' direction until a position covered by a gene is reached
    for offset in search_offsets:

        try:
            genes_downstream = ReferencePosition_Gene_mapping[intergenic_ref_pos + offset]

        #KeyError means we walked off the 'end' of the chromosome (intergenic region at the end of H37Rv in 5' > 3' orientation)
        #since the TB chromosome is circular, the gene to the 'right' is then Rv0001
        except KeyError:
            gene_to_right = 'Rv0001'
            break

        if genes_downstream != []:
            gene_to_right = genes_downstream[0]
            break

    return '_'.join([gene_to_left , gene_to_right])
################################################################################################################################################################################################


# Function to determine whether SNPs are Synonymous or Non-Synonymous; Returns gene coordinate, codon position, AA changes, Gene Category & Symbol
################################################################################################################################################################################################
def SNP_annotate(ref_seq_position , alt_allele_i):
    
    '''
    Annotate a single-base change at a reference position on H37Rv.

    If the position is mapped to one or more genes (ReferencePosition_Gene_mapping),
    then for each gene the alternate allele is substituted into the gene sequence,
    the mutated sequence is translated and compared against the H37Rv protein
    sequence to decide whether the change is Synonymous ('S') or Non-Synonymous ('N').
    If alt_allele_i is not one of 'A','C','G','T' the SNP type and AA change are 'None'.
    If the position is not mapped to any gene the SNP is intergenic ('I') and is
    labelled with its two flanking genes.

    Parameters
    ----------
    ref_seq_position : int
        1-based reference position on H37Rv.
    alt_allele_i : str
        Alternate allele at that position.

    Returns
    -------
    list
        [ref_allele , gene_id , gene_coord , gene_category , gene_symbol , SNP_type , AA_change]
        - gene_coord is the coordinate of the SNP relative to the gene in the 5' to 3' direction
        - AA_change is '<reference AA><codon number><alternate AA>'
        When the position maps to a single gene (or is intergenic) the last six elements
        are scalars; when it maps to several genes each of them is a list with one
        entry per gene.

    Raises
    ------
    ValueError
        If the annotated strand of a gene is neither '+' nor '-'.
    '''
    gene_intergenic_id_list = []
    genomic_coord_list = []
    gene_category_list = []
    gene_symbol_list = []
    Syn_NSyn_list = []
    AA_change_list = []
    
    #get the Reference Allele from the complete H37Rv reference genome, indexing starts from 0
    ref_allele_i = reference_genome.seq[int(ref_seq_position) - 1] 
    
    #genes that the Reference Position is mapped to (i.e. SNP might correspond to 2 genes); empty if intergenic
    genes_at_position = ReferencePosition_Gene_mapping[ref_seq_position]
    
    if len(genes_at_position) > 0:

        for gene_intergenic_id in genes_at_position:

            chrom_start = reference_genome_annotation.loc[gene_intergenic_id , 'chromStart']
            chrom_end = reference_genome_annotation.loc[gene_intergenic_id , 'chromEnd']
            strand = reference_genome_annotation.loc[gene_intergenic_id , 'strand']

            #0-based index of the SNP within the stored gene sequence (subtract 1 since reference seq starts counting at 1)
            gene_relative_coord = (ref_seq_position - 1) - min(chrom_start , chrom_end)
            
            #find the genomic coordinate (relative to the gene, in the 5' to 3' direction)
            if strand == '+':
                genomic_5_to_3_coord = ref_seq_position - chrom_start
            elif strand == '-':
                genomic_5_to_3_coord = chrom_end - (ref_seq_position - 1)
            else:
                #without this the coordinate of a previously processed gene would be silently re-used
                raise ValueError("unexpected strand %r for gene %r (expected '+' or '-')" % (strand, gene_intergenic_id))
                    
            #find gene category & gene symbol (if they exist)
            gene_category_i = gene_categories_dict.get(gene_intergenic_id , 'None')
            gene_symbol_i = gene_symbol_dict.get(gene_intergenic_id , 'None')
            
            #alternate allele is an actual base
            if alt_allele_i in ['A','C','G','T']:

                #put the alternate allele into a mutable copy of the reference gene sequence
                test_gene_sequence = ref_gene_sequences_records[gene_intergenic_id].seq.tomutable()
                test_gene_sequence[int(gene_relative_coord)] = alt_allele_i

                #translate the mutated sequence (converted back to an immutable object) into an amino acid seq
                test_protein_sequence = translate(gene_intergenic_id , test_gene_sequence.toseq())

                #store the H37Rv AA seq to compare against
                H37Rv_AA_sequence = ref_protein_sequences_records[gene_intergenic_id].seq

                #get the codon number where the SNP occurs within
                ## take the genomic coordinate (relative to the gene, in the 5' to 3' direction), divide by 3, then take the ceiling of this number (will be fraction if SNP occurs in 1st or 2nd position on codon)
                codon_coord = int(np.ceil( float( genomic_5_to_3_coord) / 3.0 ))

                #Synonymous if the protein is unchanged, Non-Synonymous otherwise
                if test_protein_sequence == H37Rv_AA_sequence:
                    SNP_type = 'S'
                else:
                    SNP_type = 'N'

                #get the AA before & after
                AA_change = H37Rv_AA_sequence[codon_coord-1] + str(codon_coord) + test_protein_sequence[codon_coord-1]
                    
            #alternate allele is a dummy (Base Call completely supports the Reference Allele)       
            else:
                SNP_type = 'None'
                AA_change = 'None'

            #store relevant info in lists    
            gene_intergenic_id_list.append(gene_intergenic_id)
            genomic_coord_list.append(genomic_5_to_3_coord)
            gene_category_list.append(gene_category_i)
            gene_symbol_list.append(gene_symbol_i)
            Syn_NSyn_list.append(SNP_type)
            AA_change_list.append(AA_change)
    
    #if no gene in H37Rv corresponds to the Reference Position for SNP, then SNP must be intergenic
    else:
        gene_intergenic_id_list.append(find_flanking_genes_for_intergenic_region(ref_seq_position))
        genomic_coord_list.append('None')
        gene_category_list.append('None')
        gene_symbol_list.append('None')
        Syn_NSyn_list.append('I')
        AA_change_list.append('None')
    
    #if there is only a single annotation for this SNP, just return the individual elements
    if len(gene_intergenic_id_list) == 1:
        return [ref_allele_i , gene_intergenic_id_list[0] , genomic_coord_list[0] , gene_category_list[0] , gene_symbol_list[0] , Syn_NSyn_list[0] , AA_change_list[0]]
    
    #else there are several genes associated with this SNP, return elements for each SNP annotation in a list
    return [ref_allele_i , gene_intergenic_id_list , genomic_coord_list , gene_category_list , gene_symbol_list , Syn_NSyn_list , AA_change_list]
################################################################################################################################################################################################

### Reduce full VCF corresponding to each sample from each patient (deletes lines that correspond to Reference Positions where reads support the Reference Allele)

################################################################################################################################################################################

### Create the script

In [None]:
#!/usr/bin/env python

'''
This script reduces the size of a pilon-outputted VCF file (in place) by deleting
the records that carry no variant, i.e. records with a single-base REF and ALT == "."
(Reference Positions where the reads support the Reference Allele).

usage: <this script> <file_in.vcf>
'''

import re
import sys
import subprocess as sp


def reduce_pilon_vcf(vcf_path):
    '''
    Rewrite the VCF file at vcf_path without its single-base reference-only records.

    Header/comment lines (starting with "#") and every other record are kept
    unchanged and in order. The reduced content is first written to
    vcf_path + "_smallvcf", which is then moved over vcf_path with `mv`.

    Returns the exit status of the `mv` command.
    '''
    reduced_path = vcf_path + "_smallvcf"

    with open(reduced_path, "w") as outf:
        with open(vcf_path, "r") as inp:
            for line in inp:

                #always keep the comment lines
                if line.startswith("#"):
                    outf.write(line)
                    continue

                data = line.rstrip("\n").split("\t")

                #if ALT is "." and the REF has only one base -> skip it
                if (len(data[3]) == 1) and (data[4] == "."):
                    continue

                outf.write(line)

    #pass the arguments as a list (no shell) so paths with spaces or shell metacharacters are handled safely
    cmd = ["mv", reduced_path, vcf_path]
    print(" ".join(cmd))
    return sp.call(cmd)


if __name__ == "__main__":

    if len(sys.argv) < 2:
        sys.stderr.write("::usage: %s <file_in.vcf>\n" % sys.argv[0])
        sys.exit(1)

    reduce_pilon_vcf(sys.argv[1])

### Submit Jobs to run script on each sample

In [56]:
from slurmpy import Slurm

Create BASH scripts to reduce VCF files of N isolates per job

In [84]:
#number of isolates whose VCF files are reduced within a single job
N_isolates_per_job = 20

#number of jobs required if we split for every N isolates = (number of samples / N) rounded up
njobs = int( np.ceil( len(sample_annotation) / float(N_isolates_per_job) ) )

In [85]:
#function-call form prints the same text on Python 2 and also parses on Python 3
print(njobs)

31


In [103]:
#bash_scripts holds one list of shell commands per job; every isolate contributes 2 commands (copy the VCF, then reduce the copy)
bash_scripts = []

#commands of the job that is currently being filled
commands_list = []

for sample_tag in sample_annotation.tag:
    
    cmd_1 = 'cp /n/data1/hms/dbmi/farhat/Roger/inhost_TB_dynamics_project/JankyPipe/output/{0}/pilon/{0}.vcf /n/data1/hms/dbmi/farhat/Roger/inhost_TB_dynamics_project/JankyPipe/reduced_VCF_files_for_all_longitudinal_isolates'.format(sample_tag)
    cmd_2 = 'python /n/data1/hms/dbmi/farhat/Roger/inhost_TB_dynamics_project/python_scripts/reduce-pilon-vcf-size.py /n/data1/hms/dbmi/farhat/Roger/inhost_TB_dynamics_project/JankyPipe/reduced_VCF_files_for_all_longitudinal_isolates/{0}.vcf'.format(sample_tag)
    
    commands_list.append(cmd_1)
    commands_list.append(cmd_2)
    
    #job is full once it holds the commands of N isolates -> store it and start a new one
    if len(commands_list) == 2 * N_isolates_per_job:
        bash_scripts.append(commands_list)
        commands_list = []

#add last few isolates (only if any are left over, so that no empty job is created when
#the number of isolates is a multiple of N_isolates_per_job)
if commands_list:
    bash_scripts.append(commands_list)

#### Submit each job to O2

In [104]:
#directory where you want output + error files (does not change between jobs, so it is set once)
os.chdir('/n/data1/hms/dbmi/farhat/Roger/inhost_TB_dynamics_project/JankyPipe/reduced_VCF_files_for_all_longitudinal_isolates/O2_SLURM_logs/')

job_num = 1
for job_i_commands_list in bash_scripts:

    #nothing to run -> do not submit an empty job
    if not job_i_commands_list:
        continue
    
    #append all commands in a single string to be submitted as a job (every command is preceded by a newline)
    VCF_reduce_job = ''.join('\n' + command_i for command_i in job_i_commands_list)

    job_name = 'VCF_RED' + str(job_num)

    s = Slurm(job_name , {'partition':'short' , 'N':'1' , 't':'0-12:00:00' , 'mem':'2G' , 'mail-type':'FAIL' , 'mail-user':'roger_vargas@g.harvard.edu'})

    #submits the job
    job_id = s.run(VCF_reduce_job)

    print(job_name  + ' : ' +  str(job_id))
    job_num += 1

submitted: Submitted batch job 20693354
submitted: Submitted batch job 20693355


VCF_RED1 : 20693354
VCF_RED2 : 20693355


submitted: Submitted batch job 20693356


VCF_RED3 : 20693356


submitted: Submitted batch job 20693357
submitted: Submitted batch job 20693358


VCF_RED4 : 20693357
VCF_RED5 : 20693358


submitted: Submitted batch job 20693359
submitted: Submitted batch job 20693360
submitted: Submitted batch job 20693361


VCF_RED6 : 20693359
VCF_RED7 : 20693360
VCF_RED8 : 20693361


submitted: Submitted batch job 20693362
submitted: Submitted batch job 20693363


VCF_RED9 : 20693362
VCF_RED10 : 20693363


submitted: Submitted batch job 20693364
submitted: Submitted batch job 20693365


VCF_RED11 : 20693364
VCF_RED12 : 20693365


submitted: Submitted batch job 20693366
submitted: Submitted batch job 20693367


VCF_RED13 : 20693366
VCF_RED14 : 20693367


submitted: Submitted batch job 20693368
submitted: Submitted batch job 20693369


VCF_RED15 : 20693368
VCF_RED16 : 20693369


submitted: Submitted batch job 20693370
submitted: Submitted batch job 20693371


VCF_RED17 : 20693370
VCF_RED18 : 20693371


submitted: Submitted batch job 20693372
submitted: Submitted batch job 20693373


VCF_RED19 : 20693372
VCF_RED20 : 20693373


submitted: Submitted batch job 20693374


VCF_RED21 : 20693374


submitted: Submitted batch job 20693375
submitted: Submitted batch job 20693376


VCF_RED22 : 20693375
VCF_RED23 : 20693376


submitted: Submitted batch job 20693377
submitted: Submitted batch job 20693378


VCF_RED24 : 20693377
VCF_RED25 : 20693378


submitted: Submitted batch job 20693379
submitted: Submitted batch job 20693380


VCF_RED26 : 20693379
VCF_RED27 : 20693380


submitted: Submitted batch job 20693381
submitted: Submitted batch job 20693382
submitted: Submitted batch job 20693383


VCF_RED28 : 20693381
VCF_RED29 : 20693382
VCF_RED30 : 20693383
VCF_RED31 : 20693384


submitted: Submitted batch job 20693384


################################################################################################################################################################################

### Iterate through Reduced VCF corresponding to each sample from each subject and collect all AR SNPs if present

In [34]:
sample_annotation.head()

Unnamed: 0_level_0,Study Source,Run ID,Sample ID,Sample Order,tag,Isolate Type,Dates,Filtered
Patient ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
I0005973-8,Farhat et. al. 2019,MQFF00000000,Peru3062,1,Peru3062,Longitudinal Isolate,6/27/12,
I0005973-8,Farhat et. al. 2019,MQKH00000000,Peru3315,2,Peru3315,Longitudinal Isolate,8/20/12,
I0005229-5,Farhat et. al. 2019,MQAO00000000,Peru2908,1,Peru2908,Longitudinal Isolate,3/30/12,
I0005229-5,Farhat et. al. 2019,MQKD00000000,Peru3278,2,Peru3278,Longitudinal Isolate,8/6/12,
I0005235-2,Farhat et. al. 2019,MQBA00000000,Peru2921,1,Peru2921,Longitudinal Isolate,4/21/12,


In [35]:
np.shape(sample_annotation)

(614, 8)

In [121]:
#Collect the non-synonymous SNPs detected in the Reduced VCF of every isolate into a single
#DataFrame (all_AR_SNPS_in_samples_df) with one row per SNP per isolate, annotated with gene
#information and labelled with the patient ID & isolate tag that the SNP was found in.
#
#The per-isolate DataFrames are gathered in a list and concatenated once after the loop; growing a
#DataFrame with .append inside the loop re-copies every previously collected row on each iteration
#(and DataFrame.append no longer exists in recent pandas releases).
AR_SNPs_per_isolate = []

#directory holding one Reduced VCF file per isolate, named <isolate tag>.vcf
Reduced_VCF_dir = '/n/data1/hms/dbmi/farhat/Roger/inhost_TB_dynamics_project/JankyPipe/reduced_VCF_files_for_all_longitudinal_isolates/'

#annotation columns, in the same order as the values returned by SNP_annotate(...)[1:]
SNP_annotation_columns = ['gene_id' , 'gene_coord' , 'gene_category' , 'gene_symbol' , 'SNP_ftype' , 'AA_change']

isolate_i = 0

#sample_annotation is indexed by Patient ID, so iterating over the index & tag column together gives
#the patient that each isolate belongs to without re-querying the table for every isolate
for patient_id , isolate_tag in zip(sample_annotation.index , sample_annotation.tag):

    #path to Reduced VCF file
    Reduced_VCF_file = Reduced_VCF_dir + isolate_tag + '.vcf'

    #pull SNPs from VCF file
    SNPs_from_isolate = SNPs_from_VCF(Reduced_VCF_file)

    ################################################################################
    ### Annotate SNPs
    ################################################################################

    #Annotate Filtered Base Calls (make sure there is at least 1 SNP)
    if np.shape(SNPs_from_isolate)[0] > 0:

        #annotate each SNP (the first value returned by SNP_annotate is not needed here)
        SNP_annotations = [SNP_annotate(ref_position_i , alt_base_i)[1:] for ref_position_i , alt_base_i in zip(list(SNPs_from_isolate.ref_position) , list(SNPs_from_isolate.alt_base))]

        #create columns to store SNP annotation info (transpose the per-SNP tuples into per-column values)
        for column_name , column_values in zip(SNP_annotation_columns , zip(*SNP_annotations)):
            SNPs_from_isolate[column_name] = list(column_values)

    #No SNPs detected from this isolate (empty DataFrame), still create the columns so every
    #per-isolate DataFrame has the same layout
    else:

        for column_name in SNP_annotation_columns:
            SNPs_from_isolate[column_name] = ""

    #drop synonymous SNPs & re-index (reset_index returns a new DataFrame, so the column
    #assignments below do not write into a slice of the unfiltered DataFrame)
    SNPs_from_isolate = SNPs_from_isolate[SNPs_from_isolate.SNP_ftype != 'S'].reset_index(drop = True)

    #add columns for patient_id & isolate tag
    SNPs_from_isolate['patient_id'] = [patient_id]*np.shape(SNPs_from_isolate)[0]
    SNPs_from_isolate['isolate_tag'] = [isolate_tag]*np.shape(SNPs_from_isolate)[0]

    #store the SNPs detected in this isolate
    AR_SNPs_per_isolate.append(SNPs_from_isolate)

    #report progress every 50 isolates
    isolate_i += 1
    if isolate_i % 50 == 0:
        print(isolate_i)

#create a DataFrame that stores all AR SNPs detected across all of the samples, with a fresh 0..N-1 index
#(pd.concat raises on an empty list, so fall back to an empty DataFrame if there were no isolates)
if len(AR_SNPs_per_isolate) > 0:
    all_AR_SNPS_in_samples_df = pd.concat(AR_SNPs_per_isolate , ignore_index = True)
else:
    all_AR_SNPS_in_samples_df = pd.DataFrame()

20
40
60
80
100
120
140
160
180
200
220
240
260
280
300
320
340
360
380
400
420
440
460
480
500
520
540
560
580
600


In [122]:
#preview the first rows of the annotated SNPs collected across all isolates
all_AR_SNPS_in_samples_df.head()

Unnamed: 0,ref_base,alt_base,ref_position,gene_id,gene_coord,gene_category,gene_symbol,SNP_ftype,AA_change,patient_id,isolate_tag
0,A,T,761110.0,Rv0667,1304,Antibiotic Resistance,rpoB,N,D435V,I0005973-8,Peru3062
1,C,G,2155168.0,Rv1908c,944,Antibiotic Resistance,katG,N,S315T,I0005973-8,Peru3062
2,T,C,2289213.0,Rv2043c,29,Antibiotic Resistance,pncA,N,Q10R,I0005973-8,Peru3062
3,G,A,2518919.0,Rv2245,805,Antibiotic Resistance,kasA,N,G269S,I0005973-8,Peru3062
4,G,T,4242182.0,Rv3793,2320,Antibiotic Resistance,embC,N,A774S,I0005973-8,Peru3062


In [123]:
#(number of SNPs collected across all isolates , number of columns)
np.shape(all_AR_SNPS_in_samples_df)

(1013, 11)

#### Filter out any *gid* E92D mutations since these are likely lineage markers

In [125]:
#boolean mask (one entry per SNP) that is False only for SNPs annotated as the E92D amino acid change
#in gene Rv3919c; computed column-wise instead of looking up every row individually with .loc
non_gid_E92D_SNPs_filter = ~((all_AR_SNPS_in_samples_df.AA_change == 'E92D') & (all_AR_SNPS_in_samples_df.gene_id == 'Rv3919c'))

#keep the SNPs that pass the filter & reset index
all_AR_SNPS_in_samples_df = all_AR_SNPS_in_samples_df[non_gid_E92D_SNPs_filter].reset_index(drop = True)

In [126]:
#(number of SNPs , number of columns) after the preceding filtering step
np.shape(all_AR_SNPS_in_samples_df)

(915, 11)

### Get Genotype Drug Resistance classification for each sample

In [130]:
#Classify every sample as Susceptible ('S') or Resistant ('R') to each drug: a sample is called 'R'
#for a drug if at least one SNP detected in that sample is present in that drug's collection of
#resistance variants, otherwise it is called 'S'.

#drugs to genotype, each paired with its collection of resistance variants
AR_variants_by_drug = [
    ('RIF' , AR_RIF_variants) ,
    ('INH' , AR_INH_variants) ,
    ('AMK' , AR_AMK_variants) ,
    ('CAP' , AR_CAP_variants) ,
    ('CIP' , AR_CIP_variants) ,
    ('EMB' , AR_EMB_variants) ,
    ('ETH' , AR_ETH_variants) ,
    ('KAN' , AR_KAN_variants) ,
    ('LEVO' , AR_LEVO_variants) ,
    ('OFLX' , AR_OFLX_variants) ,
    ('PAS' , AR_PAS_variants) ,
    ('PZA' , AR_PZA_variants) ,
    ('STR' , AR_STR_variants) ,
]

#resistance calls for each drug, one call per sample in the order of sample_annotation.tag
DR_calls_by_drug = {drug : [] for drug , AR_variants_for_drug in AR_variants_by_drug}

for sample_tag in sample_annotation.tag:

    #subset to AR SNPs detected in sample
    AR_SNPs_df_for_sample = all_AR_SNPS_in_samples_df[all_AR_SNPS_in_samples_df.isolate_tag == sample_tag]

    #encode each SNP as '<Ref Position>_<Alternate Allele>' (ref_position is cast to int first so
    #that a position stored as a float, e.g. 761110.0, is written as '761110')
    AR_SNPs_for_sample = [str(int(ref_pos)) + '_' + alt_allele for ref_pos, alt_allele in zip(AR_SNPs_df_for_sample.ref_position , AR_SNPs_df_for_sample.alt_base)]

    #check for SNP assoc. with resistance to different drugs
    for drug , AR_variants_for_drug in AR_variants_by_drug:

        is_resistant = any(AR_SNP in AR_variants_for_drug for AR_SNP in AR_SNPs_for_sample)
        DR_calls_by_drug[drug].append('R' if is_resistant else 'S')

#expose the calls under the per-drug list names
RIF_DR_list = DR_calls_by_drug['RIF']
INH_DR_list = DR_calls_by_drug['INH']
AMK_DR_list = DR_calls_by_drug['AMK']
CAP_DR_list = DR_calls_by_drug['CAP']
CIP_DR_list = DR_calls_by_drug['CIP']
EMB_DR_list = DR_calls_by_drug['EMB']
ETH_DR_list = DR_calls_by_drug['ETH']
KAN_DR_list = DR_calls_by_drug['KAN']
LEVO_DR_list = DR_calls_by_drug['LEVO']
OFLX_DR_list = DR_calls_by_drug['OFLX']
PAS_DR_list = DR_calls_by_drug['PAS']
PZA_DR_list = DR_calls_by_drug['PZA']
STR_DR_list = DR_calls_by_drug['STR']

In [134]:
#add one genotypic resistance column per drug to the sample annotation table; the pairs are listed
#in the order in which the columns should appear in the table
DR_calls_for_each_drug = [
    ('RIF' , RIF_DR_list) ,
    ('INH' , INH_DR_list) ,
    ('AMK' , AMK_DR_list) ,
    ('CAP' , CAP_DR_list) ,
    ('CIP' , CIP_DR_list) ,
    ('EMB' , EMB_DR_list) ,
    ('ETH' , ETH_DR_list) ,
    ('KAN' , KAN_DR_list) ,
    ('LEVO' , LEVO_DR_list) ,
    ('OFLX' , OFLX_DR_list) ,
    ('PAS' , PAS_DR_list) ,
    ('PZA' , PZA_DR_list) ,
    ('STR' , STR_DR_list) ,
]

for drug_column , DR_calls in DR_calls_for_each_drug:
    sample_annotation.loc[: , drug_column] = DR_calls

In [136]:
#preview the sample annotation table with the per-drug resistance columns added
sample_annotation.head()

Unnamed: 0_level_0,Study Source,Run ID,Sample ID,Sample Order,tag,Isolate Type,Dates,Filtered,RIF,INH,...,CAP,CIP,EMB,ETH,KAN,LEVO,OFLX,PAS,PZA,STR
Patient ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
I0005973-8,Farhat et. al. 2019,MQFF00000000,Peru3062,1,Peru3062,Longitudinal Isolate,6/27/12,,R,R,...,S,S,S,S,S,S,S,S,S,S
I0005973-8,Farhat et. al. 2019,MQKH00000000,Peru3315,2,Peru3315,Longitudinal Isolate,8/20/12,,R,R,...,S,S,S,S,S,S,S,S,S,S
I0005229-5,Farhat et. al. 2019,MQAO00000000,Peru2908,1,Peru2908,Longitudinal Isolate,3/30/12,,R,R,...,S,S,S,S,S,S,S,S,S,S
I0005229-5,Farhat et. al. 2019,MQKD00000000,Peru3278,2,Peru3278,Longitudinal Isolate,8/6/12,,R,R,...,S,S,S,S,S,S,S,S,S,S
I0005235-2,Farhat et. al. 2019,MQBA00000000,Peru2921,1,Peru2921,Longitudinal Isolate,4/21/12,,R,R,...,S,R,S,S,S,R,R,S,S,S


In [137]:
#write the sample annotation table (including its Patient ID index) to a new CSV file,
#separate from the 'Table S2B for DR genotyping.csv' file that the table was loaded from
sample_annotation.to_csv('/n/data1/hms/dbmi/farhat/Roger/inhost_TB_dynamics_project/CSV_files/sample_annotation_files/Table S2B after DR genotyping.csv')