In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

#### This notebook was written to look for convergent evolution at the *pathway level* for in-host SNPs

In [2]:
import vcf

%matplotlib inline
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import matplotlib.ticker as ticker
from pylab import plot, show, savefig, xlim, figure, hold, ylim, legend, boxplot, setp, axes
from itertools import compress
from pylab import MaxNLocator
import seaborn as sns; sns.set()
from matplotlib.colors import LogNorm
from matplotlib import gridspec
import ast
import itertools
import seaborn as sns
from sklearn.preprocessing import StandardScaler

import fastcluster
from sklearn import cluster, datasets
import scipy.cluster.hierarchy as hier
from sklearn.cluster import KMeans
import time
import sys

import Bio
from Bio.Alphabet import IUPAC
from Bio.Blast.Applications import NcbiblastnCommandline
from Bio.Blast import NCBIXML
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.SeqFeature import SeqFeature, FeatureLocation
from Bio import pairwise2
from Bio import SeqIO
from Bio.Graphics import GenomeDiagram
from Bio.SeqUtils import GC

from Bio.Align.Applications import MuscleCommandline
from StringIO import StringIO
from Bio import AlignIO
from Bio.Align import AlignInfo
from Bio.Seq import MutableSeq
import itertools

import networkx as nx
import scipy

#for exporting to Adobe Illustrator
mpl.rcParams['pdf.fonttype'] = 42
mpl.rcParams['ps.fonttype'] = 42

### Import annotation from Mycobrowser

In [3]:
mycobrowser_table = pd.read_csv('/n/data1/hms/dbmi/farhat/Roger/inhost_TB_dynamics_project/CSV_files/H37Rv_info/Mycobrowser_Release3/Mycobacterium_tuberculosis_H37Rv_txt_v3.txt' , sep = '\t')

#drop unnecessary columns
mycobrowser_table = mycobrowser_table.loc[: , ['Refseq_ID' , 'Feature' , 'Start' , 'Stop', 'Score' , 'Strand' , 'Frame' , 'Locus' , 'Name' , 'Function' , 'Product' , 'Comments' , 'UniProt_AC' , 'Functional_Category']]

#organize by Start Ref Position
mycobrowser_table.sort_values(by = ['Start'] , inplace = True)

#subset to CDS regions
mycobrowser_table = mycobrowser_table[mycobrowser_table.Feature == 'CDS']

#reset index
mycobrowser_table.reset_index(inplace = True , drop = True)

In [4]:
mycobrowser_table.head(n=5)

Unnamed: 0,Refseq_ID,Feature,Start,Stop,Score,Strand,Frame,Locus,Name,Function,Product,Comments,UniProt_AC,Functional_Category
0,NC_000962.3,CDS,1,1524,.,+,0.0,Rv0001,dnaA,Plays an important role in the initiation and ...,Chromosomal replication initiator protein DnaA,"Rv0001, (MT0001, MTV029.01, P49993), len: 507 ...",P9WNW3,information pathways
1,NC_000962.3,CDS,2052,3260,.,+,0.0,Rv0002,dnaN,"DNA polymerase III is a complex, multichain en...",DNA polymerase III (beta chain) DnaN (DNA nucl...,"Rv0002, (MTV029.02, MTCY10H4.0), len: 402 aa. ...",P9WNU1,information pathways
2,NC_000962.3,CDS,3280,4437,.,+,0.0,Rv0003,recF,The RECF protein is involved in DNA metabolism...,DNA replication and repair protein RecF (singl...,"Rv0003, (MTCY10H4.01), len: 385 aa. RecF, DNA ...",P9WHI9,information pathways
3,NC_000962.3,CDS,4434,4997,.,+,0.0,Rv0004,Rv0004,Function unknown,Conserved hypothetical protein,"Rv0004, (MTCY10H4.02), len: 187 aa. Conserved ...",P9WFL1,conserved hypotheticals
4,NC_000962.3,CDS,5240,7267,.,+,0.0,Rv0005,gyrB,DNA gyrase negatively supercoils closed circul...,DNA gyrase (subunit B) GyrB (DNA topoisomerase...,"Rv0005, (MTCY10H4.03), len: 675 aa. GyrB, DNA ...",P9WG45,information pathways


In [5]:
np.shape(mycobrowser_table)

(4031, 14)

#### Subset Mycobrowser table to genes that we kept for analysis

In [6]:
#load CSV file for Gene Categories
gene_categories = pd.read_csv('/n/data1/hms/dbmi/farhat/Roger/inhost_TB_dynamics_project/CSV_files/gene_categories/gene_categories.csv').set_index('name')

#get list of all Genes to consider in Significance Testing
H37Rv_tags_to_keep = []
for gene_category in ['Essential' , 'Non-Essential' , 'Antigen' , 'Antibiotic Resistance' , 'PE/PPE']: #only genes included in analysis
    H37Rv_tags_to_keep = H37Rv_tags_to_keep + list( gene_categories[gene_categories.Gene_Category == gene_category].gene_id )

In [7]:
len(H37Rv_tags_to_keep)

3886

In [8]:
#subset to certain genes
mycobrowser_table_filter = [(H37Rv_tag in H37Rv_tags_to_keep) for H37Rv_tag in list( mycobrowser_table.Locus )]
mycobrowser_table = mycobrowser_table[mycobrowser_table_filter]

#reset index
mycobrowser_table.reset_index(inplace = True , drop = True)

In [9]:
np.shape(mycobrowser_table)

(3883, 14)

#### Create a list that holds each H37Rv tag and its genomic coordinates (+/- a fudge factor)

In [10]:
H37Rv_coords_tag_map = []

for H37Rv_region_i in mycobrowser_table.index:
    
    H37Rv_tag = mycobrowser_table.loc[H37Rv_region_i , 'Locus']
    H37Rv_coords = np.sort( list( mycobrowser_table.loc[H37Rv_region_i , ['Start' , 'Stop']] ) ) 
    start_coord = H37Rv_coords[0]
    stop_coord = H37Rv_coords[1]
    
    #structure: H37Rv tag, start coord +/- fudge factor, stop coord +/- fudge factor (20bp)
    H37Rv_coords_tag_map.append( [H37Rv_tag , range(start_coord-20 , start_coord+21) , range(stop_coord-20 , stop_coord+21)] )

In [11]:
len(H37Rv_coords_tag_map)

3883

Test for some gene

In [12]:
print H37Rv_coords_tag_map[10][0] #H37Rv tag

Rv0011c


In [13]:
print H37Rv_coords_tag_map[10][1] #20bp left/right of H37Rv start coordinate

[13694, 13695, 13696, 13697, 13698, 13699, 13700, 13701, 13702, 13703, 13704, 13705, 13706, 13707, 13708, 13709, 13710, 13711, 13712, 13713, 13714, 13715, 13716, 13717, 13718, 13719, 13720, 13721, 13722, 13723, 13724, 13725, 13726, 13727, 13728, 13729, 13730, 13731, 13732, 13733, 13734]


In [14]:
print H37Rv_coords_tag_map[10][2] #20bp left/right of H37Rv stop coordinate

[13975, 13976, 13977, 13978, 13979, 13980, 13981, 13982, 13983, 13984, 13985, 13986, 13987, 13988, 13989, 13990, 13991, 13992, 13993, 13994, 13995, 13996, 13997, 13998, 13999, 14000, 14001, 14002, 14003, 14004, 14005, 14006, 14007, 14008, 14009, 14010, 14011, 14012, 14013, 14014, 14015]


In [15]:
mycobrowser_table[mycobrowser_table.Locus == 'Rv0011c']

Unnamed: 0,Refseq_ID,Feature,Start,Stop,Score,Strand,Frame,Locus,Name,Function,Product,Comments,UniProt_AC,Functional_Category
10,NC_000962.3,CDS,13714,13995,.,-,0.0,Rv0011c,Rv0011c,Unknown,Probable conserved transmembrane protein,"Rv0011c, (MTCY10H4.11c), len: 93 aa. Probable ...",P9WP57,cell wall and cell processes


### Import subsystem classification from SEED

In [16]:
SEED_annotation = pd.read_csv('//n/data1/hms/dbmi/farhat/Roger/inhost_TB_dynamics_project/CSV_files/pathway_analysis/SEED_H37Rv_83332_1_downloaded_4_1_19.tsv' , sep = '\t')

In [17]:
SEED_annotation.head(n=5)

Unnamed: 0,Feature ID,Type,Contig,Start,Stop,Frame,Strand,Length (bp),Function,Subsystem,NCBI GI,locus
0,fig|83332.1.pbs.3,PBS,NC_000962,2396081,2396096,2,+,16,NrdR_Proteobacteria,- none -,,
1,fig|83332.1.pbs.4,PBS,NC_000962,4265613,4265628,3,+,16,NrdR_Proteobacteria,- none -,,
2,fig|83332.1.peg.1,CDS,NC_000962,1,1524,1,+,1524,Chromosomal replication initiator protein DnaA,DNA replication cluster 1,gi|15607143,
3,fig|83332.1.peg.2,CDS,NC_000962,2052,3260,3,+,1209,DNA polymerase III beta subunit (EC 2.7.7.7),DNA replication cluster 1,gi|15607144,
4,fig|83332.1.peg.3,CDS,NC_000962,3280,4437,1,+,1158,DNA recombination and repair protein RecF,"DNA repair, bacterial RecFOR pathway; <br>DNA ...",gi|15607145,


#### Find all unique subsystem classifications

In [18]:
subsystem_types = []

#subset SEED annotation to genomic regions with at least 1 subsystem
SEED_annotation_with_subsystems = SEED_annotation[SEED_annotation.Subsystem != '- none -']

for region_with_subsytem_i in SEED_annotation_with_subsystems.Subsystem.index:
    
    #subset to subsystems for genomic region (CDS, RNA, etc.)
    subsystems_for_region_i = SEED_annotation_with_subsystems.loc[region_with_subsytem_i , 'Subsystem']
    
    #break up subsystem list into different subsystems (if there is more than 1)
    subsystems_for_region_i = subsystems_for_region_i.split(';')
    
    #append to list of all subsytems detected across all genomic regions
    subsystem_types = subsystem_types + subsystems_for_region_i
    
#find all unique subsystems present in the H37Rv genome
subsystem_types = list(set(subsystem_types))

In [19]:
len(subsystem_types)

576

### Assign each subsystem to any relevant loci from Mycobrowser table

In [20]:
subsystem_gene_dict = {}

for subsystem in subsystem_types:
    
    #create emtpy list for genes identified as part of subsystem
    genes_in_subsystem = []
    
    #find all genomic regions from SEED annotation table assigned to that subsystem
    for genomic_region_i in SEED_annotation.index:
        
        #if genomic region is part of subsytem
        if subsystem in SEED_annotation.loc[genomic_region_i , 'Subsystem'].split(';'):
            
            #find coordinates and map to H37Rv tag using Mycobrowser table
            SEED_region_coords = np.sort( list( SEED_annotation.loc[genomic_region_i , ['Start' , 'Stop']] ) )
            start_coord = SEED_region_coords[0]
            stop_coord = SEED_region_coords[1]
            
            #check to see if coordinates for SEED feature region match to a region from H37Rv Mycobrowser annotation
            for H37Rv_locus_and_coords in H37Rv_coords_tag_map:
                
                #check to see if SEED region start/stop coordinates in in the same region as Mycobroswer annotation
                if (start_coord in H37Rv_locus_and_coords[1]) and (stop_coord in H37Rv_locus_and_coords[2]):
                    
                    genes_in_subsystem.append(H37Rv_locus_and_coords[0])
                
    #store list of genes identified as being involved in subsystem in dictionary
    subsystem_gene_dict[subsystem] = genes_in_subsystem

In [21]:
len(subsystem_gene_dict)

576

In [22]:
subsystem_gene_series = pd.Series( subsystem_gene_dict )

#save to subsystem - H37Rv locus tag mappting to CSV
subsystem_gene_series.to_csv('/n/data1/hms/dbmi/farhat/Roger/inhost_TB_dynamics_project/CSV_files/pathway_analysis/subsystem_H37Rv_tag_map.csv')

In [23]:
subsystem_gene_series.head()

 <br>Acetyl-CoA fermentation to Butyrate                                     [Rv0468, Rv0860]
 <br>Adenosyl nucleosidases                                                          [Rv0091]
 <br>Arginine Biosynthesis -- gjo                                                    [Rv1653]
 <br>Arginine Biosynthesis extended         [Rv1202, Rv1652, Rv1653, Rv1654, Rv1655, Rv165...
 <br>Arginine Deiminase Pathway                                              [Rv1656, Rv1657]
dtype: object

### Load in *in-host* SNPs (with $\Delta$AF $\ge$ 70%)

In [24]:
SNP_variants_within_patients = pd.read_pickle('/n/data1/hms/dbmi/farhat/Roger/inhost_TB_dynamics_project/pickled_files/variant_calling/longitudinal_SNPs/SNPs_between_isolates_delta_70.pkl')

In [25]:
SNP_variants_within_patients.head()

Unnamed: 0,population,patient_id,ref_position,ref_allele,alt_allele,gene_id,genomic_coord,gene_category,gene_symbol,alt_AF_diff,SNP_type,AA_change
0,CASALI,P251,761139,C,G,Rv0667,1333,Antibiotic Resistance,rpoB,1.0,N,H445D
1,CASALI,P251,861378,T,G,Rv0768,467,Non-Essential,aldA,1.0,N,I156S
2,BRYANT,2020E,1253207,A,G,Rv1129c,1328,Non-Essential,,1.0,N,I443T
3,BRYANT,2020E,2075138,T,A,Rv1830,298,Non-Essential,,1.0,N,S100T
4,BRYANT,2020E,2694727,A,G,Rv2397c,238,Essential,cysA1,1.0,N,F80L


In [26]:
np.shape(SNP_variants_within_patients)

(175, 12)

#### Subset to SNPs in gene regions

In [27]:
SNP_variants_within_patients = SNP_variants_within_patients[SNP_variants_within_patients.SNP_type != 'I']
SNP_variants_within_patients.reset_index(drop = True , inplace = True)

In [28]:
SNP_variants_within_patients.head()

Unnamed: 0,population,patient_id,ref_position,ref_allele,alt_allele,gene_id,genomic_coord,gene_category,gene_symbol,alt_AF_diff,SNP_type,AA_change
0,CASALI,P251,761139,C,G,Rv0667,1333,Antibiotic Resistance,rpoB,1.0,N,H445D
1,CASALI,P251,861378,T,G,Rv0768,467,Non-Essential,aldA,1.0,N,I156S
2,BRYANT,2020E,1253207,A,G,Rv1129c,1328,Non-Essential,,1.0,N,I443T
3,BRYANT,2020E,2075138,T,A,Rv1830,298,Non-Essential,,1.0,N,S100T
4,BRYANT,2020E,2694727,A,G,Rv2397c,238,Essential,cysA1,1.0,N,F80L


#### Number of SNPs for each (subsystem) pathway

In [29]:
subsystem_SNP_count = {}

for subsystem in subsystem_gene_series.index:
    
    SNPs_in_subsystem_genes = 0
    
    for gene_id in subsystem_gene_series[subsystem]:
        
        SNPs_in_subsystem_genes = SNPs_in_subsystem_genes + sum(SNP_variants_within_patients.gene_id == gene_id)
    
    subsystem_SNP_count[subsystem] = SNPs_in_subsystem_genes
    
#convert dict to series & sort
subsystem_SNP_count = pd.Series( subsystem_SNP_count )
subsystem_SNP_count.sort_values(ascending = False, inplace = True)

In [30]:
subsystem_SNP_count[subsystem_SNP_count > 1]

 <br>RNA polymerase bacterial                                                            9
Mycobacterium virulence operon involved in DNA transcription                             9
DNA gyrase subunits                                                                      4
 <br>Oxidative stress                                                                    4
 <br>DNA replication cluster 1                                                           4
 <br>DNA topoisomerases, Type II, ATP-dependent                                          4
Oxidative stress                                                                         4
 <br>Protection from Reactive Oxygen Species                                             4
 <br>Resistance to fluoroquinolones                                                      4
Biotin biosynthesis                                                                      3
Ribosome LSU bacterial                                                                   3

In [31]:
subsystem_id = 'Biotin biosynthesis'
SNP_variants_within_patients_subsystem_filter = [SNP_variants_within_patients.loc[SNP_i , 'gene_id'] in subsystem_gene_series[subsystem_id] for SNP_i in SNP_variants_within_patients.index ]
SNP_variants_within_patients[SNP_variants_within_patients_subsystem_filter]

Unnamed: 0,population,patient_id,ref_position,ref_allele,alt_allele,gene_id,genomic_coord,gene_category,gene_symbol,alt_AF_diff,SNP_type,AA_change
11,CASALI,P233,4301121,C,T,Rv3826,1310,Non-Essential,fadD23,0.98,N,P437L
69,GUERRA,KPS_79,485640,C,A,Rv0404,1664,Essential,fadD30,0.89,N,A555E
104,CETR,3451,3301183,C,A,Rv2950c,1273,Non-Essential,fadD29,0.73,N,V425F


### Number of patients with at least 1 in-host SNP in each (subsystem) pathway

In [32]:
subsystem_subjects_with_SNP_count = {}

for subsystem in subsystem_gene_series.index:
    
    subjects_with_SNP_in_subsystem_genes = []
    
    for gene_id in subsystem_gene_series[subsystem]:
        
        subjects_with_SNP_in_subsystem_genes = subjects_with_SNP_in_subsystem_genes + list( SNP_variants_within_patients[SNP_variants_within_patients.gene_id == gene_id].patient_id ) 
    
    subsystem_subjects_with_SNP_count[subsystem] = len(list(set(subjects_with_SNP_in_subsystem_genes)))
    
#convert dict to series & sort
subsystem_subjects_with_SNP_count = pd.Series( subsystem_subjects_with_SNP_count )
subsystem_subjects_with_SNP_count.sort_values(ascending = False, inplace = True)

In [33]:
subsystem_subjects_with_SNP_count[subsystem_subjects_with_SNP_count > 1]

 <br>RNA polymerase bacterial                                   7
Mycobacterium virulence operon involved in DNA transcription    7
DNA gyrase subunits                                             4
 <br>Oxidative stress                                           4
 <br>DNA replication cluster 1                                  4
 <br>DNA topoisomerases, Type II, ATP-dependent                 4
Oxidative stress                                                4
 <br>Protection from Reactive Oxygen Species                    4
 <br>Resistance to fluoroquinolones                             4
Biotin biosynthesis                                             3
Ribosome LSU bacterial                                          3
Glycerolipid and Glycerophospholipid Metabolism in Bacteria     2
ESAT-6 proteins secretion system in Actinobacteria              2
 <br>Coenzyme B12 biosynthesis                                  2
CBSS-164757.7.peg.5020                                          2
Cobalamin 

### Display all *subsystems* pathways in which multiple individuals have at least 1 SNP along with associated *in-host* SNPs

In [34]:
#subsystems with where multiple subjects have a SNP & count of individuals with at least 1 SNP
parallel_mutation_subsytems = subsystem_subjects_with_SNP_count[subsystem_subjects_with_SNP_count > 1]

#store subsystems showing evidence of parallel convergence & associated mutations
parallel_mutation_subsytems_with_SNPs_DF = pd.DataFrame(columns = list(SNP_variants_within_patients.columns)+['pathway','Num Subjects w/ SNP in pathway'])

for subsystem_id, num_individuals in zip(list(parallel_mutation_subsytems.index) , list(parallel_mutation_subsytems.values)):

    SNP_variants_within_patients_subsystem_filter = [SNP_variants_within_patients.loc[SNP_i , 'gene_id'] in subsystem_gene_series[subsystem_id] for SNP_i in SNP_variants_within_patients.index ]
    SNP_variants_within_patients_detected_in_subsystem = SNP_variants_within_patients[SNP_variants_within_patients_subsystem_filter]
    
    #store in DataFrame
    SNP_variants_within_patients_detected_in_subsystem['pathway'] = [subsystem_id]*np.shape(SNP_variants_within_patients_detected_in_subsystem)[0] #column for subsystem ID
    SNP_variants_within_patients_detected_in_subsystem['Num Subjects w/ SNP in pathway'] = [num_individuals]*np.shape(SNP_variants_within_patients_detected_in_subsystem)[0] #column for num subjects with SNP in pathway
    parallel_mutation_subsytems_with_SNPs_DF = parallel_mutation_subsytems_with_SNPs_DF.append(SNP_variants_within_patients_detected_in_subsystem) #SNPs detected within pathway
    
#re-order columns
column_order = ['pathway', 'Num Subjects w/ SNP in pathway','population', 'patient_id', 'ref_position', 'genomic_coord', 'ref_allele', 'alt_allele', 'gene_id', 'gene_category', 'gene_symbol', 'alt_AF_diff', 'SNP_type', 'AA_change']
parallel_mutation_subsytems_with_SNPs_DF = parallel_mutation_subsytems_with_SNPs_DF.loc[: , column_order]

#output as CSV
parallel_mutation_subsytems_with_SNPs_DF.to_csv('/n/data1/hms/dbmi/farhat/Roger/inhost_TB_dynamics_project/CSV_files/pathway_analysis/subsystems_with_parallel_SNPs.csv')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [35]:
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(parallel_mutation_subsytems_with_SNPs_DF)

Unnamed: 0,pathway,Num Subjects w/ SNP in pathway,population,patient_id,ref_position,genomic_coord,ref_allele,alt_allele,gene_id,gene_category,gene_symbol,alt_AF_diff,SNP_type,AA_change
0,<br>RNA polymerase bacterial,7,CASALI,P251,761139,1333,C,G,Rv0667,Antibiotic Resistance,rpoB,1.0,N,H445D
40,<br>RNA polymerase bacterial,7,CASALI,P052,760314,508,G,T,Rv0667,Antibiotic Resistance,rpoB,0.98,N,V170F
41,<br>RNA polymerase bacterial,7,CASALI,P052,764819,1450,T,G,Rv0668,Antibiotic Resistance,rpoC,0.96,N,W484G
48,<br>RNA polymerase bacterial,7,WALKER,P000059,760314,508,G,T,Rv0667,Antibiotic Resistance,rpoB,0.85,N,V170F
49,<br>RNA polymerase bacterial,7,WALKER,P000059,765617,2248,G,C,Rv0668,Antibiotic Resistance,rpoC,0.98,N,E750Q
56,<br>RNA polymerase bacterial,7,WALKER,P000227,761277,1471,A,T,Rv0667,Antibiotic Resistance,rpoB,0.99,N,I491F
105,<br>RNA polymerase bacterial,7,CETR,2688,767123,3754,G,T,Rv0668,Antibiotic Resistance,rpoC,1.0,N,V1252L
122,<br>RNA polymerase bacterial,7,CETR,2511,761140,1334,A,C,Rv0667,Antibiotic Resistance,rpoB,1.0,N,H445P
129,<br>RNA polymerase bacterial,7,GUERRA,KPS_82,761139,1333,C,G,Rv0667,Antibiotic Resistance,rpoB,0.99,N,H445D
0,Mycobacterium virulence operon involved in DNA...,7,CASALI,P251,761139,1333,C,G,Rv0667,Antibiotic Resistance,rpoB,1.0,N,H445D
