# This notebook was created to 1. check for mutations in *atpE* in *mmpR* / *mmpL5* double mutants, 2. check for mutations in *rrs* in *eis* promoter / *eis* double mutants and 3. check for co-occurrence of the 4 most common *eis* promoter mutations with 3 *rrs* aminoglycoside resistance conferring mutations

In [1]:
# Stretch the notebook container to the full browser width so wide DataFrame
# outputs are not clipped.
# `display` and `HTML` are imported from the public `IPython.display` module;
# importing them from `IPython.core.display` is deprecated in newer IPython.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [3]:
import vcf

%matplotlib inline
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import matplotlib.ticker as ticker
from itertools import compress
from pylab import MaxNLocator
import seaborn as sns; sns.set()
from matplotlib.colors import LogNorm
from matplotlib import gridspec
import ast
import itertools
import seaborn as sns
from sklearn.preprocessing import StandardScaler

import fastcluster
from sklearn import cluster, datasets
import scipy.cluster.hierarchy as hier
from sklearn.cluster import KMeans
import time
import sys
import pickle

import Bio
from Bio.Alphabet import IUPAC
from Bio.Blast.Applications import NcbiblastnCommandline
from Bio.Blast import NCBIXML
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.SeqFeature import SeqFeature, FeatureLocation
from Bio import pairwise2
from Bio import SeqIO
from Bio.Graphics import GenomeDiagram
from Bio.SeqUtils import GC
from Bio import Phylo

from Bio.Align.Applications import MuscleCommandline
from StringIO import StringIO
from Bio import AlignIO
from Bio.Align import AlignInfo
from Bio.Seq import MutableSeq
import itertools
import gzip

import networkx as nx
import scipy
from collections import Counter

### *Function* to convert lineage-calls to new/simpler lineage call scheme

In [4]:
def convert_lineage_calls(isolate_annotation_DF, lineage_map_csv_path = '/n/data1/hms/dbmi/farhat/Roger/mmpR_BDQ_mutant_project/CSV files/lineage_hierachical_to_lineage_map.csv'):
    """
    Replace hierarchical lineage calls with their shorter lineage names.

    Parameters
    ----------
    isolate_annotation_DF : pandas.DataFrame
        Isolate annotation table with a 'lineage_call' column. The column is
        overwritten in place.
    lineage_map_csv_path : str, optional
        CSV file with the columns 'lineage_hierarchical' (call to look up) and
        'lineage' (replacement call). Defaults to the project-wide mapping file.

    Returns
    -------
    pandas.DataFrame
        The same DataFrame object that was passed in, with 'lineage_call' updated.
        Calls that are absent from the mapping file are left unchanged.
    """
    lineage_map_df = pd.read_csv(lineage_map_csv_path)

    # a plain dict gives constant-time lookups and a natural "keep the old call" default
    hierarchical_to_short_call = dict(zip(lineage_map_df.lineage_hierarchical, lineage_map_df.loc[:, 'lineage']))

    lineage_call_update_list = [hierarchical_to_short_call.get(lineage_call, lineage_call) for lineage_call in isolate_annotation_DF.lineage_call]

    # replace the hierarchical calls with the shorter ones
    isolate_annotation_DF.loc[:,'lineage_call'] = lineage_call_update_list

    return isolate_annotation_DF

# [1] INDELs

## [1.1] Load INDEL genotype matrix and Annotation Files

### 31,428 isolates

In [5]:
# Isolate annotation (one row per column of the genotypes matrix):
# keep only the isolate ID and lineage call, then simplify the lineage calls
INDEL_isolate_annotation_DF = convert_lineage_calls(
    pd.read_pickle('/n/data1/hms/dbmi/farhat/Roger/homoplasy_project/rolling_DB_scrape_indels/Genotypes_Filtered_2/genotypes_isolate_annotation.pkl').loc[:, ['isolate_ID','lineage_call']]
)

# INDEL annotation with gene information (one row per row of the genotypes matrix);
# the pickled index is moved into a regular column so the frame gets a 0..n-1 index
INDEL_annotation_DF = pd.read_pickle('/n/data1/hms/dbmi/farhat/Roger/homoplasy_project/rolling_DB_scrape_indels/Genotypes_Filtered_2/genotypes_INDEL_functional_annotation.pkl').reset_index(drop = False)

# Genotypes matrix (rows = INDELs, columns = isolates)
INDEL_genotypes_array = np.load('/n/data1/hms/dbmi/farhat/Roger/homoplasy_project/rolling_DB_scrape_indels/Genotypes_Filtered_2/genotypes_matrix.npy')

In [6]:
INDEL_isolate_annotation_DF.head()

Unnamed: 0,isolate_ID,lineage_call
0,SAMN13051687,2.2.1.1.1.i3
1,SAMN09100245,4.3.i4.1
2,SAMN08732238,2.2.1.1.1
3,SAMN07658260,3.1.1
4,SAMN03648003,2.2.1.1.1


In [7]:
np.shape(INDEL_isolate_annotation_DF)

(31428, 2)

In [8]:
INDEL_annotation_DF.head()

Unnamed: 0,key,pos,ref,alt,gene_category,gene_name,gene_id,gene_pos,ins_del,INDEL_type,codon_pos
0,ACCGACGAAG_313_A,313,ACCGACGAAG,A,Essential,dnaA,Rv0001,313.0,del,inframe,105.0
1,TC_1549_T,1549,TC,T,,,Rv0001_Rv0002,,del,frameshift,
2,T_1552_TAA,1552,T,TAA,,,Rv0001_Rv0002,,ins,frameshift,
3,TAA_1552_T,1552,TAA,T,,,Rv0001_Rv0002,,del,frameshift,
4,T_1552_TA,1552,T,TA,,,Rv0001_Rv0002,,ins,frameshift,


In [9]:
np.shape(INDEL_annotation_DF)

(50260, 11)

In [10]:
INDEL_genotypes_array

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int8)

In [11]:
np.shape(INDEL_genotypes_array)

(50260, 31428)

### additional 12 *eis* C-14T mutants with AG MICs

In [12]:
# Isolate annotation for the extra strains (columns of their genotypes matrix):
# keep only the isolate ID and lineage call, then simplify the lineage calls
# NOTE(review): this annotation is read from the 'Genotypes/' directory while the
# matrix below is read from 'Genotypes_indels/' -- confirm that both list the
# isolates in the same column order.
isolate_annotation_DF_extra_strains = convert_lineage_calls(
    pd.read_pickle('/n/data1/hms/dbmi/farhat/Roger/mmpR_BDQ_mutant_project/eis_promoter_mutant_strains_from_Milan/Genotypes/Genotypes_Filtered/genotypes_isolate_annotation.pkl').loc[:, ['isolate_ID','lineage_call']]
)

# INDEL genotypes matrix for the extra strains
genotypes_array_extra_strains = np.load('/n/data1/hms/dbmi/farhat/Roger/mmpR_BDQ_mutant_project/eis_promoter_mutant_strains_from_Milan/Genotypes_indels/Genotypes_Filtered/genotypes_matrix.npy')

In [13]:
isolate_annotation_DF_extra_strains.head()

Unnamed: 0,isolate_ID,lineage_call
0,168-19,2.2.1.1.1
1,622-19,2.2.1.1.1.i3
2,655-19,4.3.i3.1
3,IT1070,2.2.1.1.1.i3
4,IT123,2.2.1.1.1


In [14]:
genotypes_array_extra_strains

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 9, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int8)

In [15]:
np.shape(genotypes_array_extra_strains)

(50260, 12)

### Merge *isolate annotation files* and *genotypes matrices* together

In [16]:
# merge isolate annotation dataframes (extra strains go after the existing isolates)
# pd.concat replaces DataFrame.append, which no longer exists in pandas >= 2.0;
# ignore_index = True renumbers the rows 0..n-1 so they keep matching the matrix columns
INDEL_isolate_annotation_DF = pd.concat([INDEL_isolate_annotation_DF, isolate_annotation_DF_extra_strains], ignore_index = True)

In [17]:
np.shape(INDEL_isolate_annotation_DF)

(31440, 2)

In [18]:
# append the extra-strain columns to the right of the main INDEL genotypes matrix
INDEL_genotypes_array = np.concatenate([INDEL_genotypes_array, genotypes_array_extra_strains], axis = 1)

In [19]:
np.shape(INDEL_genotypes_array)

(50260, 31440)

### *Function* to retrieve isolates with INDELs in a specific genomic locus

In [20]:
def get_strains_with_indels_in_gene(gene_intergenic_id, strains_of_interest, isolate_annotation = None, indel_annotation = None, genotypes_array = None):
    """
    List every (INDEL, isolate) pair in which an isolate of interest carries an
    INDEL located in the given gene / intergenic region.

    Parameters
    ----------
    gene_intergenic_id : str
        Value matched against the 'gene_id' column of the INDEL annotation.
    strains_of_interest : iterable of str
        Isolate IDs to look at; IDs missing from the isolate annotation are ignored.
    isolate_annotation : pandas.DataFrame, optional
        One row per genotypes-matrix column, with at least 'isolate_ID' and
        'lineage_call'. Defaults to the notebook-level INDEL_isolate_annotation_DF.
    indel_annotation : pandas.DataFrame, optional
        One row per genotypes-matrix row, with the INDEL annotation columns.
        Defaults to the notebook-level INDEL_annotation_DF.
    genotypes_array : numpy.ndarray, optional
        Matrix of shape (INDELs, isolates) in which 1 marks presence of the INDEL.
        Defaults to the notebook-level INDEL_genotypes_array.

    Returns
    -------
    pandas.DataFrame
        One row per carrier, ordered by INDEL (matrix row order) and then by isolate
        (matrix column order). Columns are the INDEL annotation columns followed by
        'isolate_ID', 'lineage_call' and 'group'; columns missing from the inputs
        (e.g. 'group') are filled with NaN.
    """
    # fall back to the notebook-level objects when no explicit inputs are supplied
    if isolate_annotation is None:
        isolate_annotation = INDEL_isolate_annotation_DF
    if indel_annotation is None:
        indel_annotation = INDEL_annotation_DF
    if genotypes_array is None:
        genotypes_array = INDEL_genotypes_array

    variant_columns = ['key','pos','ref','alt','gene_category','gene_name','gene_id','gene_pos','ins_del','INDEL_type','codon_pos']
    isolate_columns = ['isolate_ID','lineage_call','group']

    # boolean filters for the columns (strains) and rows (INDELs) to keep
    strains_to_keep = isolate_annotation.isolate_ID.isin(list(strains_of_interest)).values
    indels_to_keep = (indel_annotation.gene_id == gene_intergenic_id).values

    # filter the rows first: a single locus has few INDELs, so the intermediate copy stays small
    genotypes_filtered = genotypes_array[indels_to_keep, :][:, strains_to_keep]

    # re-index so that positions in the filtered frames match the filtered matrix;
    # reindex (rather than .loc with a column list) tolerates absent columns such as 'group'
    variants_filtered = indel_annotation.loc[indels_to_keep].reindex(columns = variant_columns).reset_index(drop = True)
    isolates_filtered = isolate_annotation.loc[strains_to_keep].reindex(columns = isolate_columns).reset_index(drop = True)

    # np.nonzero walks the matrix row by row, i.e. INDEL by INDEL and strain by strain within an INDEL
    variant_positions, strain_positions = np.nonzero(genotypes_filtered == 1)

    isolate_with_variant_df = pd.concat(
        [variants_filtered.iloc[variant_positions].reset_index(drop = True),
         isolates_filtered.iloc[strain_positions].reset_index(drop = True)],
        axis = 1)

    return isolate_with_variant_df

In [21]:
get_strains_with_indels_in_gene('Rv0676c', ['99R893','Peru4670','SAMEA5569914','Peru5003','Peru3367'])

Unnamed: 0,key,pos,ref,alt,gene_category,gene_name,gene_id,gene_pos,ins_del,INDEL_type,codon_pos,isolate_ID,lineage_call,group
0,AC_777875_A,777875,AC,A,Non-Essential,mmpL5,Rv0676c,606,del,frameshift,202,99R893,4.11,
1,AC_777875_A,777875,AC,A,Non-Essential,mmpL5,Rv0676c,606,del,frameshift,202,Peru4670,4.11,
2,AC_777875_A,777875,AC,A,Non-Essential,mmpL5,Rv0676c,606,del,frameshift,202,SAMEA5569914,4.11,
3,AC_777875_A,777875,AC,A,Non-Essential,mmpL5,Rv0676c,606,del,frameshift,202,Peru5003,4.11,
4,AC_777875_A,777875,AC,A,Non-Essential,mmpL5,Rv0676c,606,del,frameshift,202,Peru3367,4.11,


# [2] SNPs

### 31,428 isolates

## [2.1] Load SNP genotype matrix and Annotation Files

In [22]:
# Isolate annotation (one row per column of the genotypes matrix):
# keep only the isolate ID and lineage call, then simplify the lineage calls
SNP_isolate_annotation_DF = convert_lineage_calls(
    pd.read_pickle('/n/data1/hms/dbmi/farhat/Roger/homoplasy_project/rolling_DB_scrape/Genotypes_Filtered_2/genotypes_isolate_annotation.pkl').loc[:, ['isolate_ID','lineage_call']]
)

# SNP site annotation (one row per row of the genotypes matrix)
SNP_annotation_DF = pd.read_pickle('/n/data1/hms/dbmi/farhat/Roger/homoplasy_project/rolling_DB_scrape/Genotypes_Filtered_2/genotypes_SNP_annotation.pkl')

# Functional annotation; its index is copied into a 'mut_key' column so it can be used as a merge key
SNP_func_annotation_DF = pd.read_pickle('/n/data1/hms/dbmi/farhat/Roger/homoplasy_project/rolling_DB_scrape/Genotypes_Filtered_2/genotypes_SNP_functional_annotation.pkl')
SNP_func_annotation_DF['mut_key'] = SNP_func_annotation_DF.index

# Genotypes matrix (rows = SNP sites, columns = isolates)
SNP_genotypes_array = np.load('/n/data1/hms/dbmi/farhat/Roger/homoplasy_project/rolling_DB_scrape/Genotypes_Filtered_2/genotypes_matrix.npy')

In [23]:
SNP_isolate_annotation_DF.head()

Unnamed: 0,isolate_ID,lineage_call
0,SAMEA3558733,4.3.i4.2
1,SAMN03648641,4.4.1.1
2,SAMN03647419,3.1.1.i1
3,SAMEA3671418,4.3.i2
4,SAMN07659096,1.1.3


In [24]:
np.shape(SNP_isolate_annotation_DF)

(31428, 2)

In [25]:
SNP_annotation_DF.head()

Unnamed: 0,pos,ref,alt,gene_id,gene_pos,gene_category,gene_name
0,48,C,[T],Rv0001,47.0,Essential,dnaA
1,64,G,[C],Rv0001,63.0,Essential,dnaA
2,67,G,"[A, T]",Rv0001,66.0,Essential,dnaA
3,69,C,[T],Rv0001,68.0,Essential,dnaA
4,71,C,[T],Rv0001,70.0,Essential,dnaA


In [26]:
np.shape(SNP_annotation_DF)

(782565, 7)

In [27]:
SNP_func_annotation_DF.head()

Unnamed: 0,pos,ref,alt,gene_category,gene_name,gene_id,gene_pos,SNP_type,AA_change,mut_key
48_T,48,C,T,Essential,dnaA,Rv0001,48,S,V16V,48_T
64_C,64,G,C,Essential,dnaA,Rv0001,64,N,G22R,64_C
67_A,67,G,A,Essential,dnaA,Rv0001,67,N,D23N,67_A
67_T,67,G,T,Essential,dnaA,Rv0001,67,N,D23Y,67_T
69_T,69,C,T,Essential,dnaA,Rv0001,69,S,D23D,69_T


In [28]:
np.shape(SNP_func_annotation_DF)

(844429, 10)

In [29]:
SNP_genotypes_array

array([[1, 1, 1, ..., 1, 1, 1],
       [2, 2, 2, ..., 2, 2, 2],
       [2, 2, 2, ..., 2, 2, 2],
       ...,
       [1, 9, 9, ..., 1, 1, 1],
       [1, 9, 1, ..., 1, 1, 1],
       [1, 9, 1, ..., 1, 1, 1]], dtype=int8)

In [30]:
np.shape(SNP_genotypes_array)

(782565, 31428)

### additional 12 *eis* C-14T mutants with AG MICs

In [31]:
# Isolate annotation for the extra strains (columns of their genotypes matrix):
# keep only the isolate ID and lineage call, then simplify the lineage calls
# (this re-binds the names that the INDEL section used for the same strains)
isolate_annotation_DF_extra_strains = convert_lineage_calls(
    pd.read_pickle('/n/data1/hms/dbmi/farhat/Roger/mmpR_BDQ_mutant_project/eis_promoter_mutant_strains_from_Milan/Genotypes/Genotypes_Filtered/genotypes_isolate_annotation.pkl').loc[:, ['isolate_ID','lineage_call']]
)

# SNP genotypes matrix for the extra strains
genotypes_array_extra_strains = np.load('/n/data1/hms/dbmi/farhat/Roger/mmpR_BDQ_mutant_project/eis_promoter_mutant_strains_from_Milan/Genotypes/Genotypes_Filtered/genotypes_matrix.npy')

In [32]:
isolate_annotation_DF_extra_strains.head()

Unnamed: 0,isolate_ID,lineage_call
0,168-19,2.2.1.1.1
1,622-19,2.2.1.1.1.i3
2,655-19,4.3.i3.1
3,IT1070,2.2.1.1.1.i3
4,IT123,2.2.1.1.1


In [33]:
genotypes_array_extra_strains

array([[1, 1, 1, ..., 1, 1, 1],
       [2, 2, 2, ..., 2, 2, 2],
       [2, 2, 2, ..., 2, 2, 2],
       ...,
       [1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 1, 1, 1]], dtype=int8)

In [34]:
np.shape(genotypes_array_extra_strains)

(782565, 12)

### Merge *isolate annotation files* and *genotypes matrices* together

In [35]:
# merge isolate annotation dataframes (extra strains go after the existing isolates)
# pd.concat replaces DataFrame.append, which no longer exists in pandas >= 2.0;
# ignore_index = True renumbers the rows 0..n-1 so they keep matching the matrix columns
SNP_isolate_annotation_DF = pd.concat([SNP_isolate_annotation_DF, isolate_annotation_DF_extra_strains], ignore_index = True)

In [36]:
np.shape(SNP_isolate_annotation_DF)

(31440, 2)

In [37]:
# append the extra-strain columns to the right of the main SNP genotypes matrix
SNP_genotypes_array = np.concatenate([SNP_genotypes_array, genotypes_array_extra_strains], axis = 1)

In [38]:
np.shape(SNP_genotypes_array)

(782565, 31440)

# [3] Look for resistance mutations for specific sets of strains

### *Function* to retrieve isolates with SNVs in a specific genomic locus

In [39]:
def get_strains_with_SNPs_in_gene(gene_intergenic_id, strains_of_interest, isolate_annotation = None, snp_annotation = None, snp_func_annotation = None, genotypes_array = None):
    """
    List every (SNP allele, isolate) pair in which an isolate of interest carries an
    alternate allele at a SNP site located in the given gene / intergenic region.

    Parameters
    ----------
    gene_intergenic_id : str
        Value matched against the 'gene_id' column of the functional annotation.
    strains_of_interest : iterable of str
        Isolate IDs to look at; IDs missing from the isolate annotation are ignored.
    isolate_annotation : pandas.DataFrame, optional
        One row per genotypes-matrix column, with 'isolate_ID' and 'lineage_call'.
        Defaults to the notebook-level SNP_isolate_annotation_DF.
    snp_annotation : pandas.DataFrame, optional
        One row per genotypes-matrix row, with 'pos', 'ref' and 'alt' (a list of
        alternate bases). Defaults to the notebook-level SNP_annotation_DF.
    snp_func_annotation : pandas.DataFrame, optional
        One row per alternate allele, with 'pos', 'gene_id', 'mut_key' and the
        functional columns merged into the result. Defaults to the notebook-level
        SNP_func_annotation_DF.
    genotypes_array : numpy.ndarray, optional
        Matrix of shape (SNP sites, isolates) holding base codes
        (0 = A, 1 = C, 2 = G, 3 = T; any other code is never reported as a variant).
        Defaults to the notebook-level SNP_genotypes_array.

    Returns
    -------
    pandas.DataFrame
        One row per carrier, ordered by SNP site and then by isolate, with the columns
        'mut_key', 'pos', 'ref', 'alt', 'isolate_ID', 'lineage_call' followed by the
        functional annotation columns (left-merged on 'mut_key').
    """
    # fall back to the notebook-level objects when no explicit inputs are supplied
    if isolate_annotation is None:
        isolate_annotation = SNP_isolate_annotation_DF
    if snp_annotation is None:
        snp_annotation = SNP_annotation_DF
    if snp_func_annotation is None:
        snp_func_annotation = SNP_func_annotation_DF
    if genotypes_array is None:
        genotypes_array = SNP_genotypes_array

    # number code that corresponds to each base in the genotypes matrix
    base_code_dict = {'A':0, 'C':1, 'G':2, 'T':3}

    # filter STRAINS (columns of the matrix)
    strains_to_keep = isolate_annotation.isolate_ID.isin(list(strains_of_interest)).values
    isolates_filtered = isolate_annotation.loc[strains_to_keep].reset_index(drop = True)
    isolate_IDs = list(isolates_filtered.isolate_ID)
    lineage_calls = list(isolates_filtered.lineage_call)

    # filter VARIANTS (rows of the matrix): keep SNP sites that fall within this gene / intergenic region
    positions_in_targeted_region = snp_func_annotation.loc[(snp_func_annotation.gene_id == gene_intergenic_id).values, 'pos'].unique()
    SNPs_to_keep = snp_annotation.pos.isin(positions_in_targeted_region).values
    sites_filtered = snp_annotation.loc[SNPs_to_keep].reset_index(drop = True)

    # filter the rows first: a single locus has few sites, so the intermediate copy stays small
    genotypes_filtered = genotypes_array[SNPs_to_keep, :][:, strains_to_keep]

    # collect one record per (site, isolate) pair in which the isolate carries an alternate allele
    records = []
    for site_i in range(len(sites_filtered)):

        pos_i = sites_filtered.at[site_i, 'pos']
        ref_i = sites_filtered.at[site_i, 'ref']

        # map the genotype codes of this site's alternate alleles back to their bases
        allele_for_code = dict((base_code_dict[alt_allele_i], alt_allele_i) for alt_allele_i in sites_filtered.at[site_i, 'alt'])

        site_genotypes = genotypes_filtered[site_i, :]
        carrier_positions = np.nonzero(np.isin(site_genotypes, list(allele_for_code.keys())))[0]

        for strain_j in carrier_positions:
            alt_allele_j = allele_for_code[int(site_genotypes[strain_j])]
            records.append([str(pos_i) + '_' + alt_allele_j, pos_i, ref_i, alt_allele_j, isolate_IDs[strain_j], lineage_calls[strain_j]])

    isolate_with_variant_df = pd.DataFrame(records, columns = ['mut_key','pos','ref','alt','isolate_ID','lineage_call'])

    # get functional annotation for df
    isolate_with_variant_df = isolate_with_variant_df.merge(snp_func_annotation.loc[:,['gene_category','gene_name','gene_id','gene_pos','SNP_type','AA_change','mut_key']], how='left', on='mut_key')

    return isolate_with_variant_df

### *Function* to retrieve genotypes for a SNP site given a set of strains, gene and gene coordinate for SNP

In [40]:
def get_genotypes_for_SNP(strains_of_interest, gene_id, gene_pos_SNP_i, isolate_annotation = None, snp_annotation = None, snp_func_annotation = None, genotypes_array = None):
    """
    Return the called base at one SNP site for a set of isolates.

    Parameters
    ----------
    strains_of_interest : iterable of str
        Isolate IDs to report; IDs missing from the isolate annotation are ignored.
    gene_id : str
        Value matched against the 'gene_id' column of the functional annotation.
    gene_pos_SNP_i : int
        Value matched against the 'gene_pos' column of the functional annotation.
    isolate_annotation : pandas.DataFrame, optional
        One row per genotypes-matrix column, with 'isolate_ID'.
        Defaults to the notebook-level SNP_isolate_annotation_DF.
    snp_annotation : pandas.DataFrame, optional
        One row per genotypes-matrix row, with 'pos' and 'alt'.
        Defaults to the notebook-level SNP_annotation_DF.
    snp_func_annotation : pandas.DataFrame, optional
        Functional annotation with 'pos', 'ref', 'gene_id' and 'gene_pos'.
        Defaults to the notebook-level SNP_func_annotation_DF.
    genotypes_array : numpy.ndarray, optional
        Matrix of shape (SNP sites, isolates) holding base codes.
        Defaults to the notebook-level SNP_genotypes_array.

    Returns
    -------
    list
        [pos, ref, gene_pos, alt] followed by one base per retained isolate
        ('A', 'C', 'G', 'T', or 'N' for code 9). The bases follow the row order of the
        isolate annotation, NOT the order of `strains_of_interest`.

    Raises
    ------
    IndexError
        If no SNP site is annotated at the requested gene position, or the site is
        absent from the SNP annotation.
    """
    # fall back to the notebook-level objects when no explicit inputs are supplied
    if isolate_annotation is None:
        isolate_annotation = SNP_isolate_annotation_DF
    if snp_annotation is None:
        snp_annotation = SNP_annotation_DF
    if snp_func_annotation is None:
        snp_func_annotation = SNP_func_annotation_DF
    if genotypes_array is None:
        genotypes_array = SNP_genotypes_array

    # base that corresponds to each number code in the genotypes matrix, 9 indicates a bad quality call
    base_code_dict_r = {0:'A', 1:'C', 2:'G', 3:'T', 9:'N'}

    # filter STRAINS
    strains_to_keep = isolate_annotation.isolate_ID.isin(list(strains_of_interest)).values

    # look up the reference position of the SNP site
    matching_sites = snp_func_annotation[(snp_func_annotation.gene_id == gene_id) & (snp_func_annotation.gene_pos == gene_pos_SNP_i)]
    if len(matching_sites) == 0:
        raise IndexError('no SNP annotated at gene position ' + str(gene_pos_SNP_i) + ' of ' + str(gene_id))
    # .iloc is required here: the functional annotation is indexed by mutation key, not by integers
    H37Rv_ref_pos_for_SNP = matching_sites.pos.iloc[0]

    SNP_to_keep_filter = (snp_annotation.pos == H37Rv_ref_pos_for_SNP).values
    if not SNP_to_keep_filter.any():
        raise IndexError('reference position ' + str(H37Rv_ref_pos_for_SNP) + ' is missing from the SNP annotation')

    # pick the SNP row before the strain columns so that only one matrix row is copied
    SNP_genotypes_vector = genotypes_array[SNP_to_keep_filter, :][0, strains_to_keep]
    SNP_genotypes_vector = [base_code_dict_r[int(allele_i)] for allele_i in SNP_genotypes_vector]

    # func annotation for SNP
    func_annot_for_SNP_list = list(snp_func_annotation[snp_func_annotation.pos == H37Rv_ref_pos_for_SNP].loc[: , ['pos','ref','gene_pos']].values[0]) + list(snp_annotation[snp_annotation.pos == H37Rv_ref_pos_for_SNP].alt)

    # append the genotypes for the strains to the func annotation for the SNP
    return func_annot_for_SNP_list + SNP_genotypes_vector

# [4] Side-Analysis
mutation: *mmpL5* Y300* 4.10.i1(1) - 1.1.1.1(292); check whether the single typing result for the L4 isolate is accurate? In Merker et al. 2020, it was found to be a marker for 1.1.1.1, which is in line with the remaining typing results.


Some probing for this - there were 556 isolates typed as 1.1.1.1 in our dataset, 292/556 had this mutation. Furthermore, there were 2227 isolates typed as 4.10.i1 in our dataset, 1/2227 had this mutation and it was not one of the 14/31428 that were mis-typed according to our SNP barcoding scheme, so this isolate is certainly lineage 4.

### 1.1.1.1

Get the number of isolates that are typed as sub-lineage **1.1.1.1**

In [41]:
len(SNP_isolate_annotation_DF[SNP_isolate_annotation_DF.lineage_call == '1.1.1.1'])

556

Get the genotypes at H37Rv position **777,581** for the **1.1.1.1** isolates

In [42]:
pos_777581_1_1_1_1_genotypes = get_genotypes_for_SNP(list(SNP_isolate_annotation_DF[SNP_isolate_annotation_DF.lineage_call == '1.1.1.1'].isolate_ID), 'Rv0676c', 900)

In [43]:
len(pos_777581_1_1_1_1_genotypes[4:])

556

How many of these **1.1.1.1** isolates have the *mmpL5* Y300* allele?

In [44]:
sum(np.array(pos_777581_1_1_1_1_genotypes[4:]) == 'T')

292

### 4.10.i1

Get the number of isolates that are typed as sub-lineage **4.10.i1**

In [45]:
len(SNP_isolate_annotation_DF[SNP_isolate_annotation_DF.lineage_call == '4.10.i1'])

2227

Get the genotypes at H37Rv position **777,581** for the **4.10.i1** isolates

In [46]:
pos_777581_4_10_i1_genotypes = get_genotypes_for_SNP(list(SNP_isolate_annotation_DF[SNP_isolate_annotation_DF.lineage_call == '4.10.i1'].isolate_ID), 'Rv0676c', 900)

In [47]:
len(pos_777581_4_10_i1_genotypes[4:])

2227

How many of these **4.10.i1** isolates have the *mmpL5* Y300* allele?

In [48]:
sum(np.array(pos_777581_4_10_i1_genotypes[4:]) == 'T')

1

What was the breakdown of **alleles** for the 2,227 **4.10.i1** isolates and what was the isolate ID of the single **4.10.i1** isolate that had the *mmpL5* Y300* allele?

In [49]:
strains_of_interest = list(SNP_isolate_annotation_DF[SNP_isolate_annotation_DF.lineage_call == '4.10.i1'].isolate_ID)
gene_id = 'Rv0676c'
gene_pos_SNP_i = 900

# number codes used in the genotypes matrix and their bases, 9 indicates a bad quality call
base_code_dict = {'A':0, 'C':1, 'G':2, 'T':3}
base_code_dict_r = {0:'A', 1:'C', 2:'G', 3:'T', 9:'N'}

# filter STRAINS
strains_to_keep = SNP_isolate_annotation_DF.isolate_ID.isin(strains_of_interest).values # boolean filter for strains to keep
# the original row labels are kept on purpose so that isolates can be traced back to the full annotation table
SNP_isolate_annotation_DF_filtered = SNP_isolate_annotation_DF[strains_to_keep]

# filter Genotypes Matrix
SNP_genotypes_array_filtered = SNP_genotypes_array[: , strains_to_keep]

# filter for SNP position for VARIANT
# .iloc is required here: the functional annotation is indexed by mutation key, not by integers
H37Rv_ref_pos_for_SNP = SNP_func_annotation_DF[(SNP_func_annotation_DF.gene_id == gene_id) & (SNP_func_annotation_DF.gene_pos == gene_pos_SNP_i)].pos.iloc[0]
SNP_to_keep_filter = SNP_annotation_DF.pos == H37Rv_ref_pos_for_SNP # construct boolean filter

# filter Genotypes Matrix and translate the number codes of the (single) SNP row into bases
SNP_genotypes_vector = SNP_genotypes_array_filtered[SNP_to_keep_filter , :]
SNP_genotypes_vector = [base_code_dict_r[allele_i] for allele_i in SNP_genotypes_vector[0]]

In [50]:
SNP_isolate_annotation_DF_filtered[np.array(SNP_genotypes_vector) == 'T']

Unnamed: 0,isolate_ID,lineage_call
4304,SAMEA1118012,4.10.i1


In [51]:
np.shape(SNP_isolate_annotation_DF_filtered[np.array(SNP_genotypes_vector) == 'T'])

(1, 2)

In [52]:
np.shape((SNP_isolate_annotation_DF_filtered[np.array(SNP_genotypes_vector) == 'G']))

(2200, 2)

In [53]:
np.shape((SNP_isolate_annotation_DF_filtered[np.array(SNP_genotypes_vector) == 'N']))

(26, 2)

# [5] Check *atpE* for mmpR / mmpL5 double mutant isolates

## [5.1] Check *atpE* for mmpR / mmpL5 double mutant isolates that were part of the Peruvian cluster (n=82)

Get list of isolates that had both the *mmpR* insertion **and** the *mmpL5* deletion (from other notebook)

In [54]:
mmpR_mmpL5_double_mutant_isolate_IDs = ['SAMEA5569929', 'SAMEA3402909', 'SAMEA5569553', 'Peru2918', 'Peru4515', 'SAMEA3392629', 'SAMEA5569985', 'SAMEA5569889', 'Peru4671', 'Peru4670', '01R1305', 'Peru2959', '02R0099', 'Peru4683', 'Peru4652', 'Peru4722', 'Peru4961', 'Peru4900', 'SAMEA1101329', 'Peru3418', 'SAMEA3414462', 'Peru2965', 'Peru3056', 'SAMEA2682981', 'Peru5003', '00R1547', 'SAMEA5570040', '02R0417', 'SAMEA5569638', '02R0948', 'Peru3376', 'Peru4521', 'Peru5012', '99R893', 'SAMEA5569636', 'SAMEA2683080', 'SAMEA3558289', 'Peru4988', 'SAMEA3401008', 'Peru4668', 'SAMEA2682695', 'Peru4685', 'SAMEA5569938', '01R0272', '1791', 'Peru4565', 'Peru5036', 'Peru4577', 'Peru4932', 'Peru4646', 'Peru4647', 'Peru4936', 'Peru4937', '02R1942', 'Peru3367', 'Peru4582', 'SAMEA5569790', 'Peru4549', 'Peru4057', 'SAMEA3558270', 'SAMEA2683134', '01R0451', 'SAMEA2682679', 'Peru4492', 'Peru3407', 'Peru5076', 'SAMN02414923', 'SAMEA5569666', 'Peru3342', 'Peru3389', 'Peru3408', 'Peru5445', 'Peru3405', 'Peru5115', 'Peru5114', 'Peru4707', 'SAMEA5569914', 'SAMEA5569642', 'SAMEA5570058', 'Peru3031', 'SAMEA3558288', 'Peru4919']

In [55]:
len(mmpR_mmpL5_double_mutant_isolate_IDs)

82

Set of SNP sites in **atpE** in which strains are probed for the presence of an *alternate* allele

In [56]:
SNP_func_annotation_DF[SNP_func_annotation_DF.gene_name == 'atpE']

Unnamed: 0,pos,ref,alt,gene_category,gene_name,gene_id,gene_pos,SNP_type,AA_change,mut_key
1461068_T,1461068,C,T,Essential,atpE,Rv1305,24,S,G8G,1461068_T
1461080_T,1461080,C,T,Essential,atpE,Rv1305,36,S,G12G,1461080_T
1461081_A,1461081,G,A,Essential,atpE,Rv1305,37,N,G13S,1461081_A
1461090_G,1461090,A,G,Essential,atpE,Rv1305,46,N,I16V,1461090_G
1461095_A,1461095,G,A,Essential,atpE,Rv1305,51,N,M17I,1461095_A
1461096_T,1461096,G,T,Essential,atpE,Rv1305,52,N,A18S,1461096_T
1461119_A,1461119,T,A,Essential,atpE,Rv1305,75,S,G25G,1461119_A
1461120_G,1461120,A,G,Essential,atpE,Rv1305,76,N,I26V,1461120_G
1461143_T,1461143,C,T,Essential,atpE,Rv1305,99,S,N33N,1461143_T
1461154_G,1461154,C,G,Essential,atpE,Rv1305,110,N,S37C,1461154_G


In [57]:
np.shape(SNP_func_annotation_DF[SNP_func_annotation_DF.gene_name == 'atpE'])

(29, 10)

In [58]:
atpE_muts_for_mmpR_mmpL5_double_mutants = get_strains_with_SNPs_in_gene('Rv1305', mmpR_mmpL5_double_mutant_isolate_IDs)

In [59]:
atpE_muts_for_mmpR_mmpL5_double_mutants

Unnamed: 0,pos,ref,alt,isolate_ID,lineage_call,gene_category,gene_name,gene_id,gene_pos,SNP_type,AA_change,mut_key


## [5.2] Check *atpE* for mmpR / mmpL5 double mutant isolates that were not part of the Peruvian cluster (n=2)

Get list of isolates that had both a *mmpR* frameshift **and** a *mmpL5* insertion (from other notebook)

In [60]:
mmpR_mmpL5_double_mutant_isolate_IDs = ['SAMN12551172','SAMD00117890']

In [61]:
len(mmpR_mmpL5_double_mutant_isolate_IDs)

2

Set of SNP sites in **atpE** in which strains are probed for the presence of an *alternate* allele

In [62]:
SNP_func_annotation_DF[SNP_func_annotation_DF.gene_name == 'atpE'].head()

Unnamed: 0,pos,ref,alt,gene_category,gene_name,gene_id,gene_pos,SNP_type,AA_change,mut_key
1461068_T,1461068,C,T,Essential,atpE,Rv1305,24,S,G8G,1461068_T
1461080_T,1461080,C,T,Essential,atpE,Rv1305,36,S,G12G,1461080_T
1461081_A,1461081,G,A,Essential,atpE,Rv1305,37,N,G13S,1461081_A
1461090_G,1461090,A,G,Essential,atpE,Rv1305,46,N,I16V,1461090_G
1461095_A,1461095,G,A,Essential,atpE,Rv1305,51,N,M17I,1461095_A


In [63]:
np.shape(SNP_func_annotation_DF[SNP_func_annotation_DF.gene_name == 'atpE'])

(29, 10)

In [64]:
atpE_muts_for_mmpR_mmpL5_double_mutants = get_strains_with_SNPs_in_gene('Rv1305', mmpR_mmpL5_double_mutant_isolate_IDs)

In [65]:
atpE_muts_for_mmpR_mmpL5_double_mutants

Unnamed: 0,pos,ref,alt,isolate_ID,lineage_call,gene_category,gene_name,gene_id,gene_pos,SNP_type,AA_change,mut_key


In [66]:
mmpR_mmpL5_double_mutant_isolate_IDs

['SAMN12551172', 'SAMD00117890']

# [6] Check *rrs* for *eis* promoter / *eis* double mutant isolates

In [67]:
SNP_func_annotation_DF[(SNP_func_annotation_DF.pos >= 1471846) & (SNP_func_annotation_DF.pos <= 1473382)].head()

Unnamed: 0,pos,ref,alt,gene_category,gene_name,gene_id,gene_pos,SNP_type,AA_change,mut_key
1471848_A,1471848,T,A,,,Rvnr01,3,N,F1L,1471848_A
1471850_T,1471850,G,T,,,Rvnr01,5,N,C2F,1471850_T
1471861_A,1471861,T,A,,,Rvnr01,16,N,L6M,1471861_A
1471861_C,1471861,T,C,,,Rvnr01,16,S,L6L,1471861_C
1471861_G,1471861,T,G,,,Rvnr01,16,N,L6V,1471861_G


Get list of isolates that were additional 12 *eis* C-14T mutants with AG MICs and/or had both the *eis* promoter SNP **and** the *eis* LOF mutations

Check additional 12 *eis* C-14T mutants with AG MICs Strains: 
1. IT952
1. IT947
1. IT77
1. IT634
1. IT524
1. IT233
1. IT184
1. IT123
1. IT1070
1. 655-19
1. 622-19
1. 168-19

Other strains that have a *eis* promoter mutation & *eis* LOF mutation:
1. SAMN07956543
1. SAMN02584676
1. SAMEA1016073
1. SAMN04633319
1. SAMEA1403638
1. SAMEA1403685
1. SAMN02419586
1. SAMN02419559
1. SAMN02419535
1. SAMN07236283
1. SAMN02419543
1. SAMN07236283
1. Peru3354
1. Peru2946
1. SAMN08376196
1. SAMN08709032
1. SAMN06210015
1. SAMN02584612

#### Check that the additional 12 *eis* C-14T mutants (with AG MICs) carry the *eis* promoter mutation (C-14T in *eis* orientation, i.e. G→A on the H37Rv reference strand @ position 2715346)

In [68]:
get_strains_with_SNPs_in_gene('Rv2416c_Rv2417c', ['IT952','IT947','IT77','IT634','IT524','IT233','IT184','IT123','IT1070','655-19','622-19','168-19'])

Unnamed: 0,mut_key,pos,ref,alt,isolate_ID,lineage_call,gene_category,gene_name,gene_id,gene_pos,SNP_type,AA_change
0,2715346_A,2715346,G,A,168-19,2.2.1.1.1,,,Rv2416c_Rv2417c,,I,
1,2715346_A,2715346,G,A,622-19,2.2.1.1.1.i3,,,Rv2416c_Rv2417c,,I,
2,2715346_A,2715346,G,A,655-19,4.3.i3.1,,,Rv2416c_Rv2417c,,I,
3,2715346_A,2715346,G,A,IT1070,2.2.1.1.1.i3,,,Rv2416c_Rv2417c,,I,
4,2715346_A,2715346,G,A,IT123,2.2.1.1.1,,,Rv2416c_Rv2417c,,I,
5,2715346_A,2715346,G,A,IT184,2.2.1.1.1,,,Rv2416c_Rv2417c,,I,
6,2715346_A,2715346,G,A,IT233,2.2.1.1.1.i3,,,Rv2416c_Rv2417c,,I,
7,2715346_A,2715346,G,A,IT524,4.10.i2,,,Rv2416c_Rv2417c,,I,
8,2715346_A,2715346,G,A,IT634,2.2.1.1.1.i3,,,Rv2416c_Rv2417c,,I,
9,2715346_A,2715346,G,A,IT77,2.2.1.1.1.i3,,,Rv2416c_Rv2417c,,I,


In [69]:
strains_of_interest = ['IT952','IT947','IT77','IT634','IT524','IT233','IT184','IT123','IT1070','655-19','622-19','168-19','SAMN07956543','SAMN02584676','SAMEA1016073','SAMN04633319','SAMEA1403638','SAMEA1403685','SAMN02419586','SAMN02419559','SAMN02419535','SAMN07236283','SAMN02419543','SAMN07236283','Peru3354','Peru2946','SAMN08376196','SAMN08709032','SAMN06210015','SAMN02584612']
print(len(strains_of_interest))

30


In [70]:
# filter Isolate annotation file
strains_to_keep = SNP_isolate_annotation_DF.isolate_ID.isin(strains_of_interest).values # boolean filter for strains to keep
# reset_index returns a fresh, 0..n-1 indexed frame (no in-place edit of a filtered slice),
# so the row order matches the column order of a genotypes matrix filtered with the same mask
SNP_isolate_annotation_DF_filtered = SNP_isolate_annotation_DF[strains_to_keep].reset_index(drop = True)

In [71]:
SNP_isolate_annotation_DF_filtered

Unnamed: 0,isolate_ID,lineage_call
0,SAMEA1016073,2.2.1.1.1.i3
1,SAMN08376196,2.2.1.1.1
2,SAMEA1403685,2.2.1.1.1.i3
3,SAMEA1403638,2.2.1.1.1.i3
4,SAMN06210015,2.2.1.1.1
5,SAMN02419535,2.2.1.1.1.i3
6,SAMN02419543,2.2.1.1.1.i3
7,SAMN07236283,1.1.1.1
8,SAMN02584676,2.2.1.1.1.i3
9,SAMN02419586,2.2.1.1.1.i3


In [72]:
# Build a table with one row per rrs SNP of interest and one column per isolate.
# The first four columns hold the SNP annotation ('pos','ref','gene_pos','alt'); the remaining
# columns are the isolates, in the row order of SNP_isolate_annotation_DF_filtered. That frame
# must have been filtered with the same strains_of_interest, because get_genotypes_for_SNP
# returns its genotypes in isolate-annotation order and the values are assigned positionally.
rrs_genotypes_for_strains_of_interest = pd.DataFrame(columns = ['pos','ref','gene_pos','alt'] + list(SNP_isolate_annotation_DF_filtered.isolate_ID))

# add strain lineage calls (the four SNP annotation columns have no lineage, hence the NaN padding)
rrs_genotypes_for_strains_of_interest.loc['lineage_call', :] = [np.nan, np.nan, np.nan, np.nan] + list(SNP_isolate_annotation_DF_filtered.lineage_call)

# A1401G (gene position 1401 of Rvnr01)
rrs_genotypes_for_strains_of_interest.loc['rrs_SNP_1', :] = get_genotypes_for_SNP(strains_of_interest, 'Rvnr01', 1401)

# C1402T (gene position 1402 of Rvnr01)
rrs_genotypes_for_strains_of_interest.loc['rrs_SNP_2', :] = get_genotypes_for_SNP(strains_of_interest, 'Rvnr01', 1402)

# G1484T (gene position 1484 of Rvnr01)
rrs_genotypes_for_strains_of_interest.loc['rrs_SNP_3', :] = get_genotypes_for_SNP(strains_of_interest, 'Rvnr01', 1484)

In [73]:
# Transposed view: the SNP metadata fields and the isolates become rows, lineage_call and the rrs SNPs become columns
rrs_genotypes_for_strains_of_interest.T

Unnamed: 0,lineage_call,rrs_SNP_1,rrs_SNP_2,rrs_SNP_3
pos,,1473246,1473247,1473329
ref,,A,C,G
gene_pos,,1401,1402,1484
alt,,[G],"[A, T]","[C, T]"
SAMEA1016073,2.2.1.1.1.i3,A,C,G
SAMN08376196,2.2.1.1.1,A,C,G
SAMEA1403685,2.2.1.1.1.i3,A,C,G
SAMEA1403638,2.2.1.1.1.i3,A,C,G
SAMN06210015,2.2.1.1.1,A,C,G
SAMN02419535,2.2.1.1.1.i3,A,C,G


Output as CSV file

In [74]:
# Write the transposed table (isolates as rows, rrs SNPs as columns) to disk
rrs_check_csv_path = '/n/data1/hms/dbmi/farhat/Roger/mmpR_BDQ_mutant_project/CSV files/rrs_resistance_mut_check_extra_strains_and_eis_prom_eis_double_muts.csv'
rrs_genotypes_for_strains_of_interest.T.to_csv(rrs_check_csv_path)

### Check the *rrs* mutations using the other function

In [79]:
# Collect the SNPs found in rrs (gene id 'Rvnr01') among the strains of interest.
# NOTE(review): get_strains_with_SNPs_in_gene is defined outside this section; judging by the table
# displayed below it returns one row per (mutation, isolate) pair - confirm against its definition.
rrs_muts_for_strains_of_interest = get_strains_with_SNPs_in_gene('Rvnr01', strains_of_interest)

In [80]:
# Remove the annotation columns that are not needed for this check (modifies the table in place)
columns_not_needed = ['SNP_type','AA_change','gene_category']
rrs_muts_for_strains_of_interest.drop(columns_not_needed, axis=1, inplace=True)

# Label every row with the gene symbol
num_rrs_mut_rows = len(rrs_muts_for_strains_of_interest)
rrs_muts_for_strains_of_interest.loc[:, 'gene_name'] = ['rrs'] * num_rrs_mut_rows

In [81]:
# Show the rrs mutation calls after dropping the unused columns and filling in gene_name
rrs_muts_for_strains_of_interest

Unnamed: 0,mut_key,pos,ref,alt,isolate_ID,lineage_call,gene_name,gene_id,gene_pos
0,1472359_C,1472359,A,C,655-19,4.3.i3.1,rrs,Rvnr01,514
1,1472362_T,1472362,C,T,SAMEA1016073,2.2.1.1.1.i3,rrs,Rvnr01,517
2,1472362_T,1472362,C,T,SAMEA1403685,2.2.1.1.1.i3,rrs,Rvnr01,517
3,1472362_T,1472362,C,T,SAMEA1403638,2.2.1.1.1.i3,rrs,Rvnr01,517
4,1472362_T,1472362,C,T,SAMN02419535,2.2.1.1.1.i3,rrs,Rvnr01,517
5,1472362_T,1472362,C,T,SAMN02419543,2.2.1.1.1.i3,rrs,Rvnr01,517
6,1472362_T,1472362,C,T,SAMN07236283,1.1.1.1,rrs,Rvnr01,517
7,1472362_T,1472362,C,T,SAMN02584676,2.2.1.1.1.i3,rrs,Rvnr01,517
8,1472362_T,1472362,C,T,SAMN02419586,2.2.1.1.1.i3,rrs,Rvnr01,517
9,1472362_T,1472362,C,T,SAMN02419559,2.2.1.1.1.i3,rrs,Rvnr01,517


# [7] Check co-occurrence of 4 most common *eis* promoter mutations with 3 *rrs* aminoglycoside resistance conferring mutations

- Jnawali et al. illustrates why checking for *rrs* A1401G, C1402T and G1484T is important (i.e. it describes two *eis* -14 mutants with different LoF mutations that coincide with *rrs* 1401)

- How often does each of the four classical *eis* promoter mutations (i.e. c-14t, g-37t, g-10a, and c-12t) **occur without** any of the three *rrs* mutations in our sample?

*eis* promoter mutations associated with KAN resistance

- eis G-10A, 2715342, C>T
- eis C-12T, 2715344, G>A
- eis C-14T, 2715346, G>A
- eis G-37T, 2715369, C>A

*rrs* mutations associated with AG resistance
- A1401G, 1473246, A>G
- C1402T, 1473247, C>T
- G1484T, 1473329, G>T

In [87]:
# Reference positions of the four eis promoter SNVs followed by the three rrs SNVs
SNP_sites_of_interest = [2715342, 2715344, 2715346, 2715369, 1473246, 1473247, 1473329]

# Integer codes used for each base in the genotypes matrix; 9 indicates a bad quality call
base_code_dict = {'A':0, 'C':1, 'G':2, 'T':3}
base_code_dict_r = {0:'A', 1:'C', 2:'G', 3:'T', 9:'N'}

# (variant label, resistance allele) keyed by reference position, so a label can never be attached to the
# wrong row if SNP_annotation_DF is ordered differently from what a positional list would assume.
# Alleles are written as reference-strand bases (e.g. eis G-10A is C>T at 2715342).
variant_label_and_res_allele_by_pos = {
    1473246: ('rrs A1401G', 'G'),
    1473247: ('rrs C1402T', 'T'),
    1473329: ('rrs G1484T', 'T'),
    2715342: ('eis G-10A', 'T'),
    2715344: ('eis C-12T', 'A'),
    2715346: ('eis C-14T', 'A'),
    2715369: ('eis G-37T', 'A'),
}

# filter SNP sites
SNP_sites_to_keep = SNP_annotation_DF.pos.isin(SNP_sites_of_interest).values # boolean filter for SNP sites to keep
# reset_index returns a new frame, so the column assignments below do not write into a slice of SNP_annotation_DF
SNP_annotation_DF_filtered = SNP_annotation_DF[SNP_sites_to_keep].reset_index(drop = True)
assert len(SNP_annotation_DF_filtered) == len(SNP_sites_of_interest), 'expected exactly one annotation row per SNP site of interest'

SNP_annotation_DF_filtered.loc[:, 'variant'] = [variant_label_and_res_allele_by_pos[pos_i][0] for pos_i in SNP_annotation_DF_filtered.pos]
SNP_annotation_DF_filtered.loc[:, 'resistance_allele'] = [variant_label_and_res_allele_by_pos[pos_i][1] for pos_i in SNP_annotation_DF_filtered.pos]

# filter Genotypes Matrix (same boolean filter, applied to its rows)
SNP_genotypes_array_filtered = SNP_genotypes_array[SNP_sites_to_keep , :]

In [98]:
# Show the annotation rows for the seven sites of interest with their variant labels and resistance alleles.
# NOTE(review): gene_pos for the rrs rows (1400/1401/1483) is one less than the coordinate in the
# variant label (1401/1402/1484) - presumably gene_pos is 0-based in this table; confirm.
SNP_annotation_DF_filtered

Unnamed: 0,pos,ref,alt,gene_id,gene_pos,gene_category,gene_name,variant,resistance_allele
0,1473246,A,[G],Rvnr01,1400.0,,,rrs A1401G,G
1,1473247,C,"[A, T]",Rvnr01,1401.0,,,rrs C1402T,T
2,1473329,G,"[C, T]",Rvnr01,1483.0,,,rrs G1484T,T
3,2715342,C,"[T, G]",intergenic,,,,eis G-10A,T
4,2715344,G,[A],intergenic,,,,eis C-12T,A
5,2715346,G,[A],intergenic,,,,eis C-14T,A
6,2715369,C,[A],intergenic,,,,eis G-37T,A


In [95]:
# Genotypes matrix restricted to the SNP sites of interest: one row per site, entries are base codes
# (0=A, 1=C, 2=G, 3=T, 9=bad quality call, as defined by base_code_dict_r)
SNP_genotypes_array_filtered

array([[0, 0, 9, ..., 9, 0, 0],
       [1, 1, 9, ..., 9, 1, 1],
       [2, 2, 9, ..., 9, 2, 2],
       ...,
       [2, 2, 2, ..., 2, 2, 2],
       [2, 2, 2, ..., 0, 0, 0],
       [1, 1, 1, ..., 1, 1, 1]], dtype=int8)

In [96]:
# Dimensions of the filtered genotypes matrix (its rows were selected by SNP site)
SNP_genotypes_array_filtered.shape

(7, 31440)

### Find the number of isolates with each SNV (just to double check)

In [101]:
# For every SNV of interest, count the isolates whose genotype code equals the code of the resistance allele.
# The (reset) DataFrame index doubles as the row number of the SNV in the filtered genotypes matrix.
num_isolates_with_variant_allele = [
    np.sum(SNP_genotypes_array_filtered[SNV_row, :] == base_code_dict[res_allele])
    for SNV_row, res_allele in zip(SNP_annotation_DF_filtered.index, SNP_annotation_DF_filtered.resistance_allele)
]

SNP_annotation_DF_filtered.loc[:,'num_isolates'] = num_isolates_with_variant_allele

In [104]:
# Show the annotation table again, now with num_isolates (number of isolates carrying each resistance allele)
SNP_annotation_DF_filtered

Unnamed: 0,pos,ref,alt,gene_id,gene_pos,gene_category,gene_name,variant,resistance_allele,num_isolates
0,1473246,A,[G],Rvnr01,1400.0,,,rrs A1401G,G,1452
1,1473247,C,"[A, T]",Rvnr01,1401.0,,,rrs C1402T,T,18
2,1473329,G,"[C, T]",Rvnr01,1483.0,,,rrs G1484T,T,26
3,2715342,C,"[T, G]",intergenic,,,,eis G-10A,T,293
4,2715344,G,[A],intergenic,,,,eis C-12T,A,332
5,2715346,G,[A],intergenic,,,,eis C-14T,A,181
6,2715369,C,[A],intergenic,,,,eis G-37T,A,285


### Find the number of times an *eis* promoter KAN res mutation occurs with an *rrs* AG res mutation

In [136]:
# Co-occurrence table: rows are the rrs SNVs, columns are the eis promoter SNVs, each cell is the
# number of isolates that carry BOTH resistance alleles.
rrs_variants = ['rrs A1401G', 'rrs C1402T', 'rrs G1484T']
eis_prom_variants = ['eis G-10A', 'eis C-12T', 'eis C-14T', 'eis G-37T']
eis_prom_rrs_co_occur_count = pd.DataFrame(index = rrs_variants, columns = eis_prom_variants)

# Row of the genotypes matrix for each variant, looked up by label rather than by hard-coded row
# numbers so the counts stay correct whatever the row order of SNP_annotation_DF_filtered is.
variant_row_index = dict(zip(SNP_annotation_DF_filtered.variant, SNP_annotation_DF_filtered.index))

for rrs_variant in rrs_variants:

    rrs_SNV_index = variant_row_index[rrs_variant]

    #boolean filter that is TRUE for isolates that have the rrs SNV
    rrs_SNV_bool = SNP_genotypes_array_filtered[rrs_SNV_index,:] == base_code_dict[SNP_annotation_DF_filtered.loc[rrs_SNV_index,'resistance_allele']]

    for eis_prom_variant in eis_prom_variants:

        eis_prom_SNV_index = variant_row_index[eis_prom_variant]

        #boolean filter that is TRUE for isolates that have the eis prom SNV
        eis_prom_SNV_bool = SNP_genotypes_array_filtered[eis_prom_SNV_index,:] == base_code_dict[SNP_annotation_DF_filtered.loc[eis_prom_SNV_index,'resistance_allele']]

        #count the number of isolates that have BOTH the eis prom SNV and the rrs SNV
        num_isolates_both_SNVs = np.sum(rrs_SNV_bool & eis_prom_SNV_bool)

        #store in DataFrame
        eis_prom_rrs_co_occur_count.loc[rrs_variant, eis_prom_variant] = num_isolates_both_SNVs

In [137]:
# Co-occurrence counts: rows are rrs SNVs, columns are eis promoter SNVs
eis_prom_rrs_co_occur_count

Unnamed: 0,eis G-10A,eis C-12T,eis C-14T,eis G-37T
rrs A1401G,2,23,2,2
rrs C1402T,0,0,0,0
rrs G1484T,1,1,0,0


Add in row/column that indicates the total number of isolates that have each allele

In [138]:
# Per-variant isolate totals are read from SNP_annotation_DF_filtered.num_isolates instead of being
# typed in by hand, so they cannot drift from the genotype data.
num_isolates_by_variant = dict(zip(SNP_annotation_DF_filtered.variant, SNP_annotation_DF_filtered.num_isolates))

# new row: total number of isolates carrying each eis promoter SNV (one value per existing column)
eis_prom_rrs_co_occur_count.loc['# isolates eis prom SNV',:] = [num_isolates_by_variant[variant_i] for variant_i in ['eis G-10A', 'eis C-12T', 'eis C-14T', 'eis G-37T']] # eis promoter mutations

# new column: total number of isolates carrying each rrs SNV; the trailing 0 is a placeholder for the totals row added above
eis_prom_rrs_co_occur_count.loc[:,'# isolates rrs SNV'] = [num_isolates_by_variant[variant_i] for variant_i in ['rrs A1401G', 'rrs C1402T', 'rrs G1484T']] + [0] # rrs mutations

In [139]:
# Co-occurrence table after adding the per-SNV isolate totals (one extra row and one extra column)
eis_prom_rrs_co_occur_count

Unnamed: 0,eis G-10A,eis C-12T,eis C-14T,eis G-37T,# isolates rrs SNV
rrs A1401G,2,23,2,2,1452
rrs C1402T,0,0,0,0,18
rrs G1484T,1,1,0,0,26
# isolates eis prom SNV,293,332,181,285,0


Add in row that indicates the total number of isolates with each of the four classical eis promoter mutations (i.e. c-14t, g-37t, g-10a, and c-12t) that **occur without** any of the three rrs mutations.

In [140]:
# Count, for every eis promoter SNV, the isolates that carry it and carry NONE of the three rrs SNVs.
# This is computed directly from the genotypes: subtracting the pairwise co-occurrence counts from the
# totals would subtract an isolate twice if it carried two different rrs SNVs.
variant_row_index = dict(zip(SNP_annotation_DF_filtered.variant, SNP_annotation_DF_filtered.index))
variant_res_allele_code = dict(zip(SNP_annotation_DF_filtered.variant, [base_code_dict[allele_i] for allele_i in SNP_annotation_DF_filtered.resistance_allele]))

#boolean filter that is TRUE for isolates that have at least one of the rrs SNVs
has_any_rrs_SNV = np.zeros(SNP_genotypes_array_filtered.shape[1], dtype = bool)
for rrs_variant in ['rrs A1401G', 'rrs C1402T', 'rrs G1484T']:
    has_any_rrs_SNV |= (SNP_genotypes_array_filtered[variant_row_index[rrs_variant], :] == variant_res_allele_code[rrs_variant])

num_isolates_eis_prom_SNV_no_rrs_SNV = []
for eis_prom_variant in ['eis G-10A', 'eis C-12T', 'eis C-14T', 'eis G-37T']:
    has_eis_prom_SNV = (SNP_genotypes_array_filtered[variant_row_index[eis_prom_variant], :] == variant_res_allele_code[eis_prom_variant])
    num_isolates_eis_prom_SNV_no_rrs_SNV.append(int(np.sum(has_eis_prom_SNV & ~has_any_rrs_SNV)))

# the trailing 0 is a placeholder for the '# isolates rrs SNV' column
eis_prom_rrs_co_occur_count.loc['# isolates eis prom SNV and no rrs SNV',:] = num_isolates_eis_prom_SNV_no_rrs_SNV + [0] # eis promoter mutations

In [143]:
# Convert every cell of the table to integer dtype (the name is rebound to the converted copy)
eis_prom_rrs_co_occur_count = eis_prom_rrs_co_occur_count.astype(int)

In [144]:
# Final co-occurrence table with integer counts, including the totals and the "no rrs SNV" row
eis_prom_rrs_co_occur_count

Unnamed: 0,eis G-10A,eis C-12T,eis C-14T,eis G-37T,# isolates rrs SNV
rrs A1401G,2,23,2,2,1452
rrs C1402T,0,0,0,0,18
rrs G1484T,1,1,0,0,26
# isolates eis prom SNV,293,332,181,285,0
# isolates eis prom SNV and no rrs SNV,290,308,179,283,0


In [145]:
# Save the final co-occurrence table as an Excel workbook
co_occur_count_xlsx_path = '/n/data1/hms/dbmi/farhat/Roger/mmpR_BDQ_mutant_project/CSV files/eis_prom_SNV_rrs_SNV_co_occur_count.xlsx'
eis_prom_rrs_co_occur_count.to_excel(co_occur_count_xlsx_path)