# This notebook was created to 1. collect (SNP & INDEL) variants in *whiB7* and the *whiB7* promoter and 2. detect instances of co-occurrence of LoF variants in *whiB7* and the *whiB7* promoter

In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
import vcf

%matplotlib inline
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import matplotlib.ticker as ticker
from itertools import compress
from pylab import MaxNLocator
import seaborn as sns; sns.set()
from matplotlib.colors import LogNorm
from matplotlib import gridspec
import ast
import itertools
import seaborn as sns
from sklearn.preprocessing import StandardScaler

import fastcluster
from sklearn import cluster, datasets
import scipy.cluster.hierarchy as hier
from sklearn.cluster import KMeans
import time
import sys
import pickle

import Bio
from Bio.Alphabet import IUPAC
from Bio.Blast.Applications import NcbiblastnCommandline
from Bio.Blast import NCBIXML
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.SeqFeature import SeqFeature, FeatureLocation
from Bio import pairwise2
from Bio import SeqIO
from Bio.Graphics import GenomeDiagram
from Bio.SeqUtils import GC
from Bio import Phylo

from Bio.Align.Applications import MuscleCommandline
from StringIO import StringIO
from Bio import AlignIO
from Bio.Align import AlignInfo
from Bio.Seq import MutableSeq
import itertools
import gzip

import networkx as nx
import scipy
from collections import Counter

# *whiB7*

promoter: 3,568,680 - 3,569,082

coding: 3,568,401 - 3,568,679

H37Rv region: 3,568,401 - 3,569,082

In [3]:
# H37Rv reference coordinates bounding the whiB7 coding sequence plus its promoter (both ends inclusive)
H37Rv_region_start, H37Rv_region_end = 3568401, 3569082

# gene_id values used further down to pick variants out of the annotation tables
regulator_id = 'Rv3197A_Rv3198c' # intergenic (promoter) region
regulon_id = 'Rv3197A' # whiB7 coding sequence
gene_symbol = 'whiB7'

### *Function* to convert lineage-calls to new/simpler lineage call scheme

In [4]:
def convert_lineage_calls(isolate_annotation_DF, lineage_map_csv = '/n/data1/hms/dbmi/farhat/Roger/mmpR_BDQ_mutant_project/CSV files/lineage_hierachical_to_lineage_map.csv'):
    """Replace hierarchical lineage calls with their shorter equivalents.

    Parameters
    ----------
    isolate_annotation_DF : pandas.DataFrame
        Must contain a 'lineage_call' column. The column is overwritten in place.
    lineage_map_csv : str, optional
        CSV file with the columns 'lineage_hierarchical' (call to look up) and
        'lineage' (replacement call). Defaults to the project-wide map.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with 'lineage_call' updated.
        Calls that are absent from the map are left unchanged.
    """

    lineage_map_df = pd.read_csv(lineage_map_csv)

    # a plain dict always hands back one scalar per call; indexing a Series whose index contains
    # a duplicated call returns a Series instead (if a call is listed twice, the last row wins here)
    hierarchical_to_short_call = dict(zip(lineage_map_df.lineage_hierarchical, lineage_map_df.lineage))

    # use the shorter call when one exists, otherwise keep the original call
    lineage_call_update_list = [hierarchical_to_short_call.get(hierarchical_lineage_call, hierarchical_lineage_call) for hierarchical_lineage_call in isolate_annotation_DF.lineage_call]

    # replace the hierarchical calls with the shorter ones
    isolate_annotation_DF.loc[:,'lineage_call'] = lineage_call_update_list

    return isolate_annotation_DF

# [1] INDELs

## [1.1] Load INDEL genotype matrix and Annotation Files

### 31,428 isolates

In [5]:
# isolate annotation (describes the columns of the Genotype Matrix): keep only the isolate ID and
# lineage call, then switch to the shorter lineage call scheme
INDEL_isolate_annotation_DF = convert_lineage_calls(
    pd.read_pickle('/n/data1/hms/dbmi/farhat/Roger/homoplasy_project/rolling_DB_scrape_indels/Genotypes_Filtered_2/genotypes_isolate_annotation.pkl').loc[:, ['isolate_ID','lineage_call']]
)

# INDEL annotation (describes the rows of the Genotype Matrix) with gene annotation information;
# the pickled index is moved into a regular column
INDEL_annotation_DF = pd.read_pickle('/n/data1/hms/dbmi/farhat/Roger/homoplasy_project/rolling_DB_scrape_indels/Genotypes_Filtered_2/genotypes_INDEL_functional_annotation.pkl').reset_index(drop = False)

# Genotypes Matrix (indels x isolates)
INDEL_genotypes_array = np.load('/n/data1/hms/dbmi/farhat/Roger/homoplasy_project/rolling_DB_scrape_indels/Genotypes_Filtered_2/genotypes_matrix.npy')

In [6]:
INDEL_isolate_annotation_DF.head()

Unnamed: 0,isolate_ID,lineage_call
0,SAMN13051687,2.2.1.1.1.i3
1,SAMN09100245,4.3.i4.1
2,SAMN08732238,2.2.1.1.1
3,SAMN07658260,3.1.1
4,SAMN03648003,2.2.1.1.1


In [7]:
np.shape(INDEL_isolate_annotation_DF)

(31428, 2)

In [8]:
INDEL_annotation_DF.head()

Unnamed: 0,key,pos,ref,alt,gene_category,gene_name,gene_id,gene_pos,ins_del,INDEL_type,codon_pos
0,ACCGACGAAG_313_A,313,ACCGACGAAG,A,Essential,dnaA,Rv0001,313.0,del,inframe,105.0
1,TC_1549_T,1549,TC,T,,,Rv0001_Rv0002,,del,frameshift,
2,T_1552_TAA,1552,T,TAA,,,Rv0001_Rv0002,,ins,frameshift,
3,TAA_1552_T,1552,TAA,T,,,Rv0001_Rv0002,,del,frameshift,
4,T_1552_TA,1552,T,TA,,,Rv0001_Rv0002,,ins,frameshift,


In [9]:
np.shape(INDEL_annotation_DF)

(50260, 11)

In [10]:
INDEL_genotypes_array

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int8)

In [11]:
np.shape(INDEL_genotypes_array)

(50260, 31428)

### additional 12 eis C-14T mutants with AG MICs

In [12]:
# isolate annotation for the extra strains (describes the columns of their Genotype Matrix),
# reduced to isolate ID + lineage call and converted to the shorter lineage call scheme
isolate_annotation_DF_extra_strains = convert_lineage_calls(
    pd.read_pickle('/n/data1/hms/dbmi/farhat/Roger/mmpR_BDQ_mutant_project/eis_promoter_mutant_strains_from_Milan/Genotypes/Genotypes_Filtered/genotypes_isolate_annotation.pkl').loc[:, ['isolate_ID','lineage_call']]
)

# INDEL Genotypes Matrix for the extra strains
# NOTE(review): the annotation above is read from the 'Genotypes' directory while this matrix is read
# from 'Genotypes_indels' - confirm that the columns of this matrix follow the isolate order of that file
genotypes_array_extra_strains = np.load('/n/data1/hms/dbmi/farhat/Roger/mmpR_BDQ_mutant_project/eis_promoter_mutant_strains_from_Milan/Genotypes_indels/Genotypes_Filtered/genotypes_matrix.npy')

In [13]:
isolate_annotation_DF_extra_strains.head()

Unnamed: 0,isolate_ID,lineage_call
0,168-19,2.2.1.1.1
1,622-19,2.2.1.1.1.i3
2,655-19,4.3.i3.1
3,IT1070,2.2.1.1.1.i3
4,IT123,2.2.1.1.1


In [14]:
genotypes_array_extra_strains

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 9, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int8)

In [15]:
np.shape(genotypes_array_extra_strains)

(50260, 12)

### Merge *isolate annotation files* and *genotypes matrices* together

In [16]:
# merge isolate annotation dataframes (main collection first, extra strains last) and renumber rows 0..n-1
# pd.concat is used because DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0
# NOTE: running this cell again stacks the extra strains a second time
INDEL_isolate_annotation_DF = pd.concat([INDEL_isolate_annotation_DF, isolate_annotation_DF_extra_strains], ignore_index = True)

In [17]:
np.shape(INDEL_isolate_annotation_DF)

(31440, 2)

In [18]:
# merge genotypes matrices: add the extra strains as new columns to the right of the main matrix
INDEL_genotypes_array = np.concatenate((INDEL_genotypes_array, genotypes_array_extra_strains), axis = 1)

In [19]:
np.shape(INDEL_genotypes_array)

(50260, 31440)

## [1.2] Subset to targeted chromosomal region

In [20]:
# keep only indels that occur within chromosomal region (both bounds inclusive)
indels_in_targeted_region = list(INDEL_annotation_DF[(INDEL_annotation_DF.pos >= H37Rv_region_start) & (INDEL_annotation_DF.pos <= H37Rv_region_end)].key)

# construct boolean filter; Series.isin does a hash-based lookup instead of scanning the list once per row
indels_to_keep_filter = INDEL_annotation_DF.key.isin(indels_in_targeted_region).values

# filter Genotypes Matrix (rows = indels)
INDEL_genotypes_array = INDEL_genotypes_array[indels_to_keep_filter , :]

# filter INDEL annotation file and re-index it so the new index matches the row numbers of the
# filtered genotypes matrix (chained, so the filtered selection is not modified in place)
INDEL_annotation_DF = INDEL_annotation_DF[indels_to_keep_filter].reset_index(drop = True)

In [21]:
INDEL_annotation_DF.head()

Unnamed: 0,key,pos,ref,alt,gene_category,gene_name,gene_id,gene_pos,ins_del,INDEL_type,codon_pos
0,C_3568429_CCG,3568429,C,CCG,Non-Essential,whiB7,Rv3197A,251,ins,frameshift,84
1,C_3568434_CAA,3568434,C,CAA,Non-Essential,whiB7,Rv3197A,246,ins,frameshift,82
2,GCTT_3568439_G,3568439,GCTT,G,Non-Essential,whiB7,Rv3197A,241,del,inframe,81
3,C_3568487_CG,3568487,C,CG,Non-Essential,whiB7,Rv3197A,193,ins,frameshift,65
4,GC_3568488_G,3568488,GC,G,Non-Essential,whiB7,Rv3197A,192,del,frameshift,64


In [22]:
np.shape(INDEL_annotation_DF)

(26, 11)

In [23]:
INDEL_genotypes_array

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int8)

In [24]:
np.shape(INDEL_genotypes_array)

(26, 31440)

# [2] SNPs

### 31,428 isolates

## [2.1] Load SNP genotype matrix and Annotation Files

In [25]:
# isolate annotation (describes the columns of the Genotype Matrix): keep only the isolate ID and
# lineage call, then switch to the shorter lineage call scheme
SNP_isolate_annotation_DF = convert_lineage_calls(
    pd.read_pickle('/n/data1/hms/dbmi/farhat/Roger/homoplasy_project/rolling_DB_scrape/Genotypes_Filtered_2/genotypes_isolate_annotation.pkl').loc[:, ['isolate_ID','lineage_call']]
)

# SNP annotation (describes the rows of the Genotype Matrix): keep position / ref / alt and move
# the pickled index into a regular 'index' column
SNP_annotation_DF = (
    pd.read_pickle('/n/data1/hms/dbmi/farhat/Roger/homoplasy_project/rolling_DB_scrape/Genotypes_Filtered_2/genotypes_SNP_annotation.pkl')
    .loc[:, ['pos','ref','alt']]
    .reset_index(drop = False)
)

# Genotypes Matrix (SNP sites x isolates)
SNP_genotypes_array = np.load('/n/data1/hms/dbmi/farhat/Roger/homoplasy_project/rolling_DB_scrape/Genotypes_Filtered_2/genotypes_matrix.npy')

In [26]:
SNP_isolate_annotation_DF.head()

Unnamed: 0,isolate_ID,lineage_call
0,SAMEA3558733,4.3.i4.2
1,SAMN03648641,4.4.1.1
2,SAMN03647419,3.1.1.i1
3,SAMEA3671418,4.3.i2
4,SAMN07659096,1.1.3


In [27]:
np.shape(SNP_isolate_annotation_DF)

(31428, 2)

In [28]:
SNP_annotation_DF.head()

Unnamed: 0,index,pos,ref,alt
0,0,48,C,[T]
1,1,64,G,[C]
2,2,67,G,"[A, T]"
3,3,69,C,[T]
4,4,71,C,[T]


In [29]:
np.shape(SNP_annotation_DF)

(782565, 4)

In [30]:
SNP_genotypes_array

array([[1, 1, 1, ..., 1, 1, 1],
       [2, 2, 2, ..., 2, 2, 2],
       [2, 2, 2, ..., 2, 2, 2],
       ...,
       [1, 9, 9, ..., 1, 1, 1],
       [1, 9, 1, ..., 1, 1, 1],
       [1, 9, 1, ..., 1, 1, 1]], dtype=int8)

In [31]:
np.shape(SNP_genotypes_array)

(782565, 31428)

### additional 12 eis C-14T mutants with AG MICs

In [32]:
# isolate annotation for the extra strains (describes the columns of their Genotype Matrix),
# reduced to isolate ID + lineage call and converted to the shorter lineage call scheme
isolate_annotation_DF_extra_strains = convert_lineage_calls(
    pd.read_pickle('/n/data1/hms/dbmi/farhat/Roger/mmpR_BDQ_mutant_project/eis_promoter_mutant_strains_from_Milan/Genotypes/Genotypes_Filtered/genotypes_isolate_annotation.pkl').loc[:, ['isolate_ID','lineage_call']]
)

# SNP Genotypes Matrix for the extra strains
genotypes_array_extra_strains = np.load('/n/data1/hms/dbmi/farhat/Roger/mmpR_BDQ_mutant_project/eis_promoter_mutant_strains_from_Milan/Genotypes/Genotypes_Filtered/genotypes_matrix.npy')

In [33]:
isolate_annotation_DF_extra_strains.head()

Unnamed: 0,isolate_ID,lineage_call
0,168-19,2.2.1.1.1
1,622-19,2.2.1.1.1.i3
2,655-19,4.3.i3.1
3,IT1070,2.2.1.1.1.i3
4,IT123,2.2.1.1.1


In [34]:
genotypes_array_extra_strains

array([[1, 1, 1, ..., 1, 1, 1],
       [2, 2, 2, ..., 2, 2, 2],
       [2, 2, 2, ..., 2, 2, 2],
       ...,
       [1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 1, 1, 1]], dtype=int8)

In [35]:
np.shape(genotypes_array_extra_strains)

(782565, 12)

### Merge *isolate annotation files* and *genotypes matrices* together

In [36]:
# merge isolate annotation dataframes (main collection first, extra strains last) and renumber rows 0..n-1
# pd.concat is used because DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0
# NOTE: running this cell again stacks the extra strains a second time
SNP_isolate_annotation_DF = pd.concat([SNP_isolate_annotation_DF, isolate_annotation_DF_extra_strains], ignore_index = True)

In [37]:
np.shape(SNP_isolate_annotation_DF)

(31440, 2)

In [38]:
# merge genotypes matrices: add the extra strains as new columns to the right of the main matrix
SNP_genotypes_array = np.concatenate((SNP_genotypes_array, genotypes_array_extra_strains), axis = 1)

In [39]:
np.shape(SNP_genotypes_array)

(782565, 31440)

## [2.2] Subset to targeted chromosomal region

In [40]:
# keep only SNPs that occur within chromosomal region (both bounds inclusive)
SNPs_in_targeted_region = list(SNP_annotation_DF[(SNP_annotation_DF.pos >= H37Rv_region_start) & (SNP_annotation_DF.pos <= H37Rv_region_end)].pos)

# construct boolean filter; Series.isin does a hash-based lookup instead of scanning the list once per row
SNPs_to_keep_filter = SNP_annotation_DF.pos.isin(SNPs_in_targeted_region).values

# filter Genotypes Matrix (rows = SNP sites)
SNP_genotypes_array = SNP_genotypes_array[SNPs_to_keep_filter , :]

# filter SNP annotation file and re-index it so the new index matches the row numbers of the
# filtered genotypes matrix (chained, so the filtered selection is not modified in place)
SNP_annotation_DF = SNP_annotation_DF[SNPs_to_keep_filter].reset_index(drop = True)

# remember which row of the filtered genotypes matrix each site lives in
SNP_annotation_DF.loc[:,'array_index'] = SNP_annotation_DF.index

In [41]:
SNP_annotation_DF.head()

Unnamed: 0,index,pos,ref,alt,array_index
0,631520,3568406,C,[T],0
1,631521,3568410,A,[C],1
2,631522,3568417,T,[G],2
3,631523,3568427,G,[C],3
4,631524,3568435,G,[A],4


In [42]:
np.shape(SNP_annotation_DF)

(150, 5)

#### Re-format SNP annotation DataFrame so that each alternate allele is its own row

In [43]:
# Collect one record per (site, alternate allele) and build the DataFrame once at the end;
# assigning rows one at a time with .loc re-allocates the frame on every insertion.
expanded_SNP_rows = []
for SNP_i_index, SNP_i_pos, SNP_i_ref, SNP_i_alt_alleles, SNP_i_array_index in zip(SNP_annotation_DF['index'], SNP_annotation_DF.pos, SNP_annotation_DF.ref, SNP_annotation_DF.alt, SNP_annotation_DF.array_index):

    #iterate through alternate alleles, store a row for each one
    for SNP_i_alt_allele in SNP_i_alt_alleles:

        # every allele of a site keeps that site's array_index (its row in the genotypes matrix)
        expanded_SNP_rows.append([SNP_i_index, SNP_i_pos, SNP_i_ref, SNP_i_alt_allele, SNP_i_array_index])

# dtype = object matches the column dtypes produced by filling an empty DataFrame row by row
SNP_annotation_DF_mod = pd.DataFrame(expanded_SNP_rows, columns = ['index','pos','ref','alt','array_index'], dtype = object)

SNP_annotation_DF = SNP_annotation_DF_mod
#create new column to merge func annotation on (format: <pos>_<alt allele>)
SNP_annotation_DF.loc[:, 'key'] = [str(SNP_i_pos) + '_' + SNP_i_alt for SNP_i_pos, SNP_i_alt in zip(SNP_annotation_DF.pos, SNP_annotation_DF.alt)]

In [44]:
SNP_annotation_DF.head()

Unnamed: 0,index,pos,ref,alt,array_index,key
0,631520,3568406,C,T,0,3568406_T
1,631521,3568410,A,C,1,3568410_C
2,631522,3568417,T,G,2,3568417_G
3,631523,3568427,G,C,3,3568427_C
4,631524,3568435,G,A,4,3568435_A


In [45]:
np.shape(SNP_annotation_DF)

(160, 6)

#### Merge functional annotation data

In [46]:
# functional annotation for every SNP
SNP_func_annotation_DF = pd.read_pickle('/n/data1/hms/dbmi/farhat/Roger/homoplasy_project/rolling_DB_scrape/Genotypes_Filtered_2/genotypes_SNP_functional_annotation.pkl')

# expose the index labels as a regular 'key' column so the table can be merged on it
SNP_func_annotation_DF['key'] = SNP_func_annotation_DF.index

In [47]:
SNP_func_annotation_DF.head()

Unnamed: 0,pos,ref,alt,gene_category,gene_name,gene_id,gene_pos,SNP_type,AA_change,key
48_T,48,C,T,Essential,dnaA,Rv0001,48,S,V16V,48_T
64_C,64,G,C,Essential,dnaA,Rv0001,64,N,G22R,64_C
67_A,67,G,A,Essential,dnaA,Rv0001,67,N,D23N,67_A
67_T,67,G,T,Essential,dnaA,Rv0001,67,N,D23Y,67_T
69_T,69,C,T,Essential,dnaA,Rv0001,69,S,D23D,69_T


In [48]:
# attach the functional annotation to each (site, alt allele) row, re-arrange the columns and
# rename two of them so the SNP table lines up with the INDEL table
SNP_annotation_DF = (
    SNP_annotation_DF
    .merge(SNP_func_annotation_DF.loc[: , ['gene_category','gene_name','gene_id','gene_pos','SNP_type','AA_change','key']], how = 'left', on = 'key') # left merge keeps every SNP row
    .loc[:, ['key','pos','ref','alt','gene_category','gene_name','gene_id','gene_pos','SNP_type','AA_change','array_index']] #re-arrange columns
    .rename(columns={'SNP_type':'mut_type', 'AA_change':'codon_pos'}) #change column names
)

In [49]:
SNP_annotation_DF.head()

Unnamed: 0,key,pos,ref,alt,gene_category,gene_name,gene_id,gene_pos,mut_type,codon_pos,array_index
0,3568406_T,3568406,C,T,Non-Essential,whiB7,Rv3197A,274,N,A92T,0
1,3568410_C,3568410,A,C,Non-Essential,whiB7,Rv3197A,270,S,A90A,1
2,3568417_G,3568417,T,G,Non-Essential,whiB7,Rv3197A,263,N,K88T,2
3,3568427_C,3568427,G,C,Non-Essential,whiB7,Rv3197A,253,N,R85G,3
4,3568435_A,3568435,G,A,Non-Essential,whiB7,Rv3197A,245,N,P82L,4


In [50]:
np.shape(SNP_annotation_DF)

(160, 11)

In [51]:
SNP_genotypes_array

array([[1, 1, 1, ..., 1, 1, 1],
       [0, 0, 0, ..., 0, 0, 0],
       [3, 3, 3, ..., 3, 3, 3],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 1, 1, 1]], dtype=int8)

In [52]:
np.shape(SNP_genotypes_array)

(150, 31440)

# [3] Pull info for isolates with mutant alleles

## [3.1] Get genotypes for SNPs of interest

### *Function* to get the genotypes for a specific SNP and get attributes for isolates that support SNP

In [53]:
def get_genotypes_for_SNP(SNP_i_pos, SNP_i_alt_allele):
    """Summarise which isolates carry a given alternate allele at a given SNP site.

    Reads the notebook-level objects SNP_genotypes_array, SNP_annotation_DF and
    SNP_isolate_annotation_DF.

    Parameters
    ----------
    SNP_i_pos : reference position of the SNP site (matched against SNP_annotation_DF.pos)
    SNP_i_alt_allele : alternate base, one of 'A', 'C', 'G', 'T'

    Returns
    -------
    list of three items:
        number of isolates carrying the allele,
        number of distinct lineage calls among those isolates,
        string listing each lineage call with its isolate count, e.g. 'x(2) - y(1)'
    """

    # numeric code used for this base in the genotypes matrix (9 indicates a bad quality call)
    alt_allele_code = {'A':0, 'C':1, 'G':2, 'T':3}[SNP_i_alt_allele]

    # row of the genotypes matrix for this site, taken from the first annotation row at this position
    site_annotation = SNP_annotation_DF[SNP_annotation_DF.pos == SNP_i_pos]
    site_genotypes = SNP_genotypes_array[site_annotation.array_index.values[0] , :]

    # boolean filter over isolates: True where the isolate supports this SNP call
    isolate_has_allele = site_genotypes == alt_allele_code
    num_isolates_SNP_i = np.sum(isolate_has_allele)

    # tally the lineage calls of the supporting isolates
    lineage_counts = Counter(SNP_isolate_annotation_DF[isolate_has_allele].lineage_call.values)

    # number of sub-lineages that have SNP support in at least 1 isolate
    num_sublineages_with_SNP_i = len(lineage_counts)

    # 'lineage(count)' for every sub-lineage with support, joined by ' - '
    sublineages_with_SNP = ' - '.join([lineage + '({0})'.format(str(isolate_count)) for lineage, isolate_count in lineage_counts.items()])

    return [num_isolates_SNP_i , num_sublineages_with_SNP_i , sublineages_with_SNP]

In [54]:
SNP_annotation_DF.head()

Unnamed: 0,key,pos,ref,alt,gene_category,gene_name,gene_id,gene_pos,mut_type,codon_pos,array_index
0,3568406_T,3568406,C,T,Non-Essential,whiB7,Rv3197A,274,N,A92T,0
1,3568410_C,3568410,A,C,Non-Essential,whiB7,Rv3197A,270,S,A90A,1
2,3568417_G,3568417,T,G,Non-Essential,whiB7,Rv3197A,263,N,K88T,2
3,3568427_C,3568427,G,C,Non-Essential,whiB7,Rv3197A,253,N,R85G,3
4,3568435_A,3568435,G,A,Non-Essential,whiB7,Rv3197A,245,N,P82L,4


In [55]:
np.shape(SNP_annotation_DF)

(160, 11)

In [56]:
# one [num isolates, num sublineages, sublineage string] summary per (site, alt allele) row
SNP_support_summaries = [get_genotypes_for_SNP(SNP_i_pos, SNP_i_alt_allele) for SNP_i_pos, SNP_i_alt_allele in zip(SNP_annotation_DF.pos, SNP_annotation_DF.alt)]

# split the summaries into one list per output column
num_isolates_with_SNP = [SNP_summary[0] for SNP_summary in SNP_support_summaries]
num_sublineages_with_SNP = [SNP_summary[1] for SNP_summary in SNP_support_summaries]
sublineages_with_SNP = [SNP_summary[2] for SNP_summary in SNP_support_summaries]

SNP_annotation_DF.loc[: , 'num_isolates'] = num_isolates_with_SNP
SNP_annotation_DF.loc[: , 'num_sublineages'] = num_sublineages_with_SNP
SNP_annotation_DF.loc[: , 'sublineages'] = sublineages_with_SNP

#drop SNPs present in 0 isolates and renumber the remaining rows
SNP_annotation_DF = SNP_annotation_DF[SNP_annotation_DF.num_isolates > 0].reset_index(drop = True)

In [57]:
SNP_annotation_DF.head()

Unnamed: 0,key,pos,ref,alt,gene_category,gene_name,gene_id,gene_pos,mut_type,codon_pos,array_index,num_isolates,num_sublineages,sublineages
0,3568406_T,3568406,C,T,Non-Essential,whiB7,Rv3197A,274,N,A92T,0,1,1,4.3.i4.2(1)
1,3568410_C,3568410,A,C,Non-Essential,whiB7,Rv3197A,270,S,A90A,1,2,1,1.1.2(2)
2,3568417_G,3568417,T,G,Non-Essential,whiB7,Rv3197A,263,N,K88T,2,2,1,4.10.i1(2)
3,3568427_C,3568427,G,C,Non-Essential,whiB7,Rv3197A,253,N,R85G,3,2,1,2.2.1.1.1(2)
4,3568435_A,3568435,G,A,Non-Essential,whiB7,Rv3197A,245,N,P82L,4,1,1,3.1.1.i2(1)


In [58]:
np.shape(SNP_annotation_DF)

(160, 14)

## [3.2] Get genotypes for indels of interest

### *Function* to get the genotypes for a specific indel and get attributes for isolates that support indel

In [59]:
def get_genotypes_for_indel(indel_i_key):
    """Summarise which isolates carry a given indel.

    Reads the notebook-level objects INDEL_genotypes_array, INDEL_annotation_DF and
    INDEL_isolate_annotation_DF.

    Parameters
    ----------
    indel_i_key : value of the 'key' column of INDEL_annotation_DF identifying the indel

    Returns
    -------
    list of three items:
        number of isolates carrying the indel,
        number of distinct lineage calls among those isolates,
        string listing each lineage call with its isolate count, e.g. 'x(2) - y(1)'
    """

    # the index label of the matching annotation row is used directly as the row number of the
    # genotypes matrix, so the annotation index has to stay aligned with the matrix rows
    indel_row = INDEL_annotation_DF[INDEL_annotation_DF.key == indel_i_key].index[0]

    # 0 supports Ref, 1 supports Alt, 9 indicates a bad quality call
    indel_genotypes = INDEL_genotypes_array[indel_row , :]

    # boolean filter over isolates: True where the isolate supports this indel call
    isolate_has_indel = indel_genotypes == 1
    num_isolates_indel_i = np.sum(isolate_has_indel)

    # tally the lineage calls of the supporting isolates
    lineage_counts = Counter(INDEL_isolate_annotation_DF[isolate_has_indel].lineage_call.values)

    # number of sub-lineages that have indel support in at least 1 isolate
    num_sublineages_with_indel_i = len(lineage_counts)

    # 'lineage(count)' for every sub-lineage with support, joined by ' - '
    sublineages_with_indel = ' - '.join([lineage + '({0})'.format(str(isolate_count)) for lineage, isolate_count in lineage_counts.items()])

    return [num_isolates_indel_i , num_sublineages_with_indel_i , sublineages_with_indel]

In [60]:
INDEL_annotation_DF.head()

Unnamed: 0,key,pos,ref,alt,gene_category,gene_name,gene_id,gene_pos,ins_del,INDEL_type,codon_pos
0,C_3568429_CCG,3568429,C,CCG,Non-Essential,whiB7,Rv3197A,251,ins,frameshift,84
1,C_3568434_CAA,3568434,C,CAA,Non-Essential,whiB7,Rv3197A,246,ins,frameshift,82
2,GCTT_3568439_G,3568439,GCTT,G,Non-Essential,whiB7,Rv3197A,241,del,inframe,81
3,C_3568487_CG,3568487,C,CG,Non-Essential,whiB7,Rv3197A,193,ins,frameshift,65
4,GC_3568488_G,3568488,GC,G,Non-Essential,whiB7,Rv3197A,192,del,frameshift,64


In [61]:
np.shape(INDEL_annotation_DF)

(26, 11)

In [62]:
# one [num isolates, num sublineages, sublineage string] summary per indel
indel_support_summaries = [get_genotypes_for_indel(indel_i_key) for indel_i_key in INDEL_annotation_DF.key]

# split the summaries into one list per output column
num_isolates_with_indel = [indel_summary[0] for indel_summary in indel_support_summaries]
num_sublineages_with_indel = [indel_summary[1] for indel_summary in indel_support_summaries]
sublineages_with_indel = [indel_summary[2] for indel_summary in indel_support_summaries]

INDEL_annotation_DF.loc[: , 'num_isolates'] = num_isolates_with_indel
INDEL_annotation_DF.loc[: , 'num_sublineages'] = num_sublineages_with_indel
INDEL_annotation_DF.loc[: , 'sublineages'] = sublineages_with_indel

#drop indels present in 0 isolates (the index is left as-is, unlike the SNP table which is renumbered)
has_supporting_isolates = INDEL_annotation_DF.num_isolates > 0
INDEL_annotation_DF = INDEL_annotation_DF[has_supporting_isolates]

In [63]:
INDEL_annotation_DF.head()

Unnamed: 0,key,pos,ref,alt,gene_category,gene_name,gene_id,gene_pos,ins_del,INDEL_type,codon_pos,num_isolates,num_sublineages,sublineages
0,C_3568429_CCG,3568429,C,CCG,Non-Essential,whiB7,Rv3197A,251,ins,frameshift,84,1,1,4.3.i3.1(1)
1,C_3568434_CAA,3568434,C,CAA,Non-Essential,whiB7,Rv3197A,246,ins,frameshift,82,2,1,2.2.1.1.1(2)
2,GCTT_3568439_G,3568439,GCTT,G,Non-Essential,whiB7,Rv3197A,241,del,inframe,81,1,1,4.1.i1.1.1.1(1)
3,C_3568487_CG,3568487,C,CG,Non-Essential,whiB7,Rv3197A,193,ins,frameshift,65,3,3,4.10.i1(1) - 4.1.i1.1.1.1(1) - 2.2.1.1.2(1)
4,GC_3568488_G,3568488,GC,G,Non-Essential,whiB7,Rv3197A,192,del,frameshift,64,573,3,1.2.1.1.2(74) - 1.2.1.1(5) - 1.2.1.1.1(494)


In [64]:
np.shape(INDEL_annotation_DF)

(26, 14)

# [4] Check co-occurrence between *regulator* SNPs/INDELs & *regulon* SNPs/INDELs

### SNPs in regulator

In [65]:
regulator_SNPs_DF = SNP_annotation_DF[SNP_annotation_DF.gene_id == regulator_id]

In [66]:
regulator_SNPs_DF.head()

Unnamed: 0,key,pos,ref,alt,gene_category,gene_name,gene_id,gene_pos,mut_type,codon_pos,array_index,num_isolates,num_sublineages,sublineages
59,3568686_C,3568686,T,C,,,Rv3197A_Rv3198c,,I,,56,1,1,1.2.2.1(1)
60,3568686_G,3568686,T,G,,,Rv3197A_Rv3198c,,I,,56,1,1,2.2.1.1.1(1)
61,3568692_T,3568692,C,T,,,Rv3197A_Rv3198c,,I,,57,1,1,1.2.2.1(1)
62,3568701_A,3568701,G,A,,,Rv3197A_Rv3198c,,I,,58,1,1,4.10.i1(1)
63,3568702_T,3568702,G,T,,,Rv3197A_Rv3198c,,I,,59,1,1,3.1.1(1)


In [67]:
np.shape(regulator_SNPs_DF)

(101, 14)

### INDELs in regulator

In [68]:
regulator_INDELs_DF = INDEL_annotation_DF[INDEL_annotation_DF.gene_id == regulator_id]

In [69]:
regulator_INDELs_DF.head()

Unnamed: 0,key,pos,ref,alt,gene_category,gene_name,gene_id,gene_pos,ins_del,INDEL_type,codon_pos,num_isolates,num_sublineages,sublineages
11,C_3568730_CG,3568730,C,CG,,,Rv3197A_Rv3198c,,ins,frameshift,,1,1,4.10.i1(1)
12,CA_3568735_C,3568735,CA,C,,,Rv3197A_Rv3198c,,del,frameshift,,1,1,3.1.1(1)
13,T_3568795_TC,3568795,T,TC,,,Rv3197A_Rv3198c,,ins,frameshift,,1,1,4.1.i1.1.1.1(1)
14,AC_3568854_A,3568854,AC,A,,,Rv3197A_Rv3198c,,del,frameshift,,1,1,2.2.1.1.1(1)
15,C_3568881_CGG,3568881,C,CGG,,,Rv3197A_Rv3198c,,ins,frameshift,,1,1,4.3.i3.1(1)


In [70]:
np.shape(regulator_INDELs_DF)

(15, 14)

Drop **INDEL_type** column and rename **ins_del** column to fit with SNPs DataFrames

In [71]:
# regulator_INDELs_DF is a filtered selection of INDEL_annotation_DF; dropping / renaming columns on it
# in place triggers pandas' SettingWithCopyWarning, so a new frame is built and re-bound instead
regulator_INDELs_DF = (
    regulator_INDELs_DF
    .drop(['INDEL_type'], axis=1)
    .rename(columns={'ins_del':'mut_type'}) #change column names
)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  **kwargs)


### SNPs in regulon

We're going to include SNPs in a gene that either 

- cause a **premature STOP codon** (nonsense mutation)
- **synonymous mutations at the first codon that abolish the start codon** (i.e. eis, whiB7 and mmpR which start with a **valine** as this is not possible with ahpC, mmpS5 or mmpL5, which start with a **methionine**)
- **non-synonymous mutations at the first codon that abolish the start codon** (i.e. that change the start to an AA other than **valine** and **methionine**)

In [72]:
regulon_SNPs_DF = SNP_annotation_DF[SNP_annotation_DF.gene_id == regulon_id]

In [73]:
regulon_SNPs_DF[[(AA_change[-1]=='*') | ((len(AA_change)==3) and (AA_change[1]=='1')) for AA_change in regulon_SNPs_DF.codon_pos]]

Unnamed: 0,key,pos,ref,alt,gene_category,gene_name,gene_id,gene_pos,mut_type,codon_pos,array_index,num_isolates,num_sublineages,sublineages


In [74]:
# keep SNPs whose amino-acid change ends in '*' (premature STOP) or is a 3-character change at codon 1
is_LoF_candidate_SNP = [(AA_change[-1] == '*') or (len(AA_change) == 3 and AA_change[1] == '1') for AA_change in regulon_SNPs_DF.codon_pos]
regulon_SNPs_DF = regulon_SNPs_DF[is_LoF_candidate_SNP]

Save SNPs that occur in the first codon separately

In [75]:
regulon_SNPs_DF[[((len(AA_change)==3) and (AA_change[1]=='1')) for AA_change in regulon_SNPs_DF.codon_pos]].to_excel('/n/data1/hms/dbmi/farhat/Roger/mmpR_BDQ_mutant_project/CSV files/START_codon_SNVs/whiB7_START_codon_SNVs.xlsx')

### INDELs in regulon

#### keep only *frameshift* indels for gene body

In [76]:
INDEL_annotation_DF[(INDEL_annotation_DF.gene_id == regulon_id) & (INDEL_annotation_DF.INDEL_type == 'frameshift')].head()

Unnamed: 0,key,pos,ref,alt,gene_category,gene_name,gene_id,gene_pos,ins_del,INDEL_type,codon_pos,num_isolates,num_sublineages,sublineages
0,C_3568429_CCG,3568429,C,CCG,Non-Essential,whiB7,Rv3197A,251,ins,frameshift,84,1,1,4.3.i3.1(1)
1,C_3568434_CAA,3568434,C,CAA,Non-Essential,whiB7,Rv3197A,246,ins,frameshift,82,2,1,2.2.1.1.1(2)
3,C_3568487_CG,3568487,C,CG,Non-Essential,whiB7,Rv3197A,193,ins,frameshift,65,3,3,4.10.i1(1) - 4.1.i1.1.1.1(1) - 2.2.1.1.2(1)
4,GC_3568488_G,3568488,GC,G,Non-Essential,whiB7,Rv3197A,192,del,frameshift,64,573,3,1.2.1.1.2(74) - 1.2.1.1(5) - 1.2.1.1.1(494)
5,GC_3568501_G,3568501,GC,G,Non-Essential,whiB7,Rv3197A,179,del,frameshift,60,8,1,4.10.i1(8)


In [77]:
regulon_INDELs_DF = INDEL_annotation_DF[(INDEL_annotation_DF.gene_id == regulon_id) & (INDEL_annotation_DF.INDEL_type == 'frameshift')]

In [78]:
regulon_INDELs_DF.head()

Unnamed: 0,key,pos,ref,alt,gene_category,gene_name,gene_id,gene_pos,ins_del,INDEL_type,codon_pos,num_isolates,num_sublineages,sublineages
0,C_3568429_CCG,3568429,C,CCG,Non-Essential,whiB7,Rv3197A,251,ins,frameshift,84,1,1,4.3.i3.1(1)
1,C_3568434_CAA,3568434,C,CAA,Non-Essential,whiB7,Rv3197A,246,ins,frameshift,82,2,1,2.2.1.1.1(2)
3,C_3568487_CG,3568487,C,CG,Non-Essential,whiB7,Rv3197A,193,ins,frameshift,65,3,3,4.10.i1(1) - 4.1.i1.1.1.1(1) - 2.2.1.1.2(1)
4,GC_3568488_G,3568488,GC,G,Non-Essential,whiB7,Rv3197A,192,del,frameshift,64,573,3,1.2.1.1.2(74) - 1.2.1.1(5) - 1.2.1.1.1(494)
5,GC_3568501_G,3568501,GC,G,Non-Essential,whiB7,Rv3197A,179,del,frameshift,60,8,1,4.10.i1(8)


In [79]:
np.shape(regulon_INDELs_DF)

(10, 14)

Drop **INDEL_type** column (since all indels are frameshifts) and rename **ins_del** column to fit with SNPs DataFrames

In [81]:
# regulon_INDELs_DF is a filtered selection of INDEL_annotation_DF; dropping / renaming columns on it
# in place triggers pandas' SettingWithCopyWarning, so a new frame is built and re-bound instead
regulon_INDELs_DF = (
    regulon_INDELs_DF
    .drop(['INDEL_type'], axis=1)
    .rename(columns={'ins_del':'mut_type'}) #change column names
)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


Save DataFrame for *regulator* & *regulon* mutations

In [82]:
# array_index only points into the SNP genotypes matrix, so it is left out of the exported table
regulator_SNPs_DF_for_export = regulator_SNPs_DF.drop(['array_index'], axis=1)
regulon_SNPs_DF_for_export = regulon_SNPs_DF.drop(['array_index'], axis=1)

# stack regulator SNPs, regulon SNPs, regulator INDELs and regulon INDELs (in that order);
# a single pd.concat replaces the nested DataFrame.append calls (append was removed in pandas 2.0)
regulator_regulon_mutations_df = pd.concat([regulator_SNPs_DF_for_export, regulon_SNPs_DF_for_export, regulator_INDELs_DF, regulon_INDELs_DF])

regulator_regulon_mutations_df = (
    regulator_regulon_mutations_df
    .drop(['key'], axis=1) #drop col
    .sort_values(by = 'pos') #sort by reference position
    .reset_index(drop = True) #reset index
)

In [83]:
np.shape(regulator_regulon_mutations_df)

(126, 12)

In [84]:
regulator_regulon_mutations_df

Unnamed: 0,pos,ref,alt,gene_category,gene_name,gene_id,gene_pos,mut_type,codon_pos,num_isolates,num_sublineages,sublineages
0,3568429,C,CCG,Non-Essential,whiB7,Rv3197A,251,ins,84,1,1,4.3.i3.1(1)
1,3568434,C,CAA,Non-Essential,whiB7,Rv3197A,246,ins,82,2,1,2.2.1.1.1(2)
2,3568487,C,CG,Non-Essential,whiB7,Rv3197A,193,ins,65,3,3,4.10.i1(1) - 4.1.i1.1.1.1(1) - 2.2.1.1.2(1)
3,3568488,GC,G,Non-Essential,whiB7,Rv3197A,192,del,64,573,3,1.2.1.1.2(74) - 1.2.1.1(5) - 1.2.1.1.1(494)
4,3568501,GC,G,Non-Essential,whiB7,Rv3197A,179,del,60,8,1,4.10.i1(8)
5,3568547,TCA,T,Non-Essential,whiB7,Rv3197A,133,del,45,2,2,4.3.i4.2(1) - 2.2.2(1)
6,3568626,CA,C,Non-Essential,whiB7,Rv3197A,54,del,18,61,2,2.2.1.1(1) - 2.2.1.1.1(60)
7,3568646,TG,T,Non-Essential,whiB7,Rv3197A,34,del,12,2,1,5(2)
8,3568646,T,TG,Non-Essential,whiB7,Rv3197A,34,ins,12,3,1,2.2.1.1.1(3)
9,3568648,G,GA,Non-Essential,whiB7,Rv3197A,32,ins,11,2,1,4.6.2.1.2(2)


In [85]:
regulator_regulon_mutations_df.to_excel('/n/data1/hms/dbmi/farhat/Roger/mmpR_BDQ_mutant_project/CSV files/mutations_detected_within_isolates/promoter_whiB7_gene_body_mutations_in_31440_isolates.xlsx')

#### Save DataFrame for *regulator* & *regulon* mutations for *Table 1*

**REGULATOR** Subset to the 4 most commonly occurring mutations in the regulator regions

In [86]:
regulator_mutations_df_subset = regulator_regulon_mutations_df[regulator_regulon_mutations_df.gene_id == regulator_id].sort_values(by = 'num_isolates', ascending = False).head(n=4)

**REGULON** Subset to mutations that occurred in at least 2 Mtb sub-lineages in the regulon regions

In [87]:
regulon_mutations_df_subset = regulator_regulon_mutations_df[regulator_regulon_mutations_df.gene_id == regulon_id]
regulon_mutations_df_subset = regulon_mutations_df_subset[regulon_mutations_df_subset.num_sublineages >= 2]

Export as excel file

In [88]:
# combine the regulator and regulon subsets, order them by reference position and renumber the rows
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0
regulator_regulon_subset_mutations_df = (
    pd.concat([regulator_mutations_df_subset, regulon_mutations_df_subset])
    .sort_values(by = 'pos', ascending = True)
    .reset_index(drop = True)
)

In [89]:
regulator_regulon_subset_mutations_df

Unnamed: 0,pos,ref,alt,gene_category,gene_name,gene_id,gene_pos,mut_type,codon_pos,num_isolates,num_sublineages,sublineages
0,3568487,C,CG,Non-Essential,whiB7,Rv3197A,193.0,ins,65.0,3,3,4.10.i1(1) - 4.1.i1.1.1.1(1) - 2.2.1.1.2(1)
1,3568488,GC,G,Non-Essential,whiB7,Rv3197A,192.0,del,64.0,573,3,1.2.1.1.2(74) - 1.2.1.1(5) - 1.2.1.1.1(494)
2,3568547,TCA,T,Non-Essential,whiB7,Rv3197A,133.0,del,45.0,2,2,4.3.i4.2(1) - 2.2.2(1)
3,3568626,CA,C,Non-Essential,whiB7,Rv3197A,54.0,del,18.0,61,2,2.2.1.1(1) - 2.2.1.1.1(60)
4,3568779,A,G,,,Rv3197A_Rv3198c,,I,,256,2,4.4.1.1(244) - 4.2.1.2.2.1(12)
5,3568857,G,A,,,Rv3197A_Rv3198c,,I,,73,2,3.1.1(72) - 3.1(1)
6,3568921,C,G,,,Rv3197A_Rv3198c,,I,,117,2,4.1.i1.1.2(110) - 4.1.i1.1(7)
7,3569029,T,C,,,Rv3197A_Rv3198c,,I,,249,3,4.2.1.1.1.2.2.1.1.1(5) - 4.6.1.1.1.1.2(173) - ...


In [90]:
regulator_regulon_subset_mutations_df.to_excel('/n/data1/hms/dbmi/farhat/Roger/mmpR_BDQ_mutant_project/CSV files/mutations_detected_within_isolates/promoter_whiB7_gene_body_mutations_in_31440_isolates_for_Table_1.xlsx')

### Construct DataFrame (which displays each pair of co-occurring mutations as a separate row with info for each pair of mutations)

In [91]:
# empty table with one row per pair of mutations: columns for mutation A, the same columns for
# mutation B, then columns describing the pair itself
variant_mutation_pair_df = pd.DataFrame(columns=[
    'A_type', 'A_pos', 'A_ref', 'A_alt', 'A_mut_type',
    'A_codon_pos', 'A_gene_id', 'A_gene_name', 'A_gene_pos' ,'A_num_isolates',
    'B_type', 'B_pos', 'B_ref', 'B_alt', 'B_mut_type',
    'B_codon_pos', 'B_gene_id', 'B_gene_name', 'B_gene_pos', 'B_num_isolates',
    'num_isolates_co_occur', 'isolate_IDs', 'sublineages',
])

### Get the ordering of the INDELs genotypes columns in terms of the SNPs genotypes columns

In [92]:
# Build, for every isolate in INDEL column order, the position of that isolate in the SNP annotation table.
# NOTE(review): this cell modifies SNP_isolate_annotation_DF in place (adds a column and replaces the
# index), so running it a second time raises a KeyError because 'isolate_ID' is no longer a column.

# store each isolate's current row label in the SNP annotation table
SNP_isolate_annotation_DF['isolate_order'] = SNP_isolate_annotation_DF.index
# make isolate_ID the index so rows can be looked up by isolate ID
SNP_isolate_annotation_DF.set_index('isolate_ID', drop = True, inplace = True)
# look up the stored SNP row label for each INDEL isolate, in INDEL order
# NOTE(review): presumably every isolate_ID appears exactly once in the SNP table; a repeated ID would
# return more than one row per lookup - confirm
INDEL_isolate_ordering_for_SNP_genotypes_array = SNP_isolate_annotation_DF.loc[INDEL_isolate_annotation_DF.isolate_ID, :].isolate_order.values

In [93]:
INDEL_isolate_ordering_for_SNP_genotypes_array

array([  188,  2733, 24409, ..., 31437, 31438, 31439])

### Fill in dataframe by querying the genotypes matrices

In [94]:
#every (SNP or INDEL in regulator) is compared against every (SNP or INDEL in regulon)
#within each list the SNP keys come first, followed by the INDEL keys
regulator_mut_keys = regulator_SNPs_DF['key'].tolist() + regulator_INDELs_DF['key'].tolist()
regulon_mut_keys = regulon_SNPs_DF['key'].tolist() + regulon_INDELs_DF['key'].tolist()

In [95]:
#columns copied from the per-variant summary tables into every row of variant_mutation_pair_df
variant_info_columns = ['pos','ref','alt','mut_type','codon_pos','gene_id','gene_name','gene_pos','num_isolates']

#number code that corresponds to each alt allele in the SNP genotypes matrix, 9 indicates a bad quality call
#(defined once, up front, so that it is also available when a regulator INDEL is processed before any regulator SNP)
base_code_dict = {'A':0, 'C':1, 'G':2, 'T':3}


def _variant_type_from_key(variant_key):
    """Classify a mutation key by its number of '_' separators.

    Returns 'SNP' for one separator (the key is split below into position and alt allele),
    'INDEL' for two separators, and None for any other key (such keys are skipped).
    """
    num_separators = variant_key.count('_')
    if num_separators == 1:
        return 'SNP'
    if num_separators == 2:
        return 'INDEL'
    return None


def _isolates_carrying_variant(variant_key, variant_type):
    """Return an element-wise comparison of the variant's genotypes row against its 'carried' code.

    SNP rows are taken from SNP_genotypes_array, re-ordered to match the columns of the INDEL
    genotypes matrix, and compared to the number code of the alt allele.
    INDEL rows are taken from INDEL_genotypes_array and compared to 1.
    """
    if variant_type == 'SNP':
        SNP_pos, SNP_alt_allele = variant_key.split('_')
        SNP_alt_allele_code = base_code_dict[SNP_alt_allele]

        #genotypes for the SNP site (first annotation row found at this position)
        genotypes_for_SNP = SNP_genotypes_array[SNP_annotation_DF[SNP_annotation_DF.pos == int(SNP_pos)].array_index.values[0] , :]

        #re-order isolates according to the order of the columns in the INDEL genotypes matrix
        genotypes_for_SNP = genotypes_for_SNP[INDEL_isolate_ordering_for_SNP_genotypes_array]
        return genotypes_for_SNP == SNP_alt_allele_code

    #genotypes for the INDEL
    genotypes_for_indel = INDEL_genotypes_array[INDEL_annotation_DF[INDEL_annotation_DF.key == variant_key].index[0] , :]
    return genotypes_for_indel == 1


def _variant_info(variant_key, variant_type, SNPs_DF, INDELs_DF):
    """Return the variant_info_columns values (as a list) of the first summary-table row matching the variant.

    SNPs are matched on position & alt allele in SNPs_DF, INDELs are matched on key in INDELs_DF.
    """
    if variant_type == 'SNP':
        SNP_pos, SNP_alt_allele = variant_key.split('_')
        matching_rows = SNPs_DF[(SNPs_DF.pos == int(SNP_pos)) & (SNPs_DF.alt == SNP_alt_allele)]
    else:
        matching_rows = INDELs_DF[INDELs_DF.key == variant_key]
    return list(matching_rows.loc[:, variant_info_columns].values[0])


def _sublineage_summary(lineage_calls):
    """Return 'sublineage(count)' entries joined by ' - ', one entry per distinct lineage call."""
    lineage_calls_count_dict = Counter(list(lineage_calls))
    lineage_calls_list = []
    for sublineage_i in lineage_calls_count_dict.keys():
        lineage_calls_list.append(sublineage_i + '({0})'.format(str(lineage_calls_count_dict[sublineage_i])))
    return ' - '.join(lineage_calls_list)


#the genotypes of the regulon variants do not depend on the regulator variant, so get them once (one boolean per isolate per variant)
regulon_variants = []
for variant_j in regulon_mut_keys:
    variant_j_type = _variant_type_from_key(variant_j)
    if variant_j_type is not None:
        regulon_variants.append((variant_j, variant_j_type, _isolates_carrying_variant(variant_j, variant_j_type)))

#index for DataFrame
variant_mutation_pair_df_index = 0

for variant_i in regulator_mut_keys: #iterate through regulator SNPs & INDELs

    variant_i_type = _variant_type_from_key(variant_i)
    if variant_i_type is None:
        continue

    isolates_with_variant_i = _isolates_carrying_variant(variant_i, variant_i_type)

    for variant_j, variant_j_type, isolates_with_variant_j in regulon_variants: #iterate through mutations in regulon

        #boolean filter of isolates with both variants
        isolates_with_both_variants_filter = np.logical_and(isolates_with_variant_i, isolates_with_variant_j)

        #number of isolates that have both variants
        num_isolates_with_both_variants = np.sum(isolates_with_both_variants_filter)

        #only pairs carried together by at least 1 isolate get a row
        if num_isolates_with_both_variants >= 1:

            # get info for mutation in regulator
            regulator_variant_info = _variant_info(variant_i, variant_i_type, regulator_SNPs_DF, regulator_INDELs_DF)

            # get info for mutation in regulon
            regulon_variant_info = _variant_info(variant_j, variant_j_type, regulon_SNPs_DF, regulon_INDELs_DF)

            #isolates carrying both variants (rows of the INDEL isolate annotation line up with the boolean filter)
            isolates_with_both_variants = INDEL_isolate_annotation_DF[isolates_with_both_variants_filter]
            strain_names = [' - '.join(list(isolates_with_both_variants.isolate_ID))]

            # sublineage calls
            strain_lineage_calls = _sublineage_summary(isolates_with_both_variants.lineage_call)

            co_occurring_SNP_i_and_SNP_or_indel_j_info = [variant_i_type] + regulator_variant_info + [variant_j_type] + regulon_variant_info + [num_isolates_with_both_variants] + strain_names + [strain_lineage_calls]
            variant_mutation_pair_df.loc[variant_mutation_pair_df_index, :] = co_occurring_SNP_i_and_SNP_or_indel_j_info
            variant_mutation_pair_df_index += 1

In [96]:
#display the pairs of co-occurring mutations collected by the loop above (one row per pair)
variant_mutation_pair_df

Unnamed: 0,A_type,A_pos,A_ref,A_alt,A_mut_type,A_codon_pos,A_gene_id,A_gene_name,A_gene_pos,A_num_isolates,...,B_alt,B_mut_type,B_codon_pos,B_gene_id,B_gene_name,B_gene_pos,B_num_isolates,num_isolates_co_occur,isolate_IDs,sublineages
0,SNP,3568826,A,G,I,,Rv3197A_Rv3198c,,,1,...,G,del,64,Rv3197A,whiB7,192,573,1,SAMN07766333,1.2.1.1.1(1)
1,INDEL,3568893,TG,T,del,,Rv3197A_Rv3198c,,,9,...,G,del,64,Rv3197A,whiB7,192,573,1,SAMN08436033,1.2.1.1.1(1)
2,INDEL,3568995,G,GC,ins,,Rv3197A_Rv3198c,,,2,...,G,del,64,Rv3197A,whiB7,192,573,1,SAMN07766299,1.2.1.1.1(1)


In [97]:
#(number of co-occurring mutation pairs, number of columns)
variant_mutation_pair_df.shape

(3, 23)

Save dataframe as an Excel (.xlsx) file

In [98]:
#write the table of co-occurring mutation pairs to an Excel workbook
variant_mutation_pair_df.to_excel(
    excel_writer = '/n/data1/hms/dbmi/farhat/Roger/mmpR_BDQ_mutant_project/CSV files/co_occurring_mutations_within_isolates/whiB7_promoter_gene_body_co_occurence_mutations_in_31440_isolates.xlsx'
)