# This notebook was created to include and process (i.e. genotype) an additional 12 *eis* C-14T mutants with AG MICs

In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }<|/style>"))

In [3]:
import vcf

%matplotlib inline
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import matplotlib.ticker as ticker
from itertools import compress
from pylab import MaxNLocator
import seaborn as sns; sns.set()
from matplotlib.colors import LogNorm
from matplotlib import gridspec
import ast
import itertools
import seaborn as sns
from sklearn.preprocessing import StandardScaler

import fastcluster
from sklearn import cluster, datasets
import scipy.cluster.hierarchy as hier
from sklearn.cluster import KMeans
import time
import sys

import Bio
from Bio.Alphabet import IUPAC
from Bio.Blast.Applications import NcbiblastnCommandline
from Bio.Blast import NCBIXML
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.SeqFeature import SeqFeature, FeatureLocation
from Bio import pairwise2
from Bio import SeqIO
from Bio.Graphics import GenomeDiagram
from Bio.SeqUtils import GC

from Bio.Align.Applications import MuscleCommandline
from StringIO import StringIO
from Bio import AlignIO
from Bio.Align import AlignInfo
from Bio.Seq import MutableSeq
import itertools
import gzip

import networkx as nx
import scipy
import pickle

from collections import Counter
import allel # import scikit-allel

#for exporting to Adobe Illustrator
mpl.rcParams['pdf.fonttype'] = 42
mpl.rcParams['ps.fonttype'] = 42

####################################################################################################################################################################################

# [1] Get Coverage for Isolates

####################################################################################################################################################################################

Isolate IDs listed below

In [3]:
strain_tags = ['168-19','622-19','655-19','IT1070','IT123','IT184','IT230','IT233','IT524','IT634','IT77','IT947','IT952']

### Function that takes in an isolate tag, loads the *POS* and *DP* (depth) fields from that isolate's full zipped VCF file and stores this data as a Series

In [4]:
def get_depth_series(tag, rolling_DB_dir = '/n/data1/hms/dbmi/farhat/lfreschi/repos/megapipe/megapipe_snakemake/results_mmpR_eis/'):
    """Load the per-position depth of one isolate from its full zipped VCF.

    Parameters
    ----------
    tag : str
        Isolate ID; the VCF is read from
        ``<rolling_DB_dir><tag>/pilon/<tag>_full.vcf.gz``.
    rolling_DB_dir : str, optional
        Directory (with trailing slash) holding one sub-directory per isolate.
        Defaults to the results directory used throughout this notebook.

    Returns
    -------
    pandas.Series
        Values are the ``DP`` field of every VCF record, indexed by the
        record's ``POS`` (H37Rv reference position).
    """
    # path to full zipped VCF
    zipped_VCF = rolling_DB_dir + tag + '/pilon/' + tag + '_full.vcf.gz'

    # load only the Position & Depth fields, the rest of the VCF is not needed here
    POS_DP_fields = allel.read_vcf(zipped_VCF, fields=['variants/POS', 'variants/DP'])

    # H37Rv Reference Positions as the Index & Depth as the values
    return pd.Series(POS_DP_fields['variants/DP'], index = POS_DP_fields['variants/POS'])

Iterate through all tag names and get depth series for all strains

In [10]:
# map every isolate tag to its depth series (one VCF read per isolate)
depth_series_per_isolate_dict = {tag: get_depth_series(tag) for tag in strain_tags}

In [23]:
# mean depth across all VCF positions for each isolate, kept in the order of strain_tags
avg_depth_series_per_isolate = pd.Series(
    [np.mean(depth_series_per_isolate_dict[tag]) for tag in strain_tags],
    index = strain_tags
)

In [24]:
avg_depth_series_per_isolate

168-19    92.890217
622-19    51.134228
655-19    91.686259
IT1070    58.677926
IT123     61.067880
IT184     62.892872
IT230     18.216280
IT233     76.325863
IT524     40.771388
IT634     28.441705
IT77      43.649817
IT947     67.702494
IT952     58.931473
dtype: float64

- isolate **IT230** from Milan has an average coverage of 18.2x across the genome. One of our filters to call an allele at a SNP site (from the VCF) requires at least a depth of 20x, so we won't be able to call the majority of alleles present at the 790k SNP sites we're analyzing in total, for this isolate, using our allele calling filters.

- isolate **63** from Ukraine doesn't pass our contamination filter (used Kraken for this), the number of reads that map to the MTBC is ~80%. That level of contamination could lead to spurious heterozygous calls which would muddy the observations if we're looking at mixed indel calls for this isolate.

####################################################################################################################################################################################

# [2] Process SNP Genotypes

####################################################################################################################################################################################

#### Side-Note: Some isolates have ~20x coverage and we require at least 20 reads at a given position to make a SNP call

In [4]:
strain_tags = ['168-19','622-19','655-19','IT1070','IT123','IT184','IT233','IT524','IT634','IT77','IT947','IT952']

In [5]:
len(strain_tags)

12

####################################################################################################################################################################################

## [2.1] This section of the notebook prepares the script that iterates over all of the VCF files in rollingDB and outputs a genotype vector.

####################################################################################################################################################################################

In [6]:
import vcf
import os
import pandas as pd
import numpy as np
import sys

#### load SNP annotation file (rows of Genotype Matrix for 31.5k strains) for indexing genotypes

In [7]:
SNV_annotation_DF = pd.read_pickle('/n/data1/hms/dbmi/farhat/Roger/homoplasy_project/rolling_DB_scrape/Genotypes_Filtered_2/genotypes_SNP_annotation.pkl')

#subset to columns that are needed
SNV_annotation_DF = SNV_annotation_DF.loc[:, ['pos','ref','alt']]

In [8]:
SNV_annotation_DF.head()

Unnamed: 0,pos,ref,alt
0,48,C,[T]
1,64,G,[C]
2,67,G,"[A, T]"
3,69,C,[T]
4,71,C,[T]


#### Convert SNV annotation into a dictionary with keys = H37Rv Reference Positions and values = ROW indices of genotype array

In [9]:
#keys: H37Rv Reference Position , values: Genotype Row Index
ref_pos_dict = dict( zip( list(SNV_annotation_DF.pos) , list(SNV_annotation_DF.index) ) )

#### Create set of H37Rv Reference Positions that are SNP sites

In [10]:
H37Rv_ref_positions_of_interest = set(SNV_annotation_DF.pos)

#### Construct 'skeleton' Numpy Array that will store the supported base for all Reference Positions of interest for a single isolate in rollingDB

- **0** for _A_
- **1** for _C_
- **2** for _G_
- **3** for _T_
- **9** for Bad Quality SNV calls

In [11]:
#number of Reference Positions we're interested in (where High Quality SNV calls were made)
number_of_SNVs = len(ref_pos_dict.keys())

In [12]:
number_of_SNVs

782565

### *Function* to get SNP genotypes array from VCF file

In [13]:
def get_genotypes_array_from_full_VCF(isolate_ID):
    """Genotype one isolate at every SNP site of interest and save the vector to disk.

    Reads ``<isolate_ID>_full.vcf.gz`` from the isolate's ``pilon`` directory and
    builds a vector with one entry per key of ``ref_pos_dict``. Each entry is
    0/1/2/3 for a confidently supported A/C/G/T, or 9 when no call passes the
    filters (the default for every site).

    A base is called at a site only when the VCF record
      * is a single-base site (1-base REF, exactly one ALT that is missing or 1 base),
      * has an empty FILTER list (PASS by Pilon),
      * has mean base quality (BQ) > 20 and mean mapping quality (MQ) > 30,
      * has no reads supporting a deletion (DC) or an insertion (IC),
      * has depth (DP) >= 20, and
      * has exactly one base with quality-weighted support (QP) >= 75%.

    Side effects: the isolate's output directory is deleted if it already exists,
    recreated, and the vector is written there as ``<isolate_ID>_genotypes.npy``.
    The isolate ID is printed when done. Nothing is returned.

    Relies on the notebook-level names ``number_of_SNVs`` and ``ref_pos_dict``.
    NOTE: a later cell of this notebook defines an INDEL genotyping function under
    this same name, which shadows this one once it has been run.

    Parameters
    ----------
    isolate_ID : str
        Isolate ID, used for both the input and the output directory names.
    """
    # imported locally: none of the notebook's import cells import shutil
    import shutil

    # thresholds a record must meet before a base is called
    min_mean_base_quality = 20     # BQ must be strictly greater
    min_mean_mapping_quality = 30  # MQ must be strictly greater
    min_valid_depth = 20           # DP must be at least this
    min_allele_support = 75        # QP (%) must be at least this, for exactly one base

    ##################################################################
    # every site starts as 9 (bad quality / missing) until a record proves otherwise
    genotypes_array = np.full(number_of_SNVs , 9 , dtype = int)

    ##################################################################
    # Iterate through the VCF file corresponding to the isolate and fill in genotypes vector

    #genomic data directory
    VCF_genotypes_dir = '/n/data1/hms/dbmi/farhat/lfreschi/repos/megapipe/megapipe_snakemake/results_mmpR_eis/'

    #directory that stores files for each sequenced isolate
    directory_for_sequenced_isolate = VCF_genotypes_dir + isolate_ID + '/pilon/'
    full_VCF_path = directory_for_sequenced_isolate + isolate_ID + '_full.vcf.gz'

    #position-base dictionaries (order of output in Pilon)
    base_order = {0:'A' , 1:'C' , 2:'G', 3:'T'}
    base_order_r = {'A':0 , 'C':1 , 'G':2 , 'T':3}

    # binary mode because the file is gzip-compressed; the context manager closes
    # the handle even if parsing fails part-way through
    with open(full_VCF_path , 'rb') as vcf_file:

        #iterate through each Variant Call
        for record in vcf.Reader(vcf_file):

            # only SNP sites of interest; testing against ref_pos_dict itself guarantees
            # the row lookup below cannot fail, and does not depend on
            # H37Rv_ref_positions_of_interest, which the INDEL section rebinds
            if record.POS not in ref_pos_dict:
                continue

            # single-base site: 1-base Reference Allele, exactly 1 alternate allele
            # and that alternate allele is 1 base long if it exists
            is_single_base_site = (
                (len(record.REF) == 1)
                and (len(record.ALT) == 1)
                and ( (record.ALT[0] is None) or (len(str(record.ALT[0])) == 1) )
            )
            if not is_single_base_site:
                continue

            # the call has to be a PASS by Pilon (empty FILTER list)
            if record.FILTER != []:
                continue

            ##### Retrieve Relevant information for filtering quality of Base Call #####
            BQ = record.INFO['BQ'] # Mean Base Quality @ locus
            MQ = record.INFO['MQ'] # Mean Mapping Quality @ locus
            DC = record.INFO['DC'] # Number of Reads w/ Deletion
            IC = record.INFO['IC'] # Number of Reads w/ Insertion
            VD = record.INFO['DP'] # Depth of Valid Reads in Pileup

            passes_quality_filters = (
                (BQ > min_mean_base_quality)
                and (MQ > min_mean_mapping_quality)
                and (DC == 0)
                and (IC == 0)
                and (VD >= min_valid_depth)
            )
            if not passes_quality_filters:
                continue

            # Quality-Percentage - Percentage of As, Cs, Gs, Ts weighted by Q & MQ at locus
            QP = np.array( record.INFO['QP'] )

            # positions (in Pilon's A,C,G,T order) of the bases with enough read support
            supported_bases_i = np.flatnonzero(QP >= min_allele_support)

            # call the site only when exactly one base is supported
            if len(supported_bases_i) != 1:
                continue

            supported_base = base_order[int(supported_bases_i[0])]

            #store a '0', '1', '2', or '3' in Genotype Vector (Reference Position Index) for the supported Base
            genotypes_array[ref_pos_dict[record.POS]] = base_order_r[supported_base]

    ##################################################################
    # (Re)create an empty directory for the isolate to store output for genotyping

    isolate_genotype_dir = '/n/data1/hms/dbmi/farhat/Roger/mmpR_BDQ_mutant_project/eis_promoter_mutant_strains_from_Milan/Genotypes/Genotypes_per_isolate/' + isolate_ID
    if os.path.exists(isolate_genotype_dir):
        shutil.rmtree(isolate_genotype_dir)
    os.makedirs(isolate_genotype_dir)

    ##################################################################
    # Save Genotypes Array for downstream analysis (np.save appends the '.npy' extension)
    np.save(os.path.join(isolate_genotype_dir , isolate_ID + '_genotypes') , genotypes_array , allow_pickle = True)

    print(isolate_ID)

In [None]:
for isolate_ID in strain_tags:
    get_genotypes_array_from_full_VCF(isolate_ID)

168-19
622-19
655-19
IT1070
IT123
IT184
IT233
IT524
IT634
IT77
IT947


In [14]:
get_genotypes_array_from_full_VCF('IT952')

IT952


## [2.2] Construct Genotype Matrix (from genotype vectors)

In [15]:
%matplotlib inline
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import matplotlib.ticker as ticker
from itertools import compress
from matplotlib.colors import LogNorm
from matplotlib import gridspec
import pickle

In [16]:
strain_tags = ['168-19','622-19','655-19','IT1070','IT123','IT184','IT233','IT524','IT634','IT77','IT947','IT952']

In [17]:
len(strain_tags)

12

### Make sure all isolates have a corresponding _genotypes_ array

In [21]:
isolate_id_series = pd.Series(strain_tags)

In [22]:
isolate_id_series.head()

0    168-19
1    622-19
2    655-19
3    IT1070
4     IT123
dtype: object

In [23]:
np.shape(isolate_id_series)

(12,)

### Concatenate genotypes arrays into a genotypes matrix for all isolates in rollingDB

Store the __genotypes array__ for each isolate into a list

In [25]:
genotypes_directory = '/n/data1/hms/dbmi/farhat/Roger/mmpR_BDQ_mutant_project/eis_promoter_mutant_strains_from_Milan/Genotypes/Genotypes_per_isolate/'
genotype_arrays_per_isolate_list = []

#iterate through the saved genotypes arrays, in the same order as isolate_id_series
for isolate_ID in isolate_id_series:

    #path to the genotypes array saved for this isolate
    isolate_genotypes_path = os.path.join(genotypes_directory , isolate_ID , isolate_ID + '_genotypes.npy')

    #load genotypes array and convert its dtype to 'int8' to save lots of memory
    #(the codes stored are only 0-3 and 9, so nothing is lost)
    genotype_arrays_per_isolate_list.append( np.load(isolate_genotypes_path).astype('int8') )

Stack genotype arrays for all isolates to create one __genotypes matrix__ for all isolates 

In [26]:
genotypes_matrix = np.stack(genotype_arrays_per_isolate_list , axis = -1)

In [27]:
np.shape(genotypes_matrix)

(782565, 12)

In [28]:
genotypes_matrix

array([[1, 1, 1, ..., 1, 1, 1],
       [2, 2, 2, ..., 2, 2, 2],
       [2, 2, 2, ..., 2, 2, 2],
       ...,
       [1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 1, 1, 1]], dtype=int8)

In [29]:
print("%d bytes" % (genotypes_matrix.size * genotypes_matrix.itemsize))

9390780 bytes


Genotypes Matrix is ~9.4 MB large

Save __Genotypes Matrix__ in case kernel crashes when filtering

In [30]:
#save Genotypes Matrix
np.save('/n/data1/hms/dbmi/farhat/Roger/mmpR_BDQ_mutant_project/eis_promoter_mutant_strains_from_Milan/Genotypes/Genotypes_Filtered/genotypes_matrix' , genotypes_matrix , allow_pickle = True)

Load __SNP annotation file__

In [31]:
SNP_annotation_DF = pd.read_pickle('/n/data1/hms/dbmi/farhat/Roger/homoplasy_project/rolling_DB_scrape/Genotypes_Filtered_2/genotypes_SNP_annotation.pkl')

In [32]:
SNP_annotation_DF.head()

Unnamed: 0,pos,ref,alt,gene_id,gene_pos,gene_category,gene_name
0,48,C,[T],Rv0001,47.0,Essential,dnaA
1,64,G,[C],Rv0001,63.0,Essential,dnaA
2,67,G,"[A, T]",Rv0001,66.0,Essential,dnaA
3,69,C,[T],Rv0001,68.0,Essential,dnaA
4,71,C,[T],Rv0001,70.0,Essential,dnaA


In [33]:
np.shape(SNP_annotation_DF)

(782565, 7)

####################################################################################################################################################################################

# [3]  Lineage Typing using SNPs

####################################################################################################################################################################################

### [3.1] Load/Construct SNP genotype matrix and Annotation Files

Columns of Genotype Matrix

In [4]:
strain_tags = ['168-19','622-19','655-19','IT1070','IT123','IT184','IT233','IT524','IT634','IT77','IT947','IT952']
isolate_annotation_DF = pd.Series(strain_tags)

In [5]:
isolate_annotation_DF.head()

0    168-19
1    622-19
2    655-19
3    IT1070
4     IT123
dtype: object

In [6]:
np.shape(isolate_annotation_DF)

(12,)

Rows of Genotype Matrix

In [7]:
#load SNP annotation file (rows of Genotype Matrix) with gene annotation information
SNP_annotation_DF = pd.read_pickle('/n/data1/hms/dbmi/farhat/Roger/homoplasy_project/rolling_DB_scrape/Genotypes_Filtered_2/genotypes_SNP_annotation.pkl')

In [8]:
SNP_annotation_DF.head()

Unnamed: 0,pos,ref,alt,gene_id,gene_pos,gene_category,gene_name
0,48,C,[T],Rv0001,47.0,Essential,dnaA
1,64,G,[C],Rv0001,63.0,Essential,dnaA
2,67,G,"[A, T]",Rv0001,66.0,Essential,dnaA
3,69,C,[T],Rv0001,68.0,Essential,dnaA
4,71,C,[T],Rv0001,70.0,Essential,dnaA


In [9]:
np.shape(SNP_annotation_DF)

(782565, 7)

Genotype Matrix

In [10]:
#load Genotypes Matrix
genotypes_array =  np.load('/n/data1/hms/dbmi/farhat/Roger/mmpR_BDQ_mutant_project/eis_promoter_mutant_strains_from_Milan/Genotypes/Genotypes_Filtered/genotypes_matrix.npy')

In [11]:
genotypes_array

array([[1, 1, 1, ..., 1, 1, 1],
       [2, 2, 2, ..., 2, 2, 2],
       [2, 2, 2, ..., 2, 2, 2],
       ...,
       [1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 1, 1, 1]], dtype=int8)

In [12]:
np.shape(genotypes_array)

(782565, 12)

### [3.2] Import  Luca's SNP barcode

#### Import 96-SNP barcode from Luca's work

In [13]:
all_barcode_diagnostic_SNPs = pd.read_csv('/home/lf61/lf61/mic_assemblies/40-full-analysis/lin-sp-var-10k/results/validation_resistant_isolates/snp_scheme_freschi_experimental.tsv' , sep = '\t')

In [14]:
all_barcode_diagnostic_SNPs.head()

Unnamed: 0,#lineage,position,gene_coord,allele_change,codon_number,codon_change,aa_change,locus_id,gene_name
0,3.1.1,635139,,G/C,,,,,
1,3,342873,,C/T,,,,,
2,3.2.2,16727,,G/A,,,,,
3,3.2,63850,,T/C,,,,,
4,3.1,4044872,,G/A,,,,,


Drop redundant SNP

In [15]:
all_barcode_diagnostic_SNPs[all_barcode_diagnostic_SNPs.loc[: , '#lineage'] == '4.2.1.2.1']

Unnamed: 0,#lineage,position,gene_coord,allele_change,codon_number,codon_change,aa_change,locus_id,gene_name
20,4.2.1.2.1,63771,,C/T,,,,,


In [16]:
all_barcode_diagnostic_SNPs[all_barcode_diagnostic_SNPs.loc[: , '#lineage'] == '4.2.1.2']

Unnamed: 0,#lineage,position,gene_coord,allele_change,codon_number,codon_change,aa_change,locus_id,gene_name
27,4.2.1.2,63771,,C/T,,,,,


In [17]:
all_barcode_diagnostic_SNPs = all_barcode_diagnostic_SNPs[all_barcode_diagnostic_SNPs.loc[: , '#lineage'] != '4.2.1.2']

In [18]:
np.shape(all_barcode_diagnostic_SNPs)

(96, 9)

#### Subset DataFrame to SNPs that we have present in our Genotypes Matrix

In [19]:
barcode_SNPs_in_Genotypes_Matrix = []
barcode_SNPs_not_in_Genotypes_Matrix = []

#reference positions of every SNP site in our Genotypes Matrix; built once as a set so that
#each membership test below is a constant-time lookup instead of a scan of a freshly built list
genotyped_SNP_positions = set(SNP_annotation_DF.pos)

#iterate through each of the barcode SNPs and find which ones are present in our dataset
for SNP_i in all_barcode_diagnostic_SNPs.index:

    if int( all_barcode_diagnostic_SNPs.loc[SNP_i , 'position'] ) in genotyped_SNP_positions:

        barcode_SNPs_in_Genotypes_Matrix.append(SNP_i)

    else:

        barcode_SNPs_not_in_Genotypes_Matrix.append(SNP_i)

#subset to SNPs that we have in Genotypes Matrix
barcode_diagnostic_SNPs = all_barcode_diagnostic_SNPs.loc[barcode_SNPs_in_Genotypes_Matrix , :]

#barcode SNPs whose position is not a row of our Genotypes Matrix
barcode_diagnostic_SNPs_missing = all_barcode_diagnostic_SNPs.loc[barcode_SNPs_not_in_Genotypes_Matrix , :]

In [20]:
barcode_diagnostic_SNPs.head()

Unnamed: 0,#lineage,position,gene_coord,allele_change,codon_number,codon_change,aa_change,locus_id,gene_name
0,3.1.1,635139,,G/C,,,,,
1,3,342873,,C/T,,,,,
2,3.2.2,16727,,G/A,,,,,
3,3.2,63850,,T/C,,,,,
4,3.1,4044872,,G/A,,,,,


In [21]:
np.shape(barcode_diagnostic_SNPs)

(94, 9)

In [22]:
barcode_diagnostic_SNPs_missing.head()

Unnamed: 0,#lineage,position,gene_coord,allele_change,codon_number,codon_change,aa_change,locus_id,gene_name
8,3.1.2,17842,,G/C,,,,,
17,4.2.1.1.1.2.1.1.1.1.2,126249,,C/A,,,,,


In [23]:
np.shape(barcode_diagnostic_SNPs_missing)

(2, 9)

The original 96 SNP barcode outlined in Luca's analysis corresponds to 96 branches, we have 94 of 96 SNPs in our Genotypes Matrix.

#### Format DataFrame with 94 lineage-defining SNPs

In [24]:
#work on an explicit copy so the new columns below are never written onto a view of barcode_diagnostic_SNPs
barcode_SNP_annot = barcode_diagnostic_SNPs.loc[: , ['#lineage' , 'position' , 'allele_change'] ].copy()

#create Reference & Alternate allele columns from Allele Change column (formatted as 'ref/alt')
ref_alt_allele_pairs = [ref_alt_alleles.split('/') for ref_alt_alleles in barcode_SNP_annot.loc[: , 'allele_change'] ]
barcode_SNP_annot['ref'] = [ref_alt_pair[0] for ref_alt_pair in ref_alt_allele_pairs]
barcode_SNP_annot['alt'] = [ref_alt_pair[1] for ref_alt_pair in ref_alt_allele_pairs]

#drop unnecessary Allele Change column, rename reference position column and reset index to 0..n-1
barcode_SNP_annot = (
    barcode_SNP_annot.loc[: , ['#lineage' , 'position' , 'ref' , 'alt'] ]
    .rename(columns={"position": "pos"})
    .reset_index(drop = True)
)

In [25]:
np.shape(barcode_SNP_annot)

(94, 4)

In [26]:
barcode_SNP_annot.head()

Unnamed: 0,#lineage,pos,ref,alt
0,3.1.1,635139,G,C
1,3,342873,C,T
2,3.2.2,16727,G,A
3,3.2,63850,T,C
4,3.1,4044872,G,A


#### Create a column that contains the diagnostic allele and another that contains the base code for the diagnostic allele

__Note__: The diagnostic allele is the Reference Allele (not the Alternate Allele) for these two SNPs

In [27]:
barcode_SNP_annot[barcode_SNP_annot.pos == 931123]

Unnamed: 0,#lineage,pos,ref,alt
90,4**,931123,T,C


In [28]:
barcode_SNP_annot.loc[90 , '#lineage'] = '4'

In [29]:
barcode_SNP_annot[barcode_SNP_annot.pos == 1759252]

Unnamed: 0,#lineage,pos,ref,alt
89,4.2.1.1.1.1.1.1.i2**,1759252,G,T


In [30]:
barcode_SNP_annot.loc[89 , '#lineage'] = '4.2.1.1.1.1.1.1.i2'

In [31]:
#map from each base to the integer code used in the Genotypes Matrix
base_to_code_map = {'A':0 , 'C':1 , 'G':2 , 'T':3}

#reference positions of the two SNPs whose diagnostic allele is the reference allele;
#for every other barcode SNP the alternate allele is the diagnostic allele
ref_allele_diagnostic_positions = [931123 , 1759252]

#diagnostic allele for each barcode SNP, in row order
diag_allele = [
    barcode_ref_allele if barcode_pos in ref_allele_diagnostic_positions else barcode_alt_allele
    for barcode_pos , barcode_ref_allele , barcode_alt_allele
    in zip(barcode_SNP_annot.pos , barcode_SNP_annot.ref , barcode_SNP_annot.alt)
]

#integer code of each diagnostic allele
diag_allele_code = [base_to_code_map[barcode_diag_allele] for barcode_diag_allele in diag_allele]

#append columns to barcode SNP DF
barcode_SNP_annot['diag_allele'] = diag_allele
barcode_SNP_annot['diag_allele_code'] = diag_allele_code

In [32]:
barcode_SNP_annot.tail()

Unnamed: 0,#lineage,pos,ref,alt,diag_allele,diag_allele_code
89,4.2.1.1.1.1.1.1.i2,1759252,G,T,G,2
90,4,931123,T,C,T,3
91,5,1799921,C,A,A,0
92,6,1816587,C,G,G,2
93,7,1137518,G,A,A,0


### [3.3] Assign Isolates to Lineage and Sub-lineages based off of SNP barcode

#### Subset Genotypes matrix to SNP barcode SNPs

In [33]:
#reference positions of the barcode SNPs; built once as a set so that each of the
#~780k membership tests below is a constant-time lookup
barcode_SNP_positions = set(barcode_SNP_annot.pos)

#boolean filter (one entry per row of the Genotypes Matrix) marking the barcode SNPs
barcode_SNPs_filter = [SNP_i in barcode_SNP_positions for SNP_i in SNP_annotation_DF.pos]

#subset SNP annotation to the annotation for just the SNP barcode
barcode_SNPs_in_genotypes_matrix = SNP_annotation_DF[barcode_SNPs_filter]

In [34]:
barcode_SNPs_in_genotypes_matrix.head()

Unnamed: 0,pos,ref,alt,gene_id,gene_pos,gene_category,gene_name
71,324,C,[G],Rv0001,323.0,Essential,dnaA
4524,15177,C,"[T, G]",Rv0013,263.0,Non-Essential,trpG
4720,15890,G,[A],Rv0014c,300.0,Essential,pknB
4904,16526,C,[T],Rv0014c,936.0,Essential,pknB
4958,16727,G,[A],Rv0014c,1137.0,Essential,pknB


In [35]:
np.shape( barcode_SNPs_in_genotypes_matrix )

(94, 7)

#### Merge lineage/sub-lineage, diagnostic allele and diagnostic allele codes (from the barcode SNP annotation above) information with barcode SNPs annotation

In [36]:
barcode_SNPs_in_genotypes_matrix = barcode_SNPs_in_genotypes_matrix.merge(barcode_SNP_annot.loc[: , ['diag_allele' , 'diag_allele_code' , '#lineage' , 'pos']] , how = 'inner' , on = 'pos')

In [37]:
barcode_SNPs_in_genotypes_matrix.head()

Unnamed: 0,pos,ref,alt,gene_id,gene_pos,gene_category,gene_name,diag_allele,diag_allele_code,#lineage
0,324,C,[G],Rv0001,323.0,Essential,dnaA,G,2,4.2.1.1.1.2.1.1.1.2
1,15177,C,"[T, G]",Rv0013,263.0,Non-Essential,trpG,G,2,1.1.1.2
2,15890,G,[A],Rv0014c,300.0,Essential,pknB,A,0,2.2.2
3,16526,C,[T],Rv0014c,936.0,Essential,pknB,T,3,1.2.1.1
4,16727,G,[A],Rv0014c,1137.0,Essential,pknB,A,0,3.2.2


#### Subset Genotypes Matrix to barcode SNPs

In [38]:
genotypes_array_barcode_SNPs = genotypes_array[ np.array(barcode_SNPs_filter) , :]

In [39]:
np.shape(genotypes_array_barcode_SNPs)

(94, 12)

In [40]:
genotypes_array_barcode_SNPs

array([[1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 1, 1, 1],
       [2, 2, 2, ..., 2, 2, 2],
       ...,
       [1, 1, 1, ..., 1, 1, 1],
       [2, 2, 2, ..., 2, 2, 2],
       [2, 2, 2, ..., 2, 2, 2]], dtype=int8)

### Iterate through each isolate and assign a lineage/sub-lineage based off of the most specific lineage call made for that isolate

In [41]:
#create DataFrame to store the lineage call for each isolate (stays NaN when no barcode SNP matches)
lineage_calls_for_isolates_DF = pd.DataFrame(index = isolate_annotation_DF.index , columns = ['lineage_call'])

#diagnostic allele code for every barcode SNP
#NOTE(review): the comparison below assumes these rows are in the same order as the rows of genotypes_array_barcode_SNPs - confirm
diagnostic_allele_codes = barcode_SNPs_in_genotypes_matrix.diag_allele_code.values

for isolate_i in list(isolate_annotation_DF.index):

    genotypes_for_isolate = genotypes_array_barcode_SNPs[: , isolate_i]

    #barcode SNPs at which this isolate carries the diagnostic allele
    carries_diagnostic_allele = genotypes_for_isolate == diagnostic_allele_codes

    #lineages typed by those SNPs, sorted by length of the lineage name...
    #global lineage first, then sublineage, then sub-sub-lineage
    lineage_calls = sorted( barcode_SNPs_in_genotypes_matrix[carries_diagnostic_allele].loc[: , '#lineage'] , key=len )

    #nothing to record if no diagnostic allele was matched
    if len(lineage_calls) == 0:
        continue

    #the longest lineage name is taken as the 'most specific' lineage call
    most_specific_lineage_call = lineage_calls[-1]

    if (most_specific_lineage_call == 'BOV_AFRI') and (len(lineage_calls) >= 2):
        #Bovis-Africa was called together with at least one other lineage: the second-longest call is stored
        #NOTE(review): this replaces 'BOV_AFRI' with a shorter (less specific) call - confirm this is intended
        lineage_calls_for_isolates_DF.iloc[isolate_i , 0] = lineage_calls[-2]

    else:
        #'BOV', a lone 'BOV_AFRI' and every other lineage call are stored as they are
        lineage_calls_for_isolates_DF.iloc[isolate_i , 0] = most_specific_lineage_call

In [42]:
lineage_calls_for_isolates_DF.head()

Unnamed: 0,lineage_call
0,2.2.1.1.1
1,2.2.1.1.1.i3
2,4.2.1.2.1.1.i4.1
3,2.2.1.1.1.i3
4,2.2.1.1.1


#### Append Isolate Annotation Series to Lineage/Sub-lineage assignments

In [43]:
lineage_calls_for_isolates_DF['isolate_ID'] = list( isolate_annotation_DF )
isolate_annotation_DF = lineage_calls_for_isolates_DF.copy()

In [89]:
isolate_annotation_DF.head()

Unnamed: 0,lineage_call,isolate_ID
0,2.2.1.1.1,168-19
1,2.2.1.1.1.i3,622-19
2,4.2.1.2.1.1.i4.1,655-19
3,2.2.1.1.1.i3,IT1070
4,2.2.1.1.1,IT123


In [47]:
np.shape(isolate_annotation_DF)

(12, 2)

Save Isolate Annotation file

In [83]:
isolate_annotation_DF.to_pickle('/n/data1/hms/dbmi/farhat/Roger/mmpR_BDQ_mutant_project/eis_promoter_mutant_strains_from_Milan/Genotypes/Genotypes_Filtered/genotypes_isolate_annotation.pkl')

#### Check to see that all isolates were genotyped for the *diagnostic allele* that types sub-lineage *2.2.1.1.1.i3*

In [49]:
barcode_SNPs_in_genotypes_matrix[barcode_SNPs_in_genotypes_matrix.loc[:, '#lineage'] == '2.2.1.1.1.i3']

Unnamed: 0,pos,ref,alt,gene_id,gene_pos,gene_category,gene_name,diag_allele,diag_allele_code,#lineage
90,4238675,C,[T],Rv3792,743.0,Essential,aftA,T,3,2.2.1.1.1.i3


In [52]:
genotypes_array_barcode_SNPs[90, :]

array([1, 3, 1, 3, 1, 1, 3, 1, 3, 3, 3, 1], dtype=int8)

####################################################################################################################################################################################

# [4] Process INDEL Genotypes

####################################################################################################################################################################################

In [4]:
strain_tags = ['168-19','622-19','655-19','IT1070','IT123','IT184','IT233','IT524','IT634','IT77','IT947','IT952']

In [5]:
len(strain_tags)

12

## [4.1] This section of the notebook prepares the script that iterates over all of the VCF files in rollingDB and outputs a genotype vector.

In [6]:
import vcf
import os
import pandas as pd
import numpy as np
import sys

#### load INDEL-index annotation file (rows of Genotype Matrix for 31.5k strains) for indexing genotypes

In [53]:
#load INDEL annotation file (rows of Genotype Matrix) with gene annotation information
INDEL_annotation_DF = pd.read_pickle('/n/data1/hms/dbmi/farhat/Roger/homoplasy_project/rolling_DB_scrape_indels/Genotypes_Filtered_2/genotypes_INDEL_functional_annotation.pkl')
INDEL_annotation_DF.reset_index(inplace = True , drop = False)

#subset to columns that are needed
INDEL_annotation_DF = INDEL_annotation_DF.loc[:, ['key','pos','ref','alt']]

In [54]:
INDEL_annotation_DF.head()

Unnamed: 0,key,pos,ref,alt
0,ACCGACGAAG_313_A,313,ACCGACGAAG,A
1,TC_1549_T,1549,TC,T
2,T_1552_TAA,1552,T,TAA
3,TAA_1552_T,1552,TAA,T
4,T_1552_TA,1552,T,TA


#### Convert INDEL annotation into a dictionary with keys = INDEL identifiers and values = ROW indices of genotype array

In [55]:
#keys: INDEL identifier , values: Genotype Row Index
INDEL_dict = dict( zip( list(INDEL_annotation_DF.key) , list(INDEL_annotation_DF.index) ) )

#### Create set of H37Rv Reference Positions that are INDEL sites

In [56]:
H37Rv_ref_positions_of_interest = set(INDEL_annotation_DF.pos)

#### Construct 'skeleton' Numpy Array that will store the genotype call (coded as below) for all INDELs of interest for a single isolate in rollingDB

- **0** INDEL not called in isolate (but good quality call otherwise)
- **1** INDEL called in isolate (Reference & Alternate Alleles match) 
- **9** for Bad Quality calls

#### In constructing the Genotype Vector, we're going to iterate through all Reference Positions of interest and find the supported variant call from the VCF file for each position, assume bad quality Base Call/Missing Data unless evidence from VCF file supports a high-quality variant call

In [57]:
#number of INDELs we're interested in (where High Quality INDEL calls were made)
number_of_INDELs = len(INDEL_dict.keys()) 

In [58]:
number_of_INDELs

50260

### *Function* to get INDEL genotypes array from VCF file

In [59]:
def get_genotypes_array_from_full_VCF(isolate_ID):
    """
    Genotype one isolate at every INDEL of interest and save the genotype vector to disk.

    The isolate's Pilon full VCF (<isolate_ID>/pilon/<isolate_ID>_full.vcf.gz) is scanned and
    one genotype per INDEL in INDEL_dict is stored:

        0 - INDEL not called in isolate (but good quality call otherwise)
        1 - INDEL called in isolate (Reference & Alternate Alleles match)
        9 - Bad Quality call / Missing Data (default when the VCF gives no high-quality evidence)

    INPUT - isolate_ID : isolate tag (string) ; used to build both the VCF path and the output path

    Uses the notebook-level variables number_of_INDELs, INDEL_dict, INDEL_annotation_DF and
    H37Rv_ref_positions_of_interest.

    OUTPUT - nothing is returned. The genotype vector is written to
    Genotypes_per_isolate/<isolate_ID>/<isolate_ID>_genotypes.npy (an existing output directory for
    the isolate is deleted and re-created first) and isolate_ID is printed when done.
    An error is raised by open() if the VCF file cannot be opened.
    """

    #local import: shutil is not among the imports visible at the top of the notebook
    import shutil

    ##################################################################
    # In constructing the Genotype Vector, we're going to iterate through all Reference Positions of interest and find the supported variant call from the VCF file for each position
    #create a numpy array to store genotypes ; every INDEL starts as 9 (bad quality / missing) until the VCF shows otherwise
    genotypes_array = np.full(number_of_INDELs , 9 , dtype = int)

    #group the INDEL keys by reference position once (same order as INDEL_annotation_DF) so that each
    #VCF record needs a dictionary lookup instead of a scan of the whole annotation DataFrame
    INDEL_keys_by_ref_pos = {}
    for INDEL_i_key , INDEL_i_pos in zip(INDEL_annotation_DF.key , INDEL_annotation_DF.pos):
        INDEL_keys_by_ref_pos.setdefault(INDEL_i_pos , []).append(INDEL_i_key)

    ##################################################################
    # Iterate through the VCF file corresponding to the isolate and fill in genotypes vector

    #genomic data directory
    VCF_genotypes_dir = '/n/data1/hms/dbmi/farhat/lfreschi/repos/megapipe/megapipe_snakemake/results_mmpR_eis/'

    #directory that stores files for each sequenced isolate
    directory_for_sequenced_isolate = VCF_genotypes_dir + isolate_ID + '/pilon/'

    #open the gzip-compressed VCF in binary mode ; the 'with' block closes the handle even if parsing fails
    with open(directory_for_sequenced_isolate  + isolate_ID + '_full.vcf.gz' , 'rb') as vcf_file:

        vcf_reader = vcf.Reader(vcf_file)

        #iterate through each Variant Call
        for record in vcf_reader:

            #skip records whose H37Rv Reference Position is not one of the INDEL sites of interest
            if record.POS not in H37Rv_ref_positions_of_interest:
                continue

            ref_pos = int( record.POS )
            ref_allele = str( record.REF )  #allele on H37Rv
            alt_allele = str( record.ALT[0] ) #alternate allele supported by reads
            quality_metrics = record.INFO.keys()
            variant_i_key = ref_allele + '_' + str(ref_pos) + '_' + alt_allele #variant key, unique identifier for a specific variant

            #get all the relevant INDEL keys for possible INDELs @ this reference position
            INDEL_keys_at_ref_pos = INDEL_keys_by_ref_pos.get(ref_pos , [])

            #BOOLEAN for whether INDEL was found
            INDEL_FOUND = False

            ###################################################################################################
            #The variant call at this reference position was high quality and was an INDEL of interest
            ###################################################################################################
            #call is a PASS by Pilon ; there's exactly 1 alternate allele ; call supports the alternate allele
            passed_with_single_alt = (record.FILTER == []) and (len(record.ALT) == 1) and (record.ALT != [None])

            #variant is INDEL (either Reference or Alternate Allele is > 1 base but not both ; 1 allele is 1bp, the other allele is > 1bp)
            is_INDEL = ( (len(ref_allele) > 1) or (len(alt_allele) > 1) ) and ( (len(ref_allele) == 1) or (len(alt_allele) == 1) )

            #keep only INDELs <= 10bp long (1 allele is 1bp ; the other allele is <= 11bp)
            is_short_enough = (len(ref_allele) <= 11) and (len(alt_allele) <= 11)

            #analyze only INDELs with Depth metrics (larger structural variants don't have this)
            has_depth_metrics = ('MQ' in quality_metrics) and ('DC' in quality_metrics) and ('IC' in quality_metrics) and ('TD' in quality_metrics) and ('DP' in quality_metrics)

            if passed_with_single_alt and is_INDEL and is_short_enough and has_depth_metrics:

                ##### Retrieve Relevant information for filtering quality of Base Call #####
                # Mean Mapping Quality @ locus
                MQ = record.INFO['MQ']
                # Number of Reads w/ Deletion
                DC = record.INFO['DC']
                # Number of Reads w/ Insertion
                IC = record.INFO['IC']
                # Depth of Valid Reads in Pileup
                VD = record.INFO['DP']
                # Total Depth in Pileup
                TD = record.INFO['TD']

                ### Filtering Criteria

                #---> Mean Mapping Quality > 30
                #---> Number of High Quality Reads >= 20
                if (MQ > 30) and (VD >= 20): #INDEL passed filtering criteria!

                    #calculate INDEL (Alternate Allele) Frequency from Deletion/Insertion Count & Total Depth (all metrics include ALL reads)
                    alt_allele_frequency = float(max(DC , IC)) / float(TD)

                    # use only consensus INDELs (where alternate allele frequency >= 75%) that are among the INDELs of interest
                    if (alt_allele_frequency >= 0.75) and (variant_i_key in INDEL_dict):

                        #BOOLEAN for whether INDEL was found
                        INDEL_FOUND = True

                        #store a '1' in Genotype Vector (INDEL index) referencing that the INDEL is supported in this isolate
                        genotypes_array[INDEL_dict[variant_i_key]] = 1

                        #store a '0' in Genotype Vector (INDEL index) for all other INDELs @ this reference position (if there were multiple indels here)
                        #(list.remove() returns None, so the other INDELs are selected by comparing keys instead)
                        for INDEL_i_key in INDEL_keys_at_ref_pos:

                            if INDEL_i_key != variant_i_key:

                                genotypes_array[INDEL_dict[INDEL_i_key]] = 0

            ###################################################################################################
            #The variant call at this reference position was high quality but was not an INDEL of interest
            ###################################################################################################
            if (INDEL_FOUND == False) and (record.FILTER == []) and ('MQ' in quality_metrics) and ('DP' in quality_metrics):

                ##### Retrieve Relevant information for filtering quality of Base Call #####
                # Mean Mapping Quality @ locus
                MQ = record.INFO['MQ']
                # Depth of Valid Reads in Pileup
                VD = record.INFO['DP']

                ### Filtering Criteria

                #---> Mean Mapping Quality > 30
                #---> Number of High Quality Reads >= 20
                if (MQ > 30) and (VD >= 20): #variant passed filtering criteria!

                    #store a '0' in Genotype Vector (INDEL index) for all INDELs @ this reference position
                    for INDEL_i_key in INDEL_keys_at_ref_pos:

                        genotypes_array[INDEL_dict[INDEL_i_key]] = 0

    ##################################################################
    # Create (or wipe and re-create) the directory that stores genotyping output for this isolate
    isolate_genotype_dir = '/n/data1/hms/dbmi/farhat/Roger/mmpR_BDQ_mutant_project/eis_promoter_mutant_strains_from_Milan/Genotypes_indels/Genotypes_per_isolate/' + isolate_ID
    if os.path.exists(isolate_genotype_dir):
        shutil.rmtree(isolate_genotype_dir)
    os.makedirs(isolate_genotype_dir)

    ##################################################################
    # Save Genotypes Array for downstream analysis (np.save appends the '.npy' extension)
    np.save(isolate_genotype_dir + '/' + isolate_ID + '_genotypes' , genotypes_array , allow_pickle = True)

    print(isolate_ID)

In [60]:
#genotype every isolate in strain_tags ; each call writes that isolate's genotypes array to disk and prints its ID
#NOTE(review): within this section strain_tags is assigned in a later cell (In [68]) -- confirm it is also defined before this cell so the notebook survives Restart & Run All
for isolate_ID in strain_tags:
    get_genotypes_array_from_full_VCF(isolate_ID)

168-19
622-19
655-19
IT1070
IT123
IT184
IT233
IT524
IT634
IT77
IT947
IT952


## [4.2] Construct Genotype Matrix (from genotype vectors)

In [67]:
%matplotlib inline
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import matplotlib.ticker as ticker
from itertools import compress
from matplotlib.colors import LogNorm
from matplotlib import gridspec
import pickle

In [68]:
#tags of the isolates to process ; this order is reused when the per-isolate genotype arrays are loaded
strain_tags = [
    '168-19',
    '622-19',
    '655-19',
    'IT1070',
    'IT123',
    'IT184',
    'IT233',
    'IT524',
    'IT634',
    'IT77',
    'IT947',
    'IT952',
]

In [69]:
len(strain_tags)

12

### Make sure all isolates have a corresponding _genotypes_ array

In [70]:
#wrap the strain tags in a Series ; the genotype-loading loop further down iterates over it
#NOTE(review): nothing in this cell verifies that a genotypes file exists for each isolate -- a missing file would only surface when it is loaded
isolate_id_series = pd.Series(strain_tags)

In [71]:
isolate_id_series.head()

0    168-19
1    622-19
2    655-19
3    IT1070
4     IT123
dtype: object

In [72]:
np.shape(isolate_id_series)

(12,)

### Concatenate genotypes arrays into a genotypes matrix for all isolates in rollingDB

Store the __genotypes array__ for each isolate into a list

In [73]:
genotypes_directory = '/n/data1/hms/dbmi/farhat/Roger/mmpR_BDQ_mutant_project/eis_promoter_mutant_strains_from_Milan/Genotypes_indels/Genotypes_per_isolate/'
genotype_arrays_per_isolate_list = []

#iterate through the genotypes arrays for each isolate (the list keeps the order of isolate_id_series)
for isolate_ID in isolate_id_series:

    #load genotypes array from <genotypes_directory>/<isolate_ID>/<isolate_ID>_genotypes.npy
    genotypes_array = np.load(os.path.join(genotypes_directory , isolate_ID , isolate_ID + '_genotypes.npy'))

    #convert genotypes array from dtype 'int64' to 'int8' to save lots of memory
    genotypes_array = genotypes_array.astype('int8')

    #store genotypes array for this isolate in a list with all other genotypes arrays
    genotype_arrays_per_isolate_list.append(genotypes_array)

Stack genotype arrays for all isolates to create one __genotypes matrix__ for all isolates 

In [74]:
#place the 1-D genotypes arrays side by side: each isolate becomes one column of the matrix
genotypes_matrix = np.stack(genotype_arrays_per_isolate_list , axis = 1)

In [75]:
np.shape(genotypes_matrix)

(50260, 12)

In [76]:
genotypes_matrix

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 9, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int8)

In [77]:
#memory footprint of the matrix (nbytes = number of elements x bytes per element)
print("%d bytes" % genotypes_matrix.nbytes)

603120 bytes


Genotypes Matrix is ~0.6 MB

Save __Genotypes Matrix__ in case kernel crashes when filtering

In [78]:
#write the stacked Genotypes Matrix to disk (np.save appends the '.npy' extension)
np.save(
    '/n/data1/hms/dbmi/farhat/Roger/mmpR_BDQ_mutant_project/eis_promoter_mutant_strains_from_Milan/Genotypes_indels/Genotypes_Filtered/genotypes_matrix',
    genotypes_matrix,
    allow_pickle = True,
)

Load __INDEL annotation file__

In [79]:
#INDEL annotation table with gene annotation information (rows of Genotype Matrix) ; its index is moved into a regular column
INDEL_annotation_DF = pd.read_pickle(
    '/n/data1/hms/dbmi/farhat/Roger/homoplasy_project/rolling_DB_scrape_indels/Genotypes_Filtered_2/genotypes_INDEL_functional_annotation.pkl'
).reset_index(drop = False)

In [80]:
INDEL_annotation_DF.head()

Unnamed: 0,key,pos,ref,alt,gene_category,gene_name,gene_id,gene_pos,ins_del,INDEL_type,codon_pos
0,ACCGACGAAG_313_A,313,ACCGACGAAG,A,Essential,dnaA,Rv0001,313.0,del,inframe,105.0
1,TC_1549_T,1549,TC,T,,,Rv0001_Rv0002,,del,frameshift,
2,T_1552_TAA,1552,T,TAA,,,Rv0001_Rv0002,,ins,frameshift,
3,TAA_1552_T,1552,TAA,T,,,Rv0001_Rv0002,,del,frameshift,
4,T_1552_TA,1552,T,TA,,,Rv0001_Rv0002,,ins,frameshift,


In [81]:
np.shape(INDEL_annotation_DF)

(50260, 11)

####################################################################################################################################################################################

# [5] How to load SNP & INDEL genotypes for extra strains

####################################################################################################################################################################################

### Isolate Annotation DataFrame

Columns of Genotype Matrix

In [86]:
#load isolate annotation table (columns of Genotype Matrix) ; the displayed frame has lineage_call and isolate_ID columns
isolate_annotation_DF = pd.read_pickle('/n/data1/hms/dbmi/farhat/Roger/mmpR_BDQ_mutant_project/eis_promoter_mutant_strains_from_Milan/Genotypes/Genotypes_Filtered/genotypes_isolate_annotation.pkl')

In [87]:
isolate_annotation_DF

Unnamed: 0,lineage_call,isolate_ID
0,2.2.1.1.1,168-19
1,2.2.1.1.1.i3,622-19
2,4.2.1.2.1.1.i4.1,655-19
3,2.2.1.1.1.i3,IT1070
4,2.2.1.1.1,IT123
5,2.2.1.1.1,IT184
6,2.2.1.1.1.i3,IT233
7,4.2.1.1.1.1.1.1.i2,IT524
8,2.2.1.1.1.i3,IT634
9,2.2.1.1.1.i3,IT77


In [88]:
np.shape(isolate_annotation_DF)

(12, 2)

### SNP Genotypes

Rows of Genotype Matrix

In [7]:
#load SNP annotation file (rows of Genotype Matrix) with gene annotation information
#NOTE(review): in the displayed head the 'alt' column appears to hold a list of alternate alleles per site -- confirm before comparing it to single bases
SNP_annotation_DF = pd.read_pickle('/n/data1/hms/dbmi/farhat/Roger/homoplasy_project/rolling_DB_scrape/Genotypes_Filtered_2/genotypes_SNP_annotation.pkl')

In [8]:
SNP_annotation_DF.head()

Unnamed: 0,pos,ref,alt,gene_id,gene_pos,gene_category,gene_name
0,48,C,[T],Rv0001,47.0,Essential,dnaA
1,64,G,[C],Rv0001,63.0,Essential,dnaA
2,67,G,"[A, T]",Rv0001,66.0,Essential,dnaA
3,69,C,[T],Rv0001,68.0,Essential,dnaA
4,71,C,[T],Rv0001,70.0,Essential,dnaA


In [9]:
np.shape(SNP_annotation_DF)

(782565, 7)

Genotype Matrix

In [93]:
#load SNP Genotypes Matrix (2-D ; displayed shape is 782565 x 12, matching the rows of SNP_annotation_DF and the 12 isolates)
#NOTE(review): the name genotypes_array is assigned again when the INDEL matrix is loaded in a later cell, which replaces this SNP matrix
genotypes_array =  np.load('/n/data1/hms/dbmi/farhat/Roger/mmpR_BDQ_mutant_project/eis_promoter_mutant_strains_from_Milan/Genotypes/Genotypes_Filtered/genotypes_matrix.npy')

In [94]:
genotypes_array

array([[1, 1, 1, ..., 1, 1, 1],
       [2, 2, 2, ..., 2, 2, 2],
       [2, 2, 2, ..., 2, 2, 2],
       ...,
       [1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 1, 1, 1]], dtype=int8)

In [95]:
np.shape(genotypes_array)

(782565, 12)

### INDEL Genotypes

Rows of Genotype Matrix

In [96]:
#INDEL annotation table with gene annotation information (rows of Genotype Matrix) ; its index is moved into a regular column
INDEL_annotation_DF = pd.read_pickle(
    '/n/data1/hms/dbmi/farhat/Roger/homoplasy_project/rolling_DB_scrape_indels/Genotypes_Filtered_2/genotypes_INDEL_functional_annotation.pkl'
).reset_index(drop = False)

In [97]:
INDEL_annotation_DF.head()

Unnamed: 0,key,pos,ref,alt,gene_category,gene_name,gene_id,gene_pos,ins_del,INDEL_type,codon_pos
0,ACCGACGAAG_313_A,313,ACCGACGAAG,A,Essential,dnaA,Rv0001,313.0,del,inframe,105.0
1,TC_1549_T,1549,TC,T,,,Rv0001_Rv0002,,del,frameshift,
2,T_1552_TAA,1552,T,TAA,,,Rv0001_Rv0002,,ins,frameshift,
3,TAA_1552_T,1552,TAA,T,,,Rv0001_Rv0002,,del,frameshift,
4,T_1552_TA,1552,T,TA,,,Rv0001_Rv0002,,ins,frameshift,


In [98]:
np.shape(INDEL_annotation_DF)

(50260, 11)

Genotype Matrix

In [99]:
#load INDEL Genotypes Matrix (2-D ; displayed shape is 50260 x 12, matching the rows of INDEL_annotation_DF and the 12 isolates)
#NOTE(review): this assignment reuses the name genotypes_array that held the SNP matrix in an earlier cell, replacing it
genotypes_array =  np.load('/n/data1/hms/dbmi/farhat/Roger/mmpR_BDQ_mutant_project/eis_promoter_mutant_strains_from_Milan/Genotypes_indels/Genotypes_Filtered/genotypes_matrix.npy')

In [100]:
genotypes_array

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 9, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int8)

In [101]:
np.shape(genotypes_array)

(50260, 12)