# This notebook was created to (a) assign a lineage to each isolate via SNP-barcoding and (b) *re-filter* out SNPs whose minor allele(s) occur in < 1 isolate (i.e. sites that are no longer variable once isolates have been dropped)

In [1]:
#widen the notebook container to the full browser width so wide outputs are not clipped
#(display / HTML are imported from the public IPython.display module; importing them from IPython.core.display is deprecated)
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
import vcf

%matplotlib inline
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import matplotlib.ticker as ticker
from itertools import compress
from pylab import MaxNLocator
import seaborn as sns; sns.set()
from matplotlib.colors import LogNorm
from matplotlib import gridspec
import ast
import itertools
import seaborn as sns
from sklearn.preprocessing import StandardScaler

import fastcluster
from sklearn import cluster, datasets
import scipy.cluster.hierarchy as hier
from sklearn.cluster import KMeans
import time
import sys
import pickle

import Bio
from Bio.Alphabet import IUPAC
from Bio.Blast.Applications import NcbiblastnCommandline
from Bio.Blast import NCBIXML
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.SeqFeature import SeqFeature, FeatureLocation
from Bio import pairwise2
from Bio import SeqIO
from Bio.Graphics import GenomeDiagram
from Bio.SeqUtils import GC

from Bio.Align.Applications import MuscleCommandline
from StringIO import StringIO
from Bio import AlignIO
from Bio.Align import AlignInfo
from Bio.Seq import MutableSeq
import itertools
import gzip

import networkx as nx
import scipy

####################################################################################################################################################################################

## [1] Load SNP genotype matrix and Annotation Files

####################################################################################################################################################################################

In [3]:
#directory that holds the Genotypes Matrix and its row / column annotation files (edit here to point the notebook at another copy)
GENOTYPES_FILTERED_DIR = '/n/data1/hms/dbmi/farhat/Roger/homoplasy_project/rolling_DB_scrape/Genotypes_Filtered'

#NOTE: pd.read_pickle can execute arbitrary code while unpickling - only load files that were written by this pipeline

#load isolate annotation file (columns of Genotype Matrix)
isolate_annotation_DF = pd.read_pickle(os.path.join(GENOTYPES_FILTERED_DIR , 'genotypes_isolate_annotation.pkl'))

#load SNP annotation file (rows of Genotype Matrix) with gene annotation information
SNP_annotation_DF = pd.read_pickle(os.path.join(GENOTYPES_FILTERED_DIR , 'genotypes_SNP_annotation_with_gene_info.pkl'))

#load Genotypes Matrix
genotypes_array = np.load(os.path.join(GENOTYPES_FILTERED_DIR , 'genotypes_matrix.npy'))

Columns of Genotype Matrix

In [4]:
isolate_annotation_DF.head()

0    SAMEA3558733
1    SAMN03648641
2    SAMN03647419
3    SAMEA3671418
4    SAMN07659096
dtype: object

In [5]:
np.shape(isolate_annotation_DF)

(32210,)

Rows of Genotype Matrix

In [6]:
SNP_annotation_DF.head()

Unnamed: 0,pos,ref,alt,gene_id,gene_pos,gene_category,gene_name
0,44,C,[T],Rv0001,43.0,Essential,dnaA
1,48,C,[T],Rv0001,47.0,Essential,dnaA
2,64,G,[C],Rv0001,63.0,Essential,dnaA
3,67,G,"[A, T]",Rv0001,66.0,Essential,dnaA
4,69,C,[T],Rv0001,68.0,Essential,dnaA


In [7]:
np.shape(SNP_annotation_DF)

(835979, 7)

Genotype Matrix

In [8]:
genotypes_array

array([[1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 1, 1, 1],
       [2, 2, 2, ..., 2, 2, 2],
       ...,
       [1, 9, 9, ..., 1, 1, 1],
       [1, 9, 1, ..., 1, 1, 1],
       [1, 9, 1, ..., 1, 1, 1]], dtype=int8)

In [9]:
np.shape(genotypes_array)

(835979, 32210)

####################################################################################################################################################################################

## [2] Import  Luca's SNP barcode

####################################################################################################################################################################################

#### Import 96-SNP barcode from Luca's work

In [88]:
#tab-separated table with one row per lineage-defining (barcode) SNP
barcode_scheme_path = '/home/lf61/lf61/mic_assemblies/40-full-analysis/lin-sp-var-10k/results/validation_resistant_isolates/snp_scheme_freschi_experimental.tsv'
all_barcode_diagnostic_SNPs = pd.read_csv(barcode_scheme_path , sep = '\t')

In [89]:
all_barcode_diagnostic_SNPs.head()

Unnamed: 0,#lineage,position,gene_coord,allele_change,codon_number,codon_change,aa_change,locus_id,gene_name
0,3.1.1,635139,,G/C,,,,,
1,3,342873,,C/T,,,,,
2,3.2.2,16727,,G/A,,,,,
3,3.2,63850,,T/C,,,,,
4,3.1,4044872,,G/A,,,,,


Drop redundant SNP

In [95]:
all_barcode_diagnostic_SNPs[all_barcode_diagnostic_SNPs.loc[: , '#lineage'] == '4.2.1.2.1']

Unnamed: 0,#lineage,position,gene_coord,allele_change,codon_number,codon_change,aa_change,locus_id,gene_name
20,4.2.1.2.1,63771,,C/T,,,,,


In [92]:
all_barcode_diagnostic_SNPs[all_barcode_diagnostic_SNPs.loc[: , '#lineage'] == '4.2.1.2']

Unnamed: 0,#lineage,position,gene_coord,allele_change,codon_number,codon_change,aa_change,locus_id,gene_name
27,4.2.1.2,63771,,C/T,,,,,


In [93]:
#'4.2.1.2' is listed at the same reference position (63771) as '4.2.1.2.1', so discard the '4.2.1.2' row
is_redundant_barcode_row = all_barcode_diagnostic_SNPs['#lineage'] == '4.2.1.2'
all_barcode_diagnostic_SNPs = all_barcode_diagnostic_SNPs[~is_redundant_barcode_row]

In [94]:
np.shape(all_barcode_diagnostic_SNPs)

(96, 9)

#### Subset DataFrame to SNPs that we have present in our Genotypes Matrix

In [96]:
#reference positions present in our Genotypes Matrix, collected once into a set so that every
#membership test below is a constant-time lookup (instead of rebuilding and scanning a list per barcode SNP)
genotyped_positions = set(SNP_annotation_DF.pos)

barcode_SNPs_in_Genotypes_Matrix = []
barcode_SNPs_not_in_Genotypes_Matrix = []

#iterate through each barcode SNP (after dropping the redundant one) and find which ones are present in our dataset
for SNP_i in all_barcode_diagnostic_SNPs.index:

    if int( all_barcode_diagnostic_SNPs.loc[SNP_i , 'position'] ) in genotyped_positions:

        barcode_SNPs_in_Genotypes_Matrix.append(SNP_i)

    else:

        barcode_SNPs_not_in_Genotypes_Matrix.append(SNP_i)

#subset to SNPs that we have in Genotypes Matrix
barcode_diagnostic_SNPs = all_barcode_diagnostic_SNPs.loc[barcode_SNPs_in_Genotypes_Matrix , :]

#barcode SNPs whose position is absent from the Genotypes Matrix
barcode_diagnostic_SNPs_missing = all_barcode_diagnostic_SNPs.loc[barcode_SNPs_not_in_Genotypes_Matrix , :]

In [97]:
barcode_diagnostic_SNPs.head()

Unnamed: 0,#lineage,position,gene_coord,allele_change,codon_number,codon_change,aa_change,locus_id,gene_name
0,3.1.1,635139,,G/C,,,,,
1,3,342873,,C/T,,,,,
2,3.2.2,16727,,G/A,,,,,
3,3.2,63850,,T/C,,,,,
4,3.1,4044872,,G/A,,,,,


In [98]:
np.shape(barcode_diagnostic_SNPs)

(94, 9)

In [99]:
barcode_diagnostic_SNPs_missing.head()

Unnamed: 0,#lineage,position,gene_coord,allele_change,codon_number,codon_change,aa_change,locus_id,gene_name
8,3.1.2,17842,,G/C,,,,,
17,4.2.1.1.1.2.1.1.1.1.2,126249,,C/A,,,,,


In [100]:
np.shape(barcode_diagnostic_SNPs_missing)

(2, 9)

The original 96-SNP barcode outlined in Luca's analysis corresponds to 96 branches; we have 94 of the 96 SNPs in our Genotypes Matrix.

#### Format DataFrame with 94 lineage-defining SNPs

In [107]:
#split every 'REF/ALT' allele change string once into its [reference , alternate] bases
ref_alt_pairs = [allele_change.split('/') for allele_change in barcode_diagnostic_SNPs['allele_change']]

#start from the lineage label and reference position of each barcode SNP
barcode_SNP_annot = barcode_diagnostic_SNPs[['#lineage' , 'position']].copy()

#Reference & Alternate allele columns replace the Allele Change column
barcode_SNP_annot['ref'] = [ref_alt[0] for ref_alt in ref_alt_pairs]
barcode_SNP_annot['alt'] = [ref_alt[1] for ref_alt in ref_alt_pairs]

#rename the reference position column and renumber the rows from 0
barcode_SNP_annot = barcode_SNP_annot.rename(columns = {"position": "pos"}).reset_index(drop = True)

In [108]:
np.shape(barcode_SNP_annot)

(94, 4)

In [109]:
barcode_SNP_annot.head()

Unnamed: 0,#lineage,pos,ref,alt
0,3.1.1,635139,G,C
1,3,342873,C,T
2,3.2.2,16727,G,A
3,3.2,63850,T,C
4,3.1,4044872,G,A


#### Create a column that contains the diagnostic allele and another that contains the base code for the diagnostic allele

__Note__: The diagnostic allele is the Reference Allele (not the Alternate Allele) for these two SNPs

In [110]:
barcode_SNP_annot[barcode_SNP_annot.pos == 931123]

Unnamed: 0,#lineage,pos,ref,alt
90,4**,931123,T,C


In [111]:
#strip the '**' flag from the lineage label of the barcode SNP at reference position 931123
#(the row is selected by position rather than by the hard-coded row label 90, which would silently hit the wrong SNP if the table changed)
barcode_SNP_annot.loc[barcode_SNP_annot.pos == 931123 , '#lineage'] = '4'

In [112]:
barcode_SNP_annot[barcode_SNP_annot.pos == 1759252]

Unnamed: 0,#lineage,pos,ref,alt
89,4.2.1.1.1.1.1.1.i2**,1759252,G,T


In [113]:
#strip the '**' flag from the lineage label of the barcode SNP at reference position 1759252
#(the row is selected by position rather than by the hard-coded row label 89, which would silently hit the wrong SNP if the table changed)
barcode_SNP_annot.loc[barcode_SNP_annot.pos == 1759252 , '#lineage'] = '4.2.1.1.1.1.1.1.i2'

In [114]:
#numeric code used for each base
base_to_code_map = {'A':0 , 'C':1 , 'G':2 , 'T':3}

#reference positions of the two barcode SNPs whose diagnostic allele is the reference allele;
#for every other barcode SNP the alternate allele is the diagnostic one
ref_is_diagnostic_positions = [931123 , 1759252]

#diagnostic base of each barcode SNP, in row order
diag_allele = [
    ref_base if SNP_pos in ref_is_diagnostic_positions else alt_base
    for SNP_pos , ref_base , alt_base in zip(barcode_SNP_annot['pos'] , barcode_SNP_annot['ref'] , barcode_SNP_annot['alt'])
]

#numeric code of each diagnostic base (an unexpected base raises a KeyError)
diag_allele_code = [base_to_code_map[diagnostic_base] for diagnostic_base in diag_allele]

#append columns to barcode SNP DF
barcode_SNP_annot['diag_allele'] = diag_allele
barcode_SNP_annot['diag_allele_code'] = diag_allele_code

In [115]:
barcode_SNP_annot.tail()

Unnamed: 0,#lineage,pos,ref,alt,diag_allele,diag_allele_code
89,4.2.1.1.1.1.1.1.i2,1759252,G,T,G,2
90,4,931123,T,C,T,3
91,5,1799921,C,A,A,0
92,6,1816587,C,G,G,2
93,7,1137518,G,A,A,0


####################################################################################################################################################################################

## [3] Assign Isolates to Lineage and Sub-lineages based off of SNP barcode

####################################################################################################################################################################################

#### Subset Genotypes matrix to SNP barcode SNPs

In [127]:
#boolean list (one entry per row of the SNP annotation / Genotypes Matrix) that is True where the SNP position belongs to the barcode
#(Series.isin does a single vectorised membership test instead of rebuilding list(barcode_SNP_annot.pos) for every SNP)
barcode_SNPs_filter = SNP_annotation_DF.pos.isin(barcode_SNP_annot.pos).tolist()

#subset SNP annotation to the annotation for just the SNP barcode
barcode_SNPs_in_genotypes_matrix = SNP_annotation_DF[barcode_SNPs_filter]

In [128]:
barcode_SNPs_in_genotypes_matrix.head()

Unnamed: 0,pos,ref,alt,gene_id,gene_pos,gene_category,gene_name
73,324,C,[G],Rv0001,323.0,Essential,dnaA
4851,15177,C,"[T, G]",Rv0013,263.0,Non-Essential,trpG
5058,15890,G,[A],Rv0014c,300.0,Essential,pknB
5251,16526,C,[T],Rv0014c,936.0,Essential,pknB
5306,16727,G,[A],Rv0014c,1137.0,Essential,pknB


In [129]:
np.shape( barcode_SNPs_in_genotypes_matrix )

(94, 7)

#### Merge lineage/sub-lineage, diagnostic allele and diagnostic allele codes (from the barcode SNP annotation above) information with barcode SNPs annotation

In [130]:
#columns taken from the barcode table; 'pos' is the key the two tables are joined on
barcode_columns_to_merge = ['diag_allele' , 'diag_allele_code' , '#lineage' , 'pos']
barcode_SNPs_in_genotypes_matrix = pd.merge(barcode_SNPs_in_genotypes_matrix , barcode_SNP_annot[barcode_columns_to_merge] , how = 'inner' , on = 'pos')

In [131]:
barcode_SNPs_in_genotypes_matrix.head()

Unnamed: 0,pos,ref,alt,gene_id,gene_pos,gene_category,gene_name,diag_allele,diag_allele_code,#lineage
0,324,C,[G],Rv0001,323.0,Essential,dnaA,G,2,4.2.1.1.1.2.1.1.1.2
1,15177,C,"[T, G]",Rv0013,263.0,Non-Essential,trpG,G,2,1.1.1.2
2,15890,G,[A],Rv0014c,300.0,Essential,pknB,A,0,2.2.2
3,16526,C,[T],Rv0014c,936.0,Essential,pknB,T,3,1.2.1.1
4,16727,G,[A],Rv0014c,1137.0,Essential,pknB,A,0,3.2.2


#### Subset Genotypes Matrix to barcode SNPs

In [132]:
#keep only the rows (SNPs) of the Genotypes Matrix that belong to the barcode
barcode_row_mask = np.array(barcode_SNPs_filter)
genotypes_array_barcode_SNPs = genotypes_array[barcode_row_mask]

In [133]:
np.shape(genotypes_array_barcode_SNPs)

(94, 32210)

In [134]:
genotypes_array_barcode_SNPs

array([[1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 1, 1, 1],
       [2, 2, 2, ..., 2, 2, 2],
       ...,
       [1, 3, 1, ..., 9, 1, 1],
       [2, 2, 2, ..., 2, 2, 2],
       [2, 2, 0, ..., 2, 2, 2]], dtype=int8)

### Iterate through each isolate and assign a lineage/sub-lineage based off of the most specific lineage call made for that isolate

In [142]:
#create DataFrame to store the lineage call for each isolate (one column per level of the lineage hierarchy)
lineage_level_columns = ['lineage_' + str(level_i) for level_i in range(1 , 12)]
lineage_calls_for_isolates_DF = pd.DataFrame(index = isolate_annotation_DF.index , columns = lineage_level_columns)

#loop-invariant barcode columns, pulled out of the DataFrame once instead of once per isolate
barcode_diag_allele_codes = barcode_SNPs_in_genotypes_matrix.diag_allele_code.values
barcode_lineages = barcode_SNPs_in_genotypes_matrix.loc[: , '#lineage'].values

#the barcode genotypes and the barcode annotation are matched row-by-row, and isolates column-by-column
assert genotypes_array_barcode_SNPs.shape[0] == len(barcode_diag_allele_codes) , 'barcode genotype rows do not line up with the barcode SNP annotation'
assert genotypes_array_barcode_SNPs.shape[1] == len(isolate_annotation_DF.index) , 'barcode genotype columns do not line up with the isolate annotation'

#isolate_i is the *position* of the isolate (column of the genotypes matrix / row of the lineage call DF),
#so the loop does not depend on the isolate annotation having a 0..N-1 index
for isolate_i in range(len(isolate_annotation_DF.index)):

    genotypes_for_isolate = genotypes_array_barcode_SNPs[: , isolate_i]

    #find Lineage(s) typed by the isolate (match the genotypes for the isolate to the diagnostic alleles from the SNP barcode and get lineages for matching alleles)
    lineage_calls = list( barcode_lineages[genotypes_for_isolate == barcode_diag_allele_codes] )

    #sort by length of element...global lineage first, then sublineage, then sub-sub-lineage
    #NOTE(review): string length is used as a proxy for depth in the hierarchy; labels such as 'i1' or 'BOV_AFRI'
    #are longer than a single digit, so this could be replaced by the number of '.'-separated levels - confirm before changing
    lineage_calls = sorted(lineage_calls , key=len)

    #no diagnostic allele matched: leave the whole row as NaN
    if len(lineage_calls) == 0:
        continue

    #keep the 'most specific' lineage call
    #NOTE(review): only this call is used below; conflicting calls from different global lineages are not detected here
    most_specific_lineage_call = lineage_calls[-1]

    #check Bovis
    if most_specific_lineage_call == 'BOV':
        lineage_calls_for_isolates_DF.iloc[isolate_i , 0] = 'BOV'

    #check Bovis-Africa
    elif most_specific_lineage_call == 'BOV_AFRI':
        lineage_calls_for_isolates_DF.iloc[isolate_i , 1] = 'BOV_AFRI'

        #if another call was made, record the next-longest one in the global lineage column
        #(when BOV_AFRI is the only call, the global lineage column stays NaN)
        if len(lineage_calls) >= 2:
            lineage_calls_for_isolates_DF.iloc[isolate_i , 0] = lineage_calls[-2]

    #any other lineage call can be split into the global lineage & sub-lineages
    else:
        most_specific_lineage_call_split = most_specific_lineage_call.split('.')

        #fill in one column per level: global lineage, sub-lineage, sub-sub-lineage, ...
        for column_i , lineage_call in enumerate(most_specific_lineage_call_split):
            lineage_calls_for_isolates_DF.iloc[isolate_i , column_i] = lineage_call

In [143]:
lineage_calls_for_isolates_DF.head()

Unnamed: 0,lineage_1,lineage_2,lineage_3,lineage_4,lineage_5,lineage_6,lineage_7,lineage_8,lineage_9,lineage_10,lineage_11
0,4,2,1,2,1.0,1.0,i3,1.0,,,
1,4,2,1,2,2.0,1.0,1,,,,
2,3,1,1,i1,,,,,,,
3,4,2,1,2,1.0,1.0,i1,,,,
4,1,1,1,2,,,,,,,


How many Isolates did not have a high-confidence lineage call (no lineage_1/global lineage designation)?

In [145]:
#count the isolates whose first column (global lineage) is missing, i.e. isolates without any usable lineage call*
## (*possible to have a sub-lineage 'BOV_AFRI' with no global lineage but we're dropping these)
#isnull() is used rather than `is np.nan`: the identity test only recognises the np.nan singleton object
#and silently misses NaN values stored as any other float object
isolate_without_lineage_calls = int( lineage_calls_for_isolates_DF['lineage_1'].isnull().sum() )

In [146]:
#function-call form of print works under both Python 2 and Python 3 (the bare print statement is Python-2-only)
print(isolate_without_lineage_calls)

290


These isolates probably failed to get categorized at the global lineage level because the (global) lineage-defining SNP/Base Call did not meet the Alternate Allele calling filters (BQ, MQ, indel, depth, etc.)

#### Append Isolate Annotation Series to Lineage/Sub-lineage assignments

In [147]:
#isolate IDs, in the same order as the rows of the lineage call DF
isolate_IDs = list( isolate_annotation_DF )
lineage_calls_for_isolates_DF['isolate_ID'] = isolate_IDs

#from here on the isolate annotation is an independent copy of the lineage calls + isolate ID table
isolate_annotation_DF = lineage_calls_for_isolates_DF.copy()

In [148]:
isolate_annotation_DF.head()

Unnamed: 0,lineage_1,lineage_2,lineage_3,lineage_4,lineage_5,lineage_6,lineage_7,lineage_8,lineage_9,lineage_10,lineage_11,isolate_ID
0,4,2,1,2,1.0,1.0,i3,1.0,,,,SAMEA3558733
1,4,2,1,2,2.0,1.0,1,,,,,SAMN03648641
2,3,1,1,i1,,,,,,,,SAMN03647419
3,4,2,1,2,1.0,1.0,i1,,,,,SAMEA3671418
4,1,1,1,2,,,,,,,,SAMN07659096


In [149]:
np.shape(isolate_annotation_DF)

(32210, 12)

####################################################################################################################################################################################

## [4] *FILTER*: Drop Isolates without a Global Lineage Call or that had more than 1 Global Lineage Call

####################################################################################################################################################################################

#### Only keep isolates with a single 'Global Lineage call' (many global lineages were actually inferred from more specific sub-lineage defining SNPs)

Filter out isolates with no lineage calls

In [150]:
#boolean list that is 'True' for all isolates with no global lineage call
#if the first column is missing, then we don't have any lineage calls for this isolate*
## (*possible to have a sub-lineage 'BOV_AFRI' with no global lineage but we're dropping these)
#isnull() is used rather than `is np.nan`: the identity test only recognises the np.nan singleton object
#and silently misses NaN values stored as any other float object
isolates_no_lineage_calls_filter = lineage_calls_for_isolates_DF['lineage_1'].isnull().tolist()

In [151]:
np.sum(isolates_no_lineage_calls_filter) #drop these isolates

290

In [153]:
isolate_annotation_DF[isolates_no_lineage_calls_filter].head()

Unnamed: 0,lineage_1,lineage_2,lineage_3,lineage_4,lineage_5,lineage_6,lineage_7,lineage_8,lineage_9,lineage_10,lineage_11,isolate_ID
39,,,,,,,,,,,,SAMN09102197
95,,,,,,,,,,,,SAMN07658483
107,,,,,,,,,,,,SAMN05276351
162,,,,,,,,,,,,SAMEA4752945
310,,,,,,,,,,,,SAMEA1117730


In [155]:
isolate_annotation_DF[isolates_no_lineage_calls_filter].tail()

Unnamed: 0,lineage_1,lineage_2,lineage_3,lineage_4,lineage_5,lineage_6,lineage_7,lineage_8,lineage_9,lineage_10,lineage_11,isolate_ID
32081,,,,,,,,,,,,SAMN09100465
32097,,,,,,,,,,,,SAMN02584672
32124,,,,,,,,,,,,IDR1400034450
32170,,,,,,,,,,,,SAMN08892469
32178,,,,,,,,,,,,SAMN09100166


Create a boolean filter of all isolates to retain (isolates that have a global lineage call, i.e. a non-missing `lineage_1`)

In [156]:
#an isolate is kept exactly when it was NOT flagged as lacking a global lineage call
isolates_to_keep_filter = []
for lacks_lineage_call in isolates_no_lineage_calls_filter:
    isolates_to_keep_filter.append(not lacks_lineage_call)

In [157]:
np.sum(isolates_to_keep_filter)

31920

In [158]:
#keep only the retained isolates in the Isolate annotation DF and renumber its rows from 0
isolate_annotation_DF = isolate_annotation_DF[isolates_to_keep_filter].reset_index(drop = True)

#apply the same filter to the columns (isolates) of the Genotype Matrix
isolate_column_mask = np.array(isolates_to_keep_filter)
genotypes_array = genotypes_array[: , isolate_column_mask]

In [159]:
isolate_annotation_DF.head()

Unnamed: 0,lineage_1,lineage_2,lineage_3,lineage_4,lineage_5,lineage_6,lineage_7,lineage_8,lineage_9,lineage_10,lineage_11,isolate_ID
0,4,2,1,2,1.0,1.0,i3,1.0,,,,SAMEA3558733
1,4,2,1,2,2.0,1.0,1,,,,,SAMN03648641
2,3,1,1,i1,,,,,,,,SAMN03647419
3,4,2,1,2,1.0,1.0,i1,,,,,SAMEA3671418
4,1,1,1,2,,,,,,,,SAMN07659096


In [160]:
np.shape(isolate_annotation_DF)

(31920, 12)

In [161]:
np.shape(genotypes_array)

(835979, 31920)

#### number of isolates present in each global lineage

In [163]:
from collections import Counter

#tally how many isolates fall into each global lineage (first lineage level)
global_lineage_tally = Counter(isolate_annotation_DF.lineage_1)
lineage_count = pd.Series(global_lineage_tally)

In [164]:
lineage_count

1     2815
2     8090
3     3398
4    17388
5       98
6       96
7       35
dtype: int64

In [165]:
int( lineage_count.sum() )

31920

####################################################################################################################################################################################

## [5] *FILTER*: Drop Isolates that are typed as Lineage 7

####################################################################################################################################################################################

Create a boolean filter of all isolates to retain (isolates that were not typed as Lineage **7**)

In [169]:
#True for every isolate whose global lineage is anything other than '7'
isolates_to_keep_filter = (isolate_annotation_DF.lineage_1 != '7').tolist()

In [170]:
np.sum(isolates_to_keep_filter)

31885

In [171]:
#keep only the retained isolates in the Isolate annotation DF and renumber its rows from 0
isolate_annotation_DF = isolate_annotation_DF[isolates_to_keep_filter].reset_index(drop = True)

#apply the same filter to the columns (isolates) of the Genotype Matrix
isolate_column_mask = np.array(isolates_to_keep_filter)
genotypes_array = genotypes_array[: , isolate_column_mask]

In [172]:
isolate_annotation_DF.head()

Unnamed: 0,lineage_1,lineage_2,lineage_3,lineage_4,lineage_5,lineage_6,lineage_7,lineage_8,lineage_9,lineage_10,lineage_11,isolate_ID
0,4,2,1,2,1.0,1.0,i3,1.0,,,,SAMEA3558733
1,4,2,1,2,2.0,1.0,1,,,,,SAMN03648641
2,3,1,1,i1,,,,,,,,SAMN03647419
3,4,2,1,2,1.0,1.0,i1,,,,,SAMEA3671418
4,1,1,1,2,,,,,,,,SAMN07659096


In [173]:
np.shape(isolate_annotation_DF)

(31885, 12)

In [174]:
np.shape(genotypes_array)

(835979, 31885)

#### number of isolates present in each global lineage

In [175]:
#tally how many isolates remain in each global lineage (first lineage level)
global_lineage_tally = Counter(isolate_annotation_DF.lineage_1)
lineage_count = pd.Series(global_lineage_tally)

In [176]:
lineage_count

1     2815
2     8090
3     3398
4    17388
5       98
6       96
dtype: int64

In [177]:
int( lineage_count.sum() )

31885

####################################################################################################################################################################################

## [6] *FILTER*: Drop SNPs where *minor* allele(s) occurs in $< 1$ isolates

####################################################################################################################################################################################

Some isolates (those with more than $10\%$ missing calls across SNP sites, those without a *global* lineage call, **or** those typed as Lineage 7) were dropped. So if the alternate allele of a SNP was present in only 1 isolate and that isolate was dropped, then 0 isolates with the alternate allele are left in the *Genotypes Matrix*. Let's drop all SNPs in which the minor (reference or alternate) allele(s) is present in fewer than 1 isolate, i.e. SNP sites that are no longer variable among the retained isolates.

1. For each row of matrix (SNP) we will count the number of **0's**, **1's**, **2's** and **3's**.
1. For each SNP we will take the max( number of 0's , number of 1's , number of 2's , number of 3's ) to get a count of the **major allele**
1. For each SNP we will sum up the total number of isolates with a designated allele (not a 9), to get the count of **non-missing** calls
1. We will then subtract the **count of major alleles** from the **number of non-missing calls** to get the number of isolates with the minor allele(s) for each SNP.
1. We will discard all SNPs for which the minor allele(s) occurs in LESS THAN 1 isolate

In [178]:
#per-SNP (row) count of isolates carrying each base code: 0 = A , 1 = C , 2 = G , 3 = T
base_codes = (0 , 1 , 2 , 3)
(genotypes_matrix_SNP_0_count ,
 genotypes_matrix_SNP_1_count ,
 genotypes_matrix_SNP_2_count ,
 genotypes_matrix_SNP_3_count) = [ list( np.sum( genotypes_array == base_code , axis = 1 ) ) for base_code in base_codes ]

#per-SNP count of isolates with a called base (code 9 marks Missing Data)
genotypes_matrix_SNP_non_missing_count = np.sum( genotypes_array != 9 , axis = 1 )

#per-SNP count of the most common base: stack the four count vectors and take the column-wise maximum
genotypes_matrix_major_allele_count = np.array( [ genotypes_matrix_SNP_0_count , genotypes_matrix_SNP_1_count , genotypes_matrix_SNP_2_count , genotypes_matrix_SNP_3_count ] ).max( axis = 0 )

#Number of isolates with the minor allele(s) for each SNP = called bases that are not the major allele
genotypes_matrix_minor_alleles_count = genotypes_matrix_SNP_non_missing_count - genotypes_matrix_major_allele_count

In [179]:
genotypes_matrix_minor_alleles_count

array([  0,   5,   5, ...,   3, 505,   3])

In [180]:
len(genotypes_matrix_minor_alleles_count)

835979

#### Number of SNPs that we're going to drop because the minor (alternate or reference) allele(s) is present in $< 1$ isolate.

In [181]:
np.sum( np.array( genotypes_matrix_minor_alleles_count ) < 1 )

41760

In [182]:
#True for every SNP whose minor allele(s) is still carried by at least 1 isolate
minor_allele_still_present = np.array( genotypes_matrix_minor_alleles_count ) >= 1
SNPs_to_keep_filter = list( minor_allele_still_present )

SNPs that had at least 1 isolate with minor allele

In [183]:
np.sum(SNPs_to_keep_filter)

794219

In [184]:
#keep only the retained SNPs in the SNP annotation DF and renumber its rows from 0
SNP_annotation_DF = SNP_annotation_DF[SNPs_to_keep_filter].reset_index(drop = True)

#apply the same filter to the rows (SNPs) of the Genotype Matrix
SNP_row_mask = np.array(SNPs_to_keep_filter)
genotypes_array = genotypes_array[SNP_row_mask]

In [185]:
np.shape(SNP_annotation_DF)

(794219, 7)

In [186]:
np.shape(genotypes_array)

(794219, 31885)

####################################################################################################################################################################################

## [7] Save annotation files and genotypes matrix

####################################################################################################################################################################################

### Now that we've assigned lineages and have *dropped* isolates without Global Lineages and have *dropped* SNPs with minor allele count $< 1$, we save the updated Genotypes Matrix and corresponding annotation files.

#### Isolate Annotation

In [187]:
isolate_annotation_DF.head()

Unnamed: 0,lineage_1,lineage_2,lineage_3,lineage_4,lineage_5,lineage_6,lineage_7,lineage_8,lineage_9,lineage_10,lineage_11,isolate_ID
0,4,2,1,2,1.0,1.0,i3,1.0,,,,SAMEA3558733
1,4,2,1,2,2.0,1.0,1,,,,,SAMN03648641
2,3,1,1,i1,,,,,,,,SAMN03647419
3,4,2,1,2,1.0,1.0,i1,,,,,SAMEA3671418
4,1,1,1,2,,,,,,,,SAMN07659096


In [188]:
np.shape(isolate_annotation_DF)

(31885, 12)

In [189]:
#write the updated isolate annotation (columns of Genotype Matrix) to disk as a pickle
isolate_annotation_out_path = '/n/data1/hms/dbmi/farhat/Roger/homoplasy_project/rolling_DB_scrape/Genotypes_Filtered_2/genotypes_isolate_annotation.pkl'
isolate_annotation_DF.to_pickle(isolate_annotation_out_path)

#### SNP Annotation

In [190]:
SNP_annotation_DF.head()

Unnamed: 0,pos,ref,alt,gene_id,gene_pos,gene_category,gene_name
0,48,C,[T],Rv0001,47.0,Essential,dnaA
1,64,G,[C],Rv0001,63.0,Essential,dnaA
2,67,G,"[A, T]",Rv0001,66.0,Essential,dnaA
3,69,C,[T],Rv0001,68.0,Essential,dnaA
4,71,C,[T],Rv0001,70.0,Essential,dnaA


In [191]:
np.shape(SNP_annotation_DF)

(794219, 7)

In [192]:
#write the updated SNP annotation (rows of Genotype Matrix) to disk as a pickle
SNP_annotation_out_path = '/n/data1/hms/dbmi/farhat/Roger/homoplasy_project/rolling_DB_scrape/Genotypes_Filtered_2/genotypes_SNP_annotation.pkl'
SNP_annotation_DF.to_pickle(SNP_annotation_out_path)

#### Genotype Matrix

In [193]:
genotypes_array

array([[1, 1, 1, ..., 1, 1, 1],
       [2, 2, 2, ..., 2, 2, 2],
       [2, 2, 2, ..., 2, 2, 2],
       ...,
       [1, 9, 9, ..., 1, 1, 1],
       [1, 9, 1, ..., 1, 1, 1],
       [1, 9, 1, ..., 1, 1, 1]], dtype=int8)

In [194]:
np.shape(genotypes_array)

(794219, 31885)

In [195]:
#write the updated Genotypes Matrix (SNPs x isolates) to disk with np.save
genotypes_matrix_out_path = '/n/data1/hms/dbmi/farhat/Roger/homoplasy_project/rolling_DB_scrape/Genotypes_Filtered_2/genotypes_matrix'
np.save(genotypes_matrix_out_path , genotypes_array , allow_pickle = True)