In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

####################################################################################################################################################################################

## [1] Create a script that calculates the number of homoplasies that occur within a global lineage (phylogenetic tree constructed from all isolates typed into a lineage)

####################################################################################################################################################################################

In [2]:
import os
import pandas as pd
import numpy as np
import sys

import Bio
from Bio.Alphabet import IUPAC
from Bio.Blast.Applications import NcbiblastnCommandline
from Bio.Blast import NCBIXML
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.SeqFeature import SeqFeature, FeatureLocation
from Bio import pairwise2
from Bio import SeqIO
from Bio.Graphics import GenomeDiagram
from Bio.SeqUtils import GC
from Bio import Phylo
from Bio.Align.Applications import MuscleCommandline
from StringIO import StringIO
from Bio import AlignIO
from Bio.Align import AlignInfo
from Bio.Seq import MutableSeq

from collections import Counter
from itertools import groupby
from operator import itemgetter

### Inputs

In [3]:
global_lineage = '1' #INPUT 1 - global lineage
spacer_length = 1 #INPUT 2 - length of spacer to define 'blocks' of allele types

### [1] Load SNP genotype matrix and Annotation Files

In [4]:
#load isolate annotation file (columns of Genotype Matrix)
isolate_annotation_DF = pd.read_pickle('/n/data1/hms/dbmi/farhat/Roger/homoplasy_project/rolling_DB_scrape/Genotypes_Filtered_2/genotypes_isolate_annotation.pkl')

#load SNP annotation file (rows of Genotype Matrix) with gene annotation information
SNP_annotation_DF = pd.read_pickle('/n/data1/hms/dbmi/farhat/Roger/homoplasy_project/rolling_DB_scrape/Genotypes_Filtered_2/genotypes_SNP_annotation.pkl')

#load Genotypes Matrix
genotypes_array =  np.load('/n/data1/hms/dbmi/farhat/Roger/homoplasy_project/rolling_DB_scrape/Genotypes_Filtered_2/genotypes_matrix.npy')

Columns of Genotype Matrix

In [5]:
isolate_annotation_DF.head()

Unnamed: 0,lineage_1,lineage_2,lineage_3,lineage_4,lineage_5,lineage_6,lineage_7,lineage_8,lineage_9,lineage_10,lineage_11,isolate_ID,lineage_call,group
0,4,2,1,2,1.0,1.0,i3,1.0,,,,SAMEA3558733,4.2.1.2.1.1.i3.1,4B
1,4,2,1,2,2.0,1.0,1,,,,,SAMN03648641,4.2.1.2.2.1.1,4B
2,3,1,1,i1,,,,,,,,SAMN03647419,3.1.1.i1,3
3,4,2,1,2,1.0,1.0,i1,,,,,SAMEA3671418,4.2.1.2.1.1.i1,4B
4,1,1,1,2,,,,,,,,SAMN07659096,1.1.1.2,1


In [6]:
np.shape(isolate_annotation_DF)

(31428, 14)

Rows of Genotype Matrix

In [7]:
SNP_annotation_DF.head()

Unnamed: 0,pos,ref,alt,gene_id,gene_pos,gene_category,gene_name
0,48,C,[T],Rv0001,47.0,Essential,dnaA
1,64,G,[C],Rv0001,63.0,Essential,dnaA
2,67,G,"[A, T]",Rv0001,66.0,Essential,dnaA
3,69,C,[T],Rv0001,68.0,Essential,dnaA
4,71,C,[T],Rv0001,70.0,Essential,dnaA


In [8]:
np.shape(SNP_annotation_DF)

(782565, 7)

Genotype Matrix

In [9]:
genotypes_array

array([[1, 1, 1, ..., 1, 1, 1],
       [2, 2, 2, ..., 2, 2, 2],
       [2, 2, 2, ..., 2, 2, 2],
       ...,
       [1, 9, 9, ..., 1, 1, 1],
       [1, 9, 1, ..., 1, 1, 1],
       [1, 9, 1, ..., 1, 1, 1]], dtype=int8)

In [10]:
np.shape(genotypes_array)

(782565, 31428)

### [2] Load in phylogeny created by Luca tree pipeline

We're going to use Biopython's *Phylo* module to load phylogenetic trees created by Luca

In [11]:
#path to tree
phylogeny_path = '/n/data1/hms/dbmi/farhat/Roger/homoplasy_project/phylogenies/tree_output_files/phylogeny_lineage_' + global_lineage + '/tree_lineage_' + global_lineage + '_iqtree_FINAL.treefile'

#parses and load tree
tree = Phylo.parse(phylogeny_path , 'newick').next() 

#root the tree with the outgroup M. canettii ["Normally you will want the outgroup to be a monophyletic group, rather than a single taxon."]
tree.root_with_outgroup({"name":"canettii"})

#flip branches so deeper clades are displayed at top
#(sort clades in-place according to the # of terminal nodes)
tree.ladderize()

'''
Can choose to root the tree at the midpoint instead
#root the tree at the midpoint ["Root the tree at the midpoint of the two most distant taxa."]
tree.root_at_midpoint()
'''

#retrieves the terminal branches of the tree
terminal_branches = tree.get_terminals() 

Number of branches on this tree

In [12]:
len(terminal_branches)

2816

Retrieve the isolates that were used for this tree

In [13]:
isolate_tags_in_phylogeny = [terminal_branches[i].name for i in range(0 , len(terminal_branches))]

In [14]:
len(isolate_tags_in_phylogeny) #+1 for M. canetti

2816

### [3] Combine *genotypes matrix* and *phylogeny* (subset to intersecting isolate IDs)

#### Find the intersection of isolate tags from Phylogeny & Genotypes Matrix

In [15]:
isolate_tags_to_keep = list( set(isolate_annotation_DF.isolate_ID).intersection( set(isolate_tags_in_phylogeny) ) )

In [16]:
len(isolate_tags_to_keep)

2815

#### Subset *terminal branches* from phylogeny to those included in genotype matrix

In [17]:
isolate_tags_in_phylogeny_and_matrix_ordered = [isolate_tag for isolate_tag in isolate_tags_in_phylogeny if isolate_tag in isolate_tags_to_keep]

In [18]:
isolate_tags_in_phylogeny_and_matrix_ordered[0:5]

['0209688', 'SAMN07766112', 'SAMEA2783046', 'SAMEA2783143', 'SAMEA2783301']

#### Subset *genotypes matrix* to only those isolates that are included in phylogeny

In [19]:
isolates_to_keep_filter = [isolate_tag in isolate_tags_to_keep for isolate_tag in isolate_annotation_DF.isolate_ID]

#filter isolate annotation df
isolate_annotation_DF = isolate_annotation_DF[isolates_to_keep_filter]
isolate_annotation_DF.reset_index(drop = True , inplace = True)

#filter genotypes matrix
genotypes_array = genotypes_array[ : , np.array(isolates_to_keep_filter)]                

In [20]:
isolate_annotation_DF.head(n=2)

Unnamed: 0,lineage_1,lineage_2,lineage_3,lineage_4,lineage_5,lineage_6,lineage_7,lineage_8,lineage_9,lineage_10,lineage_11,isolate_ID,lineage_call,group
0,1,1,1,2,,,,,,,,SAMN07659096,1.1.1.2,1
1,1,2,1,1,1.0,,,,,,,SAMEA5542113,1.2.1.1.1,1


In [21]:
np.shape(isolate_annotation_DF)

(2815, 14)

In [22]:
SNP_annotation_DF.head(n=2)

Unnamed: 0,pos,ref,alt,gene_id,gene_pos,gene_category,gene_name
0,48,C,[T],Rv0001,47.0,Essential,dnaA
1,64,G,[C],Rv0001,63.0,Essential,dnaA


In [23]:
np.shape(SNP_annotation_DF)

(782565, 7)

In [24]:
genotypes_array

array([[1, 1, 1, ..., 1, 1, 1],
       [2, 2, 2, ..., 2, 2, 2],
       [2, 2, 2, ..., 2, 2, 2],
       ...,
       [1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 1, 1, 1]], dtype=int8)

In [25]:
np.shape(genotypes_array)

(782565, 2815)

#### Drop SNPs where *minor* allele(s) occurs in $<$ 1 isolate

Many isolates were dropped in considering only isolates from this lineage. So if there were SNPs in which the alternate allele(s) was present in 2 isolates and 1 was dropped (or both were dropped), then there is only 1 (or 0) isolate(s) with the alternate allele left in the *Genotypes Matrix*. Let's drop all SNPs in which the minor (reference or alternate) allele is present in no isolate(s).

1. For each row of matrix (SNP) we will count the number of **0's**, **1's**, **2's** and **3's**.
1. For each SNP we will take the max( number of 0's , number of 1's , number of 2's , number of 3's ) to get a count of the **major allele**
1. For each SNP we will sum up the total number of isolates with a designated allele (not a 9), to get the count of **non-missing** calls
1. We will then subtract the **count of major alleles** from the **number of non-missing calls** to get the number of isolates with the minor allele(s) for each SNP.
1. We will discard all SNPs for which the minor allele(s) occurs in NO isolates

In [26]:
genotypes_matrix_SNP_0_count = list( np.sum( genotypes_array == 0 , axis = 1 ) ) #A
genotypes_matrix_SNP_1_count = list( np.sum( genotypes_array == 1 , axis = 1 ) ) #C
genotypes_matrix_SNP_2_count = list( np.sum( genotypes_array == 2 , axis = 1 ) ) #G
genotypes_matrix_SNP_3_count = list( np.sum( genotypes_array == 3 , axis = 1 ) ) #T

genotypes_matrix_SNP_non_missing_count = np.sum( genotypes_array != 9 , axis = 1 ) #Non-Missing Data
genotypes_matrix_major_allele_count = np.array( [ max(zero_count , one_count , two_count , three_count) for zero_count , one_count , two_count , three_count in zip(genotypes_matrix_SNP_0_count , genotypes_matrix_SNP_1_count , genotypes_matrix_SNP_2_count , genotypes_matrix_SNP_3_count) ] )

#Number of isolates with the minor allele(s) for each SNP
genotypes_matrix_minor_alleles_count = genotypes_matrix_SNP_non_missing_count - genotypes_matrix_major_allele_count

In [27]:
genotypes_matrix_minor_alleles_count

array([  0,   0,   1, ...,   0, 505,   0])

In [28]:
len(genotypes_matrix_minor_alleles_count)

782565

Number of SNPs that we're going to drop because the minor (alternate or reference) allele(s) is present in *no* isolates.

In [29]:
np.sum( np.array( genotypes_matrix_minor_alleles_count ) < 1 )

593525

In [30]:
#boolean list that is 'True' for all SNPs where the minor allele is present in at least 1 isolate
SNPs_to_keep_filter = list( np.array( genotypes_matrix_minor_alleles_count ) >= 1 )

SNPs that had at least 1 isolate with minor allele

In [31]:
np.sum(SNPs_to_keep_filter)

189040

In [32]:
#filter SNP annotation DF
SNP_annotation_DF = SNP_annotation_DF[SNPs_to_keep_filter]

#reindex SNP annotation DF
SNP_annotation_DF.reset_index(drop = True , inplace = True)

#filter Genotype Matrix 
genotypes_array = genotypes_array[np.array(SNPs_to_keep_filter) , : ]

In [33]:
np.shape(genotypes_array)

(189040, 2815)

In [34]:
np.shape(SNP_annotation_DF)

(189040, 7)

In [35]:
SNP_annotation_DF.head()

Unnamed: 0,pos,ref,alt,gene_id,gene_pos,gene_category,gene_name
0,67,G,"[A, T]",Rv0001,66.0,Essential,dnaA
1,75,G,[A],Rv0001,74.0,Essential,dnaA
2,91,A,[C],Rv0001,90.0,Essential,dnaA
3,108,C,"[A, T]",Rv0001,107.0,Essential,dnaA
4,129,G,[A],Rv0001,128.0,Essential,dnaA


### [4] Search for evidence of homoplasy within the set of isolates using the terminal branches

#### Re-order the isolates according to the order of the terminal branches of the phylogeny

In [36]:
terminal_branch_isolate_order = []

for isolate_ID in isolate_tags_in_phylogeny_and_matrix_ordered:
    
    isolate_ID_index = isolate_annotation_DF[isolate_annotation_DF.isolate_ID == isolate_ID].index[0]
    terminal_branch_isolate_order.append(isolate_ID_index)

In [37]:
len(terminal_branch_isolate_order)

2815

Re-order the columns of the **Genotypes Array**

In [38]:
genotypes_array = genotypes_array[: , terminal_branch_isolate_order]

Re-order the rows of the **Isolate Annotation DF**

In [39]:
isolate_annotation_DF = isolate_annotation_DF.iloc[terminal_branch_isolate_order , :]

#reset the index
isolate_annotation_DF.reset_index(inplace = True, drop = True)

In [40]:
isolate_annotation_DF.head()

Unnamed: 0,lineage_1,lineage_2,lineage_3,lineage_4,lineage_5,lineage_6,lineage_7,lineage_8,lineage_9,lineage_10,lineage_11,isolate_ID,lineage_call,group
0,1,1,1,1,1,,,,,,,0209688,1.1.1.1.1,1
1,1,1,1,1,1,,,,,,,SAMN07766112,1.1.1.1.1,1
2,1,1,1,1,1,,,,,,,SAMEA2783046,1.1.1.1.1,1
3,1,1,1,1,1,,,,,,,SAMEA2783143,1.1.1.1.1,1
4,1,1,1,1,1,,,,,,,SAMEA2783301,1.1.1.1.1,1


#### Define function to calculate Homoplasy Score for each SNP

Allele <-> code mapping

In [41]:
allele_code_map_dict = {'A':0 , 'C':1 , 'G':2 , 'T':3}
allele_code_map_dict_r = {0:'A' , 1:'C' , 2:'G' , 3:'T'}

In [42]:
def calc_homoplasy_score_for_SNP(SNP_i):

    #get annotation for this SNP
    SNP_i_annotation = SNP_annotation_DF.loc[SNP_i , :]

    #> Find the minor allele(s) (and respective codes) for this SNP
    #create a list of the bases supported by at least 1 isolates at this reference position
    SNP_i_alleles = [SNP_i_annotation.ref] + SNP_i_annotation.alt

    #get the codes for each allele
    SNP_i_allele_codes = [allele_code_map_dict[allele_i] for allele_i in SNP_i_alleles]

    #get genotypes for SNP
    SNP_i_genotypes = genotypes_array[SNP_i , :]

    #get the number of isolates supporting each allele
    num_isolates_with_allele_series = pd.Series(index = SNP_i_allele_codes)

    for SNP_allele_code_i in SNP_i_allele_codes:

        num_isolate_with_allele_i = np.sum( SNP_i_genotypes == SNP_allele_code_i )
        num_isolates_with_allele_series[SNP_allele_code_i] = num_isolate_with_allele_i

    #sort series containing num isolates w/ each allele in increasing order
    num_isolates_with_allele_series.sort_values(ascending = True , inplace = True)

    #all alleles but the last one (allele called in a majority of isolates) are the minor allele(s) for this SNP
    minor_allele_codes = list( num_isolates_with_allele_series.index[:-1] )

    #create a list that stores the number of homoplasies for each minor allele
    SNP_i_homoplasy_score_list = []

    #> Iterate through all minor alleles and look for evidence of homoplasy
    for SNP_i_minor_allele in minor_allele_codes:

        #> Find all the subsets of continuous minor alleles within the terminal brances (including 9's)
        #> Create a boolean array that is *True* if SNP_i was called as the minor allele or missing data for the isolate, and *False* if SNP_i was called at the major allele or other minor allele(s)

        SNP_i_genotypes_bool_minor_allele = SNP_i_genotypes == SNP_i_minor_allele #TRUE where isolates support minor allele
        SNP_i_genotypes_bool_missing_data = SNP_i_genotypes == 9 #TRUE where isolates had missing data for this SNP

        SNP_i_genotypes_bool = [(SNP_i_minor_allele_for_isolate or SNP_i_missing_data_for_isolate) for SNP_i_minor_allele_for_isolate, SNP_i_missing_data_for_isolate in zip(SNP_i_genotypes_bool_minor_allele, SNP_i_genotypes_bool_missing_data)]

        #> Return the indices of the array where the elements are marked *True*
        SNP_i_genotypes_indices_with_minor_allele = np.arange(0 , len(SNP_i_genotypes_bool))[SNP_i_genotypes_bool]

        #1. run through the indices of the isolates with the minor allele
        #2. for each consective *block* of isolates with the minor allele store the isolate ID indices in a seperate list
        #3. pull the genotypes from the indices and make sure there is at least a single isolate with the minor allele (it's not all just 9's)
        #4. for each block of Isolate indices, retrieve isolate IDs from annotation DF

        '''
        isolate_blocks_with_minor_allele = []
        #if consecutive isolates with minor allele have at least 1 isolate with major allele between them
        for k, g in groupby(enumerate(SNP_i_genotypes_indices_with_minor_allele), lambda (i, x): i-x):

            #get a list of consecutive isolate indices that are located next to eachother on the phylogeny
            isolate_block_w_minor_allele_indices = map(itemgetter(1), g)

            #SNP genotypes for isolates
            isolate_block_genotypes = SNP_i_genotypes[isolate_block_w_minor_allele_indices]

            #check to see that at least 1 isolate had a good call for the minor allele within the block
            if SNP_i_minor_allele in isolate_block_genotypes:

                #pull the isolate IDs for each index
                isolate_block_w_minor_allele_IDs = [{'name':isolate_annotation_DF.loc[isolate_index , 'isolate_ID']} for isolate_index in isolate_block_w_minor_allele_indices]

                #append to list of blocks of isolates w/ minor allele
                isolate_blocks_with_minor_allele.append(isolate_block_w_minor_allele_IDs)
        '''

        ########################################################################################################################
        isolate_blocks_with_minor_allele = []

        #if consecutive isolates with minor allele have at least (spacer_length) isolates with major allele (or other minor allele) between them
        isolate_block_w_minor_allele_indices = []
        for i in np.arange(0 , len(SNP_i_genotypes_indices_with_minor_allele)-1):

            space_btwn_genotype_indices = SNP_i_genotypes_indices_with_minor_allele[i+1] - SNP_i_genotypes_indices_with_minor_allele[i]
            if space_btwn_genotype_indices <= spacer_length:

                isolate_block_w_minor_allele_indices.append(SNP_i_genotypes_indices_with_minor_allele[i])

                #check if second to last index
                if i+1 == (len(SNP_i_genotypes_indices_with_minor_allele)-1):

                    isolate_block_w_minor_allele_indices.append(SNP_i_genotypes_indices_with_minor_allele[i+1])

                    #######################################################
                    #SNP genotypes for isolates
                    isolate_block_genotypes = SNP_i_genotypes[isolate_block_w_minor_allele_indices]

                    #check to see that at least 1 isolate had a good call for the minor allele within the block
                    if SNP_i_minor_allele in isolate_block_genotypes:

                        #pull the isolate IDs for each index
                        isolate_block_w_minor_allele_IDs = [{'name':isolate_annotation_DF.loc[isolate_index , 'isolate_ID']} for isolate_index in isolate_block_w_minor_allele_indices]

                        #append to list of blocks of isolates w/ minor allele
                        isolate_blocks_with_minor_allele.append(isolate_block_w_minor_allele_IDs)
                    #######################################################

            elif space_btwn_genotype_indices > spacer_length:

                isolate_block_w_minor_allele_indices.append(SNP_i_genotypes_indices_with_minor_allele[i])

                #######################################################
                #SNP genotypes for isolates
                isolate_block_genotypes = SNP_i_genotypes[isolate_block_w_minor_allele_indices]

                #check to see that at least 1 isolate had a good call for the minor allele within the block
                if SNP_i_minor_allele in isolate_block_genotypes:

                    #pull the isolate IDs for each index
                    isolate_block_w_minor_allele_IDs = [{'name':isolate_annotation_DF.loc[isolate_index , 'isolate_ID']} for isolate_index in isolate_block_w_minor_allele_indices]

                    #append to list of blocks of isolates w/ minor allele
                    isolate_blocks_with_minor_allele.append(isolate_block_w_minor_allele_IDs)
                #######################################################

                isolate_block_w_minor_allele_indices = []

                #check if second to last index
                if i+1 == (len(SNP_i_genotypes_indices_with_minor_allele)-1):

                    isolate_block_w_minor_allele_indices.append(SNP_i_genotypes_indices_with_minor_allele[i+1])

                    #######################################################
                    #SNP genotypes for isolates
                    isolate_block_genotypes = SNP_i_genotypes[isolate_block_w_minor_allele_indices]

                    #check to see that at least 1 isolate had a good call for the minor allele within the block
                    if SNP_i_minor_allele in isolate_block_genotypes:

                        #pull the isolate IDs for each index
                        isolate_block_w_minor_allele_IDs = [{'name':isolate_annotation_DF.loc[isolate_index , 'isolate_ID']} for isolate_index in isolate_block_w_minor_allele_indices]

                        #append to list of blocks of isolates w/ minor allele
                        isolate_blocks_with_minor_allele.append(isolate_block_w_minor_allele_IDs)
                    #######################################################
        ########################################################################################################################

        #> Find the MRCA clade of each block of isolates
        isolate_block_MRCAs = []
        for isolate_block_i_with_minor_allele in isolate_blocks_with_minor_allele:

            #["Most recent common ancestor (clade) of all the given targets."]
            MRCA_of_children = tree.common_ancestor(isolate_block_i_with_minor_allele)
            isolate_block_MRCAs.append(MRCA_of_children)
            
        '''
        #>Find the distance between all pairs of the MRCAs of the *blocks* of isolates with minor allele
        #>> Finding the distance between all possible pairs can be visualized/computed as taking the distance between all pairs in the upper triangular part of a matrix (excluding the diagonal)
        distance_between_MRCAs = []
        for MRCA_i , MRCA_j in zip( np.triu_indices(len(isolate_block_MRCAs), 1)[0] , np.triu_indices(len(isolate_block_MRCAs), 1)[1] ):

            #get the distance between MRCA node & the root (i.e. tree.distance({"name":'SAMN03647821'} , {"name":'SAMEA1485499'}))
            #["Calculate the sum of the branch lengths between two targets. If only one target is specified, the other is the root of this tree."]
            dist_between_MRCAi_and_MRCAj = tree.distance(isolate_block_MRCAs[MRCA_i] , isolate_block_MRCAs[MRCA_j])
            distance_between_MRCAs.append(dist_between_MRCAi_and_MRCAj)
        '''

        #> The resulting *homoplasy score* is equal to the number of blocks of isolates (how many times the mutation arose in the phylogeny) ### weighted by the average pairwise distance between the MRCAs for all of the blocks of isolates
        #check to see that there are at least 1 block (minor allele arose at least once on phylogeny)
        if len(isolate_block_MRCAs) >= 1:
            SNP_i_homoplasy_score = float(len(isolate_block_MRCAs)) ###DELETE LATER### * np.mean(distance_between_MRCAs)
        elif len(isolate_block_MRCAs) == 0:
            SNP_i_homoplasy_score = 0.0

        #append minor allele & number of homoplasies to list
        SNP_i_homoplasy_score_list.append( [allele_code_map_dict_r[SNP_i_minor_allele] , SNP_i_homoplasy_score] )    
        
    return SNP_i_homoplasy_score_list

#### Calculate Homoplasy Score for all SNPs

In [43]:
SNP_i_homoplasy_scores = []

for SNP_i in SNP_annotation_DF.index:
    
    SNP_i_homoplasy_scores.append(calc_homoplasy_score_for_SNP(SNP_i))
    
    #keep track of progress
    if SNP_i in np.arange(0 , len(SNP_annotation_DF.index) , int(len(SNP_annotation_DF.index)*0.05)):
        print float(SNP_i) / float(len(SNP_annotation_DF.index))

0.0
0.05
0.1
0.15
0.2
0.25
0.3
0.35
0.4
0.45
0.5
0.55
0.6
0.65
0.7
0.75
0.8
0.85
0.9
0.95


In [44]:
SNP_annotation_DF['homoplasy_score'] = SNP_i_homoplasy_scores

In [45]:
SNP_annotation_DF.head(n=10)

Unnamed: 0,pos,ref,alt,gene_id,gene_pos,gene_category,gene_name,homoplasy_score
0,67,G,"[A, T]",Rv0001,66.0,Essential,dnaA,"[[A, 0.0], [T, 1.0]]"
1,75,G,[A],Rv0001,74.0,Essential,dnaA,"[[A, 1.0]]"
2,91,A,[C],Rv0001,90.0,Essential,dnaA,"[[C, 1.0]]"
3,108,C,"[A, T]",Rv0001,107.0,Essential,dnaA,"[[A, 0.0], [T, 1.0]]"
4,129,G,[A],Rv0001,128.0,Essential,dnaA,"[[A, 1.0]]"
5,147,T,"[C, G]",Rv0001,146.0,Essential,dnaA,"[[G, 0.0], [C, 1.0]]"
6,187,T,[C],Rv0001,186.0,Essential,dnaA,"[[C, 1.0]]"
7,194,T,[C],Rv0001,193.0,Essential,dnaA,"[[C, 1.0]]"
8,195,G,"[C, T]",Rv0001,194.0,Essential,dnaA,"[[T, 0.0], [C, 1.0]]"
9,200,G,[A],Rv0001,199.0,Essential,dnaA,"[[A, 1.0]]"


In [46]:
np.shape(SNP_annotation_DF)

(189040, 8)

#### Re-format dataframe so that each row contains only one minor allele

Each row will correspond to the homoplasy for a specific minor allele for a specific SNP position

In [47]:
SNP_i_minor_allele_i_list = []
pos_list = []
ref_list = []
alt_list = []
gene_id_list = []
gene_pos_list = []
gene_category_list = []
gene_name_list = []
homoplasy_minor_allele_list = []
homoplasy_score_list = []

SNP_i_minor_allele_i = 0
for SNP_i in SNP_annotation_DF.index:
    
    #get info for SNP
    SNP_i_annot = SNP_annotation_DF.loc[SNP_i , :]
    
    SNP_i_pos = SNP_i_annot.pos
    SNP_i_ref = SNP_i_annot.ref
    SNP_i_alt = SNP_i_annot.alt
    SNP_i_gene_id = SNP_i_annot.gene_id
    SNP_i_gene_pos = SNP_i_annot.gene_pos
    SNP_i_gene_category = SNP_i_annot.gene_category
    SNP_i_gene_name = SNP_i_annot.gene_name
    
    for SNP_i_minor_allele , homoplasy_score_i in SNP_i_annot.homoplasy_score:
        
        SNP_i_minor_allele_i_list.append(SNP_i_minor_allele_i)
        pos_list.append(SNP_i_pos)
        ref_list.append(SNP_i_ref)
        alt_list.append(SNP_i_alt)
        gene_id_list.append(SNP_i_gene_id)
        gene_pos_list.append(SNP_i_gene_pos)
        gene_category_list.append(SNP_i_gene_category)
        gene_name_list.append(SNP_i_gene_name)
        homoplasy_minor_allele_list.append(SNP_i_minor_allele)
        homoplasy_score_list.append(homoplasy_score_i)
        
        SNP_i_minor_allele_i += 1

SNP_homoplasy_DF = pd.DataFrame(index = SNP_i_minor_allele_i_list)
SNP_homoplasy_DF['pos'] = pos_list
SNP_homoplasy_DF['ref'] = ref_list
SNP_homoplasy_DF['alt'] = alt_list
SNP_homoplasy_DF['gene_id'] = gene_id_list
SNP_homoplasy_DF['gene_pos'] = gene_pos_list
SNP_homoplasy_DF['gene_category'] = gene_category_list
SNP_homoplasy_DF['gene_name'] = gene_name_list
SNP_homoplasy_DF['minor_allele'] = homoplasy_minor_allele_list
SNP_homoplasy_DF['homoplasy_score'] = homoplasy_score_list

In [48]:
SNP_homoplasy_DF.head(n = 10)

Unnamed: 0,pos,ref,alt,gene_id,gene_pos,gene_category,gene_name,minor_allele,homoplasy_score
0,67,G,"[A, T]",Rv0001,66.0,Essential,dnaA,A,0.0
1,67,G,"[A, T]",Rv0001,66.0,Essential,dnaA,T,1.0
2,75,G,[A],Rv0001,74.0,Essential,dnaA,A,1.0
3,91,A,[C],Rv0001,90.0,Essential,dnaA,C,1.0
4,108,C,"[A, T]",Rv0001,107.0,Essential,dnaA,A,0.0
5,108,C,"[A, T]",Rv0001,107.0,Essential,dnaA,T,1.0
6,129,G,[A],Rv0001,128.0,Essential,dnaA,A,1.0
7,147,T,"[C, G]",Rv0001,146.0,Essential,dnaA,G,0.0
8,147,T,"[C, G]",Rv0001,146.0,Essential,dnaA,C,1.0
9,187,T,[C],Rv0001,186.0,Essential,dnaA,C,1.0


In [49]:
np.shape(SNP_homoplasy_DF)

(212971, 9)

#### Drop SNPs with a *Homoplasy Score* of 0

In [50]:
SNP_homoplasy_DF = SNP_homoplasy_DF[SNP_homoplasy_DF.homoplasy_score > 0.0]

In [51]:
np.shape(SNP_homoplasy_DF)

(190752, 9)

### [5] Functional annotation for each SNP

In [52]:
SNP_homoplasy_DF.head()

Unnamed: 0,pos,ref,alt,gene_id,gene_pos,gene_category,gene_name,minor_allele,homoplasy_score
1,67,G,"[A, T]",Rv0001,66.0,Essential,dnaA,T,1.0
2,75,G,[A],Rv0001,74.0,Essential,dnaA,A,1.0
3,91,A,[C],Rv0001,90.0,Essential,dnaA,C,1.0
5,108,C,"[A, T]",Rv0001,107.0,Essential,dnaA,T,1.0
6,129,G,[A],Rv0001,128.0,Essential,dnaA,A,1.0


#### Drop columns

In [53]:
SNP_homoplasy_DF = SNP_homoplasy_DF.loc[: , ['pos','ref','alt','minor_allele','homoplasy_score']]

In [54]:
SNP_homoplasy_DF.head()

Unnamed: 0,pos,ref,alt,minor_allele,homoplasy_score
1,67,G,"[A, T]",T,1.0
2,75,G,[A],A,1.0
3,91,A,[C],C,1.0
5,108,C,"[A, T]",T,1.0
6,129,G,[A],A,1.0


#### Retrieve the SNP type from the functional SNP annotation

In [55]:
#load DataFrame with functional annotation for all possible alternate alleles
SNP_functional_annotation_DF = pd.read_pickle('/n/data1/hms/dbmi/farhat/Roger/homoplasy_project/rolling_DB_scrape/Genotypes_Filtered_2/genotypes_SNP_functional_annotation.pkl')

In [56]:
SNP_functional_annotation_DF.head()

Unnamed: 0,pos,ref,alt,gene_category,gene_name,gene_id,gene_pos,SNP_type,AA_change
48_T,48,C,T,Essential,dnaA,Rv0001,48,S,V16V
64_C,64,G,C,Essential,dnaA,Rv0001,64,N,G22R
67_A,67,G,A,Essential,dnaA,Rv0001,67,N,D23N
67_T,67,G,T,Essential,dnaA,Rv0001,67,N,D23Y
69_T,69,C,T,Essential,dnaA,Rv0001,69,S,D23D


In [57]:
#these lists will hold variant annotation per alternate allele for each alternate allele present at at given SNP position
gene_category_list = []
gene_name_list = []
gene_id_list = []
gene_pos_list = []
SNP_type_list = []
AA_change_list = []

for SNP_i_pos , SNP_i_alt_list in zip(list(SNP_homoplasy_DF.pos) , list(SNP_homoplasy_DF.alt)):
    
    #iterate through all alternate alleles for the SNP at this position (probably just 1 element in each list)
    SNP_i_gene_category_list = []
    SNP_i_gene_name_list = []
    SNP_i_gene_id_list = []
    SNP_i_gene_pos_list = []
    SNP_i_types_list = []
    SNP_i_AA_changes_list = []
    
    for SNP_i_alt in SNP_i_alt_list:
        
        SNP_i_key = str(SNP_i_pos) + '_' + SNP_i_alt
    
        #get variant annotation from funtional SNP annot DF
        SNP_i_gene_category_list.append(SNP_functional_annotation_DF.loc[SNP_i_key , 'gene_category'])
        SNP_i_gene_name_list.append(SNP_functional_annotation_DF.loc[SNP_i_key , 'gene_name'])
        SNP_i_gene_id_list.append(SNP_functional_annotation_DF.loc[SNP_i_key , 'gene_id'])
        SNP_i_gene_pos_list.append(SNP_functional_annotation_DF.loc[SNP_i_key , 'gene_pos'])
        SNP_i_types_list.append(SNP_functional_annotation_DF.loc[SNP_i_key , 'SNP_type'])
        SNP_i_AA_changes_list.append(SNP_functional_annotation_DF.loc[SNP_i_key , 'AA_change'])
        
    #append functional annotation for all alternate allele at this site
    gene_category_list.append(SNP_i_gene_category_list)
    gene_name_list.append(SNP_i_gene_name_list)
    gene_id_list.append(SNP_i_gene_id_list)
    gene_pos_list.append(SNP_i_gene_pos_list)
    SNP_type_list.append(SNP_i_types_list)
    AA_change_list.append(SNP_i_AA_changes_list)
    
#create new column for DF
SNP_homoplasy_DF['gene_id'] = gene_id_list
SNP_homoplasy_DF['gene_pos'] = gene_pos_list
SNP_homoplasy_DF['gene_category'] = gene_category_list
SNP_homoplasy_DF['gene_name'] = gene_name_list
SNP_homoplasy_DF['type'] = SNP_type_list
SNP_homoplasy_DF['AA_change'] = AA_change_list

In [58]:
SNP_homoplasy_DF.head(n=10)

Unnamed: 0,pos,ref,alt,minor_allele,homoplasy_score,gene_id,gene_pos,gene_category,gene_name,type,AA_change
1,67,G,"[A, T]",T,1.0,"[Rv0001, Rv0001]","[67, 67]","[Essential, Essential]","[dnaA, dnaA]","[N, N]","[D23N, D23Y]"
2,75,G,[A],A,1.0,[Rv0001],[75],[Essential],[dnaA],[S],[K25K]
3,91,A,[C],C,1.0,[Rv0001],[91],[Essential],[dnaA],[N],[S31R]
5,108,C,"[A, T]",T,1.0,"[Rv0001, Rv0001]","[108, 108]","[Essential, Essential]","[dnaA, dnaA]","[S, S]","[L36L, L36L]"
6,129,G,[A],A,1.0,[Rv0001],[129],[Essential],[dnaA],[S],[Q43Q]
8,147,T,"[C, G]",C,1.0,"[Rv0001, Rv0001]","[147, 147]","[Essential, Essential]","[dnaA, dnaA]","[S, N]","[N49N, N49K]"
9,187,T,[C],C,1.0,[Rv0001],[187],[Essential],[dnaA],[S],[L63L]
10,194,T,[C],C,1.0,[Rv0001],[194],[Essential],[dnaA],[N],[V65A]
12,195,G,"[C, T]",C,1.0,"[Rv0001, Rv0001]","[195, 195]","[Essential, Essential]","[dnaA, dnaA]","[S, S]","[V65V, V65V]"
13,200,G,[A],A,1.0,[Rv0001],[200],[Essential],[dnaA],[N],[S67N]


#### Classify each homoplastic SNP (Ref Allele <-> Alt Allele i ) as an *sSNP*, *nSNP*, *iSNP* or *ambiguous*

- **IF** minor allele = alternate allele i; **THEN** use SNP type for ref <> alternate allele i
- **ELIF** minor allele = reference allele **AND** there's only 1 alternalte allele (SNP is biallelic); **THEN** use SNP type for ref <> alternate allele
- **ELIF** minor allele = reference allele **AND** there's > 1 alternalte allele **AND** SNP types are the same for all alternate alleles; **THEN** use same SNP types for all ref <> alternate allele(s)
- **ELIF** minor allele = reference allele **AND** there's > 1 alternalte allele **AND** SNP types are different among alternate alleles; **THEN** classify functional effect as *ambiguous*

In [59]:
functional_effect_list = []

for homoplastic_SNP_i in SNP_homoplasy_DF.index:
    
    SNP_i_info = SNP_homoplasy_DF.loc[homoplastic_SNP_i , :]
    
    SNP_i_ref = SNP_i_info.ref
    SNP_i_alt_list = SNP_i_info.alt
    SNP_i_minor_allele = SNP_i_info.minor_allele
    SNP_i_types_list = SNP_i_info.type
    
    if SNP_i_minor_allele in SNP_i_alt_list:
        
        SNP_i_func_effect = SNP_i_types_list[SNP_i_alt_list.index(SNP_i_minor_allele)]
        
    elif (SNP_i_minor_allele == SNP_i_ref) and (len(SNP_i_alt_list) == 1):
        
        SNP_i_func_effect = SNP_i_types_list[0]
        
    elif (SNP_i_minor_allele == SNP_i_ref) and (len(SNP_i_alt_list) > 1) and (len(set(SNP_i_types_list)) == 1):
    
        SNP_i_func_effect = SNP_i_types_list[0]
        
    elif (SNP_i_minor_allele == SNP_i_ref) and (len(SNP_i_alt_list) > 1) and (len(set(SNP_i_types_list)) > 1):
        
        SNP_i_func_effect = 'A'
        
    functional_effect_list.append(SNP_i_func_effect)
        
SNP_homoplasy_DF.loc[: , 'mut_func_effect'] = functional_effect_list

In [60]:
SNP_homoplasy_DF.head()

Unnamed: 0,pos,ref,alt,minor_allele,homoplasy_score,gene_id,gene_pos,gene_category,gene_name,type,AA_change,mut_func_effect
1,67,G,"[A, T]",T,1.0,"[Rv0001, Rv0001]","[67, 67]","[Essential, Essential]","[dnaA, dnaA]","[N, N]","[D23N, D23Y]",N
2,75,G,[A],A,1.0,[Rv0001],[75],[Essential],[dnaA],[S],[K25K],S
3,91,A,[C],C,1.0,[Rv0001],[91],[Essential],[dnaA],[N],[S31R],N
5,108,C,"[A, T]",T,1.0,"[Rv0001, Rv0001]","[108, 108]","[Essential, Essential]","[dnaA, dnaA]","[S, S]","[L36L, L36L]",S
6,129,G,[A],A,1.0,[Rv0001],[129],[Essential],[dnaA],[S],[Q43Q],S


Pickle dataframe with homoplasy count for this lineage for downstream analysis

In [61]:
SNP_homoplasy_DF.to_pickle('/n/data1/hms/dbmi/farhat/Roger/homoplasy_project/CSV_files/homoplasies_detected_in_global_lineages/SNP homoplasies/pickled dataframes/spacer_{0}/homoplasy_scores_lineage_{1}.pkl'.format(str(spacer_length) , global_lineage))

Check the **top hits**

In [62]:
SNP_homoplasy_DF.sort_values('homoplasy_score' , ascending = False).head(n = 40)

Unnamed: 0,pos,ref,alt,minor_allele,homoplasy_score,gene_id,gene_pos,gene_category,gene_name,type,AA_change,mut_func_effect
106563,2155168,C,"[A, T, G]",G,186.0,"[Rv1908c, Rv1908c, Rv1908c]","[944, 944, 944]","[Antibiotic Resistance, Antibiotic Resistance,...","[katG, katG, katG]","[N, N, N]","[S315I, S315N, S315T]",N
83903,1673425,C,[T],T,162.0,[Rv1482c_Rv1483],[None],[None],[None],[I],[None],I
41998,761155,C,"[A, T, G]",T,76.0,"[Rv0667, Rv0667, Rv0667]","[1349, 1349, 1349]","[Antibiotic Resistance, Antibiotic Resistance,...","[rpoB, rpoB, rpoB]","[N, N, N]","[S450*, S450L, S450W]",N
183510,3884906,A,[G],A,75.0,[Rv3467],[943],[Non-Essential],[Rv3467],[N],[K315E],N
181761,3841662,T,[C],C,63.0,[Rv3423c_Rv3424c],[None],[None],[None],[I],[None],I
151601,3136335,G,[A],A,55.0,[Rv2828c_Rv2829c],[None],[None],[None],[I],[None],I
129350,2626600,G,[A],A,52.0,[Rv2347c_Rv2348c],[None],[None],[None],[I],[None],I
183505,3883605,T,[C],C,51.0,[Rv3466],[81],[Non-Essential],[Rv3466],[S],[S27S],S
74702,1468150,C,[T],T,46.0,[Rv1312_Rv1313c],[None],[None],[None],[I],[None],I
60731,1164571,A,[G],G,45.0,[Rv1040c_Rv1041c],[None],[None],[None],[I],[None],I


####################################################################################################################################################################################

## [2] Run script above for global lineages $\in \{1, 2, 3, 4A, 4B, 4C, 5, 6 \}$ & for Spacer Values $\in \{1, 2, 3, 4, 5, 6 \}$

####################################################################################################################################################################################

In [1]:
from slurmpy import Slurm
import os

### Submit jobs to collect homoplasies on phylogeny for each *Global Lineage* and each *Spacer Length*

In [3]:
for global_lineage in ['1' , '2' , '3' , '4A' , '4B' , '4C' , '5' , '6']:

    for spacer_length in ['1' , '2' , '3' , '4' , '5' , '6']:

        homoplasy_job = 'python /home/rv76/Farhat_Lab/Python_Scripts/homoplasy_project/SNP_homoplasy_collection.py {0} {1}'.format(global_lineage , spacer_length)

        #directory where you want output + error files
        os.chdir('/n/data1/hms/dbmi/farhat/Roger/homoplasy_project/CSV_files/homoplasies_detected_in_global_lineages/SNP homoplasies/homoplasy collection jobs/')

        job_name = 'L{0}_S{1}_SNP'.format(global_lineage , spacer_length)

        s = Slurm(job_name , {'partition':'medium' , 'N':'1' , 't':'5-0:00:00' , 'mem':'64G' , 'mail-type':'FAIL' , 'mail-user':'roger_vargas@g.harvard.edu'})

        #submits the job
        job_id = s.run(homoplasy_job)

        print job_name  + ' : ' +  str(job_id)

submitted: Submitted batch job 17696936
submitted: Submitted batch job 17696937
submitted: Submitted batch job 17696938


L1_S1_SNP : 17696936
L1_S2_SNP : 17696937
L1_S3_SNP : 17696938


submitted: Submitted batch job 17696939
submitted: Submitted batch job 17696940
submitted: Submitted batch job 17696941


L1_S4_SNP : 17696939
L1_S5_SNP : 17696940
L1_S6_SNP : 17696941


submitted: Submitted batch job 17696942
submitted: Submitted batch job 17696943
submitted: Submitted batch job 17696944
submitted: Submitted batch job 17696945


L2_S1_SNP : 17696942
L2_S2_SNP : 17696943
L2_S3_SNP : 17696944
L2_S4_SNP : 17696945


submitted: Submitted batch job 17696946
submitted: Submitted batch job 17696947
submitted: Submitted batch job 17696948


L2_S5_SNP : 17696946
L2_S6_SNP : 17696947
L3_S1_SNP : 17696948


submitted: Submitted batch job 17696949
submitted: Submitted batch job 17696950
submitted: Submitted batch job 17696951
submitted: Submitted batch job 17696952


L3_S2_SNP : 17696949
L3_S3_SNP : 17696950
L3_S4_SNP : 17696951
L3_S5_SNP : 17696952


submitted: Submitted batch job 17696953
submitted: Submitted batch job 17696954
submitted: Submitted batch job 17696955
submitted: Submitted batch job 17696956


L3_S6_SNP : 17696953
L4A_S1_SNP : 17696954
L4A_S2_SNP : 17696955
L4A_S3_SNP : 17696956


submitted: Submitted batch job 17696957
submitted: Submitted batch job 17696958
submitted: Submitted batch job 17696959
submitted: Submitted batch job 17696960


L4A_S4_SNP : 17696957
L4A_S5_SNP : 17696958
L4A_S6_SNP : 17696959
L4B_S1_SNP : 17696960


submitted: Submitted batch job 17696961
submitted: Submitted batch job 17696962


L4B_S2_SNP : 17696961
L4B_S3_SNP : 17696962


submitted: Submitted batch job 17696963
submitted: Submitted batch job 17696964


L4B_S4_SNP : 17696963
L4B_S5_SNP : 17696964


submitted: Submitted batch job 17696965
submitted: Submitted batch job 17696966
submitted: Submitted batch job 17696967
submitted: Submitted batch job 17696968


L4B_S6_SNP : 17696965
L4C_S1_SNP : 17696966
L4C_S2_SNP : 17696967
L4C_S3_SNP : 17696968


submitted: Submitted batch job 17696969
submitted: Submitted batch job 17696970
submitted: Submitted batch job 17696971
submitted: Submitted batch job 17696972


L4C_S4_SNP : 17696969
L4C_S5_SNP : 17696970
L4C_S6_SNP : 17696971
L5_S1_SNP : 17696972


submitted: Submitted batch job 17696973
submitted: Submitted batch job 17696974


L5_S2_SNP : 17696973
L5_S3_SNP : 17696974


submitted: Submitted batch job 17696975
submitted: Submitted batch job 17696976
submitted: Submitted batch job 17696977
submitted: Submitted batch job 17696978


L5_S4_SNP : 17696975
L5_S5_SNP : 17696976
L5_S6_SNP : 17696977
L6_S1_SNP : 17696978
L6_S2_SNP : 17696979
L6_S3_SNP : 17696980
L6_S4_SNP : 17696981
L6_S5_SNP : 17696982
L6_S6_SNP : 17696983


submitted: Submitted batch job 17696979
submitted: Submitted batch job 17696980
submitted: Submitted batch job 17696981
submitted: Submitted batch job 17696982
submitted: Submitted batch job 17696983
