In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

####################################################################################################################################################################################

## [1] Create a script that calculates the number of homoplasies that occur within a global lineage (phylogenetic tree constructed from all isolates typed into a lineage)

####################################################################################################################################################################################

In [2]:
import os
import pandas as pd
import numpy as np
import sys

import Bio
from Bio.Alphabet import IUPAC
from Bio.Blast.Applications import NcbiblastnCommandline
from Bio.Blast import NCBIXML
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.SeqFeature import SeqFeature, FeatureLocation
from Bio import pairwise2
from Bio import SeqIO
from Bio.Graphics import GenomeDiagram
from Bio.SeqUtils import GC
from Bio import Phylo
from Bio.Align.Applications import MuscleCommandline
from StringIO import StringIO
from Bio import AlignIO
from Bio.Align import AlignInfo
from Bio.Seq import MutableSeq

from collections import Counter
from itertools import groupby
from operator import itemgetter

### Inputs

In [3]:
global_lineage = '1' #INPUT 1 - global lineage
spacer_length = 1 #INPUT 2 - length of spacer to define 'blocks' of allele types

### [1] Load INDEL genotype matrix and Annotation Files

In [4]:
#load isolate annotation file (columns of Genotype Matrix)
isolate_annotation_DF = pd.read_pickle('/n/data1/hms/dbmi/farhat/Roger/homoplasy_project/rolling_DB_scrape_indels/Genotypes_Filtered_2/genotypes_isolate_annotation.pkl')

#load INDEL annotation file (rows of Genotype Matrix) with gene annotation information
INDEL_annotation_DF = pd.read_pickle('/n/data1/hms/dbmi/farhat/Roger/homoplasy_project/rolling_DB_scrape_indels/Genotypes_Filtered_2/genotypes_INDEL_annotation.pkl')

#load Genotypes Matrix
genotypes_array =  np.load('/n/data1/hms/dbmi/farhat/Roger/homoplasy_project/rolling_DB_scrape_indels/Genotypes_Filtered_2/genotypes_matrix.npy')

Columns of Genotype Matrix

In [5]:
isolate_annotation_DF.head()

Unnamed: 0,isolate_ID,lineage_1,lineage_2,lineage_3,lineage_4,lineage_5,lineage_6,lineage_7,lineage_8,lineage_9,lineage_10,lineage_11,lineage_call,group
0,SAMN13051687,2,2,1,1.0,1.0,i3,,,,,,2.2.1.1.1.i3,2
1,SAMN09100245,4,2,1,2.0,1.0,1,i3,2.0,,,,4.2.1.2.1.1.i3.2,4B
2,SAMN08732238,2,2,1,1.0,1.0,,,,,,,2.2.1.1.1,2
3,SAMN07658260,3,1,1,,,,,,,,,3.1.1,3
4,SAMN03648003,2,2,1,1.0,1.0,,,,,,,2.2.1.1.1,2


In [6]:
np.shape(isolate_annotation_DF)

(31428, 14)

Rows of Genotype Matrix

In [7]:
INDEL_annotation_DF.head()

Unnamed: 0,key,pos,ref,alt,gene_id,gene_pos,gene_category,gene_name
0,ACCGACGAAG_313_A,313,ACCGACGAAG,A,Rv0001,312.0,Essential,dnaA
1,TC_1549_T,1549,TC,T,intergenic,,,
2,T_1552_TAA,1552,T,TAA,intergenic,,,
3,TAA_1552_T,1552,TAA,T,intergenic,,,
4,T_1552_TA,1552,T,TA,intergenic,,,


In [8]:
np.shape(INDEL_annotation_DF)

(47425, 8)

Genotype Matrix

In [9]:
genotypes_array

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int8)

In [10]:
np.shape(genotypes_array)

(47425, 31428)

### [2] Load in phylogeny created by Luca tree pipeline

We're going to use Biopython's *Phylo* module to load phylogenetic trees created by Luca

In [11]:
#path to tree
phylogeny_path = '/n/data1/hms/dbmi/farhat/Roger/homoplasy_project/phylogenies/tree_output_files/phylogeny_lineage_' + global_lineage + '/tree_lineage_' + global_lineage + '_iqtree_FINAL.treefile'

#parses and load tree
tree = Phylo.parse(phylogeny_path , 'newick').next() 

#root the tree with the outgroup M. canettii ["Normally you will want the outgroup to be a monophyletic group, rather than a single taxon."]
tree.root_with_outgroup({"name":"canettii"})

#flip branches so deeper clades are displayed at top
#(sort clades in-place according to the # of terminal nodes)
tree.ladderize()

'''
Can choose to root the tree at the midpoint instead
#root the tree at the midpoint ["Root the tree at the midpoint of the two most distant taxa."]
tree.root_at_midpoint()
'''

#retrieves the terminal branches of the tree
terminal_branches = tree.get_terminals() 

Number of branches on this tree

In [12]:
len(terminal_branches)

2816

Retreive the isolates that were used for this tree

In [13]:
isolate_tags_in_phylogeny = [terminal_branches[i].name for i in range(0 , len(terminal_branches))]

In [14]:
len(isolate_tags_in_phylogeny) #+1 for M. canetti

2816

### [3] Combine *genotypes matrix* and *phylogeny* (subset to intersecting isolate IDs)

#### Find the intersection of isolate tags from Phylogeny & Genotypes Matrix

In [15]:
isolate_tags_to_keep = list( set(isolate_annotation_DF.isolate_ID).intersection( set(isolate_tags_in_phylogeny) ) )

In [16]:
len(isolate_tags_to_keep)

2815

#### Subset *terminal branches* from phylogeny to those included in genotype matrix

In [17]:
isolate_tags_in_phylogeny_and_matrix_ordered = [isolate_tag for isolate_tag in isolate_tags_in_phylogeny if isolate_tag in isolate_tags_to_keep]

In [18]:
isolate_tags_in_phylogeny_and_matrix_ordered[0:5]

['SAMN06209986',
 'SAMEA1119746',
 'SAMEA3445265',
 'SAMEA1118021',
 'SAMEA2297133']

#### Subset *genotypes matrix* to only those isolates that are included in phylogeny

In [19]:
isolates_to_keep_filter = [isolate_tag in isolate_tags_to_keep for isolate_tag in isolate_annotation_DF.isolate_ID]

#filter isolate annotation df
isolate_annotation_DF = isolate_annotation_DF[isolates_to_keep_filter]
isolate_annotation_DF.reset_index(drop = True , inplace = True)

#filter genotypes matrix
genotypes_array = genotypes_array[ : , np.array(isolates_to_keep_filter)]                

In [20]:
isolate_annotation_DF.head(n=2)

Unnamed: 0,isolate_ID,lineage_1,lineage_2,lineage_3,lineage_4,lineage_5,lineage_6,lineage_7,lineage_8,lineage_9,lineage_10,lineage_11,lineage_call,group
0,SAMN06055874,1,1,1,1,1,,,,,,,1.1.1.1.1,1
1,SAMEA2785403,1,1,1,2,i1,,,,,,,1.1.1.2.i1,1


In [21]:
np.shape(isolate_annotation_DF)

(2815, 14)

In [22]:
INDEL_annotation_DF.head(n=2)

Unnamed: 0,key,pos,ref,alt,gene_id,gene_pos,gene_category,gene_name
0,ACCGACGAAG_313_A,313,ACCGACGAAG,A,Rv0001,312.0,Essential,dnaA
1,TC_1549_T,1549,TC,T,intergenic,,,


In [23]:
np.shape(INDEL_annotation_DF)

(47425, 8)

In [24]:
genotypes_array

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int8)

In [25]:
np.shape(genotypes_array)

(47425, 2815)

#### Drop INDELs where indel allele occurs in $<$ 1 isolate

**Note**: While this was originally run on the 50,260 x 31,428 genotypes matrix (before INDELs that were present in 0 isolates were dropped), this step would have removed any of the $50260-47425=2835$ INDEL variants that were harbored in 0 / 31,428 isolates

Many isolates were dropped in considering only isolates from this lineage. So if there were INDELs in which the alternate allele was present in 2 isolates and 1 was dropped (or both were dropped), then there is only 1 (or 0) isolate with the indel allele left in the *Genotypes Matrix*. Let's drop all INDELs in which the mutant allele is present in no isolate(s).

1. For each row of matrix (INDEL) we will count the number of **1's** (representing the number of isolates that support the INDEL allele)
1. We will discard all INDELs for which the mutant allele occurs in NO isolates

In [26]:
genotypes_matrix_INDEL_1_count = list( np.sum( genotypes_array == 1 , axis = 1 ) )
genotypes_matrix_INDEL_1_count = np.array(genotypes_matrix_INDEL_1_count)

In [27]:
genotypes_matrix_INDEL_1_count

array([0, 0, 0, ..., 0, 0, 0])

In [28]:
len(genotypes_matrix_INDEL_1_count)

47425

Number of INDELs that we're going to drop because the indel allele is present in *no* isolates.

In [29]:
np.sum( np.array( genotypes_matrix_INDEL_1_count ) < 1 )

38707

In [30]:
#boolean list that is 'True' for all INDEL where the allele is present in at least 1 isolate
INDELs_to_keep_filter = list( np.array( genotypes_matrix_INDEL_1_count ) >= 1 )

INDELs that had at least 1 isolate with INDEL allele

In [31]:
np.sum(INDELs_to_keep_filter)

8718

In [32]:
#filter INDEL annotation DF
INDEL_annotation_DF = INDEL_annotation_DF[INDELs_to_keep_filter]

#reindex INDEL annotation DF
INDEL_annotation_DF.reset_index(drop = True , inplace = True)

#filter Genotype Matrix 
genotypes_array = genotypes_array[np.array(INDELs_to_keep_filter) , : ]

In [33]:
np.shape(genotypes_array)

(8718, 2815)

In [34]:
np.shape(INDEL_annotation_DF)

(8718, 8)

In [35]:
INDEL_annotation_DF.head()

Unnamed: 0,key,pos,ref,alt,gene_id,gene_pos,gene_category,gene_name
0,T_1552_TA,1552,T,TA,intergenic,,,
1,TA_1552_T,1552,TA,T,intergenic,,,
2,TTCTC_1561_T,1561,TTCTC,T,intergenic,,,
3,TCC_1568_T,1568,TCC,T,intergenic,,,
4,CCGACCGACGT_1725_C,1725,CCGACCGACGT,C,intergenic,,,


#### Append column to INDEL annotation to define the INDEL allele (1: always support for the INDEL in the genotypes matrix)

In [36]:
INDEL_annotation_DF['indel_allele'] = np.shape(INDEL_annotation_DF)[0] * [1]

In [37]:
INDEL_annotation_DF.head()

Unnamed: 0,key,pos,ref,alt,gene_id,gene_pos,gene_category,gene_name,indel_allele
0,T_1552_TA,1552,T,TA,intergenic,,,,1
1,TA_1552_T,1552,TA,T,intergenic,,,,1
2,TTCTC_1561_T,1561,TTCTC,T,intergenic,,,,1
3,TCC_1568_T,1568,TCC,T,intergenic,,,,1
4,CCGACCGACGT_1725_C,1725,CCGACCGACGT,C,intergenic,,,,1


### [4] Search for evidence of homoplasy within the set of isolates using the terminal branches

#### Re-order the isolates according to the order of the terminal branches of the phylogeny

In [38]:
terminal_branch_isolate_order = []

for isolate_ID in isolate_tags_in_phylogeny_and_matrix_ordered:
    
    isolate_ID_index = isolate_annotation_DF[isolate_annotation_DF.isolate_ID == isolate_ID].index[0]
    terminal_branch_isolate_order.append(isolate_ID_index)

In [39]:
len(terminal_branch_isolate_order)

2815

Re-order the columns of the **Genotypes Array**

In [40]:
genotypes_array = genotypes_array[: , terminal_branch_isolate_order]

Re-order the rows of the **Isolate Annotation DF**

In [41]:
isolate_annotation_DF = isolate_annotation_DF.iloc[terminal_branch_isolate_order , :]

#reset the index
isolate_annotation_DF.reset_index(inplace = True, drop = True)

In [42]:
isolate_annotation_DF.head()

Unnamed: 0,isolate_ID,lineage_1,lineage_2,lineage_3,lineage_4,lineage_5,lineage_6,lineage_7,lineage_8,lineage_9,lineage_10,lineage_11,lineage_call,group
0,SAMN06209986,1,2,1,1,1.0,,,,,,,1.2.1.1.1,1
1,SAMEA1119746,1,2,1,1,2.0,,,,,,,1.2.1.1.2,1
2,SAMEA3445265,1,2,2,2,,,,,,,,1.2.2.2,1
3,SAMEA1118021,1,2,1,2,,,,,,,,1.2.1.2,1
4,SAMEA2297133,1,2,1,2,,,,,,,,1.2.1.2,1


#### Define function to calculate Homoplasy Score for each INDEL

For the purposes of this calculation, we will set (INDEL allele) = **minor allele** / (non-INDEL allele) = **major allele**

In [43]:
def calc_homoplasy_score_for_INDEL(INDEL_i):
    #get allele for this INDEL and create a variable for the "major" allele
    INDEL_i_annotation = INDEL_annotation_DF.loc[INDEL_i , :]
    INDEL_i_minor_allele = 1
    INDEL_i_major_allele = 0

    #get genotypes for INDEL
    INDEL_i_genotypes = genotypes_array[INDEL_i , :]

    #> Find all the subsets of continuous minor alleles within the terminal brances (including 9's)
    #> Create a boolean array that is *True* if INDEL_i was called as the minor allele or missing data for the isolate, and *False* if INDEL_i was called at the major allele
    INDEL_i_genotypes_bool = INDEL_i_genotypes != INDEL_i_major_allele

    #> Return the indices of the array where the elements are marked *True*
    INDEL_i_genotypes_indices_with_minor_allele = np.arange(0 , len(INDEL_i_genotypes_bool))[INDEL_i_genotypes_bool]

    #1. run through the indices of the isolates with the minor allele
    #2. for each consective *block* of isolates with the minor allele store the isolate ID indices in a seperate list
    #3. pull the genotypes from the indices and make sure there is at least a single isolate with the minor allele (it's not all just 9's)
    #4. for each block of Isolate indices, retrieve isolate IDs from annotation DF
    
    '''
    isolate_blocks_with_minor_allele = []
    #if consecutive isolates with minor allele have at least 1 isolate with major allele between them
    for k, g in groupby(enumerate(INDEL_i_genotypes_indices_with_minor_allele), lambda (i, x): i-x):

        #get a list of consecutive isolate indices that are located next to eachother on the phylogeny
        isolate_block_w_minor_allele_indices = map(itemgetter(1), g)

        #INDEL genotypes for isolates
        isolate_block_genotypes = INDEL_i_genotypes[isolate_block_w_minor_allele_indices]

        #check to see that at least 1 isolate had a good call for the minor allele within the block
        if INDEL_i_minor_allele in isolate_block_genotypes:

            #pull the isolate IDs for each index
            isolate_block_w_minor_allele_IDs = [{'name':isolate_annotation_DF.loc[isolate_index , 'isolate_ID']} for isolate_index in isolate_block_w_minor_allele_indices]

            #append to list of blocks of isolates w/ minor allele
            isolate_blocks_with_minor_allele.append(isolate_block_w_minor_allele_IDs)
    '''
    
    ########################################################################################################################
    isolate_blocks_with_minor_allele = []

    #if consecutive isolates with minor allele have at least (spacer_length) isolates with major allele (or other minor allele) between them
    isolate_block_w_minor_allele_indices = []
    for i in np.arange(0 , len(INDEL_i_genotypes_indices_with_minor_allele)-1):

        space_btwn_genotype_indices = INDEL_i_genotypes_indices_with_minor_allele[i+1] - INDEL_i_genotypes_indices_with_minor_allele[i]
        if space_btwn_genotype_indices <= spacer_length:

            isolate_block_w_minor_allele_indices.append(INDEL_i_genotypes_indices_with_minor_allele[i])

            #check if second to last index
            if i+1 == (len(INDEL_i_genotypes_indices_with_minor_allele)-1):

                isolate_block_w_minor_allele_indices.append(INDEL_i_genotypes_indices_with_minor_allele[i+1])

                #######################################################
                #INDEL genotypes for isolates
                isolate_block_genotypes = INDEL_i_genotypes[isolate_block_w_minor_allele_indices]

                #check to see that at least 1 isolate had a good call for the minor allele within the block
                if INDEL_i_minor_allele in isolate_block_genotypes:

                    #pull the isolate IDs for each index
                    isolate_block_w_minor_allele_IDs = [{'name':isolate_annotation_DF.loc[isolate_index , 'isolate_ID']} for isolate_index in isolate_block_w_minor_allele_indices]

                    #append to list of blocks of isolates w/ minor allele
                    isolate_blocks_with_minor_allele.append(isolate_block_w_minor_allele_IDs)
                #######################################################

        elif space_btwn_genotype_indices > spacer_length:

            isolate_block_w_minor_allele_indices.append(INDEL_i_genotypes_indices_with_minor_allele[i])

            #######################################################
            #INDEL genotypes for isolates
            isolate_block_genotypes = INDEL_i_genotypes[isolate_block_w_minor_allele_indices]

            #check to see that at least 1 isolate had a good call for the minor allele within the block
            if INDEL_i_minor_allele in isolate_block_genotypes:

                #pull the isolate IDs for each index
                isolate_block_w_minor_allele_IDs = [{'name':isolate_annotation_DF.loc[isolate_index , 'isolate_ID']} for isolate_index in isolate_block_w_minor_allele_indices]

                #append to list of blocks of isolates w/ minor allele
                isolate_blocks_with_minor_allele.append(isolate_block_w_minor_allele_IDs)
            #######################################################

            isolate_block_w_minor_allele_indices = []

            #check if second to last index
            if i+1 == (len(INDEL_i_genotypes_indices_with_minor_allele)-1):

                isolate_block_w_minor_allele_indices.append(INDEL_i_genotypes_indices_with_minor_allele[i+1])

                #######################################################
                #INDEL genotypes for isolates
                isolate_block_genotypes = INDEL_i_genotypes[isolate_block_w_minor_allele_indices]

                #check to see that at least 1 isolate had a good call for the minor allele within the block
                if INDEL_i_minor_allele in isolate_block_genotypes:

                    #pull the isolate IDs for each index
                    isolate_block_w_minor_allele_IDs = [{'name':isolate_annotation_DF.loc[isolate_index , 'isolate_ID']} for isolate_index in isolate_block_w_minor_allele_indices]

                    #append to list of blocks of isolates w/ minor allele
                    isolate_blocks_with_minor_allele.append(isolate_block_w_minor_allele_IDs)
                #######################################################
    ########################################################################################################################
    
    #> Find the MRCA clade of each block of isolates
    isolate_block_MRCAs = []
    for isolate_block_i_with_minor_allele in isolate_blocks_with_minor_allele:

        #["Most recent common ancestor (clade) of all the given targets."]
        MRCA_of_children = tree.common_ancestor(isolate_block_i_with_minor_allele)
        isolate_block_MRCAs.append(MRCA_of_children)

    '''
    #>Find the distance between all pairs of the MRCAs of the *blocks* of isolates with minor allele
    #>> Finding the distance between all possible pairs can be visualized/computed as taking the distance between all pairs in the upper triangular part of a matrix (excluding the diagonal)
    distance_between_MRCAs = []
    for MRCA_i , MRCA_j in zip( np.triu_indices(len(isolate_block_MRCAs), 1)[0] , np.triu_indices(len(isolate_block_MRCAs), 1)[1] ):

        #get the distance between MRCA node & the root (i.e. tree.distance({"name":'SAMN03647821'} , {"name":'SAMEA1485499'}))
        #["Calculate the sum of the branch lengths between two targets. If only one target is specified, the other is the root of this tree."]
        dist_between_MRCAi_and_MRCAj = tree.distance(isolate_block_MRCAs[MRCA_i] , isolate_block_MRCAs[MRCA_j])
        distance_between_MRCAs.append(dist_between_MRCAi_and_MRCAj)
    '''

    #> The resulting *homoplasy score* is equal to the number of blocks of isolates (how many times the mutation arose in the phylogeny) ### weighted by the average pairwise distance between the MRCAs for all of the blocks of isolates
    #check to see that there is at least 1 block (minor allele arose at least once on phylogeny)
    if len(isolate_block_MRCAs) >= 1:
        INDEL_i_homoplasy_score = float(len(isolate_block_MRCAs)) ###DELETE LATER### * np.mean(distance_between_MRCAs)
    elif len(isolate_block_MRCAs) == 0:
        INDEL_i_homoplasy_score = 0.0

    return INDEL_i_homoplasy_score

#### Calculate Homoplasy Score for all INDELs

In [44]:
INDEL_i_homoplasy_scores = []

for INDEL_i in INDEL_annotation_DF.index:
    
    INDEL_i_homoplasy_scores.append(calc_homoplasy_score_for_INDEL(INDEL_i))
    
    #keep track of progress
    if INDEL_i in np.arange(0 , len(INDEL_annotation_DF.index) , int(len(INDEL_annotation_DF.index)*0.05)):
        print float(INDEL_i) / float(len(INDEL_annotation_DF.index))

0.0
0.0498967653131
0.0997935306263
0.149690295939
0.199587061253
0.249483826566
0.299380591879
0.349277357192
0.399174122505
0.449070887818
0.498967653131
0.548864418445
0.598761183758
0.648657949071
0.698554714384
0.748451479697
0.79834824501
0.848245010323
0.898141775637
0.94803854095
0.997935306263


In [45]:
INDEL_annotation_DF['homoplasy_score'] = INDEL_i_homoplasy_scores

In [46]:
INDEL_annotation_DF.head(n=10)

Unnamed: 0,key,pos,ref,alt,gene_id,gene_pos,gene_category,gene_name,indel_allele,homoplasy_score
0,T_1552_TA,1552,T,TA,intergenic,,,,1,3.0
1,TA_1552_T,1552,TA,T,intergenic,,,,1,5.0
2,TTCTC_1561_T,1561,TTCTC,T,intergenic,,,,1,1.0
3,TCC_1568_T,1568,TCC,T,intergenic,,,,1,1.0
4,CCGACCGACGT_1725_C,1725,CCGACCGACGT,C,intergenic,,,,1,1.0
5,T_1815_TG,1815,T,TG,intergenic,,,,1,2.0
6,GA_2041_G,2041,GA,G,intergenic,,,,1,3.0
7,T_5068_TG,5068,T,TG,intergenic,,,,1,1.0
8,C_5075_CG,5075,C,CG,intergenic,,,,1,2.0
9,G_5076_GT,5076,G,GT,intergenic,,,,1,6.0


In [47]:
np.shape(INDEL_annotation_DF)

(8718, 10)

#### Drop INDELs with a *Homoplasy Score* of 0

In [48]:
INDEL_homoplasy_DF = INDEL_annotation_DF[INDEL_annotation_DF.homoplasy_score > 0.0]

In [49]:
np.shape(INDEL_homoplasy_DF)

(8364, 10)

### [5] Functional annotation for each INDEL

In [50]:
INDEL_homoplasy_DF.head()

Unnamed: 0,key,pos,ref,alt,gene_id,gene_pos,gene_category,gene_name,indel_allele,homoplasy_score
0,T_1552_TA,1552,T,TA,intergenic,,,,1,3.0
1,TA_1552_T,1552,TA,T,intergenic,,,,1,5.0
2,TTCTC_1561_T,1561,TTCTC,T,intergenic,,,,1,1.0
3,TCC_1568_T,1568,TCC,T,intergenic,,,,1,1.0
4,CCGACCGACGT_1725_C,1725,CCGACCGACGT,C,intergenic,,,,1,1.0


#### Drop columns

In [51]:
INDEL_homoplasy_DF = INDEL_homoplasy_DF.loc[: , ['key','pos','ref','alt','indel_allele','homoplasy_score']]

In [52]:
INDEL_homoplasy_DF.head()

Unnamed: 0,key,pos,ref,alt,indel_allele,homoplasy_score
0,T_1552_TA,1552,T,TA,1,3.0
1,TA_1552_T,1552,TA,T,1,5.0
2,TTCTC_1561_T,1561,TTCTC,T,1,1.0
3,TCC_1568_T,1568,TCC,T,1,1.0
4,CCGACCGACGT_1725_C,1725,CCGACCGACGT,C,1,1.0


#### Retrieve the INDEL type from the functional INDEL annotation

In [53]:
#load DataFrame with functional annotation for alternate allele
INDEL_functional_annotation_DF = pd.read_pickle('/n/data1/hms/dbmi/farhat/Roger/homoplasy_project/rolling_DB_scrape_indels/Genotypes_Filtered_2/genotypes_INDEL_functional_annotation.pkl')

In [54]:
INDEL_functional_annotation_DF.head()

Unnamed: 0_level_0,pos,ref,alt,gene_category,gene_name,gene_id,gene_pos,ins_del,INDEL_type,codon_pos
key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
ACCGACGAAG_313_A,313,ACCGACGAAG,A,Essential,dnaA,Rv0001,313.0,del,inframe,105.0
TC_1549_T,1549,TC,T,,,Rv0001_Rv0002,,del,frameshift,
T_1552_TAA,1552,T,TAA,,,Rv0001_Rv0002,,ins,frameshift,
TAA_1552_T,1552,TAA,T,,,Rv0001_Rv0002,,del,frameshift,
T_1552_TA,1552,T,TA,,,Rv0001_Rv0002,,ins,frameshift,


In [55]:
#these lists will hold variant annotation for each INDEL
gene_category_list = []
gene_name_list = []
gene_id_list = []
gene_pos_list = []
ins_del_list = []
INDEL_type_list = []
codon_pos_list = []

for INDEL_i_key in list(INDEL_homoplasy_DF.key):
    
    #append functional annotation for INDEL
    gene_category_list.append(INDEL_functional_annotation_DF.loc[INDEL_i_key , 'gene_category'])
    gene_name_list.append(INDEL_functional_annotation_DF.loc[INDEL_i_key , 'gene_name'])
    gene_id_list.append(INDEL_functional_annotation_DF.loc[INDEL_i_key , 'gene_id'])
    gene_pos_list.append(INDEL_functional_annotation_DF.loc[INDEL_i_key , 'gene_pos'])
    ins_del_list.append(INDEL_functional_annotation_DF.loc[INDEL_i_key , 'ins_del'])
    INDEL_type_list.append(INDEL_functional_annotation_DF.loc[INDEL_i_key , 'INDEL_type'])
    codon_pos_list.append(INDEL_functional_annotation_DF.loc[INDEL_i_key , 'codon_pos'])
    
#create new column for DF
INDEL_homoplasy_DF['gene_category'] = gene_category_list
INDEL_homoplasy_DF['gene_name'] = gene_name_list
INDEL_homoplasy_DF['gene_id'] = gene_id_list
INDEL_homoplasy_DF['gene_pos'] = gene_pos_list
INDEL_homoplasy_DF['ins_del'] = ins_del_list
INDEL_homoplasy_DF['INDEL_type'] = INDEL_type_list
INDEL_homoplasy_DF['codon_pos'] = codon_pos_list

In [56]:
INDEL_homoplasy_DF.head(n=10)

Unnamed: 0,key,pos,ref,alt,indel_allele,homoplasy_score,gene_category,gene_name,gene_id,gene_pos,ins_del,INDEL_type,codon_pos
0,T_1552_TA,1552,T,TA,1,3.0,,,Rv0001_Rv0002,,ins,frameshift,
1,TA_1552_T,1552,TA,T,1,5.0,,,Rv0001_Rv0002,,del,frameshift,
2,TTCTC_1561_T,1561,TTCTC,T,1,1.0,,,Rv0001_Rv0002,,del,frameshift,
3,TCC_1568_T,1568,TCC,T,1,1.0,,,Rv0001_Rv0002,,del,frameshift,
4,CCGACCGACGT_1725_C,1725,CCGACCGACGT,C,1,1.0,,,Rv0001_Rv0002,,del,frameshift,
5,T_1815_TG,1815,T,TG,1,2.0,,,Rv0001_Rv0002,,ins,frameshift,
6,GA_2041_G,2041,GA,G,1,3.0,,,Rv0001_Rv0002,,del,frameshift,
7,T_5068_TG,5068,T,TG,1,1.0,,,Rv0004_Rv0005,,ins,frameshift,
8,C_5075_CG,5075,C,CG,1,2.0,,,Rv0004_Rv0005,,ins,frameshift,
9,G_5076_GT,5076,G,GT,1,6.0,,,Rv0004_Rv0005,,ins,frameshift,


Pickle dataframe with homoplasy count for this lineage for downstream analysis

In [57]:
INDEL_homoplasy_DF.to_pickle('/n/data1/hms/dbmi/farhat/Roger/homoplasy_project/CSV_files/homoplasies_detected_in_global_lineages/INDEL homoplasies/pickled dataframes/spacer_{0}/homoplasy_scores_lineage_{1}.pkl'.format(str(spacer_length) , global_lineage))

Check the **top hits**

In [58]:
INDEL_homoplasy_DF.sort_values('homoplasy_score' , ascending = False).head(n = 40)

Unnamed: 0,key,pos,ref,alt,indel_allele,homoplasy_score,gene_category,gene_name,gene_id,gene_pos,ins_del,INDEL_type,codon_pos
7242,C_3794867_CCA,3794867,C,CCA,1,549.0,Non-Essential,dxs2,Rv3379c,1.0,ins,frameshift,1.0
3712,G_1894300_GGTCTTGCCGC,1894300,G,GGTCTTGCCGC,1,537.0,Non-Essential,Rv1668c,Rv1668c,1043.0,ins,frameshift,348.0
8104,CCCAATTCGTA_4254330_C,4254330,CCCAATTCGTA,C,1,523.0,,,Rv3798_Rv3799c,,del,frameshift,
4213,A_2137521_ACTCCGATCAC,2137521,A,ACTCCGATCAC,1,510.0,Non-Essential,Rv1888c,Rv1888c,559.0,ins,frameshift,187.0
6374,G_3296371_GCCGCGGC,3296371,G,GCCGCGGC,1,509.0,Non-Essential,pks15,Rv2947c,1470.0,ins,frameshift,490.0
6921,A_3610391_AC,3610391,A,AC,1,456.0,Non-Essential,tgs3,Rv3234c,799.0,ins,frameshift,267.0
2663,A_1313337_AG,1313337,A,AG,1,449.0,,,Rv1179c_Rv1180,,ins,frameshift,
7417,G_3895244_GGCC,3895244,G,GGCC,1,419.0,PE/PPE,PPE60,Rv3478,819.0,ins,inframe,273.0
225,C_79281_CCCACAT,79281,C,CCCACAT,1,403.0,,,Rv0070c_Rv0071,,ins,inframe,
533,AT_208316_A,208316,AT,A,1,392.0,Essential,Rv0176,Rv0176,865.0,del,frameshift,289.0


####################################################################################################################################################################################

## [2] Run script above for global lineages $\in \{1, 2, 3, 4A, 4B, 4C, 5, 6 \}$ & for Spacer Values $\in \{1, 2, 3, 4, 5, 6 \}$

####################################################################################################################################################################################

In [1]:
from slurmpy import Slurm
import os

### Submit jobs to collect homoplasies on phylogeny for each *Global Lineage* and each *Spacer Length*

In [2]:
for global_lineage in ['1' , '2' , '3' , '4A' , '4B' , '4C' , '5' , '6']:

    for spacer_length in ['1' , '2' , '3' , '4' , '5' , '6']:

        homoplasy_job = 'python /home/rv76/Farhat_Lab/Python_Scripts/homoplasy_project/INDEL_homoplasy_collection.py {0} {1}'.format(global_lineage , spacer_length)

        #directory where you want output + error files
        os.chdir('/n/data1/hms/dbmi/farhat/Roger/homoplasy_project/CSV_files/homoplasies_detected_in_global_lineages/INDEL homoplasies/homoplasy collection jobs/')

        job_name = 'L{0}_S{1}_INDEL'.format(global_lineage , spacer_length)

        s = Slurm(job_name , {'partition':'medium' , 'N':'1' , 't':'3-0:00:00' , 'mem':'16G' , 'mail-type':'FAIL' , 'mail-user':'roger_vargas@g.harvard.edu'})

        #submits the job
        job_id = s.run(homoplasy_job)

        print job_name  + ' : ' +  str(job_id)

submitted: Submitted batch job 17696985


L1_S1_INDEL : 17696985


submitted: Submitted batch job 17696986
submitted: Submitted batch job 17696987


L1_S2_INDEL : 17696986
L1_S3_INDEL : 17696987


submitted: Submitted batch job 17696988
submitted: Submitted batch job 17696989
submitted: Submitted batch job 17696990
submitted: Submitted batch job 17696991


L1_S4_INDEL : 17696988
L1_S5_INDEL : 17696989
L1_S6_INDEL : 17696990
L2_S1_INDEL : 17696991


submitted: Submitted batch job 17696992
submitted: Submitted batch job 17696993
submitted: Submitted batch job 17696994
submitted: Submitted batch job 17696995
submitted: Submitted batch job 17696996


L2_S2_INDEL : 17696992
L2_S3_INDEL : 17696993
L2_S4_INDEL : 17696994
L2_S5_INDEL : 17696995
L2_S6_INDEL : 17696996


submitted: Submitted batch job 17696997
submitted: Submitted batch job 17696998
submitted: Submitted batch job 17696999
submitted: Submitted batch job 17697000
submitted: Submitted batch job 17697001


L3_S1_INDEL : 17696997
L3_S2_INDEL : 17696998
L3_S3_INDEL : 17696999
L3_S4_INDEL : 17697000
L3_S5_INDEL : 17697001


submitted: Submitted batch job 17697002
submitted: Submitted batch job 17697003
submitted: Submitted batch job 17697004


L3_S6_INDEL : 17697002
L4A_S1_INDEL : 17697003
L4A_S2_INDEL : 17697004


submitted: Submitted batch job 17697005
submitted: Submitted batch job 17697006
submitted: Submitted batch job 17697007
submitted: Submitted batch job 17697008
submitted: Submitted batch job 17697009


L4A_S3_INDEL : 17697005
L4A_S4_INDEL : 17697006
L4A_S5_INDEL : 17697007
L4A_S6_INDEL : 17697008
L4B_S1_INDEL : 17697009


submitted: Submitted batch job 17697010
submitted: Submitted batch job 17697011
submitted: Submitted batch job 17697012
submitted: Submitted batch job 17697013
submitted: Submitted batch job 17697014


L4B_S2_INDEL : 17697010
L4B_S3_INDEL : 17697011
L4B_S4_INDEL : 17697012
L4B_S5_INDEL : 17697013
L4B_S6_INDEL : 17697014


submitted: Submitted batch job 17697015
submitted: Submitted batch job 17697016
submitted: Submitted batch job 17697017
submitted: Submitted batch job 17697018


L4C_S1_INDEL : 17697015
L4C_S2_INDEL : 17697016
L4C_S3_INDEL : 17697017
L4C_S4_INDEL : 17697018


submitted: Submitted batch job 17697019
submitted: Submitted batch job 17697020
submitted: Submitted batch job 17697021
submitted: Submitted batch job 17697022


L4C_S5_INDEL : 17697019
L4C_S6_INDEL : 17697020
L5_S1_INDEL : 17697021
L5_S2_INDEL : 17697022


submitted: Submitted batch job 17697023
submitted: Submitted batch job 17697024
submitted: Submitted batch job 17697025
submitted: Submitted batch job 17697026


L5_S3_INDEL : 17697023
L5_S4_INDEL : 17697024
L5_S5_INDEL : 17697025
L5_S6_INDEL : 17697026


submitted: Submitted batch job 17697027
submitted: Submitted batch job 17697028
submitted: Submitted batch job 17697029
submitted: Submitted batch job 17697030
submitted: Submitted batch job 17697031


L6_S1_INDEL : 17697027
L6_S2_INDEL : 17697028
L6_S3_INDEL : 17697029
L6_S4_INDEL : 17697030
L6_S5_INDEL : 17697031
L6_S6_INDEL : 17697032


submitted: Submitted batch job 17697032
