In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

#### This notebook was created to (a) assign lineage to each isolate via SNP-barcoding and (b) *re-filter* out SNPs with minor allele < 1 isolates

In [2]:
import vcf

%matplotlib inline
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import matplotlib.ticker as ticker
from itertools import compress
from pylab import MaxNLocator
import seaborn as sns; sns.set()
from matplotlib.colors import LogNorm
from matplotlib import gridspec
import ast
import itertools
import seaborn as sns
from sklearn.preprocessing import StandardScaler

import fastcluster
from sklearn import cluster, datasets
import scipy.cluster.hierarchy as hier
from sklearn.cluster import KMeans
import time
import sys
import pickle

import Bio
from Bio.Alphabet import IUPAC
from Bio.Blast.Applications import NcbiblastnCommandline
from Bio.Blast import NCBIXML
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.SeqFeature import SeqFeature, FeatureLocation
from Bio import pairwise2
from Bio import SeqIO
from Bio.Graphics import GenomeDiagram
from Bio.SeqUtils import GC

from Bio.Align.Applications import MuscleCommandline
from StringIO import StringIO
from Bio import AlignIO
from Bio.Align import AlignInfo
from Bio.Seq import MutableSeq
import itertools
import gzip

import networkx as nx
import scipy
import random

####################################################################################################################################################################################

## [1] Load SNP genotype matrix and Annotation Files

####################################################################################################################################################################################

In [3]:
#load isolate annotation file (columns of Genotype Matrix)
isolate_annotation_DF = pd.read_pickle('/n/data1/hms/dbmi/farhat/Roger/inhost_TB_dynamics_project/rolling_DB_scrape/Genotypes_Filtered/genotypes_isolate_annotation.pkl')

#load SNP annotation file (rows of Genotype Matrix) with gene annotation information
SNP_annotation_DF = pd.read_pickle('/n/data1/hms/dbmi/farhat/Roger/inhost_TB_dynamics_project/rolling_DB_scrape/Genotypes_Filtered/genotypes_SNP_annotation_with_gene_info.pkl')

#load Genotypes Matrix
genotypes_array =  np.load('/n/data1/hms/dbmi/farhat/Roger/inhost_TB_dynamics_project/rolling_DB_scrape/Genotypes_Filtered/genotypes_matrix.npy')

Columns of Genotype Matrix

In [4]:
isolate_annotation_DF.head()

0    SAMEA3558733
1    SAMN03648641
2    SAMN03647419
3    SAMEA3671418
4    SAMN07659096
dtype: object

In [5]:
np.shape(isolate_annotation_DF)

(32355,)

Rows of Genotype Matrix

In [6]:
SNP_annotation_DF.head()

Unnamed: 0,pos,ref,alt,gene_id,gene_pos,gene_category,gene_name
0,29,C,[G],[Rv0001],[28],[Essential],[dnaA]
1,31,A,"[C, G]",[Rv0001],[30],[Essential],[dnaA]
2,48,C,[T],[Rv0001],[47],[Essential],[dnaA]
3,64,G,[C],[Rv0001],[63],[Essential],[dnaA]
4,71,C,[T],[Rv0001],[70],[Essential],[dnaA]


In [7]:
np.shape(SNP_annotation_DF)

(139797, 7)

Genotype Matrix

In [8]:
genotypes_array

array([[1, 9, 1, ..., 1, 1, 1],
       [0, 0, 0, ..., 0, 0, 0],
       [1, 1, 1, ..., 1, 1, 1],
       ...,
       [2, 9, 2, ..., 2, 2, 2],
       [2, 9, 2, ..., 2, 2, 2],
       [1, 9, 1, ..., 1, 1, 1]])

In [9]:
np.shape(genotypes_array)

(139797, 32355)

####################################################################################################################################################################################

## [2] Import  62-SNP barcode

####################################################################################################################################################################################

#### Import 62-SNP barcode from Coll et al. 2014

In [10]:
#read the 62-SNP barcode table
barcode_csv_path = '/n/data1/hms/dbmi/farhat/Roger/homoplasy_project/CSV_files/62_SNP_barcode_from_Coll_et_al.csv'
all_barcode_diagnostic_SNPs = pd.read_csv(barcode_csv_path)

#fix encoding issue: trim surrounding whitespace and remove the stray non-breaking-space bytes from each text column
for text_column in ['Position' , 'Allele Change' , 'lineage']:
    stripped_values = all_barcode_diagnostic_SNPs[text_column].str.strip()
    all_barcode_diagnostic_SNPs[text_column] = stripped_values.str.replace('\xc2\xa0','')

#renumber the rows from 0
all_barcode_diagnostic_SNPs = all_barcode_diagnostic_SNPs.reset_index(drop = True)

In [11]:
all_barcode_diagnostic_SNPs.head()

Unnamed: 0,Global lineage,lineage,Position,Gene coord,Allele Change,Codon number,Codon change,Amino acid change,Locus Id
0,1,lineage1,615938,1104,G/A,368,GAG/GAA,E/E,Rv0524
1,1,lineage1.1,4404247,1056,G/A,352,CTG/CTA,L/L,Rv3915
2,1,lineage1.1.1,3021283,711,G/A,237,CGG/CGA,R/R,Rv2707
3,1,lineage1.1.1.1,3216553,339,G/A,113,GTC/GTT,V/V,Rv2907c
4,1,lineage1.1.2,2622402,51,G/A,17,GCC/GCT,A/A,Rv2343c


In [12]:
np.shape(all_barcode_diagnostic_SNPs)

(62, 9)

#### Subset DataFrame to SNPs that we have present in our Genotypes Matrix

In [13]:
#reference positions present in our Genotypes Matrix; built once as a set so each barcode SNP lookup is a constant-time
#membership test instead of rebuilding and scanning the full position list on every iteration
genotyped_positions = set( SNP_annotation_DF.pos.tolist() )

#row labels of the barcode SNPs (out of 62) whose reference position is present in our dataset
barcode_SNPs_in_Genotypes_Matrix = [SNP_i for SNP_i in all_barcode_diagnostic_SNPs.index if int( all_barcode_diagnostic_SNPs.loc[SNP_i , 'Position'] ) in genotyped_positions]

#subset to SNPs that we have in Genotypes Matrix (explicit copy so that the column assignment below
#modifies an independent DataFrame rather than a slice of all_barcode_diagnostic_SNPs)
barcode_diagnostic_SNPs = all_barcode_diagnostic_SNPs.loc[barcode_SNPs_in_Genotypes_Matrix , :].copy()

#convert Position column to 'int'
barcode_diagnostic_SNPs['Position'] = barcode_diagnostic_SNPs['Position'].apply(int)

In [14]:
barcode_diagnostic_SNPs.head()

Unnamed: 0,Global lineage,lineage,Position,Gene coord,Allele Change,Codon number,Codon change,Amino acid change,Locus Id
0,1,lineage1,615938,1104,G/A,368,GAG/GAA,E/E,Rv0524
1,1,lineage1.1,4404247,1056,G/A,352,CTG/CTA,L/L,Rv3915
2,1,lineage1.1.1,3021283,711,G/A,237,CGG/CGA,R/R,Rv2707
3,1,lineage1.1.1.1,3216553,339,G/A,113,GTC/GTT,V/V,Rv2907c
4,1,lineage1.1.2,2622402,51,G/A,17,GCC/GCT,A/A,Rv2343c


The original 62-SNP barcode (a subset of 413 diagnostic SNPs) outlined in Coll et al. 2014 corresponds to 62 branches. We have 62 of the 62 SNPs in our Genotypes Matrix.

In [15]:
np.shape(barcode_diagnostic_SNPs)

(62, 9)

#### Format DataFrame with 62 lineage-defining SNPs

In [16]:
#keep only the barcode columns that are needed downstream
barcode_SNP_annot = barcode_diagnostic_SNPs.loc[: , ['lineage' , 'Position' , 'Allele Change' , 'Gene coord' , 'Locus Id'] ]

#split each 'ref/alt' string once, then store the Reference & Alternate alleles in their own columns
ref_alt_pairs = [allele_change.split('/') for allele_change in barcode_SNP_annot['Allele Change'] ]
barcode_SNP_annot['ref'] = [ref_alt_pair[0] for ref_alt_pair in ref_alt_pairs]
barcode_SNP_annot['alt'] = [ref_alt_pair[1] for ref_alt_pair in ref_alt_pairs]

#drop the now-unnecessary Allele Change column and fix the column order
ordered_columns = ['lineage' , 'Position' , 'ref' , 'alt' , 'Gene coord' , 'Locus Id']
barcode_SNP_annot = barcode_SNP_annot[ordered_columns]

#rename reference position column (index=str also turns the row labels into strings)
barcode_SNP_annot = barcode_SNP_annot.rename(index=str, columns={"Position": "pos"})

In [17]:
np.shape(barcode_SNP_annot)

(62, 6)

In [18]:
barcode_SNP_annot.head()

Unnamed: 0,lineage,pos,ref,alt,Gene coord,Locus Id
0,lineage1,615938,G,A,1104,Rv0524
1,lineage1.1,4404247,G,A,1056,Rv3915
2,lineage1.1.1,3021283,G,A,711,Rv2707
3,lineage1.1.1.1,3216553,G,A,339,Rv2907c
4,lineage1.1.2,2622402,G,A,51,Rv2343c


Get rid of the word 'lineage'

In [19]:
barcode_SNP_annot.lineage = [lineage.split('lineage')[1] for lineage in list( barcode_SNP_annot.lineage ) ]

In [20]:
barcode_SNP_annot.head()

Unnamed: 0,lineage,pos,ref,alt,Gene coord,Locus Id
0,1,615938,G,A,1104,Rv0524
1,1.1,4404247,G,A,1056,Rv3915
2,1.1.1,3021283,G,A,711,Rv2707
3,1.1.1.1,3216553,G,A,339,Rv2907c
4,1.1.2,2622402,G,A,51,Rv2343c


#### Create a column that contains the diagnostic allele and another that contains the base code for the diagnostic allele

__Note__: The diagnostic allele is the Reference Allele (not the Alternate Allele) for these two SNPs

In [21]:
barcode_SNP_annot[barcode_SNP_annot.pos == 931123]

Unnamed: 0,lineage,pos,ref,alt,Gene coord,Locus Id
20,4**,931123,T,C,171,Rv0835


In [22]:
#replace the '4**' label with '4'; the row is selected by its reference position rather than by a hard-coded
#row label, so the fix does not depend on the row order of the barcode table
barcode_SNP_annot.loc[barcode_SNP_annot.pos == 931123 , 'lineage'] = '4'

In [23]:
barcode_SNP_annot[barcode_SNP_annot.pos == 1759252]

Unnamed: 0,lineage,pos,ref,alt,Gene coord,Locus Id
56,4.9**,1759252,G,T,1572,Rv1552


In [24]:
#replace the '4.9**' label with '4.9'; the row is selected by its reference position rather than by a hard-coded
#row label, so the fix does not depend on the row order of the barcode table
barcode_SNP_annot.loc[barcode_SNP_annot.pos == 1759252 , 'lineage'] = '4.9'

In [25]:
#numeric code used for each base
base_to_code_map = {'A':0 , 'C':1 , 'G':2 , 'T':3}

#reference positions of the two SNPs for which the Reference allele (not the Alternate allele) is diagnostic
ref_is_diagnostic_positions = [931123 , 1759252]

#pick the diagnostic allele for every barcode SNP: the alternate allele by default, the reference allele for the two exceptions
diag_allele = []
for SNP_pos , SNP_ref , SNP_alt in zip(barcode_SNP_annot.pos , barcode_SNP_annot.ref , barcode_SNP_annot.alt):
    diag_allele.append(SNP_ref if SNP_pos in ref_is_diagnostic_positions else SNP_alt)

#translate each diagnostic allele into its base code
diag_allele_code = [base_to_code_map[allele] for allele in diag_allele]

#append columns to barcode SNP DF
barcode_SNP_annot['diag_allele'] = diag_allele
barcode_SNP_annot['diag_allele_code'] = diag_allele_code

In [26]:
barcode_SNP_annot.tail()

Unnamed: 0,lineage,pos,ref,alt,Gene coord,Locus Id,diag_allele,diag_allele_code
57,5,1799921,C,A,339,Rv1599,A,0
58,6,1816587,C,G,399,Rv1617,G,2
59,7,1137518,G,A,543,Rv1018c,A,0
60,BOV,2831482,A,G,1110,Rv2515c,G,2
61,BOV_AFRI,1882180,C,T,477,Rv1662,T,3


####################################################################################################################################################################################

## [3] Assign Isolates to Lineage and Sub-lineages based off of SNP barcode

####################################################################################################################################################################################

#### Subset Genotypes matrix to SNP barcode SNPs

In [27]:
#boolean list (one entry per row of the Genotypes Matrix) that is 'True' for SNPs whose reference position is in the SNP barcode;
#a single vectorised isin() replaces rebuilding the list of barcode positions for every row of the SNP annotation
barcode_SNPs_filter = SNP_annotation_DF.pos.isin( list(barcode_SNP_annot.pos) ).tolist()

#subset SNP annotation to the annotation for just the SNP barcode
barcode_SNPs_in_genotypes_matrix = SNP_annotation_DF[barcode_SNPs_filter]

In [28]:
barcode_SNPs_in_genotypes_matrix.head()

Unnamed: 0,pos,ref,alt,gene_id,gene_pos,gene_category,gene_name
3408,62657,G,[A],[Rv0058],[2261],[Essential],[dnaB]
5347,107794,C,[T],[Rv0098],[194],[Essential],[fcoT]
14350,346693,G,[T],[Rv0284],[1058],[Essential],[eccC3]
14656,355181,G,[A],[Rv0291],[683],[Antigen],[mycP3]
16286,403364,G,"[A, C]",[Rv0338c],[171],[Essential],[Rv0338c]


In [29]:
np.shape( barcode_SNPs_in_genotypes_matrix )

(62, 7)

#### Merge lineage/sub-lineage, diagnostic allele and diagnostic allele codes (from the barcode SNP annotation above) information with barcode SNPs annotation

In [30]:
#attach the lineage label and the diagnostic allele (base + numeric code) to each barcode SNP by matching on reference position
#NOTE(review): the lineage-calling loop further down compares these rows position-by-position against the rows of the
#barcode-subset Genotypes Matrix, so it relies on this inner merge keeping the left-hand row order - confirm
barcode_SNPs_in_genotypes_matrix = barcode_SNPs_in_genotypes_matrix.merge(barcode_SNP_annot.loc[: , ['diag_allele' , 'diag_allele_code' , 'lineage' , 'pos']] , how = 'inner' , on = 'pos')

In [31]:
barcode_SNPs_in_genotypes_matrix.head()

Unnamed: 0,pos,ref,alt,gene_id,gene_pos,gene_category,gene_name,diag_allele,diag_allele_code,lineage
0,62657,G,[A],[Rv0058],[2261],[Essential],[dnaB],A,0,4.1
1,107794,C,[T],[Rv0098],[194],[Essential],[fcoT],T,3,4.1.2.1
2,346693,G,[T],[Rv0284],[1058],[Essential],[eccC3],T,3,2.2.2
3,355181,G,[A],[Rv0291],[683],[Antigen],[mycP3],A,0,4.4.1.1
4,403364,G,"[A, C]",[Rv0338c],[171],[Essential],[Rv0338c],A,0,4.3.3


#### Subset Genotypes Matrix to barcode SNPs

In [32]:
genotypes_array_barcode_SNPs = genotypes_array[ np.array(barcode_SNPs_filter) , :]

In [33]:
np.shape(genotypes_array_barcode_SNPs)

(62, 32355)

In [34]:
genotypes_array_barcode_SNPs

array([[2, 2, 2, ..., 2, 9, 2],
       [1, 1, 1, ..., 1, 1, 1],
       [2, 2, 2, ..., 2, 2, 2],
       ...,
       [2, 2, 2, ..., 2, 2, 2],
       [2, 2, 2, ..., 2, 2, 2],
       [2, 2, 2, ..., 2, 2, 2]])

### Iterate through each isolate and assign a lineage/sub-lineage based off of the most specific lineage call made for that isolate

In [35]:
#create DataFrame to store the lineage call for each isolate
#(one row per isolate, one column per level of the lineage hierarchy; no data is passed in, so every cell starts out empty)
lineage_calls_for_isolates_DF = pd.DataFrame(index = isolate_annotation_DF.index , columns = ['lineage_1' , 'lineage_2' , 'lineage_3' , 'lineage_4' , 'lineage_5'])

#NOTE(review): isolate_i is used both as an index label and as a positional index (array column / .iloc row),
#which assumes the isolate annotation has a default 0..N-1 index - confirm
for isolate_i in list(isolate_annotation_DF.index):

    #base codes of this isolate at each of the barcode SNPs
    genotypes_for_isolate = genotypes_array_barcode_SNPs[: , isolate_i]

    #find Lineage(s) types by the isolate (match the genotypes for the isolate to the diagnostic alleles from the SNP barcode and get lineages for matching alleles)
    lineage_calls = list( barcode_SNPs_in_genotypes_matrix[genotypes_for_isolate == barcode_SNPs_in_genotypes_matrix.diag_allele_code.values].lineage )

    #sort by length of element...global lineage first, then sublineage, then sub-sub-lineage
    #NOTE(review): specificity is judged by string length only, so labels of equal length (e.g. 'BOV' and an 'X.Y' sub-lineage)
    #tie and keep their original relative order; conflicting calls are not detected here
    lineage_calls = sorted(lineage_calls , key=len)

    #keep the 'most specific' lineage call if one exists
    #(isolates with no matching diagnostic allele keep an all-empty row)
    if len(lineage_calls) > 0:

        #longest label = last element after the sort above
        most_specific_lineage_call = lineage_calls[-1]

        #iterate through the lineage & sublineages for most specific lineage call and fill in DF

        #check Bovis
        if most_specific_lineage_call == 'BOV': 
            lineage_calls_for_isolates_DF.iloc[isolate_i , 0] = 'BOV'

        #check Bovis-Africa
        elif most_specific_lineage_call == 'BOV_AFRI': 
            #BOV_AFRI is stored in the second column (lineage_2)
            lineage_calls_for_isolates_DF.iloc[isolate_i , 1] = 'BOV_AFRI'

            #check to see if global lineage was also called
            #(the second-longest call, if there is one, is stored in the first column as the global lineage)
            try: 
                lineage_calls_for_isolates_DF.iloc[isolate_i , 0] = lineage_calls[-2]

            except IndexError: #BOV_AFRI sublineage was called but Global Lineage was not
                #leave lineage_1 empty and move on to the next isolate
                continue

        #any other lineage call can be split into the global lineage & sub-lineages
        else:
            #e.g. '4.3.4.2' -> ['4' , '3' , '4' , '2']
            most_specific_lineage_call_split = most_specific_lineage_call.split('.')

            #iterate through lineage & sub-lineages and fill in DF
            #(the n-th component goes into the n-th lineage column)
            column_i = 0
            for lineage_call in most_specific_lineage_call_split:
                lineage_calls_for_isolates_DF.iloc[isolate_i , column_i] = lineage_call
                column_i += 1

In [36]:
lineage_calls_for_isolates_DF.head()

Unnamed: 0,lineage_1,lineage_2,lineage_3,lineage_4,lineage_5
0,4,3,4,2.0,
1,4,4,1,1.0,
2,3,1,1,,
3,4,3,2,,
4,1,1,3,,


How many Isolates did not have a high-confidence lineage call (no lineage_1/global lineage designation)?

In [37]:
#count the isolates whose first column (lineage_1) is missing; these have no global lineage call*
## (*possible to have a sub-lineage 'BOV_AFRI' with no global lineage but we're dropping these)
#a vectorised null test is used instead of an identity comparison ('is np.nan'), which only recognises
#that one NaN object and would silently miss any other missing-value representation
isolate_without_lineage_calls = int( lineage_calls_for_isolates_DF.lineage_1.isnull().sum() )

In [38]:
print isolate_without_lineage_calls

322


These isolates probably failed to get categorized at the global lineage level because the (global) lineage-defining SNP/Base Call did not meet the Alternate Allele calling filters (BQ, MQ, indel, depth, etc.)

#### Append Isolate Annotation Series to Lineage/Sub-lineage assignments

In [39]:
lineage_calls_for_isolates_DF['isolate_ID'] = list( isolate_annotation_DF )
isolate_annotation_DF = lineage_calls_for_isolates_DF.copy()

In [40]:
isolate_annotation_DF.head()

Unnamed: 0,lineage_1,lineage_2,lineage_3,lineage_4,lineage_5,isolate_ID
0,4,3,4,2.0,,SAMEA3558733
1,4,4,1,1.0,,SAMN03648641
2,3,1,1,,,SAMN03647419
3,4,3,2,,,SAMEA3671418
4,1,1,3,,,SAMN07659096


In [41]:
np.shape(isolate_annotation_DF)

(32355, 6)

####################################################################################################################################################################################

## [4] *FILTER*: Drop Isolates without a Global Lineage Call or that had more than 1 Global Lineage Call

####################################################################################################################################################################################

#### Only keep isolates with a single 'Global Lineage call' (many global lineages were actually inferred from more specific sub-lineage defining SNPs)

Filter out isolates with no lineage calls

In [42]:
#boolean list (one entry per isolate) that is 'True' for all isolates with no global lineage call,
#i.e. isolates whose first column (lineage_1) is missing*
## (*possible to have a sub-lineage 'BOV_AFRI' with no global lineage but we're dropping these)
#a vectorised null test is used instead of an identity comparison ('is np.nan'), which only recognises
#that one NaN object and would silently miss any other missing-value representation
isolates_no_lineage_calls_filter = lineage_calls_for_isolates_DF.lineage_1.isnull().tolist()

In [43]:
np.sum(isolates_no_lineage_calls_filter) #drop these isolates

322

In [44]:
with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
    print(isolate_annotation_DF[isolates_no_lineage_calls_filter])

      lineage_1 lineage_2 lineage_3 lineage_4 lineage_5      isolate_ID
146         NaN       NaN       NaN       NaN       NaN    SAMN02419670
503         NaN       NaN       NaN       NaN       NaN    SAMN06578971
818         NaN       NaN       NaN       NaN       NaN    SAMN02403498
981         NaN       NaN       NaN       NaN       NaN    SAMN07344639
1082        NaN       NaN       NaN       NaN       NaN    SAMEA1016092
1127        NaN       NaN       NaN       NaN       NaN    SAMEA3558231
1153        NaN       NaN       NaN       NaN       NaN    SAMEA1119333
1244        NaN       NaN       NaN       NaN       NaN    SAMN08379718
1302        NaN       NaN       NaN       NaN       NaN    SAMEA3420861
1356        NaN       NaN       NaN       NaN       NaN    SAMEA1903091
1372        NaN       NaN       NaN       NaN       NaN    SAMN08629131
1394        NaN       NaN       NaN       NaN       NaN    SAMEA1569344
1418        NaN       NaN       NaN       NaN       NaN    SAMEA

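Create a boolean filter of all isolates to retain (isolates that received a global lineage call). Note that the code below only excludes isolates without a global lineage call; it does not separately test whether an isolate matched more than one global lineage.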

In [45]:
isolates_to_keep_filter = [ not no_lineage_SNP for no_lineage_SNP in isolates_no_lineage_calls_filter ]

In [46]:
np.sum(isolates_to_keep_filter)

32033

In [47]:
#boolean mask over the columns (isolates) of the Genotype Matrix
isolates_to_keep_mask = np.array(isolates_to_keep_filter)

#filter Isolate annotation DF and renumber its rows from 0
isolate_annotation_DF = isolate_annotation_DF[isolates_to_keep_filter].reset_index(drop = True)

#filter Genotype Matrix (keep only the columns of retained isolates)
genotypes_array = genotypes_array[: , isolates_to_keep_mask ]

In [48]:
isolate_annotation_DF.head()

Unnamed: 0,lineage_1,lineage_2,lineage_3,lineage_4,lineage_5,isolate_ID
0,4,3,4,2.0,,SAMEA3558733
1,4,4,1,1.0,,SAMN03648641
2,3,1,1,,,SAMN03647419
3,4,3,2,,,SAMEA3671418
4,1,1,3,,,SAMN07659096


In [49]:
np.shape(isolate_annotation_DF)

(32033, 6)

In [50]:
np.shape(genotypes_array)

(139797, 32033)

#### Number of isolates present in each global lineage

In [51]:
from collections import Counter
lineage_count = pd.Series(Counter(isolate_annotation_DF.lineage_1))

In [52]:
lineage_count

1       2802
2       8072
3       3354
4      17420
5        101
6         97
7         35
BOV      152
dtype: int64

In [53]:
int( lineage_count.sum() )

32033

####################################################################################################################################################################################

## [5] *FILTER*: Drop Isolates that were used in *Longitudinal* Pairs

####################################################################################################################################################################################

### Drop isolates belonging to any clinical isolates analyzed for in-host analysis (i.e. longitudinal isolates) in sample annotation

Annotation for all isolates used in in-host analysis (N = 400 isolates, presumably 200 longitudinal pairs; see the length check below)

In [54]:
#sample annotation for all longitudinal isolates in study
sample_annotation = pd.read_csv('/n/data1/hms/dbmi/farhat/Roger/inhost_TB_dynamics_project/CSV_files/sample_annotation_files/Longitudinal_fastq_path_names_and_JankyPipe_tags_filtered_final.csv' , sep  = ',').set_index('patient_id')
isolate_IDs_within_host_analysis = list( sample_annotation.tag )

In [55]:
len(isolate_IDs_within_host_analysis)

400

Create a boolean filter of all isolates to retain (isolates that were not used in in-host analysis)

In [56]:
#isolate IDs used in the in-host analysis, held in a set so that each membership test is constant-time
#instead of a scan through the whole list for every isolate
within_host_isolate_IDs = set(isolate_IDs_within_host_analysis)

#boolean list (one entry per isolate) that is 'True' for isolates to keep, i.e. isolates NOT used in the in-host analysis
isolates_to_keep_filter = [isolate_ID not in within_host_isolate_IDs for isolate_ID in list(isolate_annotation_DF.isolate_ID)]

In [57]:
np.sum( isolates_to_keep_filter ) #number of isolates that were NOT included in in-host analysis

31933

In [58]:
#boolean mask over the columns (isolates) of the Genotype Matrix
isolates_to_keep_mask = np.array(isolates_to_keep_filter)

#filter Isolate annotation DF and renumber its rows from 0
isolate_annotation_DF = isolate_annotation_DF[isolates_to_keep_filter].reset_index(drop = True)

#filter Genotype Matrix (keep only the columns of retained isolates)
genotypes_array = genotypes_array[: , isolates_to_keep_mask ]

In [59]:
isolate_annotation_DF.head()

Unnamed: 0,lineage_1,lineage_2,lineage_3,lineage_4,lineage_5,isolate_ID
0,4,3,4,2.0,,SAMEA3558733
1,4,4,1,1.0,,SAMN03648641
2,3,1,1,,,SAMN03647419
3,4,3,2,,,SAMEA3671418
4,1,1,3,,,SAMN07659096


In [60]:
np.shape(isolate_annotation_DF)

(31933, 6)

In [61]:
np.shape(genotypes_array)

(139797, 31933)

#### Number of isolates present in each global lineage

In [62]:
lineage_count = pd.Series(Counter(isolate_annotation_DF.lineage_1))

In [63]:
lineage_count

1       2802
2       8064
3       3352
4      17330
5        101
6         97
7         35
BOV      152
dtype: int64

In [64]:
int( lineage_count.sum() )

31933

####################################################################################################################################################################################

## [6] *FILTER*: Downsample Isolates in Lineage 2 and 4; Drop Isolates in Lineage BOV and 7

####################################################################################################################################################################################

### Drop isolates that were typed as Lineage *BOV* or Lineage *7*

In [65]:
isolates_IDs_BOV_to_drop = list( isolate_annotation_DF[isolate_annotation_DF.lineage_1 == 'BOV'].isolate_ID )
isolates_IDs_7_to_drop = list( isolate_annotation_DF[isolate_annotation_DF.lineage_1 == '7'].isolate_ID )

In [66]:
len(isolates_IDs_BOV_to_drop)

152

In [67]:
len(isolates_IDs_7_to_drop)

35

### Downsample L2 isolates: Randomly choose $1,064 = 8,064 - 7,000$ isolates typed as Lineage 2 to *drop* without replacement

In [72]:
isolate_IDs_2_all = list( isolate_annotation_DF[isolate_annotation_DF.lineage_1 == '2'].isolate_ID )

In [73]:
len(isolate_IDs_2_all)

8064

In [74]:
#number of Lineage 2 isolates to retain after down-sampling
N_LINEAGE_2_TO_KEEP = 7000

#seed for the down-sampling draw; sampling from a dedicated seeded generator means the same isolates are dropped
#on every run (an unseeded random.sample picks a different subset each time the notebook is executed)
LINEAGE_2_DOWNSAMPLE_SEED = 0

#number to drop is derived from the current count (8,064 - 7,000 = 1,064 for this dataset) and never goes below zero
n_lineage_2_to_drop = max( len(isolate_IDs_2_all) - N_LINEAGE_2_TO_KEEP , 0 )

#randomly choose the isolates to drop, without replacement
isolates_IDs_2_to_drop = random.Random(LINEAGE_2_DOWNSAMPLE_SEED).sample(isolate_IDs_2_all , n_lineage_2_to_drop)

In [75]:
len(isolates_IDs_2_to_drop)

1064

### Downsample L4 isolates: Randomly choose $10,330 = 17,330 - 7,000$ isolates typed as Lineage 4 to *drop* without replacement

In [76]:
isolate_IDs_4_all = list( isolate_annotation_DF[isolate_annotation_DF.lineage_1 == '4'].isolate_ID )

In [77]:
len(isolate_IDs_4_all)

17330

In [78]:
#number of Lineage 4 isolates to retain after down-sampling
N_LINEAGE_4_TO_KEEP = 7000

#seed for the down-sampling draw; sampling from a dedicated seeded generator means the same isolates are dropped
#on every run (an unseeded random.sample picks a different subset each time the notebook is executed)
LINEAGE_4_DOWNSAMPLE_SEED = 0

#number to drop is derived from the current count (17,330 - 7,000 = 10,330 for this dataset) and never goes below zero
n_lineage_4_to_drop = max( len(isolate_IDs_4_all) - N_LINEAGE_4_TO_KEEP , 0 )

#randomly choose the isolates to drop, without replacement
isolates_IDs_4_to_drop = random.Random(LINEAGE_4_DOWNSAMPLE_SEED).sample(isolate_IDs_4_all , n_lineage_4_to_drop)

In [79]:
len(isolates_IDs_4_to_drop)

10330

### Create boolean filter for isolates to drop 

In [80]:
isolates_IDs_to_drop = isolates_IDs_BOV_to_drop + isolates_IDs_7_to_drop + isolates_IDs_2_to_drop + isolates_IDs_4_to_drop

In [81]:
len(isolates_IDs_to_drop)

11581

Filter out isolates that were typed as *LBOV*, *L7*, or were randomly chosen to be dropped from *L2* and *L4*

In [82]:
#isolate IDs to drop, held in a set so that each membership test is constant-time instead of a scan
#through the full list of IDs for every isolate
isolate_IDs_to_drop_set = set(isolates_IDs_to_drop)

#boolean list (one entry per isolate) that is 'True' for all isolates to drop
isolates_to_drop_filter = [isolate_ID in isolate_IDs_to_drop_set for isolate_ID in isolate_annotation_DF.isolate_ID]

In [83]:
sum(isolates_to_drop_filter)

11581

### Create boolean filter for isolates to keep

In [84]:
isolates_to_keep_filter = [not isolate_to_drop for isolate_to_drop in isolates_to_drop_filter]

In [85]:
np.sum(isolates_to_keep_filter)

20352

In [86]:
#boolean mask over the columns (isolates) of the Genotype Matrix
isolates_to_keep_mask = np.array(isolates_to_keep_filter)

#filter Isolate annotation DF and renumber its rows from 0
isolate_annotation_DF = isolate_annotation_DF[isolates_to_keep_filter].reset_index(drop = True)

#filter Genotype Matrix (keep only the columns of retained isolates)
genotypes_array = genotypes_array[: , isolates_to_keep_mask ]

In [87]:
isolate_annotation_DF.head()

Unnamed: 0,lineage_1,lineage_2,lineage_3,lineage_4,lineage_5,isolate_ID
0,3,1,1,,,SAMN03647419
1,1,1,3,,,SAMN07659096
2,4,1,1,,,SAMEA1119683
3,2,2,1,,,SAMN05576173
4,2,2,1,,,SAMN08612836


In [88]:
np.shape(isolate_annotation_DF)

(20352, 6)

In [89]:
np.shape(genotypes_array)

(139797, 20352)

#### Number of isolates present in each global lineage

In [90]:
lineage_count = pd.Series(Counter(isolate_annotation_DF.lineage_1))
lineage_count

1    2802
2    7000
3    3352
4    7000
5     101
6      97
dtype: int64

In [91]:
int( lineage_count.sum() )

20352

####################################################################################################################################################################################

## [7] *FILTER*: Drop SNPs where *minor* allele(s) occurs in $< 1$ isolates

####################################################################################################################################################################################

Isolates were subset from the larger group of 32k isolates. So if there were SNPs in which the alternate allele was present in 6 isolates and all 6 were dropped, then there are no isolates with the alternate allele left in the *Genotypes Matrix*. Let's drop all SNPs in which the minor (reference or alternate) allele is present in fewer than 1 isolate.

1. For each row of matrix (SNP) we will count the number of **0's**, **1's**, **2's** and **3's**.
1. For each SNP we will take the max( number of 0's , number of 1's , number of 2's , number of 3's ) to get a count of the **major allele**
1. For each SNP we will sum up the total number of isolates with a designated allele (not a 9), to get the count of **non-missing** calls
1. We will then subtract the **count of major alleles** from the **number of non-missing calls** to get the number of isolates with the minor allele(s) for each SNP.
1. We will discard all SNPs for which the minor allele(s) occurs in LESS THAN 1 isolate

In [92]:
#for every SNP (row), count how many isolates carry each base code: 0 = A , 1 = C , 2 = G , 3 = T
per_base_counts = [ np.sum( genotypes_array == base_code , axis = 1 ) for base_code in (0 , 1 , 2 , 3) ]
genotypes_matrix_SNP_0_count , genotypes_matrix_SNP_1_count , genotypes_matrix_SNP_2_count , genotypes_matrix_SNP_3_count = [ list(base_counts) for base_counts in per_base_counts ]

#Non-Missing Data: every call that is not coded as 9
genotypes_matrix_SNP_non_missing_count = np.sum( genotypes_array != 9 , axis = 1 )

#count of the major allele for each SNP = largest of its four per-base counts
genotypes_matrix_major_allele_count = np.maximum.reduce(per_base_counts)

#Number of isolates with the minor allele(s) for each SNP
genotypes_matrix_minor_alleles_count = genotypes_matrix_SNP_non_missing_count - genotypes_matrix_major_allele_count

In [93]:
genotypes_matrix_minor_alleles_count

array([  8,   7,   5, ..., 197,  17, 484])

In [94]:
len(genotypes_matrix_minor_alleles_count)

139797

#### Number of SNPs that we're going to drop because the minor (alternate or reference) allele(s) is present in $< 1$ isolates.

In [95]:
np.sum( np.array( genotypes_matrix_minor_alleles_count ) < 1 )

10899

In [96]:
#boolean list that is 'True' for all SNPs where the minor allele is present in at least 1 isolate
SNPs_to_keep_filter = list( np.array( genotypes_matrix_minor_alleles_count ) >= 1 )

SNPs that had at least 1 isolate with minor allele

In [97]:
np.sum(SNPs_to_keep_filter)

128898

In [98]:
#boolean mask over the rows (SNPs) of the Genotype Matrix
SNPs_to_keep_mask = np.array(SNPs_to_keep_filter)

#filter SNP annotation DF and renumber its rows from 0
SNP_annotation_DF = SNP_annotation_DF[SNPs_to_keep_filter].reset_index(drop = True)

#filter Genotype Matrix (keep only the rows of retained SNPs)
genotypes_array = genotypes_array[SNPs_to_keep_mask , : ]

In [99]:
np.shape(SNP_annotation_DF)

(128898, 7)

In [100]:
np.shape(genotypes_array)

(128898, 20352)

####################################################################################################################################################################################

## [8] Save annotation files and genotypes matrix

####################################################################################################################################################################################

### Now that we've assigned lineages and have *dropped* isolates and have *dropped* SNPs, we save the updated Genotypes Matrix and corresponding annotation files.

#### Isolate Annotation

In [101]:
isolate_annotation_DF.head()

Unnamed: 0,lineage_1,lineage_2,lineage_3,lineage_4,lineage_5,isolate_ID
0,3,1,1,,,SAMN03647419
1,1,1,3,,,SAMN07659096
2,4,1,1,,,SAMEA1119683
3,2,2,1,,,SAMN05576173
4,2,2,1,,,SAMN08612836


In [102]:
np.shape(isolate_annotation_DF)

(20352, 6)

In [103]:
#save isolate annotation file (columns of Genotype Matrix)
isolate_annotation_DF.to_pickle('/n/data1/hms/dbmi/farhat/Roger/inhost_TB_dynamics_project/genotypes_matrix_and_tSNE/Genotype_Filtered_2/genotypes_isolate_annotation.pkl')

#### SNP Annotation

In [104]:
SNP_annotation_DF.head()

Unnamed: 0,pos,ref,alt,gene_id,gene_pos,gene_category,gene_name
0,29,C,[G],[Rv0001],[28],[Essential],[dnaA]
1,31,A,"[C, G]",[Rv0001],[30],[Essential],[dnaA]
2,48,C,[T],[Rv0001],[47],[Essential],[dnaA]
3,64,G,[C],[Rv0001],[63],[Essential],[dnaA]
4,71,C,[T],[Rv0001],[70],[Essential],[dnaA]


In [105]:
np.shape(SNP_annotation_DF)

(128898, 7)

In [106]:
#save SNP annotation file (rows of Genotype Matrix)
SNP_annotation_DF.to_pickle('/n/data1/hms/dbmi/farhat/Roger/inhost_TB_dynamics_project/genotypes_matrix_and_tSNE/Genotype_Filtered_2/genotypes_SNP_annotation.pkl')

#### Genotype Matrix

In [107]:
genotypes_array

array([[1, 1, 1, ..., 9, 1, 1],
       [0, 0, 0, ..., 9, 0, 0],
       [1, 1, 1, ..., 1, 1, 1],
       ...,
       [2, 2, 2, ..., 2, 2, 2],
       [2, 2, 2, ..., 2, 2, 2],
       [1, 1, 1, ..., 1, 1, 1]])

In [108]:
np.shape(genotypes_array)

(128898, 20352)

In [109]:
#save Genotypes Matrix
np.save('/n/data1/hms/dbmi/farhat/Roger/inhost_TB_dynamics_project/genotypes_matrix_and_tSNE/Genotype_Filtered_2/genotypes_matrix' , genotypes_array , allow_pickle = True)

####################################################################################################################################################################################

## [9] Create DataFrame with the functional annotation for SNPs & corresponding Amino Acid Changes

####################################################################################################################################################################################

#### This block of code will determine the functional consequence of each SNP in our genotypes array.

In [110]:
#load SNP annotation file (rows of Genotype Matrix)
#this is the same path that the filtered SNP annotation was pickled to earlier in this notebook
SNP_annotation_DF = pd.read_pickle('/n/data1/hms/dbmi/farhat/Roger/inhost_TB_dynamics_project/genotypes_matrix_and_tSNE/Genotype_Filtered_2/genotypes_SNP_annotation.pkl')

In [111]:
SNP_annotation_DF.head()

Unnamed: 0,pos,ref,alt,gene_id,gene_pos,gene_category,gene_name
0,29,C,[G],[Rv0001],[28],[Essential],[dnaA]
1,31,A,"[C, G]",[Rv0001],[30],[Essential],[dnaA]
2,48,C,[T],[Rv0001],[47],[Essential],[dnaA]
3,64,G,[C],[Rv0001],[63],[Essential],[dnaA]
4,71,C,[T],[Rv0001],[70],[Essential],[dnaA]


In [112]:
np.shape(SNP_annotation_DF)

(128898, 7)

#### Create a dataframe with one row for each unique _reference_ allele - _alternate_ allele pairing (SNPs that are not biallelic will require multiple rows)

In [113]:
#accumulators for the columns of the per-mutation DF (one entry per Reference Position / Alternate Allele pair)
SNP_index = []
pos_list = []
ref_list = []
alt_list = []
gene_category_list = []
gene_name_list = []

#walk the annotation columns in parallel, one tuple per SNP (row of the Genotypes Matrix)
SNP_annotation_columns = zip(SNP_annotation_DF['pos'] ,
                             SNP_annotation_DF['ref'] ,
                             SNP_annotation_DF['alt'] ,
                             SNP_annotation_DF['gene_category'] ,
                             SNP_annotation_DF['gene_name'])

for ref_position , ref_allele , alt_alleles , gene_categories_at_pos , gene_names_at_pos in SNP_annotation_columns:

    #bi-allelic SNPs contribute a single row, multi-allelic SNPs contribute one row per alternate allele
    for alt_allele in alt_alleles:

        #key for a particular mutation: Reference Position + '_' + Alternate Allele
        SNP_index.append(str(ref_position) + '_' + alt_allele)

        pos_list.append(ref_position)
        ref_list.append(ref_allele)
        alt_list.append(alt_allele)
        gene_category_list.append(gene_categories_at_pos)
        gene_name_list.append(gene_names_at_pos)

#assemble the DF column by column (order of the pairs below fixes the column order)
SNP_functional_annotation_DF = pd.DataFrame(index = SNP_index)
for column_name , column_values in [('pos' , pos_list) ,
                                    ('ref' , ref_list) ,
                                    ('alt' , alt_list) ,
                                    ('gene_category' , gene_category_list) ,
                                    ('gene_name' , gene_name_list)]:
    SNP_functional_annotation_DF[column_name] = column_values

In [114]:
SNP_functional_annotation_DF.head()

Unnamed: 0,pos,ref,alt,gene_category,gene_name
29_G,29,C,G,[Essential],[dnaA]
31_C,31,A,C,[Essential],[dnaA]
31_G,31,A,G,[Essential],[dnaA]
48_T,48,C,T,[Essential],[dnaA]
64_C,64,G,C,[Essential],[dnaA]


In [115]:
np.shape(SNP_functional_annotation_DF)

(152836, 5)

Convert **gene_category** and **gene_name** columns so that the elements are lists only when there is more than 1 label in a cell (when a Reference Position corresponds to two overlapping genes)

In [116]:
#rebuild the gene_category / gene_name columns, unwrapping lists that hold exactly one label
gene_category_list = []
gene_name_list = []

for category_labels , name_labels in zip(SNP_functional_annotation_DF['gene_category'] , SNP_functional_annotation_DF['gene_name']):

    #the length of the gene_category cell decides whether BOTH cells are unwrapped
    has_single_label = len(category_labels) == 1

    gene_category_list.append(category_labels[0] if has_single_label else category_labels)
    gene_name_list.append(name_labels[0] if has_single_label else name_labels)

SNP_functional_annotation_DF['gene_category'] = gene_category_list
SNP_functional_annotation_DF['gene_name'] = gene_name_list

In [117]:
SNP_functional_annotation_DF.head()

Unnamed: 0,pos,ref,alt,gene_category,gene_name
29_G,29,C,G,Essential,dnaA
31_C,31,A,C,Essential,dnaA
31_G,31,A,G,Essential,dnaA
48_T,48,C,T,Essential,dnaA
64_C,64,G,C,Essential,dnaA


In [118]:
np.shape(SNP_functional_annotation_DF)

(152836, 5)

### Cell to Annotate SNPs

In [119]:
# Important Packages
################################################################################################################################################################################################
import os
import pandas as pd
import numpy as np
import sys
import pickle

import Bio
from Bio.Alphabet import IUPAC
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.SeqFeature import SeqFeature, FeatureLocation
from Bio import SeqIO
from StringIO import StringIO
from Bio import AlignIO
from Bio.Align import AlignInfo
from Bio.Seq import MutableSeq
################################################################################################################################################################################################


# Relevant Information for H37Rv sequence SNP functional annotation
################################################################################################################################################################################################
####### Collect all DNA and Amino Acid sequences corresponding to genes on H37Rv #######
#load reference genome and reference annotation
#the FASTA path gets its own name so that the loop variable below (a SeqRecord) does not overwrite it
reference_genome_path = '/n/data1/hms/dbmi/farhat/bin/work-horse/bin/h37rv.fasta'

#reference_genome ends up bound to the last record parsed from the FASTA file
for reference_genome in SeqIO.parse(reference_genome_path, "fasta"):
    reference_genome.seq.alphabet = IUPAC.unambiguous_dna

#tab-delimited genome summary indexed by gene name; sep is passed by keyword (positional sep is rejected by newer pandas)
reference_genome_annotation = pd.read_csv('/n/data1/hms/dbmi/farhat/Roger/inhost_TB_dynamics_project/H37Rv/h37rv_genome_summary.txt', sep = '\t').set_index('name')

####### Function to translate coding DNA sequences ####### 
def translate(gene_id, sequence):
    '''
    Translate a coding DNA sequence into a protein sequence.

    The strand of the gene is looked up in reference_genome_annotation; a sequence
    belonging to a gene on the '-' strand is reverse complemented before translation.
    Translation uses the "Bacterial" codon table with cds=False.

    gene_id  : row label of the gene in reference_genome_annotation
    sequence : sequence object providing translate() and reverse_complement()

    Returns the translated protein sequence.
    Raises ValueError if the annotated strand is neither '+' nor '-'.
    '''

    #find which strand the gene is located on
    strand = reference_genome_annotation.loc[gene_id, 'strand']

    if strand == '+':
        coding_sequence = sequence
    elif strand == '-':
        coding_sequence = sequence.reverse_complement()
    else:
        #fail with an explicit message instead of an unbound local variable further down
        raise ValueError("unrecognized strand %r for gene %s" % (strand, gene_id))

    return coding_sequence.translate(table="Bacterial", cds=False)

####### Load in dictionaries for SNP annotation #######
#NOTE: pickle.load can execute arbitrary code, these files must come from a trusted location
#gene id -> record holding the reference DNA sequence of the gene (accessed through .seq)
with open('/n/data1/hms/dbmi/farhat/Roger/homoplasy_project/pickled_files/dicts_for_SNP_annotation/H37Rv_gene_seq_records.pickle', 'rb') as pickle_file:
    ref_gene_sequences_records = pickle.load(pickle_file)

#gene id -> record holding the reference protein sequence of the gene (accessed through .seq)
with open('/n/data1/hms/dbmi/farhat/Roger/homoplasy_project/pickled_files/dicts_for_SNP_annotation/H37Rv_protein_seq_records.pickle', 'rb') as pickle_file:
    ref_protein_sequences_records = pickle.load(pickle_file)

#reference position -> list of genes covering that position (empty list for intergenic positions)
with open('/n/data1/hms/dbmi/farhat/Roger/homoplasy_project/pickled_files/dicts_for_SNP_annotation/H37Rv_coord_gene_mapping.pickle', 'rb') as pickle_file:
    ReferencePosition_Gene_mapping = pickle.load(pickle_file)

####### Gene Categories: gene id -> category #######
gene_categories = pd.read_csv('/n/data1/hms/dbmi/farhat/Roger/inhost_TB_dynamics_project/CSV_files/gene_categories/gene_categories.csv').set_index('name')
gene_categories_dict = dict(zip(gene_categories['gene_id'] , gene_categories['Gene_Category']))

####### Gene Symbols: gene id (index of the reference annotation) -> symbol #######
gene_symbol_dict = dict(zip(reference_genome_annotation['symbol'].index , reference_genome_annotation['symbol']))
################################################################################################################################################################################################


# Function to annotate Intergenic SNPs
################################################################################################################################################################################################
def find_flanking_genes_for_intergenic_region(intergenic_ref_pos): 
    '''
    Find the genes flanking an intergenic region given a reference position.

    Starting at intergenic_ref_pos, ReferencePosition_Gene_mapping is scanned
    position by position (at most 100000 positions per direction) until a position
    with a non-empty gene list is reached; the first gene listed there is used.

    intergenic_ref_pos : reference position located in an intergenic region

    Returns '<gene in the 5' direction>_<gene in the 3' direction>'.
    Raises ValueError if no gene is found within the search window on either side.
    A KeyError raised by the mapping while scanning in the 5' direction is not handled.
    '''

    max_search_distance = 100000

    #find gene immediately in the 5' direction
    gene_to_left = None
    for offset in range(0 , max_search_distance):

        #move toward 5' direction
        genes_at_position = ReferencePosition_Gene_mapping[intergenic_ref_pos - offset]
        if genes_at_position != []:
            gene_to_left = genes_at_position[0]
            break

    #find gene immediately in the 3' direction
    gene_to_right = None
    for offset in range(0 , max_search_distance):

        #move toward 3' direction
        try:
            genes_at_position = ReferencePosition_Gene_mapping[intergenic_ref_pos + offset]

        #KeyError means we have hit the 'end' of the chromosome, the intergenic region at the end of H37Rv in 5' > 3' orientation
        #since TB chromosome is circular the gene to the 'right' is Rv0001
        except KeyError:
            gene_to_right = 'Rv0001'
            break

        if genes_at_position != []:
            gene_to_right = genes_at_position[0]
            break

    #without this check a failed search would surface as an unbound variable on the return line
    if gene_to_left is None or gene_to_right is None:
        raise ValueError("no flanking gene found within %d positions of reference position %s" % (max_search_distance , intergenic_ref_pos))

    return gene_to_left + '_' + gene_to_right
################################################################################################################################################################################################


# Function to determine whether SNPs are Synonymous or Non-Synonymous; Returns gene coordinate, codon position, AA changes, Gene Category & Symbol
################################################################################################################################################################################################
def SNP_annotate(ref_seq_position , alt_allele_i):
    
    '''
    Annotate a single base change on H37Rv.
    
    Inputs
        ref_seq_position : reference position on H37Rv (counted from 1)
        alt_allele_i     : alternate allele at that position; anything other than
                           'A', 'C', 'G' or 'T' is treated as a dummy allele
    
    Behaviour
        - Position mapped to one or more genes: for every gene the alternate allele is
          substituted into the reference gene sequence, the sequence is translated and
          the result is compared against the H37Rv protein sequence. SNP type is 'S'
          when the protein is unchanged and 'N' otherwise; the AA change is reported as
          <reference AA><codon number><alternate AA>. Dummy alleles get 'None' for both.
        - Position mapped to no gene: SNP type is 'I', the gene id is the pair of
          flanking genes and every other field is the string 'None'.
    
    Returns
        [ref allele , gene id , gene coordinate , gene category , gene symbol , SNP type , AA change]
        The gene coordinate is counted from 1 in the direction the gene is transcribed.
        When the position maps to more than one gene, every field after the ref allele
        is a list with one entry per gene.
    '''
    #one entry per gene (or a single entry for an intergenic position)
    gene_ids = []
    gene_coords = []
    categories = []
    symbols = []
    SNP_types = []
    AA_changes = []
    
    #Reference Allele from the complete H37Rv reference genome (python indexing starts from 0)
    ref_allele_i = reference_genome.seq[int(ref_seq_position) - 1] 
    
    #genes associated with this H37Rv coordinate (empty when the position is intergenic)
    genes_at_position = ReferencePosition_Gene_mapping[ref_seq_position]
    
    if len(genes_at_position) > 0:
        
        #a Reference Position may be mapped to more than one (overlapping) gene
        for gene_id in genes_at_position:
            
            gene_start = reference_genome_annotation.loc[gene_id , 'chromStart']
            gene_end = reference_genome_annotation.loc[gene_id , 'chromEnd']
            
            #0-based offset of the SNP within the stored gene sequence
            offset_in_gene = (ref_seq_position - 1) - min(gene_start , gene_end)
            
            #1-based coordinate relative to the gene, in the 5' to 3' direction of transcription
            strand = reference_genome_annotation.loc[gene_id , 'strand']
            if strand == '+':
                coord_5_to_3 = ref_seq_position - gene_start
            elif strand == '-':
                coord_5_to_3 = gene_end - (ref_seq_position - 1)
            
            #gene category & symbol default to 'None' when the gene has no entry
            category = gene_categories_dict.get(gene_id , 'None')
            symbol = gene_symbol_dict.get(gene_id , 'None')
            
            if alt_allele_i in ['A','C','G','T']:
                
                #place the alternate allele into a mutable copy of the reference gene sequence
                mutated_gene = ref_gene_sequences_records[gene_id].seq.tomutable()
                mutated_gene[int(offset_in_gene)] = alt_allele_i
                
                #translate the mutated gene and fetch the H37Rv protein to compare against
                mutated_protein = translate(gene_id , mutated_gene.toseq())
                reference_protein = ref_protein_sequences_records[gene_id].seq
                
                #codon number = ceiling of (5' to 3' gene coordinate / 3)
                codon_number = int(np.ceil( float(coord_5_to_3) / 3.0 ))
                
                #identical protein sequence -> Synonymous, otherwise Non-Synonymous
                SNP_type = 'S' if mutated_protein == reference_protein else 'N'
                
                #AA before + codon number + AA after
                AA_change = reference_protein[codon_number-1] + str(codon_number) + mutated_protein[codon_number-1]
            
            else:
                
                #dummy alternate allele (Base Call completely supports the Reference Allele)
                SNP_type = 'None'
                AA_change = 'None'
            
            gene_ids.append(gene_id)
            gene_coords.append(coord_5_to_3)
            categories.append(category)
            symbols.append(symbol)
            SNP_types.append(SNP_type)
            AA_changes.append(AA_change)
    
    else:
        
        #no gene in H37Rv corresponds to the Reference Position, SNP must be intergenic
        gene_ids.append(find_flanking_genes_for_intergenic_region(ref_seq_position))
        gene_coords.append('None')
        categories.append('None')
        symbols.append('None')
        SNP_types.append('I')
        AA_changes.append('None')
    
    #a single annotation is returned as individual elements
    if len(gene_ids) == 1:
        return [ref_allele_i , gene_ids[0] , gene_coords[0] , categories[0] , symbols[0] , SNP_types[0] , AA_changes[0]]
    
    #several genes at this position: return one list per field
    elif len(gene_ids) > 1:
        return [ref_allele_i , gene_ids , gene_coords , categories , symbols , SNP_types , AA_changes]
################################################################################################################################################################################################

#### Iterate through each unique Reference Allele - Alternate Allele pair and get relevant annotation

In [120]:
#create lists for each new column of the SNP functional annotation DF
#one list per new column of the SNP functional annotation DF
gene_intergenic_id_list = []
gene_pos_list = []
SNP_type_list = []
AA_change_list = []

for SNP_key in SNP_functional_annotation_DF.index:
    
    #index key is <Reference Position>_<Alternate Allele>
    ref_pos_text , alt_base = SNP_key.split('_')
    
    #SNP_annotate returns [ref allele , gene id , gene pos , gene category , gene symbol , SNP type , AA change]
    #only the gene id, gene pos, SNP type and AA change are kept below
    ref_base , gene_id_or_ids , gene_pos , category_unused , symbol_unused , SNP_type , AA_change = SNP_annotate(int(ref_pos_text) , alt_base)
    
    gene_intergenic_id_list.append(gene_id_or_ids)
    gene_pos_list.append(gene_pos)
    SNP_type_list.append(SNP_type)
    AA_change_list.append(AA_change)
    
#append the new columns (order of the pairs below fixes the column order)
for column_name , column_values in [('gene_id' , gene_intergenic_id_list) ,
                                    ('gene_pos' , gene_pos_list) ,
                                    ('SNP_type' , SNP_type_list) ,
                                    ('AA_change' , AA_change_list)]:
    SNP_functional_annotation_DF[column_name] = column_values



In [121]:
np.shape(SNP_functional_annotation_DF)

(152836, 9)

In [122]:
SNP_functional_annotation_DF.head()

Unnamed: 0,pos,ref,alt,gene_category,gene_name,gene_id,gene_pos,SNP_type,AA_change
29_G,29,C,G,Essential,dnaA,Rv0001,29,N,T10S
31_C,31,A,C,Essential,dnaA,Rv0001,31,N,T11P
31_G,31,A,G,Essential,dnaA,Rv0001,31,N,T11A
48_T,48,C,T,Essential,dnaA,Rv0001,48,S,V16V
64_C,64,G,C,Essential,dnaA,Rv0001,64,N,G22R


#### Pickle functional SNP annotation for downstream analysis

__Note__: 

The gene position/coordinate in the original SNP annotation file (whose rows map to the rows of the Genotypes Matrix) starts counting from 0 and counts 5' > 3' regardless of which strand the gene is transcribed from; this is to accommodate the functional SNP annotation. 

The gene position/coordinate in the functional SNP annotation (above) starts counting from 1 and counts in the direction that the gene is transcribed, which is more intuitive for reporting purposes.

In [123]:
#pickle the functional SNP annotation (one row per Reference Position / Alternate Allele pair) for downstream analysis
SNP_functional_annotation_DF.to_pickle('/n/data1/hms/dbmi/farhat/Roger/inhost_TB_dynamics_project/genotypes_matrix_and_tSNE/Genotype_Filtered_2/genotypes_SNP_functional_annotation.pkl')