In [1]:
#IPython.display is the public import location for display() and HTML (IPython.core.display is an internal module)
from IPython.display import display, HTML
#stretch the notebook container to the full browser width so wide DataFrames are readable
display(HTML("<style>.container { width:100% !important; }</style>"))

##########################################################################################################################################################################################################################################

## *Functions* for retrieving SNPs between each pair of *longitudinal* samples (SNPs with $\geq 25\%$ $\Delta$ AF)

##########################################################################################################################################################################################################################################

In [1]:
import vcf

%matplotlib inline
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import matplotlib.ticker as ticker
from pylab import plot, show, savefig, xlim, figure, hold, ylim, legend, boxplot, setp, axes
from itertools import compress
from pylab import MaxNLocator
import seaborn as sns; sns.set()
from matplotlib.colors import LogNorm
from matplotlib import gridspec
from matplotlib.gridspec import GridSpec
import ast
import itertools
import seaborn as sns
from sklearn.preprocessing import StandardScaler

import fastcluster
from sklearn import cluster, datasets
import scipy.cluster.hierarchy as hier
from sklearn.cluster import KMeans
import time
import sys

import Bio
from Bio.Alphabet import IUPAC
from Bio.Blast.Applications import NcbiblastnCommandline
from Bio.Blast import NCBIXML
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.SeqFeature import SeqFeature, FeatureLocation
from Bio import pairwise2
from Bio import SeqIO
from Bio.Graphics import GenomeDiagram
from Bio.SeqUtils import GC

from Bio.Align.Applications import MuscleCommandline
from StringIO import StringIO
from Bio import AlignIO
from Bio.Align import AlignInfo
from Bio.Seq import MutableSeq
import itertools

import networkx as nx
import scipy
import pickle

#for exporting to Adobe Illustrator
mpl.rcParams['pdf.fonttype'] = 42
mpl.rcParams['ps.fonttype'] = 42

### Decide on a threshold for difference in Alternate Allele Frequencies to call SNPs between two isolates

In [2]:
alt_AF_diff_threshold = 0.25 #paired Base Calls whose alternate allele frequencies differ by less than this value (0.25 = 25%) are dropped

### Load regions to exclude from analysis per EBR score across H37Rv (dropping sites with EBR score < 0.8)

In [3]:
#read the pickled collection of H37Rv reference positions that are excluded from the analysis
with open('/n/data1/hms/dbmi/farhat/Roger/inhost_TB_dynamics_project/pickled_files/variant_calling/H37Rv_sites_to_drop.pkl', 'rb') as drop_sites_handle:

    #keep the positions as a set so that membership tests are fast
    H37Rv_positions_to_drop = set(pickle.load(drop_sites_handle))

### *Cell* to annotate SNPs

In [4]:
# Important Packages
################################################################################################################################################################################################
import os
import pandas as pd
import numpy as np
import sys
import pickle

import Bio
from Bio.Alphabet import IUPAC
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.SeqFeature import SeqFeature, FeatureLocation
from Bio import SeqIO
from StringIO import StringIO
from Bio import AlignIO
from Bio.Align import AlignInfo
from Bio.Seq import MutableSeq
################################################################################################################################################################################################


# Relevant Information for H37Rv sequence SNP functional annotation
################################################################################################################################################################################################
####### Collect all DNA and Amino Acid sequences corresponding to genes on H37Rv #######
#load reference genome and reference annotation
reference_genome = '/n/data1/hms/dbmi/farhat/bin/work-horse/bin/h37rv.fasta'
#the name `reference_genome` is rebound from the FASTA path to each parsed record in turn, so it ends up holding the last record in the file
for reference_genome in SeqIO.parse(reference_genome, "fasta"):
    reference_genome.seq.alphabet = IUPAC.unambiguous_dna

#tab-delimited gene annotation, indexed by the 'name' column
#the separator is passed by keyword: newer pandas releases no longer accept it as a positional argument
reference_genome_annotation = pd.read_csv('/n/data1/hms/dbmi/farhat/Roger/inhost_TB_dynamics_project/H37Rv/h37rv_genome_summary.txt', sep = '\t').set_index('name')

####### Function to translate coding DNA sequences ####### 
def translate(gene_id, sequence):

    '''
    Translate the DNA sequence of a gene into a protein sequence.

    The strand of the gene is looked up in reference_genome_annotation
    (column 'strand'); sequences of genes on the '-' strand are reverse
    complemented before translation. Translation uses the "Bacterial" codon
    table with cds=False.

    gene_id  : row label of the gene in reference_genome_annotation
    sequence : sequence object providing translate() and reverse_complement()

    Returns the translated protein sequence.
    Raises ValueError if the annotated strand is neither '+' nor '-'.
    '''

    #find which strand the gene is located on
    strand = reference_genome_annotation.loc[gene_id, 'strand']

    if strand == '+':
        coding_sequence = sequence
    elif strand == '-':
        coding_sequence = sequence.reverse_complement()
    else:
        #fail with a clear message instead of an UnboundLocalError at the return statement
        raise ValueError("unexpected strand %r annotated for gene %r (expected '+' or '-')" % (strand, gene_id))

    return coding_sequence.translate(table="Bacterial", cds=False)

####### Load in dictionaries for SNP annotation #######
#gene ID -> record holding the gene's DNA sequence (accessed through .seq by SNP_annotate)
with open('/n/data1/hms/dbmi/farhat/Roger/inhost_TB_dynamics_project/pickled_files/dicts_for_SNP_annotation/H37Rv_gene_seq_records.pickle', 'rb') as gene_seq_handle:
    ref_gene_sequences_records = pickle.load(gene_seq_handle)

#gene ID -> record holding the gene's protein sequence (accessed through .seq by SNP_annotate)
with open('/n/data1/hms/dbmi/farhat/Roger/inhost_TB_dynamics_project/pickled_files/dicts_for_SNP_annotation/H37Rv_protein_seq_records.pickle', 'rb') as protein_seq_handle:
    ref_protein_sequences_records = pickle.load(protein_seq_handle)

#H37Rv reference position -> list of gene IDs covering that position (empty list for intergenic positions)
with open('/n/data1/hms/dbmi/farhat/Roger/inhost_TB_dynamics_project/pickled_files/dicts_for_SNP_annotation/H37Rv_coord_gene_mapping.pickle', 'rb') as coord_mapping_handle:
    ReferencePosition_Gene_mapping = pickle.load(coord_mapping_handle)

####### get Gene Categories #######
gene_categories = pd.read_csv('/n/data1/hms/dbmi/farhat/Roger/inhost_TB_dynamics_project/CSV_files/gene_categories/gene_categories.csv').set_index('name')
#pair every gene ID with its category
gene_categories_dict = dict(zip(gene_categories.gene_id , gene_categories.Gene_Category))

####### get Gene Symbols #######
#pair every gene ID (index of the annotation table) with its symbol
gene_symbol_dict = dict(zip(reference_genome_annotation.symbol.index , reference_genome_annotation.symbol))
################################################################################################################################################################################################


# Function to annotate Intergenic SNPs
################################################################################################################################################################################################
def find_flanking_genes_for_intergenic_region(intergenic_ref_pos, max_search_distance = 100000):

    '''
    Find the genes flanking an intergenic region given a reference position.

    Walks outward from intergenic_ref_pos through ReferencePosition_Gene_mapping,
    one position at a time, until a position with a non-empty gene list is found
    in each direction. The first gene listed at that position is used.

    intergenic_ref_pos  : reference position located in the intergenic region
    max_search_distance : maximum number of positions inspected in each direction

    Returns '<gene to the left>_<gene to the right>'.

    Raises ValueError if no gene is found within max_search_distance positions
    in either direction, and KeyError if the walk toward lower coordinates
    reaches a position that is missing from ReferencePosition_Gene_mapping.
    '''

    #find gene immediately in the 5' direction
    gene_to_left = None
    for offset in range(0 , max_search_distance):

        genes_at_position = ReferencePosition_Gene_mapping[intergenic_ref_pos - offset]
        if genes_at_position != []:
            gene_to_left = genes_at_position[0]
            break

    #find gene immediately in the 3' direction
    gene_to_right = None
    for offset in range(0 , max_search_distance):

        try:
            genes_at_position = ReferencePosition_Gene_mapping[intergenic_ref_pos + offset]

        #KeyError means we have hit the 'end' of the chromosome (the intergenic region at the end of H37Rv in 5' > 3' orientation)
        #since the TB chromosome is circular, the gene to the 'right' is Rv0001
        except KeyError:
            gene_to_right = 'Rv0001'
            break

        if genes_at_position != []:
            gene_to_right = genes_at_position[0]
            break

    #report a failed search explicitly instead of failing on an unbound variable
    if (gene_to_left is None) or (gene_to_right is None):
        raise ValueError('no flanking gene found within %d positions of reference position %s' % (max_search_distance , intergenic_ref_pos))

    return gene_to_left + '_' + gene_to_right
################################################################################################################################################################################################


# Function to determine whether SNPs are Synonymous or Non-Synonymous; Returns gene coordinate, codon position, AA changes, Gene Category & Symbol
################################################################################################################################################################################################
def SNP_annotate(ref_seq_position , alt_allele_i):

    '''
    Annotate a Base Call at a reference position on H37Rv.

    For every gene that covers the position this determines the coordinate of
    the position within the gene (5' to 3'), the gene category and symbol and,
    when the alternate allele is one of A/C/G/T, whether substituting it into
    the gene changes the translated Amino Acid sequence ('S' if unchanged,
    'N' if changed) together with the AA change at the affected codon.
    Any other alternate allele gives 'None' for the SNP type and AA change.
    Positions not covered by a gene are annotated as intergenic ('I') and are
    labelled with their flanking genes.

    Returns [ref allele , gene ID , gene coordinate , gene category , gene symbol , SNP type , AA change].
    If more than one gene covers the position, every element after the
    reference allele is a list holding one entry per gene.
    '''

    #one tuple per annotation: (gene ID , gene coordinate , gene category , gene symbol , SNP type , AA change)
    annotations = []

    #Reference Allele from the complete H37Rv reference genome (reference positions count from 1, indexing from 0)
    ref_allele_i = reference_genome.seq[int(ref_seq_position) - 1]

    #genes associated with this reference position (empty list if the position is intergenic)
    genes_at_position = ReferencePosition_Gene_mapping[ref_seq_position]

    if len(genes_at_position) > 0:

        #the position may be covered by more than one gene
        for gene_intergenic_id in genes_at_position:

            gene_start = reference_genome_annotation.loc[gene_intergenic_id , 'chromStart']
            gene_end = reference_genome_annotation.loc[gene_intergenic_id , 'chromEnd']

            #index of the SNP within the stored gene sequence (subtract 1 since reference positions count from 1)
            gene_relative_coord = (ref_seq_position - 1) - min(gene_start , gene_end)

            #coordinate of the SNP relative to the gene, in the 5' to 3' direction
            strand = reference_genome_annotation.loc[gene_intergenic_id, 'strand']
            if strand == '+':
                genomic_5_to_3_coord = (ref_seq_position) - gene_start
            elif strand == '-':
                genomic_5_to_3_coord = (gene_end) - (ref_seq_position-1)

            #gene category & symbol ('None' if the gene has no entry)
            try:
                gene_category_i = gene_categories_dict[gene_intergenic_id]
            except KeyError:
                gene_category_i = 'None'

            try:
                gene_symbol_i = gene_symbol_dict[gene_intergenic_id]
            except KeyError:
                gene_symbol_i = 'None'

            #alternate allele is a dummy (Base Call completely supports the Reference Allele)
            if alt_allele_i not in ['A','C','G','T']:

                SNP_type = 'None'
                AA_change = 'None'

            #alternate allele is an actual base
            else:

                #substitute the alternate allele into a mutable copy of the reference gene sequence
                mutated_gene_sequence = ref_gene_sequences_records[gene_intergenic_id].seq.tomutable()
                mutated_gene_sequence[int(gene_relative_coord)] = alt_allele_i

                #translate the mutated gene & fetch the H37Rv AA sequence to compare against
                mutated_protein_sequence = translate(gene_intergenic_id , mutated_gene_sequence.toseq())
                H37Rv_AA_sequence = ref_protein_sequences_records[gene_intergenic_id].seq

                #codon that contains the SNP: gene coordinate / 3, rounded up
                codon_coord = int(np.ceil( float(genomic_5_to_3_coord) / 3.0 ))

                #Synonymous if the translated sequence is unchanged, Non-Synonymous otherwise
                SNP_type = 'S' if mutated_protein_sequence == H37Rv_AA_sequence else 'N'

                #AA before & after, e.g. H445D
                AA_change = H37Rv_AA_sequence[codon_coord-1] + str(codon_coord) + mutated_protein_sequence[codon_coord-1]

            annotations.append( (gene_intergenic_id , genomic_5_to_3_coord , gene_category_i , gene_symbol_i , SNP_type , AA_change) )

    #no gene in H37Rv corresponds to the Reference Position, so the SNP must be intergenic
    else:

        flanking_genes = find_flanking_genes_for_intergenic_region(ref_seq_position)
        annotations.append( (flanking_genes , 'None' , 'None' , 'None' , 'I' , 'None') )

    #a single annotation: return the individual elements
    if len(annotations) == 1:
        return [ref_allele_i] + list(annotations[0])

    #several genes cover this position: return one list per annotation field
    elif len(annotations) > 1:
        return [ref_allele_i] + [list(field_values) for field_values in zip(*annotations)]
################################################################################################################################################################################################

### *Function* to get SNPs between paired isolates (filtered for $\Delta AF$, MGE and low EBR score regions)

In [5]:
def _is_in_MGE_region(gene_category):

    #True if a gene category (a string, or a list of strings when several genes cover the SNP) marks a Mobile Genetic Element
    if isinstance(gene_category , str):
        return gene_category == 'Mobile Genetic Element'
    if isinstance(gene_category , list):
        return 'Mobile Genetic Element' in gene_category
    return False


def get_filtered_SNPs_between_isolates(isolate_pair_ID , alt_AF_diff_threshold):

    '''
    Return the filtered, annotated Base Calls that differ between a pair of isolates.

    The pickled DataFrame of differing Base Calls for the isolate pair is loaded
    (rows alternate isolate A , isolate B for each reference position) and then:
      1. pairs whose Alternate Allele Frequencies differ by less than
         alt_AF_diff_threshold are dropped,
      2. Base Calls at reference positions in H37Rv_positions_to_drop are dropped,
      3. the remaining Base Calls are annotated with SNP_annotate(),
      4. Base Calls whose gene category is 'Mobile Genetic Element' are dropped.

    isolate_pair_ID       : patient ID used to look up the population in sample_annotation
                            and to build the path of the pickled DataFrame
    alt_AF_diff_threshold : minimum difference in Alternate Allele Frequency for a pair to be kept

    Returns a DataFrame with the columns of the pickled DataFrame plus
    'gene_id' , 'gene_coord' , 'gene_category' , 'gene_symbol' , 'SNP_ftype' & 'AA_change'.

    Raises ValueError if the pickled DataFrame holds an odd number of rows
    (Base Calls cannot be paired).
    '''

    annotation_columns = ['gene_id' , 'gene_coord' , 'gene_category' , 'gene_symbol' , 'SNP_ftype' , 'AA_change']

    ################################################################################
    ### get SNPs between pair of isolates
    ################################################################################

    #selecting with a list of labels always returns a Series, even when only one row matches the patient ID
    #(indexing a scalar population string with [0] would silently return its first character)
    population = sample_annotation.loc[[isolate_pair_ID] , 'population'].iloc[0]

    #load in the differing Base Calls for the isolate pair from pickle
    different_base_calls_between_isolates = pd.read_pickle(SNP_variant_dir + population + '_' + isolate_pair_ID + '/base_calls_different_between_isolates.pkl')

    num_base_calls = np.shape(different_base_calls_between_isolates)[0]
    if num_base_calls % 2 != 0:
        raise ValueError('expected paired Base Calls (an even number of rows) for isolate pair %s but found %d rows' % (isolate_pair_ID , num_base_calls))

    ################################################################################
    ### Drop SNPs with change in AF < x%
    ################################################################################

    #even rows hold the Base Calls for isolate A, odd rows the corresponding Base Calls for isolate B
    alt_AF_values = different_base_calls_between_isolates['alt_AF'].values
    alt_AF_diff_btwn_paired_isolates = abs(alt_AF_values[0::2] - alt_AF_values[1::2])

    #a pair is dropped only if its difference is strictly below the threshold
    small_change_in_alt_AF = np.asarray(alt_AF_diff_btwn_paired_isolates < alt_AF_diff_threshold , dtype = bool)

    #expand the per-pair filter to both rows of every pair, filter and reset the index
    Base_Calls_to_keep_filter = np.repeat(~small_change_in_alt_AF , 2)
    different_base_calls_between_isolates = different_base_calls_between_isolates.loc[Base_Calls_to_keep_filter].reset_index(drop = True)

    ################################################################################
    ### Drop SNPs with change in regions with low EBR scores
    ################################################################################

    #Drop Base Calls in H37Rv sites flagged for exclusion (make sure there is at least 1 SNP)
    if np.shape(different_base_calls_between_isolates)[0] > 0:

        SNPs_to_keep_filter = np.array([SNP_i_ref_pos not in H37Rv_positions_to_drop for SNP_i_ref_pos in different_base_calls_between_isolates.ref_position] , dtype = bool)
        different_base_calls_between_isolates = different_base_calls_between_isolates.loc[SNPs_to_keep_filter].reset_index(drop = True)

    ################################################################################
    ### Annotate SNPs & Drop SNPs in MGE regions
    ################################################################################

    #Annotate Filtered Base Calls (make sure there is at least 1 SNP)
    if np.shape(different_base_calls_between_isolates)[0] > 0:

        annotation_values = dict((column , []) for column in annotation_columns)

        for ref_position_i , alt_base_i in zip(different_base_calls_between_isolates.ref_position , different_base_calls_between_isolates.alt_base):

            #annotate SNP (the first element returned is the reference allele, which is not stored)
            for column , value in zip(annotation_columns , SNP_annotate(ref_position_i , alt_base_i)[1:]):
                annotation_values[column].append(value)

        #create columns to store SNP annotation info
        for column in annotation_columns:
            different_base_calls_between_isolates[column] = annotation_values[column]

        #FILTER out Base Calls in MGE regions (Mobile Genetic Elements) and reset index
        SNPs_in_MGE_filter = np.array([_is_in_MGE_region(gene_category_i) for gene_category_i in different_base_calls_between_isolates.gene_category] , dtype = bool)
        different_base_calls_between_isolates = different_base_calls_between_isolates.loc[~SNPs_in_MGE_filter].reset_index(drop = True)

    #No SNPs detected between this pair of isolates (empty DataFrame), still create the annotation columns
    else:

        for column in annotation_columns:
            different_base_calls_between_isolates[column] = ""

    return different_base_calls_between_isolates

##########################################################################################################################################################################################################################################

## Longitudinal Sample Pairs

##########################################################################################################################################################################################################################################

#### Import Sample Annotation file

In [6]:
#sample annotation table indexed by patient_id; get_filtered_SNPs_between_isolates() reads its 'population' column
sample_annotation = pd.read_csv('/n/data1/hms/dbmi/farhat/Roger/inhost_TB_dynamics_project/CSV_files/sample_annotation_files/Longitudinal_fastq_path_names_and_JankyPipe_tags_filtered_final.csv' , sep  = ',').set_index('patient_id')

In [7]:
#directory holding one sub-directory per isolate pair; the trailing '/' is required because get_filtered_SNPs_between_isolates() builds paths by string concatenation
SNP_variant_dir = '/n/data1/hms/dbmi/farhat/Roger/inhost_TB_dynamics_project/pickled_files/variant_calling/longitudinal_SNPs/all_SNPs_between_longitudinal_pairs/'

In [8]:
#preview the first rows of the sample annotation table
sample_annotation.head()

Unnamed: 0_level_0,fastq_files,population,run_ID,sample_ID,sample_order,tag,isolate_type
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
P000183,/n/data1/hms/dbmi/farhat/fastq_db/walker/ERR03...,WALKER,ERR039337,,1,ERR039337,longitudinal
P000183,/n/data1/hms/dbmi/farhat/fastq_db/walker/ERR03...,WALKER,ERR039338,,2,ERR039338,longitudinal
1960,/n/data1/hms/dbmi/farhat/cetr_strains/good_wgs...,CETR,MMJA00000000,Peru5115,1,Peru5115,longitudinal
1960,/n/data1/hms/dbmi/farhat/cetr_strains/good_wgs...,CETR,MMPC00000000,Peru4668,2,Peru4668,longitudinal
2491,/n/data1/hms/dbmi/farhat/cetr_strains/good_wgs...,CETR,MMZT00000000,Peru4903,1,Peru4903,longitudinal


In [9]:
#number of isolate pairs = half the number of rows in the sample annotation table
#floor division keeps the count an integer and print() with a single argument gives the same output under Python 2 and Python 3
num_isolate_pair_IDs = np.shape(sample_annotation)[0] // 2
print(num_isolate_pair_IDs)

200


In [10]:
#unique patient IDs (one per isolate pair); NOTE: a set has no defined order, so the order of this list is arbitrary
isolate_pair_ID_list = list(set(sample_annotation.index))

### Call function to collect SNPs passing Difference in Alternate Allele Frequency Threshold

In [11]:
#one DataFrame of filtered paired Base Calls per isolate pair
Base_Call_variants_per_isolate_pair = []

#iterate through isolate pairs, collect all SNP variants arising between each pair of isolates
for isolate_pair_index , isolate_pair_ID in enumerate(isolate_pair_ID_list , 1):

    #retrieve filtered paired Base Calls with a change in Alternate Allele Frequency passing the threshold
    Base_Call_variants_per_isolate_pair.append( get_filtered_SNPs_between_isolates(isolate_pair_ID , alt_AF_diff_threshold) )

    #report progress every 5 isolate pairs (print() with a single argument behaves the same under Python 2 and Python 3)
    if isolate_pair_index % 5 == 0:
        print(isolate_pair_index)

#concatenate DataFrames for each subject into 1 DataFrame with a fresh 0..n-1 index
Base_Call_variants_btwn_isolates_big_change_in_alt_AF = pd.concat(Base_Call_variants_per_isolate_pair , axis = 0).reset_index(drop = True)

5




10
15
20
25
30
35
40
45
50
55
60
65
70
75
80
85
90
95
100
105
110
115
120
125
130
135
140
145
150
155
160
165
170
175
180
185
190
195
200


### *Filter*: Drop paired Base Calls if both Base Calls in a pair support *different* Alternate Alleles

In [12]:
#list that stores the indices of paired Base Calls with DIFFERENT Alternate Alleles
Base_Calls_to_Drop = []

#columns that both Base Calls of a pair should have in COMMON
#('gene_coord' is the column created by get_filtered_SNPs_between_isolates(); no 'genomic_coord' column exists in this DataFrame)
shared_info_columns = ['ref_base','ref_position','gene_id','gene_coord','population','patient_id']

num_Base_Calls = np.shape(Base_Call_variants_btwn_isolates_big_change_in_alt_AF)[0]

#iterate through each PAIR of corresponding Base Calls from paired isolates (isolate A at even indices, isolate B at odd indices)
for isolate_A_Base_Call_i , isolate_B_Base_Call_i in zip(range(0 , num_Base_Calls , 2) , range(1 , num_Base_Calls , 2)):

    #pull info that both Base Calls should have in COMMON
    isolate_A_Base_Call_info = list( Base_Call_variants_btwn_isolates_big_change_in_alt_AF.loc[isolate_A_Base_Call_i , shared_info_columns] )
    isolate_B_Base_Call_info = list( Base_Call_variants_btwn_isolates_big_change_in_alt_AF.loc[isolate_B_Base_Call_i , shared_info_columns] )

    #make sure Base Calls Match with respect to Reference Base, Reference Position, gene ID, Gene Coordinate, Population & Patient ID
    if isolate_A_Base_Call_info == isolate_B_Base_Call_info:

        #pull alternate Allele for each of the paired isolates
        isolate_A_Alt_Base = Base_Call_variants_btwn_isolates_big_change_in_alt_AF.loc[isolate_A_Base_Call_i , 'alt_base']
        isolate_B_Alt_Base = Base_Call_variants_btwn_isolates_big_change_in_alt_AF.loc[isolate_B_Base_Call_i , 'alt_base']

        #a 'Z' means that Base Call supported the Reference Base (so there was no Alternate Allele) and the pair is kept
        #otherwise, if both Base Calls support Alternate Alleles that DON'T match, the Allele Frequencies can't be compared (since they belong to different alleles)
        if (isolate_A_Alt_Base != 'Z') and (isolate_B_Alt_Base != 'Z') and (isolate_A_Alt_Base != isolate_B_Alt_Base):
            Base_Calls_to_Drop.extend([isolate_A_Base_Call_i , isolate_B_Base_Call_i])

    #print indices of Base Calls to see what went wrong if the paired Base Calls disagree on information they should have in Common
    else:
        print((isolate_A_Base_Call_i , isolate_B_Base_Call_i))

#Drop Paired Base Calls that supported different Alternate Alleles and reset index
Base_Call_variants_btwn_isolates_big_change_in_alt_AF = Base_Call_variants_btwn_isolates_big_change_in_alt_AF.drop(Base_Calls_to_Drop , axis = 0).reset_index(drop = True)

In [13]:
#preview the first 10 paired Base Calls that remain after filtering
Base_Call_variants_btwn_isolates_big_change_in_alt_AF.head(n = 10)

Unnamed: 0,ref_base,alt_base,ref_position,quality,SNP_type,PASS_filter,INFO,alt_AF,depth,tag,population,patient_id,gene_id,gene_coord,gene_category,gene_symbol,SNP_ftype,AA_change
0,C,Z,761139,3140,Ref_PASS,[],"{u'QP': [0, 100, 0, 0], u'AC': [0], u'AF': [0....",0.0,89,ERR369703,CASALI,P251,Rv0667,1333,Antibiotic Resistance,rpoB,,
1,C,G,761139,1266,Alt_PASS,[],"{u'QP': [0, 0, 100, 0], u'AC': [2], u'AF': [1....",1.0,36,ERR600658,CASALI,P251,Rv0667,1333,Antibiotic Resistance,rpoB,N,H445D
2,T,Z,861378,3116,Ref_PASS,[],"{u'QP': [0, 0, 0, 100], u'AC': [0], u'AF': [0....",0.0,90,ERR369703,CASALI,P251,Rv0768,467,Non-Essential,aldA,,
3,T,G,861378,1167,Alt_PASS,[],"{u'QP': [0, 0, 100, 0], u'AC': [2], u'AF': [1....",1.0,34,ERR600658,CASALI,P251,Rv0768,467,Non-Essential,aldA,N,I156S
4,A,G,4402310,2360,Amb,[Amb],"{u'QP': [48, 0, 52, 0], u'AC': [1], u'AF': [0....",0.52,71,ERR369703,CASALI,P251,Rv3913,583,Essential,trxB2,N,I195V
5,A,G,4402310,1340,Alt_PASS,[],"{u'QP': [0, 0, 100, 0], u'AC': [2], u'AF': [1....",1.0,38,ERR600658,CASALI,P251,Rv3913,583,Essential,trxB2,N,I195V
6,A,C,656329,7836,Ref_PASS,[],"{u'QP': [91, 9, 0, 0], u'AC': [0], u'AF': [0.0...",0.09,306,Peru5140,CETR,2968,Rv0565c,1142,Non-Essential,,N,V381G
7,A,C,656329,5372,Amb,[Amb],"{u'QP': [41, 59, 0, 0], u'AC': [1], u'AF': [0....",0.59,219,Peru5139,CETR,2968,Rv0565c,1142,Non-Essential,,N,V381G
8,C,Z,656956,15111,Ref_PASS,[],"{u'QP': [0, 100, 0, 0], u'AC': [0], u'AF': [0....",0.0,389,Peru5140,CETR,2968,Rv0565c,515,Non-Essential,,,
9,C,T,656956,3512,Amb,[Amb],"{u'QP': [0, 65, 0, 34], u'AC': [1], u'AF': [0....",0.34,235,Peru5139,CETR,2968,Rv0565c,515,Non-Essential,,N,W172*


In [14]:
#(number of Base Calls , number of columns); every SNP contributes two Base Calls (one per isolate)
np.shape(Base_Call_variants_btwn_isolates_big_change_in_alt_AF)

(3198, 18)

#### Pickle DataFrame for Downstream analyses (Alternate Allele Frequency 1 vs. Alternate Allele Frequency 2)

In [15]:
#save the paired Base Calls (two rows per SNP) so downstream notebooks can load them without re-running the filtering above
Base_Call_variants_btwn_isolates_big_change_in_alt_AF.to_pickle('/n/data1/hms/dbmi/farhat/Roger/inhost_TB_dynamics_project/pickled_files/variant_calling/longitudinal_SNPs/longitudinal_SNP_variants_25_delta_in_alt_AF.pkl')

### Re-Shape Filtered DataFrame (Paired Base Calls across all isolate pairs) to store one entry per SNP

In [16]:
#one record (dictionary) per SNP, built from each consecutive pair of Base Calls
#records are collected in a single list so that this cell does not rebind `gene_symbol_dict`,
#the gene ID -> gene symbol lookup that SNP_annotate() reads when it is called
SNP_records = []

#iterate through indices for isolate A (even rows); the Base Call for isolate B is stored in the following (odd) row
for even_index in range(0 , np.shape(Base_Call_variants_btwn_isolates_big_change_in_alt_AF)[0] , 2):

    #Base Call info for isolate A
    Base_Call_info_isolate_A = Base_Call_variants_btwn_isolates_big_change_in_alt_AF.loc[even_index , :]
    #Base Call info for isolate B
    Base_Call_info_isolate_B = Base_Call_variants_btwn_isolates_big_change_in_alt_AF.loc[even_index+1 , :]

    #get alternate allele ('Z' marks a Base Call that supported the Reference Allele, so take the other call if one is 'Z')
    alt_allele_calls = [Base_Call_info_isolate_A.alt_base , Base_Call_info_isolate_B.alt_base]
    try:
        alt_allele_calls.remove('Z')
    except ValueError:
        pass

    #get type of SNP (Synonymous, Non-Synonymous or Intergenic); left missing if neither Base Call carries one of these labels
    SNP_ftype_calls = [Base_Call_info_isolate_A.SNP_ftype , Base_Call_info_isolate_B.SNP_ftype]
    if 'S' in SNP_ftype_calls:
        SNP_type_i = 'S'
    elif 'N' in SNP_ftype_calls:
        SNP_type_i = 'N'
    elif 'I' in SNP_ftype_calls:
        SNP_type_i = 'I'
    else:
        SNP_type_i = np.nan

    #get AA change ('None' marks a Base Call without an AA change, so take the other call if one is 'None')
    AA_change_calls = [Base_Call_info_isolate_A.AA_change , Base_Call_info_isolate_B.AA_change]
    try:
        AA_change_calls.remove('None')
    except ValueError:
        pass

    SNP_records.append({
        #information common to both Base Calls (taken from isolate A)
        'population' : Base_Call_info_isolate_A.population ,
        'patient_id' : Base_Call_info_isolate_A.patient_id ,
        'ref_position' : Base_Call_info_isolate_A.ref_position ,
        'ref_allele' : Base_Call_info_isolate_A.ref_base ,
        'gene_id' : Base_Call_info_isolate_A.gene_id ,
        'genomic_coord' : Base_Call_info_isolate_A.gene_coord ,
        'gene_category' : Base_Call_info_isolate_A.gene_category ,
        'gene_symbol' : Base_Call_info_isolate_A.gene_symbol ,
        #information derived from both Base Calls
        'alt_allele' : alt_allele_calls[0] ,
        'alt_AF_diff' : abs(Base_Call_info_isolate_A.alt_AF - Base_Call_info_isolate_B.alt_AF) ,
        'SNP_type' : SNP_type_i ,
        'AA_change' : AA_change_calls[0]
    })

#create DataFrame (one row per SNP, indexed 0..n-1) with an explicit column order
SNP_variants_between_paired_isolates = pd.DataFrame(SNP_records , columns = ['population' , 'patient_id' , 'ref_position' , 'ref_allele' , 'alt_allele' , 'gene_id' , 'genomic_coord' , 'gene_category' , 'gene_symbol' , 'alt_AF_diff' , 'SNP_type' , 'AA_change'])

In [17]:
#preview the first rows of the one-row-per-SNP DataFrame
SNP_variants_between_paired_isolates.head()

Unnamed: 0,population,patient_id,ref_position,ref_allele,alt_allele,gene_id,genomic_coord,gene_category,gene_symbol,alt_AF_diff,SNP_type,AA_change
0,CASALI,P251,761139,C,G,Rv0667,1333,Antibiotic Resistance,rpoB,1.0,N,H445D
1,CASALI,P251,861378,T,G,Rv0768,467,Non-Essential,aldA,1.0,N,I156S
2,CASALI,P251,4402310,A,G,Rv3913,583,Essential,trxB2,0.48,N,I195V
3,CETR,2968,656329,A,C,Rv0565c,1142,Non-Essential,,0.5,N,V381G
4,CETR,2968,656956,C,T,Rv0565c,515,Non-Essential,,0.34,N,W172*


In [18]:
#(number of SNPs , number of columns)
np.shape(SNP_variants_between_paired_isolates)

(1599, 12)

#### Save DataFrame as CSV

In [19]:
#write the one-row-per-SNP DataFrame to a comma-separated file (the DataFrame index is written as the first column)
SNP_variants_between_paired_isolates.to_csv('/n/data1/hms/dbmi/farhat/Roger/inhost_TB_dynamics_project/CSV_files/variant_calling/longitudinal_SNPs/SNPs_between_isolates_delta_25.csv' , sep = ',')

#### Pickle DataFrame for Downstream analyses

In [20]:
#save the one-row-per-SNP DataFrame as a pickle for downstream analyses
SNP_variants_between_paired_isolates.to_pickle('/n/data1/hms/dbmi/farhat/Roger/inhost_TB_dynamics_project/pickled_files/variant_calling/longitudinal_SNPs/SNPs_between_isolates_delta_25.pkl')