In [1]:
import sys
sys.path.insert(0, "/u/home/m/michaelw/project-ngarud/microbiome_evolution/microbiome_evolution_MOUSE/")

import matplotlib  
matplotlib.use('Agg') 
import sample_utils
import config
import parse_midas_data
import os.path
import pylab
import sys
import numpy
import sfs_utils
        

import diversity_utils
import gene_diversity_utils
import core_gene_utils
import gzip
import os

import stats_utils
from math import log10,ceil,factorial
from numpy.random import randint

### parameters

In [2]:
# Directory (under the MIDAS data directory) where per-species intersample change files are written
intersample_change_directory = '%sintersample_change/' % (parse_midas_data.data_directory)
# Filled with (directory, species_name) to build the gzipped per-species file path
intermediate_filename_template = '%s%s.txt.gz'  

# Coverage threshold handed to calculate_highcoverage_samples when selecting samples
min_coverage = config.min_median_coverage
# Minimum number of samples; only referenced by the commented-out size checks in this notebook
min_sample_size = 2

### main script

`if __name__=='__main__':`

In [3]:
# Command-line parsing from the script version is disabled in this notebook;
# the equivalent settings are hard-coded at the bottom of this cell.
# import argparse
# parser = argparse.ArgumentParser()
# parser.add_argument("--debug", help="Loads only a subset of SNPs for speed", action="store_true")
# parser.add_argument("--chunk-size", type=int, help="max number of records to load", default=1000000000)
# parser.add_argument("species", help="Name of specific species to run code on")

# args = parser.parse_args()

# Hard-coded stand-ins for the --debug, --chunk-size and species arguments above
debug = False
chunk_size = 1000000000
species_name= "Parabacteroides_distasonis_56985"
good_species_list = [species_name]

In [4]:
# Create the output directory (and any missing parents) without going through a shell,
# so paths containing spaces or shell metacharacters are handled safely and a failure
# raises immediately instead of being silently ignored.
if not os.path.isdir(intersample_change_directory):
    os.makedirs(intersample_change_directory)

0

In [5]:
# Load subject and sample metadata
# NOTE(review): neither subject_sample_map nor sample_order_map is referenced again
# in the visible cells of this notebook.
sys.stderr.write("Loading sample metadata...\n")
subject_sample_map = sample_utils.parse_subject_sample_map()
sample_order_map = sample_utils.parse_sample_order_map()
sys.stderr.write("Done!\n")

Loading sample metadata...
Done!


In [6]:
# Path of the gzipped output file for this species
intermediate_filename = intermediate_filename_template % (intersample_change_directory, species_name)

In [7]:
# Open the output for writing (overwrites any existing file); the handle stays open
# across cells until output_file.close() is called at the end of the run.
output_file = gzip.open(intermediate_filename,"w")

In [8]:
# Header row. Records are ", "-separated; the first six columns are fixed and the
# remaining columns ('Change1', '...') hold one ';'-delimited change per column.
output_file.write(", ".join(['Species', 'Sample1', 'Sample2', 'Type', 'L','Perr', 'Change1', '...']))
output_file.write("\n")

1

In [9]:
# The per-species loop of the script version is disabled; only species_name is processed.
#for species_name in good_species_list:
# Looked up by sample id later to get each sample's depth for the SNP comparisons
sample_coverage_map = parse_midas_data.parse_sample_coverage_map(species_name)

In [10]:
# Load within-sample site frequency spectra for the 1D-4D variant types.
# sfs_map feeds the SNP error-rate calculation later; `samples` is not used again
# in the visible cells.
sys.stderr.write("Loading SFSs for %s...\t" % species_name)
samples, sfs_map = parse_midas_data.parse_within_sample_sfs(species_name, allowed_variant_types=set(['1D','2D','3D','4D'])) 
sys.stderr.write("Done!\n")

Loading SFSs for Parabacteroides_distasonis_56985...	Done!


In [11]:
# Select the samples to compare, using min_coverage as the threshold.
sys.stderr.write("Loading desired samples...\n")
snp_samples = diversity_utils.calculate_highcoverage_samples(species_name, min_coverage)
# TODO (author's note): consider switching this to the haploid-sample calculation,
# which reportedly builds on calculate_highcoverage_samples.

Loading desired samples...


In [12]:
# if len(snp_samples) < min_sample_size:
#     sys.stderr.write("Not enough temporal samples!\n")
#     continue

In [13]:
# Number of unordered sample pairs, C(n, 2) = n*(n-1)/2.
# Computed directly rather than via factorials: factorial(n - 2) raises ValueError
# when fewer than two samples are present, and the closed form always yields an int.
n_comb = len(snp_samples) * (len(snp_samples) - 1) // 2
sys.stderr.write("Proceeding with %d comparisons of %d temporal samples!\n" % (n_comb, len(snp_samples)))
                 

Proceeding with 210 comparisons of 21 temporal samples!


In [14]:
# Analyze SNPs, looping over chunk sizes. 
# Clunky, but necessary to limit memory usage on cluster

# Two gene lists are loaded here:
#   non_shared_genes       -> passed as allowed_genes when SNPs are parsed
#   shared_pangenome_genes -> passed as disallowed_genes when pangenome data is parsed
sys.stderr.write("Loading whitelisted genes...\n")
non_shared_genes = core_gene_utils.parse_non_shared_reference_genes(species_name)
shared_pangenome_genes = core_gene_utils.parse_shared_genes(species_name)
sys.stderr.write("Done! %d shared genes and %d non-shared genes\n" % (len(shared_pangenome_genes), len(non_shared_genes)))


Loading whitelisted genes...
Done! 4076 shared genes and 3894 non-shared genes


In [15]:
# Now calculate gene differences
# Load gene coverage information for species_name, restricted to snp_samples and
# excluding the shared pangenome genes. gene_reads_matrix, gene_depth_matrix and
# marker_coverages are consumed by the gene-difference and gene-error-rate calls later;
# gene_names maps a gene index back to its name when records are written.
sys.stderr.write("Loading pangenome data for %s...\n" % species_name)
gene_samples, gene_names, gene_presence_matrix, gene_depth_matrix, marker_coverages, gene_reads_matrix = parse_midas_data.parse_pangenome_data(species_name,allowed_samples=snp_samples, disallowed_genes=shared_pangenome_genes)
sys.stderr.write("Done!\n")

Loading pangenome data for Parabacteroides_distasonis_56985...
Done!


In [16]:
# Replace (not extend) snp_samples with the sample list returned by parse_pangenome_data.
# That call was given allowed_samples=snp_samples, so this presumably narrows the list to
# high-coverage samples that also have pangenome data, in the pangenome matrices' order
# -- TODO confirm against parse_pangenome_data.
snp_samples = gene_samples



In [17]:
# if len(snp_samples) < min_sample_size:
#     sys.stderr.write("Not enough temporal samples!\n")
#     continue


In [18]:
# Project-local import kept here as in the original notebook (ideally it would live
# with the other imports in the first cell).
import calculate_private_snvs
# Passed to calculate_tracked_private_snvs in the per-pair loop
private_snv_map = calculate_private_snvs.load_private_snv_map(species_name)
#NOTE: might want to eliminate

In [19]:
# Load SNP information for species_name
sys.stderr.write("Loading SNPs for %s...\n" % species_name)    
# Per-pair accumulators, keyed by (sample_i, sample_j) tuples in later cells:
# lists of changes ...
snp_changes = {}
gene_changes = {}
tracked_private_snps = {}
# ... opportunity counts ...
snp_opportunities = {}
gene_opportunities = {}
tracked_private_snp_opportunities = {}

# ... and error rates
snp_perrs = {}
gene_perrs = {}
tracked_private_snp_perrs = {}

# Running totals over chunks; they start empty and are sized from the first chunk's matrices
snp_difference_matrix = numpy.array([]) # all sites in all genes
snp_opportunity_matrix = numpy.array([])


Loading SNPs for Parabacteroides_distasonis_56985...


In [20]:
# Line offset at which the next chunk starts
final_line_number = 0

# The chunking loop of the script version is disabled; with the configured chunk_size
# a single parse_snps call is made here.
#while final_line_number >= 0: #MW: this is -1 after running the next line
sys.stderr.write("Loading chunk starting @ %d...\n" % final_line_number)
# Restricted to snp_samples and to the non-shared genes
dummy_samples, allele_counts_map, passed_sites_map, final_line_number = parse_midas_data.parse_snps(species_name, debug=debug, allowed_samples=snp_samples, chunk_size=chunk_size,initial_line_number=final_line_number,allowed_genes=non_shared_genes)
sys.stderr.write("Done! Loaded %d genes\n" % len(allele_counts_map.keys()))

Loading chunk starting @ 0...
1k sites processed...
2k sites processed...
3k sites processed...
4k sites processed...
Done! Loaded 3527 genes


`passed_sites_map`: Within each gene and variant type bin, there is a symmetric matrix with dimensions (len(snp_samples), len(snp_samples)). Each element counts the number of SNP sites that have a depth > 0 in BOTH samples.  

`allele_counts_map`: Within each gene and variant type bin, you have the sites that "passed", meaning the alt allele frequency is above 0.05% in at least 1 sample. For every site that passes, the location and [alt, ref] reads for ALL samples (in desired_samples) are appended to the dictionary.

In [21]:
# All sites in this chunk: pairwise difference and opportunity matrices
chunk_snp_difference_matrix, chunk_snp_opportunity_matrix = diversity_utils.calculate_fixation_matrix(allele_counts_map, passed_sites_map)  




`chunk_snp_difference_matrix`: a symmetric matrix with row and column dimensions equal to the number of samples. It tabulates the number of differences between each pair of samples.  

`chunk_snp_opportunity_matrix`: same thing, but with opportunities!

In [22]:
# On the first chunk the running totals are still empty, so size them (as floats)
# to match the chunk matrices.
if snp_difference_matrix.shape[0]==0:
    snp_difference_matrix = numpy.zeros_like(chunk_snp_difference_matrix)*1.0
    snp_opportunity_matrix = numpy.zeros_like(snp_difference_matrix)*1.0
    
# Add all
# NOTE: in-place accumulation -- re-running this cell without re-initialising the
# totals adds the same chunk again.
snp_difference_matrix += chunk_snp_difference_matrix
snp_opportunity_matrix += chunk_snp_opportunity_matrix

In [23]:
# sample_pairs[0, k] / sample_pairs[1, k] are read as the two sample ids of pair k, and
# sample_pair_map[k] as the corresponding pair of indices, in the loops that follow.
sample_pairs, sample_pair_map = sample_utils.load_all_sample_pairs(snp_samples)

In [24]:
# For every sample pair, collect this chunk's SNP changes and tracked private SNVs and
# add them to the per-pair accumulators. (xrange: this cell is Python 2 only.)
for sample_pair_idx in xrange(0,len(sample_pairs[0])):
    i = sample_pair_map[sample_pair_idx][0] #extract indices in snp_samples
    j = sample_pair_map[sample_pair_idx][1]

    sample_i = sample_pairs[0, sample_pair_idx] #extract sample_ids
    sample_j = sample_pairs[1, sample_pair_idx]

    avg_depth_i = sample_coverage_map[sample_i] #median number of reads per sample
    avg_depth_j = sample_coverage_map[sample_j]

    chunk_tracked_private_snps = diversity_utils.calculate_tracked_private_snvs(i, j, allele_counts_map, passed_sites_map, avg_depth_i, avg_depth_j, private_snv_map)
    chunk_snp_changes = diversity_utils.calculate_snp_differences_between(i, j, allele_counts_map, passed_sites_map, avg_depth_i, avg_depth_j)
    
    sample_pair = (sample_i, sample_j)
        
    # First time this pair is seen: create its entry in every accumulator.
    # Error rates start at -1 and are filled in by the error-rate cell.
    if sample_pair not in snp_changes:
        snp_changes[sample_pair] = []
        gene_changes[sample_pair] = []
        snp_opportunities[sample_pair] = 0
        gene_opportunities[sample_pair] = 0
        snp_perrs[sample_pair] = -1
        gene_perrs[sample_pair] = -1

        tracked_private_snps[sample_pair] = []
        tracked_private_snp_opportunities[sample_pair] = 0
        tracked_private_snp_perrs[sample_pair] = -1

    snp_changes[sample_pair].extend(chunk_snp_changes)
    snp_opportunities[sample_pair] += chunk_snp_opportunity_matrix[i,j]

    tracked_private_snps[sample_pair].extend( chunk_tracked_private_snps)

    # Opportunities for private SNVs are counted as the number of tracked private SNVs
    tracked_private_snp_opportunities[sample_pair] += len(chunk_tracked_private_snps)
    

In [38]:
# Calculate SNP error rate, gene changes and gene error rate for every sample pair.
# Relies on the per-pair entries created by the SNP accumulation loop
# (gene_changes[sample_pair] must already exist).
for sample_pair_idx in xrange(0,len(sample_pairs[0])):

    i = sample_pair_map[sample_pair_idx][0] #extract indices in snp_samples
    j = sample_pair_map[sample_pair_idx][1]

    sample_i = sample_pairs[0, sample_pair_idx] #extract sample_ids
    sample_j = sample_pairs[1, sample_pair_idx]
    
    sample_pair = (sample_i, sample_j)

    # Only the first element of the returned value is used as the error rate
    perr = diversity_utils.calculate_fixation_error_rate(sfs_map, sample_i, sample_j)[0]

    # The same SFS-based error rate is recorded for both SNPs and tracked private SNVs
    snp_perrs[sample_pair] = perr
    tracked_private_snp_perrs[sample_pair] = perr

    gene_changes[sample_pair].extend( gene_diversity_utils.calculate_gene_differences_between(i, j, gene_reads_matrix, gene_depth_matrix, marker_coverages) )
    #structure of gene_changes[sample_pair] output: 
    #(gene_idx, (gene depth of sample i, marker gene coverage in sample i), (gene depth of sample j, marker gene coverage in sample j))
    gene_perr = gene_diversity_utils.calculate_gene_error_rate(i, j, gene_reads_matrix, gene_depth_matrix, marker_coverages)[0]

    # Gene opportunities = number of rows of gene_depth_matrix (assigned, not accumulated)
    gene_opportunities[sample_pair] = gene_depth_matrix.shape[0]

    gene_perrs[sample_pair] = gene_perr

sys.stderr.write("Done!\n") 

Done!


In [41]:
def _format_snp_change(snp_change):
    """Serialize one SNP change as 'gene;contig;position;variant_type;A1;D1;A2;D2'.

    snp_change is a 5-tuple (gene_name, location, variant_type, allele_counts_1,
    allele_counts_2), where location[0] / location[1] are the contig and position and
    each allele_counts_* is an (alt reads, depth) pair for one sample.
    Shared by the 'snps' and 'private_snps' records, which use the same layout.
    """
    gene_name, location, variant_type, allele_counts_1, allele_counts_2 = snp_change
    contig = location[0]
    position = location[1]

    A1,D1 = allele_counts_1
    A2,D2 = allele_counts_2

    return '%s;%s;%d;%s;%d;%d;%d;%d' % (gene_name, contig, position, variant_type, A1, D1, A2, D2)


# Write three records per sample pair: 'snps', 'genes' and 'private_snps'.
# Each record is: species, sample i, sample j, type, opportunities, error rate,
# followed by one ';'-delimited field per change.
for sample_i, sample_j in snp_changes.keys():
    sample_pair = (sample_i, sample_j)

    # First output SNPs
    snp_strs = [_format_snp_change(snp_change) for snp_change in snp_changes[sample_pair]]

    record_str_items = [species_name, sample_i, sample_j, 'snps', "%g" % snp_opportunities[sample_pair], "%g" % snp_perrs[sample_pair]] + snp_strs
    record_str = ", ".join(record_str_items)
    output_file.write(record_str)
    output_file.write("\n")

    # Now output genes
    gene_strs = []
    for gene_change in gene_changes[sample_pair]:
        gene_idx, coverages_1, coverages_2 = gene_change
        gene_name = gene_names[gene_idx] 
        D1,Dm1 = coverages_1 #read depth of the gene in sample i, average read depth of marker genes in sample i
        D2,Dm2 = coverages_2 #read depth of the gene in sample j, average read depth of marker genes in sample j

        gene_str = ('%s;%0.2f;%0.2f;%0.2f;%0.2f' % (gene_name, D1, Dm1, D2, Dm2)) #gene name, depth i, marker depth i, depth j, marker depth j
        gene_strs.append(gene_str)

    record_str_items = [species_name, sample_i, sample_j, 'genes', "%g" % gene_opportunities[sample_pair], "%g" % gene_perrs[sample_pair]] + gene_strs
    #Structure of output: species_name, sample i, sample j, "genes", gene change opportunities, gene change error rate, gene name, depth i, marker depth i, depth j, marker depth j
    record_str = ", ".join(record_str_items)
    output_file.write(record_str)
    output_file.write("\n")

    # Now output private SNPS
    private_snp_strs = [_format_snp_change(snp_change) for snp_change in tracked_private_snps[sample_pair]]

    record_str_items = [species_name, sample_i, sample_j, 'private_snps', "%g" % tracked_private_snp_opportunities[sample_pair], "%g" % tracked_private_snp_perrs[sample_pair]] + private_snp_strs
    #Structure of output: species_name, sample i, sample j, "private snps", private snps opportunities, error rate, gene name, contig, position, variant type, alt reads i, depth i, alt reads j, depth j
    record_str = ", ".join(record_str_items)
    output_file.write(record_str)
    output_file.write("\n")


sys.stderr.write("Done with %s!\n" % species_name)

Done with Parabacteroides_distasonis_56985!


In [47]:
# Close the gzip handle opened earlier so the compressed stream is finalised on disk.
sys.stderr.write("Done looping over species!\n")
output_file.close()
sys.stderr.write("Done!\n")

Done looping over species!
Done!


### FUNCTIONS

In [49]:
# Display the resolved output path for this species (sanity check; the value is not stored)
intermediate_filename_template % (intersample_change_directory, species_name)

'/u/project/ngarud/Garud_lab/HumanizedMouse/HumanizedMouse_Batch2/merged_data/intersample_change/Parabacteroides_distasonis_56985.txt.gz'

In [None]:
def load_intersample_change_map(species_name):
    """Load the intersample change records written for one species.

    Reads the gzipped file at
    intermediate_filename_template % (intersample_change_directory, species_name),
    skipping the header line and any record whose first column is not species_name.

    Returns a dict mapping (sample_1, sample_2) to a dict keyed by record type
    ('snps', 'genes' or 'private_snps'), whose values are
    (num_opportunities, perr, changes) tuples:
      * snps / private_snps changes: (gene_name, contig, position, variant_type, A1, D1, A2, D2)
      * genes changes:               (gene_name, D1, Dm1, D2, Dm2)
    An empty dict is returned when the file does not exist.
    """
    intermediate_filename = intermediate_filename_template % (intersample_change_directory, species_name)

    intersample_change_map = {}

    if not os.path.isfile(intermediate_filename):
        return intersample_change_map

    input_file = gzip.open(intermediate_filename,"r")
    try:
        input_file.readline() # header
        for line in input_file:
            items = line.split(",")
            if items[0].strip()!=species_name:
                continue

            sample_1 = items[1].strip()
            sample_2 = items[2].strip()
            change_type = items[3].strip()
            num_opportunities = float(items[4])
            perr = float(items[5])
            sample_pair = (sample_1, sample_2)
            # Fixed: this previously tested and assigned the undefined name
            # `temporal_change_map`, raising NameError on the first record.
            if sample_pair not in intersample_change_map:
                intersample_change_map[sample_pair] = {}

            changes = []
            # Columns 6 onward each hold one ';'-delimited change (none if the record is short)
            for change_str in items[6:]:

                subitems = change_str.split(";")

                # switch on type of change; 'snps' and 'private_snps' share one layout
                if change_type=='snps' or change_type=='private_snps':
                    gene_name = subitems[0].strip()
                    contig = subitems[1].strip()
                    position = long(subitems[2])
                    variant_type = subitems[3].strip()
                    A1 = float(subitems[4])
                    D1 = float(subitems[5])
                    A2 = float(subitems[6])
                    D2 = float(subitems[7])
                    changes.append( (gene_name, contig, position, variant_type, A1, D1, A2, D2) )

                elif change_type=='genes':
                    gene_name = subitems[0].strip()
                    D1 = float(subitems[1])
                    Dm1 = float(subitems[2])
                    D2 = float(subitems[3])
                    Dm2 = float(subitems[4])
                    changes.append( (gene_name, D1, Dm1, D2, Dm2) )

            intersample_change_map[sample_pair][change_type] = num_opportunities, perr, changes
    finally:
        # Always release the file handle, even if a malformed record raises
        input_file.close()

    return intersample_change_map

In [None]:
def calculate_private_reversions_from_intersample_change_map(intersample_change_map, sample_1, sample_2, lower_threshold=config.consensus_lower_threshold, 
upper_threshold=config.consensus_upper_threshold):
    """Find tracked private SNVs that revert between sample_1 and sample_2.

    A reversion is a private SNV whose alt frequency is >= upper_threshold in
    sample_1 and <= lower_threshold in sample_2. SNVs with zero depth in either
    sample are skipped and subtracted from the opportunity count.

    Returns (private_snp_opportunities, private_snp_perr, private_snp_reversions),
    or (-1, None, None) when the pair or its 'private_snps' record is missing.
    """
    sample_pair = sample_1, sample_2
    if sample_pair not in intersample_change_map:
        return -1, None, None
        
    if 'private_snps' not in intersample_change_map[sample_pair]:
        return -1, None, None
        
    # otherwise, some hope! 
    
    private_snp_opportunities, private_snp_perr, private_snps = intersample_change_map[sample_pair]['private_snps']
    
    # The unused `mutations` accumulator (and its never-returned branch) was removed.
    private_snp_reversions = []
    for snp_change in private_snps:
    
        a,b,c,d,A1,D1,A2,D2 = snp_change
        
        # No reads in one of the samples: not a usable opportunity
        if D1==0 or D2==0:
            private_snp_opportunities-=1
            continue
        
        f1 = A1*1.0/D1
        f2 = A2*1.0/D2
        
        if f1>=upper_threshold and f2<=lower_threshold:
            private_snp_reversions.append(snp_change)
    
    return private_snp_opportunities, private_snp_perr, private_snp_reversions


In [None]:
def calculate_mutations_reversions_from_intersample_change_map(intersample_change_map, sample_1, sample_2, lower_threshold=config.consensus_lower_threshold, 
upper_threshold=config.consensus_upper_threshold):
    """Classify SNP changes between sample_1 and sample_2 as mutations or reversions.

    With f = alt reads / depth in each sample:
      * mutation:  f1 <= lower_threshold and f2 >= upper_threshold
      * reversion: f1 >= upper_threshold and f2 <= lower_threshold
    Changes with zero depth in either sample have no defined frequency and are
    skipped (previously they raised ZeroDivisionError); the opportunity count is
    returned unchanged.

    Returns (snp_opportunities, snp_perr, mutations, reversions), or
    (-1, -1, [], []) when the pair or its 'snps' record is missing.
    """
    sample_pair = sample_1, sample_2
    if sample_pair not in intersample_change_map:
        return -1, -1, [], []
        
    if 'snps' not in intersample_change_map[sample_pair]:
        return -1, -1, [], []
        
    # otherwise, some hope! 
    snp_opportunities, snp_perr, snp_changes = intersample_change_map[sample_pair]['snps']
    
    mutations = []
    reversions = []
    for snp_change in snp_changes:
    
        a,b,c,d,A1,D1,A2,D2 = snp_change
        
        # Guard against division by zero, matching the private-SNV reversion calculation
        if D1==0 or D2==0:
            continue
        
        f1 = A1*1.0/D1
        f2 = A2*1.0/D2
        
        if (f1<=lower_threshold) and (f2>=upper_threshold):
            mutations.append(snp_change)
        elif (f1>=upper_threshold) and (f2<=lower_threshold):
            reversions.append(snp_change)
            
    
    return snp_opportunities, snp_perr, mutations, reversions


In [None]:
def calculate_gains_losses_from_intersample_change_map(intersample_change_map, sample_1, sample_2, max_absent_copynum=config.gainloss_max_absent_copynum, min_normal_copynum=config.gainloss_min_normal_copynum, max_normal_copynum=config.gainloss_max_normal_copynum):
    """Classify gene changes between sample_1 and sample_2 as gains or losses.

    Copy number in each sample is gene depth divided by marker depth (D/Dm).
    A gene counts as absent when its copy number is <= max_absent_copynum and as
    normal when it lies in [min_normal_copynum, max_normal_copynum].
      * gain: absent in sample_1, normal in sample_2
      * loss: normal in sample_1, absent in sample_2

    Returns (gene_opportunities, gene_perr, gains, losses), or (-1, -1, [], [])
    when the pair or its 'genes' record is missing.
    """
    pair_key = (sample_1, sample_2)
    if pair_key not in intersample_change_map or 'genes' not in intersample_change_map[pair_key]:
        return -1, -1, [], []

    gene_opportunities, gene_perr, gene_changes = intersample_change_map[pair_key]['genes']

    def _is_absent(copynum):
        # Copy number low enough to call the gene absent
        return copynum <= max_absent_copynum

    def _is_normal(copynum):
        # Copy number inside the normal single-copy window
        return (copynum >= min_normal_copynum) and (copynum <= max_normal_copynum)

    gains, losses = [], []
    for gene_change in gene_changes:
        gene_name, D1, Dm1, D2, Dm2 = gene_change

        copynum_before = D1/Dm1
        copynum_after = D2/Dm2

        if _is_absent(copynum_before) and _is_normal(copynum_after):
            gains.append(gene_change)
        elif _is_absent(copynum_after) and _is_normal(copynum_before):
            losses.append(gene_change)

    return gene_opportunities, gene_perr, gains, losses