# Question 1: is `calculate_snp_prevalences.parse_population_freqs(species_name, use_HMP_freqs=True)` loading HMP snp prevalences?

In [None]:
import sys
sys.path.insert(0, "/u/home/m/michaelw/project-ngarud/microbiome_evolution/microbiome_evolution_MOUSE/")
import numpy
import bz2
import gzip
import config
import os.path

# Both prevalence locations share the same relative layout: one gzipped text
# file per species, with the species name substituted for the %s placeholder.
_snp_prevalence_suffix = "snp_prevalences/%s.txt.gz"

# Template rooted at the default data directory.
intermediate_filename_template = config.data_directory + _snp_prevalence_suffix
# Template rooted at the HMP data directory. #MW added this
intermediate_filename_template_HMP = config.HMP_data_directory + _snp_prevalence_suffix

def parse_population_freqs(desired_species_name, polarize_by_consensus=False, use_HMP_freqs=False):
    """Load per-site population frequencies for one species.

    The input is a gzipped, comma-separated text file with one header line.
    For every following row, column 0 is the contig, column 1 the location and
    column 2 the population frequency. Any further columns are ignored.

    Parameters
    ----------
    desired_species_name : str
        Species name substituted into the filename template.
    polarize_by_consensus : bool
        If True, a frequency f > 0.5 is stored as 1 - f.
    use_HMP_freqs : bool
        If True, read from `intermediate_filename_template_HMP`; otherwise
        read from `intermediate_filename_template`. (MW added this switch;
        per the original note it is set to True in
        calculate_within_person_sfs.py.)

    Returns
    -------
    dict
        Maps (contig, location) to the population frequency. Sites whose
        frequency (after optional polarization) equals 0 are left out.
        An empty dict is returned when the file does not exist.
    """
    if use_HMP_freqs:
        filename_template = intermediate_filename_template_HMP
    else:
        filename_template = intermediate_filename_template
    intermediate_filename = filename_template % desired_species_name

    population_freqs = {}

    if not os.path.isfile(intermediate_filename):
        return population_freqs

    prevalence_file = gzip.GzipFile(intermediate_filename, "r")
    try:
        prevalence_file.readline()  # skip the header line
        for line in prevalence_file:
            items = line.split(",")
            contig = items[0]
            location = long(items[1])
            population_freq = float(items[2])

            if polarize_by_consensus and population_freq > 0.5:
                population_freq = 1 - population_freq

            # Sites with zero frequency are not stored; callers treat a
            # missing key as frequency 0.
            if population_freq != 0:
                population_freqs[(contig, location)] = population_freq
    finally:
        # Release the handle even if a malformed row raises while parsing.
        prevalence_file.close()

    return population_freqs

In [None]:
# Species used for both parse_population_freqs calls below.
species_name = "Akkermansia_muciniphila_55290"

In [None]:
# Default call (use_HMP_freqs=False): reads the file built from intermediate_filename_template.
parse_population_freqs(species_name)

In [None]:
# Same species, but reading the file built from intermediate_filename_template_HMP.
parse_population_freqs(species_name, use_HMP_freqs=True)

**Conclusion:** `parse_population_freqs(species_name, use_HMP_freqs=True)` in `calculate_snp_prevalences` does work!

# Question 2: does `calculate_within_person_sfs.py` successfully create a within-person SFS using HMP SNP prevalences?

In [None]:
import parse_midas_data
import pylab
import sys
import numpy
import bz2
import calculate_snp_prevalences

In [None]:
# Species whose within-person SFS is recomputed in the cells below.
species_name = "Akkermansia_muciniphila_55290"
# When True, the site-processing loop below stops at its first 50000-site progress checkpoint.
debug = False
# NOTE(review): chunk_size is not referenced by any cell visible in this notebook - confirm whether it is still needed.
chunk_size = 1000000000

### Setup

In [None]:
# Restrict the analysis to core genes.
# (Open question carried over from the original cell: should we do this?)
sys.stderr.write("Loading core genes...\n")
core_genes = parse_midas_data.load_core_genes(species_name)
sys.stderr.write("Done! %d core genes\n" % len(core_genes))
allowed_genes = core_genes

# Population frequencies taken from the HMP prevalence files.
sys.stderr.write("Loading population freqs...\n")
population_freqs = calculate_snp_prevalences.parse_population_freqs(
    species_name,
    use_HMP_freqs=True,
)
sys.stderr.write("Done! %d SNVs\n" % len(population_freqs))

# Variant types that are tallied; everything else is skipped by the loop below.
allowed_variant_type_list = ['1D','2D','3D','4D']
allowed_variant_types = set(allowed_variant_type_list)

# Open post-processed MIDAS output. The handle stays open on purpose:
# the next cell iterates over the remaining lines.
annotated_snps_path = "%ssnps/%s/annotated_snps.txt.bz2" % (parse_midas_data.data_directory, species_name)
snp_file = bz2.BZ2File(annotated_snps_path, "r")

# The header's first whitespace-separated field is dropped; the rest are sample names.
line = snp_file.readline()
items = line.split()[1:]
samples = numpy.array([item.strip() for item in items])

# We shouldn't be doing this for raw data
#samples = parse_midas_data.parse_merged_sample_names(items)

# One independent {variant_type: {}} dictionary per sample, in header order.
site_map = [
    {variant_type: {} for variant_type in allowed_variant_types}
    for sample in samples
]


In [None]:

# Build the within-person SFS tallies. For every sample index i and variant type,
# site_map[i][variant_type] maps a (depth, alt count) pair to
# [number of sites with that pair, summed population_freq of those sites].
#
# snp_file is closed when this cell finishes (normally, via the debug break, or
# on an error), so the setup cell must be re-run before running this cell again.
sys.stderr.write("Calculating within-person SFSs...\n")
num_sites_processed = 0
try:
    for line in snp_file:
        items = line.split()
        # Load information about site; the first field is "|"-separated.
        info_items = items[0].split("|")
        chromosome = info_items[0]
        location = long(info_items[1])
        gene_name = info_items[2]
        variant_type = info_items[3]

        if len(info_items) > 5: # for backwards compatibility
            polarization = info_items[4]
            pvalue = float(info_items[5])
        else:
            polarization="?"
            pvalue = float(info_items[4])

        # Skip sites outside the allowed variant types.
        if variant_type not in allowed_variant_types:
            continue
        # Skip sites outside the allowed genes (an empty collection disables this filter).
        if len(allowed_genes)>0 and (gene_name not in allowed_genes):
            continue

        # Load alt and depth counts: each remaining field is "alt,depth" for one sample.
        alts = []
        depths = []
        for item in items[1:]:
            subitems = item.split(",")
            alts.append(long(subitems[0]))
            depths.append(long(subitems[1]))
        alts = numpy.array(alts)
        depths = numpy.array(depths)
        refs = depths-alts

        # population_freq returns the fraction of people for which the alt is the major allele.
        # This is a very important quantity being computed! It is later used for identifying CPS samples.
        # Sites missing from population_freqs are treated as frequency 0.
        population_freq = population_freqs.get((chromosome, location), 0)

        # polarize SFS according to population freq
        if population_freq>0.5:
            alts,refs = refs,alts
            population_freq = 1-population_freq

        for i in xrange(0,len(alts)):
            site = (depths[i],alts[i])
            site_entry = site_map[i][variant_type].setdefault(site, [0,0.0])
            site_entry[0] += 1
            site_entry[1] += population_freq # weight of polarization reversals

        num_sites_processed+=1
        if num_sites_processed%50000==0:
            # Floor division keeps the progress counter an integer.
            sys.stderr.write("%dk sites processed...\n" % (num_sites_processed//1000))
            if debug:
                break
finally:
    # The original cell never released the bz2 handle.
    snp_file.close()


In [None]:
# Inspect one bin for the first sample: '1D' sites with depth 169 and 0 alt reads (after polarization).
# The value is [site count, summed population_freq]; raises KeyError if no such bin was recorded.
site_map[0]['1D'][(169, 0)]

In [None]:
# List every (depth, alt count) bin recorded for the first sample's '1D' sites, in sorted order.
sorted(site_map[0]['1D'].keys())

**Conclusion:** The within-person SFSs are calculated successfully, and they differ depending on whether `use_HMP_freqs=True` is set.
- Furthermore, I confirmed that the within_host_sfs.txt.bz2 file is indeed the product of the pipeline when using `use_HMP_freqs=True`

# Question 3: What are the within and between difference values for each sample?

In [20]:
import sys
sys.path.insert(0, "/u/home/m/michaelw/project-ngarud/microbiome_evolution/microbiome_evolution_MOUSE/")

import numpy
from scipy.linalg import eigh
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.cluster.hierarchy import cophenet
from scipy.cluster.hierarchy import fcluster
from numpy.random import shuffle, normal
import scipy.stats
from scipy.stats import binom
import config
from scipy.special import betainc
import parse_midas_data
import sample_utils
import stats_utils
import os.path
import sfs_utils
import diversity_utils
import pandas as pd


In [12]:
# Species analysed in this section.
species_name = "Akkermansia_muciniphila_55290"
# NOTE(review): the second argument (20) is presumably a minimum-coverage threshold - confirm against diversity_utils.
# NOTE(review): desired_samples is not used by any later cell visible in this notebook.
desired_samples = diversity_utils.calculate_highcoverage_samples(species_name, 20)

In [13]:
# Load the stored within-sample SFS, restricted to the '4D' variant type.
samples, sfs_map = parse_midas_data.parse_within_sample_sfs(
    species_name,
    allowed_variant_types={'4D'},
)


In [16]:
# One (within_sites, between_sites, total_sites) triple per sample, in the order of `samples`.
polymorphism_rates = [
    sfs_utils.calculate_polymorphism_rates_from_sfs_map(sfs_map[sample])
    for sample in samples
]

# Counts are truncated to integers; the 10% column keeps the untruncated between_sites value.
total_vec = [int(total_sites) for within_sites, between_sites, total_sites in polymorphism_rates]
within_vec = [int(within_sites) for within_sites, between_sites, total_sites in polymorphism_rates]
between_vec = [int(between_sites) for within_sites, between_sites, total_sites in polymorphism_rates]
betweenX10percent = [0.1*between_sites for within_sites, between_sites, total_sites in polymorphism_rates]
    

In [22]:
# Assemble the per-sample summary table in one step; `columns` pins the column order.
df = pd.DataFrame(
    {
        'samples': samples,
        'total_sites': total_vec,
        'within_sites': within_vec,
        'between_sites': between_vec,
        'betweenX10percent': betweenX10percent,
    },
    columns=['samples', 'total_sites', 'within_sites', 'between_sites', 'betweenX10percent'],
)

In [23]:
# Display the per-sample summary table built in the previous cell.
df

Unnamed: 0,samples,total_sites,within_sites,between_sites,betweenX10percent
0,M8CeCGG,274399,19,8169,816.966868
1,M8JGG_2,274582,17,8219,821.990907
2,M8IGG_2,274676,18,8218,821.85205
3,M7CecGG,272344,20,8132,813.2623
4,M8ColonGG_2,274399,21,8206,820.636318
5,M7IGG_2,274628,19,8216,821.679176
6,TL1gDNAshort,274714,17,8215,821.576558
7,M7JGG,274468,18,8213,821.364061
8,M7DGG,273872,23,8137,813.734558
9,M8D,274627,18,8217,821.791789
