In [1]:
import sys
sys.path.insert(0, "/u/home/m/michaelw/project-ngarud/microbiome_evolution/microbiome_evolution_MOUSE/")

import matplotlib  
matplotlib.use('Agg') 
import sample_utils
import config
import parse_midas_data
import os.path
import pylab
import numpy
import diversity_utils
import gene_diversity_utils
import calculate_temporal_changes
import calculate_substitution_rates
import stats_utils
import sfs_utils
    
    
import matplotlib.colors as colors
import matplotlib.cm as cmx
from math import log10,ceil,log,exp
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
from numpy.random import randint, random, choice, multinomial, shuffle
import matplotlib.colors as mcolors
import matplotlib.patheffects as pe

from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.cluster.hierarchy import cophenet
from scipy.cluster.hierarchy import fcluster


mpl.rcParams['font.size'] = 7
mpl.rcParams['lines.linewidth'] = 0.5
mpl.rcParams['legend.frameon']  = False
mpl.rcParams['legend.fontsize']  = 'small'

species_name = "Bacteroides_vulgatus_57955"

### parameters

In [2]:
debug = False
chunk_size = 1000000000
memoize = False
modification_difference_threshold = config.modification_difference_threshold #set to 20
replacement_difference_threshold = config.replacement_difference_threshold #set to 500
twin_modification_difference_threshold = config.twin_modification_difference_threshold #set to 1000
twin_replacement_difference_threshold = config.twin_replacement_difference_threshold #set to 1000
default_num_bootstraps = 10000

### initialize outputs

In [3]:
output_filename = ('%s/within_host_output.txt' % (parse_midas_data.analysis_directory))
output_strs = []

In [4]:
min_coverage = config.min_median_coverage
min_sample_size = 3
min_haploid_sample_size = 10

### more parameters

In [5]:
variant_types = ['1D','4D']

In [6]:
within_host_type = 'consecutive' # MW: will need to change this

In [7]:
# For partitioning SNVs according to prevalence
derived_freq_bins = numpy.array([-1,0,0.01,0.1,0.5,0.9,0.99,1,2])
derived_virtual_freqs = numpy.arange(0,len(derived_freq_bins)-1)
derived_virtual_xticks = list(derived_virtual_freqs[:-1]+0.5)
derived_virtual_xticklabels = ['0','.01','.1','.5','.9','.99','1']

In [8]:
# For partitioning genes into different prevalence classes
gene_freq_bins = numpy.array([-1,0.1,0.5,0.9,2])
gene_freq_xticks      = [-4, -3,  -2,   -1,   0,   1,    2,   3, 4]
gene_freq_xticklabels = ['0','0.1','0.5', '0.9','1','0.9','0.5', '0.1','0']
gene_gain_virtual_freqs = numpy.array([3.5,2.5,1.5,0.5])
gene_loss_virtual_freqs = numpy.array([-3.5,-2.5,-1.5,-0.5])

### setting cohorts (irrelevant for our analysis)

In [9]:
# #####
# #
# # Settings for different cohorts we are looking at 
# #
# #####
# cohorts = ["hmp", "twins", "young_twins"]
# countries = ["United States", "United Kingdom", "Western Europe"]
# country_cohort_map = {country: cohort for country,cohort in zip(countries,cohorts)}

# modification_difference_thresholds = {"hmp": modification_difference_threshold, "twins": 1e06, "young_twins": twin_modification_difference_threshold}

# replacement_difference_thresholds = {"hmp": replacement_difference_threshold, "twins": twin_replacement_difference_threshold, "young_twins": twin_replacement_difference_threshold}

# Set up figures

## Distribution of changes across individual species

In [10]:
pylab.figure(1,figsize=(7,5))
fig = pylab.gcf()
# make three panels panels
outer_grid = gridspec.GridSpec(2,1,height_ratios=[1,1],hspace=0.25)

upper_grid  = gridspec.GridSpecFromSubplotSpec(1,2,width_ratios=[1,1],wspace=0.1, subplot_spec=outer_grid[0])

species_snp_axis = plt.Subplot(fig, upper_grid[0])
fig.add_subplot(species_snp_axis)

species_snp_axis.spines['top'].set_visible(False)
species_snp_axis.spines['right'].set_visible(False)
species_snp_axis.get_xaxis().tick_bottom()
species_snp_axis.get_yaxis().tick_left()

species_snp_axis.set_ylabel('Fraction comparisons $\geq n$')
species_snp_axis.set_xlabel('# SNV changes')


species_gene_axis = plt.Subplot(fig, upper_grid[1])
fig.add_subplot(species_gene_axis)

species_gene_axis.spines['top'].set_visible(False)
species_gene_axis.spines['right'].set_visible(False)
species_gene_axis.get_xaxis().tick_bottom()
species_gene_axis.get_yaxis().tick_left()

species_snp_axis.loglog([0.01],[1],'k.')
species_gene_axis.loglog([0.01],[1],'k.')

species_snp_axis.set_xlim([3e-01,1e05])
species_gene_axis.set_xlim([3e-01,1e04])

species_gene_axis.set_xlabel('# gene changes')

species_legend_axis = plt.Subplot(fig, outer_grid[1])
fig.add_subplot(species_legend_axis)

species_legend_axis.set_ylim([0,1])
species_legend_axis.set_xlim([0,1])

species_legend_axis.spines['top'].set_visible(False)
species_legend_axis.spines['right'].set_visible(False)
species_legend_axis.spines['left'].set_visible(False)
species_legend_axis.spines['bottom'].set_visible(False)

species_legend_axis.set_xticks([])
species_legend_axis.set_yticks([])

in singular transformations; automatically expanding.
bottom=1.0, top=1.0
  'bottom=%s, top=%s') % (bottom, top))


[]

## Main text figure

In [11]:
pylab.figure(2,figsize=(7,4))
fig2 = pylab.gcf()
# make three panels panels
outer_outer_grid_2 = gridspec.GridSpec(1,1) #2, width_ratios=[1,0.2],wspace=0.1) 
outer_grid_2 = gridspec.GridSpecFromSubplotSpec(2,1, height_ratios=[1,0.7],hspace=0.6, subplot_spec=outer_outer_grid_2[0])

prevalence_outer_grid = gridspec.GridSpecFromSubplotSpec(1,2, width_ratios=[1,0.1],wspace=0.2,subplot_spec=outer_grid_2[1])

prevalence_grid = gridspec.GridSpecFromSubplotSpec(1,2, width_ratios=[1,1],wspace=0.5,subplot_spec=prevalence_outer_grid[0])

pooled_grid = gridspec.GridSpecFromSubplotSpec(1,3,width_ratios=[1,1,0.2],wspace=0.15,subplot_spec=outer_grid_2[0])

pooled_snp_axis = plt.Subplot(fig2, pooled_grid[0])
fig2.add_subplot(pooled_snp_axis)
pooled_snp_axis.set_ylabel('Fraction comparisons $\geq n$')
pooled_snp_axis.set_xlabel('# SNV changes')
#pooled_axis.set_ylim([-35,35])
#pooled_snp_axis.set_xlim([2e-01,1e05])
pooled_snp_axis.set_xlim([0.6,1e05])

pooled_snp_axis.set_xticklabels([])

pooled_snp_axis.spines['top'].set_visible(False)
pooled_snp_axis.spines['right'].set_visible(False)
pooled_snp_axis.get_xaxis().tick_bottom()
pooled_snp_axis.get_yaxis().tick_left()
 
pooled_gene_axis = plt.Subplot(fig2, pooled_grid[1])
fig2.add_subplot(pooled_gene_axis)
#pooled_gene_axis.set_ylabel('Number of samples')
pooled_gene_axis.set_xlabel('# gene changes')
#pooled_axis.set_ylim([-35,35])
#pooled_gene_axis.set_xlim([2e-01,1e04])
pooled_gene_axis.set_xlim([0.6,1e04])

pooled_gene_axis.spines['top'].set_visible(False)
pooled_gene_axis.spines['right'].set_visible(False)
pooled_gene_axis.get_xaxis().tick_bottom()
pooled_gene_axis.get_yaxis().tick_left()

pooled_snp_axis.loglog([0.1],[1],'k.')
pooled_gene_axis.loglog([0.1],[1],'k.')
 
legend2_axis = plt.Subplot(fig2, pooled_grid[2])
fig2.add_subplot(legend2_axis)

legend2_axis.set_ylim([0,1])
legend2_axis.set_xlim([0,1])

legend2_axis.spines['top'].set_visible(False)
legend2_axis.spines['right'].set_visible(False)
legend2_axis.spines['left'].set_visible(False)
legend2_axis.spines['bottom'].set_visible(False)

legend2_axis.set_xticks([])
legend2_axis.set_yticks([])

legend2_axis.plot([-2,-1],[-1,-1],'-',linewidth=1, color='#08519c',label='Within-host')
legend2_axis.plot([-2,-1],[-1,-1],'-',color='#08519c',linewidth=1, label='modification',zorder=2,path_effects=[pe.Stroke(linewidth=5, foreground='#9ecae1'), pe.Normal()])
legend2_axis.plot([-2,-1],[-1,-1],'-',color='#08519c', label='replacement',linewidth=1,path_effects=[pe.Stroke(linewidth=5, foreground='#fee0d2'), pe.Normal()])
legend2_axis.plot([-2,-1],[-1,-1],'-',color='#08519c', label='no SNVs',linewidth=1,path_effects=[pe.Stroke(linewidth=5, foreground='0.8'), pe.Normal()])
legend2_axis.plot([-2,-1],[-1,-1], '-',linewidth=1,color='w', alpha=0.5, label=' ')
legend2_axis.plot([-2,-1],[-1,-1], '-',linewidth=1,color='r', alpha=0.5, label='Between-host\n(unrelated)')
legend2_axis.plot([-2,-1],[-1,-1],'-',linewidth=1,color='#8856a7', label='Between-host\n(adult twins)')

legend2_axis.legend(loc='upper center',frameon=False,fontsize=5,numpoints=1,ncol=1,handlelength=1)   


hmp_frequency_axis = plt.Subplot(fig2, prevalence_grid[0])
fig2.add_subplot(hmp_frequency_axis)

hmp_frequency_axis.spines['top'].set_visible(False)
hmp_frequency_axis.spines['right'].set_visible(False)
hmp_frequency_axis.get_xaxis().tick_bottom()
hmp_frequency_axis.get_yaxis().tick_left()
 
hmp_frequency_axis.set_xlabel('Derived allele prevalence\nacross hosts')
hmp_frequency_axis.set_ylabel('# SNV changes')

hmp_frequency_axis.set_xticks(derived_virtual_xticks)
hmp_frequency_axis.set_xticklabels(derived_virtual_xticklabels) #,rotation='vertical')

hmp_frequency_axis.set_ylim([0,200])

hmp_gene_frequency_axis = plt.Subplot(fig2, prevalence_grid[1])
fig2.add_subplot(hmp_gene_frequency_axis)

hmp_gene_frequency_axis.spines['top'].set_visible(False)
hmp_gene_frequency_axis.spines['right'].set_visible(False)
hmp_gene_frequency_axis.get_xaxis().tick_bottom()
hmp_gene_frequency_axis.get_yaxis().tick_left()
 
hmp_gene_frequency_axis.set_xlabel('Gene prevalence across hosts')
hmp_gene_frequency_axis.set_ylabel('# gene changes')

hmp_gene_frequency_axis.set_xlim([gene_freq_xticks[0],gene_freq_xticks[-1]])
hmp_gene_frequency_axis.set_xticks(gene_freq_xticks)
hmp_gene_frequency_axis.set_xticklabels(gene_freq_xticklabels) #,rotation='vertical')

hmp_gene_frequency_axis.plot([0,0],[100,100],'k-')
hmp_gene_frequency_axis.set_ylim([0,60])

hmp_gene_legend_axis = plt.Subplot(fig2, prevalence_outer_grid[1])
fig2.add_subplot(hmp_gene_legend_axis)

hmp_gene_legend_axis.set_ylim([0,1])
hmp_gene_legend_axis.set_xlim([0,1])

hmp_gene_legend_axis.spines['top'].set_visible(False)
hmp_gene_legend_axis.spines['right'].set_visible(False)
hmp_gene_legend_axis.spines['left'].set_visible(False)
hmp_gene_legend_axis.spines['bottom'].set_visible(False)

hmp_gene_legend_axis.set_xticks([])
hmp_gene_legend_axis.set_yticks([])

hmp_gene_legend_axis.bar([-2],[-1],width=0.2, linewidth=0,facecolor='#b3de69',label='gain')
hmp_gene_legend_axis.bar([-2],[-1],width=0.2, linewidth=0,facecolor='#ff7f00',label='loss')
hmp_gene_legend_axis.bar([-2],[-1],width=0.2, linewidth=0,facecolor='0.7',label='de novo\nexpectation')

hmp_gene_legend_axis.legend(loc='center left',frameon=False,fontsize=5,numpoints=1,ncol=1,handlelength=1)   


<matplotlib.legend.Legend at 0x2ba5b957ccd0>

# Calculations

### `pooled_snp_axis`

In [12]:
# Load subject and sample metadata
sys.stderr.write("Loading sample metadata...\n")
subject_sample_map = parse_midas_data.parse_subject_sample_map()
sample_subject_map = sample_utils.calculate_sample_subject_map(subject_sample_map)
sys.stderr.write("Done!\n")

Loading sample metadata...
Done!


In [13]:
good_species_list = parse_midas_data.parse_good_species_list() #Loads good species based on them having marker gene coverage over 10 (i.e., present) in at least 2 samples, basically, 


In [14]:
num_passed_species = 0

In [15]:
#This is the for loop starting at line 746
species_name = "Akkermansia_muciniphila_55290"


In [16]:
# all samples
all_samples = sample_subject_map.keys()

In [17]:
highcoverage_samples = set(diversity_utils.calculate_highcoverage_samples(species_name))

In [18]:
haploid_samples = set(diversity_utils.calculate_haploid_samples(species_name))

In [19]:
haploid_samples

{'M1CeC_CKDN220050952-1A_H7MMHDSX5_L3',
 'M1CoC_CKDN220050958-1A_H7MMHDSX5_L2',
 'M1DC_CKDN220050964-1A_H7MMHDSX5_L2',
 'M1IC_CKDN220050976-1A_H7MMHDSX5_L2',
 'M1JC_CKDN220050970-1A_H7MMHDSX5_L2',
 'M2CeC_CKDN220050953-1A_merge',
 'M2CoC_CKDN220050959-1A_H7MMHDSX5_L2',
 'M2IC_CKDN220050977-1A_H7MMHDSX5_L2',
 'M2JC_CKDN220050971-1A_H7MMHDSX5_L2',
 'M3CeC_CKDN220050954-1A_H7MMHDSX5_L2',
 'M3CoC_CKDN220050960-1A_H7MMHDSX5_L2',
 'M3DC_CKDN220050966-1A_H7MMHDSX5_L2',
 'M3JC_CKDN220050972-1A_H7MMHDSX5_L2',
 'M4CeG_CKDN220050955-1A_H7MMHDSX5_L2',
 'M4CoG_CKDN220050961-1A_merge',
 'M4DG_CKDN220050967-1A_H7MMHDSX5_L3',
 'M4IG_CKDN220050979-1A_H7MMHDSX5_L2',
 'M4JG_CKDN220050973-1A_merge',
 'M5CeG_CKDN220050956-1A_H7MMHDSX5_L2',
 'M5CoG_CKDN220050962-1A_merge',
 'M5DG_CKDN220050968-1A_merge',
 'M5IG_CKDN220050980-1A_HF3V5DSX5_L1',
 'M5JG_CKDN220050974-1A_merge',
 'M6CeG_CKDN220050957-1A_H7MMGDSX5_L2',
 'M6CoG_CKDN220050963-1A_H7MMHDSX5_L2',
 'M6DG_CKDN220050969-1A_H7MMGDSX5_L1',
 'M6IG_CKDN22005

In [20]:
all_samples = list(haploid_samples)

In [21]:
#skipped from line 760 to 855
import calculate_private_snvs


In [22]:
private_snv_map = calculate_private_snvs.load_private_snv_map(species_name)

In [23]:
private_snv_map

{}

In [24]:
import calculate_snp_prevalences
snv_freq_map = calculate_snp_prevalences.parse_population_freqs(species_name,polarize_by_consensus=True)
snv_freq_keys = snv_freq_map.keys()
snv_freq_values = snv_freq_map.values()

In [41]:
snv_freq_map

{('NC_010655', 138292L): 0.5,
 ('NC_010655', 299821L): 0.0789474,
 ('NC_010655', 299872L): 0.108108,
 ('NC_010655', 816900L): 0.5,
 ('NC_010655', 816901L): 0.5,
 ('NC_010655', 816902L): 0.333333,
 ('NC_010655', 835682L): 0.333333,
 ('NC_010655', 866125L): 0.025641000000000025,
 ('NC_010655', 866126L): 0.0512821,
 ('NC_010655', 866141L): 0.10256399999999999,
 ('NC_010655', 866157L): 0.17948699999999995,
 ('NC_010655', 866162L): 0.230769,
 ('NC_010655', 866211L): 0.384615,
 ('NC_010655', 866218L): 0.307692,
 ('NC_010655', 866244L): 0.461538,
 ('NC_010655', 866263L): 0.421053,
 ('NC_010655', 866305L): 0.315789,
 ('NC_010655', 866335L): 0.162162,
 ('NC_010655', 1171515L): 0.115385,
 ('NC_010655', 1171558L): 0.484848,
 ('NC_010655', 1171890L): 0.30303,
 ('NC_010655', 1171912L): 0.333333,
 ('NC_010655', 1171914L): 0.19999999999999996,
 ('NC_010655', 1185477L): 0.1875,
 ('NC_010655', 1185478L): 0.153846,
 ('NC_010655', 1216444L): 0.0714286,
 ('NC_010655', 1360892L): 0.025641000000000025,
 ('N

In [42]:
import core_gene_utils
gene_freq_map = core_gene_utils.parse_gene_freqs(species_name)
gene_freq_values = numpy.array(gene_freq_map.values())
gene_freq_weights = gene_freq_values*1.0/gene_freq_values.sum()

In [45]:
sys.stderr.write("Loading pre-computed substitution rates for %s...\n" % species_name)
substitution_rate_map = calculate_substitution_rates.load_substitution_rate_map(species_name)
sys.stderr.write("Calculating SNV matrix...\n")
dummy_samples, snp_mut_difference_matrix, snp_rev_difference_matrix, snp_mut_opportunity_matrix, snp_rev_opportunity_matrix = calculate_substitution_rates.calculate_mutrev_matrices_from_substitution_rate_map(substitution_rate_map, 'all', allowed_samples=combined_samples)





Loading pre-computed substitution rates for Akkermansia_muciniphila_55290...
Calculating SNV matrix...


NameError: name 'combined_samples' is not defined