In [1]:
# Basically replicate pickle_everything but track info about how many QP pairs are considered

In [9]:
from utils import sample_utils as su, parse_midas_data, substitution_rates_utils, config, temporal_changes_utils, snps_utils, core_gene_utils, gene_diversity_utils
import numpy as np
from numpy.random import choice, random as np_random, randint
import random
from collections import defaultdict
import pickle
import os, sys

# ======================================================
# Examines all consecutive timepoint pairs within hosts
# across all cohorts, and pickles SNP/gene change info
#
# NOTE(review): this header looks inherited from
# pickle_everything. within_host_type below is set to
# 'nonconsecutive', and the notebook's opening comment
# says the goal here is to count QP pairs -- confirm and
# update the description accordingly.
# ======================================================

# Parameters
sweep_type = 'full' # assume full for now
pp_prev_cohort = 'all' # passed as prev_cohort when loading QP samples
min_coverage = 0
# (lower, upper) threshold pair for each sweep type
thresholds = {'full': (0.2, 0.8), 'partial': (0.35, 0.65)}
lower_threshold, upper_threshold = thresholds[sweep_type]

clade_divergence_threshold = 3e-02 # TODO: change to top level clade definition later

# Minimum number of QP samples (pooled over cohorts) for a species to be processed
min_sample_size = 3
variant_types = ['1D','4D']
# NOTE(review): the value is 'nonconsecutive' although the original comment
# here read "consecutive timepoints" -- confirm which one is intended, since
# this setting determines which same-subject pairs are counted.
within_host_type = 'nonconsecutive'
min_snp_change_sample_size = 5

# For partitioning SNVs according to prevalence
derived_freq_bins = np.array([-1,0,0.01,0.1,0.5,0.9,0.99,1,2])
# One integer position per bin (0 .. number of bins - 1)
derived_virtual_freqs = np.arange(0,len(derived_freq_bins)-1)
# Tick positions sit halfway between consecutive virtual positions
derived_virtual_xticks = list(derived_virtual_freqs[:-1]+0.5)
derived_virtual_xticklabels = ['0','.01','.1','.5','.9','.99','1']

# For partitioning genes into different prevalence classes
gene_freq_bins = np.array([-1,0.1,0.5,0.9,2])
gene_freq_xticks			= [-4, -3,	-2,		-1,		0,	 1,		 2,		3, 4]
gene_freq_xticklabels = ['0','0.1','0.5', '0.9','1','0.9','0.5', '0.1','0']
# Gains use the positive positions, losses the mirrored negative ones
gene_gain_virtual_freqs = np.array([3.5,2.5,1.5,0.5])
gene_loss_virtual_freqs = np.array([-3.5,-2.5,-1.5,-0.5])

# Sample-subject-order maps
sys.stderr.write("Loading sample metadata...\n")
subject_sample_map = su.parse_subject_sample_map()
sample_order_map = su.parse_sample_order_map()
sample_subject_map = su.parse_sample_subject_map()
sys.stderr.write("Done!\n")

# Timepoint pair types
tp_pair_names = ['MM', 'MI', 'II', 'AA']

# Cohorts
cohorts = ['backhed', 'ferretti', 'yassour', 'shao', 'hmp']
# Every cohort except 'hmp'; used for the mother-infant totals
mi_cohorts = ['backhed', 'ferretti', 'yassour', 'shao']

# Prevalence cohorts
prev_cohorts = ['all', 'hmp', 'infant', 'nonpremie', 'mother']

# Samples for each cohort
samples = {cohort: su.get_sample_names(cohort) for cohort in cohorts}
hmp_samples = su.get_sample_names('hmp')
olm_samples = su.get_sample_names('olm')

# Mother / infant samples with every 'olm' sample removed. The exclusion test
# runs once per candidate sample, so it is done against a set rather than the
# olm_samples list; the resulting lists (contents and order) are unchanged.
olm_sample_set = set(olm_samples)
mother_samples = [sample for sample in su.get_sample_names('mother') if sample not in olm_sample_set]
infant_samples = [sample for sample in su.get_sample_names('infant') if sample not in olm_sample_set]

# Species list
good_species_list = parse_midas_data.load_pickled_good_species_list()

Loading sample metadata...
Done!


In [14]:
# Lookup used further down to read the sampling day of a mother sample
# (indexed by sample name and compared against the window -1 <= day < 6).
# NOTE(review): presumably days relative to delivery -- confirm in sample_utils.
mi_sample_day_dict = su.get_mi_sample_day_dict()

In [7]:
# Bookkeeping filled in by the loop below:
#   species_cohort_qp_pair_count[species][cohort] -> number of same-subject QP pairs
#   species_qp_pair_dict[species]                 -> list of (sample_i, sample_j) pairs
#   num_species_enough_haploid                    -> species passing the sample-size cutoff
species_cohort_qp_pair_count = defaultdict(dict)
species_qp_pair_dict = defaultdict(list)
num_species_enough_haploid = 0

for species_name in good_species_list[::-1]:

    sys.stderr.write("\nProcessing %s...\n" % species_name)

    # Sorted QP samples of this species, one list per cohort
    qp_sample_lists = {}
    for cohort in cohorts:
        cohort_qp_samples = su.load_qp_samples(samples[cohort], species_name, prev_cohort=pp_prev_cohort)['qp']
        qp_sample_lists[cohort] = sorted(cohort_qp_samples)

    # Pool the QP samples of every cohort and index them by position
    combined_qp_samples = sorted(su.flatten([qp_sample_lists[cohort] for cohort in cohorts]))
    combined_sample_name_idx_map = dict(
        (sample_name, idx) for idx, sample_name in enumerate(combined_qp_samples)
    )

    # The sample-size cutoff is applied to the pooled QP samples
    if len(combined_qp_samples) < min_sample_size:
        sys.stderr.write("Not enough haploid samples!\n")
        continue

    num_species_enough_haploid += 1

    # Collect and count same-subject pairs separately for each cohort
    for cohort in cohorts:
        desired_samples = qp_sample_lists[cohort]

        # The returned index arrays refer to positions in desired_samples
        same_subject_idxs = su.calculate_mi_ordered_same_subject_pairs(
            sample_order_map,
            desired_samples,
            within_host_type=within_host_type,
            one_per_mi_pair=False,
        )
        first_idxs, second_idxs = same_subject_idxs[0], same_subject_idxs[1]

        for i, j in zip(first_idxs, second_idxs):
            species_qp_pair_dict[species_name].append((desired_samples[i], desired_samples[j]))

        species_cohort_qp_pair_count[species_name][cohort] = len(first_idxs)


Processing Peptoniphilus_rhinitidis_62391...
Not enough haploid samples!

Processing Actinomyces_sp_62581...
Not enough haploid samples!

Processing Clostridium_sordellii_57678...
Not enough haploid samples!

Processing Eubacterium_biforme_61684...

Processing Clostridium_baratii_60475...

Processing Finegoldia_magna_57293...
Not enough haploid samples!

Processing Lactobacillus_crispatus_56887...
Not enough haploid samples!

Processing Escherichia_hermannii_58626...
Not enough haploid samples!

Processing Phascolarctobacterium_succinatutens_61948...
Not enough haploid samples!

Processing Enterococcus_faecalis_55915...

Processing Paraprevotella_xylaniphila_62280...

Processing Parabacteroides_goldsteinii_56831...

Processing Bacteroides_coprophilus_61767...

Processing Clostridium_hathewayi_61827...

Processing Odoribacter_laneus_62216...

Processing Enterococcus_gallinarum_52312...

Processing Prevotella_buccae_56058...

Processing Clostridium_leptum_61499...

Processing Corynebact

In [3]:
# Collapse the per-species, per-cohort pair counts into one total per cohort
cohort_qp_pair_counts = defaultdict(int)
for per_cohort_counts in species_cohort_qp_pair_count.values():
    for cohort, qp_count in per_cohort_counts.items():
        cohort_qp_pair_counts[cohort] += qp_count

In [4]:
# Report the total number of same-subject QP sample pairs found in each cohort
print("Number of QP sample pairs per cohort:")
print(cohort_qp_pair_counts)

Number of QP sample pairs per cohort:
defaultdict(<type 'int'>, {'ferretti': 84, 'backhed': 426, 'hmp': 1296, 'shao': 1572, 'yassour': 481})


In [5]:
# Total number of QP sample pairs over the mother-infant cohorts, i.e. every
# same-subject pair found in those cohorts regardless of who the two samples
# belong to.
mi_qp_pair_count = sum(cohort_qp_pair_counts[cohort] for cohort in mi_cohorts)

# The label says "cohorts" on purpose: a later cell prints
# "Number of QP sample pairs for all mother-infant" for the much smaller
# count of pairs made of one mother and one infant sample, and the two
# numbers must not be confused.
print("Number of QP sample pairs across all mother-infant cohorts: %i" % mi_qp_pair_count)

Number of QP sample pairs for all mother-infant: 2563


In [6]:
# Display the per-cohort totals as the cell's output value
cohort_qp_pair_counts

defaultdict(int,
            {'backhed': 426,
             'ferretti': 84,
             'hmp': 1296,
             'shao': 1572,
             'yassour': 481})

In [11]:
# How many of the 'good' species also passed the haploid-sample cutoff
haploid_species_summary = (
    "There are %i out of %i 'good' species which additionally have at least %i haploid samples"
    % (num_species_enough_haploid, len(good_species_list), min_sample_size)
)
print(haploid_species_summary)
print("Recall definition of good species: must have at least 10 samples with marker gene coverage >= 10")

There are 176 out of 217 'good' species which additionally have at least 3 haploid samples
Recall definition of good species: must have at least 10 samples with marker gene coverage >= 10


In [21]:
# Classify every collected QP pair by who its two samples belong to:
#   mi_tt_qp_pair_count       -> first sample from a mother, second from an infant
#   mi_deliv_tt_qp_pair_count -> as above, with the mother sample's day in [-1, 6)
#   ii_tt_qp_pair_count       -> both samples from infants
mi_tt_qp_pair_count = 0
mi_deliv_tt_qp_pair_count = 0
ii_tt_qp_pair_count = 0

# Membership is tested for every pair, so look samples up in sets instead of
# scanning the mother/infant lists each time.
mother_sample_set = set(mother_samples)
infant_sample_set = set(infant_samples)

for species in species_qp_pair_dict:
    for sample_i, sample_j in species_qp_pair_dict[species]:
        first_is_mother = sample_i in mother_sample_set
        first_is_infant = sample_i in infant_sample_set
        second_is_mother = sample_j in mother_sample_set
        second_is_infant = sample_j in infant_sample_set

        if first_is_mother and second_is_infant:
            mi_tt_qp_pair_count += 1
            # Mother sample taken in the window the summary labels "at delivery".
            # NOTE(review): days are presumably relative to delivery -- confirm.
            if -1 <= mi_sample_day_dict[sample_i] < 6:
                mi_deliv_tt_qp_pair_count += 1
        if first_is_infant and second_is_infant:
            ii_tt_qp_pair_count += 1
        if first_is_infant and second_is_mother:
            # Infant sample listed before the mother sample; such pairs are only
            # flagged here and are not added to any of the three counters.
            print("Weird")

Weird
Weird
Weird
Weird
Weird


In [22]:
# Report the three pair tallies, one line each, in a fixed order
tt_pair_summaries = [
    ("Number of QP sample pairs for all mother-infant: %i", mi_tt_qp_pair_count),
    ("Number of QP sample pairs for all mother-infant (mother at delivery only): %i", mi_deliv_tt_qp_pair_count),
    ("Number of QP sample pairs for all infant-infant: %i", ii_tt_qp_pair_count),
]
for message_template, pair_count in tt_pair_summaries:
    print(message_template % pair_count)

Number of QP sample pairs for all mother-infant: 281
Number of QP sample pairs for all mother-infant (mother at delivery only): 241
Number of QP sample pairs for all infant-infant: 2184


In [None]:
# Plot QP pairs per species for one cohort
#
# NOTE(review): the previous version of this cell could not run. It had a
# `for` statement without a body, used names that are never defined in this
# notebook (plt, cat, num_qp_agg_tps, num_non_agg_tps, num_lowcov_agg_tps) and
# indexed species_cohort_qp_pair_count as [cohort][species] although it is
# filled as [species][cohort]. It now plots only what this notebook computes:
# the number of QP sample pairs per species in `cohort`. The stacked non-QP
# bars were dropped because no non-QP counts exist here, and the output files
# were renamed (pairs + cohort) so they cannot overwrite a per-sample figure.
# The old "Plot infants first" remark was removed since cohort is 'hmp'.

import matplotlib.pyplot as plt
from matplotlib.patches import Patch

cohort = 'hmp'
max_species_shown = 40

# (species, pair count in this cohort), skipping species without any pair
species_pair_counts = [
    (species, cohort_counts.get(cohort, 0))
    for species, cohort_counts in species_cohort_qp_pair_count.items()
]
species_pair_counts = [(species, count) for species, count in species_pair_counts if count > 0]

# Most pairs first; ties are broken by name so the ordering is reproducible
species_pair_counts.sort(key=lambda pair: (-pair[1], pair[0]))

ordered_species_list = [species for species, count in species_pair_counts]

# Keep the top species and reverse them so the largest bar is drawn at the top
ordered_species_list_subset = ordered_species_list[:max_species_shown][::-1]

all_num_qp = [species_cohort_qp_pair_count[species][cohort] for species in ordered_species_list_subset]

fig, ax = plt.subplots(figsize=(8,12))

yticks = np.arange(len(all_num_qp))

ax.barh(yticks + 0.5, all_num_qp, color='orange')

ax.set_yticks(yticks + 0.5)
ax.set_yticklabels(ordered_species_list_subset)
ax.set_xlabel("Number of QP sample pairs")
ax.set_title("Number of QP sample pairs per species (%s)" % cohort)

legend_elements = [Patch(facecolor='orange', label='QP pairs')]
ax.legend(handles=legend_elements, loc='center right', frameon=False)

plt.show()
fig.savefig("%s/count_qp_pairs_by_species_%s_barh.pdf" % (config.analysis_directory, cohort), bbox_inches='tight')
fig.savefig("%s/count_qp_pairs_by_species_%s_barh.png" % (config.analysis_directory, cohort), bbox_inches='tight', dpi=500)