In [1]:
from utils import config, parse_midas_data, sample_utils as su, temporal_changes_utils, stats_utils, midas_db_utils, parse_patric
from collections import defaultdict
import numpy as np
import pickle
import sys
import random
from math import log10,ceil,log,exp

import matplotlib.cm as cmx
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import matplotlib.colors as mcolors
import matplotlib.patheffects as pe

# Plot directory
plot_dir = "%s/" % (config.analysis_directory)

# Species list
good_species_list = parse_midas_data.load_pickled_good_species_list()

# Sample-subject-order maps
sys.stderr.write("Loading sample metadata...\n")
subject_sample_map = su.parse_subject_sample_map()
sample_order_map = su.parse_sample_order_map()
sample_subject_map = su.parse_sample_subject_map()
sys.stderr.write("Done!\n")

# Timepoint pair types
tp_pair_names = ['MM', 'MI', 'II', 'AA']

# Cohorts. This is the single definition of `cohorts`; note that 'olm' is not
# in the list (olm samples are loaded separately below and filtered out).
cohorts = ['backhed', 'ferretti', 'yassour', 'shao', 'hmp']
mi_cohorts = ['backhed', 'ferretti', 'yassour', 'shao']

# Samples for each cohort
samples = {cohort: su.get_sample_names(cohort) for cohort in cohorts}
hmp_samples = su.get_sample_names('hmp')
mother_samples = su.get_sample_names('mother')
infant_samples = su.get_sample_names('infant')
olm_samples = su.get_sample_names('olm')

# Set copy used only for fast membership tests; the *_no_olm results stay lists
olm_sample_set = set(olm_samples)
infant_samples_no_olm = [sample for sample in infant_samples if sample not in olm_sample_set]
mi_samples_no_olm = [sample for sample in (mother_samples + infant_samples) if sample not in olm_sample_set]

# Sample-cohort map
sample_cohort_map = su.parse_sample_cohort_map()

# Sample-timepoint map
mi_sample_day_dict = su.get_mi_sample_day_dict(exclude_cohorts=['olm'])
mi_tp_sample_dict = su.get_mi_tp_sample_dict(exclude_cohorts=['olm']) # no binning
mi_tp_sample_dict_binned, mi_tp_binned_labels = su.get_mi_tp_sample_dict(exclude_cohorts=['olm'], binned=True)

Loading sample metadata...
Done!


In [9]:
# ======================================================================
# Load pickled data
# ======================================================================

# Parameters
sweep_type = 'full' # assume full for now
pp_prev_cohort = 'all'
min_coverage = 0

ddir = config.data_directory
pdir = "%s/pickles/cov%i_prev_%s/nonconsecutive" % (ddir, min_coverage, pp_prev_cohort)

def _load_pickle(path):
    """Unpickle and return the object stored at `path`.

    The file is opened in a `with` block so the handle is closed as soon as
    loading finishes (or fails). pickle.load can execute arbitrary code, so
    only point this at files produced by this project's own pipeline.
    """
    with open(path, 'rb') as pickle_file:
        return pickle.load(pickle_file)

snp_changes = _load_pickle('%s/big_snp_changes_%s.pkl' % (pdir, sweep_type))
gene_changes = _load_pickle('%s/big_gene_changes_%s.pkl' % (pdir, sweep_type))
snp_change_freqs = _load_pickle('%s/snp_change_freqs_%s.pkl' % (pdir, sweep_type))
# NOTE(review): this filename hard-codes 'full' instead of using sweep_type - confirm that is intended
snp_change_freqs_with_opps = _load_pickle('%s/snp_change_freqs_with_opps_full.pkl' % (pdir))
snp_change_null_freqs = _load_pickle('%s/snp_change_null_freqs_%s.pkl' % (pdir, sweep_type))
gene_gain_freqs = _load_pickle('%s/gene_gain_freqs_%s.pkl' % (pdir, sweep_type))
gene_loss_freqs = _load_pickle('%s/gene_loss_freqs_%s.pkl' % (pdir, sweep_type))
gene_loss_null_freqs = _load_pickle('%s/gene_loss_null_freqs_%s.pkl' % (pdir, sweep_type))
between_snp_change_counts = _load_pickle('%s/between_snp_change_counts_%s.pkl' % (pdir, sweep_type))
between_gene_change_counts = _load_pickle('%s/between_gene_change_counts_%s.pkl' % (pdir, sweep_type))
# The three null files below are stored without a .pkl extension
snp_change_present_gene_null = _load_pickle('%s/snp_change_present_gene_null' % pdir)
snp_change_between_host_null = _load_pickle('%s/snp_change_between_host_null' % pdir)
snp_change_pangenome_null = _load_pickle('%s/snp_change_pangenome_null' % pdir)
dnds_info = _load_pickle('%s/dnds_info.pkl' % (pdir))

In [3]:
# Consecutive data

# Parameters
sweep_type = 'full' # assume full for now
pp_prev_cohort = 'all'
min_coverage = 0

# NOTE: this rebinds `pdir` to the parent of the .../nonconsecutive directory
# used by the "Load pickled data" cell.
ddir = config.data_directory
pdir = "%s/pickles/cov%i_prev_%s" % (ddir, min_coverage, pp_prev_cohort)

# Open inside a `with` block so the file handle is closed after loading
with open('%s/big_snp_changes_%s.pkl' % (pdir, sweep_type), 'rb') as pickle_file:
    snp_changes_consecutive = pickle.load(pickle_file)

In [6]:
# Calculate number of days for a timepoint pair

# NOTE: this rebinds mi_sample_day_dict. The first cell built it with
# exclude_cohorts=['olm']; here it is rebuilt without that argument.
mi_sample_day_dict = su.get_mi_sample_day_dict()

def sample_pair_to_days(sample1, sample2):
    """Return the non-negative number of days separating two samples.

    Both sample days are looked up in the module-level mi_sample_day_dict;
    a missing sample raises KeyError.
    """
    elapsed = mi_sample_day_dict[sample2] - mi_sample_day_dict[sample1]
    return np.abs(elapsed) if elapsed < 0 else elapsed

def tp_pair_to_days(tp_pair):
    """Return the absolute difference between the numeric parts of two timepoint labels.

    Each label is one leading character followed by a number (everything
    after the first character is parsed with float()).
    """
    first_day, second_day = [float(label[1:]) for label in tp_pair]
    return np.abs(first_day - second_day)

# Rough approximation of HMP time intervals
def adult_tp_pair_to_days(tp_pair, days_per_interval=183):
    """Approximate the number of days between two adult timepoint labels.

    Each label is one leading character followed by an integer visit order.
    The whole numeric suffix is parsed (matching tp_pair_to_days), so visit
    orders of 10 or more are handled correctly; reading only the final
    character would turn 'A10' into 0.

    tp_pair: pair of timepoint labels, e.g. ('A1', 'A3')
    days_per_interval: assumed number of days between consecutive visits
    Returns: |order_a - order_b| * days_per_interval
    """
    tpa, tpb = tp_pair
    return np.abs(int(tpa[1:]) - int(tpb[1:])) * days_per_interval

def tp_pair_to_tp_type(tp_pair):
    """Return the two-letter type of a timepoint pair.

    The type is the first character of each label concatenated; 'IM' is
    normalized to 'MI'.
    """
    first_tp, second_tp = tp_pair
    pair_type = first_tp[0] + second_tp[0]
    return 'MI' if pair_type == 'IM' else pair_type

In [7]:
# Settings for prevalence plots

# Thresholds are taken from the project config
modification_difference_threshold = config.modification_difference_threshold
replacement_difference_threshold = config.replacement_difference_threshold
default_num_bootstraps = 10000
min_sample_size = 3
min_haploid_sample_size = 10

variant_types = ['1D', '4D']
within_host_type = 'consecutive' # consecutive timepoints (vs. longest)

num_bootstraps = 10 # for gene change prevalence null

# SNV prevalence bins: bin edges, one integer "virtual" position per bin,
# and tick positions/labels for the boundaries between bins
derived_freq_bins = np.array([-1, 0, 0.01, 0.1, 0.5, 0.9, 0.99, 1, 2])
derived_virtual_freqs = np.arange(len(derived_freq_bins) - 1)
derived_virtual_xticks = list(0.5 + derived_virtual_freqs[:-1])
derived_virtual_xticklabels = ['0', '.01', '.1', '.5', '.9', '.99', '1']

# Gene prevalence classes: losses sit on the negative half of the axis,
# gains on the positive half
gene_freq_bins = np.array([-1, 0.1, 0.5, 0.9, 2])
gene_freq_xticks = list(range(-4, 5))
gene_freq_xticklabels = ['0', '0.1', '0.5', '0.9', '1', '0.9', '0.5', '0.1', '0']
gene_gain_virtual_freqs = np.array([3.5, 2.5, 1.5, 0.5])
gene_loss_virtual_freqs = np.array([-3.5, -2.5, -1.5, -0.5])
# Function
def get_f_idx(f):
    """Return the index of the derived_freq_bins interval (lower, upper] containing f.

    The index comes from argmax over a boolean mask, so a value lying in no
    interval yields 0.
    """
    above_lower_edge = f > derived_freq_bins[:-1]
    within_upper_edge = f <= derived_freq_bins[1:]
    return np.logical_and(above_lower_edge, within_upper_edge).argmax()

In [28]:
# Do suspected de novo mutations in infants revert within hosts?

# subject -> species -> (day1, day2, tp_type) -> list of SNP change tuples OR 'replacement'
host_snp_changes = defaultdict(dict)
# subject -> species -> (day1, day2, tp_type) -> list of (vartype, freq_dict, opp_dict) tuples OR 'replacement'
host_snp_changes_freqs = defaultdict(dict)

for species in snp_changes:
    for sample_i, sample_j in snp_changes[species]:
        sample_pair = (sample_i, sample_j)

        tp_pair = su.sample_pair_to_tp_pair(sample_i, sample_j, sample_order_map, hmp_samples, mother_samples)
        tp_type = tp_pair_to_tp_type(tp_pair)
        if tp_type not in ('MI', 'II'): # Only look at mother-infant or infant-infant
            continue

        subject = sample_subject_map[sample_j][:-2] # Combine mother and infant subjects!
        day1, day2 = mi_sample_day_dict[sample_i], mi_sample_day_dict[sample_j]
        pair_key = (day1, day2, tp_type)

        snp_change_val = snp_changes[species][sample_pair]

        if isinstance(snp_change_val, int): # Replacement (stored as a number rather than a list)
            changes = 'replacement'
            change_freqs = 'replacement'
        else:
            changes = snp_change_val
            # list of (vartype, freq_dict, opp_dict) tuples
            change_freqs = snp_change_freqs_with_opps[species][sample_pair]

        # setdefault creates the per-species dict on first use; this replaces a
        # bare try/except that would also have hidden unrelated errors
        host_snp_changes[subject].setdefault(species, {})[pair_key] = changes
        host_snp_changes_freqs[subject].setdefault(species, {})[pair_key] = change_freqs

In [None]:
# For every (subject, species) with more than one timepoint pair, print each
# SNP change and collect the changed sites.
num_host_species_pairs = 0
desired_host_species_sites = []

for subject in host_snp_changes:
    for species in host_snp_changes[subject]:
        if len(host_snp_changes[subject][species].keys()) > 1:
            num_host_species_pairs += 1
            print("Subject: %s, Species: %s" % (subject, species))
            sites = []
            for day1, day2, tp_type in sorted(host_snp_changes[subject][species].keys(), key=lambda x: x[0]):
                print('\n%s Day %i to day %i' % (tp_type, day1, day2))
                snp_change_list = host_snp_changes[subject][species][(day1, day2, tp_type)]
                if snp_change_list == 'replacement':
                    # Replacements are stored as the string 'replacement', not a list
                    # of changes; zipping over the string would fail on unpacking.
                    print("REPLACEMENT")
                    continue
                snp_change_freqs_list = host_snp_changes_freqs[subject][species][(day1, day2, tp_type)]
                for snp_change, x_snp_change_freqs in zip(snp_change_list, snp_change_freqs_list):
                    gene_id, contig, location, vartype, a1, d1, a2, d2 = snp_change
                    vartype2, freq_dict, opp_dict = x_snp_change_freqs
                    if vartype != vartype2:
                        # The two parallel lists disagree about this site's variant type
                        print("Weird")
                    sites.append((gene_id, contig, location))
                    print("Site: %s,%s,%s\t| Vartype: %s" % (gene_id, contig, location, vartype))
                    print("\t%i %i %i %i | %.02f -> %.02f" % (a1, d1, a2, d2, float(a1)/d1, float(a2)/d2))
                    print("\tInfant prev: %.02f | HMP prev: %.02f | Mother prev: %.02f" % (freq_dict['nonpremie'], freq_dict['hmp'], freq_dict['mother']))
            desired_host_species_sites.append(([subject], species, sites))
            print("\n=============================================================================")

In [31]:
# Get a sense of available timepoints per host
# so I filter out MI appropriately
# Wait... should I not be restricting to modifications?
# Including any nonconsecutive timepoint pair, calculation seems correct

num_only_infant = 0
num_have_mother = 0

for subject in host_snp_changes:
    for species in host_snp_changes[subject]:
        pairs_for_species = host_snp_changes[subject][species]
        m_days = set()
        i_days = set()
        # For 'MI' keys the first day is recorded as the mother's, the second as the infant's
        for day1, day2, tp_type in sorted(pairs_for_species.keys(), key=lambda x: x[0]):
            if tp_type == 'MI':
                m_days.add(day1)
                i_days.add(day2)
            elif tp_type == 'II':
                i_days.update((day1, day2))

        if not m_days:
            num_only_infant += 1
            continue

        num_have_mother += 1
        print("Subject: %s, Species: %s" % (subject, species))
        print("Mother: %s" % str(sorted(m_days)))
        print("Infant: %s" % str(sorted(i_days)))
        print("Pairs: %s" % str(pairs_for_species.keys()))
        print('')

Subject: M0305, Species: Bacteroides_uniformis_57318
Mother: [0, 92]
Infant: [14, 30]
Pairs: [(0, 30, 'MI'), (14, 30, 'II'), (92, 30, 'MI'), (92, 14, 'MI'), (0, 14, 'MI')]

Subject: A01805, Species: Bifidobacterium_adolescentis_56815
Mother: [1]
Infant: [4]
Pairs: [(1, 4, 'MI')]

Subject: C01758, Species: Escherichia_coli_58110
Mother: [1]
Infant: [21, 311]
Pairs: [(1, 21, 'MI'), (21, 311, 'II'), (1, 311, 'MI')]

Subject: C01758, Species: Bacteroides_vulgatus_57955
Mother: [1]
Infant: [311]
Pairs: [(1, 311, 'MI')]

Subject: C01930, Species: Enterococcus_faecium_56947
Mother: [1]
Infant: [7]
Pairs: [(1, 7, 'MI')]

Subject: C01930, Species: Bacteroides_uniformis_57318
Mother: [1]
Infant: [271]
Pairs: [(1, 271, 'MI')]

Subject: 100, Species: Bifidobacterium_bifidum_55065
Mother: [2]
Infant: [122]
Pairs: [(2, 122, 'MI')]

Subject: 100, Species: Blautia_wexlerae_56130
Mother: [2]
Infant: [366]
Pairs: [(2, 366, 'MI')]

Subject: 39, Species: Bacteroides_caccae_53434
Mother: [2]
Infant: [366]


In [21]:
# Number of (subject, species) entries that have no mother-infant ('MI') timepoint pair
num_only_infant

145

In [61]:
# Report (subject, species) entries that have at least one infant-infant
# modification AND at least one mother-infant pair, tallied by cohort and species.
num_host_species_pairs = 0
num_host_species_pairs_by_cohort = defaultdict(int)
num_host_species_pairs_by_species = defaultdict(int)

for subject in host_snp_changes:

    # Really hacky way to get cohort of this subject.
    # Reset for every subject so that a failed lookup is tallied as 'unknown'
    # instead of silently reusing the previous subject's cohort.
    cohort = 'unknown'
    for possible_subject in [subject+'IS', subject+'-I', subject+'-C']:
        if possible_subject in subject_sample_map:
            for sample in subject_sample_map[possible_subject]:
                if sample in sample_cohort_map:
                    cohort = sample_cohort_map[sample]

    for species in host_snp_changes[subject]:
        if len(host_snp_changes[subject][species].keys()) > 1:

            # Check if there is at least one infant-infant modification for this host
            # AND mother information
            # If not, skip/not interested
            has_modification = False
            has_mother = False
            for day1, day2, tp_type in host_snp_changes[subject][species]:
                val = host_snp_changes[subject][species][(day1, day2, tp_type)]
                if val != 'replacement' and len(val) > 0 and tp_type == 'II':
                    has_modification = True
                if tp_type == 'MI':
                    has_mother = True

            if not (has_modification and has_mother):
                continue

            num_host_species_pairs += 1
            num_host_species_pairs_by_cohort[cohort] += 1
            num_host_species_pairs_by_species[species] += 1

            print("Subject: %s, Species: %s" % (subject, species))

            for day1, day2, tp_type in sorted(host_snp_changes[subject][species].keys(), key=lambda x: x[0]):
                print('\n%s Day %i to day %i' % (tp_type, day1, day2))
                val = host_snp_changes[subject][species][(day1, day2, tp_type)]
                if val == 'replacement':
                    print("REPLACEMENT")
                elif len(val) == 0:
                    print("NO CHANGE")
                else:
                    snp_change_list = val
                    snp_change_freqs_list = host_snp_changes_freqs[subject][species][(day1, day2, tp_type)]
                    for snp_change, x_snp_change_freqs in zip(snp_change_list, snp_change_freqs_list):
                        gene_id, contig, location, vartype, a1, d1, a2, d2 = snp_change
                        vartype2, freq_dict, opp_dict = x_snp_change_freqs
                        if vartype != vartype2:
                            # The two parallel lists disagree about this site's variant type
                            print("Weird")
                        print("Site: %s,%s,%s\t| Vartype: %s" % (gene_id, contig, location, vartype))
                        print("\t%i %i %i %i | %.02f -> %.02f" % (a1, d1, a2, d2, float(a1)/d1, float(a2)/d2))
                        print("\tInfant prev: %.02f | HMP prev: %.02f | Mother prev: %.02f" % (freq_dict['nonpremie'], freq_dict['hmp'], freq_dict['mother']))
            print("\n=============================================================================")

Subject: 98, Species: Parabacteroides_distasonis_56985

MI Day 2 to day 122
Site: 658663.3.peg.45,ACUD01000001,46292	| Vartype: 1D
	0 51 264 264 | 0.00 -> 1.00
	Infant prev: -0.50 | HMP prev: -0.50 | Mother prev: -0.50
Site: 658663.3.peg.4287,ACUD01000012,72506	| Vartype: 2D
	3 27 134 134 | 0.11 -> 1.00
	Infant prev: -0.50 | HMP prev: -0.50 | Mother prev: -0.50
Site: 658663.3.peg.2184,ACUD01000004,144184	| Vartype: 1D
	0 25 106 106 | 0.00 -> 1.00
	Infant prev: -0.50 | HMP prev: -0.50 | Mother prev: -0.50

MI Day 2 to day 366
Site: 658663.3.peg.45,ACUD01000001,46292	| Vartype: 1D
	0 51 47 47 | 0.00 -> 1.00
	Infant prev: -0.50 | HMP prev: -0.50 | Mother prev: -0.50
Site: 658663.3.peg.2024,ACUD01000003,296362	| Vartype: 1D
	0 49 54 61 | 0.00 -> 0.89
	Infant prev: -0.50 | HMP prev: -0.50 | Mother prev: -0.50
Site: 658663.3.peg.2311,ACUD01000004,294606	| Vartype: 1D
	0 37 33 40 | 0.00 -> 0.82
	Infant prev: -0.50 | HMP prev: -0.50 | Mother prev: -0.50
Site: 658663.3.peg.4435,ACUD01000013,527

In [62]:
# Show the overall tally, then the per-cohort and per-species breakdowns
for tally in (num_host_species_pairs,
              num_host_species_pairs_by_cohort,
              num_host_species_pairs_by_species):
    print(tally)

10
defaultdict(<type 'int'>, {'yassour': 1, 'shao': 2, 'backhed': 7})
defaultdict(<type 'int'>, {'Bacteroides_fragilis_54507': 2, 'Bifidobacterium_adolescentis_56815': 1, 'Bacteroides_xylanisolvens_57185': 1, 'Parabacteroides_distasonis_56985': 1, 'Bifidobacterium_bifidum_55065': 1, 'Bacteroides_vulgatus_57955': 4})


In [4]:
# Parallelism

# Custom cohort labels; used below as the keys of every per-cohort accumulator
custom_cohorts_ordered = ['MI', 'Day 0-Week 1', 'Week 1-Month 1', 'Month 1-Year 1', 'II(other)', 'MM', 'AA']
# Variant type labels used as keys of the per-gene count dicts
# (rebinds variant_types, which the prevalence-settings cell set to ['1D','4D'])
variant_types = ['1D', '2D', '3D', '4D']

def custom_cohort_of_sample_pair(custom_cohort_tests, sample_i, sample_j, cohort_order=None):
    """Return the first cohort whose test accepts the sample pair, or None.

    custom_cohort_tests: dict mapping cohort name -> predicate(sample_i, sample_j)
    sample_i, sample_j: the two samples of the pair
    cohort_order: optional iterable of cohort names giving the order in which
        the tests are tried. When several tests can accept the same pair
        (e.g. a catch-all category), pass this to make the priority explicit:
        without it the tests are tried in the dict's own iteration order,
        which is not insertion order on Python 2 / Python < 3.7.
    """
    if cohort_order is None:
        cohort_order = custom_cohort_tests
    for cohort in cohort_order:
        if custom_cohort_tests[cohort](sample_i, sample_j):
            return cohort
    return None

def _is_infant_pair(sample_i, sample_j):
    """True when both samples are (non-olm) infant samples."""
    return (sample_i in infant_samples_no_olm and sample_j in infant_samples_no_olm)

def _infant_pair_in_window(earliest_day, latest_day):
    """Build a test accepting infant pairs whose first sample is on or after
    earliest_day and whose second sample is on or before latest_day."""
    def _test(sample_i, sample_j):
        return (_is_infant_pair(sample_i, sample_j) and
                (mi_sample_day_dict[sample_i] >= earliest_day and mi_sample_day_dict[sample_j] <= latest_day))
    return _test

# cohort name -> predicate(sample_i, sample_j)
custom_cohort_tests = {}
custom_cohort_tests['MI'] = lambda sample_i, sample_j: (sample_i in mother_samples and sample_j in infant_samples_no_olm)
# custom_cohort_tests['MI'] = lambda sample_i, sample_j: ((sample_i in mother_samples and sample_j in infant_samples_no_olm) and mi_sample_day_dict[sample_i] >= 0 and mi_sample_day_dict[sample_j] <= 7)
custom_cohort_tests['Day 0-Week 1'] = _infant_pair_in_window(0, 7)
custom_cohort_tests['Week 1-Month 1'] = _infant_pair_in_window(7, 31)
custom_cohort_tests['Month 1-Year 1'] = _infant_pair_in_window(31, 370)
# Accepts every infant-infant pair, including those matched by the windows above
custom_cohort_tests['II(other)'] = _is_infant_pair
custom_cohort_tests['MM'] = lambda sample_i, sample_j: (sample_i in mother_samples and sample_j in mother_samples)
custom_cohort_tests['AA'] = lambda sample_i, sample_j: (sample_i in hmp_samples and sample_j in hmp_samples)

# cohort -> gene -> variant type -> count of SNP changes
genes_by_cohort = {cohort: {} for cohort in custom_cohorts_ordered}
genes_by_cohort_present_null = {cohort: defaultdict(int) for cohort in custom_cohorts_ordered}
genes_by_cohort_between_null = {cohort: defaultdict(int) for cohort in custom_cohorts_ordered}
genes_by_cohort_pangenome_null = {cohort: defaultdict(int) for cohort in custom_cohorts_ordered}

# cohort -> gene -> set of (subject_i, subject_j) tuples in which the gene changed
num_host_genes_by_cohort = {cohort: defaultdict(set) for cohort in custom_cohorts_ordered}

# (per-pair null counts, per-cohort accumulator, message printed when a gene has no description)
null_sources = [
    (snp_change_present_gene_null, genes_by_cohort_present_null, "Weird - present"),
    (snp_change_between_host_null, genes_by_cohort_between_null, "Weird - between"),
    (snp_change_pangenome_null, genes_by_cohort_pangenome_null, "Weird - pangenome"),
]

# Non-replacement sample pairs that match none of the cohort tests
num_unclassified_pairs = 0

for species in snp_changes_consecutive:
    print("Working on %s...." % species)
    genome_ids = midas_db_utils.get_ref_genome_ids(species)
    # load the gene descriptions for all genomes corresponding to this species:
    gene_descriptions = parse_patric.load_patric_gene_descriptions(genome_ids)

    for sample_i, sample_j in snp_changes_consecutive[species]:
        sample_pair = (sample_i, sample_j)
        val = snp_changes_consecutive[species][sample_pair]
        if not isinstance(val, list): # replacement: nothing to tally
            continue

        subject_tuple = (sample_subject_map[sample_i], sample_subject_map[sample_j])

        # Try the tests in the order given by custom_cohorts_ordered. Iterating the
        # custom_cohort_tests dict directly is not insertion-ordered on Python 2, so
        # the catch-all 'II(other)' test could win over the narrower day-range bins.
        custom_cohort = next((cohort for cohort in custom_cohorts_ordered
                              if custom_cohort_tests[cohort](sample_i, sample_j)), None)
        if custom_cohort is None:
            # No category applies; skip explicitly rather than letting a KeyError
            # on the accumulators be reported as a missing gene description.
            num_unclassified_pairs += 1
            continue

        # Store actual changed genes
        for gene_id, contig, position, variant_type, A1, D1, A2, D2 in val:
            try:
                desc = gene_descriptions[gene_id]
                if desc not in genes_by_cohort[custom_cohort]:
                    genes_by_cohort[custom_cohort][desc] = {vartype: 0 for vartype in variant_types}
                genes_by_cohort[custom_cohort][desc][variant_type] += 1
                num_host_genes_by_cohort[custom_cohort][desc].add(subject_tuple)
            except KeyError:
                # gene has no description, or variant_type is not in variant_types
                print("Weird")
                continue

        # Store present gene / between host / pangenome nulls
        for null_source, null_totals, missing_message in null_sources:
            pair_null_counts = null_source[species][sample_pair]
            for gene_id in pair_null_counts:
                try:
                    desc = gene_descriptions[gene_id]
                except KeyError:
                    print(missing_message)
                    continue
                null_totals[custom_cohort][desc] += pair_null_counts[gene_id]

print("Skipped %i sample pairs that matched no custom cohort" % num_unclassified_pairs)

# pickle.dump(genes_by_cohort, open('%s/genes_by_cohort.pkl' % config.analysis_directory, 'wb'))
# pickle.dump(num_host_genes_by_cohort, open('%s/num_host_genes_by_cohort.pkl' % config.analysis_directory, 'wb'))
# pickle.dump(genes_by_cohort_present_null, open('%s/genes_by_cohort_present_null.pkl' % config.analysis_directory, 'wb'))
# pickle.dump(genes_by_cohort_between_null, open('%s/genes_by_cohort_between_null.pkl' % config.analysis_directory, 'wb'))
# pickle.dump(genes_by_cohort_pangenome_null, open('%s/genes_by_cohort_pangenome_null.pkl' % config.analysis_directory, 'wb'))

Working on Lactobacillus_paracasei_55666....
Working on Leclercia_adecarboxylata_62497....
Working on Bacteroides_sartorii_54642....
Working on Megamonas_hypermegale_57114....
Working on Bacteroides_intestinalis_61596....
Weird - present
Working on Bacteroides_uniformis_57318....
Weird - pangenome
Weird - present
Weird - present
Working on Bifidobacterium_bifidum_55065....
Working on Clostridium_bolteae_57158....
Working on Streptococcus_gallolyticus_57748....
Working on Clostridium_paraputrificum_59909....
Working on Subdoligranulum_sp_62068....
Working on Enterococcus_faecium_56710....
Working on Streptococcus_lutetiensis_58501....
Working on Phascolarctobacterium_succinatutens_61948....
Working on Faecalibacterium_prausnitzii_62201....
Working on Faecalibacterium_prausnitzii_61481....
Working on Streptococcus_anginosus_58223....
Working on Blautia_wexlerae_56130....
Working on Dorea_longicatena_61473....
Working on Staphylococcus_warneri_58053....
Working on Enterococcus_faecalis_55

In [10]:
# Write inside a `with` block so the file is flushed and closed once the dump finishes
with open('%s/genes_by_cohort_with_vartype.pkl' % config.analysis_directory, 'wb') as pickle_file:
    pickle.dump(genes_by_cohort, pickle_file)

In [11]:
def _load_analysis_pickle(basename):
    """Unpickle '<analysis_directory>/<basename>.pkl', closing the file afterwards.

    pickle.load can execute arbitrary code; only load files written by this project.
    """
    with open('%s/%s.pkl' % (config.analysis_directory, basename), 'rb') as pickle_file:
        return pickle.load(pickle_file)

# NOTE(review): this replaces the in-memory accumulators with previously saved
# copies - confirm the saved files match the current cohort definitions.
genes_by_cohort = _load_analysis_pickle('genes_by_cohort')
num_host_genes_by_cohort = _load_analysis_pickle('num_host_genes_by_cohort')
genes_by_cohort_present_null = _load_analysis_pickle('genes_by_cohort_present_null')
genes_by_cohort_between_null = _load_analysis_pickle('genes_by_cohort_between_null')
genes_by_cohort_pangenome_null = _load_analysis_pickle('genes_by_cohort_pangenome_null')

In [22]:
# Write one TSV per cohort: genes sorted by total SNP change count (descending),
# with per-variant-type counts, number of unique hosts and the three null counts.
for cat in custom_cohorts_ordered:
    tsv_path = '%s/%s_snp_change_gene_annotation_v2.tsv' % (config.analysis_directory, cat.replace(' ', '_'))
    # `with` guarantees the file is closed even if a row fails to format
    with open(tsv_path, 'w') as f:
        f.write('\t'.join(['count', 'count_by_vartype', 'num_unique_hosts', 'present_null', 'between_null', 'pangenome_null', 'gene']) + '\n')
        for gene, count_dict in sorted(genes_by_cohort[cat].items(), key=lambda x: sum(x[1].values()), reverse=True):
            total_count = sum(count_dict.values())
            vartype_str = '|'.join([str(count_dict[vt]) for vt in variant_types])
            present_null_count = genes_by_cohort_present_null[cat][gene]
            between_null_count = genes_by_cohort_between_null[cat][gene]
            pangenome_null_count = genes_by_cohort_pangenome_null[cat][gene]
            num_unique_hosts = len(num_host_genes_by_cohort[cat][gene])
            if num_unique_hosts > total_count:
                # Sanity check: there should not be more hosts than recorded changes
                print("Huh???")
                print(gene)
            f.write('\t'.join([str(x) for x in [total_count, vartype_str, num_unique_hosts, present_null_count, between_null_count, pangenome_null_count, gene]]) + '\n')

In [19]:
# Spot-check: per-variant-type SNP change counts for one gene description in the 'AA' cohort
genes_by_cohort['AA']['1,4-alpha-glucan (glycogen) branching enzyme, GH-13-type (EC 2.4.1.18)']

{'1D': 0, '2D': 1, '3D': 0, '4D': 0}

In [5]:
# Parallelism - ok just make one big II category

# NOTE: rebinds custom_cohorts_ordered (previously seven categories) to the single 'II' category
custom_cohorts_ordered = ['II']
variant_types = ['1D', '2D', '3D', '4D']

def custom_cohort_of_sample_pair(custom_cohort_tests, sample_i, sample_j, cohort_order=None):
    """Return the first cohort whose test accepts the sample pair, or None.

    custom_cohort_tests: dict mapping cohort name -> predicate(sample_i, sample_j)
    sample_i, sample_j: the two samples of the pair
    cohort_order: optional iterable of cohort names giving the order in which
        the tests are tried. When several tests can accept the same pair, pass
        this to make the priority explicit: without it the tests are tried in
        the dict's own iteration order, which is not insertion order on
        Python 2 / Python < 3.7.
    """
    if cohort_order is None:
        cohort_order = custom_cohort_tests
    for cohort in cohort_order:
        if custom_cohort_tests[cohort](sample_i, sample_j):
            return cohort
    return None

def _both_infant_samples(sample_i, sample_j):
    """True when both samples are (non-olm) infant samples."""
    return (sample_i in infant_samples_no_olm and sample_j in infant_samples_no_olm)

# Single category: every infant-infant pair
custom_cohort_tests = {}
custom_cohort_tests['II'] = _both_infant_samples

# cohort -> gene -> variant type -> count of SNP changes
genes_by_cohort = {cohort: {} for cohort in custom_cohorts_ordered}
genes_by_cohort_present_null = {cohort: defaultdict(int) for cohort in custom_cohorts_ordered}
genes_by_cohort_between_null = {cohort: defaultdict(int) for cohort in custom_cohorts_ordered}
genes_by_cohort_pangenome_null = {cohort: defaultdict(int) for cohort in custom_cohorts_ordered}

# cohort -> gene -> set of (subject_i, subject_j) tuples in which the gene changed
num_host_genes_by_cohort = {cohort: defaultdict(set) for cohort in custom_cohorts_ordered}

# (per-pair null counts, per-cohort accumulator, message printed when a gene has no description)
null_sources = [
    (snp_change_present_gene_null, genes_by_cohort_present_null, "Weird - present"),
    (snp_change_between_host_null, genes_by_cohort_between_null, "Weird - between"),
    (snp_change_pangenome_null, genes_by_cohort_pangenome_null, "Weird - pangenome"),
]

# Every pair kept by the filter below belongs to the single 'II' category
custom_cohort = 'II'

for species in snp_changes_consecutive:
    print("Working on %s...." % species)
    genome_ids = midas_db_utils.get_ref_genome_ids(species)
    # load the gene descriptions for all genomes corresponding to this species:
    gene_descriptions = parse_patric.load_patric_gene_descriptions(genome_ids)

    for sample_i, sample_j in snp_changes_consecutive[species]:
        sample_pair = (sample_i, sample_j)
        val = snp_changes_consecutive[species][sample_pair]
        subject_tuple = (sample_subject_map[sample_i], sample_subject_map[sample_j])

        # Only infant-infant pairs (olm excluded) are tallied
        if not (sample_i in infant_samples_no_olm and sample_j in infant_samples_no_olm):
            continue
        if not isinstance(val, list): # replacement: nothing to tally
            continue

        # Store actual changed genes
        for gene_id, contig, position, variant_type, A1, D1, A2, D2 in val:
            try:
                desc = gene_descriptions[gene_id]
                if desc not in genes_by_cohort[custom_cohort]:
                    genes_by_cohort[custom_cohort][desc] = {vartype: 0 for vartype in variant_types}
                genes_by_cohort[custom_cohort][desc][variant_type] += 1
                num_host_genes_by_cohort[custom_cohort][desc].add(subject_tuple)
            except KeyError:
                # gene has no description, or variant_type is not in variant_types.
                # Catching KeyError only keeps the loop interruptible and lets
                # unrelated errors surface.
                print("Weird")
                continue

        # Store present gene / between host / pangenome nulls
        for null_source, null_totals, missing_message in null_sources:
            pair_null_counts = null_source[species][sample_pair]
            for gene_id in pair_null_counts:
                try:
                    desc = gene_descriptions[gene_id]
                except KeyError:
                    print(missing_message)
                    continue
                null_totals[custom_cohort][desc] += pair_null_counts[gene_id]

Working on Lactobacillus_paracasei_55666....
Working on Leclercia_adecarboxylata_62497....
Working on Bacteroides_sartorii_54642....
Working on Megamonas_hypermegale_57114....
Working on Bacteroides_intestinalis_61596....
Working on Bacteroides_uniformis_57318....
Weird - pangenome
Working on Bifidobacterium_bifidum_55065....
Working on Clostridium_bolteae_57158....
Working on Streptococcus_gallolyticus_57748....
Working on Clostridium_paraputrificum_59909....
Working on Subdoligranulum_sp_62068....
Working on Enterococcus_faecium_56710....
Working on Streptococcus_lutetiensis_58501....
Working on Phascolarctobacterium_succinatutens_61948....
Working on Faecalibacterium_prausnitzii_62201....
Working on Faecalibacterium_prausnitzii_61481....
Working on Streptococcus_anginosus_58223....
Working on Blautia_wexlerae_56130....
Working on Dorea_longicatena_61473....
Working on Staphylococcus_warneri_58053....
Working on Enterococcus_faecalis_55915....
Working on Streptococcus_salivarius_5803

In [None]:
# NOTE(review): presumably this file was written by the pickle.dump cell earlier in
# the notebook - confirm it contains the cohorts listed in custom_cohorts_ordered.
# Open inside a `with` block so the file handle is closed after loading.
with open('%s/genes_by_cohort_with_vartype.pkl' % config.analysis_directory, 'rb') as pickle_file:
    genes_by_cohort = pickle.load(pickle_file)

In [None]:
# Also include number of unique hosts

# Write one TSV per cohort: genes sorted by total SNP change count (descending),
# with per-variant-type counts, number of unique hosts and the three null counts.
for cat in custom_cohorts_ordered:
    tsv_path = '%s/%s_snp_change_gene_annotation_v2.tsv' % (config.analysis_directory, cat.replace(' ', '_'))
    # `with` guarantees the file is closed even if a row fails to format
    with open(tsv_path, 'w') as f:
        f.write('\t'.join(['count', 'count_by_vartype', 'num_unique_hosts', 'present_null', 'between_null', 'pangenome_null', 'gene']) + '\n')
        for gene, count_dict in sorted(genes_by_cohort[cat].items(), key=lambda x: sum(x[1].values()), reverse=True):
            total_count = sum(count_dict.values())
            vartype_str = '|'.join([str(count_dict[vt]) for vt in variant_types])
            present_null_count = genes_by_cohort_present_null[cat][gene]
            between_null_count = genes_by_cohort_between_null[cat][gene]
            pangenome_null_count = genes_by_cohort_pangenome_null[cat][gene]
            num_unique_hosts = len(num_host_genes_by_cohort[cat][gene])
            if num_unique_hosts > total_count:
                # Sanity check: there should not be more hosts than recorded changes
                print("Huh???")
                print(gene)
            f.write('\t'.join([str(x) for x in [total_count, vartype_str, num_unique_hosts, present_null_count, between_null_count, pangenome_null_count, gene]]) + '\n')

In [None]:
# Here's what imma plot:
# For every gene that changes in at least two unique hosts in infants,
# and which changes a greater number of times than all three null expectations,
# get proportion increase in occurrence from present null as
# (actual - present_null) / (present_null)
# and order by proportion increase..?

# Make the cohort explicit: this is the value the loop variable `cat` is left
# with after iterating over custom_cohorts_ordered.
cat = custom_cohorts_ordered[-1]

gene_prop_increase = {}

for gene, count in genes_by_cohort[cat].items():
    # genes_by_cohort maps gene -> {variant type: count}; sum to get the total
    # number of changes (a plain numeric count is tolerated as well).
    total_count = sum(count.values()) if isinstance(count, dict) else count
    present_null_count = genes_by_cohort_present_null[cat][gene]
    between_null_count = genes_by_cohort_between_null[cat][gene]
    pangenome_null_count = genes_by_cohort_pangenome_null[cat][gene]
    num_unique_hosts = len(num_host_genes_by_cohort[cat][gene])
    if num_unique_hosts >= 2 and total_count > present_null_count and total_count > between_null_count and total_count > pangenome_null_count:
        if present_null_count == 0:
            # No changes expected under the present-gene null: the proportional
            # increase is unbounded, so record infinity instead of dividing by zero.
            gene_prop_increase[gene] = float('inf')
        else:
            gene_prop_increase[gene] = float(total_count - present_null_count)/float(present_null_count)

In [None]:
# Print genes from largest to smallest proportional increase over the present-gene null
for gene, prop_increase in sorted(gene_prop_increase.items(), key=lambda x: x[1], reverse=True):
    print(gene)
    present_null_count = genes_by_cohort_present_null[cat][gene]
    count = genes_by_cohort[cat][gene]
    # genes_by_cohort maps gene -> {variant type: count}; sum to get the total
    total_count = sum(count.values()) if isinstance(count, dict) else count
    num_unique_hosts = len(num_host_genes_by_cohort[cat][gene])
    # The format string has four fields, so four values must be supplied
    print('Present null: %.02f\t| Actual: %i\t| Num hosts: %i\t| Increase: %.02fx' % (present_null_count, total_count, num_unique_hosts, prop_increase) + '\n')