In [1]:
from utils import config, sample_utils as su
from utils import parse_midas_data, parse_patric, core_gene_utils, midas_db_utils
import numpy as np, pickle, sys
from numpy.random import choice
from collections import defaultdict

from matplotlib import pyplot as plt
from matplotlib.lines import Line2D
from matplotlib.patches import Patch

In [2]:
plot_dir = "%s/revs_genes/" % (config.analysis_directory)
ddir = config.data_directory
pdir = "%s/pickles" % ddir
sweep_type = 'full'
snp_modifications_by_site = pickle.load(open('%s/snp_modifications_by_site_%s.pkl' % (pdir, sweep_type), 'rb'))
snp_modification_genes = pickle.load(open('%s/snp_modification_genes_%s.pkl' % (pdir, sweep_type), 'rb'))
modification_gene_gains = pickle.load(open('%s/modification_gene_gains_%s.pkl' % (pdir, sweep_type), 'rb'))
modification_gene_losses = pickle.load(open('%s/modification_gene_losses_%s.pkl' % (pdir, sweep_type), 'rb'))
modification_gene_gains_present_null = pickle.load(open('%s/modification_gene_gains_present_null_%s.pkl' % (pdir, sweep_type), 'rb'))
modification_gene_losses_present_null = pickle.load(open('%s/modification_gene_losses_present_null_%s.pkl' % (pdir, sweep_type), 'rb'))

In [3]:
keywords={}
keywords['ABC transporter']=['ABC']
keywords['phage']=['hage']
keywords['transposon']=['onjugati','anspos']
keywords['mobilization']=['mob','obilization','obile']
keywords['integrase']=['ntegrase']
keywords['plasmid']=['plasmid']
keywords['recombinase']=['ecombinase']
keywords['tRNA']=['tRNA']
keywords['ATP']=['ATP']
keywords['excisionase']=['xcisionase']
keywords['transmembrane']=['embrane']
keywords['replication']=['eplication']
keywords['regulator']=['egulator']
keywords['transcription']=['anscription']
keywords['toxin']=['toxin']
keywords['restriction']=['estriction']
keywords['replication']=['eplication']
keywords['transferase']=['ansferase']
keywords['reductase']=['eductase']
keywords['phosphatase']=['phosphatase']
keywords['helicase']=['elicase']
keywords['kinase']=['kinase']
keywords['dehydrogenase']=['dehydrogenase']
keywords['drug']=['drug']
keywords['cell wall']=['ell wall']
keywords['primase']=['imase']
keywords['resistance']=['resistance']
keywords['hydrolase']=['ydrolase']
keywords['topoisomerase']=['opoisomerase']
keywords['hypothetical'] = ['ypothetical']

# since this is a greedy algorithm, order the more important keywords first
keyword_order=['ABC transporter','phage','transposon','mobilization','integrase', 'plasmid','recombinase','tRNA','ATP','excisionase','transmembrane','replication','regulator','transcription','toxin','restriction','replication','transferase','reductase','phosphatase','helicase','kinase','dehydrogenase','drug','cell wall','primase','topoisomerase','hypothetical']

common_genes={}

In [17]:
new_dict = defaultdict(int)

for keyword in keyword_order:
    for gene_name in changed_gene_name_counts_by_tp_type['AA']:
        if keyword in gene_name:
            new_dict[keyword] += changed_gene_name_counts_by_tp_type['AA'][gene_name]

In [18]:
new_dict

defaultdict(int,
            {'ABC transporter': 27,
             'ATP': 11,
             'dehydrogenase': 2,
             'drug': 2,
             'hypothetical': 77,
             'kinase': 6,
             'phosphatase': 1,
             'reductase': 2,
             'regulator': 11,
             'tRNA': 4,
             'transcription': 1,
             'transferase': 9})

In [22]:
tp_types = ['AA', 'II-premie', 'II-nonpremie','MM', 'MI-premie', 'MI-nonpremie']
num_gene_gains_by_tp_type = {tp_type: 0 for tp_type in tp_types}
gained_genes_by_tp_type = {tp_type: set() for tp_type in tp_types}
gained_genes_counts_by_tp_type = {tp_type: defaultdict(int) for tp_type in tp_types}
gained_genes_counts_nulls_by_tp_type = {tp_type: defaultdict(int) for tp_type in tp_types}

# Enumerate genes
for species in modification_gene_gains:
    for cohort in modification_gene_gains[species]:
        for subject in modification_gene_gains[species][cohort]:
            subdict = modification_gene_gains[species][cohort][subject]
            subdict_null = modification_gene_gains_present_null[species][cohort][subject]
            # For each sample pair in this cohort/species,
            # have a count of how many gains of a particular gene
            for tp_pair in subdict:
                tp_type = tp_pair_to_tp_type(tp_pair, cohort)
                for gene_id, gene_name in subdict[tp_pair]:
                    gene_gain_count = subdict[tp_pair][(gene_id, gene_name)]
                    gene_gain_count_null = subdict_null[tp_pair][(gene_id, gene_name)]
                    num_gene_gains_by_tp_type[tp_type] += gene_gain_count
                    gained_genes_by_tp_type[tp_type].add(gene_name)
                    gained_genes_counts_by_tp_type[tp_type][gene_name] += gene_gain_count
                    gained_genes_counts_nulls_by_tp_type[tp_type][gene_name] += gene_gain_count_null

In [37]:
num_gene_gains_by_tp_type

{'AA': 4,
 'II-nonpremie': 3555,
 'II-premie': 93,
 'MI-nonpremie': 1,
 'MI-premie': 0,
 'MM': 0}

In [36]:
for tp_type in gained_genes_by_tp_type:
    count = len(gained_genes_by_tp_type[tp_type])
    print('%i %s' % (count, tp_type))

1 AA
30 II-premie
0 MI-premie
0 MM
1 MI-nonpremie
637 II-nonpremie


In [38]:
for tp_type in gained_genes_counts_by_tp_type:
    print(tp_type)
    strs = []
    for gene_name in gained_genes_counts_by_tp_type[tp_type]:
        count = gained_genes_counts_by_tp_type[tp_type][gene_name]
        if count > 1:
            strs.append("%i %s" % (count, gene_name))
    for string in sorted(strs, reverse=True):
        print(string)
    print('')

AA
4 hypothetical protein

II-premie
61 hypothetical protein
3 Phage protein
2 Doubtful CDS. No database matches

MI-premie

MM

MI-nonpremie

II-nonpremie
9 RNA-binding protein
9 Outer membrane TonB-dependent transporter, utilization system for glycans and polysaccharides (PUL), SusC family
8 Putative helicase
8 Integrase
7 Uncharacterized MFS-type transporter
6 cAMP-binding proteins - catabolite gene activator and regulatory subunit of cAMP-dependent protein kinases
6 UpdZ protein
6 Two-component transcriptional response regulator, LuxR family
6 Glycosyltransferase
5 SclB protein
5 Putative glycosyltransferase
5 O-acetylhomoserine sulfhydrylase (EC 2.5.1.49) @ O-succinylhomoserine sulfhydrylase (EC 2.5.1.48)
5 N-acetylmuramoyl-L-alanine amidase (EC 3.5.1.28)
5 Capsular polysaccharide transcription antitermination protein, UpxY family
5 ABC-type antimicrobial peptide transport system, ATPase component
46 RNA polymerase ECF-type sigma factor
4 beta-glycosyl hydrolase
4 [FeFe]-hydrogena

In [31]:
gained_genes_counts_nulls_by_tp_type

{'AA': defaultdict(int, {}),
 'II-nonpremie': defaultdict(int,
             {'1,4-alpha-glucan (glycogen) branching enzyme, GH-13-type (EC 2.4.1.18)': 0.14,
              "16S rRNA (cytidine(1402)-2'-O)-methyltransferase (EC 2.1.1.198)": 0.21999999999999997,
              '2-amino-3-ketobutyrate coenzyme A ligase (EC 2.3.1.29)': 0.27999999999999997,
              '2-amino-4-hydroxy-6-hydroxymethyldihydropteridine pyrophosphokinase (EC 2.7.6.3)': 0.41999999999999993,
              '2-methylfumaryl-CoA hydratase (EC 4.2.1.148)': 0.25999999999999995,
              '2-oxoglutarate/2-oxoacid ferredoxin oxidoreductase, delta subunit, ferredoxin-like 4Fe-4S binding protein (EC 1.2.7.-)': 0.3,
              '2-oxoglutarate/2-oxoacid ferredoxin oxidoreductase, gamma subunit (EC 1.2.7.-)': 0.27999999999999997,
              '3-deoxy-manno-octulosonate cytidylyltransferase (EC 2.7.7.38)': 0.23999999999999996,
              '3-isopropylmalate dehydratase small subunit (EC 4.2.1.33)': 0.34,
       

In [39]:
tp_types = ['AA', 'II-premie', 'II-nonpremie','MM', 'MI-premie', 'MI-nonpremie']
num_gene_losses_by_tp_type = {tp_type: 0 for tp_type in tp_types}
lost_genes_by_tp_type = {tp_type: set() for tp_type in tp_types}
lost_genes_counts_by_tp_type = {tp_type: defaultdict(int) for tp_type in tp_types}
lost_genes_counts_nulls_by_tp_type = {tp_type: defaultdict(int) for tp_type in tp_types}

# Enumerate genes
for species in modification_gene_losses:
    for cohort in modification_gene_losses[species]:
        for subject in modification_gene_losses[species][cohort]:
            subdict = modification_gene_losses[species][cohort][subject]
            subdict_null = modification_gene_losses_present_null[species][cohort][subject]
            # For each sample pair in this cohort/species,
            # have a count of how many gains of a particular gene
            for tp_pair in subdict:
                tp_type = tp_pair_to_tp_type(tp_pair, cohort)
                for gene_id, gene_name in subdict[tp_pair]:
                    gene_loss_count = subdict[tp_pair][(gene_id, gene_name)]
                    gene_loss_count_null = subdict_null[tp_pair][(gene_id, gene_name)]
                    num_gene_losses_by_tp_type[tp_type] += gene_gain_count
                    lost_genes_by_tp_type[tp_type].add(gene_name)
                    lost_genes_counts_by_tp_type[tp_type][gene_name] += gene_gain_count
                    lost_genes_counts_nulls_by_tp_type[tp_type][gene_name] += gene_gain_count_null

In [40]:
num_gene_losses_by_tp_type

{'AA': 17,
 'II-nonpremie': 60,
 'II-premie': 6,
 'MI-nonpremie': 1792,
 'MI-premie': 0,
 'MM': 0}

In [42]:
for tp_type in lost_genes_by_tp_type:
    count = len(lost_genes_by_tp_type[tp_type])
    print('%i %s' % (count, tp_type))

7 AA
5 II-premie
0 MI-premie
0 MM
407 MI-nonpremie
24 II-nonpremie


In [43]:
for tp_type in lost_genes_counts_by_tp_type:
    print(tp_type)
    strs = []
    for gene_name in lost_genes_counts_by_tp_type[tp_type]:
        count = lost_genes_counts_by_tp_type[tp_type][gene_name]
        if count > 1:
            strs.append("%i %s" % (count, gene_name))
    for string in sorted(strs, reverse=True):
        print(string)
    print('')

AA
11 hypothetical protein

II-premie
2 hypothetical protein

MI-premie

MM

MI-nonpremie
8 Transcriptional regulator, AraC family
7 putative membrane protein
7 Two-component system sensor histidine kinase
7 Outer membrane TonB-dependent transporter, utilization system for glycans and polysaccharides (PUL), SusC family
7 Cell surface glycan-binding lipoprotein, utilization system for glycans and polysaccharides (PUL), SusD family
6 Glycosyltransferase
5 cAMP-binding proteins - catabolite gene activator and regulatory subunit of cAMP-dependent protein kinases
5 Thioredoxin
5 Putative anti-sigma factor
4 Uncharacterized MFS-type transporter
4 RNA-binding protein
4 Putative helicase
3 putative TonB-dependent receptor
3 UpdZ protein
3 Two-component transcriptional response regulator, LuxR family
3 Transcriptional regulator, AcrR family
3 Permease of the drug/metabolite transporter (DMT) superfamily
3 O-acetylhomoserine sulfhydrylase (EC 2.5.1.49) @ O-succinylhomoserine sulfhydrylase (EC 2.

In [52]:
gene_change_counts_by_tp_type = {}
gene_change_null_counts_by_tp_type = {}

for tp_type in lost_genes_counts_by_tp_type:
    gene_change_counts_by_tp_type[tp_type] = {}
    gene_change_null_counts_by_tp_type[tp_type] = {}
    
    for gene in set(lost_genes_counts_by_tp_type[tp_type].keys() + gained_genes_counts_by_tp_type[tp_type].keys()):
        num_losses = 0 if gene not in lost_genes_counts_by_tp_type[tp_type] else lost_genes_counts_by_tp_type[tp_type][gene]        
        num_gains = 0 if gene not in gained_genes_counts_by_tp_type[tp_type] else gained_genes_counts_by_tp_type[tp_type][gene]
        gene_change_counts_by_tp_type[tp_type][gene] = num_losses + num_gains
        
        num_losses_null = 0 if gene not in lost_genes_counts_nulls_by_tp_type[tp_type] else lost_genes_counts_nulls_by_tp_type[tp_type][gene]        
        num_gains_null = 0 if gene not in gained_genes_counts_nulls_by_tp_type[tp_type] else gained_genes_counts_nulls_by_tp_type[tp_type][gene]        
        gene_change_null_counts_by_tp_type[tp_type][gene] = num_losses_null + num_gains_null

In [51]:
for tp_type in gene_change_counts_by_tp_type:
    print(sorted(gene_change_counts_by_tp_type[tp_type].items(), key=lambda x: x[1], reverse=True))

[('hypothetical protein', 15), ('helicase, UvrD/Rep family', 1), ('Glycosyl transferase, family 2', 1), ('Glycosyltransferase', 1), ('LtrC-like protein', 1), ('FIG00897069: hypothetical protein', 1), ('Holin', 1)]
[('hypothetical protein', 63), ('Phage protein', 3), ('Doubtful CDS. No database matches', 2), ('N-acetylglutamate kinase (EC 2.7.2.8)', 1), ('UPF0358 protein YlaN', 1), ('Biotin carboxyl carrier protein associated with SAR1687-8', 1), ('Lambdoid phage Rac integrase', 1), ('Uncharacterized protein YebG', 1), ('hypothetical protein within a prophage', 1), ('[Genomic island nu Sa beta2]', 1), ('Mobile element protein', 1), ('Bacterial non-heme ferritin (EC 1.16.3.2)', 1), ('EsaC protein within ESAT-6 gene cluster (S.aureus type)', 1), ('Membrane protein insertion efficiency factor YidD', 1), ('Choline dehydrogenase (EC 1.1.99.1)', 1), ('ORF058', 1), ('Hypothetical protein SAV2259', 1), ('Copper/silver efflux RND transporter, transmembrane protein CusA', 1), ('Copper/silver effl

In [55]:
for tp_type in gene_change_null_counts_by_tp_type:
    print(sorted(gene_change_null_counts_by_tp_type[tp_type].items(), key=lambda x: x[1], reverse=True))

[('helicase, UvrD/Rep family', 0), ('Glycosyl transferase, family 2', 0), ('Glycosyltransferase', 0), ('LtrC-like protein', 0), ('FIG00897069: hypothetical protein', 0), ('Holin', 0), ('hypothetical protein', 0)]
[('hypothetical protein', 0.9400000000000004), ('Phage protein', 0.08), ('hypothetical protein transposon-related', 0.08), ('LSU ribosomal protein L31p @ LSU ribosomal protein L31p, zinc-independent', 0.06), ('UPF0358 protein YlaN', 0.04), ('EsaC protein within ESAT-6 gene cluster (S.aureus type)', 0.04), ('Hypothetical protein SAV2259', 0.04), ('Hypothetical protein SAV2286', 0.04), ('PTS system, galactitol-specific IIB component (EC 2.7.1.200)', 0.04), ('N-acetylglutamate kinase (EC 2.7.2.8)', 0.02), ('Biotin carboxyl carrier protein associated with SAR1687-8', 0.02), ('Uncharacterized protein YebG', 0.02), ('hypothetical protein within a prophage', 0.02), ('Choline dehydrogenase (EC 1.1.99.1)', 0.02), ('ORF058', 0.02), ('Repetitive hypothetical protein in ESAT cluster, COG4

In [4]:
for tp_type in gene_change_counts_by_tp_type:
    output = open('%s/gene_changes_%s.tsv' % (config.analysis_directory, tp_type), 'w')
    output.write('\t'.join(['gene_name', 'num_changes', 'num_changes_null', 'num_gains', 'num_gains_null', 'num_losses', 'num_losses_null']) + '\n')
    for gene in gene_change_counts_by_tp_type[tp_type]:
        num_changes = gene_change_counts_by_tp_type[tp_type][gene]
        num_losses = 0 if gene not in lost_genes_counts_by_tp_type[tp_type] else lost_genes_counts_by_tp_type[tp_type][gene]
        num_gains = 0 if gene not in gained_genes_counts_by_tp_type[tp_type] else gained_genes_counts_by_tp_type[tp_type][gene]
        num_changes_null = gene_change_null_counts_by_tp_type[tp_type][gene]
        num_losses_null = 0 if gene not in lost_genes_counts_nulls_by_tp_type[tp_type] else lost_genes_counts_nulls_by_tp_type[tp_type][gene]        
        num_gains_null = 0 if gene not in gained_genes_counts_nulls_by_tp_type[tp_type] else gained_genes_counts_nulls_by_tp_type[tp_type][gene]
        output.write('\t'.join([str(item) for item in [gene, num_changes, num_changes_null, num_gains, num_gains_null, num_losses, num_losses_null]]) + '\n')
    output.close()

NameError: name 'gene_change_counts_by_tp_type' is not defined

In [5]:
# Returns two arrays, one of ordered timepoints
# and other of corresponding values
def order_by_tp(tp_dict):
    items = tp_dict.items()
    mother_items = filter(lambda x: x[0][0] == 'M', items)
    mother_sorted = sorted(mother_items, key=lambda x: int(x[0][1:]))
    infant_items = filter(lambda x: x[0][0] == 'I', items)
    infant_sorted = sorted(infant_items, key=lambda x: int(x[0][1:]))
    comb_sorted = mother_sorted + infant_sorted
    return comb_sorted if comb_sorted != [] else sorted(items, key=lambda x: int(x[0][1:]))

In [6]:
# Returns True if one value is below lower_threshold
# and the other value is above upper_threshold
def is_diff(val1, val2, lower_threshold, upper_threshold):
    return (val1 <= lower_threshold and val2 >= upper_threshold) or (val1 >= upper_threshold and val2 <= lower_threshold)

In [7]:
# Determines if list of values has reversion
def has_reversion(vals, lower_threshold, upper_threshold):
    ref_val = vals[0]
    one_change = False
    for val in vals[1:]:
        # While first change has yet to occur
        if one_change == False:
            if is_diff(val, ref_val, lower_threshold, upper_threshold):
                # If first change detected, set ref_val to new value
                # and set one_change to True
                ref_val = val
                one_change = True
        # First change has already occurred with ref_val reset
        elif one_change == True:
            if is_diff(val, ref_val, lower_threshold, upper_threshold):
                # Second change (must be in opp. direction) detected
                return True
    return False

In [8]:
reversions = []
lower_threshold = 0.2
upper_threshold = 0.8

# Enumerate reversions
for species in snp_modifications_by_site:
    
    # Also prepare gene information
    if sum([len(snp_modifications_by_site[species][cohort].keys()) for cohort in snp_modifications_by_site[species]]) != 0:
        print("Getting gene info for %s..." % species)
        genome_ids = midas_db_utils.get_ref_genome_ids(species)
        non_shared_genes = core_gene_utils.parse_non_shared_pangenome_genes(species)
        gene_desc = parse_patric.load_patric_gene_descriptions(genome_ids, non_shared_genes)
    
    for cohort in snp_modifications_by_site[species]:
        subdict = snp_modifications_by_site[species][cohort]
        for subject in subdict:
            for site in subdict[subject]:
                if len(subdict[subject][site]) > 1:
                    tp_freq_dict = {} # freq info only
                    for snp_change in subdict[subject][site]:
                        tp_pair, gene_id, var_type, A1, D1, A2, D2 = snp_change
                        tp1, tp2 = tp_pair
                        tp_freq_dict[tp1] = (A1/float(D1))
                        tp_freq_dict[tp2] = (A2/float(D2))
                    ordered_tp_freqs = order_by_tp(tp_freq_dict)
                    gene_name = gene_desc[gene_id] if gene_id in gene_desc else 'No name'
                    if has_reversion([freq for tp, freq in ordered_tp_freqs], lower_threshold, upper_threshold):
                        reversions.append((species, cohort, subject, site, gene_id, gene_name, var_type, ordered_tp_freqs))

Getting gene info for Bacteroides_intestinalis_61596...
Getting gene info for Bacteroides_uniformis_57318...


KeyboardInterrupt: 

In [9]:
# revised to differentiate premie and nonpremie
def tp_pair_to_tp_type(tp_pair, cohort):
    tpa, tpb = tp_pair
    tp_type = tpa[0] + tpb[0]
    if tp_type == 'IM':
        tp_type = 'MI'
    if tp_type in ['II', 'MI']:
        if cohort == 'olm':
            tp_type += '-premie'
        else:
            tp_type += '-nonpremie'
    return tp_type

In [10]:
cohorts = ['backhed', 'yassour', 'ferretti', 'hmp', 'shao', 'olm']

In [11]:
num_snp_change_by_cohort = {cohort: 0 for cohort in cohorts}
changed_gene_ids_by_cohort = {cohort: set() for cohort in cohorts}
changed_gene_names_by_cohort = {cohort: set() for cohort in cohorts}
changed_gene_name_counts_by_cohort = {cohort: defaultdict(int) for cohort in cohorts}

tp_types = ['AA', 'II-premie', 'II-nonpremie','MM', 'MI-premie', 'MI-nonpremie']
num_snp_change_by_tp_type = {tp_type: 0 for tp_type in tp_types}
changed_gene_names_by_tp_type = {tp_type: set() for tp_type in tp_types}
changed_gene_name_counts_by_tp_type = {tp_type: defaultdict(int) for tp_type in tp_types}

# Enumerate genes
for species in snp_modification_genes:
    for cohort in snp_modification_genes[species]:
        for subject in snp_modification_genes[species][cohort]:
            subdict = snp_modification_genes[species][cohort][subject]
            # For each sample pair in this cohort/species,
            # have a count of how many SNP changes are
            # in a particular gene
            for tp_pair in subdict:
                tp_type = tp_pair_to_tp_type(tp_pair, cohort)
                for gene_id, gene_name in subdict[tp_pair]:
                    snp_change_count = subdict[tp_pair][(gene_id, gene_name)]

                    num_snp_change_by_cohort[cohort] += snp_change_count
                    changed_gene_ids_by_cohort[cohort].add(gene_id)
                    changed_gene_names_by_cohort[cohort].add(gene_name)
                    changed_gene_name_counts_by_cohort[cohort][gene_name] += snp_change_count

                    num_snp_change_by_tp_type[tp_type] += snp_change_count
                    changed_gene_names_by_tp_type[tp_type].add(gene_name)
                    changed_gene_name_counts_by_tp_type[tp_type][gene_name] += snp_change_count

In [12]:
num_snp_change_by_tp_type

{'AA': 259,
 'II-nonpremie': 276,
 'II-premie': 38,
 'MI-nonpremie': 101,
 'MI-premie': 0,
 'MM': 33}

In [13]:
for tp_type in changed_gene_names_by_tp_type:
    print("%s: %i" % (tp_type,len(changed_gene_names_by_tp_type[tp_type])))

AA: 114
II-premie: 28
MI-premie: 0
MM: 27
MI-nonpremie: 54
II-nonpremie: 107


In [14]:
for tp_type in changed_gene_name_counts_by_tp_type:
    subdict = changed_gene_name_counts_by_tp_type[tp_type]
    print(tp_type)
    for gene_name, count in sorted(subdict.items(), key=lambda x: x[1], reverse=True):
        if count > 1:
            print("%i %s" % (count, gene_name))
    print('')

AA
76 hypothetical protein
14 Oligopeptide ABC transporter, permease protein OppC (TC 3.A.1.5.1)
12 Biotin synthase (EC 2.8.1.6)
10 HAD-superfamily hydrolase, subfamily IA, variant 3
6 RND efflux system, inner membrane transporter
6 Outer membrane TonB-dependent transporter, utilization system for glycans and polysaccharides (PUL), SusC family
5 Outer membrane protein oprM
5 Oligopeptide ABC transporter, permease protein OppB (TC 3.A.1.5.1)
5 Vitamin B12 ABC transporter, ATP-binding protein BtuD
4 Ribonuclease J2 (endoribonuclease in RNA processing)
3 Sensory box/GGDEF family protein
3 N/A
3 DNA gyrase subunit A (EC 5.99.1.3)
2 Two-component system sensor histidine kinase
2 Vitamin B12 ABC transporter, substrate-binding protein BtuF
2 ATP:Cob(I)alamin adenosyltransferase (EC 2.5.1.17)
2 Argininosuccinate lyase (EC 4.3.2.1)
2 Cell surface glycan-binding lipoprotein, utilization system for glycans and polysaccharides (PUL), SusD family
2 DNA-directed RNA polymerase beta subunit (EC 2.7.7

In [13]:
num_snp_change_by_cohort

{'backhed': 166,
 'ferretti': 34,
 'hmp': 259,
 'olm': 38,
 'shao': 153,
 'yassour': 57}

In [14]:
for cohort in changed_gene_ids_by_cohort:
    print("%s: %i" % (cohort,len(changed_gene_ids_by_cohort[cohort])))

hmp: 176
olm: 31
yassour: 48
backhed: 114
ferretti: 13
shao: 68


In [15]:
for cohort in changed_gene_names_by_cohort:
    print("%s: %i" % (cohort,len(changed_gene_names_by_cohort[cohort])))

hmp: 114
olm: 28
yassour: 38
backhed: 81
ferretti: 12
shao: 51


In [16]:
for cohort in changed_gene_name_counts_by_cohort:
    subdict = changed_gene_name_counts_by_cohort[cohort]
    print(cohort)
    for gene_name, count in sorted(subdict.items(), key=lambda x: x[1], reverse=True):
        if count > 1:
            print("%i %s" % (count, gene_name))
    print('')

hmp
76 hypothetical protein
14 Oligopeptide ABC transporter, permease protein OppC (TC 3.A.1.5.1)
12 Biotin synthase (EC 2.8.1.6)
10 HAD-superfamily hydrolase, subfamily IA, variant 3
6 RND efflux system, inner membrane transporter
6 Outer membrane TonB-dependent transporter, utilization system for glycans and polysaccharides (PUL), SusC family
5 Outer membrane protein oprM
5 Oligopeptide ABC transporter, permease protein OppB (TC 3.A.1.5.1)
5 Vitamin B12 ABC transporter, ATP-binding protein BtuD
4 Ribonuclease J2 (endoribonuclease in RNA processing)
3 Sensory box/GGDEF family protein
3 N/A
3 DNA gyrase subunit A (EC 5.99.1.3)
2 Two-component system sensor histidine kinase
2 Vitamin B12 ABC transporter, substrate-binding protein BtuF
2 ATP:Cob(I)alamin adenosyltransferase (EC 2.5.1.17)
2 Argininosuccinate lyase (EC 4.3.2.1)
2 Cell surface glycan-binding lipoprotein, utilization system for glycans and polysaccharides (PUL), SusD family
2 DNA-directed RNA polymerase beta subunit (EC 2.7.

In [17]:
grouped_cohorts = {'hmp': ['hmp'], 'non-premie': ['backhed', 'yassour', 'ferretti', 'shao'], 'premie': ['olm'], 'infant': ['backhed', 'yassour', 'ferretti', 'shao', 'olm']}
changed_gene_name_counts_by_grouped_cohort = {grouped_cohort: defaultdict(int) for grouped_cohort in grouped_cohorts.keys()}

# Group by premie and non-premie
for cohort in changed_gene_name_counts_by_cohort:
    subdict = changed_gene_name_counts_by_cohort[cohort]
    for grouped_cohort in grouped_cohorts:
        if cohort in grouped_cohorts[grouped_cohort]:
            for gene_name in subdict:
                changed_gene_name_counts_by_grouped_cohort[grouped_cohort][gene_name] += subdict[gene_name]

In [18]:
for grouped_cohort in changed_gene_name_counts_by_grouped_cohort:
    subdict = changed_gene_name_counts_by_grouped_cohort[grouped_cohort]
    print(grouped_cohort)
    for gene_name, count in sorted(subdict.items(), key=lambda x: x[1], reverse=True):
        if count > 1:
            print("%i %s" % (count, gene_name))
    print('')

hmp
76 hypothetical protein
14 Oligopeptide ABC transporter, permease protein OppC (TC 3.A.1.5.1)
12 Biotin synthase (EC 2.8.1.6)
10 HAD-superfamily hydrolase, subfamily IA, variant 3
6 RND efflux system, inner membrane transporter
6 Outer membrane TonB-dependent transporter, utilization system for glycans and polysaccharides (PUL), SusC family
5 Outer membrane protein oprM
5 Oligopeptide ABC transporter, permease protein OppB (TC 3.A.1.5.1)
5 Vitamin B12 ABC transporter, ATP-binding protein BtuD
4 Ribonuclease J2 (endoribonuclease in RNA processing)
3 Sensory box/GGDEF family protein
3 N/A
3 DNA gyrase subunit A (EC 5.99.1.3)
2 Two-component system sensor histidine kinase
2 Vitamin B12 ABC transporter, substrate-binding protein BtuF
2 ATP:Cob(I)alamin adenosyltransferase (EC 2.5.1.17)
2 Argininosuccinate lyase (EC 4.3.2.1)
2 DNA-directed RNA polymerase beta subunit (EC 2.7.7.6)
2 Cell surface glycan-binding lipoprotein, utilization system for glycans and polysaccharides (PUL), SusD fa

In [19]:
changed_gene_name_counts_by_cohort['backhed']

defaultdict(int,
            {'1-acyl-sn-glycerol-3-phosphate acyltransferase': 1,
             '16S rRNA (cytosine(1402)-N(4))-methyltransferase (EC 2.1.1.199)': 1,
             'ABC transporter, ATP-binding protein (cluster 1, maltose/g3p/polyamine/iron); ABC transporter, ATP-binding protein (cluster 10, nitrate/sulfonate/bicarbonate)': 1,
             'ABC transporter, permease protein 1 (cluster 1, maltose/g3p/polyamine/iron)': 1,
             'ABC transporter, substrate-binding protein (cluster 1, maltose/g3p/polyamine/iron)': 2,
             'ATP-dependent DNA helicase RecQ': 1,
             'Acetylornithine deacetylase/Succinyl-diaminopimelate desuccinylase and related deacylases': 1,
             'Acidobacterial duplicated orphan permease (function unknown)': 1,
             'Acyl-[acyl-carrier-protein]--UDP-N-acetylglucosamine O-acyltransferase (EC 2.3.1.129)': 1,
             'Alpha-1,2-mannosidase': 1,
             'Aminopeptidase C (EC 3.4.22.40)': 1,
             'Aminotra

In [20]:
genes_cohort = {cohort: set() for cohort in cohorts}

for cohort in genes_tp_pairs:
    for gene_tuple in genes_tp_pairs[cohort]:
        for tp_pair in genes_tp_pairs[cohort][gene_tuple]:
            tpa, tpb = tp_pair
            tp_type = tpa[0] + tpb[0]
            if tp_type == 'II':
                genes_cohort[cohort].add(gene_tuple)

NameError: name 'genes_tp_pairs' is not defined

In [21]:
genes_cohort

{'backhed': set(),
 'ferretti': set(),
 'hmp': set(),
 'olm': set(),
 'shao': set(),
 'yassour': set()}

In [13]:
# Formatted printing of reversions
for species, cohort, subject, site, gene_id, gene_name, var_type, ordered_tp_freqs in reversions:
    
    print(species + ' | ' + cohort + ' | Subject: ' + subject)
    print("Gene id: " + gene_id + ' | Variant type: ' + var_type)
    print("Gene name: " + gene_name)
    for tp, freq in ordered_tp_freqs:
        print('\t' + tp + ': %.03f' % freq)
    print('')

Enterococcus_faecalis_56297 | olm | Subject: N1_018
Gene id: 1158976.3.peg.771 | Variant type: 4D
Gene name: hypothetical protein
	I18: 0.826
	I19: 0.074
	I20: 0.074
	I24: 0.801
	I37: 0.118

Klebsiella_pneumoniae_54788 | olm | Subject: N1_023
Gene id: 1328400.3.peg.232 | Variant type: 1D
Gene name: No name
	I15: 0.026
	I17: 0.897
	I18: 0.818
	I19: 0.088
	I21: 0.911
	I29: 0.172

Citrobacter_freundii_56148 | olm | Subject: N1_009
Gene id: 1114920.3.peg.2929 | Variant type: 4D
Gene name: Phosphoenolpyruvate-dihydroxyacetone phosphotransferase operon regulatory protein DhaR
	I10: 0.870
	I13: 0.164
	I15: 0.091
	I16: 0.191
	I17: 0.944

Bacteroides_vulgatus_57955 | olm | Subject: N4_097
Gene id: 435590.9.peg.1561 | Variant type: 4D
Gene name: Uncharacterized protein BT3327
	I12: 0.112
	I27: 0.826
	I34: 0.115

Bacteroides_vulgatus_57955 | yassour | Subject: M1098-M
Gene id: 435590.9.peg.52 | Variant type: 1D
Gene name: DNA internalization-related competence protein ComEC/Rec2
	M1: 1.000
	I2: 0