In [1]:
import sys
sys.path.insert(0, "/u/home/m/michaelw/project-ngarud/microbiome_evolution/microbiome_evolution_MOUSE/")

import numpy
import sys
import config
import gzip
import os.path
import os
import midas_db_utils

#My additions
import core_gene_utils
from core_gene_utils import *
import pandas as pd

# Directories

In [2]:
# Directories that hold the core-gene outputs; both are built from
# config.data_directory by plain string formatting.
core_genes_directory = "%score_genes/" % config.data_directory
external_core_genes_directory = "%score_genes/external/" % config.data_directory

# Files under the "external" core-gene directory.
default_external_shared_gene_filename = external_core_genes_directory + "shared_genes.txt.gz"
default_external_core_gene_filename = external_core_genes_directory + "core_genes.txt.gz"
default_external_stringent_core_gene_filename = external_core_genes_directory + "core_genes_stringent.txt.gz"
# "%s" is left unfilled here; it is a template to be formatted later.
default_external_gene_freq_template = external_core_genes_directory + "%s_gene_freqs.txt.gz"

# Files under the main core-gene directory.
default_shared_gene_filename = core_genes_directory + "shared_genes_HMP.txt.gz"
default_core_gene_filename = core_genes_directory + "core_genes.txt.gz"
default_stringent_core_gene_filename = core_genes_directory + "core_genes_stringent.txt.gz"
# "%s" is left unfilled here; it is a template to be formatted later.
default_gene_freq_template = core_genes_directory + "%s_gene_freqs.txt.gz"

# MAIN

### Importing another script

In [3]:
import parse_midas_data

### Making core gene directory if needed

In [4]:
# Create the output directories (including missing parents) if they are absent.
# os.makedirs is used instead of shelling out to `mkdir -p`: the path is no
# longer interpolated into a shell command, and a failure raises an exception
# instead of being reported only through an ignored exit status.
for directory in (core_genes_directory, external_core_genes_directory):
    if not os.path.isdir(directory):
        os.makedirs(directory)

0

### Parsing good species

The requirements:
- good_species_min_coverage = 10
- good_species_min_prevalence = 10

Using the function:

> species_coverage_matrix, samples, species = parse_midas_data.parse_global_marker_gene_coverages()

it generates a matrix (a list of arrays) with one array per species, each array having one entry per sample. "samples" and "species" provide a key for this marker gene coverage matrix.

parse_good_species_list() then iterates through each species generated by parse_global_marker_gene_coverages(), and checks:
- the number of samples with > the min marker gene coverage is > the prevalence threshold

It appends all species that pass to "good_species_list," which here is assigned to **pangenome_species**.

In [5]:
# Species list returned by parse_midas_data.parse_good_species_list().
pangenome_species = parse_midas_data.parse_good_species_list()

In [6]:
# Display the full species list.
pangenome_species

['Blautia_producta_56315',
 'Akkermansia_muciniphila_55290',
 'Roseburia_intestinalis_56239',
 'Bacteroides_cellulosilyticus_58046',
 'Escherichia_coli_58110',
 'Adlercreutzia_equolifaciens_60310',
 'Enterococcus_faecium_56947',
 'Parabacteroides_distasonis_56985',
 'Bacteroides_ovatus_58035',
 'Blautia_wexlerae_56130',
 'Ruminococcus_sp_58571',
 'Burkholderiales_bacterium_56577',
 'Lactobacillus_acidophilus_51143',
 'Coprococcus_sp_62244',
 'Bacteroides_vulgatus_57955',
 'Bifidobacterium_animalis_58116',
 'Anaerostipes_hadrus_55206',
 'Bacteroides_uniformis_57318',
 'Clostridiales_bacterium_52743',
 'Ruminococcus_bicirculans_59300',
 'Clostridium_clostridioforme_51842',
 'Clostridiales_bacterium_61057',
 'Faecalibacterium_prausnitzii_61481',
 'Sutterella_wadsworthensis_56828',
 'Eubacterium_hallii_61477',
 'Ruminococcus_bromii_62047',
 'Lactococcus_lactis_57073',
 'Lachnospiraceae_bacterium_56833',
 'Bacteroides_thetaiotaomicron_56941',
 'Bacteroides_massiliensis_44749',
 'Ruminococcu

### Setting coverage minimums

cmin (config.core_genome_min_copynum) and cmax (config.core_genome_max_copynum) set the thresholds beyond which genes are not considered part of the core genome.

I'm not sure what `shared_cmin` is for yet—hopefully that will become clear!

min_good_fraction (config.core_genome_min_prevalence) is the fraction of samples a gene needs to be in to be considered "core." This is **0.9**.

The minimum coverage for a gene to be included in the core genome is 5. I'm not sure if this is an average, or per sample.

In [7]:
# Copy-number thresholds, read from the project config module.
cmin = config.core_genome_min_copynum  # lower copy-number bound for core genes
cmax = config.core_genome_max_copynum  # upper copy-number bound for core genes
shared_cmin = config.shared_genome_min_copynum  # copy number above which a gene is flagged as shared

In [8]:
# Echo the three copy-number thresholds.
print("cmin is %s" % cmin)
print("cmax is %s" % cmax)
print("shared_cmin is %s" % shared_cmin)

cmin is 0.3
cmax is 3
shared_cmin is 3


In [9]:
# Fraction threshold used further down when selecting core genes.
min_good_fraction = config.core_genome_min_prevalence
# Marker-coverage threshold used below to select high-coverage samples.
min_coverage = 5 # (for assessing core genome, we'll use a lower coverage value than when we look at real changes)
    
    

In [10]:
# Echo the prevalence threshold.
print("min_good_fraction is %s" % min_good_fraction)


min_good_fraction is 0.9


### Opening the files I'm writing to

In [None]:
# Path of the core-gene output file. The file itself is not opened here:
# the GzipFile call below is deliberately left commented out.
output_filename = default_core_gene_filename
#output_file = gzip.GzipFile(output_filename,"w")

In [None]:
# Path of the stringent core-gene output file. The file itself is not opened
# here: the GzipFile call below is deliberately left commented out.
stringent_output_filename = default_stringent_core_gene_filename
#stringent_output_file = gzip.GzipFile(stringent_output_filename,"w")

In [None]:
#shared_output_file = gzip.GzipFile(default_shared_gene_filename,"w")

### the MAIN for loop!

This loops through each species in **pangenome_species**. Here, I will use a single species (`species_name`, set below) as a worked example instead of looping.

In [None]:
# IPython shell escape: list the entries of the merged SNPs directory whose names contain "coli".
!ls /u/project/ngarud/Garud_lab/HumanizedMouse/HumanizedMouse_Batch2/merged_data/snps/ | grep "coli"

In [None]:
# Species used for every step below.
species_name = "Anaerostipes_hadrus_55206"

In [None]:
# Load reference genes
# Progress messages go to stderr, so they do not mix with cell output on stdout.
sys.stderr.write("Loading genes on reference genome..\n")
reference_genes = midas_db_utils.load_reference_genes(species_name) #Self explanatory
sys.stderr.write("Done!\n")


In [None]:
# Load the genes that the MIDAS database marks as shared across species
sys.stderr.write("Loading shared genes from midas db..\n")
midas_shared_genes = midas_db_utils.parse_midas_shared_genes(species_name)             
sys.stderr.write("Done!\n")

Here's what `midas_db_utils.parse_midas_shared_genes(species_name)` is doing:
1. It initializes `midas_shared_genes` as an empty set
2. It loads `centroid_gene_map`
    - This is taking the `gene_info.txt.gz` file for a given species (e.g., `/u/project/ngarud/Garud_lab/midas_db_v1.2/pan_genomes/Bacteroides_uniformis_57318/gene_info.txt.gz`), and creating a dictionary (**centroid_gene_map**).
    - This dictionary uses `centroid_95` as the dictionary key, and as the value it uses either:
        - `gene_id`, if the `gene_id` is in the reference genome and the `centroid_95` is not in the reference genome; or
        - `centroid_95` itself otherwise.
3. It loads `cross_species_centroids.txt.gz` (`'/u/project/ngarud/Garud_lab/midas_db_v1.2/cross_species_centroids.txt.gz'`), which represents gene centroids observed across species boundaries.
4. For each line of the file, it
    - initializes `midas_shared_genes` as an empty set.
    - It assigns the first column to `big_centroid`, and appends `big_centroid` to the `midas_shared_genes` set.
    - it assigns the second column (which could hold multiple genes) to a vector called `other_centroids`
    - it loops through each `other_centroid`, and appends these centroids to `midas_shared_genes` if they are a key in the `centroid_gene_map`
5. it outputs `midas_shared_genes`

In [None]:
# Flag for unusable pangenome data; the coverage checks below may set it to True.
bad_pangenome_data = False #it starts as false, but may be set to true later

In [None]:
# Load gene coverage information for species_name
# parse_pangenome_data returns six values, unpacked here in order.
sys.stderr.write("Loading pangenome data for %s...\n" % species_name)
gene_samples, gene_names, gene_presence_matrix, gene_depth_matrix, marker_coverages, gene_reads_matrix = parse_midas_data.parse_pangenome_data(species_name)
sys.stderr.write("Done!\n")  

In [None]:
"245018.3.peg.2048" in gene_names

In [None]:
# Number of marker-coverage entries loaded for this species.
len(marker_coverages)

Where:
- `gene_samples` is a list of samples for which genes were calculated in the MIDAS pipeline (e.g., for *B. uniformis*, there are 33 samples for which gene information was calculated)
- `gene_names` is a list of gene names (e.g., there are 8914 for *B. uniformis*)
- `gene_presence_matrix` is a matrix of dimensions [# of genes, # of samples] that contains a boolean outcome for presence/absence (e.g., for *B. uniformis*, it has dimensions [8914, 33])
- `gene_depth_matrix` has the same dimensions as `gene_presence_matrix`, but is filled with average gene depth values.
- `marker_coverages` is a vector of marker coverages for each sample (e.g., for *B. uniformis*, it has 33 values for all 33 samples).
- `gene_reads_matrix` has the same dimensions as `gene_presence_matrix`, but is filled with the number of reads mapping to each gene (in other words, it's not normalized by gene length). 


In [None]:
# Decide whether the pangenome data for this species is usable.
#
# The coverage mask is computed unconditionally so that `high_coverage_idxs`
# is always defined. Previously it was skipped when `marker_coverages` was
# empty, and any later cell that indexes with the mask raised a NameError.
high_coverage_idxs = (numpy.asarray(marker_coverages) >= min_coverage)  # samples with marker coverage >= min_coverage

# No samples at all, or no sample above the coverage threshold: mark the data
# as bad so the reference-gene fallback below is used.
if len(marker_coverages) == 0 or not high_coverage_idxs.any():
    bad_pangenome_data = True

In [None]:
if bad_pangenome_data:
    # Fallback: with unusable pangenome data, derive every gene list from the
    # reference genome and the MIDAS shared-gene set alone.
    sys.stderr.write("Bad pangenome data for %s!\n" % species_name)
    reference_only_genes = reference_genes - midas_shared_genes
    shared_gene_names = sorted(midas_shared_genes)
    # Two separate sorted lists, so changing one never affects the other.
    core_gene_names = sorted(reference_only_genes)
    stringent_gene_names = sorted(reference_only_genes)

If `bad_pangenome_data == True`, then the core genes that are written out are based on the reference genome. Specifically:
- shared genes are those loaded from the MIDAS database (`midas_shared_genes`)
- core genes are those in the reference that *aren't* present in the shared genes vector
    - stringent_gene_names is the same thing
    
    
The rest of this exploration script will be for if the bad_pangenome_data == False.

In [None]:
# All inputs here come from parse_midas_data.parse_pangenome_data().
# NOTE: this cell overwrites its own inputs, so it should be run only once
# after the pangenome data has been loaded.
gene_names = numpy.array(gene_names)

# Restrict the per-sample data to the high-coverage samples.
gene_samples = gene_samples[high_coverage_idxs]
marker_coverages = marker_coverages[high_coverage_idxs]
gene_depth_matrix = gene_depth_matrix[:, high_coverage_idxs]

# Convert depth to copy number by dividing by the marker coverage of each
# sample; a zero coverage is replaced by 1 to avoid division by zero.
nonzero_marker_coverages = marker_coverages + (marker_coverages == 0)
gene_copynum_matrix = gene_depth_matrix * 1.0 / nonzero_marker_coverages


In [None]:
# Mask of samples accepted by get_good_pangenome_samples, and its element-wise negation.
good_sample_idxs = get_good_pangenome_samples(marker_coverages, gene_copynum_matrix, species_name)
bad_sample_idxs = numpy.logical_not(good_sample_idxs)

What is `get_good_pangenome_samples` doing?
1. calculates the number of reference genes (using `parse_midas_data.load_reference_genes(species_name)`)
2. counts the number of present genes in each sample by calculating how many elements in `gene_copynum_matrix` are over cmin (0.3) and summing the resulting columns.
3. counts the number of genes in each sample over our high gene threshold by calculating how many elements in `gene_copynum_matrix` are over cmax (3) and summing the resulting columns. 
4. What is `(num_present_genes>0.3*num_reference_genes)*(num_high_genes<0.3*num_present_genes)` doing?
    - `(num_present_genes>0.3*num_reference_genes)` identifies the samples in which the number of present genes is > $0.3 \times \text{the number of reference genes}$
    - `(num_high_genes<0.3*num_present_genes)` identifies the samples in which the number of high-copy-number genes is < $0.3 \times \text{the number of genes present (in that sample)}$
    - multiplying the two together *only* produces a true in the output vector if there is a true at the same position in both input vectors
    - ***essentially, this selects for samples in which more than 30% of the reference genes are present and fewer than 30% of the present genes have a copy number over 3.***

What is `bad_sample_idxs`?
- this just inverts each boolean value in the `good_sample_idxs` vector

In [None]:
# Per-gene boolean masks, in the same order as gene_names.
# True where the gene is in the reference genome:
reference_gene_idxs = numpy.array([gene_name in reference_genes for gene_name in gene_names])
# True where the gene is in the MIDAS shared-gene set:
midas_shared_idxs = numpy.array([gene_name in midas_shared_genes for gene_name in gene_names])

# Keep only the good samples in the sample list, the marker coverages and the
# columns of the copy-number matrix.
# NOTE: this overwrites the inputs, so the cell should be run only once.
gene_samples = gene_samples[good_sample_idxs]
marker_coverages = marker_coverages[good_sample_idxs]
gene_copynum_matrix = gene_copynum_matrix[:, good_sample_idxs]


In [None]:
# True for every gene whose copy number exceeds shared_cmin in at least one sample.
exceeds_shared_cmin = gene_copynum_matrix > shared_cmin
metagenome_shared_idxs = exceeds_shared_cmin.any(axis=1)


What is `metagenome_shared_idxs`?
- This is a boolean vector the length of the number of genes
- each element indicates whether that gene has a copy number >3 in *any* sample

If a gene has CN>3 in any sample, this provides evidence that it's shared between species (hence the name).

In [None]:
# A gene counts as shared if either the metagenomic data or the MIDAS
# database flags it.
shared_idxs = numpy.logical_or(metagenome_shared_idxs, midas_shared_idxs)
# Complement: genes flagged by neither source.
non_shared_idxs = ~shared_idxs
# Names of the shared genes.
shared_gene_names = gene_names[shared_idxs]


In [None]:
# Good genes: copy number within [cmin, cmax] in at least min_good_fraction
# of the remaining samples.
copynum_in_range = numpy.logical_and(gene_copynum_matrix >= cmin, gene_copynum_matrix <= cmax)
in_range_fraction = copynum_in_range.sum(axis=1) * 1.0 / len(marker_coverages)
good_idxs = (in_range_fraction >= min_good_fraction)

# Core genes: good, in the reference genome, and not shared.
core_gene_idxs = good_idxs * reference_gene_idxs * non_shared_idxs
core_gene_names = gene_names[core_gene_idxs]

# Summary counts, printed in the next cell.
not_midas_shared_idxs = numpy.logical_not(midas_shared_idxs)
not_metagenome_shared_idxs = numpy.logical_not(metagenome_shared_idxs)

num_metagenome_and_midas = numpy.logical_and(midas_shared_idxs, metagenome_shared_idxs).sum()
num_metagenome_only = numpy.logical_and(metagenome_shared_idxs, not_midas_shared_idxs).sum()
num_midas_only = numpy.logical_and(midas_shared_idxs, not_metagenome_shared_idxs).sum()
num_metagenome_or_midas = shared_idxs.sum()
num_remaining = non_shared_idxs.sum()
num_reference_remaining = (non_shared_idxs * reference_gene_idxs).sum()
num_core = core_gene_idxs.sum()


**good_idxs:** 
1. For each gene, the number of samples with $0.3 \le \text{GCN} \le 3$ is divided by the number of good samples
2. If the result is at least `min_good_fraction` (i.e., 0.9), it's 1; if not, it's 0. 
3. output: a vector of 1s and 0s corresponding to all genes
**core_gene_idxs:**
1. multiplies the boolean vectors describing good indices, reference gene indices, and non-shared indices
2. Only genes that are true in all three are true in the output
**core_gene_names:** using `core_gene_idxs` to retrieve core gene names

**STATS:**
- `num_metagenome_and_midas`: number of genes that are shared as determined by **both** midas_db_utils and from the metagenomic data itself
- `num_metagenome_only`: number of genes that are shared as calculated by metagenomic data only (that is, excluding those that are calculated from midas_db_utils)
- `num_midas_only`: number of genes that are shared as calculated by midas_db_utils only (that is, excluding those calculated directly from the data)
- `num_metagenome_or_midas`: total number of shared genes, as calculated by **either** midas_db_utils or from the metagenomic data itself
- `num_remaining`: number of non-shared genes
- `num_reference_remaining`: number of non-shared reference genes
- `num_core`: number of core genes

In [None]:
# One summary line: species name followed by the seven counts computed above.
summary_fields = (species_name, num_metagenome_and_midas, num_metagenome_only, num_midas_only, num_metagenome_or_midas, num_remaining, num_reference_remaining, num_core)
print("%s %d %d %d %d %d %d %d" % summary_fields)

In [None]:
# Per-gene prevalence.
# Numerator: samples where the copy number lies within [cmin, cmax].
# Denominator: samples where the copy number is at most cmax.
copynum_at_most_cmax = gene_copynum_matrix <= cmax
copynum_within_bounds = numpy.logical_and(gene_copynum_matrix >= cmin, copynum_at_most_cmax)
gene_prevalence_numerators = copynum_within_bounds.sum(axis=1)
gene_prevalence_denominators = copynum_at_most_cmax.sum(axis=1)

# Keep non-shared genes whose numerator and denominator are both non-zero
# (the counts are integers, so ">= 1" matches the original "> 0.5" test).
good_prevalence_idxs = (gene_prevalence_numerators >= 1) & (gene_prevalence_denominators >= 1) & non_shared_idxs

gene_prevalence_names = gene_names[good_prevalence_idxs]
kept_numerators = gene_prevalence_numerators[good_prevalence_idxs]
kept_denominators = gene_prevalence_denominators[good_prevalence_idxs]
gene_prevalences = kept_numerators * 1.0 / kept_denominators


- `gene_prevalence_numerators`: For each gene, the number of samples in which the gene has $0.3 \le \text{CN} \le 3$ (i.e., present)
- `gene_prevalence_denominators`: For each gene, the number of samples in which $\text{CN} \le 3$ (i.e., not shared, possibly present, possibly absent)
- `good_prevalence_idxs`: this is a boolean vector for all genes. The value for a gene is true if it's present in at least one sample, has a CN ≤ 3 in at least one sample (kinda redundant), and is not shared (also kinda redundant). 
- `gene_prevalence_names`: names of non-shared genes that are present in at least one sample
- `gene_prevalences`: The actual prevalence of each gene (only considering non-shared genes that are present in at least one sample)

# MW: Addition to add shared genes we detect in our data...

## Step 1: load in shared genes file

In [None]:
# Show which species is currently selected.
species_name

In [None]:
# Shared genes for this species, read through parse_shared_genes from the two shared-gene files.
HMP_shared_genes = parse_shared_genes(species_name, default_shared_gene_filename, default_external_shared_gene_filename)


In [None]:
# Same call as the assignment cell above, repeated only to display the returned value.
parse_shared_genes(species_name, default_shared_gene_filename, default_external_shared_gene_filename)


In [None]:
# Add the shared genes found in this notebook to the loaded set.
# NOTE: this changes HMP_shared_genes in place, so every later cell sees the
# merged set rather than the set as it was loaded.
HMP_shared_genes |= set(shared_gene_names)

In [None]:
# Display the shared-gene set as a numpy array (element order follows set iteration order).
numpy.array(list(HMP_shared_genes))

In [None]:
# Scratch check of set.update semantics with overlapping sets:
# set_2 ends up holding the union, set_1 is left unchanged.
set_1 = {0, 1, 2, 3, 4}
set_2 = {1, 2, 3, 4, 5}

set_2 |= set_1

In [None]:
# Scratch check of set.update semantics with disjoint sets:
# set_1 gains the element of set_2, set_2 is left unchanged.
set_1 = {0, 1, 2}
set_2 = {3}
set_1 |= set_2
set_1

In [None]:
# Size of set_1 after the update in the previous cell.
len(set_1)

In [None]:
# Items of gids_in_ena that are absent from sample_text_gid.
# The original comprehension was a syntax error: it had no loop variable
# and no `in` ("for gids_in_ena if ...").
# NOTE(review): gids_in_ena and sample_text_gid are not defined in any visible
# cell of this notebook; this looks like leftover scratch work. Confirm it is
# still needed.
[gid for gid in gids_in_ena if gid not in sample_text_gid]

In [None]:
# Display the whole shared-gene set.
HMP_shared_genes

In [None]:
# Spot check: is this one gene id among the shared gene names?
'1095771.3.peg.969' in shared_gene_names

In [None]:
# Core genes for this species, as returned by parse_core_genes.
HMP_core_genes = parse_core_genes(species_name)

In [None]:
# Display the core genes loaded above.
HMP_core_genes

In [None]:
# Number of distinct shared gene names that are also in HMP_shared_genes.
# NOTE: once shared_gene_names has been merged into HMP_shared_genes by the
# in-place update cell, this is simply the number of distinct shared gene names.
len(set(shared_gene_names) & HMP_shared_genes)

In [None]:
# Size of the shared-gene set in its current state.
len(HMP_shared_genes)

In [None]:
# Number of shared gene names computed in this notebook.
len(shared_gene_names)