In [None]:
# import numpy as np
# import pandas as pd
# import os
# import sys



# correcting gene name issue

## `core_gene_utils.py`

In [1]:
import numpy
import sys
import config
import gzip
import os.path
import os
import midas_db_utils
import parse_midas_data #MW: Added 09/21/23
import species_utils
import core_gene_utils

In [2]:
# Directories that hold the core-gene outputs, rooted at the configured data directory.
core_genes_directory = "%s%s" % (config.data_directory, "core_genes/")
external_core_genes_directory = "%s%s" % (config.data_directory, "core_genes/external/")

# Default gene-list files under the external directory.
default_external_shared_gene_filename = "%sshared_genes.txt.gz" % external_core_genes_directory
default_external_core_gene_filename = "%score_genes.txt.gz" % external_core_genes_directory
default_external_stringent_core_gene_filename = "%score_genes_stringent.txt.gz" % external_core_genes_directory
# The "%s" is left unfilled on purpose: this is a template to be formatted later.
default_external_gene_freq_template = external_core_genes_directory + "%s_gene_freqs.txt.gz"

# Default gene-list files under the main core-gene directory.
default_shared_gene_filename = "%sshared_genes.txt.gz" % core_genes_directory
default_core_gene_filename = "%score_genes.txt.gz" % core_genes_directory
default_stringent_core_gene_filename = "%score_genes_stringent.txt.gz" % core_genes_directory
# As above, the "%s" placeholder stays in the resulting string.
default_gene_freq_template = core_genes_directory + "%s_gene_freqs.txt.gz"

#HMP data #MW added on 11/2/23
shared_gene_HMP_filename = "%sshared_genes_HMP.txt.gz" % core_genes_directory
core_gene_HMP_filename = "%score_genes_HMP.txt.gz" % core_genes_directory
stringent_core_gene_HMP_filename = "%score_genes_stringent_HMP.txt.gz" % core_genes_directory

# MW 08/29/2025: loaded all species in the species_snps.txt file
pangenome_species = parse_midas_data.parse_species_list()

# Copy-number and prevalence thresholds, all taken from config.
cmin, cmax = config.core_genome_min_copynum, config.core_genome_max_copynum
shared_cmin = config.shared_genome_min_copynum
min_good_fraction = config.core_genome_min_prevalence

# (for assessing core genome, we'll use a lower coverage value than when we look at real changes)
min_coverage = 5


# Species examined throughout this section of the notebook.
species_name = "100112"

In [3]:
sys.stderr.write("Loading genes on reference genome..\n")
reference_genes = midas_db_utils.load_reference_genes(species_name)
sys.stderr.write("Done!\n")

Loading genes on reference genome..
Done!


6

In [4]:
# Load gene coverage information for species_name
bad_pangenome_data = False
sys.stderr.write("Loading pangenome data for %s...\n" % species_name)
gene_samples, gene_names, gene_presence_matrix, gene_depth_matrix, marker_coverages, gene_reads_matrix = parse_midas_data.parse_pangenome_data(species_name)
sys.stderr.write("Done!\n") 

Loading pangenome data for 100112...
MGBC000112_03705 gene not found in .genes file in clusters_99_info.tsv file. Appending normal gene name.
MGBC118706_02522 gene not found in .genes file in clusters_99_info.tsv file. Appending normal gene name.
MGBC000057_02978 gene not found in .genes file in clusters_99_info.tsv file. Appending normal gene name.
Done!


6

In [5]:
"MGBC000057_03240" in reference_genes

False

In [6]:
high_coverage_idxs = (marker_coverages>=min_coverage)
high_coverage_idxs.sum() < 0.5

False

In [7]:
# Restrict the per-sample arrays to the samples whose marker coverage is >= min_coverage
# (high_coverage_idxs is the boolean mask computed in the previous cell).
gene_names = numpy.array(gene_names)
gene_samples = gene_samples[high_coverage_idxs]
marker_coverages = marker_coverages[high_coverage_idxs]
gene_depth_matrix = gene_depth_matrix[:,high_coverage_idxs] 
# Copy number = gene depth / marker coverage. Adding (marker_coverages==0) turns a zero
# denominator into 1 so the division never divides by zero.
gene_copynum_matrix = gene_depth_matrix*1.0/(marker_coverages+(marker_coverages==0))


In [8]:
good_sample_idxs = core_gene_utils.get_good_pangenome_samples(species_name, marker_coverages, gene_copynum_matrix)
bad_sample_idxs = numpy.logical_not(good_sample_idxs)

In [9]:
# Keep only the samples selected by core_gene_utils.get_good_pangenome_samples().
gene_samples = gene_samples[good_sample_idxs]
marker_coverages = marker_coverages[good_sample_idxs]
gene_copynum_matrix = gene_copynum_matrix[:,good_sample_idxs]

# Boolean mask over gene_names: True where the gene name is found in reference_genes.
reference_gene_idxs = numpy.array([gene_name in reference_genes for gene_name in gene_names]) 
# THIS MIGHT BE A PROBLEM -- an earlier run had ONLY 708 genes "pass" (i.e. found in the
# reference genome); the run recorded in this notebook reports 1052 (output of the next cell).


In [10]:
sum(reference_gene_idxs)

1052

In [11]:
### MY CODE ###
non_reference_genes = numpy.array([gene_name for gene_name in gene_names if gene_name not in reference_genes]) 
"MGBC000112_02952" in reference_genes
###############

True

In [1]:
len(['205567', '263040', '211853', '213184', '100334', '234341', '214912', '251524', '266763', '265132', '264624', '261649', '240066', '237892', '214845', '213999', '209914', '207693', '207550', '207450', '204195', '264289', '262170', '259253', '213942', '100555', '100375', '264714', '261672', '231109', '214603', '208232', '204819', '204699', '100042', '236627', '230525', '216453', '216392', '264455', '263672', '231258', '217378', '100600', '213583', '100158', '265345', '253991', '232356', '217253', '217094', '205224', '205142', '100320', '265282', '265025', '264958', '264316', '261755', '229722', '229547', '217744', '209106', '100111', '263323', '261676', '231887', '206400', '202897', '201916', '265152', '262018', '229998', '218434', '216954', '211235', '263558', '231977', '216689', '206896', '205270', '100500', '264599', '263490', '231995', '231118', '229914', '217610', '100113', '264501', '264045', '263510', '255975', '237711', '231349', '217240', '208519', '205132', '100110', '265440', '264450', '204380', '203686', '266183', '264298', '263303', '242035', '236833', '231504', '100338', '264454', '231066', '229633', '229275', '209838', '208766', '100395', '100171', '100118'])

119

In [12]:
metagenome_shared_idxs = ((gene_copynum_matrix>shared_cmin).sum(axis=1)>0.5) # 341 shared

In [13]:
shared_idxs = metagenome_shared_idxs # MW 07/09/2025: no cross_species_centroids.txt.gz
non_shared_idxs = numpy.logical_not(shared_idxs) # 4479 non-shared

In [14]:
# A gene is "good" when its copy number lies in [cmin, cmax] in at least a fraction
# min_good_fraction of the remaining samples. The original note gives these as 0.3, 3 and 90%
# and counts 2147 good idxs -- thresholds come from config; TODO confirm the values.
good_idxs = (((gene_copynum_matrix>=cmin)*(gene_copynum_matrix<=cmax)).sum(axis=1)*1.0/len(marker_coverages) >= min_good_fraction)
# Core genes must be good AND in the reference gene list AND not shared
# (multiplying boolean arrays acts as an element-wise AND).
core_gene_idxs = good_idxs*reference_gene_idxs*non_shared_idxs
core_gene_names = gene_names[core_gene_idxs] # an earlier run gave only 39 genes :( -- the run recorded in the next cell gives 289

In [15]:
len(core_gene_names) # 289

289

In [None]:
"MGBC000112_02952" in reference_genes

In [None]:
"MGBC118706_01662" in gene_names

## `parse_midas_data.parse_pangenome_data()`

In [None]:
import numpy
import sys
import bz2
import gzip
import lz4.frame
import os.path 
import stats_utils 
from math import floor, ceil
import gene_diversity_utils

import config
import sample_utils
from parse_midas_data import *

In [None]:
species_name = "100112"
allowed_samples = []
allowed_genes=[]
convert_centroid_names=True
disallowed_genes=[]
remove_c = False

In [None]:
data_directory = config.data_directory
analysis_directory = config.analysis_directory
scripts_directory = config.scripts_directory
midas_directory = config.midas_directory
project_directory = config.project_folder #MW: added 12/15/23
metadata_directory = config.metadata_directory #MW: added 07/30/24

# We use this one to debug because it was the first one we looked at
debug_species_name = config.debug_species_name

from sample_utils import *

In [None]:
# Open post-processed MIDAS output
# Raw read counts
gene_reads_file =  lz4.frame.open("%sgenes/%s/%s.genes_reads.tsv.lz4" % (data_directory, species_name, species_name),"rt", encoding = "utf-8") # MW 07/14/2025: decoding bytes like object
# Depth (read counts / length?)
gene_depth_file =  lz4.frame.open("%sgenes/%s/%s.genes_depth.tsv.lz4" % (data_directory, species_name, species_name),"rt", encoding = "utf-8") # MW 07/14/2025: decoding bytes like object
# Presence absence calls
gene_presabs_file =  lz4.frame.open("%sgenes/%s/%s.genes_presabs.tsv.lz4" % (data_directory, species_name, species_name),"rt", encoding = "utf-8") # MW 07/14/2025: decoding bytes like object


In [None]:
# Collect the marker coverage of every sample for `species_name`.
# MW 07/10/2025: gene_summary.tsv file now found in genes/ folder and has all species
marker_coverage_map = {}
marker_coverage_samples = []
marker_coverages = []
# `with` closes the file even if a row fails to parse (e.g. a non-numeric coverage).
with open("%sgenes/genes_summary.tsv" % (data_directory),"r") as gene_summary_file:
    gene_summary_file.readline() # header
    for summary_line in gene_summary_file:
        items = summary_line.split()
        if len(items)==0:
            # blank line (e.g. trailing newline at end of file): nothing to parse
            continue
        sample = items[0].strip()
        species_id = items[1].strip()
        if species_id != species_name: # MW 07/10/2025: gene_summary.tsv has all species, so skip row that doesn't have species
            continue
        marker_coverage = float(items[8]) # MW 07/10/2025: gene_summary.tsv marker_depth at index 8, not 5
        marker_coverage_samples.append(sample)
        marker_coverages.append(marker_coverage)

In [None]:
marker_coverage_map = {sample: marker_coverage for sample,marker_coverage in zip(marker_coverage_samples, marker_coverages)}

In [None]:
# Now read through remaining files
reads_line = gene_reads_file.readline() # header
depth_line = gene_depth_file.readline() # header
presabs_line = gene_presabs_file.readline() # header
items = presabs_line.split() # header

if remove_c: #MW 09/23/24: added remove_c condition
    samples = sample_utils.parse_merged_sample_names(items[1:])
else: 
    samples = items[1:]

# ordered vector of marker coverages (guaranteed to be in same order as samples)
marker_coverages = numpy.array([marker_coverage_map[sample] for sample in samples])

if len(allowed_samples)==0:
    allowed_samples = set(samples)
else:
    allowed_samples = (set(allowed_samples) & set(samples))
    
desired_sample_idxs = numpy.array([sample in allowed_samples for sample in samples])
desired_samples = numpy.array(samples)[desired_sample_idxs] #MW 10/01/24: converted samples to an array (used to just be samples[desired_sample_idxs]

marker_coverages = marker_coverages[desired_sample_idxs]
gene_presence_matrix = []
gene_reads_matrix = []
gene_depth_matrix = []
gene_names = []

num_genes_processed = 0

In [None]:
# The header row of each table was already consumed in the previous cell, so these
# reads fetch the FIRST DATA ROW of each file (they are not header reads).
reads_line = gene_reads_file.readline() # first data row
depth_line = gene_depth_file.readline() # first data row
presabs_line = gene_presabs_file.readline() # first data row

# Walk the three tables in lockstep, one gene per row, until the reads file is exhausted.
# NOTE(review): assumes all three files list the same genes in the same order -- confirm.
while reads_line!="":
    
    # Loop through!
    
    items = presabs_line.split() # MW 07/09/2025: decode bytes like object
    gene_name = items[0] 
    # First column is the gene name; the rest are per-sample values, masked to the desired samples.
    gene_presences = numpy.array([float(item) for item in items[1:]])[desired_sample_idxs]
    
    # The presence filter is disabled: every gene is kept, whether or not it is present anywhere.
    if True: #gene_presences.sum() > 0.5:
    
        gene_reads = numpy.array([float(item) for item in reads_line.split()[1:]])[desired_sample_idxs]
        gene_depths = numpy.array([float(item) for item in depth_line.split()[1:]])[desired_sample_idxs]
        
        # Note to self: not uniform across samples!
        #gene_lengths = gene_reads/(gene_depths+(gene_reads<0.5))
        #print gene_lengths
        
        # Original note: "gene is present in at least one individual!" -- only true when
        # the commented-out filter above is re-enabled.
        gene_presence_matrix.append(gene_presences)
        gene_depth_matrix.append(gene_depths)
        gene_reads_matrix.append(gene_reads)
        gene_names.append(gene_name)    
    
    num_genes_processed+=1
    
    # Advance all three files to the next gene; readline() returns "" at end of file.
    reads_line = gene_reads_file.readline() # next data row
    depth_line = gene_depth_file.readline() # next data row
    presabs_line = gene_presabs_file.readline() # next data row

gene_reads_file.close()
gene_depth_file.close()
gene_presabs_file.close()

In [None]:
# Translate the gene names read from the tables through the centroid -> gene map.
if not convert_centroid_names:
    new_gene_names = gene_names
else:
    new_gene_names = []
    centroid_gene_map = load_centroid_gene_map(species_name)
    # MW 07/14/2025: For whatever reason not all centroid_ids or gene_ids are in the pangenome
    # database (accessed by load_reference_genes()), so we just attach the normal gene name
    # if that's the case.
    for gene_name in gene_names:
        if gene_name not in centroid_gene_map:
            sys.stderr.write("%s gene not found in .genes file in gene_annotations/ directory. Appending normal gene name.\n" % gene_name)
            new_gene_names.append(gene_name)
            continue
        new_gene_names.append(centroid_gene_map[gene_name])

In [None]:
centroid_gene_map = load_centroid_gene_map(species_name)

In [None]:
centroid_gene_map['MGBC118706_01662']

In [None]:
import midas_db_utils
reference_genes = midas_db_utils.load_reference_genes(species_name)


In [None]:
"MGBC000057_03240" in reference_genes

In [None]:
"MGBC000112_02952" in centroid_gene_map.values()

In [None]:
"MGBC118706_01662" in centroid_gene_map.keys()

In [None]:
{key:value for key,value in centroid_gene_map.items() if key == "MGBC118706_01662"}

In [None]:
{key: value for key,value in centroid_gene_map.items() if value == "MGBC118706_01662"}

In [None]:
gene_presence_matrix = numpy.array(gene_presence_matrix)
gene_depth_matrix = numpy.array(gene_depth_matrix)
gene_reads_matrix = numpy.array(gene_reads_matrix)

In [None]:
def parse_pangenome_data(species_name, allowed_samples = [], allowed_genes=[], convert_centroid_names=True, disallowed_genes=[], remove_c = False): #MW 09/23/24: added remove_c condition
    """Load the per-gene pangenome tables (presence, depth, reads) for one species.

    Parameters
    ----------
    species_name : str
        Species identifier; used to build the input paths and to select the
        matching rows of genes_summary.tsv.
    allowed_samples : iterable of str, optional
        If non-empty, only these samples (intersected with the samples named in
        the table header) are kept. If empty, every sample is kept.
    allowed_genes : list, optional
        Currently unused; kept in the signature so existing callers keep working.
    convert_centroid_names : bool, optional
        If True, gene names are translated through
        load_centroid_gene_map(species_name). Names missing from that map are
        kept unchanged and reported on stderr.
    disallowed_genes : iterable of str, optional
        Gene names (after any conversion) to drop from the outputs.
    remove_c : bool, optional
        If True, sample names from both genes_summary.tsv and the table header
        are passed through sample_utils.parse_merged_sample_names.

    Returns
    -------
    tuple
        (desired_samples, gene_names, gene_presence_matrix, gene_depth_matrix,
        marker_coverages, gene_reads_matrix). Each matrix has one row per kept
        gene and one column per kept sample; marker_coverages is aligned with
        desired_samples. Six empty lists are returned when
        pangenome_data_exists(species_name) is false.

    Raises
    ------
    KeyError
        If a sample named in the table header has no row for this species in
        genes_summary.tsv.
    """

    if not pangenome_data_exists(species_name):
        return [], [], [], [], [], []

    # First read through gene_summary_file to get marker gene coverages
    marker_coverage_samples, summary_coverages = _read_marker_coverages(species_name)

    if remove_c: #MW 09/23/24: added remove_c condition
        marker_coverage_samples = sample_utils.parse_merged_sample_names(marker_coverage_samples)

    marker_coverage_map = {sample: marker_coverage for sample,marker_coverage in zip(marker_coverage_samples, summary_coverages)}

    gene_presence_matrix = []
    gene_reads_matrix = []
    gene_depth_matrix = []
    gene_names = []

    # Open post-processed MIDAS output. `with` closes all three handles even if
    # a row fails to parse. (MW 07/14/2025: text mode, decoding bytes like object)
    #   reads   -> raw read counts
    #   depth   -> depth (read counts / length?)
    #   presabs -> presence absence calls
    with lz4.frame.open("%sgenes/%s/%s.genes_reads.tsv.lz4" % (data_directory, species_name, species_name),"rt", encoding = "utf-8") as gene_reads_file, \
         lz4.frame.open("%sgenes/%s/%s.genes_depth.tsv.lz4" % (data_directory, species_name, species_name),"rt", encoding = "utf-8") as gene_depth_file, \
         lz4.frame.open("%sgenes/%s/%s.genes_presabs.tsv.lz4" % (data_directory, species_name, species_name),"rt", encoding = "utf-8") as gene_presabs_file:

        # Consume the header row of each table; sample names come from the presabs header.
        gene_reads_file.readline() # header
        gene_depth_file.readline() # header
        items = gene_presabs_file.readline().split() # header

        if remove_c: #MW 09/23/24: added remove_c condition
            samples = sample_utils.parse_merged_sample_names(items[1:])
        else:
            samples = items[1:]

        # ordered vector of marker coverages (guaranteed to be in same order as samples)
        marker_coverages = numpy.array([marker_coverage_map[sample] for sample in samples])

        if len(allowed_samples)==0:
            allowed_samples = set(samples)
        else:
            allowed_samples = (set(allowed_samples) & set(samples))

        # dtype=bool keeps this a valid mask even when `samples` is empty
        # (an empty list would otherwise become a float64 array, which cannot index).
        desired_sample_idxs = numpy.array([sample in allowed_samples for sample in samples], dtype=bool)
        desired_samples = numpy.array(samples)[desired_sample_idxs] #MW 10/01/24: converted samples to an array (used to just be samples[desired_sample_idxs]
        marker_coverages = marker_coverages[desired_sample_idxs]

        # First data row of each table (the headers were consumed above).
        reads_line = gene_reads_file.readline()
        depth_line = gene_depth_file.readline()
        presabs_line = gene_presabs_file.readline()

        # Walk the three tables in lockstep, one gene per row, until the reads file ends.
        while reads_line!="":

            items = presabs_line.split()
            gene_name = items[0]
            gene_presences = numpy.array([float(item) for item in items[1:]])[desired_sample_idxs]
            gene_reads = numpy.array([float(item) for item in reads_line.split()[1:]])[desired_sample_idxs]
            gene_depths = numpy.array([float(item) for item in depth_line.split()[1:]])[desired_sample_idxs]

            # Note to self: not uniform across samples!
            #gene_lengths = gene_reads/(gene_depths+(gene_reads<0.5))

            # Every gene is kept (the old `gene_presences.sum() > 0.5` filter is disabled).
            gene_presence_matrix.append(gene_presences)
            gene_depth_matrix.append(gene_depths)
            gene_reads_matrix.append(gene_reads)
            gene_names.append(gene_name)

            # readline() returns "" at end of file, which ends the loop.
            reads_line = gene_reads_file.readline()
            depth_line = gene_depth_file.readline()
            presabs_line = gene_presabs_file.readline()

    # Force a (genes, samples) shape so an empty gene table still gives 2-D matrices.
    matrix_shape = (len(gene_names), len(desired_samples))
    gene_presence_matrix = numpy.array(gene_presence_matrix).reshape(matrix_shape)
    gene_depth_matrix = numpy.array(gene_depth_matrix).reshape(matrix_shape)
    gene_reads_matrix = numpy.array(gene_reads_matrix).reshape(matrix_shape)

    if convert_centroid_names:
        new_gene_names = []
        centroid_gene_map = load_centroid_gene_map(species_name)
        # MW 07/14/2025: For whatever reason not all centroid_ids or gene_ids are in the pangenome
        # database (accessed by load_reference_genes()), so we just attach the normal gene name
        # if that's the case.
        for gene_name in gene_names:
            if gene_name in centroid_gene_map:
                new_gene_names.append(centroid_gene_map[gene_name])
            else:
                sys.stderr.write("%s gene not found in .genes file in gene_annotations/ directory. Appending normal gene name.\n" % (gene_name))
                new_gene_names.append(gene_name)
    else:
        new_gene_names=gene_names

    new_gene_names = numpy.array(new_gene_names)

    # Now weed out disallowed genes if provided. A boolean mask (rather than a list of
    # integer positions) stays valid when no gene survives or the table is empty.
    disallowed_genes=set(disallowed_genes)
    allowed_gene_idxs = numpy.array([gene_name not in disallowed_genes for gene_name in new_gene_names], dtype=bool)

    new_gene_names = new_gene_names[allowed_gene_idxs]
    gene_presence_matrix = gene_presence_matrix[allowed_gene_idxs,:]
    gene_depth_matrix = gene_depth_matrix[allowed_gene_idxs,:]
    gene_reads_matrix = gene_reads_matrix[allowed_gene_idxs,:]

    return desired_samples, new_gene_names, gene_presence_matrix, gene_depth_matrix, marker_coverages, gene_reads_matrix


def _read_marker_coverages(species_name):
    """Return (samples, coverages) for the genes_summary.tsv rows of `species_name`."""
    marker_coverage_samples = []
    marker_coverages = []
    # MW 07/10/2025: gene_summary.tsv file now found in genes/ folder and has all species
    with open("%sgenes/genes_summary.tsv" % (data_directory),"r") as gene_summary_file:
        gene_summary_file.readline() # header
        for summary_line in gene_summary_file:
            items = summary_line.split()
            if len(items)==0:
                # blank line (e.g. trailing newline at end of file): nothing to parse
                continue
            sample = items[0].strip()
            species_id = items[1].strip()
            if species_id != species_name: # MW 07/10/2025: gene_summary.tsv has all species, so skip row that doesn't have species
                continue
            marker_coverage_samples.append(sample)
            marker_coverages.append(float(items[8])) # MW 07/10/2025: gene_summary.tsv marker_depth at index 8, not 5
    return marker_coverage_samples, marker_coverages


## `midas_db_utils.py`

In [None]:
import config
import gzip
import os
import os.path
import config
import species_utils

In [None]:
desired_species_name = "100112"
feature_file_intermediate_dir = "%sgene_annotations/%s/" % (config.midas_directory, desired_species_name)
# PROBLEM with the old approach below: it simply took whichever subdirectory os.listdir()
# returned first, instead of asking for the species' reference genome.
# subdir_name = os.listdir(feature_file_intermediate_dir)[0] # PROBLEM
subdir_name = species_utils.load_reference_genome_id(desired_species_name)

# Gene names are the first whitespace-separated column of the .genes file.
reference_genes = []
# `with` closes the file even if a row fails to parse.
with open("%sgene_annotations/%s/%s/%s.genes" % (config.midas_directory, desired_species_name, subdir_name, subdir_name), 'r') as features_file:
    features_file.readline() # header
    for line in features_file:
        items = line.split()
        if len(items)==0:
            # blank line (e.g. trailing newline at end of file): nothing to parse
            continue
        gene_name = items[0].strip()
        reference_genes.append(gene_name)

In [None]:
reference_genes

OK, so the problem is that it loads the FIRST subdirectory name returned by `os.listdir`. This is NOT OK: we need to be pulling the reference (representative) genome instead (I think).

So for `subdir_name` we need to look up the reference genome ID. Let's put this lookup into `species_utils`.

In [None]:
import pandas as pd
species_name = "100112"
# genomes.tsv sits directly under the MIDAS directory and is tab-separated.
genomes_df_path = "%s%s" % (config.midas_directory, "genomes.tsv")
genomes_df = pd.read_csv(genomes_df_path, sep = "\t")
# Cast the species column to str so it can be compared with the string species_name.
genomes_df["species"] = genomes_df["species"].astype(str)
# Take the genome flagged genome_is_representative == 1 for this species.
# .values[0] takes the first match and raises IndexError when no row matches.
reference_genome_id = genomes_df.loc[(genomes_df.species == species_name) & (genomes_df.genome_is_representative == 1), "genome"].values[0]


In [None]:
reference_genome_id

In [None]:
genomes_df.species == "10"

# What's wrong with `within_sample_sfs`?

In [None]:
import parse_midas_data
import pylab
import sys
import numpy
import bz2
import calculate_snp_prevalences
import diversity_utils


In [None]:
species_name = "100112"
debug = True
chunk_size = 1000000000
use_HMP = False

In [None]:
# Build, for every sample and variant type, a map (depth, alt) -> [site count, summed
# population frequency] from annotated_snps.txt.bz2, restricted to core genes.
# Should we do this? 
sys.stderr.write("Loading core genes...\n")
core_genes = parse_midas_data.load_core_genes(species_name)
sys.stderr.write("Done! %d core genes\n" % len(core_genes))
allowed_genes = core_genes

sys.stderr.write("Loading population freqs...\n")
population_freqs = calculate_snp_prevalences.parse_population_freqs(species_name, use_HMP_freqs=use_HMP) 
sys.stderr.write("Done! %d SNVs\n" % len(population_freqs))

# Only sites with one of these variant types are counted.
allowed_variant_type_list = ['1D','2D','3D','4D']
allowed_variant_types = set(allowed_variant_type_list) 


# Open post-processed MIDAS output
snp_file =  bz2.open("%ssnps/%s/annotated_snps.txt.bz2" % (parse_midas_data.data_directory, species_name),"rt", encoding = "utf-8") # MW 07/10/2025: decoding bytes like object
    
# The header's first field is skipped; the remaining fields are the sample names.
line = snp_file.readline() # header
items = line.split()[1:]
samples = numpy.array([item.strip() for item in items])

# We shouldn't be doing this for raw data 
#samples = parse_midas_data.parse_merged_sample_names(items)
    
# One entry per sample. The list of empty dicts is a placeholder: the loop right below
# replaces every entry with a {variant_type: {}} dict.
site_map = [{} for sample in samples]
for sample_idx in range(0,len(samples)):
    site_map[sample_idx] = {variant_type:{} for variant_type in allowed_variant_types}

sys.stderr.write("Calculating within-person SFSs...\n")        
num_sites_processed = 0
for line in snp_file:
    #
    items = line.split()
    # Load information about site
    # items[0] is "chromosome|location|gene_name|variant_type|[polarization|]pvalue"
    info_items = items[0].split("|")
    chromosome = info_items[0]
    location = int(info_items[1])
    gene_name = info_items[2]
    variant_type = info_items[3]
    
    if len(info_items) > 5: # for backwards compatibility
            polarization = info_items[4]
            pvalue = float(info_items[5])
    else: 
        # older format without a polarization field
        polarization="?"
        pvalue = float(info_items[4])
        
    #    
    if variant_type not in allowed_variant_types:
        continue
    #    
    # An empty allowed_genes disables the gene filter.
    if len(allowed_genes)>0 and (gene_name not in allowed_genes):
        continue
    #    
    # Load alt and depth counts
    # Each remaining field is an "alt,depth" pair for one sample.
    alts = []
    depths = []
    for item in items[1:]:
        subitems = item.split(",")
        alts.append(int(subitems[0]))
        depths.append(int(subitems[1]))
    alts = numpy.array(alts)
    depths = numpy.array(depths)
    refs = depths-alts
    #print alts
    #print depths
    #
    # population_freq returns the fraction of people for which the alt is the major allele.
    # This is a very important quantity being computed! It is later used for identifying CPS samples.
    # Sites missing from population_freqs are treated as frequency 0.
    if (chromosome, location) in population_freqs:
        population_freq = population_freqs[(chromosome, location)]
    else:
        population_freq = 0
    
    # polarize SFS according to population freq
    # When the alt is the majority allele, swap alt/ref counts and flip the frequency,
    # so the `alts` used below always refers to the population-minority allele.
    if population_freq>0.5:
        alts,refs = refs,alts
        population_freq = 1-population_freq
        
    #    
    # i indexes samples; alts and depths are aligned with `samples`.
    for i in range(0,len(alts)):
        site = (depths[i],alts[i])
        #
        if site not in site_map[i][variant_type]:
            site_map[i][variant_type][site] = [0,0.0]
        #        
        site_map[i][variant_type][site][0] += 1
        site_map[i][variant_type][site][1] += population_freq # weight of polarization reversals
        #
        #
    # Only sites that passed the variant-type and gene filters are counted here.
    num_sites_processed+=1
    #print num_sites_processed
    if num_sites_processed%50000==0:
        sys.stderr.write("%dk sites processed...\n" % (num_sites_processed/1000))   
        # In debug mode, stop after the first 50000 counted sites.
        if debug:
            break
    
snp_file.close()


In [None]:
gene_name

# What's wrong with `calculate_haploid_samples`?

In [None]:
import numpy
from scipy.linalg import eigh
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.cluster.hierarchy import cophenet
from scipy.cluster.hierarchy import fcluster
from numpy.random import shuffle, normal
import scipy.stats
from scipy.stats import binom
import config
from scipy.special import betainc
import sys
import parse_midas_data
import sample_utils
import stats_utils
import os.path
import sfs_utils
from diversity_utils import *

In [None]:
# Notebook copy of the haploid-sample selection: a sample is called haploid when its
# within-sample polymorphism is low enough.
species_name = "100112"
quick_and_dirty = True
# calculate_highcoverage_samples is not defined in this notebook; presumably it comes
# from `from diversity_utils import *` -- TODO confirm.
desired_samples = calculate_highcoverage_samples(species_name)

# Within-sample SFS data, restricted to the '4D' variant type.
samples, sfs_map = parse_midas_data.parse_within_sample_sfs(species_name, allowed_variant_types=set(['4D'])) 

haploid_samples = []
for sample in desired_samples:
    within_sites, between_sites, total_sites = sfs_utils.calculate_polymorphism_rates_from_sfs_map(sfs_map[sample])

    # Debug output for each sample.
    print("Within sites: %s" % (within_sites))
    print("Between sites: %s" % (between_sites))
    print("Total sites: %s" % (total_sites))
    
    if quick_and_dirty:
        #QUICK AND DIRTY APPROACH: if within-sample diversity is below the threshold, it's a haploid sample! (Mostly works)
        # The threshold used here is 5e-4; the original value was 1e-3.
        # NOTE(review): there is no guard for total_sites == 0 before the division -- confirm it cannot happen.
        if float(within_sites)/total_sites < 5e-4: # Original 1e-3
            print(float(within_sites)/total_sites)
            haploid_samples.append(sample)
    else:
        #actual approach: compare to expected number of between site differences. I use HMP panel for calculating this.
        # threshold_within_between_fraction is not defined in this notebook; presumably it
        # comes from the diversity_utils star import -- TODO confirm before setting quick_and_dirty = False.
        if within_sites <= threshold_within_between_fraction*between_sites:
            haploid_samples.append(sample)    
        
# return numpy.array(haploid_samples)


In [None]:
sfs_map

In [None]:
desired_samples