### Thin Zoonomia dataset

Import modules

In [1]:
from ete3 import Tree
import toytree
import toyplot
import toyplot.pdf
import itertools
import math
from scipy import stats
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import warnings

Functions

In [2]:
def distance_within_order(df, current_sp, nwkete):
    '''Returns the distance from `current_sp` to every other species in `df`.

    Parameters
    ----------
    df : pandas.DataFrame
        Candidate rows; must provide the columns `Species`,
        `Heterozygosity` and `Pi`.
    current_sp : str
        Focal species. Its own row(s) in `df` are skipped.
    nwkete : tree object
        Anything exposing `get_distance(name_a, name_b)`.

    Returns
    -------
    pandas.DataFrame
        Columns `Species`, `distance`, `Heterozygosity`, `Pi`, sorted by
        ascending `distance` and re-indexed 0..n-1. When `df` holds no
        species other than `current_sp` the frame is empty but still has
        these four columns.
    '''
    columns = ["Species", "distance", "Heterozygosity", "Pi"]
    records = [
        [r.Species, nwkete.get_distance(current_sp, r.Species), r.Heterozygosity, r.Pi]
        for _, r in df.iterrows()
        if r.Species != current_sp
    ]
    # Naming the columns in the constructor keeps a zero-row result valid;
    # assigning `.columns` afterwards raises a length-mismatch ValueError
    # when there are no records.
    distance_pi_het = pd.DataFrame(records, columns=columns)
    return distance_pi_het.sort_values(by="distance").reset_index(drop=True)

def choose_pi_over_het(df):
    '''Picks a diversity estimate from the first row of `df` that has one.

    Rows are scanned in their existing order. Within a row, Pi (Buffalo) is
    preferred over Heterozygosity (Zoonomia). Returns NaN when no row holds
    either value (including when `df` is empty).'''
    for _, row in df.iterrows():
        for estimate in (row.Pi, row.Heterozygosity):
            if not math.isnan(estimate):
                return estimate
    return np.nan

def prune_species(d, tree):
    '''Returns the species to discard from `d`: the one ranking lowest in the decision tree.

    Parameters
    ----------
    d : pandas.DataFrame
        Candidate species (the caller passes a pair); needs a `Species`
        column plus one column per key of `tree`.
    tree : dict
        Ordered mapping `column -> criterion`, evaluated in insertion order.
        A criterion of "Max" / "Min" means larger / smaller values are
        preferred; any other value is the preferred category for that column.

    Returns
    -------
    The `Species` value of the row that loses at the first decisive
    criterion, or None when no criterion separates the candidates.
    '''
    for param, criteria in tree.items():
        if criteria not in ("Max", "Min"):
            # Categorical criterion: decisive only if exactly one candidate
            # fails to match the preferred category.
            subd = d[d[param] != criteria]
            if len(subd) == 1:
                return subd.Species.values[0]
        else:
            # All candidates tie on this column: try the next criterion.
            if len(d[param].unique()) == 1:
                continue
            # Put the worst row first: smallest for "Max", largest for "Min".
            # na_position="first" makes a missing value rank as worst;
            # the pandas default (NaN last) would discard the species that
            # actually has data.
            ranked = d.sort_values(by=param,
                                   ascending=(criteria == "Max"),
                                   na_position="first")
            return ranked.Species.values[0]
    # No criterion was decisive.
    return None

Read data

In [14]:
# Ad-hoc lookup: show `diversity_data_source` for the rows of the Buffalo table
# whose FullSpecies contains "Taeniopyg".
# NOTE(review): `buffalo` is only created in the data-loading cell that follows,
# so in this position the cell fails on a fresh kernel (Restart & Run All).
buffalo[buffalo.FullSpecies.str.contains("Taeniopyg")].diversity_data_source

283    Leffler et al.
Name: diversity_data_source, dtype: object

In [3]:
# Assembly metadata: one table, loaded as-is.
meta = pd.read_csv("./../data/zoonomia_assembly_metadata.csv")

# Phylogenies: the full tree, plus a headerless two-column table of
# per-order trees (order name, Newick string).
nwkete = Tree("./../trees/241-mammalian-2020v2.phast-242.nh", format=1)
order_trees = pd.read_csv("../trees/orders_phylofit_nwk.txt", sep="\t", header=None)
order_trees.columns = ["order", "tree"]

# DNM counts: keep only the species name and the two parental counts.
dnms = pd.read_csv("./../data/dnm_est.tsv",sep="\t")[["Species", "Mat DNMs", "Pat DNMs"]]
# Normalise names: drop anything from the first "(" onwards, trim trailing
# whitespace, and replace spaces with underscores.
dnms["Species"] = [
    name.split("(")[0].rstrip().replace(" ", "_")
    for name in dnms.Species
]
dnms["Total_dnms"] = dnms["Mat DNMs"] + dnms["Pat DNMs"]
# Species -> total (maternal + paternal) DNM count.
dnm_counts = dict(zip(dnms["Species"], dnms["Total_dnms"]))

# Tree of sex: flag species whose genus has a mammalian entry with a
# karyotype other than plain "XY".
treeofsex = pd.read_csv("./../data/tree_of_sex.csv")
column = 'Karyotype (ZO,ZW,XY,XO,WO,homomorphic,complex XY,complex ZW)'
# NOTE(review): rows with a missing karyotype also satisfy `!= "XY"` and are
# therefore treated as non-XY here -- confirm that this is intended.
complex_XY = treeofsex[(treeofsex["Higher taxonomic group"]=="Mammalia") & (treeofsex[column]!="XY")]
# The first whitespace-separated token of the species name is used as the genus.
# Exclude the genus Mus by exact match: a substring test ("Mus" in name) would
# also drop unrelated genera that merely start with those letters
# (e.g. Mustela, Muscardinus).
genus_complex_XY = [sp.split()[0] for sp in complex_XY["Species"] if sp.split()[0] != "Mus"]
# Both names are kept for other cells; they hold the same species as two
# independent lists.
species_complex_XY = list(meta[meta.Genus.isin(genus_complex_XY)]["Species"].values)
complex_XY_sp = list(species_complex_XY)

# Diversity estimates from Vince Buffalo's paper.
buffalo = pd.read_csv("./../data/buffalo_variation.tsv",sep="\t")
# Species names with whitespace runs collapsed to single underscores.
buffalo["FullSpecies"] = buffalo["species"].map(lambda name: "_".join(name.split()))
# The table stores log10(diversity); convert back to the linear scale.
buffalo["Pi"] = buffalo["log10_diversity"].map(lambda log_pi: 10**log_pi)

# Add DNM counts to metadata (0 for species without a DNM estimate).
meta["dnm"] = [dnm_counts.get(sp, 0) for sp in meta.Species]
# Zero-filled copy of the AnAge trait count.
# NOTE(review): this creates a new column `Anage_ntraits` (lower-case "a");
# the raw `AnAge_ntraits` column keeps its NaNs.
meta["Anage_ntraits"] = [
    0 if math.isnan(trait_count) else trait_count
    for trait_count in meta.AnAge_ntraits
]
# Species whose assembly comes from a male individual.
males = meta.loc[meta.Sex == "male", "Species"].tolist()

# Add Buffalo's Pi to metadata
# Silence the "Mean of empty slice" warning (presumably raised by the
# per-genus nanmean below when a genus has no Pi values).
warnings.filterwarnings(action='ignore', message='Mean of empty slice')
buffalo_pi = buffalo[["FullSpecies", "Pi"]].set_index("FullSpecies")
# Left join on species name; species absent from the Buffalo table get Pi = NaN.
meta = meta.set_index("Species").join(buffalo_pi).reset_index()

# Genus-level fallback: mean Pi (ignoring NaN) over all Buffalo species of a genus.
pi_per_genus = buffalo.groupby("genus").apply(lambda grp: np.nanmean(grp["Pi"])).to_dict()
# Species-level Pi where available, otherwise the genus mean, otherwise NaN.
# NOTE(review): `fill_genus_pi` is built here but not written back to `meta`
# in the cells visible in this notebook -- confirm whether that is intended.
fill_genus_pi = [
    row.Pi if not math.isnan(row.Pi) else pi_per_genus.get(row.Genus, np.nan)
    for _, row in meta.iterrows()
]

# Propagate pi if not available from any source
close_pi = []
for i,r in meta.iterrows():
    if not math.isnan(r["Pi"]) or not math.isnan(r["Heterozygosity"]):
        # At least one estimate exists: average whatever is available.
        close_pi.append(np.nanmean([r.Pi, r.Heterozygosity]))
        continue
    # Neither estimate exists: borrow one from the phylogenetically closest
    # species of the same superorder that has a value.
    subd = meta[meta.SuperOrder==r.SuperOrder]
    current_sp = r.Species
    distance_df = distance_within_order(subd, current_sp, nwkete)
    best_pi = choose_pi_over_het(distance_df)
    close_pi.append(best_pi)
meta["Pi_het"] = close_pi

Output table

In [13]:
# Write the current `meta` table to disk (pandas' default also writes the
# DataFrame index as the first column).
# NOTE(review): the execution counter (In [13]) suggests this cell was last run
# after the pruning cell (In [12]), so the file on disk may hold the pruned
# table rather than the full one -- confirm which is wanted.
meta.to_csv("../data/Mammals_assembly_metadata.csv")

Decision tree used to choose which species of a pair to drop when their sequence divergence is at or below the limit (15 × π, with a floor of 2% sequence divergence)

In [11]:
# Ordered tie-breaking criteria for a pair of species, evaluated top to bottom.
# "Max" means the larger value is preferred; any other value names the
# preferred category for that column.
decision_tree = {"dnm":"Max",
                 "AssemblyStatus":"Chromosome",
                 "Sex":"female",
                 "ScaffoldN50":"Max",
                 # Use the zero-filled trait count (`Anage_ntraits`, built when the
                 # metadata is loaded) rather than the raw `AnAge_ntraits` column,
                 # whose NaNs would otherwise take part in the ranking.
                 "Anage_ntraits":"Max",
                }

Prune species

In [12]:
pi_modifier = 15        # divergence limit is pi_modifier x diversity ...
min_divergence = 0.02   # ... but never below 2% sequence divergence
pi_data = dict(zip(meta["Species"], meta["Pi_het"]))
species_out = []

# The same full tree is used for every superorder, so parse it once
# instead of once per loop iteration.
# (Per-order alternative: Tree(order_trees[order_trees.order==superorder]["tree"].values[0]))
nwkete = Tree("./../trees/241-mammalian-2020v2.phast-242.nh", format=1)

# Iterate over all superorders
for superorder,df in meta.groupby("SuperOrder"):

    # Automatically exclude male-based assemblies
    species_out = species_out + list(df[df.Sex=="male"].Species.values)
    subd = df[df.Sex!="male"]

    # Iterate over pairwise species comparisons
    # NOTE(review): a species already marked for removal still takes part in
    # later comparisons and can cause a third species to be dropped --
    # confirm that this is the intended thinning behaviour.
    for c in itertools.combinations(subd.Species, r=2):
        sp1, sp2 = c
        seqdiv = nwkete.get_distance(sp1, sp2)

        # Limit = max(2%, pi_modifier x the larger diversity of the pair).
        # Unknown (NaN) diversities are ignored so that the 2% floor still
        # applies; a NaN limit would make `seqdiv <= limit` always False and
        # silently exempt the pair from pruning.
        known_pi = [pi for pi in (pi_data[sp1], pi_data[sp2]) if not math.isnan(pi)]
        limit = max([min_divergence] + [pi * pi_modifier for pi in known_pi])

        # If sequence divergence is not enough
        if seqdiv <= limit:
            pair_df = subd[subd.Species.isin(c)]
            worst_species = prune_species(pair_df, decision_tree)
            # prune_species can return None when no criterion separates the
            # pair; do not record that as a species.
            if worst_species is not None and worst_species not in species_out:
                species_out.append(worst_species)

meta = meta[~meta.Species.isin(species_out)].reset_index(drop=True)

Plot with final selection of species

In [15]:
# Draw the tree of the final species selection with aligned tip labels.
mammals = toytree.tree("./../trees/Mammals.nwk", tree_format=1)

style = {
    "edge_style": {"stroke-width": 1},
    "tip_labels_style": {"font-size": "10px", "-toyplot-anchor-shift": "3px"},
}
canvas, axes, mark = mammals.draw(
    tip_labels=list(mammals.get_tip_labels()),
    tip_labels_align=True,
    height=500,
    **style
);
#toyplot.pdf.render(canvas, "pdfs/mammals.pdf")

Selection criteria plots

In [135]:
# Full tree, with every excluded species greyed out.
nwk = toytree.tree("./../trees/241-mammalian-2020v2.phast-242.nh", tree_format=1)

# Pool all exclusion lists into one flat list.
# NOTE(review): `sp_far_chrom` and `sp_failed_thersh` are not defined in any
# cell visible in this notebook -- they must come from kernel state.
outs = list(itertools.chain(
    males,
    species_complex_XY,
    sp_far_chrom,
    sp_failed_thersh,
    species_out,
))
colors_tips = []
for sp in nwk.get_tip_labels():
    colors_tips.append("#F4F4F4" if sp in outs else "black")

style = {
    "edge_style": {"stroke-width": 1},
    "tip_labels_style": {"font-size": "4px", "-toyplot-anchor-shift": "3px"},
}
canvas, axes, mark = nwk.draw(
    tip_labels_colors=colors_tips,
    tip_labels_align=False,
    height=800,
    **style
);
toyplot.pdf.render(canvas, "pdfs/males_complexXY_farfromchrom_qual_ancpoly.pdf")

New column showing if included in thinned set or not

For the controlled-thinned set, remove species that are far away from a chromosome-level assembly

In [22]:
nwkete = Tree("./../trees/241-mammalian-2020v2.phast-242.nh", format=1)
keep_species = []
# Candidates: species flagged as included that also pass both assembly
# contiguity thresholds (ScaffoldN50 > 350e3 and ContigN50 > 25e3).
subd = meta[(meta.Included==1) & (meta.ScaffoldN50>350e3) & (meta.ContigN50>25e3)]
div_limit = 0.15

# Chromosome-level assemblies act as reference points. This set does not
# change inside the loop, so build it once.
chrom_species = list(subd[subd.AssemblyStatus=="Chromosome"].Species)

for i,r in subd.iterrows():
    if r.AssemblyStatus=="Chromosome":
        keep_species.append(r.Species)
    # Keep a non-chromosome assembly only if at least one chromosome-level
    # species lies within div_limit of it. `any` over an empty reference set
    # is False, so the species is simply not kept, whereas taking `min` of an
    # empty collection of distances fails.
    elif any(nwkete.get_distance(r.Species, ref) <= div_limit
             for ref in chrom_species if ref != r.Species):
        keep_species.append(r.Species)

## Remove bubalis
keep_species = [sp for sp in keep_species if "Bubalus" not in sp]

#tree = Tree("./../trees/241-mammalian-2020v2.phast-242.nh", format=1)
#tree.prune(keep_species, preserve_branch_length=True)
#with open("./../data/Mammals2.txt","w") as of:
#    of.write(",".join(keep_species) + "\n")
#tree.write(outfile="./../trees/Mammals2.nwk",format=5)