### Thin Zoonomia dataset

Import modules

In [1]:
from ete3 import Tree
import math
import pandas as pd
import numpy as np
import warnings

Functions

In [2]:
def distance_within_order(df, current_sp, nwkete):
    '''Return every other species in ``df`` ranked by tree distance to ``current_sp``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns "Species", "Heterozygosity" and "Pi".
    current_sp : str
        Name of the focal species; its own row is left out of the result.
    nwkete : object
        Anything exposing ``get_distance(name_a, name_b)``; the notebook
        passes an ete3 ``Tree``.

    Returns
    -------
    pandas.DataFrame
        Columns "Species", "distance", "Heterozygosity", "Pi", sorted by
        ascending distance with a fresh 0..n-1 index. When no other species
        is present the frame is empty but still carries those four columns.
    '''
    columns = ["Species", "distance", "Heterozygosity", "Pi"]
    rows = [
        [species, nwkete.get_distance(current_sp, species), het, pi]
        for species, het, pi in zip(df["Species"], df["Heterozygosity"], df["Pi"])
        if species != current_sp
    ]
    # Naming the columns in the constructor keeps the empty case valid;
    # assigning them afterwards fails when there are no rows.
    distance_pi_het = pd.DataFrame(rows, columns=columns)
    # A stable sort keeps equidistant species in their input order.
    distance_pi_het = distance_pi_het.sort_values(by="distance", kind="mergesort")
    return distance_pi_het.reset_index(drop=True)

def choose_pi_over_het(df):
    '''Pick a substitute diversity value for a species that has none of its own.

    Rows of ``df`` are visited in their existing order. The first row with a
    usable value wins: within a row, Pi (Buffalo) is preferred over
    Heterozygosity (Zoonomia). Returns NaN when no row has either value.
    '''
    for _, candidate in df.iterrows():
        # Column order encodes the preference: Pi first, then Heterozygosity.
        for column in ("Pi", "Heterozygosity"):
            value = candidate[column]
            if not math.isnan(value):
                return value
    return np.nan

Read data

In [6]:
# Assembly metadata, one row per species
meta = pd.read_csv("./../data/zoonomia_assembly_metadata.csv")

# De novo mutation (DNM) estimates: keep the species name and the per-parent counts
dnms = pd.read_csv("./../data/dnm_est.tsv", sep="\t")[["Species", "Mat DNMs", "Pat DNMs"]]
# Drop everything from the first "(" onwards and join the words with underscores
dnms["Species"] = [sp.split("(")[0].rstrip().replace(" ", "_") for sp in dnms.Species]
dnms["Total_dnms"] = dnms["Mat DNMs"] + dnms["Pat DNMs"]
dnm_counts = dict(zip(dnms["Species"], dnms["Total_dnms"]))

# Attach DNM totals (0 when a species has no estimate) and a NaN-free trait count
meta["dnm"] = [dnm_counts.get(sp, 0) for sp in meta.Species]
meta["Anage_ntraits"] = [n_traits if not math.isnan(n_traits) else 0 for n_traits in meta.AnAge_ntraits]
males = list(meta.loc[meta.Sex == "male", "Species"].values)

# Pi from Vince Buffalo's paper; the table stores log10 values, so convert back
buffalo = pd.read_csv("./../data/buffalo_variation.tsv", sep="\t")
buffalo["FullSpecies"] = ["_".join(sp.split()) for sp in buffalo["species"]]
buffalo["Pi"] = [10**d for d in buffalo["log10_diversity"]]

# Left-join Buffalo's Pi onto the metadata by species name
buffalo_pi = buffalo[["FullSpecies", "Pi"]].set_index("FullSpecies")
meta = meta.set_index("Species").join(buffalo_pi).reset_index()

# Fill in a diversity value for every species, recording where it came from
nwkete = Tree("./../trees/241-mammalian-2020v2.phast-242.nh", format=1)
close_pi = []
source_pi = []
propagated = 0
one_pihet = 0
both_pihet = 0
for i, r in meta.iterrows():
    pi_missing = math.isnan(r["Pi"])
    het_missing = math.isnan(r["Heterozygosity"])
    if pi_missing and het_missing:
        # No value from either source: borrow from the nearest species
        # in the same SuperOrder that has one
        subd = meta[meta.SuperOrder == r.SuperOrder]
        current_sp = r.Species
        distance_df = distance_within_order(subd, current_sp, nwkete)
        best_pi = choose_pi_over_het(distance_df)
        pi_het_value = best_pi
        pi_het_label = "Propagated"
        propagated += 1
    elif pi_missing or het_missing:
        # Exactly one source available: nanmean reduces to that single value
        pi_het_value = np.nanmean([r.Pi, r.Heterozygosity])
        pi_het_label = "Zoonomia_heterozygosity" if pi_missing else "Buffalo_pi"
        one_pihet += 1
    else:
        # Both sources available: average them
        pi_het_value = np.nanmean([r.Pi, r.Heterozygosity])
        pi_het_label = "Zoonomia_heterozygosity|Buffalo_pi"
        both_pihet += 1
    close_pi.append(pi_het_value)
    source_pi.append(pi_het_label)

print("Propagated: {}".format(propagated))
print("One source: {}".format(one_pihet))
print("Mean of both: {}".format(both_pihet))

meta["Pi_het"] = close_pi
meta["Pi_het_source"] = source_pi

Propagated: 109
One source: 127
Mean of both: 5


Output table

In [9]:
# Write the annotated metadata table; to_csv runs with its defaults,
# so the row index is written as the first column of the file.
output_path = "../data/Mammals_assembly_metadata.csv"
meta.to_csv(output_path)