### Thin B10k dataset

Import modules

In [1]:
from ete3 import Tree
import toytree
import toyplot
import toyplot.pdf
import itertools
import math
import xml.etree.ElementTree as ET
import random
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import warnings
# Turn off pandas' chained-assignment check (value None = no warning, no error)
# for the column assignments done on filtered frames in this notebook.
pd.options.mode.chained_assignment = None

Functions

In [10]:
def distance_within_order(df, current_sp, nwkete):
    """Rank every other species in ``df`` by tree distance to ``current_sp``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with at least a ``Species`` and a ``Pi`` column.
    current_sp : str
        Name of the focal species; rows whose ``Species`` equals it are
        left out of the result.
    nwkete : object
        Tree exposing ``get_distance(name_a, name_b)``; it is called once
        per remaining species as ``get_distance(current_sp, species)``.

    Returns
    -------
    pandas.DataFrame
        Columns ``Species``, ``distance`` and ``Pi``, sorted by ascending
        ``distance`` and re-indexed from 0. When ``df`` holds no species
        other than ``current_sp`` the frame is empty but still carries
        the three columns.
    """
    records = [
        (species, nwkete.get_distance(current_sp, species), pi)
        for species, pi in zip(df["Species"], df["Pi"])
        if species != current_sp
    ]
    # Naming the columns in the constructor keeps the empty case valid;
    # assigning ``.columns`` afterwards fails on a zero-column frame.
    distance_pi_het = pd.DataFrame(records, columns=["Species", "distance", "Pi"])
    return distance_pi_het.sort_values(by="distance").reset_index(drop=True)

def keep_best_species(d, tree):
    """Pick one species from ``d`` by walking the criteria of ``tree`` in order.

    ``tree`` maps a column name of ``d`` to a rule:

    * ``"Max"`` / ``"Min"`` -- unless every row shares the same value in
      that column, return the ``Species`` of the row with the largest /
      smallest value.
    * any other value -- rows whose column equals that value are set
      aside; if exactly one row remains, its ``Species`` is returned.
      Otherwise the next criterion is tried on the full, unfiltered ``d``.

    Returns ``None`` when no criterion singles out a species.
    """
    for column, rule in tree.items():
        if rule in ("Max", "Min"):
            # A column on which all candidates tie cannot discriminate.
            if len(d[column].unique()) == 1:
                continue
            ranked = d.sort_values(by=column, ascending=(rule != "Max"))
            return ranked["Species"].values[0]

        # Exclusion rule: keep the rows that do NOT match the given value.
        survivors = d[d[column] != rule]
        if len(survivors) == 1:
            return survivors["Species"].values[0]

    return None

Read input metadata and phylogeny of B10k 

In [11]:
# Load the B10k metadata spreadsheet
meta = pd.read_excel("../data/metadata_b10k.xlsx")

# Read the NCBI Entrez dump. The file content is wrapped in a synthetic
# <root> element before parsing so that every record becomes a child of
# a single node.
with open("../data/b10k_NCBI_Entrez.xml") as handle:
    xml = handle.read()
root = ET.fromstring("<root>" + xml + "</root>")

# Flatten each record into a dict holding the fields listed below; a field
# that never shows up in a record keeps the placeholder string "nan".
features_of_interest = ["AssemblyAccession", "AssemblyName", "Organism", "AssemblyStatus", "Coverage", "ContigN50", "ScaffoldN50", "Sex","SpeciesName"]
xml_l = []
for record in root:
    current_species = dict.fromkeys(features_of_interest, "nan")
    for child in record:
        if child.tag in features_of_interest:
            current_species[child.tag] = child.text
        if child.tag == "Biosource":
            # Sex is also read from the direct children of <Biosource>;
            # the last <Sex> element found there wins.
            for sex_node in child.findall("Sex"):
                current_species["Sex"] = sex_node.text
    xml_l.append(current_species)

ncbi_meta = pd.DataFrame(xml_l)

# Read the phylogeny (newick file, parsed by ete3 with format=1)
b10k = Tree("./../trees/363-avian-2020-phast.nh", format=1)

# Merge both datasets: the NCBI records are joined onto the B10k metadata,
# matching "Latin name" against "SpeciesName". B10k columns that clash with
# an NCBI column name receive the "b10k" suffix.
b10k_by_name = meta.set_index("Latin name")
ncbi_by_name = ncbi_meta.set_index("SpeciesName")
merged_meta = b10k_by_name.join(ncbi_by_name, lsuffix="b10k")

# The N50 values were parsed from XML text, so convert them to numbers
for n50_column in ("ScaffoldN50", "ContigN50"):
    merged_meta[n50_column] = pd.to_numeric(merged_meta[n50_column])

# Species key: the Latin name with whitespace collapsed to underscores
merged_meta["Species"] = ["_".join(latin_name.split()) for latin_name in merged_meta.index]

# Tree of Sex: flag every species whose genus has at least one Aves entry
# with a karyotype other than plain "ZW".
treeofsex = pd.read_csv("./../data/tree_of_sex.csv")
column = 'Karyotype (ZO,ZW,XY,XO,WO,homomorphic,complex XY,complex ZW)'
is_aves = treeofsex["Higher taxonomic group"] == "Aves"
# NOTE(review): a missing karyotype also compares as != "ZW", so such rows
# are counted as complex here -- confirm this is intended.
not_plain_zw = treeofsex[column] != "ZW"
complex_ZW = treeofsex[is_aves & not_plain_zw]
genus_complex_ZW = [sp.split()[0] for sp in complex_ZW["Species"]]
# Set copy used only for the membership test below
flagged_genera = set(genus_complex_ZW)
merged_meta["Complex_ZW"] = [sp.split("_")[0] in flagged_genera for sp in merged_meta.Species]

# Add max pi
#merged_meta["Pi_het"] = 9e-3

# Add dnms for flycatchers: 33 for Ficedula albicollis, 0 for every other species
merged_meta["dnm"] = [33 if sp == "Ficedula_albicollis" else 0 for sp in merged_meta.Species]

# Crested ibis sex is not well annotated (https://doi.org/10.1186/s13059-014-0557-1),
# so it is overridden to "female"; every other species keeps its NCBI value.
merged_meta["Sex"] = [
    "female" if species == "Nipponia_nippon" else sex
    for species, sex in zip(merged_meta["Species"], merged_meta["Sex"])
]

# Get rid of the duplicated Numida line (first occurrence of each Species is kept)
deduplicated = merged_meta.drop_duplicates(subset=["Species"])
merged_meta = deduplicated.reset_index(drop=True)

In [12]:
# Attach the per-species Pi column of the Bruniche-Olsen variation table,
# matching its "species" column against merged_meta's "Species".
variation = pd.read_csv("../data/bruniche-olsen_variation.csv")
pi_by_species = variation[["species", "Pi"]].set_index("species")
merged_meta = merged_meta.set_index("Species").join(pi_by_species).reset_index()
#pi_per_genus = variation.groupby("genus").apply(lambda x: np.nanmean(x["Pi"])).to_dict()

In [13]:
# Keep only the species that are leaves of the B10k phylogeny
species_in_tree = b10k.get_leaf_names()
present_in_tree = merged_meta["Species"].isin(species_in_tree)
merged_meta = merged_meta[present_in_tree].reset_index(drop=True)

In [14]:
#merged_meta["genus"] = [sp.split("_")[0] for sp in merged_meta.Species]

In [15]:
# Mean of species genus
#pi_per_genus = variation.groupby("genus").apply(lambda x: np.nanmean(x["Pi"])).to_dict()
#fill_genus_pi = []
#for i,r in merged_meta.iterrows():
#    if math.isnan(r.Pi):
#        if r.genus in pi_per_genus:
#            fill_genus_pi.append(pi_per_genus[r.genus])
#        else:
#            fill_genus_pi.append(np.nan)
#    else:
#        fill_genus_pi.append(r.Pi)
#merged_meta["Pi"] = fill_genus_pi

# Propagate pi if not available from any source: a species without Pi takes
# the value of the closest species in the tree that has Pi > 0.
close_pi = []
source_pi = []
for _, row in merged_meta.iterrows():
    if not math.isnan(row["Pi"]):
        # Measured value: keep it and record where it came from
        close_pi.append(row.Pi)
        source_pi.append("Bruniche-olsen_pi")
        continue

    # Candidates are all other species in the table, nearest first
    neighbours = distance_within_order(merged_meta, row.Species, b10k)
    donors = neighbours[neighbours.Pi > 0]
    # NOTE(review): raises IndexError if no other species has Pi > 0
    close_pi.append(donors.Pi.values[0])
    source_pi.append("Propagated")

merged_meta["Pi"] = close_pi
merged_meta["Pi_het"] = merged_meta["Pi"]
merged_meta["Pi_het_source"] = source_pi

Output table

In [18]:
# Write the final per-species table; no `index` argument is passed, so the
# pandas default applies and the row index is written as the first column.
merged_meta.to_csv("../data/Birds_assembly_metadata.csv")