In [1]:
import pandas as pd
import numpy as np
from ete3 import NCBITaxa
import boto3
import tempfile
import subprocess
import os
import io
import re
import time
import json
# Shared ete3 NCBITaxa handle; every lineage, name and rank lookup in this notebook goes through it.
ncbi = NCBITaxa()

In [2]:
# Load the per-contig table, keep only rows whose group is Metazoa, and
# narrow it to the columns used below.
# `taxid` is read with the builtin `int`: the `np.int` alias was deprecated in
# NumPy 1.20 and removed in 1.24, and it was only ever an alias for `int`.
df = pd.read_csv('../../figures/fig3/all_contigs_df.tsv', sep='\t',
                 dtype={'taxid': int})
df = df[df['group'] == 'Metazoa']
df = df[['sample', 'sci_name', 'read_prop', 'taxid']]

In [3]:
def taxid2name(taxid):
    """Return the name that the NCBI taxonomy handle reports for one taxid.

    The translator is queried with a one-element list and the result is
    indexed by `taxid`, so a taxid missing from the returned mapping
    surfaces as a lookup error rather than a default value.
    """
    names_by_taxid = ncbi.get_taxid_translator([taxid])
    return names_by_taxid[taxid]

There is a partial order on taxa: a < b if a is an ancestor of b. For a sample with one true bloodmeal, we would like to report a taxon t that is related to every other taxon reported for that sample, i.e., t < b or b < t for every reported taxon b.

Put another way, a taxon t is admissible if t is in lineage(b) or b is in lineage(t) for all b.

In [4]:
def get_least_admissable_taxon(taxa,
                               exclude=None,  # drop these taxa
                               exclude_children=None,  # drop these taxa and their descendants
                               parent=None,  # only keep taxa that have parent in their lineage
                               antiparent=None  # drop antiparent's ancestors and its descendants
                               ):
    """Return the lowest common ancestor of the leaf taxa that survive filtering.

    Parameters
    ----------
    taxa : iterable of int
        Taxids observed for one sample.
    exclude : iterable of int, optional
        Taxids dropped outright (exact match only; their descendants are kept).
    exclude_children : iterable of int, optional
        Any taxon whose lineage contains one of these taxids is dropped. A
        lineage ends with the taxon itself, so the listed taxids are dropped too.
    parent : int, optional
        When truthy, only taxa whose lineage contains `parent` are kept.
    antiparent : int, optional
        When truthy, every taxid on the lineage of `antiparent` is excluded,
        and so is any taxon whose lineage contains `antiparent`.

    Returns
    -------
    int
        The deepest taxid shared by the lineages of all remaining leaf taxa,
        or 0 when no taxon survives the filters.
    """
    # Build fresh sets so that neither the caller's list nor a shared default
    # is ever mutated (the previous `exclude=[]` default grew on every
    # `antiparent` call).
    excluded = set(exclude) if exclude is not None else set()
    if antiparent:
        excluded.update(ncbi.get_lineage(antiparent))
    blocked = set(exclude_children) if exclude_children is not None else set()

    lineages = [ncbi.get_lineage(taxid) for taxid in taxa if taxid not in excluded]
    lineages = [lineage for lineage in lineages if blocked.isdisjoint(lineage)]

    if parent:
        lineages = [lineage for lineage in lineages if parent in lineage]
    if antiparent:
        lineages = [lineage for lineage in lineages if antiparent not in lineage]

    # Nothing left to summarise: return the 0 sentinel in every case, instead
    # of only when `parent`/`antiparent` was supplied (the unfiltered path used
    # to crash in set.intersection with no arguments).
    if not lineages:
        return 0

    # A taxon is a leaf when it never appears as a proper ancestor of another
    # surviving taxon, i.e. it is absent from every lineage's non-final part.
    internal_taxa = {taxid for lineage in lineages for taxid in lineage[:-1]}
    leaf_lineages = [lineage for lineage in lineages if lineage[-1] not in internal_taxa]

    shared_ancestors = set.intersection(*(set(lineage) for lineage in leaf_lineages))
    # Lineages run root -> taxon, so the last shared entry is the deepest one.
    lca = [taxid for taxid in leaf_lineages[0] if taxid in shared_ancestors][-1]

    return lca

In [5]:
# Taxids used as filters for get_least_admissable_taxon further down:
# vertebrate_taxid is passed as `parent` / `antiparent`, primate_taxid in `exclude_children`.
# Presumably the NCBI taxonomy IDs for Vertebrata and Primates -- verify against the database.
vertebrate_taxid = 7742
primate_taxid = 9443

In [6]:
# Passed in `exclude` for the vertebrate calls below, so this exact taxid is dropped before lineages are compared.
euarchontoglires_taxid = 314146

In [7]:
# Vertebrate call per sample: restrict to taxa under vertebrate_taxid, drop the
# Euarchontoglires taxid itself and everything under primate_taxid, then
# summarise what is left with get_least_admissable_taxon (0 means no call).
least_admissable_taxa = []
for sample in df['sample'].unique():
    sample_taxids = df.loc[df['sample'] == sample, 'taxid']
    taxid = get_least_admissable_taxon(
        sample_taxids,
        exclude=[euarchontoglires_taxid],
        exclude_children=[primate_taxid],
        parent=vertebrate_taxid,
    )
    name = "NA" if not taxid else taxid2name(taxid)
    least_admissable_taxa.append(dict(sample=sample, name=name, taxid=taxid))

# Collect into a frame ordered by sample with a fixed column order.
column_order = ['sample', 'taxid', 'name']
least_admissable_taxa = pd.DataFrame(least_admissable_taxa).sort_values('sample')[column_order]

In [8]:
# Preview the first few per-sample vertebrate calls.
least_admissable_taxa.head()

Unnamed: 0,sample,taxid,name
39,CMS001_001_Ra_S1,35500,Pecora
0,CMS001_003_Ra_S2,35500,Pecora
21,CMS001_004_Ra_S2,379584,Caniformia
6,CMS001_005_Ra_S3,1437010,Boreoeutheria
3,CMS001_008_Ra_S3,35500,Pecora


In [9]:
# Coarse categories used to bin the per-sample calls. The name translator
# returns a list of taxids per name; the first one is kept and the mapping is
# inverted so that `partition` maps taxid -> category name.
partition = {
    taxids[0]: category
    for category, taxids in ncbi.get_name_translator(
        "Pecora Carnivora Homininae Rodentia Leporidae Aves".split()
    ).items()
}

def get_category(taxid):
    """Map a taxid to the `partition` category found on its lineage.

    Returns None for a falsy taxid (such as the 0 "no call" sentinel), the
    category name of the first `partition` key present in the taxid's
    lineage, and the string 'NA' when no partition key is on the lineage.
    """
    if not taxid:
        return None
    ancestors = ncbi.get_lineage(taxid)
    matching_categories = (
        category for key, category in partition.items() if key in ancestors
    )
    return next(matching_categories, 'NA')

In [10]:
# Show the taxonomic rank of each partition taxid; per the output below, the categories sit at different ranks.
ncbi.get_rank(partition.keys())

{8782: 'class',
 9979: 'family',
 9989: 'order',
 33554: 'order',
 35500: 'infraorder',
 207598: 'subfamily'}

In [11]:
# Work on an independent copy: a plain alias would let the column assignment
# in the next cell silently add a 'category' column to least_admissable_taxa
# as well (and can trigger pandas' SettingWithCopyWarning on a sliced frame).
vertebrate_calls = least_admissable_taxa.copy()

In [12]:
# Label every call with its partition category.
vertebrate_calls['category'] = vertebrate_calls['taxid'].apply(get_category)

# Drop calls that matched no category ('NA'), keep the export columns, and
# order by sample. Rows whose category is None are not removed by this filter.
has_category = vertebrate_calls['category'] != 'NA'
vertebrate_calls = (
    vertebrate_calls.loc[has_category, ['sample', 'category', 'name']]
    .sort_values('sample')
)

vertebrate_calls.to_csv('../../figures/fig4/vertebrate_lat.csv', index=False)

In [13]:
# Preview the categorized calls that were written to vertebrate_lat.csv.
vertebrate_calls[['sample', 'category', 'name']].head()

Unnamed: 0,sample,category,name
39,CMS001_001_Ra_S1,Pecora,Pecora
0,CMS001_003_Ra_S2,Pecora,Pecora
21,CMS001_004_Ra_S2,Carnivora,Caniformia
3,CMS001_008_Ra_S3,Pecora,Pecora
1,CMS001_009_Ra_S13,Pecora,Pecora


In [14]:
# Non-vertebrate call per sample: same summary as the vertebrate pass, but with
# vertebrate_taxid as `antiparent`, so taxa on or under the vertebrate lineage
# are removed before the least admissable taxon is computed (0 means no call).
least_admissable_taxa = []
for sample in df['sample'].unique():
    sample_taxids = df.loc[df['sample'] == sample, 'taxid']
    taxid = get_least_admissable_taxon(sample_taxids, antiparent=vertebrate_taxid)
    name = "NA" if not taxid else taxid2name(taxid)
    least_admissable_taxa.append(dict(sample=sample, name=name, taxid=taxid))

least_admissable_taxa = pd.DataFrame(least_admissable_taxa).sort_values('sample')

# Export with a fixed column order and without the frame index.
column_order = ['sample', 'taxid', 'name']
least_admissable_taxa[column_order].to_csv(
    '../../figures/fig4/non_vertebrate_lat.csv', index=False)