Compile the file `sample_taxa.tsv`, which contains, for each sample, the scientific name of each taxon occurring in it (after decontamination), with no filtering beyond LCA. Taxa are grouped according to the categories needed for co-occurrence analysis: Chordates, (Other) Eukaryotes, Viruses, Bacteria, Other.

In [1]:
import pandas as pd
import numpy as np
from ete3 import NCBITaxa
import json
ncbi = NCBITaxa()

In [2]:
def group_at_higher_tax(df, taxonomic_group, family_name, taxid_colname="taxid", family_colname="family"):
    """Label every row of *df* with the first higher-level taxon its taxid belongs to.

    Args:
        df: DataFrame whose column ``taxid_colname`` holds NCBI taxids.
            It is modified in place (the ``family_colname`` column is added
            or overwritten).
        taxonomic_group: Iterable of higher-level taxa, each given either as a
            name (``str``) or as a taxid. Order matters: the first entry found
            in a taxid's lineage wins, so list more specific taxa before the
            broader taxa that contain them.
        family_name: Label assigned to taxids whose lineage contains none of
            the taxa in ``taxonomic_group``.
        taxid_colname: Name of the column in ``df`` that holds the taxids.
        family_colname: Name of the column that receives the labels.

    Returns:
        The same ``df`` object, with ``family_colname`` filled in.
    """
    # Map each group label (its name) to its taxid, preserving the caller's order.
    groups = {}
    for taxon in taxonomic_group:
        if isinstance(taxon, str):
            groups[taxon] = ncbi.get_name_translator([taxon])[taxon][0]
        else:
            # The translator takes a list of taxids, not a bare taxid.
            groups[ncbi.get_taxid_translator([taxon])[taxon]] = taxon

    # Resolve each distinct taxid once rather than once per row.
    family_assignments = {}
    for taxid in df[taxid_colname].unique():
        lineage = ncbi.get_lineage(taxid)
        family_assignments[taxid] = next(
            (label for label, group_taxid in groups.items() if group_taxid in lineage),
            family_name,
        )

    df[family_colname] = df[taxid_colname].map(family_assignments)
    return df


def taxid2name(taxid):
    """Return the name that ``ncbi.get_taxid_translator`` reports for *taxid*.

    Raises:
        KeyError: if the translator result has no entry for ``taxid``.
    """
    names_by_taxid = ncbi.get_taxid_translator([taxid])
    return names_by_taxid[taxid]

def filter_taxon(taxid, parent):
    """Return True when *parent* appears in the lineage of *taxid*, else False."""
    return parent in ncbi.get_lineage(taxid)
    
def name2taxid(name):
    """Return the first taxid that ``ncbi.get_name_translator`` lists for *name*.

    Raises:
        KeyError: if the translator result has no entry for ``name``.
    """
    taxids_by_name = ncbi.get_name_translator([name])
    matching_taxids = taxids_by_name[name]
    return matching_taxids[0]

# Read in data

In [3]:
# Root directory of the input tables. The S3 URI assigned first is immediately
# overridden by the local path on the next line, so only the local path is used.
# NOTE(review): presumably the local directory mirrors the bucket — confirm.
s3_dir = "s3://czbiohub-mosquito"
s3_dir = "../../data/s3"

# Both inputs are tab-separated files whose first row is the header.
lca_reads = pd.read_csv(s3_dir + "/contig_quality_concat/lca_decontam.tsv", sep="\t", header=0)
viral_reads = pd.read_csv(s3_dir + "/contig_quality_concat/viral_decontam.tsv", sep="\t", header=0)

In [4]:
# Preview the first rows of the LCA read table.
lca_reads.head()

Unnamed: 0,sample,taxid,reads
0,CMS001_001_Ra_S1,1,4198
1,CMS001_002_Ra_S1,1,2460
2,CMS001_005_Ra_S3,1,4292
3,CMS001_006_Ra_S5,1,907
4,CMS001_007_Ra_S12,1,387


In [5]:
# Preview the first rows of the viral read table.
viral_reads.head()

Unnamed: 0,sample,poly_group,reads
0,CMS001_002_Ra_S1,1,1907
1,CMS001_039_Ra_S9,1,12696
2,CMS002_056a_Rb_S9_L004,1,15621
3,CMS001_039_Ra_S9,2,4383
4,CMS002_032a_Rb_S166_L004,2,89007


In [6]:
# Load the virus metadata; the file only needs to stay open while it is parsed.
# NOTE(review): assumes the JSON is an object keyed by poly_group whose values
# are records with 'name' and 'provisional_name' fields — confirm.
with open("../../data/darkmatter/virus.json", "r") as f:
    viral_json = pd.DataFrame(json.load(f)).T

# After the transpose the index holds the top-level JSON keys; cast them to
# integers so they can be joined against viral_reads['poly_group'].
# The builtin `int` replaces `np.int`, which was removed in NumPy 1.24.
viral_json['poly_group'] = viral_json.index.astype(int)

# Use the provisional name wherever the final name is still the 'TBD' placeholder.
viral_json['sci_name'] = viral_json.apply(
    lambda row: row['name'] if row['name'] != 'TBD' else row['provisional_name'],
    axis=1,
)

In [7]:
# Attach a scientific name to every viral read row via its poly_group
# (left join, so every row of viral_reads is kept), then keep only the
# sample and name columns.
viral_taxa = (
    viral_reads
    .merge(
        viral_json[['poly_group', 'sci_name']],
        on='poly_group',
        how='left',
    )
    [['sample', 'sci_name']]
)
# Every row in this table is a virus by construction.
viral_taxa['group'] = 'Viruses'

In [8]:
# Translate every taxid to its name (one taxid2name call per row).
lca_reads['sci_name'] = lca_reads['taxid'].apply(taxid2name)
# Assign each taxid to the first listed taxon found in its lineage, falling
# back to "Other". "Chordata" is listed before "Eukaryota" so that chordates
# get their own label; later entries only catch what earlier ones did not.
# group_at_higher_tax adds the 'group' column to lca_reads in place and
# returns that same DataFrame.
lca_taxa = group_at_higher_tax(lca_reads,
                                 taxonomic_group=["Chordata", "Eukaryota", "Bacteria", "Viruses"],
                                 family_name="Other",
                                 taxid_colname="taxid",
                                 family_colname="group")
# Keep only the columns shared with viral_taxa.
lca_taxa = lca_taxa[['sample', 'sci_name', 'group']]



In [9]:
# Stack the LCA-derived rows and the viral rows into one table; the original
# row indices of both inputs are kept as-is.
sample_taxa = pd.concat([lca_taxa, viral_taxa])

In [10]:
# Write the combined table as a tab-separated file, without the index column.
sample_taxa.to_csv("../../figures/fig4/sample_taxa.tsv", sep="\t", index=False)