In [208]:
import pandas as pd
import numpy as np
from ete3 import NCBITaxa
import boto3
import tempfile
import subprocess
import os
import io
import re
import time
import json
ncbi = NCBITaxa()

In [209]:
# Load the combined contig table (the same path the curation cells write with
# all_contigs_df.to_csv) and keep only the metazoan rows and the columns used here.
# The original call passed `dtype={''}` -- a set holding an empty string, which is
# not a valid dtype specification -- so column dtypes are left to pandas' inference.
df = pd.read_csv('../../figures/fig3/all_contigs_df.tsv', sep='\t')
df = df[df['group'] == 'Metazoa']
df = df[['sample', 'sci_name', 'read_prop', 'taxid']]

In [210]:
def taxid2name(taxid):
    """Return the name that the notebook-level ``ncbi`` handle gives for one taxid.

    The taxid is looked up through ``ncbi.get_taxid_translator`` and the entry
    keyed by the same taxid is returned; a taxid missing from the translator's
    result raises ``KeyError``.
    """
    names_by_taxid = ncbi.get_taxid_translator([taxid])
    return names_by_taxid[taxid]

In [211]:
def get_rows_taxid (df, taxid, taxid_colname="taxid"):
    """Keep the rows of ``df`` whose taxid has ``taxid`` somewhere in its lineage.

    Parameters
    ----------
    df : pandas.DataFrame, or a single taxid
        Table to filter. For backward compatibility a non-DataFrame value is
        treated as one taxid, and the function then returns a bool telling
        whether ``taxid`` is in that taxid's lineage.
    taxid : int or str
        Clade to test against. A string is treated as a name and translated
        with ``ncbi.get_name_translator`` (first hit is used).
    taxid_colname : str
        Column of ``df`` holding the taxids.

    Returns
    -------
    pandas.DataFrame (filtered rows, original index kept) or bool.

    Note: a later cell of this notebook defines ``get_rows_taxid`` again with an
    extra ``identity_qcov_cutoff`` parameter; whichever cell ran last wins.
    """
    # Translate a name to a taxid once, up front. The original did this inside
    # the per-row callback, repeating the identical lookup for every row.
    if isinstance(taxid, str):
        taxid = ncbi.get_name_translator([taxid])[taxid][0]
    if not isinstance(df, pd.DataFrame):
        # Scalar mode: `df` is itself a taxid.
        return (taxid in ncbi.get_lineage(df))
    in_clade = df[taxid_colname].apply(lambda row_taxid: taxid in ncbi.get_lineage(row_taxid))
    # astype(bool) keeps the mask boolean even when `df` has no rows, so an
    # empty input yields an empty frame with all of its columns.
    outdf = df[in_clade.astype(bool)]

    return outdf

There is a partial order on taxa: a < b if a is an ancestor of b. For a sample with one true bloodmeal, we would like to report a taxon t that is comparable to every other taxon reported for that sample, i.e., t < b or b < t for every reported taxon b.

Put another way, a taxon t is admissible if t is in lineage(b) or b is in lineage(t) for every reported taxon b.

In [212]:
def get_least_admissable_taxon(taxa,
                               exclude = None, # drop these taxa
                               parent=None, # only keep children of the parent
                               antiparent=None # only keep taxa not in lineage of parent
                              ):
    """Return the deepest taxon shared by the lineages of all leaf taxa in ``taxa``.

    Parameters
    ----------
    taxa : iterable of taxids
        Taxa reported for one sample.
    exclude : iterable of taxids, optional
        Taxa to drop before anything else. Never modified by this function.
    parent : taxid, optional
        If given, only taxa whose lineage contains ``parent`` are kept.
    antiparent : taxid, optional
        If given, every taxon in the lineage of ``antiparent`` is excluded and
        only taxa whose lineage does not contain ``antiparent`` are kept.

    Returns
    -------
    The taxid of the last entry common to the lineages of all leaf taxa, or
    ``0`` when no taxa remain after filtering.
    """
    # Work on a private set. The original extended the caller's list (and the
    # shared mutable default) in place, so exclusions leaked between calls.
    excluded = set(exclude) if exclude is not None else set()
    if antiparent:
        excluded.update(ncbi.get_lineage(antiparent))

    lineages = [ncbi.get_lineage(taxid) for taxid in taxa if taxid not in excluded]

    if parent:
        lineages = [lineage for lineage in lineages if parent in lineage]
    if antiparent:
        lineages = [lineage for lineage in lineages if antiparent not in lineage]
    # Nothing left to summarise (previously an unfiltered empty input crashed
    # in set.intersection()).
    if len(lineages) == 0:
        return 0

    # A leaf is a taxon that never appears before the last position of any
    # kept lineage, i.e. it is not an ancestor of another reported taxon.
    all_taxa = {taxid for lineage in lineages for taxid in lineage}
    non_leaf_taxa = {taxid for lineage in lineages for taxid in lineage[:-1]}
    leaf_taxa = sorted(all_taxa - non_leaf_taxa)

    leaf_lineages = [ncbi.get_lineage(taxid) for taxid in leaf_taxa]
    leaf_common_ancestors = set.intersection(*[set(l) for l in leaf_lineages])
    # Walk one leaf lineage in its own order and take the last shared entry.
    lca = [taxid for taxid in leaf_lineages[0] if taxid in leaf_common_ancestors][-1]

    return lca

In [213]:
# Taxid passed below as `parent` (keep only taxa under it) and as `antiparent`
# (keep only taxa outside it). Presumably the NCBI taxid for Vertebrata -- confirm.
vertebrate_taxid = 7742

In [214]:
# Taxids dropped from every sample: passed as `exclude` in the per-sample loops below.
# Written as floats, matching the float64 taxid column shown in this notebook's outputs.
one_off_exclusions = [314146.0, 9544.0]

In [205]:
# Least admissible taxon per sample, restricted to taxa under `vertebrate_taxid`.
vertebrate_records = []
for sample in df['sample'].unique():
    sample_taxids = df.loc[df['sample'] == sample, 'taxid']
    taxid = get_least_admissable_taxon(sample_taxids,
                                       exclude=one_off_exclusions,
                                       parent=vertebrate_taxid)
    # A falsy result (0) means no taxon qualified for this sample.
    if taxid:
        name = taxid2name(taxid)
    else:
        name = "NA"
    vertebrate_records.append({'sample': sample, 'name': name, 'taxid': taxid})
least_admissable_taxa = pd.DataFrame(vertebrate_records).sort_values('sample')
least_admissable_taxa[['sample', 'taxid', 'name']].to_csv(
    '../../figures/fig3/vertebrate_lat.csv')

In [206]:
# Least admissible taxon per sample, restricted to taxa outside `vertebrate_taxid`.
least_admissable_taxa = []
for sample in df['sample'].unique():
    # Pass a fresh copy of the exclusion list on every iteration: a callee that
    # extends `exclude` in place when `antiparent` is set would otherwise grow
    # the shared `one_off_exclusions` list with each sample.
    taxid = get_least_admissable_taxon(df[df['sample'] == sample]['taxid'],
                                      exclude = list(one_off_exclusions),
                                      antiparent = vertebrate_taxid)
    # A falsy result (0) means no taxon qualified for this sample.
    name = taxid2name(taxid) if taxid else "NA"
    least_admissable_taxa.append({'sample': sample, 'name': name, 'taxid': taxid})
least_admissable_taxa = pd.DataFrame(least_admissable_taxa).sort_values('sample')
least_admissable_taxa[['sample', 'taxid', 'name']].to_csv(
    '../../figures/fig3/non_vertebrate_lat.csv')

## Manual exclusions

In [149]:
# Taxids to ignore when summarising sample CMS001_032_Ra_S7 (passed as `exclude` below).
sample_032_exclude = [6231.0, 85066.0,7209.0, 1237907.0, 32561.0]

In [150]:
# First two taxids recorded for sample CMS001_032_Ra_S7.
df.loc[df['sample'] == 'CMS001_032_Ra_S7', 'taxid'].head(2)

2497    1237907.0
2505     175121.0
Name: taxid, dtype: float64

In [152]:
# Least admissible taxon for sample CMS001_032_Ra_S7 after dropping its manual exclusions.
sample_032_taxids = df.loc[df['sample'] == 'CMS001_032_Ra_S7', 'taxid']
taxid2name(get_least_admissable_taxon(sample_032_taxids, exclude=sample_032_exclude))

[175121.0, 44394.0, 39616.0, 9126.0, 8930.0, 8825.0, 8782.0]


'Neognathae'

In [160]:
# Taxids to ignore for sample CMS001_003_Ra_S2, then the least admissible taxon without them.
sample_003_exclude = [6913.0, 74971.0, 336591.0]
sample_003_taxids = df.loc[df['sample'] == 'CMS001_003_Ra_S2', 'taxid']
taxid2name(get_least_admissable_taxon(sample_003_taxids, exclude=sample_003_exclude))

[1437010.0, 314145.0, 72004.0, 35500.0, 9938.0, 9913.0, 9895.0, 9881.0, 9880.0, 9871.0]


'Pecora'

In [161]:
# Least admissible taxon among the excluded taxids of sample 003 themselves.
taxid2name(get_least_admissable_taxon(taxa=sample_003_exclude))

[6913.0, 74971.0, 336591.0]


'Larinioides cornutus'

In [165]:
# Taxids to ignore for sample CMS001_058_Ra_S9, then the least admissible taxon without them.
sample_058_exclude = [637355.0]
sample_058_taxids = df.loc[df['sample'] == 'CMS001_058_Ra_S9', 'taxid']
taxid2name(get_least_admissable_taxon(sample_058_taxids, exclude=sample_058_exclude))

[1437010.0, 337687.0, 79684.0, 39087.0]


'Microtus ochrogaster'

In [112]:
# Name of the least admissible taxon for sample CMS001_009_Ra_S13 (no `exclude` argument passed).
taxid2name(get_least_admissable_taxon(df[df['sample'] == 'CMS001_009_Ra_S13']['taxid']))

'Pecora'

In [86]:
# Write the table sorted by sample, then by read proportion (both ascending), for manual inspection.
df.sort_values(by= ['sample','read_prop']).to_csv('../../figures/fig3/metazoa_sorted.csv')

In [82]:
# Rows of sample CMS001_009_Ra_S13, ordered by increasing read proportion.
df[df['sample'] == 'CMS001_009_Ra_S13'].sort_values('read_prop')

Unnamed: 0,sample,sci_name,read_prop,taxid
2454,CMS001_009_Ra_S13,Laurasiatheria,4.1e-05,314145.0
2448,CMS001_009_Ra_S13,Boreoeutheria,0.000172,1437010.0
2471,CMS001_009_Ra_S13,Cervidae,0.000252,9850.0
2464,CMS001_009_Ra_S13,Bovidae,0.001577,9895.0


In [61]:
# Per taxon name: summed read proportion and number of rows; show the 20 most frequent.
x = df.groupby('sci_name')['read_prop'].agg(['sum', 'count'])
x.sort_values('count').tail(20)

Unnamed: 0_level_0,sum,count
sci_name,Unnamed: 1_level_1,Unnamed: 2_level_1
Zonotrichia albicollis,0.014802,3
Passeroidea,0.000353,3
Cetartiodactyla,0.000485,3
Cervidae,0.000307,4
Odocoileus,0.001616,4
Amniota,0.07588,4
Procyon lotor,0.011936,4
Carnivora,0.010041,4
Caniformia,0.017999,5
Odocoileus hemionus,0.00025,5


In [40]:
# Wide table: one row per sample, one column per taxon name, read proportions as values;
# sample/taxon combinations with no row are filled with 0. Rebinds `x`.
x = df.pivot(index='sample', columns='sci_name', values='read_prop').fillna(0)


sci_name,Acarus,Amniota,Anas platyrhynchos,Anatidae,Aphelocoma,Araneidae,Araneomorphae,Archosauria,Arvicolinae,Aves,...,Sturnus vulgaris,Sus scrofa,Taeniopygia guttata,Trachelas tranquillus,Troglodytinae,Ursus arctos horribilis,Zonotrichia,Zonotrichia albicollis,Zonotrichia leucophrys,unclassified Mimetus
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
CMS001_001_Ra_S1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
CMS001_003_Ra_S2,0.0,0.0,0.0,0.0,0.0,0.000143,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
CMS001_004_Ra_S2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.007357,0.0,0.0,0.0,0.0
CMS001_005_Ra_S3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
CMS001_008_Ra_S3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
CMS001_009_Ra_S13,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
CMS001_010_Ra_S1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
CMS001_011_Ra_S4,0.0,0.023222,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
CMS001_012_Ra_S4,0.0,2.7e-05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
CMS001_013_Ra_S5,0.0,0.002039,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [28]:
# Rows whose taxid has 314145 in its lineage (listed as Laurasiatheria in the output below).
get_rows_taxid(df, 314145.0)

Unnamed: 0,sample,sci_name,read_prop,taxid
2452,CMS001_003_Ra_S2,Laurasiatheria,0.003818,314145.0
2453,CMS001_008_Ra_S3,Laurasiatheria,0.000337,314145.0
2454,CMS001_009_Ra_S13,Laurasiatheria,0.000041,314145.0
2455,CMS001_008_Ra_S3,Bubalus bubalis,0.000271,89462.0
2457,CMS001_003_Ra_S2,Bos mutus,0.000079,72004.0
...,...,...,...,...
2701,CMS001_012_Ra_S4,Odocoileinae,0.009814,9881.0
2702,CMS001_012_Ra_S4,Odocoileus virginianus texanus,0.000129,9880.0
2703,CMS001_012_Ra_S4,Odocoileus hemionus,0.000089,9872.0
2704,CMS001_012_Ra_S4,Odocoileus,0.000016,9871.0


In [None]:
# (scratch cell: the incomplete expression `pd.` was a syntax error and has been removed)

In [31]:
# NOTE(review): this cell raises AttributeError -- DataFrame has no `gather` method
# (see the recorded error below). Presumably a reshape such as `DataFrame.melt` was
# intended -- confirm before re-running.
get_rows_taxid(df, 314145.0).gather()

AttributeError: 'DataFrame' object has no attribute 'gather'

In [10]:
# Keep only the four columns used in this analysis.
df = df[['sample', 'sci_name', 'read_prop', 'taxid']]

In [24]:
def get_viral_family_df (row_x):
    """Expand one viral record into a table with one row per contig identifier.

    ``row_x`` must provide ``"segments"``, ``"family"`` and ``"taxid"``. Only
    the contigs of the first segment (first key of ``segments``) are used. Each
    contig identifier is split on ``"|"``; the first two pieces become the
    ``sample`` and ``contig_name`` columns, and ``sci_name`` / ``taxid`` are
    filled from the record's ``family`` / ``taxid``.
    """
    segments = row_x["segments"]
    first_key = list(segments.keys())[0]
    id_parts = [contig_id.split("|") for contig_id in segments[first_key]["contigs"]]
    return (
        pd.DataFrame(id_parts)
        .assign(sci_name=row_x["family"], taxid=row_x["taxid"])
        .rename(columns={0:"sample", 1:"contig_name"})
    )

def get_rows_taxid (df, taxid, taxid_colname="taxid", identity_qcov_cutoff=None):
    """Keep the rows of ``df`` whose taxid has ``taxid`` somewhere in its lineage.

    Parameters
    ----------
    df : pandas.DataFrame, or a single taxid
        Table to filter. For backward compatibility a non-DataFrame value is
        treated as one taxid, and the function then returns a bool telling
        whether ``taxid`` is in that taxid's lineage.
    taxid : int or str
        Clade to test against. A string is treated as a name and translated
        with ``ncbi.get_name_translator`` (first hit is used).
    taxid_colname : str
        Column of ``df`` holding the taxids.
    identity_qcov_cutoff : float, optional
        When given, additionally keep only rows whose ``identity_qcov`` column
        is at least this value.

    Returns
    -------
    pandas.DataFrame (filtered rows, original index kept) or bool.
    """
    # Translate a name to a taxid once, up front. The original did this inside
    # the per-row callback, repeating the identical lookup for every row.
    if isinstance(taxid, str):
        taxid = ncbi.get_name_translator([taxid])[taxid][0]
    if (not isinstance(df, pd.DataFrame)):
        # Scalar mode: `df` is itself a taxid.
        return (taxid in ncbi.get_lineage(df))
    in_clade = df[taxid_colname].apply(lambda row_taxid: taxid in ncbi.get_lineage(row_taxid))
    # astype(bool) keeps the mask boolean even when `df` has no rows.
    outdf = df[in_clade.astype(bool)]
    if identity_qcov_cutoff is not None:
        outdf = outdf[outdf["identity_qcov"]>=identity_qcov_cutoff]
    return (outdf)


# def filter_by_criterion (df, colname, minthreshold, bysample=True):
#     if bysample:
#         sums = df.groupby(["sample", "taxid"])[colname].sum().reset_index()
#         sums["tokeep"] = sums[colname] >= minthreshold
#         df = pd.merge(df, sums.drop(columns=colname), how="left")
#         df = df[df["tokeep"]!=False].drop(columns="tokeep")
#     else:
#         df = df[df[colname] >= minthreshold]
#     return (df)


def check_if_in_any_taxid(taxid, taxid_list):
    """Map ``taxid`` onto a member of ``taxid_list``.

    Returns ``taxid`` itself when it is in ``taxid_list``; otherwise the first
    entry of ``ncbi.get_lineage(taxid)`` (in the order the lineage is returned)
    that is in ``taxid_list``; otherwise ``np.nan``.
    """
    if taxid in taxid_list:
        return (taxid)
    for lineage_taxid in ncbi.get_lineage(taxid):
        if lineage_taxid in taxid_list:
            return (lineage_taxid)
    return (np.nan)



def clean_taxids(df, taxids, root_taxid, taxid_colname="taxid"):
    """Collapse each row's taxid onto one of ``taxids``, falling back to ``root_taxid``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a taxid column. It is modified in place (the taxid column is
        overwritten and a ``sci_name`` column is written) and also returned.
    taxids : list of int or list of str
        Taxa of interest, all given as taxids or all given as names.
    root_taxid : int or str
        Taxid (or name) assigned to rows that match none of ``taxids``.
    taxid_colname : str
        Name of the taxid column in ``df``.

    Returns
    -------
    pandas.DataFrame : ``df`` itself.
    """
    # Build {taxid: name} for the taxa of interest. The length guard avoids the
    # IndexError the original raised on an empty `taxids`.
    if len(taxids) > 0 and isinstance(taxids[0], str):
        taxids = dict(zip([ncbi.get_name_translator([x])[x][0] for x in taxids], taxids))
    else:
        taxids = ncbi.get_taxid_translator(taxids)
    # Same {taxid: name} shape for the fallback taxon.
    if isinstance(root_taxid, str):
        root_taxid_number = ncbi.get_name_translator([root_taxid])[root_taxid][0]
        root_taxid = {root_taxid_number:root_taxid}
    else:
        root_taxid = ncbi.get_taxid_translator([root_taxid])
    fallback_taxid = list(root_taxid.keys())[0]
    collapsed = df[taxid_colname].apply(check_if_in_any_taxid, taxid_list=taxids)
    # The original filled the unmatched rows with chained indexing
    # (df[col][mask] = value), which pandas does not guarantee to write through
    # to `df`; fillna on the new column is unambiguous.
    df[taxid_colname] = collapsed.fillna(fallback_taxid)
    names_by_taxid = dict(taxids)
    names_by_taxid.update(root_taxid)
    df["sci_name"] = df[taxid_colname].apply(lambda x: names_by_taxid[x])
    return (df)
    
    
def get_summary_table (df, colnames, metric):
    """Sum ``metric`` within each group of ``colnames`` and sort the result.

    Parameters
    ----------
    df : pandas.DataFrame
    colnames : list of str
        Grouping columns.
    metric : str or list of str
        Column(s) to sum within each group.

    Returns
    -------
    pandas.DataFrame with one row per group, sorted in descending order by the
    grouping columns followed by the metric column(s), with ``"sample"`` left
    out of the sort keys.
    """
    summary = df.groupby(colnames)[metric].sum().reset_index()
    metric_cols = metric if isinstance(metric, list) else [metric]
    sort_order = colnames+metric_cols
    # "sample" groups the rows but is not a sort key. The original removed it
    # unconditionally and raised ValueError when `colnames` had no "sample".
    if "sample" in sort_order:
        sort_order.remove("sample")
    return (summary.sort_values(by=sort_order, ascending=False))

def correct_read_count(df, decontam_df, taxid, taxid_colname="taxid"):
    """Rescale ``read_prop`` in ``df`` using the read counts in ``decontam_df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Summary table with at least ``sample``, the taxid column, ``read_count``
        and ``read_prop``.
    decontam_df : pandas.DataFrame
        Table with ``sample``, the taxid column and ``reads``.
    taxid : taxid/name, or list of them
        Clade(s) used to select rows of ``decontam_df`` via ``get_rows_taxid``.
    taxid_colname : str
        Name of the taxid column.

    Returns
    -------
    pandas.DataFrame : the rows of ``df`` that merge with the summed
    ``decontam_df`` rows, with ``read_prop`` multiplied by ``reads/read_count``
    and both the ``read_count`` and ``reads`` columns dropped.
    """
    if isinstance(taxid, list):
        # Clades in the list may be nested, so the same decontam row can match
        # several of them. Number the rows uniquely and keep each row once;
        # otherwise its reads would be summed once per matching clade.
        decontam_rows = decontam_df.reset_index(drop=True)
        decontam_table = pd.concat([get_rows_taxid(decontam_rows, taxid=x, taxid_colname=taxid_colname) for x in taxid])
        decontam_table = decontam_table[~decontam_table.index.duplicated(keep="first")]
    else:
        decontam_table = get_rows_taxid(decontam_df, taxid=taxid, taxid_colname=taxid_colname)
    # Group on the configured taxid column (the original hard-coded "taxid").
    decontam_table = decontam_table.groupby(["sample", taxid_colname])["reads"].sum().reset_index()
    outdf = pd.merge(df, decontam_table)
    outdf["read_prop"] = outdf["reads"]/outdf["read_count"]*outdf["read_prop"]
    # NOTE(review): "reads" is dropped before the rename, so the rename is a
    # no-op and the result carries no read-count column. Kept as-is to preserve
    # the output schema -- confirm whether the corrected count should be kept.
    outdf = outdf.drop(columns=["read_count", "reads"]).rename(columns={"reads":"read_count"})
    return (outdf)

    

# Read in data

In [25]:
# Minimum identity_qcov a contig needs to be kept as a high-confidence hit in the cells below.
identity_qcov_threshold = 0.9
# Sample metadata columns carried into the summary tables.
metadata_cols = ["ska_genus", "ska_species", "collected_by"]
# Running tally of row counts recorded at various pipeline steps.
numbers = {}

In [26]:
## Read counts data: per-sample table; its sample_name, nonhost_reads and total_reads
## columns are merged into the metadata in the next cell.
idseq_data = pd.read_csv("../../data/project-mosquito_sample-table.csv", header=0)

In [27]:
## Load metadata and attach the per-sample read totals from idseq_data,
## matching metadata's NewIDseqName to idseq_data's sample_name.
metadata = pd.read_csv("../../data/metadata/CMS001_CMS002_MergedAnnotations.csv", header=0)
metadata = pd.merge(metadata, idseq_data[["sample_name", "nonhost_reads", "total_reads"]], left_on="NewIDseqName", right_on="sample_name")

In [28]:
## Load LCA data
contig_stats_lca = pd.read_csv("s3://czbiohub-mosquito/contig_quality_concat/contig_stats_lca.tsv", sep="\t", header=0)
# identity_qcov = (identity / 100) * align_length / contig_length, capped at 1
aligned_identity = contig_stats_lca["identity"]/100*contig_stats_lca["align_length"]/contig_stats_lca["contig_length"]
contig_stats_lca = contig_stats_lca.assign(identity_qcov=aligned_identity.clip(upper=1))
numbers["total_num_contigs_with_blast_hits"] = len(contig_stats_lca)
# Drop the rows flagged in the "hexapoda" column, then record how many remain
contig_stats_lca = contig_stats_lca.loc[~contig_stats_lca["hexapoda"]]
numbers["total_nonhexapoda_contigs"] = len(contig_stats_lca)
# Attach sample metadata; the left join keeps every contig row
contig_stats_lca = contig_stats_lca.merge(metadata, how="left", left_on="sample", right_on="NewIDseqName")
contig_stats_lca["read_prop"] = contig_stats_lca["read_count"].div(contig_stats_lca["nonhost_reads"])

# Curated list of known viruses

In [29]:
# Information about Baltimore classification of virus family groups
viral_family_groups = pd.read_csv("../../data/virus_family_groups.csv", header=0)
viral_family_groups = viral_family_groups.loc[:, ~viral_family_groups.columns.str.startswith('Unnamed')]
# Convert virus json into data frame (one row per contig of each record's first segment)
with open ("../../data/darkmatter/virus.json", "r") as f:
    viral_json = pd.DataFrame(json.load(f)).T
viral_contigs = pd.concat(viral_json.apply(get_viral_family_df, axis=1).tolist())
numbers["num_viral_contigs"] = len(viral_contigs)
# Add read proportions columns, summed per sample / family / taxid.
# 10239 is presumably the NCBI taxid for Viruses -- confirm.
viral_contig_stats = get_rows_taxid(contig_stats_lca[["sample", "read_prop", "taxid", "contig_name"]], taxid=10239, taxid_colname="taxid")
viral_contigs = pd.merge(viral_contigs, viral_contig_stats).groupby(["sample", "sci_name", "taxid"])["read_prop"].sum().reset_index()
# Attach the per-sample metadata. contig_stats_lca has one row per contig, so
# its sample/metadata columns repeat for every contig of a sample; without
# drop_duplicates() each summed viral row was copied once per such contig.
viral_contigs = pd.merge(viral_contigs, contig_stats_lca[["sample"]+metadata_cols].drop_duplicates())
viral_contigs = pd.merge(viral_contigs, viral_family_groups, left_on="sci_name", right_on="family").drop(columns="family")




# Curate lists of non-viral contigs with high-confidence hits to NCBI records

In [30]:
## Load decontam data
# true_reads: used below to keep only taxids that survived decontamination and to correct read counts
true_reads = pd.read_csv("s3://czbiohub-mosquito/contig_quality_concat/decontam_sample_read_counts.tsv", sep="\t", header=0)
# contam_reads: its "reads" column is summed per sample below and deducted from nonhost_reads
contam_reads = pd.read_csv("s3://czbiohub-mosquito/contig_quality_concat/sample_contamination.tsv", sep="\t", header=0)

In [31]:
# Deduct from the nonhost reads per sample the number of reads that were removed due to suspected contamination.
# The original assigned the result of pd.merge(...) back into the frame. A merge returns a
# freshly indexed frame that omits samples absent from contam_reads, so the index-aligned
# assignment could attach values to the wrong rows. Mapping the per-sample totals keeps
# every row in place; samples with no recorded contamination deduct 0.
# NOTE: re-running this cell deducts the contamination a second time.
contam_reads_per_sample = contam_reads.groupby("sample")["reads"].sum()
contig_stats_lca["nonhost_reads"] = contig_stats_lca["nonhost_reads"] - contig_stats_lca["sample"].map(contam_reads_per_sample).fillna(0)
contig_stats_lca["read_prop"] = contig_stats_lca["read_count"]/contig_stats_lca["nonhost_reads"]

In [32]:
# Only keep hits that are almost identical to a known wolbachia sequence
# (rows under wolbachia_taxid with identity_qcov >= identity_qcov_threshold)
wolbachia_taxid = 952
wolbachia_contigs = get_rows_taxid(contig_stats_lca, taxid=wolbachia_taxid, taxid_colname="taxid", identity_qcov_cutoff=identity_qcov_threshold)
# Only keep wolbachia groups that were not removed by the decontamination step
wolbachia_contigs = wolbachia_contigs[wolbachia_contigs["taxid"].isin(true_reads["taxid"])]
# Count is taken here, after filtering and before aggregation, so statement order matters
numbers["total_wolbachia_contigs"] = len(wolbachia_contigs)
# Sum read_count and read_prop per mosquito genus, species, collector, sample and taxid;
# get_summary_table sorts descending on every one of those columns except "sample"
wolbachia_contigs = get_summary_table(wolbachia_contigs, colnames=["ska_genus", "ska_species", "collected_by", "sample", "taxid"], metric=["read_count", "read_prop"])
# Correct read counts after decontamination
wolbachia_contigs = correct_read_count(wolbachia_contigs, true_reads, wolbachia_taxid, "taxid")
#wolbachia_contigs = wolbachia_contigs.assign(taxid=wolbachia_taxid)
# Create sci_name column to denote that this table contains Wolbachia samples
wolbachia_contigs = wolbachia_contigs.assign(sci_name=ncbi.get_taxid_translator([wolbachia_taxid])[wolbachia_taxid])



In [33]:
# Only keep hits that are almost identical to a known metazoan sequence
# (taxon_group "Metazoa" with identity_qcov >= identity_qcov_threshold)
metazoan_contigs = contig_stats_lca[(contig_stats_lca["taxon_group"]=="Metazoa") & (contig_stats_lca["identity_qcov"]>=identity_qcov_threshold)]
# Only keep metazoan groups that were not removed by the decontamination step
metazoan_contigs = metazoan_contigs[metazoan_contigs["taxid"].isin(true_reads["taxid"])]
# Convert taxids to those of interest (disabled: each row keeps its own taxid and name)
#metazoan_contigs = clean_taxids(metazoan_contigs, taxids=["Leporidae", "Muroidea", "Homo sapiens", "Carnivora", "Odocoileinae", "Bovidae", "Neognathae"], root_taxid="Metazoa")
metazoan_contigs = metazoan_contigs.assign(sci_name=metazoan_contigs["taxid"].apply(lambda x: ncbi.get_taxid_translator([x])[x]))
# Count is taken here, after filtering and before aggregation, so statement order matters
numbers["total_metazoan_contigs"] = len(metazoan_contigs)
# Sum read_count and read_prop per metadata columns, sample, taxid and sci_name;
# get_summary_table sorts descending on every one of those columns except "sample"
metazoan_contigs = get_summary_table(metazoan_contigs, colnames=metadata_cols+["sample", "taxid", "sci_name"], metric=["read_count", "read_prop"])
# Correct read counts after decontamination
metazoan_contigs = correct_read_count(metazoan_contigs, true_reads, taxid="Metazoa", taxid_colname="taxid")
    



In [34]:
# Only keep hits that are almost identical to a known eukaryote sequence
# (taxon_group "Eukaryota" with identity_qcov >= identity_qcov_threshold)
# eukaryotic_taxids: clade names used to select the decontam rows in correct_read_count below
eukaryotic_taxids = ["Fungi", "Trypanosomatidae", "Plasmodium", "Stramenopiles", "Viridiplantae", "Euglenozoa", "Alveolata"]
eukaryotic_contigs = contig_stats_lca[(contig_stats_lca["taxon_group"]=="Eukaryota") & (contig_stats_lca["identity_qcov"]>=identity_qcov_threshold)]
# Only keep eukaryotic groups that were not removed by the decontamination step
eukaryotic_contigs = eukaryotic_contigs[eukaryotic_contigs["taxid"].isin(true_reads["taxid"])]
# Convert taxids to those of interest (disabled: each row keeps its own taxid and name)
#eukaryotic_contigs = clean_taxids(eukaryotic_contigs, taxids=eukaryotic_taxids, root_taxid="Eukaryota")
eukaryotic_contigs = eukaryotic_contigs.assign(sci_name=eukaryotic_contigs["taxid"].apply(lambda x: ncbi.get_taxid_translator([x])[x]))
# Count is taken here, after filtering and before aggregation, so statement order matters
numbers["total_eukaryotic_contigs"] = len(eukaryotic_contigs)
# Sum read_count and read_prop per metadata columns, sample, taxid and sci_name;
# get_summary_table sorts descending on every one of those columns except "sample"
eukaryotic_contigs = get_summary_table(eukaryotic_contigs, colnames=metadata_cols+["sample", "taxid", "sci_name"], metric=["read_count", "read_prop"])
# Correct read counts after decontamination
eukaryotic_contigs = correct_read_count(eukaryotic_contigs, true_reads, taxid=eukaryotic_taxids, taxid_colname="taxid") 




# Output file

In [35]:
# Stack the four curated tables into one, tagging every row with its group.
# `sort` is passed explicitly: the warning recorded under this cell shows the tables'
# columns are not aligned, and pandas' default handling of that case changed between
# versions. sort=False keeps the columns in order of first appearance.
all_contigs_df = pd.concat([wolbachia_contigs.assign(group="Wolbachia"),
                            viral_contigs.assign(group="Virus"),
                            metazoan_contigs.assign(group="Metazoa"),
                            eukaryotic_contigs.assign(group="Other Eukaryotes")],
                           sort=False)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  after removing the cwd from sys.path.


In [36]:
# Persist the combined table as TSV without the index; this is the same path that the
# pd.read_csv call building `df` for the bloodmeal analysis reads back.
all_contigs_df.to_csv("../../figures/fig3/all_contigs_df.tsv", sep="\t", index=False)