In [602]:
import pandas as pd
import numpy as np
from ete3 import NCBITaxa
import boto3
import tempfile
import subprocess
import os
import io
import re
import time
import json
import math
ncbi = NCBITaxa()

In [639]:
def get_viral_family_df (row_x):
    """Build a per-contig table for one virus record.

    Parameters
    ----------
    row_x : mapping or pandas.Series
        Must provide "segments" (a dict whose values each hold a "contigs"
        list of "|"-separated strings), "family", "taxid",
        "provisional_name" and "name".

    Returns
    -------
    pandas.DataFrame
        One row per contig string of the FIRST segment only. The first two
        "|"-separated fields become "sample" and "contig_name"; "family",
        "taxid" and "sci_name" are constant columns. "sci_name" is the
        provisional name when it is not null, otherwise "name".
    """
    segments = row_x["segments"]
    # Only the first segment listed in the record is used.
    first_segment_key = list(segments.keys())[0]
    split_contigs = []
    for contig_id in segments[first_segment_key]["contigs"]:
        split_contigs.append(contig_id.split("|"))
    contig_table = pd.DataFrame(split_contigs).rename(columns={0: "sample", 1: "contig_name"})
    contig_table = contig_table.assign(family=row_x["family"], taxid=row_x["taxid"])
    provisional_name = row_x["provisional_name"]
    # "name" is only looked up when no provisional name is available.
    sci_name = row_x["name"] if pd.isnull(provisional_name) else provisional_name
    return contig_table.assign(sci_name=sci_name)

def get_rows_taxid (df, taxid, taxid_colname="taxid", identity_qcov_cutoff=None):
    """Select rows whose taxid has `taxid` in its NCBI lineage.

    The function has two modes, chosen by the type of `df`:

    * `df` is a DataFrame: return the rows whose `taxid_colname` value has
      `taxid` in its lineage, optionally also requiring
      `identity_qcov >= identity_qcov_cutoff`.
    * `df` is anything else: it is treated as a single taxid and the function
      returns True/False for "`taxid` is in the lineage of `df`". This mode is
      what the DataFrame mode applies to each row.

    Parameters
    ----------
    df : pandas.DataFrame or taxid
    taxid : int or str
        Ancestor to look for; a string is resolved with
        `ncbi.get_name_translator` (first match is used).
    taxid_colname : str
        Column of `df` holding the taxids (DataFrame mode only).
    identity_qcov_cutoff : float or None
        Minimum value of the "identity_qcov" column; None disables the filter.

    Raises
    ------
    KeyError
        If `taxid` is a name that the NCBI database cannot translate.
    """
    # Resolve a scientific name ONCE. Previously the name was forwarded to the
    # per-row call and translated again for every row of the table.
    if isinstance(taxid, str):
        taxid = ncbi.get_name_translator([taxid])[taxid][0]
    if (not isinstance(df, pd.DataFrame)):
        return (taxid in ncbi.get_lineage(df))
    outdf = df[df[taxid_colname].apply(get_rows_taxid, taxid=taxid)]
    if identity_qcov_cutoff is not None:
        outdf = outdf[outdf["identity_qcov"]>=identity_qcov_cutoff]
    return (outdf)


# def filter_by_criterion (df, colname, minthreshold, bysample=True):
#     if bysample:
#         sums = df.groupby(["sample", "taxid"])[colname].sum().reset_index()
#         sums["tokeep"] = sums[colname] >= minthreshold
#         df = pd.merge(df, sums.drop(columns=colname), how="left")
#         df = df[df["tokeep"]!=False].drop(columns="tokeep")
#     else:
#         df = df[df[colname] >= minthreshold]
#     return (df)


def check_if_in_any_taxid(taxid, taxid_list):
    """Map `taxid` onto a member of `taxid_list`.

    Returns `taxid` itself when it is in `taxid_list`; otherwise the first
    entry of `ncbi.get_lineage(taxid)` that is in `taxid_list`; otherwise
    `np.nan`. `taxid_list` may be any container supporting `in` (a dict is
    matched on its keys).
    """
    if taxid in taxid_list:
        return (taxid)
    # Walk the lineage in the order ete3 returns it and stop at the first hit
    # (presumably root-first, so the most root-ward match wins -- verify).
    for ancestor in ncbi.get_lineage(taxid):
        if ancestor in taxid_list:
            return (ancestor)
    return (np.nan)



def clean_taxids(df, taxids, root_taxid, taxid_colname="taxid"):
    """Collapse the taxids of `df` onto a fixed set of taxa of interest.

    Each value of `df[taxid_colname]` is replaced by the matching taxon from
    `taxids` (see `check_if_in_any_taxid`); values that match none of them are
    replaced by `root_taxid`. A "sci_name" column with the name of the
    resulting taxon is added.

    Parameters
    ----------
    df : pandas.DataFrame
        Modified IN PLACE (`taxid_colname` is overwritten, "sci_name" added)
        and also returned.
    taxids : non-empty list of int or list of str
        Taxa of interest, all given as taxids or all given as names; the type
        of the first element decides how the list is interpreted.
    root_taxid : int or str
        Fallback taxon, as taxid or name.
    taxid_colname : str
        Column of `df` holding the taxids.

    Returns
    -------
    pandas.DataFrame
        The same object as `df`.
    """
    # Normalise both inputs to {taxid: name} dictionaries.
    if isinstance(taxids[0], str):
        taxids = dict(zip([ncbi.get_name_translator([x])[x][0] for x in taxids], taxids))
    else:
        taxids = ncbi.get_taxid_translator(taxids)
    if isinstance(root_taxid, str):
        root_taxid_number = ncbi.get_name_translator([root_taxid])[root_taxid][0]
        root_taxid = {root_taxid_number:root_taxid}
    else:
        root_taxid = ncbi.get_taxid_translator([root_taxid])
    df[taxid_colname] = df[taxid_colname].apply(check_if_in_any_taxid, taxid_list=taxids)
    # Single .loc assignment instead of chained indexing (df[col][mask] = ...),
    # which triggers SettingWithCopyWarning and may write to a temporary copy.
    df.loc[df[taxid_colname].isnull(), taxid_colname] = list(root_taxid.keys())[0]
    taxids.update(root_taxid)
    df["sci_name"] = df[taxid_colname].apply(lambda x: taxids[x])
    return (df)
    
    
def get_summary_table (df, colnames, metric):
    """Sum `metric` per group of `colnames` and sort the result.

    Parameters
    ----------
    df : pandas.DataFrame
    colnames : list of str (or a single str)
        Grouping columns.
    metric : str or list of str
        Column(s) to sum within each group.

    Returns
    -------
    pandas.DataFrame
        One row per group, sorted in descending order by the grouping columns
        followed by the metric columns, with "sample" left out of the sort
        keys. `colnames` and `metric` are not modified.

    Notes
    -----
    If `df` has a "read_prop" column containing zeros, `df` is printed as a
    diagnostic before summarising.
    """
    if isinstance(colnames, str):
        colnames = [colnames]
    # Diagnostic only; skipped when the column is absent so that tables
    # without "read_prop" can still be summarised.
    if "read_prop" in df.columns and (df["read_prop"]==0).any():
        print (df)
    summed = df.groupby(colnames)[metric].sum().reset_index()
    metric_cols = metric if isinstance(metric, list) else [metric]
    # Filtering (rather than list.remove) avoids a ValueError when "sample"
    # is not one of the grouping columns.
    sort_order = [col for col in list(colnames) + metric_cols if col != "sample"]
    return (summed.sort_values(by=sort_order, ascending=False))

def correct_read_count(df, decontam_df, taxid, taxid_colname="taxid", search_lower_lineages=False):
    """Rescale "read_prop" in `df` to the read counts kept after decontamination.

    For every (sample, taxid) group, "read_prop" is multiplied by
    reads / read_count, where "reads" is the summed decontaminated read count
    from `decontam_df`. Rows of `df` without a decontaminated count are
    dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs "read_count" and "read_prop" plus the columns it shares with the
        decontamination table ("sample" and `taxid_colname`), which are used
        as merge keys.
    decontam_df : pandas.DataFrame
        Needs "sample", `taxid_colname` and "reads".
    taxid : int, str or list of those
        Taxa to correct; names are translated with `ncbi.get_name_translator`.
    taxid_colname : str
        Name of the taxid column in `decontam_df`.
    search_lower_lineages : bool
        If True, take every row of `decontam_df` whose taxid descends from one
        of `taxid` (via `get_rows_taxid`); if False, only exact matches.

    Returns
    -------
    pandas.DataFrame
        `df` restricted to rows with a decontaminated count, with corrected
        "read_prop" and without the "read_count" and "reads" columns. Empty
        when nothing in `decontam_df` matches `taxid` (previously this raised
        "No objects to concatenate").
    """
    if not isinstance(taxid, list):
        taxid = [taxid]
    taxid = [ncbi.get_name_translator([x])[x][0] if isinstance(x, str) else x for x in taxid]
    if search_lower_lineages:
        lineage_matches = [get_rows_taxid(decontam_df, taxid=x, taxid_colname=taxid_colname) for x in taxid]
        decontam_table = pd.concat(lineage_matches) if lineage_matches else decontam_df.iloc[0:0]
    else:
        # One vectorised membership test instead of rebuilding a Python list
        # of the whole column for every taxid. A taxid repeated in `taxid`
        # is counted once here.
        decontam_table = decontam_df[decontam_df[taxid_colname].isin(taxid)]
    if decontam_table.empty:
        # No decontaminated reads for the requested taxa: no row of df survives.
        return (df.iloc[0:0].drop(columns=["read_count"]))
    # Group on the configured taxid column (was hard-coded to "taxid").
    decontam_table = decontam_table.groupby(["sample", taxid_colname])["reads"].sum().reset_index()
    # Merge keys are the columns the two tables have in common.
    outdf = pd.merge(df, decontam_table, how="left")
    # .copy() so that the assignment below does not write into a slice.
    outdf = outdf[outdf["reads"].notnull()].copy()
    outdf["read_prop"] = outdf["reads"]/outdf["read_count"]*outdf["read_prop"]
    # NOTE(review): the original followed this drop with
    # .rename(columns={"reads": "read_count"}), a no-op because "reads" had
    # just been dropped. The returned columns are kept as before; if the
    # corrected count was meant to be returned as "read_count", drop only
    # "read_count" and rename "reads" instead -- confirm intent.
    return (outdf.drop(columns=["read_count", "reads"]))

def group_at_higher_tax(df, taxonomic_group, family_name, taxid_colname="taxid", family_colname="family"):
    """Label each row of `df` with the higher taxonomic group it belongs to.

    Parameters
    ----------
    df : pandas.DataFrame
        Modified IN PLACE (column `family_colname` is added or overwritten)
        and also returned.
    taxonomic_group : iterable of str or int
        Candidate groups, as scientific names or taxids. When a taxid falls
        under several groups, the first one in this order wins.
    family_name : str
        Label for taxids that fall under none of the groups.
    taxid_colname : str
        Column of `df` holding the taxids.
    family_colname : str
        Name of the column that receives the group label.

    Returns
    -------
    pandas.DataFrame
        The same object as `df`.
    """
    # {group name: group taxid}
    groups = {}
    for x in taxonomic_group:
        if isinstance(x, str):
            groups[x] = ncbi.get_name_translator([x])[x][0]
        else:
            # get_taxid_translator expects a list of taxids (as used elsewhere
            # in this notebook); it was previously handed the bare taxid.
            groups[ncbi.get_taxid_translator([x])[x]] = x
    # Resolve every distinct taxid once, then broadcast back onto the rows.
    family_assignments = {}
    for x in df[taxid_colname].unique():
        lin = ncbi.get_lineage(x)
        family_assignments[x] = family_name
        for key, taxid_x in groups.items():
            if taxid_x in lin:
                family_assignments[x] = key
                break
    # taxid_colname is now honoured; the column name was hard-coded to "taxid".
    df[family_colname] = df[taxid_colname].apply(lambda x: family_assignments[x])
    return (df)


# Read in data

In [604]:
# Minimum identity_qcov a contig hit must reach to be kept as a high-confidence hit in the cells below
identity_qcov_threshold = 0.9
# Sample-level metadata columns carried into the summary tables
metadata_cols = ["ska_genus", "ska_species", "collected_by"]
# Running tally of contig counts recorded at various filtering steps (name -> count)
numbers = {}

In [605]:
## Read counts data
# Per-sample table; its "sample", "nonhost_reads" and "total_reads" columns are joined onto the metadata in the next cell.
idseq_data = pd.read_csv("../../data/metadata/idseq_metadata.csv", header=0)

In [606]:
## Load metadata
metadata = pd.read_csv("../../data/metadata/CMS001_CMS002_MergedAnnotations.csv", header=0)
# Attach per-sample read totals. Default inner join: annotation rows whose "NewIDseqName" has no
# matching "sample" in idseq_data are dropped. Both key columns are kept in the result.
metadata = pd.merge(metadata, idseq_data[["sample", "nonhost_reads", "total_reads"]], left_on="NewIDseqName", right_on="sample")



In [607]:
## Load read count data for all contigs
contig_stats_all = pd.read_csv("s3://czbiohub-mosquito/contig_quality_concat/contig_stats_all.tsv", sep="\t", header=0)
# Left join keeps every contig; contigs from samples missing in the metadata get NaN metadata values
contig_stats_all = pd.merge(contig_stats_all, metadata, how="left", on="sample")
# Fraction of the sample's non-host reads assigned to each contig
contig_stats_all["read_prop"] = contig_stats_all["read_count"]/contig_stats_all["nonhost_reads"]


In [608]:
## Load decontam data
# true_reads: read counts retained after decontamination; used below as the decontam table, which needs "sample", "taxid" and "reads" columns
true_reads = pd.read_csv("s3://czbiohub-mosquito/contig_quality_concat/decontam_sample_read_counts.tsv", sep="\t", header=0)
# contam_reads: reads attributed to contamination; summed per "sample" over its "reads" column further down
contam_reads = pd.read_csv("s3://czbiohub-mosquito/contig_quality_concat/sample_contamination.tsv", sep="\t", header=0)

In [609]:
## Load LCA data
# Kept unmodified as the *_raw table so that the processing cell below can be re-run from it
contig_stats_lca_raw = pd.read_csv("s3://czbiohub-mosquito/contig_quality_concat/contig_stats_lca.tsv", sep="\t", header=0)

In [628]:
## Process raw LCA data
# identity_qcov = (percent identity / 100) * (aligned length / contig length), capped at 1
contig_stats_lca = contig_stats_lca_raw.assign(identity_qcov=(contig_stats_lca_raw["identity"]/100*contig_stats_lca_raw["align_length"]/contig_stats_lca_raw["contig_length"]).clip(upper=1))
numbers["total_num_contigs_with_blast_hits"] = len(contig_stats_lca)
# Total reads on contigs flagged as hexapoda, per sample; these are removed from the non-host read total below
hexapoda_read_counts = contig_stats_lca[contig_stats_lca["hexapoda"]].groupby("sample")["read_count"].sum().reset_index()
contig_stats_lca = contig_stats_lca[~(contig_stats_lca["hexapoda"])]
numbers["total_nonhexapoda_contigs"] = len(contig_stats_lca)
contig_stats_lca = pd.merge(contig_stats_lca, metadata, how="left", on="sample")
# Subtract the hexapoda reads from each sample's non-host reads. A sample without any hexapoda
# contig is absent from hexapoda_read_counts and has 0 reads to subtract; previously the missing
# value propagated and turned that sample's nonhost_reads (and read_prop) into NaN.
contig_stats_lca["nonhost_reads"] = contig_stats_lca["nonhost_reads"] - contig_stats_lca["sample"].map(hexapoda_read_counts.set_index("sample")["read_count"]).fillna(0)
contig_stats_lca["read_prop"] = contig_stats_lca["read_count"]/contig_stats_lca["nonhost_reads"]

# Curated list of known viruses

In [640]:
# Information about Baltimore classification of virus family groups
viral_family_groups = pd.read_csv("../../data/virus_family_groups.csv", header=0)
# Drop the unnamed columns pandas creates for header-less CSV columns
viral_family_groups = viral_family_groups.loc[:, ~viral_family_groups.columns.str.startswith('Unnamed')]
# Convert virus json into data frame (one row per virus record)
with open ("../../data/darkmatter/virus.json", "r") as f:
    viral_json = pd.DataFrame(json.load(f)).T
# One row per (sample, contig) of each curated virus
viral_contigs_df = pd.concat(viral_json.apply(get_viral_family_df, axis=1).tolist())
# Count the per-contig table built just above. This used to read `viral_contigs`, which is only
# defined further down in this cell (NameError on a fresh kernel, stale value on a re-run).
numbers["num_viral_contigs"] = len(viral_contigs_df)
# Add read proportions columns, summed per sample and virus
viral_contigs = pd.merge(viral_contigs_df, contig_stats_all[["sample", "read_prop", "contig_name"]], how="left", on=["sample", "contig_name"]).groupby(["sample", "sci_name", "taxid"])["read_prop"].sum().reset_index()
# Add metadata information (first metadata row seen for each sample)
viral_contigs = pd.merge(viral_contigs, contig_stats_lca[["sample"]+metadata_cols].groupby(["sample"]).first().reset_index(), how="left")
# Add the family of each virus. viral_contigs_df has one row per contig, so it is de-duplicated
# first; otherwise every summed (sample, virus) row is repeated once per contig.
viral_contigs = pd.merge(viral_contigs, viral_contigs_df[["sample", "sci_name", "family"]].drop_duplicates(), how="left")
# Add baltimore group information about the viruses
viral_contigs = pd.merge(viral_contigs, viral_family_groups, on="family", how="left")



TypeError: ("ufunc 'isnan' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''", 'occurred at index 1')

In [645]:
# Ad-hoc check: is the "partial" field of the first virus record null?
pd.isnull(viral_json["partial"].iloc[0])

True

# Curate lists of non-viral contigs with high-confidence hits to NCBI records

In [612]:
# Deduct from the nonhost reads per sample the number of reads that were removed due to suspected contamination.
# Samples without an entry in contam_reads have nothing deducted (missing reads count as 0).
# NOTE: this overwrites contig_stats_lca["nonhost_reads"], so running the cell twice deducts the reads twice;
# re-run the "Process raw LCA data" cell first.
contig_stats_lca["nonhost_reads"] = (
    pd.merge(contig_stats_lca, contam_reads.groupby("sample")["reads"].sum().reset_index(), how="left")
    # Vectorised subtraction instead of a Python-level apply over every row
    .pipe(lambda merged: merged["nonhost_reads"] - merged["reads"].fillna(0))
)
contig_stats_lca["read_prop"] = contig_stats_lca["read_count"]/contig_stats_lca["nonhost_reads"]



In [613]:
# Keep hits in the lineage of the Wolbachia taxid that are almost identical to a known sequence
wolbachia_taxid = 952
wolbachia_contigs = get_rows_taxid(
    contig_stats_lca,
    taxid=wolbachia_taxid,
    taxid_colname="taxid",
    identity_qcov_cutoff=identity_qcov_threshold,
)
# Only keep wolbachia groups that were not removed by the decontamination step
# (rows without a match in true_reads come out of the left merge with null "reads")
wolbachia_contigs = wolbachia_contigs.merge(true_reads, how="left")
wolbachia_contigs = wolbachia_contigs[wolbachia_contigs["reads"].notnull()]
numbers["total_wolbachia_contigs"] = wolbachia_contigs.shape[0]
# Summarise by mosquito species, collection site, sample, and taxid, sorted by total read count
wolbachia_contigs = get_summary_table(
    wolbachia_contigs,
    colnames=["ska_genus", "ska_species", "collected_by", "sample", "taxid"],
    metric=["read_count", "read_prop"],
)
# Correct read counts after decontamination
wolbachia_contigs = correct_read_count(
    wolbachia_contigs,
    true_reads,
    wolbachia_taxid,
    "taxid",
    search_lower_lineages=True,
)
# Create sci_name column to denote that this table contains Wolbachia samples
wolbachia_contigs = wolbachia_contigs.assign(
    sci_name=ncbi.get_taxid_translator([wolbachia_taxid])[wolbachia_taxid]
)



In [614]:
# Keep Metazoa hits that are almost identical to a known sequence
metazoan_contigs = contig_stats_lca[
    (contig_stats_lca["taxon_group"] == "Metazoa")
    & (contig_stats_lca["identity_qcov"] >= identity_qcov_threshold)
]
# Only keep metazoan groups that were not removed by the decontamination step
# (rows without a match in true_reads come out of the left merge with null "reads")
metazoan_contigs = metazoan_contigs.merge(true_reads, how="left")
metazoan_contigs = metazoan_contigs[metazoan_contigs["reads"].notnull()]
# Convert taxids to those of interest
#metazoan_contigs = clean_taxids(metazoan_contigs, taxids=["Leporidae", "Muroidea", "Homo sapiens", "Carnivora", "Odocoileinae", "Bovidae", "Neognathae"], root_taxid="Metazoa")
# Attach the scientific name of every taxid
metazoan_contigs = metazoan_contigs.assign(
    sci_name=metazoan_contigs["taxid"].apply(
        lambda one_taxid: ncbi.get_taxid_translator([one_taxid])[one_taxid]
    )
)
numbers["total_metazoan_contigs"] = metazoan_contigs.shape[0]
# Summarise by mosquito species, collection site, sample, and taxid, sorted by total read count
metazoan_contigs = get_summary_table(
    metazoan_contigs,
    colnames=metadata_cols+["sample", "taxid", "sci_name"],
    metric=["read_count", "read_prop"],
)
# Correct read counts after decontamination (exact taxid matches only)
metazoan_taxids = metazoan_contigs["taxid"].unique().tolist()
metazoan_contigs = correct_read_count(
    metazoan_contigs,
    true_reads,
    taxid=metazoan_taxids,
    taxid_colname="taxid",
    search_lower_lineages=False,
)
    

In [615]:
# Keep hits in the Chordata lineage that are almost identical to a known sequence
# chordate_contigs = contig_stats_lca[(contig_stats_lca["taxon_group"]=="Metazoa") & (contig_stats_lca["identity_qcov"]>=identity_qcov_threshold)]
chordate_contigs = get_rows_taxid(
    contig_stats_lca,
    taxid="Chordata",
    taxid_colname="taxid",
    identity_qcov_cutoff=identity_qcov_threshold,
)
# Only keep chordate groups that were not removed by the decontamination step
# (rows without a match in true_reads come out of the left merge with null "reads")
chordate_contigs = chordate_contigs.merge(true_reads, how="left")
chordate_contigs = chordate_contigs[chordate_contigs["reads"].notnull()]
# Attach the scientific name of every taxid
chordate_contigs = chordate_contigs.assign(
    sci_name=chordate_contigs["taxid"].apply(
        lambda one_taxid: ncbi.get_taxid_translator([one_taxid])[one_taxid]
    )
)
numbers["total_chordate_contigs"] = chordate_contigs.shape[0]
# Summarise by mosquito species, collection site, sample, and taxid, sorted by total read count
chordate_contigs = get_summary_table(
    chordate_contigs,
    colnames=metadata_cols+["sample", "taxid", "sci_name"],
    metric=["read_count", "read_prop"],
)
# Correct read counts after decontamination (exact taxid matches only)
chordate_taxids = chordate_contigs["taxid"].unique().tolist()
chordate_contigs = correct_read_count(
    chordate_contigs,
    true_reads,
    taxid=chordate_taxids,
    taxid_colname="taxid",
    search_lower_lineages=False,
)
# In the 'family' column, add information about the higher taxonomic grouping for taxids
chordate_contigs = group_at_higher_tax(
    chordate_contigs,
    taxonomic_group=["Pecora", "Aves", "Carnivora", "Rodentia", "Leporidae"],
    family_name="Other chordates",
    taxid_colname="taxid",
    family_colname="family",
)




In [616]:
# Keep Eukaryota hits that are almost identical to a known sequence
eukaryotic_contigs_df = contig_stats_lca[
    (contig_stats_lca["taxon_group"] == "Eukaryota")
    & (contig_stats_lca["identity_qcov"] >= identity_qcov_threshold)
]
# Only keep eukaryotic groups that were not removed by the decontamination step
# (rows without a match in true_reads come out of the left merge with null "reads")
eukaryotic_contigs_df = eukaryotic_contigs_df.merge(true_reads, how="left")
eukaryotic_contigs_df = eukaryotic_contigs_df[eukaryotic_contigs_df["reads"].notnull()]
# Attach the scientific name of every taxid
eukaryotic_contigs = eukaryotic_contigs_df.assign(
    sci_name=eukaryotic_contigs_df["taxid"].apply(
        lambda one_taxid: ncbi.get_taxid_translator([one_taxid])[one_taxid]
    )
)
numbers["total_eukaryotic_contigs"] = eukaryotic_contigs.shape[0]
# Summarise by mosquito species, collection site, sample, and taxid, sorted by total read count
eukaryotic_contigs = get_summary_table(
    eukaryotic_contigs,
    colnames=metadata_cols+["sample", "taxid", "sci_name"],
    metric=["read_count", "read_prop"],
)
# Correct read counts after decontamination (exact taxid matches only)
eukaryotic_taxids = eukaryotic_contigs["taxid"].unique().tolist()
eukaryotic_contigs = correct_read_count(
    eukaryotic_contigs,
    true_reads,
    taxid=eukaryotic_taxids,
    taxid_colname="taxid",
    search_lower_lineages=False,
)
# In the 'family' column, add information about the higher taxonomic grouping for taxids
eukaryotic_contigs = group_at_higher_tax(
    eukaryotic_contigs,
    taxonomic_group=["Trypanosomatidae", "Apicomplexa", "Amblyosporidae"],
    family_name="Other eukaryotes",
    taxid_colname="taxid",
    family_colname="family",
)


# Output file

In [617]:
# Stack the per-group tables into one table, tagging every row with its group.
# The tables do not share the same columns, so the column order of the result depends on
# pd.concat's `sort` default, which changed between pandas versions (hence the FutureWarning
# this cell used to emit). sort=True pins the sorted column order this notebook was run with.
# NOTE(review): chordate_contigs is not part of this table -- presumably because chordates are
# already covered by the Metazoa table; confirm.
all_contigs_df = pd.concat([wolbachia_contigs.assign(group="Wolbachia"), 
                            viral_contigs.assign(group="Virus"),
                            metazoan_contigs.assign(group="Metazoa"), 
                            eukaryotic_contigs.assign(group="Other Eukaryotes")],
                           sort=True)





of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.


  after removing the cwd from sys.path.


In [618]:
# Write the combined table as a tab-separated file, without the row index
all_contigs_df.to_csv("../../figures/fig3/all_contigs_df.tsv", sep="\t", index=False)

In [619]:
# One row per sample (its first occurrence in contig_stats_lca) with a non-null nonhost_reads value;
# nonhost_reads is cast to int and the table is written as a tab-separated file
hexapoda_decontam_reads = contig_stats_lca.loc[
    (~contig_stats_lca["sample"].duplicated()) & contig_stats_lca["nonhost_reads"].notnull(),
    ["sample", "nonhost_reads"],
].astype({"nonhost_reads": int})
hexapoda_decontam_reads.to_csv(
    "../../data/metadata/nonhost_reads_decontam_nohexapoda.tsv",
    index=False,
    sep="\t",
)

