In [1]:
import pandas as pd
import numpy as np
from ete3 import NCBITaxa
import boto3
import tempfile
import subprocess
import os
import io
import re
import time
import json
import math
ncbi = NCBITaxa()

In [2]:
def get_viral_family_df (row_x):
    """Expand one curated-virus record into a per-contig table.

    Only the contigs of the first segment listed in ``row_x["segments"]`` are
    used. Every contig entry is split on ``"|"``; the first two pieces become
    the ``sample`` and ``contig_name`` columns.

    Parameters
    ----------
    row_x : mapping or pandas.Series
        Must provide ``segments``, ``family``, ``taxid``, ``provisional_name``,
        ``name`` and ``poly_group``.

    Returns
    -------
    pandas.DataFrame
        One row per contig, with the record's ``family``, ``taxid``,
        ``sci_name`` and integer ``poly_group`` repeated on every row.
    """
    segment_map = row_x["segments"]
    first_segment = list(segment_map.keys())[0]
    contig_fields = [entry.split("|") for entry in segment_map[first_segment]["contigs"]]
    # A provisional name, when present, takes precedence over the record's name
    provisional = row_x["provisional_name"]
    display_name = row_x["name"] if pd.isnull(provisional) else provisional
    per_contig = pd.DataFrame(contig_fields).rename(columns={0:"sample", 1:"contig_name"})
    per_contig = per_contig.assign(
        family=row_x["family"],
        taxid=row_x["taxid"],
        sci_name=display_name,
        poly_group=row_x["poly_group"],
    )
    return (per_contig.astype({"poly_group":int}))

def get_rows_taxid (df, taxid, taxid_colname="taxid", identity_qcov_cutoff=None):
    """Select the rows of ``df`` whose taxid has ``taxid`` in its NCBI lineage.

    Parameters
    ----------
    df : pandas.DataFrame or a single taxid
        When a DataFrame, rows are filtered on ``taxid_colname``. When anything
        else, it is treated as one taxid and a plain bool is returned (this
        scalar mode is kept for backward compatibility).
    taxid : int or str
        Target taxon, as a numeric taxid or a scientific name that
        ``ncbi.get_name_translator`` can resolve.
    taxid_colname : str
        Column of ``df`` holding the taxids to test.
    identity_qcov_cutoff : float, optional
        If given, additionally keep only rows with ``identity_qcov`` at or
        above this value.

    Returns
    -------
    pandas.DataFrame or bool
    """
    # Resolve a scientific name once, instead of once per row
    if isinstance(taxid, str):
        taxid = ncbi.get_name_translator([taxid])[taxid][0]
    if (not isinstance(df, pd.DataFrame)):
        return (taxid in ncbi.get_lineage(df))

    # Cache the lineage test so each distinct taxid is looked up only once
    lineage_hits = {}

    def _in_clade(row_taxid):
        if row_taxid not in lineage_hits:
            lineage_hits[row_taxid] = taxid in ncbi.get_lineage(row_taxid)
        return lineage_hits[row_taxid]

    # astype(bool) keeps the mask boolean even when df has no rows
    keep = df[taxid_colname].apply(_in_clade).astype(bool)
    outdf = df[keep]
    if identity_qcov_cutoff is not None:
        outdf = outdf[outdf["identity_qcov"]>=identity_qcov_cutoff]
    return (outdf)


def check_if_in_any_taxid(taxid, taxid_list):
    """Map ``taxid`` onto a member of ``taxid_list``.

    Returns ``taxid`` itself when it is in ``taxid_list``. Otherwise returns
    the first entry of ``ncbi.get_lineage(taxid)`` (in the order that call
    returns them) that is in ``taxid_list``, or ``np.nan`` when none is.
    ``taxid_list`` may be any container supporting ``in`` (a dict is tested
    on its keys).
    """
    if taxid in taxid_list:
        return (taxid)
    for ancestor in ncbi.get_lineage(taxid):
        if ancestor in taxid_list:
            return (ancestor)
    return (np.nan)



def clean_taxids(df, taxids, root_taxid, taxid_colname="taxid"):
    """Collapse the taxids in ``df`` onto a chosen set of taxa.

    Each value in ``df[taxid_colname]`` is replaced by the matching member of
    ``taxids`` (via ``check_if_in_any_taxid``); values with no match are set to
    ``root_taxid``. A ``sci_name`` column with the corresponding names is added.

    Parameters
    ----------
    df : pandas.DataFrame
        Modified in place and also returned.
    taxids : list of str or list of int
        Taxa of interest, all given as names or all as numeric taxids (the
        type of the first element decides).
    root_taxid : str or int
        Fallback taxon, as a name or numeric taxid.
    taxid_colname : str
        Column of ``df`` holding the taxids.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    # Build a {numeric taxid: name} lookup for the taxa of interest
    if isinstance(taxids[0], str):
        taxids = dict(zip([ncbi.get_name_translator([x])[x][0] for x in taxids], taxids))
    else:
        taxids = ncbi.get_taxid_translator(taxids)
    # Same for the fallback taxon
    if isinstance(root_taxid, str):
        root_taxid_number = ncbi.get_name_translator([root_taxid])[root_taxid][0]
        root_taxid = {root_taxid_number:root_taxid}
    else:
        root_taxid = ncbi.get_taxid_translator([root_taxid])
    df[taxid_colname] = df[taxid_colname].apply(check_if_in_any_taxid, taxid_list=taxids)
    # A single .loc assignment; the previous chained form df[col][mask] = ...
    # may write to a temporary copy and leave df unchanged
    df.loc[df[taxid_colname].isnull(), taxid_colname] = list(root_taxid.keys())[0]
    taxids.update(root_taxid)
    df["sci_name"] = df[taxid_colname].apply(lambda x: taxids[x])
    return (df)
    
    
def get_summary_table (df, colnames, metric):
    """Sum ``metric`` within groups and sort the result in descending order.

    Parameters
    ----------
    df : pandas.DataFrame
    colnames : str or list of str
        Grouping column(s).
    metric : str or list of str
        Column(s) to sum within each group.

    Returns
    -------
    pandas.DataFrame
        One row per group, sorted descending by the grouping columns followed
        by the metric columns. ``"sample"`` is never used as a sort key, and
        it no longer has to be one of the grouping columns.
    """
    summary = df.groupby(colnames)[metric].sum().reset_index()
    group_cols = [colnames] if isinstance(colnames, str) else list(colnames)
    metric_cols = metric if isinstance(metric, list) else [metric]
    # Filtering (rather than list.remove) avoids a ValueError when "sample"
    # is not among the grouping columns
    sort_order = [col for col in group_cols + metric_cols if col != "sample"]
    return (summary.sort_values(by=sort_order, ascending=False))


def group_at_higher_tax(df, taxonomic_group, family_name, taxid_colname="taxid", family_colname="family"):
    """Label every row of ``df`` with the higher-level taxon its taxid belongs to.

    Parameters
    ----------
    df : pandas.DataFrame
        Modified in place (``family_colname`` is added or overwritten) and
        also returned.
    taxonomic_group : iterable of str or int
        Candidate taxa, given as scientific names or numeric taxids. The first
        one (in the given order) found in a taxid's lineage is used as its label.
    family_name : str
        Label for taxids that fall under none of ``taxonomic_group``.
    taxid_colname : str
        Column of ``df`` holding the taxids.
    family_colname : str
        Column that receives the label.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    # {label: numeric taxid} for each candidate group
    groups = {}
    for x in taxonomic_group:
        if isinstance(x, str):
            groups[x] = ncbi.get_name_translator([x])[x][0]
        else:
            # get_taxid_translator is called with a list everywhere else in this file
            groups[ncbi.get_taxid_translator([x])[x]] = x
    # One lineage lookup per distinct taxid
    family_assignments = {}
    for x in df[taxid_colname].unique():
        lin = ncbi.get_lineage(x)
        family_assignments[x] = family_name
        for key, taxid_x in groups.items():
            if taxid_x in lin:
                family_assignments[x] = key
                break
    df[family_colname] = df[taxid_colname].apply(lambda x: family_assignments[x])
    return (df)


# Read in data

In [3]:
# Minimum identity_qcov a hit must reach to be kept in the curated non-viral tables
identity_qcov_threshold = 0.9
# Per-sample metadata columns carried into the summary tables
metadata_cols = ["ska_genus", "ska_species", "collected_by"]
# Running tally of contig counts, filled in by later cells
numbers = {}

In [4]:
## Read counts data
# Later cells use the "sample", "nonhost_reads" and "total_reads" columns of this table
idseq_data = pd.read_csv("../../data/metadata/idseq_metadata.csv", header=0)

In [5]:
## Load sample annotations and attach the per-sample read totals from idseq_data
# Default (inner) join: annotation rows whose NewIDseqName has no match in idseq_data are dropped
idseq_read_totals = idseq_data[["sample", "nonhost_reads", "total_reads"]]
metadata = pd.read_csv("../../data/metadata/CMS001_CMS002_MergedAnnotations.csv", header=0).merge(
    idseq_read_totals, left_on="NewIDseqName", right_on="sample"
)



In [6]:
## Load read counts for all contigs, attach sample metadata, and express each
## contig's reads as a fraction of its sample's nonhost_reads
contig_stats_all = (
    pd.read_csv("s3://czbiohub-mosquito/contig_quality_concat/contig_stats_all.tsv", sep="\t", header=0)
    .merge(metadata, how="left", on="sample")
    .assign(read_prop=lambda stats: stats["read_count"]/stats["nonhost_reads"])
)


In [7]:
## Load decontam data
# true_reads / true_viral_reads: used below as the set of groups that survived decontamination
# (rows without a match in them are dropped in later cells)
true_reads = pd.read_csv("s3://czbiohub-mosquito/contig_quality_concat/lca_decontam.tsv", sep="\t", header=0)
# contam_reads is loaded here but not referenced again in the cells shown
contam_reads = pd.read_csv("s3://czbiohub-mosquito/contig_quality_concat/lca_contamination.tsv", sep="\t", header=0)
#viral_contam_reads = pd.read_csv("s3://czbiohub-mosquito/contig_quality_concat/viral_contamination.tsv", sep="\t", header=0)
true_viral_reads = pd.read_csv("s3://czbiohub-mosquito/contig_quality_concat/viral_decontam.tsv", sep="\t", header=0)


In [8]:
## Per-sample total of non-host reads that survived decontamination: the sum of
## the non-viral (true_reads) and viral (true_viral_reads) read counts.
## (Per the original note: contigs with > 2 reads, no good hits to hexapoda, and
## either a blast hit or other evidence that they are viral.)
nonhost_reads_nonviral = true_reads.groupby("sample")["reads"].sum().reset_index()
nonhost_reads_viral = true_viral_reads.groupby("sample")["reads"].sum().reset_index()
# Outer join so a sample present in only one table still gets a total; its missing side counts as 0
hexapoda_decontam_reads = (
    nonhost_reads_nonviral
    .merge(nonhost_reads_viral, on="sample", how="outer")
    .fillna(0)
    .assign(nonhost_reads=lambda merged: merged["reads_x"]+merged["reads_y"])
    .loc[:, ["sample", "nonhost_reads"]]
)
hexapoda_decontam_reads.to_csv("../../data/metadata/nonhost_reads_decontam_nohexapoda.tsv", index=False, sep="\t")

In [9]:
## Load LCA data
# Kept unmodified as contig_stats_lca_raw; the processed copy is built in the next cell
contig_stats_lca_raw = pd.read_csv("s3://czbiohub-mosquito/contig_quality_concat/contig_stats_lca.tsv", sep="\t", header=0)

In [10]:
## Process raw LCA data
# identity_qcov = identity/100 * align_length / contig_length, capped at 1
raw_identity_qcov = contig_stats_lca_raw["identity"]/100*contig_stats_lca_raw["align_length"]/contig_stats_lca_raw["contig_length"]
contig_stats_lca = contig_stats_lca_raw.assign(identity_qcov=raw_identity_qcov.apply(lambda value: min(value, 1)))
numbers["total_num_contigs_with_blast_hits"] = len(contig_stats_lca)
# Tally reads on hexapoda-flagged contigs per sample, then drop those contigs
hexapoda_mask = contig_stats_lca["hexapoda"]
hexapoda_read_counts = contig_stats_lca[hexapoda_mask].groupby("sample")["read_count"].sum().reset_index()
contig_stats_lca = contig_stats_lca[~hexapoda_mask]
numbers["total_nonhexapoda_contigs"] = len(contig_stats_lca)
contig_stats_lca = contig_stats_lca.merge(metadata, how="left", on="sample")
# Replace the nonhost_reads column that came from metadata with the decontaminated totals.
# NOTE(review): this relies on the merged frame having the same row order and index as
# contig_stats_lca, and on the "_y" suffix, i.e. on metadata also carrying nonhost_reads.
decontam_totals = contig_stats_lca.merge(hexapoda_decontam_reads, how="left", on="sample")["nonhost_reads_y"]
contig_stats_lca["nonhost_reads"] = decontam_totals
contig_stats_lca = contig_stats_lca.assign(read_prop=contig_stats_lca["read_count"]/contig_stats_lca["nonhost_reads"])

In [34]:
# Filtered BLAST LCA tables for nt and nr; the inspection cells below read their "query" column
blast_lca_nt = pd.read_csv("s3://czbiohub-mosquito/contig_quality_concat/blast_lca_nt_filtered.m9", sep="\t", header=0)
blast_lca_nr = pd.read_csv("s3://czbiohub-mosquito/contig_quality_concat/blast_lca_nr_filtered.m9", sep="\t", header=0)


Unnamed: 0,sample,contig_name,contig_length,read_count,nt,nr,hexapoda,nt_or_nr,taxid,bitscore,identity,align_length,taxon_group
36,CMS001_001_Ra_S1,NODE_63_length_1477_cov_1.495000,1477,48,True,True,True,,,,,,
60,CMS001_001_Ra_S1,NODE_3473_length_326_cov_1.124498,326,5,True,True,True,,,,,,
91,CMS001_001_Ra_S1,NODE_2114_length_392_cov_1.853968,392,8,True,True,True,,,,,,
121,CMS001_001_Ra_S1,NODE_202_length_997_cov_1.132609,997,35,True,True,True,,,,,,
144,CMS001_001_Ra_S1,NODE_250_length_922_cov_2.069822,922,124,True,True,True,,,,,,
180,CMS001_001_Ra_S1,NODE_166_length_1066_cov_1.516684,1066,53,True,True,True,,,,,,
221,CMS001_001_Ra_S1,NODE_372_length_801_cov_0.959945,801,18,True,True,True,,,,,,
272,CMS001_001_Ra_S1,NODE_20_length_2429_cov_2.034439,2429,128,True,True,True,,,,,,
307,CMS001_001_Ra_S1,NODE_185_length_1028_cov_1.098843,1028,26,True,True,True,,,,,,
344,CMS001_001_Ra_S1,NODE_960_length_534_cov_1.768053,534,13,True,True,True,,,,,,


In [35]:
# Inspect raw LCA rows flagged hexapoda and nr whose contig name also appears as a query in blast_lca_nr
# NOTE(review): the match is on contig_name alone, not on (sample, contig_name) - confirm names are unique across samples
contig_stats_lca_raw[contig_stats_lca_raw["hexapoda"] & contig_stats_lca_raw["nr"] & (contig_stats_lca_raw["contig_name"].isin(blast_lca_nr["query"]))]


Unnamed: 0,sample,contig_name,contig_length,read_count,nt,nr,hexapoda,nt_or_nr,taxid,bitscore,identity,align_length,taxon_group
36,CMS001_001_Ra_S1,NODE_63_length_1477_cov_1.495000,1477,48,True,True,True,,,,,,
60,CMS001_001_Ra_S1,NODE_3473_length_326_cov_1.124498,326,5,True,True,True,,,,,,
91,CMS001_001_Ra_S1,NODE_2114_length_392_cov_1.853968,392,8,True,True,True,,,,,,
121,CMS001_001_Ra_S1,NODE_202_length_997_cov_1.132609,997,35,True,True,True,,,,,,
144,CMS001_001_Ra_S1,NODE_250_length_922_cov_2.069822,922,124,True,True,True,,,,,,
180,CMS001_001_Ra_S1,NODE_166_length_1066_cov_1.516684,1066,53,True,True,True,,,,,,
221,CMS001_001_Ra_S1,NODE_372_length_801_cov_0.959945,801,18,True,True,True,,,,,,
272,CMS001_001_Ra_S1,NODE_20_length_2429_cov_2.034439,2429,128,True,True,True,,,,,,
307,CMS001_001_Ra_S1,NODE_185_length_1028_cov_1.098843,1028,26,True,True,True,,,,,,
344,CMS001_001_Ra_S1,NODE_960_length_534_cov_1.768053,534,13,True,True,True,,,,,,


In [36]:
# Inspect raw LCA rows flagged hexapoda and nt whose contig name also appears as a query in blast_lca_nt
# NOTE(review): the match is on contig_name alone, not on (sample, contig_name) - confirm names are unique across samples
contig_stats_lca_raw[contig_stats_lca_raw["hexapoda"] & contig_stats_lca_raw["nt"] & (contig_stats_lca_raw["contig_name"].isin(blast_lca_nt["query"]))]


Unnamed: 0,sample,contig_name,contig_length,read_count,nt,nr,hexapoda,nt_or_nr,taxid,bitscore,identity,align_length,taxon_group
24,CMS001_001_Ra_S1,NODE_21_length_2269_cov_429.505474,2269,17303,True,True,True,,,,,,
55,CMS001_001_Ra_S1,NODE_722_length_595_cov_8.637066,595,164,True,True,True,,,,,,
59,CMS001_001_Ra_S1,NODE_54_length_1543_cov_0.932469,1543,147,True,True,True,,,,,,
75,CMS001_001_Ra_S1,NODE_583_length_658_cov_1.590361,658,15,True,True,True,,,,,,
107,CMS001_001_Ra_S1,NODE_2455_length_373_cov_0.986486,373,4,True,True,True,,,,,,
137,CMS001_001_Ra_S1,NODE_207_length_994_cov_1.401309,994,46,True,False,True,,,,,,
151,CMS001_001_Ra_S1,NODE_2347_length_378_cov_0.943522,378,4,True,True,True,,,,,,
160,CMS001_001_Ra_S1,NODE_4686_length_287_cov_0.928571,287,3,True,False,True,,,,,,
183,CMS001_001_Ra_S1,NODE_1921_length_408_cov_1.102719,408,5,True,True,True,,,,,,
192,CMS001_001_Ra_S1,NODE_2922_length_349_cov_3.040441,349,14,True,True,True,,,,,,


# Curated list of known viruses

In [35]:
# Information about Baltimore classification of virus family groups
viral_family_groups = pd.read_csv("../../data/virus_family_groups.csv", header=0)
# Drop columns whose name starts with 'Unnamed'
viral_family_groups = viral_family_groups.loc[:, ~viral_family_groups.columns.str.startswith('Unnamed')]
# Convert virus json into data frame: one row per top-level JSON key, which is kept as poly_group
with open ("../../data/darkmatter/virus.json", "r") as f:
    viral_json = pd.DataFrame(json.load(f)).T
    viral_json['poly_group'] = viral_json.index
# One row per contig of each record's first segment (see get_viral_family_df)
viral_contigs_df = pd.concat(viral_json.apply(get_viral_family_df, axis=1).tolist())
numbers["num_viral_contigs"] = len(viral_contigs_df)
# Add read proportions columns, summed per sample and virus
viral_contigs = pd.merge(viral_contigs_df, contig_stats_all[["sample", "read_prop", "contig_name"]], how="left", on=["sample", "contig_name"]).groupby(["sample", "sci_name", "taxid", "poly_group"])["read_prop"].sum().reset_index()
# Exclude viruses labelled as contamination (no matching row in true_viral_reads)
viral_contigs = pd.merge(viral_contigs, true_viral_reads, how="left", on=["sample", "poly_group"])
viral_contigs = viral_contigs[~viral_contigs["reads"].isnull()]
# Add metadata information
viral_contigs = pd.merge(viral_contigs, contig_stats_lca[["sample"]+metadata_cols].groupby(["sample"]).first().reset_index(), how="left")
# Add the family label. viral_contigs is already aggregated per sample and virus, while
# viral_contigs_df has one row per contig, so de-duplicate the lookup first; otherwise every
# aggregated row is repeated once per contig.
viral_contigs = pd.merge(viral_contigs, viral_contigs_df[["sample", "sci_name", "family"]].drop_duplicates(), how="left")
# Add baltimore group information about the viruses
viral_contigs = pd.merge(viral_contigs, viral_family_groups, on="family", how="left")
# Chuviridae is assigned Baltimore group "V" explicitly
viral_contigs.loc[viral_contigs["family"]=="Chuviridae", "baltimore_group"] = "V"


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.


  


# Curate lists of non-viral contigs with high-confidence hits to NCBI records

In [36]:
# Wolbachia: keep contigs whose taxid falls under Wolbachia (taxid 952) and whose
# identity_qcov reaches the threshold, i.e. near-identical to a known sequence
wolbachia_taxid = 952
wolbachia_hits = get_rows_taxid(contig_stats_lca, taxid=wolbachia_taxid, taxid_colname="taxid", identity_qcov_cutoff=identity_qcov_threshold)
# Drop anything without a matching row in true_reads (removed by the decontamination step)
wolbachia_hits = wolbachia_hits.merge(true_reads, how="left")
wolbachia_hits = wolbachia_hits[wolbachia_hits["reads"].notnull()]
numbers["total_wolbachia_contigs"] = len(wolbachia_hits)
# Summarise by mosquito species, collection site, sample and taxid, sorted by total read count;
# rename read_count to 'reads' for consistency, and label every row with the Wolbachia name
wolbachia_contigs = (
    get_summary_table(wolbachia_hits, colnames=["ska_genus", "ska_species", "collected_by", "sample", "taxid"], metric=["read_count", "read_prop"])
    .rename(columns={"read_count":"reads"})
    .assign(sci_name=ncbi.get_taxid_translator([wolbachia_taxid])[wolbachia_taxid])
)



In [71]:
# Metazoa: keep contigs in the "Metazoa" taxon group whose identity_qcov reaches the threshold
is_confident_metazoan = (contig_stats_lca["taxon_group"]=="Metazoa") & (contig_stats_lca["identity_qcov"]>=identity_qcov_threshold)
# Drop anything without a matching row in true_reads (removed by the decontamination step)
metazoan_hits = contig_stats_lca[is_confident_metazoan].merge(true_reads, how="left")
metazoan_hits = metazoan_hits[metazoan_hits["reads"].notnull()]
# Convert taxids to those of interest
#metazoan_contigs = clean_taxids(metazoan_contigs, taxids=["Leporidae", "Muroidea", "Homo sapiens", "Carnivora", "Odocoileinae", "Bovidae", "Neognathae"], root_taxid="Metazoa")
# Attach the scientific name of each taxid
metazoan_hits = metazoan_hits.assign(sci_name=metazoan_hits["taxid"].apply(lambda taxid: ncbi.get_taxid_translator([taxid])[taxid]))
numbers["total_metazoan_contigs"] = len(metazoan_hits)
# Summarise by mosquito species, collection site, sample and taxid, sorted by total read count,
# with read_count renamed to 'reads' for consistency
metazoan_summary = get_summary_table(metazoan_hits, colnames=metadata_cols+["sample", "taxid", "sci_name"], metric=["read_count", "read_prop"]).rename(columns={"read_count":"reads"})
# In the 'family' column, record the higher taxonomic grouping of each taxid
metazoan_contigs = group_at_higher_tax(metazoan_summary, taxonomic_group=["Onchocercidae"], family_name="Other metazoa", taxid_colname="taxid", family_colname="family")
    

In [38]:
# Chordata: keep contigs whose taxid falls under Chordata and whose identity_qcov reaches the threshold
# chordate_contigs = contig_stats_lca[(contig_stats_lca["taxon_group"]=="Metazoa") & (contig_stats_lca["identity_qcov"]>=identity_qcov_threshold)]
chordate_hits = get_rows_taxid(contig_stats_lca, taxid="Chordata", taxid_colname="taxid", identity_qcov_cutoff=identity_qcov_threshold)
# Drop chordate groups without a matching row in true_reads (removed by the decontamination step)
chordate_hits = chordate_hits.merge(true_reads, how="left")
chordate_hits = chordate_hits[chordate_hits["reads"].notnull()]
# Attach the scientific name of each taxid
chordate_hits = chordate_hits.assign(sci_name=chordate_hits["taxid"].apply(lambda taxid: ncbi.get_taxid_translator([taxid])[taxid]))
numbers["total_chordate_contigs"] = len(chordate_hits)
# Summarise by mosquito species, collection site, sample and taxid, sorted by total read count,
# with read_count renamed to 'reads' for consistency
chordate_summary = get_summary_table(chordate_hits, colnames=metadata_cols+["sample", "taxid", "sci_name"], metric=["read_count", "read_prop"]).rename(columns={"read_count":"reads"})
# In the 'family' column, record the higher taxonomic grouping of each taxid
chordate_contigs = group_at_higher_tax(chordate_summary, taxonomic_group=["Pecora", "Aves", "Carnivora", "Rodentia", "Leporidae"], family_name="Other chordates", taxid_colname="taxid", family_colname="family")




In [72]:
# Eukaryota: keep contigs in the "Eukaryota" taxon group whose identity_qcov reaches the threshold
is_confident_eukaryote = (contig_stats_lca["taxon_group"]=="Eukaryota") & (contig_stats_lca["identity_qcov"]>=identity_qcov_threshold)
# Drop eukaryotic groups without a matching row in true_reads (removed by the decontamination step)
eukaryotic_contigs_df = contig_stats_lca[is_confident_eukaryote].merge(true_reads, how="left")
eukaryotic_contigs_df = eukaryotic_contigs_df[eukaryotic_contigs_df["reads"].notnull()]
# Attach the scientific name of each taxid
eukaryotic_hits = eukaryotic_contigs_df.assign(sci_name=eukaryotic_contigs_df["taxid"].apply(lambda taxid: ncbi.get_taxid_translator([taxid])[taxid]))
numbers["total_eukaryotic_contigs"] = len(eukaryotic_hits)
# Summarise by mosquito species, collection site, sample and taxid, sorted by total read count,
# with read_count renamed to 'reads' for consistency
eukaryotic_summary = get_summary_table(eukaryotic_hits, colnames=metadata_cols+["sample", "taxid", "sci_name"], metric=["read_count", "read_prop"]).rename(columns={"read_count":"reads"})
# In the 'family' column, record the higher taxonomic grouping of each taxid
eukaryotic_contigs = group_at_higher_tax(eukaryotic_summary, taxonomic_group=["Trypanosomatidae", "Apicomplexa", "Microsporidia"], family_name="Other eukaryotes", taxid_colname="taxid", family_colname="family")


# Output file

In [73]:
# Stack the curated tables, tagging each row with the table it came from in a 'group' column.
# NOTE(review): the tables do not share identical columns, and the warning captured under this
# cell indicates the resulting column order depends on the pandas version - consider passing
# `sort` explicitly to pd.concat.
labelled_tables = [
    ("Wolbachia", wolbachia_contigs),
    ("Virus", viral_contigs),
    ("Metazoa", metazoan_contigs),
    ("Chordates", chordate_contigs),
    ("Other Eukaryotes", eukaryotic_contigs),
]
all_contigs_df = pd.concat([table.assign(group=label) for label, table in labelled_tables])





of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.


  """


In [74]:
# Write the combined table as a tab-separated file, without the index column
all_contigs_df.to_csv("../../figures/fig3/all_contigs_df.tsv", sep="\t", index=False)