In [None]:
import pandas as pd
import numpy as np
from ete3 import NCBITaxa
import boto3
import tempfile
import subprocess
import os
import io
import re
import time
import json
ncbi = NCBITaxa()

In [673]:
def get_viral_family_df (row_x):
    segments = row_x["segments"]
    df_by_sample = pd.DataFrame([x.split("|") for x in segments[list(segments.keys())[0]]["contigs"]])
    df_by_sample = df_by_sample.assign(sci_name=row_x["family"], taxid=row_x["taxid"])
    df_by_sample = df_by_sample.rename(columns={0:"sample", 1:"contig_name"})
    return (df_by_sample)

def get_rows_taxid (df, taxid, taxid_colname="taxid", identity_qcov_cutoff=None):
    if (not isinstance(df, pd.DataFrame)):
        if isinstance(taxid, str):
            taxid = ncbi.get_name_translator([taxid])[taxid][0]
        return (taxid in ncbi.get_lineage(df))
    outdf = df[df[taxid_colname].apply(get_rows_taxid, taxid=taxid)]
    if identity_qcov_cutoff is not None:
        outdf = outdf[outdf["identity_qcov"]>=identity_qcov_cutoff]
    return (outdf)


# def filter_by_criterion (df, colname, minthreshold, bysample=True):
#     if bysample:
#         sums = df.groupby(["sample", "taxid"])[colname].sum().reset_index()
#         sums["tokeep"] = sums[colname] >= minthreshold
#         df = pd.merge(df, sums.drop(columns=colname), how="left")
#         df = df[df["tokeep"]!=False].drop(columns="tokeep")
#     else:
#         df = df[df[colname] >= minthreshold]
#     return (df)


def check_if_in_any_taxid(taxid, taxid_list):
    if taxid in taxid_list:
        return (taxid)
    taxids = ncbi.get_lineage(taxid)
    check_in = [i for i, x in enumerate(taxids) if x in taxid_list]
    if (len(check_in)==0):
        return (np.nan)
    return (taxids[check_in[0]])



def clean_taxids(df, taxids, root_taxid, taxid_colname="taxid"):
    if isinstance(taxids[0], str):
        taxids = dict(zip([ncbi.get_name_translator([x])[x][0] for x in taxids], taxids))
    else:
        taxids = ncbi.get_taxid_translator(taxids)
    if isinstance(root_taxid, str):
        root_taxid_number = ncbi.get_name_translator([root_taxid])[root_taxid][0]
        root_taxid = {root_taxid_number:root_taxid}
    else:
        root_taxid = ncbi.get_taxid_translator([root_taxid])
    df[taxid_colname] = df[taxid_colname].apply(check_if_in_any_taxid, taxid_list=taxids)
    df[taxid_colname][df[taxid_colname].isnull()] = list(root_taxid.keys())[0]
    taxids.update(root_taxid)
    df["sci_name"] = df[taxid_colname].apply(lambda x: taxids[x])
    return (df)
    
    
def get_summary_table (df, colnames, metric):
    df = df.groupby(colnames)[metric].sum().reset_index()
    if not isinstance(metric, list):
        metric = [metric]
    sort_order = colnames+metric
    sort_order.remove("sample")
    return (df.sort_values(by=sort_order, ascending=False))

def correct_read_count(df, decontam_df, taxid, taxid_colname="taxid"):
    if isinstance(taxid, list):
        decontam_table = pd.concat([get_rows_taxid(decontam_df, taxid=x, taxid_colname=taxid_colname) for x in taxid])
    else:
        decontam_table = get_rows_taxid(decontam_df, taxid=taxid, taxid_colname=taxid_colname)
    decontam_table = decontam_table.groupby(["sample", "taxid"])["reads"].sum().reset_index()
    outdf = pd.merge(df, decontam_table)
    outdf["read_prop"] = outdf["reads"]/outdf["read_count"]*outdf["read_prop"]
    outdf = outdf.drop(columns=["read_count", "reads"]).rename(columns={"reads":"read_count"})
    return (outdf)

    

# Read in data

In [677]:
identity_qcov_threshold = 0.9
metadata_cols = ["ska_genus", "ska_species", "collected_by"]
numbers = {}

In [678]:
## Read counts data
idseq_data = pd.read_csv("../../data/project-mosquito_sample-table.csv", header=0)

In [679]:
## Load metadata
metadata = pd.read_csv("../../data/metadata/CMS001_CMS002_MergedAnnotations.csv", header=0)
metadata = pd.merge(metadata, idseq_data[["sample_name", "nonhost_reads", "total_reads"]], left_on="NewIDseqName", right_on="sample_name")

In [680]:
## Load LCA data
contig_stats_lca = pd.read_csv("s3://czbiohub-mosquito/contig_quality_concat/contig_stats_lca.tsv", sep="\t", header=0)
contig_stats_lca = contig_stats_lca.assign(identity_qcov=(contig_stats_lca["identity"]/100*contig_stats_lca["align_length"]/contig_stats_lca["contig_length"]).apply(lambda x: min(x, 1)))
numbers["total_num_contigs_with_blast_hits"] = len(contig_stats_lca)
contig_stats_lca = contig_stats_lca[~(contig_stats_lca["hexapoda"])]
numbers["total_nonhexapoda_contigs"] = len(contig_stats_lca)
contig_stats_lca = pd.merge(contig_stats_lca, metadata, how="left", left_on="sample", right_on="NewIDseqName")
contig_stats_lca["read_prop"] = contig_stats_lca["read_count"]/contig_stats_lca["nonhost_reads"]

# Curated list of known viruses

In [681]:
# Information about Baltimore classification of virus family groups
viral_family_groups = pd.read_csv("../../data/virus_family_groups.csv", header=0)
# Convert virus json into data frame
with open ("../../data/darkmatter/virus.json", "r") as f:
    viral_json = pd.DataFrame(json.load(f)).T
viral_contigs = pd.concat(viral_json.apply(get_viral_family_df, axis=1).tolist())
numbers["num_viral_contigs"] = len(viral_contigs)
# Add read proportions columns
viral_contig_stats = get_rows_taxid(contig_stats_lca[["sample", "read_prop", "taxid", "contig_name"]], taxid=10239, taxid_colname="taxid")
viral_contigs = pd.merge(viral_contigs, viral_contig_stats).groupby(["sample", "sci_name", "taxid"])["read_prop"].sum().reset_index()
viral_contigs = pd.merge(viral_contigs, contig_stats_lca[["sample"]+metadata_cols])







Unnamed: 0,sample,sci_name,taxid,read_prop,ska_genus,ska_species,collected_by
0,CMS001_002_Ra_S1,Iflaviridae,2304480,0.701703,Culex,tarsalis,ALCO
1,CMS001_002_Ra_S1,Iflaviridae,2304480,0.701703,Culex,tarsalis,ALCO
2,CMS001_002_Ra_S1,Iflaviridae,2304480,0.701703,Culex,tarsalis,ALCO
3,CMS001_002_Ra_S1,Iflaviridae,2304480,0.701703,Culex,tarsalis,ALCO
4,CMS001_002_Ra_S1,Iflaviridae,2304480,0.701703,Culex,tarsalis,ALCO
5,CMS001_002_Ra_S1,Iflaviridae,2304480,0.701703,Culex,tarsalis,ALCO
6,CMS001_002_Ra_S1,Iflaviridae,2304480,0.701703,Culex,tarsalis,ALCO
7,CMS001_002_Ra_S1,Iflaviridae,2304480,0.701703,Culex,tarsalis,ALCO
8,CMS001_002_Ra_S1,Iflaviridae,2304480,0.701703,Culex,tarsalis,ALCO
9,CMS001_002_Ra_S1,Iflaviridae,2304480,0.701703,Culex,tarsalis,ALCO


# Curate lists of non-viral contigs with high-confidence hits to NCBI records

In [668]:
## Load decontam data
true_reads = pd.read_csv("s3://czbiohub-mosquito/contig_quality_concat/decontam_sample_read_counts.tsv", sep="\t", header=0)
contam_reads = pd.read_csv("s3://czbiohub-mosquito/contig_quality_concat/sample_contamination.tsv", sep="\t", header=0)

In [None]:
# Deduct from the nonhost reads per sample the number of reads that were removed due to suspected contamination
contig_stats_lca["nonhost_reads"] = pd.merge(contig_stats_lca, contam_reads.groupby("sample")["reads"].sum().reset_index()).apply(lambda x: x["nonhost_reads"]-x["reads"], axis=1)
contig_stats_lca["read_prop"] = contig_stats_lca["read_count"]/contig_stats_lca["nonhost_reads"]

In [610]:
# Only keep hits that are almost identical to a known wolbachia sequence
wolbachia_taxid = 952
wolbachia_contigs = get_rows_taxid(contig_stats_lca, taxid=wolbachia_taxid, taxid_colname="taxid", identity_qcov_cutoff=identity_qcov_threshold)
# Only keep wolbachia groups that were not removed by the decontamination step
wolbachia_contigs = wolbachia_contigs[wolbachia_contigs["taxid"].isin(true_reads["taxid"])]
numbers["total_wolbachia_contigs"] = len(wolbachia_contigs)
# Return a list of species grouped by mosquito species, collection site, sample, and taxid, and sorted by total read count
wolbachia_contigs = get_summary_table(wolbachia_contigs, colnames=["ska_genus", "ska_species", "collected_by", "sample", "taxid"], metric=["read_count", "read_prop"])
# Correct read counts after decontamination
wolbachia_contigs = correct_read_count(wolbachia_contigs, true_reads, wolbachia_taxid, "taxid")
#wolbachia_contigs = wolbachia_contigs.assign(taxid=wolbachia_taxid)
# Create sci_name column to denote that that this table contains Wolbachia samples
wolbachia_contigs = wolbachia_contigs.assign(sci_name=ncbi.get_taxid_translator([wolbachia_taxid])[wolbachia_taxid])



In [611]:
# Only keep hits that are almost identical to a known metazoan sequence
metazoan_contigs = contig_stats_lca[(contig_stats_lca["taxon_group"]=="Metazoa") & (contig_stats_lca["identity_qcov"]>=identity_qcov_threshold)]
# Only keep metazoan groups that were not removed by the decontamination step
metazoan_contigs = metazoan_contigs[metazoan_contigs["taxid"].isin(true_reads["taxid"])]
# Convert taxids to those of interest
metazoan_contigs = clean_taxids(metazoan_contigs, taxids=["Leporidae", "Muroidea", "Homo sapiens", "Carnivora", "Odocoileinae", "Bovidae", "Neognathae"], root_taxid="Metazoa")
numbers["total_metazoan_contigs"] = len(metazoan_contigs)
# Return a list of species grouped by mosquito species, collection site, sample, and taxid, and sorted by total read count
metazoan_contigs = get_summary_table(metazoan_contigs, colnames=metadata_cols+["sample", "taxid", "sci_name"], metric=["read_count", "read_prop"])
# Correct read counts after decontamination
metazoan_contigs = correct_read_count(metazoan_contigs, true_reads, taxid="Metazoa", taxid_colname="taxid")
    

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [612]:
# Only keep hits that are almost identical to a known eukaryote sequence
eukaryotic_taxids = ["Fungi", "Trypanosomatidae", "Plasmodium", "Stramenopiles", "Viridiplantae", "Euglenozoa", "Alveolata"]
eukaryotic_contigs = contig_stats_lca[(contig_stats_lca["taxon_group"]=="Eukaryota") & (contig_stats_lca["identity_qcov"]>=identity_qcov_threshold)]
# Only keep eukaryotic groups that were not removed by the decontamination step
eukaryotic_contigs = eukaryotic_contigs[eukaryotic_contigs["taxid"].isin(true_reads["taxid"])]
# Convert taxids to those of interest
eukaryotic_contigs = clean_taxids(eukaryotic_contigs, taxids=eukaryotic_taxids, root_taxid="Eukaryota")
numbers["total_eukaryotic_contigs"] = len(eukaryotic_contigs)
# Return a list of species grouped by mosquito species, collection site, sample, and taxid, and sorted by total read count
eukaryotic_contigs = get_summary_table(eukaryotic_contigs, colnames=metadata_cols+["sample", "taxid", "sci_name"], metric=["read_count", "read_prop"])
# Correct read counts after decontamination
eukaryotic_contigs = correct_read_count(eukaryotic_contigs, true_reads, taxid=eukaryotic_taxids, taxid_colname="taxid") 


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


# Output file

In [682]:
all_contigs_df = pd.concat([wolbachia_contigs.assign(group="Wolbachia"), 
                            viral_contigs.assign(group="Virus"),
                            metazoan_contigs.assign(group="Metazoa"), 
                            eukaryotic_contigs.assign(group="Other Eukaryotes")])





of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.


  after removing the cwd from sys.path.


In [683]:
all_contigs_df.to_csv("../../figures/fig3/all_contigs_df.tsv", sep="\t", index=False)