In [367]:
import pandas as pd
import numpy as np
from ete3 import NCBITaxa
import boto3
import tempfile
import subprocess
import os
import io
import re
import time
import json
import pdb
import seaborn as sns
import matplotlib.pyplot as plt
from lca_functions import *
ncbi = NCBITaxa()

In [291]:
identity_qcov_threshold = 0.9
min_total_contig_len = 1000

In [106]:
## Load metadata
metadata = pd.read_csv("../../data/metadata/CMS001_CMS002_MergedAnnotations.csv", header=0)

In [348]:
## Read counts data
all_read_counts = pd.read_csv("s3://czbiohub-mosquito/contig_quality_concat/contig_counts.csv", header=0)

In [354]:
## Load data
contig_stats_lca = pd.read_csv("s3://czbiohub-mosquito/contig_quality_concat/contig_stats_lca.tsv", sep="\t", header=0)
contig_stats_lca = contig_stats_lca.assign(identity_qcov=(contig_stats_lca["identity"]/100*contig_stats_lca["align_length"]/contig_stats_lca["contig_length"]).apply(lambda x: min(x, 1)))
contig_stats_lca = contig_stats_lca[~(contig_stats_lca["hexapoda"] | contig_stats_lca["taxid"].isnull())]


In [355]:
contig_stats_lca = pd.merge(pd.merge(contig_stats_lca, metadata, how="left", left_on="sample", right_on="NewIDseqName"), 
                            all_read_counts.groupby("sample").sum().reset_index().rename(columns={"counts":"total_nonhost"}),
                           how="left")

In [358]:
contig_stats_lca["read_prop"] = contig_stats_lca["read_count"]/contig_stats_lca["total_nonhost"]



In [380]:
def get_rows_taxid (df, taxid, taxid_colname="taxid", identity_qcov_cutoff=None):
    if (not isinstance(df, pd.DataFrame)):
        return (taxid in ncbi.get_lineage(df))
    outdf = df[df[taxid_colname].apply(get_rows_taxid, taxid=taxid)]
    if identity_qcov_cutoff is not None:
        outdf = outdf[outdf["identity_qcov"]>=identity_qcov_cutoff]
    return (outdf)

def filter_using_water_by_criterion (df, colname, split_by_taxid=True):
    if (len(df)==0):
        return (df)
    if split_by_taxid:
        water_counts = df[df["sample"].apply(lambda x: "ater" in x)].groupby(["taxid", "sample"])[colname].sum().reset_index().groupby("taxid").max()
        df_groupby = df.groupby(["taxid", "sample"])
        sample_sums = df_groupby[colname].sum().reset_index()
        for i, x in enumerate(water_counts[colname]):
            sample_sums["tokeep"] = sample_sums.loc[sample_sums["taxid"]==water_counts.index[i], colname] > x
            df = pd.merge(df, sample_sums.drop(columns=colname), how="left")
            df = df[df["tokeep"]!=False].drop(columns="tokeep")
    else:
        metrics = df.groupby("sample")[colname].sum().sort_values()
        selected_samples = metrics.index[([i for i, x in enumerate(metrics.index) if 'ater' in x][-1]+1):]
        df = df[df["sample"].isin(selected_samples)]
    return (df)
    
def use_water_filter (df, by_read_count=True, by_length=True, split_by_taxid=True):
    if (not df["sample"].apply(lambda x: "ater" in x).any()):
        return (df)
    if split_by_taxid:
        if (by_read_count):
            df = filter_using_water_by_criterion(df, "read_count", split_by_taxid)
        if (by_length):
            df = filter_using_water_by_criterion(df, "contig_length", split_by_taxid)
    else:
        if (by_read_count):
            filter_using_water_by_criterion(df, "read_count", split_by_taxid)
        if (by_length):
            filter_using_water_by_criterion(df, "contig_length", split_by_taxid)
    return (df)

def filter_by_criterion (df, colname, minthreshold, bysample=True):
    if bysample:
        sums = df.groupby(["sample", "taxid"])[colname].sum().reset_index()
        sums["tokeep"] = sums[colname] >= minthreshold
        df = pd.merge(df, sums.drop(columns=colname), how="left")
        df = df[df["tokeep"]!=False].drop(columns="tokeep")
    else:
        df = df[df[colname] >= minthreshold]
    return (df)


def check_if_in_any_taxid(taxid, taxid_list):
    if taxid in taxid_list:
        return (taxid)
    taxids = ncbi.get_lineage(taxid)
    check_in = [i for i, x in enumerate(taxids) if x in taxid_list]
    if (len(check_in)==0):
        return (np.nan)
    return (taxids[check_in[0]])



def clean_taxids(df, taxids, root_taxid, taxid_colname="taxid"):
    if isinstance(taxids[0], str):
        taxids = dict(zip([ncbi.get_name_translator([x])[x][0] for x in taxids], taxids))
    else:
        taxids = ncbi.get_taxid_translator(taxids)
    if isinstance(root_taxid, str):
        root_taxid_number = ncbi.get_name_translator([root_taxid])[root_taxid][0]
        root_taxid = {root_taxid_number:root_taxid}
    else:
        root_taxid = ncbi.get_taxid_translator([root_taxid])
    df[taxid_colname] = df[taxid_colname].apply(check_if_in_any_taxid, taxid_list=taxids)
    df[taxid_colname][df[taxid_colname].isnull()] = list(root_taxid.keys())[0]
    taxids.update(root_taxid)
    df["sci_name"] = df[taxid_colname].apply(lambda x: taxids[x])
    return (df)
    

def get_summary_table (df, colnames, metric):
    df = df.groupby(colnames)[metric].sum().reset_index()
    sort_order = colnames+[metric]
    sort_order.remove("sample")
    return (df.sort_values(by=sort_order, ascending=False))



    

In [387]:
# Only keep hits that are almost identical to a known wolbachia sequence
wolbachia_taxid = 952
wolbachia_contigs = get_rows_taxid(contig_stats_lca, taxid=wolbachia_taxid, taxid_colname="taxid", identity_qcov_cutoff=identity_qcov_threshold)
# Filter contigs that were less abundant or shorter than contigs found in water belonging to the same taxid
wolbachia_contigs = use_water_filter(wolbachia_contigs, by_read_count=True, by_length=True)
# Filter taxids for which the total length of contigs in a sample was less than min_total_contig_len (default 1000 bp)
wolbachia_contigs = filter_by_criterion(wolbachia_contigs, "contig_length", minthreshold=min_total_contig_len, bysample=True)
# Return a list of species grouped by mosquito species, collection site, sample, and taxid, and sorted by total read count
wolbachia_contigs = get_summary_table(wolbachia_contigs, colnames=["ska_genus", "ska_species", "collected_by", "sample"], metric="read_prop")
wolbachia_contigs = wolbachia_contigs.assign(taxid=wolbachia_taxid)
wolbachia_contigs = wolbachia_contigs.assign(sci_name=ncbi.get_taxid_translator([wolbachia_taxid])[wolbachia_taxid])
wolbachia_contigs
    



Unnamed: 0,ska_genus,ska_species,collected_by,sample,read_prop,taxid,sci_name
30,Culex,quinquefasciatus,WVAL,CMS002_046a_Rb_S191_L004,0.133324,952,Wolbachieae
28,Culex,quinquefasciatus,WVAL,CMS002_027a_Rb_S152_L004,0.117018,952,Wolbachieae
29,Culex,quinquefasciatus,WVAL,CMS002_027b_Rb_S153_L004,0.111187,952,Wolbachieae
31,Culex,quinquefasciatus,WVAL,CMS002_046b_Rb_S192_L004,0.109397,952,Wolbachieae
25,Culex,quinquefasciatus,SAND,CMS002_013a_Rb_S120_L004,0.126148,952,Wolbachieae
22,Culex,quinquefasciatus,SAND,CMS002_001a_Rb_S116_L004,0.113309,952,Wolbachieae
23,Culex,quinquefasciatus,SAND,CMS002_004a_Rb_S117_L004,0.109971,952,Wolbachieae
24,Culex,quinquefasciatus,SAND,CMS002_010a_Rb_S119_L004,0.104038,952,Wolbachieae
26,Culex,quinquefasciatus,SAND,CMS002_022a_Rb_S137_L004,0.053287,952,Wolbachieae
27,Culex,quinquefasciatus,SAND,CMS002_023a_Rb_S138_L004,0.015931,952,Wolbachieae


In [395]:
# Only keep hits that are almost identical to a known viral sequence
viral_contigs = get_rows_taxid(contig_stats_lca, taxid=10239, taxid_colname="taxid", identity_qcov_cutoff=identity_qcov_threshold)
# Filter contigs that were less abundant or shorter than contigs found in water belonging to the same taxid
viral_contigs = use_water_filter(viral_contigs, by_read_count=True, by_length=True, split_by_taxid=True)
# Filter taxids for which the total length of contigs in a sample was less than 500bp
viral_contigs = filter_by_criterion(viral_contigs, "contig_length", minthreshold=500, bysample=True)
# Convert taxids to those of interest
viral_groups = ["unclassified Riboviria", "Bunyavirales", "Orthomyxoviridae", "Mononegavirales", 
                "Nodaviridae", "Flavivirus", "Narnaviridae", "Totiviridae", "Iflaviridae", "dsRNA viruses"]
viral_contigs = clean_taxids(viral_contigs, taxids=viral_groups, root_taxid=10239)
# Return a list of species grouped by mosquito species, collection site, sample, and taxid, and sorted by total read count
viral_contigs = get_summary_table(viral_contigs, colnames=["ska_genus", "ska_species", "collected_by", "sample", "taxid", "sci_name"], metric="read_prop")
viral_contigs
    







A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,ska_genus,ska_species,collected_by,sample,taxid,sci_name,read_prop
189,Culiseta,particeps,ALCO,CMS001_014_Ra_S5,35325.0,dsRNA viruses,0.000483
188,Culiseta,incidens,ALCO,CMS001_005_Ra_S3,35325.0,dsRNA viruses,0.000075
187,Culex,tarsalis,WVAL,CMS002_029e_Rb_S164_L004,2585030.0,unclassified Riboviria,0.114450
183,Culex,tarsalis,WVAL,CMS002_029d_Rb_S162_L004,2585030.0,unclassified Riboviria,0.096161
175,Culex,tarsalis,WVAL,CMS002_029b_Rb_S160_L004,1980410.0,Bunyavirales,0.163665
179,Culex,tarsalis,WVAL,CMS002_029c_Rb_S161_L004,1980410.0,Bunyavirales,0.127774
182,Culex,tarsalis,WVAL,CMS002_029d_Rb_S162_L004,1980410.0,Bunyavirales,0.127377
173,Culex,tarsalis,WVAL,CMS002_029a_Rb_S159_L004,1980410.0,Bunyavirales,0.120244
186,Culex,tarsalis,WVAL,CMS002_029e_Rb_S164_L004,1980410.0,Bunyavirales,0.106881
178,Culex,tarsalis,WVAL,CMS002_029c_Rb_S161_L004,699189.0,Iflaviridae,0.000334


In [382]:
# Only keep hits that are almost identical to a known metazoan sequence
metazoan_contigs = contig_stats_lca[(contig_stats_lca["taxon_group"]=="Metazoa") & (contig_stats_lca["identity_qcov"]>=identity_qcov_threshold)]
# Filter contigs that were less abundant or shorter than contigs found in water belonging to the same taxid
metazoan_contigs = use_water_filter(metazoan_contigs, by_read_count=True, by_length=True, split_by_taxid=True)
# Filter taxids for which the total length of contigs in a sample was less than min_total_contig_len (default 1000 bp)
metazoan_contigs = filter_by_criterion(metazoan_contigs, "contig_length", minthreshold=min_total_contig_len, bysample=True)
# Convert taxids to those of interest
metazoan_contigs = clean_taxids(metazoan_contigs, taxids=["Leporidae", "Muroidea", "Homo sapiens", "Carnivora", "Odocoileinae", "Bovidae", "Neognathae"], root_taxid="Metazoa")
# Return a list of species grouped by mosquito species, collection site, sample, and taxid, and sorted by total read count
metazoan_contigs = get_summary_table(metazoan_contigs, colnames=["ska_genus", "ska_species", "collected_by", "sample", "taxid", "sci_name"], metric="read_prop")
metazoan_contigs
    

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,ska_genus,ska_species,collected_by,sample,taxid,sci_name,read_prop
60,Culiseta,particeps,ALCO,CMS001_003_Ra_S2,33208.0,Metazoa,0.001395
59,Culiseta,particeps,ALCO,CMS001_003_Ra_S2,9881.0,Odocoileinae,0.003614
61,Culiseta,particeps,ALCO,CMS001_008_Ra_S3,9881.0,Odocoileinae,0.000719
62,Culiseta,particeps,ALCO,CMS001_025_Ra_S7,9881.0,Odocoileinae,0.000111
58,Culiseta,inornata,ALCO,CMS001_016_Ra_S6,9881.0,Odocoileinae,0.001187
57,Culiseta,incidens,ALCO,CMS001_005_Ra_S3,9606.0,Homo sapiens,0.001814
56,Culex,tarsalis,COAV,CMS002_042a_Rb_S177_L004,33208.0,Metazoa,0.022357
45,Culex,tarsalis,ALCO,CMS001_031_Ra_S19,33554.0,Carnivora,0.013463
52,Culex,tarsalis,ALCO,CMS001_042_Ra_S23,33554.0,Carnivora,0.001881
44,Culex,tarsalis,ALCO,CMS001_028_Ra_S17,33554.0,Carnivora,0.000693


In [407]:
# Only keep hits that are almost identical to a known eukaryotic sequence
eukaryotic_contigs = contig_stats_lca[(contig_stats_lca["taxon_group"]=="Eukaryota") & (contig_stats_lca["identity_qcov"]>=identity_qcov_threshold)]
# Filter contigs that were less abundant or shorter than contigs found in water belonging to the same taxid
eukaryotic_contigs = use_water_filter(eukaryotic_contigs, by_read_count=True, by_length=True, split_by_taxid=True)
# Filter taxids for which the total length of contigs in a sample was less than min_total_contig_len (default 1000 bp)
eukaryotic_contigs = filter_by_criterion(eukaryotic_contigs, "contig_length", minthreshold=min_total_contig_len, bysample=True)
# Convert taxids to those of interest
eukaryotic_groups = ["Fungi", "Trypanosomatidae", "Plasmodium", "Stramenopiles", "Viridiplantae", "Euglenozoa", "Alveolata"]
eukaryotic_contigs = clean_taxids(eukaryotic_contigs, taxids=eukaryotic_groups, root_taxid="Eukaryota")
# Return a list of species grouped by mosquito species, collection site, sample, and taxid, and sorted by total read count
eukaryotic_contigs = get_summary_table(eukaryotic_contigs, colnames=["ska_genus", "ska_species", "collected_by", "sample", "taxid", "sci_name"], metric="read_prop")

eukaryotic_contigs
    



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,ska_genus,ska_species,collected_by,sample,taxid,sci_name,read_prop
32,Culiseta,particeps,ALCO,CMS001_009_Ra_S13,33090.0,Viridiplantae,0.000407
31,Culiseta,particeps,ALCO,CMS001_009_Ra_S13,4751.0,Fungi,0.005455
30,Culiseta,particeps,ALCO,CMS001_008_Ra_S3,2759.0,Eukaryota,0.000349
29,Culex,tarsalis,WVAL,CMS002_029d_Rb_S162_L004,4751.0,Fungi,0.025096
27,Culex,tarsalis,ALCO,CMS001_032_Ra_S7,33682.0,Euglenozoa,0.000552
28,Culex,tarsalis,ALCO,CMS001_036_Ra_S20,4751.0,Fungi,0.000261
26,Culex,quinquefasciatus,SAND,CMS002_004a_Rb_S117_L004,5820.0,Plasmodium,0.010679
25,Culex,pipiens,ALCO,CMS001_059_Ra_S10,33682.0,Euglenozoa,0.021565
23,Culex,pipiens,ALCO,CMS001_040_Ra_S21,5820.0,Plasmodium,0.003294
24,Culex,pipiens,ALCO,CMS001_041_Ra_S10,2759.0,Eukaryota,0.002005


In [408]:
all_contigs_df = pd.concat([wolbachia_contigs.assign(group="Wolbachia"), 
                            viral_contigs.assign(group="Virus"),
                            metazoan_contigs.assign(group="Metazoa"), 
                            eukaryotic_contigs.assign(group="Other Eukaryotes")])





of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.


  after removing the cwd from sys.path.


In [410]:
all_contigs_df.to_csv("../../figures/fig3/all_contigs_df.tsv", sep="\t", index=False)