In [1]:
import pandas as pd
import numpy as np
from ete3 import NCBITaxa
import boto3
import tempfile
import subprocess
import os
import io
import re
import time
import json
import math
ncbi = NCBITaxa()

In [2]:
# run this update as needed
#ncbi.update_taxonomy_database()

Downloading taxdump.tar.gz from NCBI FTP site (via HTTP)...
Done. Parsing...


Loading node names...
2234295 names loaded.
216465 synonyms loaded.
Loading nodes...
2234295 nodes loaded.
Linking nodes...
Tree is loaded.
Updating database: /Users/hanna/.etetoolkit/taxa.sqlite ...
 2234000 generating entries...   
Uploading to /Users/hanna/.etetoolkit/taxa.sqlite


Inserting synonyms:      25000 




Inserting taxid merges:  45000  




Inserting taxids:       25000  




Inserting taxids:       2230000 




# Read in data

In [3]:
gitdir = "/Users/Hanna/Desktop/MyBox/aa_DeRisi/mosquito/california-mosquito-study/"
identity_qcov_threshold = 0.9

In [4]:
## Load metadata
metadata = pd.read_csv(gitdir + "/data/metadata/CMS001_CMS002_MergedAnnotations.csv", header=0)
idseq_data = pd.read_csv(gitdir + "/data/metadata/idseq_metadata.csv", header=0)
idseq_data = idseq_data[~idseq_data['sample'].str.contains('Placeholder')]

In [5]:
## Load contig data
contig_calls_all = pd.read_csv(gitdir + "/data/s3/contig_calls.tsv", sep="\t", header=0)
contig_calls_decontam = pd.read_csv(gitdir + "/data/s3/contig_calls_decontam.tsv", sep="\t", header=0)

  interactivity=interactivity, compiler=compiler, result=result)


In [6]:
## Calculate nonhost reads taking hexapoda and contaminating reads into account
metadata_cols = ["NewIDseqName","ska_genus", "ska_species", "collected_by"]
idseq_cols = ["sample","nonhost_reads", "total_reads"]
# Calculate reads removed during decontamination and hexapoda subtraction
contam_counts = pd.DataFrame(contig_calls_all.groupby('sample').read_count.sum() - 
                             contig_calls_decontam.groupby('sample').read_count.sum() )
# Sum reads to hexapoda contigs by sample
contigs_hex = contig_calls_all[(contig_calls_all.hexapoda == True) & (contig_calls_all.curated == False)]
hex_counts = contigs_hex.groupby("sample")["read_count"].sum().reset_index()
# Collect metadata, idseq nonhost reads, and additional hexapoda reads
metadata = pd.concat([ metadata[metadata_cols].rename(columns={"NewIDseqName":"sample"}).set_index("sample"), 
           idseq_data[idseq_cols].rename(columns={"nonhost_reads":"idseq_nonhost_reads"}).set_index("sample"), 
           hex_counts.rename(columns={"read_count":"hex_read_count"}).set_index("sample"),
           contam_counts.rename(columns={'read_count': 'contaminating_and_hex_reads'})], 
                     axis=1, sort=True)
#Check that hexapoda counts are < all contaminating_and_hex counts for all samples
#print(metadata[~(metadata['contaminating_and_hex_reads'] >= metadata['hex_read_count'])]) #shld only contain waters
# Subtract contaminating+hex reads from idseq_nonhost_reads to get final nonhost reads
metadata['hex_read_count'].fillna(0,inplace=True)
metadata['contaminating_and_hex_reads'].fillna(0,inplace=True)
metadata['nonhost_reads'] = metadata['idseq_nonhost_reads'] - metadata['contaminating_and_hex_reads']
# Reorganizing columns
metadata['sample'] = metadata.index
metadata.reset_index(inplace=True)
metadata.drop(columns='index',inplace=True)

In [7]:
# Write final nonhost reads to file
metadata[['sample','nonhost_reads']].to_csv(
    gitdir + "/data/metadata/nonhost_reads_decontam_nohexapoda.tsv", index=False, sep="\t")

In [8]:
# Update contig stats with nonhost reads
contig_stats = pd.merge(contig_calls_decontam, metadata, how="left", on="sample")
# Check no hexapoda remain
print( contig_stats[(contig_stats.hexapoda == True) & (contig_stats.curated == False)].shape[0] == 0 ) #shld be True
# Calculate read proportion and identity/query-coverage threshold
contig_stats['read_prop'] = contig_stats['read_count'] / contig_stats['nonhost_reads']
# Calculate identity/query-coverage threshold
contig_stats = contig_stats.assign(identity_qcov=(contig_stats["identity"]/100 
                                                  * contig_stats["align_length"]/contig_stats["contig_length"]
                                                 ).apply(lambda x: min(x, 1)))
contig_stats.loc[contig_stats['curated'] == True, 'identity_qcov'] = np.nan

True


# Viruses

In [9]:
contig_stats_viral = contig_stats[(contig_stats["taxon_group"]=="Viruses") & contig_stats["curated"]==True]

In [10]:
# Information about Baltimore classification of virus family groups
viral_family_groups = pd.read_csv(gitdir + "/data/virus_family_groups.csv", header=0)
viral_family_groups = viral_family_groups.loc[:, ~viral_family_groups.columns.str.startswith('Unnamed')]
viral_family_groups = viral_family_groups.append(
    {"family":"Chuviridae",
    "baltimore_group":"V",
    "genome_description":"",
    "number_of_segments_conventional":""}
    , ignore_index = True)

# Convert virus json into data frame
with open (gitdir +"/data/darkmatter/virus.json", "r") as f:
    viral_json = pd.DataFrame(json.load(f)).T
    viral_json['poly_group'] = viral_json.index

# Check which viral families not represented
print(set(viral_json.family.unique()) - set (viral_family_groups.family.unique())) #should be empty

# Merge viral info with contig stats
viral_info = viral_json.merge(viral_family_groups,on='family')
viral_info.provisional_name.fillna(viral_info.name, inplace=True)
viral_info.rename(columns={'provisional_name':'sci_name'},inplace=True)
contigs_viral = contig_stats_viral.merge(viral_info[['poly_group','family','baltimore_group',
                                                     'genome_description','number_of_segments_conventional',
                                                     'sci_name']].
                                              astype({'poly_group': 'float64'}),
                                              on='poly_group', how = 'left')
contigs_viral = contigs_viral.assign(group="Virus")

set()


# Non-viral contigs

In [11]:
def get_rows_taxid(df, taxid, taxid_colname="taxid"):
    if (not isinstance(df, pd.DataFrame)):
        if isinstance(taxid, str):
            taxid = ncbi.get_name_translator([taxid])[taxid][0]
        return (taxid in ncbi.get_lineage(df))
    outdf = df[df[taxid_colname].apply(get_rows_taxid, taxid=taxid)]
    return (outdf) 

def group_at_higher_tax(df, taxonomic_group, family_name, taxid_colname="taxid", family_colname="family"):
    groups = {}
    for x in taxonomic_group:
        if isinstance(x, str):
            groups[x] = ncbi.get_name_translator([x])[x][0]
        else:
            groups[ncbi.get_taxid_translator(x)[x]] = x
    family_assignments = {}
    for x in df["taxid"].unique():
        lin = ncbi.get_lineage(x)
        family_assignments[x] = family_name
        for key, taxid_x in groups.items():
            if taxid_x in lin:
                family_assignments[x] = key
                break
    df[family_colname] = df["taxid"].apply(lambda x: family_assignments[x])
    return (df)

def get_summary_table (df, colnames, metric):
    df = df.groupby(colnames)[metric].sum().reset_index()
    if not isinstance(metric, list):
        metric = [metric]
    sort_order = colnames+metric
    sort_order.remove("sample")
    return (df.sort_values(by=sort_order, ascending=False))

In [12]:
# Subset to contigs that are not viral, and are strong NCBI hits based on identity/qcov threshold
contig_stats_nonviral = contig_stats[ (contig_stats["taxon_group"]!="Viruses") &
                                    (contig_stats["identity_qcov"]>=identity_qcov_threshold) ]

In [13]:
## WOLBACHIA
wolbachia_taxid = 952
wolbachia_contigs = get_rows_taxid(contig_stats_nonviral, wolbachia_taxid);
## METAZOANS
metazoan_contigs = contig_stats_nonviral[contig_stats_nonviral["taxon_group"]=="Metazoa"];
# In the 'family' column, add information about the higher taxonomic grouping for taxids
metazoan_contigs = group_at_higher_tax(metazoan_contigs, 
                                       taxonomic_group=["Nematoda"], 
                                       family_name="Other metazoa", taxid_colname="taxid", family_colname="family")
## CHORDATA
chordate_contigs = get_rows_taxid(contig_stats_nonviral, taxid="Chordata");
# In the 'family' column, add information about the higher taxonomic grouping for taxids
chordate_contigs = group_at_higher_tax(chordate_contigs, 
                                       taxonomic_group=["Pecora", "Aves", "Carnivora", "Rodentia", "Leporidae"], 
                                       family_name="Other chordates", taxid_colname="taxid", family_colname="family")
## OTHER EUKARYOTES
eukaryotic_contigs = contig_stats_nonviral[contig_stats_nonviral["taxon_group"]=="Eukaryota"];
# In the 'family' column, add information about the higher taxonomic grouping for taxids
eukaryotic_contigs = group_at_higher_tax(eukaryotic_contigs, 
                                         taxonomic_group=["Trypanosomatidae", "Apicomplexa", "Microsporidia"], 
                                         family_name="Other eukaryotes", taxid_colname="taxid", family_colname="family")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [14]:
# Combine nonviral
contigs_nonviral = pd.concat([wolbachia_contigs.assign(group="Wolbachia").assign(family="Wolbachia"), 
                            metazoan_contigs.assign(group="Metazoa"),
                            chordate_contigs.assign(group="Chordates"),
                            eukaryotic_contigs.assign(group="Other Eukaryotes")], sort=False)
# Convert taxids to human_readable
contigs_nonviral = contigs_nonviral.assign(sci_name=contigs_nonviral["taxid"].
                                           apply(lambda x: ncbi.get_taxid_translator([x])[x]))
contigs_nonviral.loc[contigs_nonviral['group']=='Wolbachia', ['sci_name']] = 'Wolbachieae'

# Merging nonviral and viral

In [15]:
contigs_out = pd.concat([contigs_nonviral, contigs_viral], sort=False)
# Count total number of contigs in each broad group
numbers = contigs_out.groupby("group")["group"].count()
print(numbers)

group
Chordates            617
Metazoa              657
Other Eukaryotes     553
Virus               1223
Wolbachia           1137
Name: group, dtype: int64


In [16]:
# Summary table to combine reads to all contigs from the same sample-taxon pair
summary_nonviral = get_summary_table(contigs_out[contigs_out["group"]!="Virus"], 
                            colnames=["ska_genus","ska_species","collected_by","sample","taxid","group","family","sci_name"],
                                     metric=["read_count", "read_prop"])
summary_viral = get_summary_table(contigs_out[contigs_out["group"]=="Virus"], 
                            colnames=["ska_genus","ska_species","collected_by","sample","taxid","group","family","sci_name",
                                      "poly_group","baltimore_group","genome_description","number_of_segments_conventional"],
                            metric=["read_count", "read_prop"])
contigs_out_summary = pd.concat([summary_nonviral, summary_viral], sort=False, ignore_index = True)
# Rename column for consistency
contigs_out_summary.rename(columns={"read_count":"reads"}, inplace = True)
# Check all reads are accounted for in the summarizing
contigs_out['read_count'].sum() == contigs_out_summary['reads'].sum() #Shld be True

True

In [17]:
# Write to file
contigs_out_summary.to_csv(gitdir + "/figures/fig3/all_contigs_df_new.tsv", sep="\t", index=False)

In [18]:
# Write contig names to file for eukaryotes-in-bloodmeals analysis
category_names = ['Apicomplexa','Trypanosomatidae']
for catname in category_names:
    df = contigs_out[(contigs_out['family']==catname) & (contigs_out['sample'].str.contains('CMS001'))]
    out = df['sample'] + "~" + df['contig_name']
    filename = "contig_names_CMS001_" + catname + ".txt"
    print(catname + ": " + str(len(out)) + " contigs")
    out.to_csv(gitdir + "/figures/fig4/" + filename, sep="\n", index=False)

Apicomplexa: 9 contigs
Trypanosomatidae: 71 contigs


# Comparing old vs. new

In [19]:
def whatsin(df):
    return [{x:df[x].unique()} for x in df.columns.tolist()]

def get_desc_contigs(contig_df, txid):
    taxa = ncbi.get_descendant_taxa(txid, intermediate_nodes=True)
    taxa.append(txid)
    df = contig_df[contig_df["taxid"].apply(lambda x: x in taxa)]
    return df

In [20]:
# Checking column names and values in old vs. new
old_df = pd.read_csv(gitdir + "/figures/fig3/all_contigs_df.tsv", sep="\t", header=0)
for mycol in old_df.columns:
    print("For column: " + str(mycol))
    a = set(old_df[mycol].unique())
    b = set(contigs_out_summary[mycol].unique())
    print("old df uniquely contains: " + str(a-b))
    print("new df uniquely contains: " + str(b-a))
    print("------")

For column: baltimore_group
old df uniquely contains: set()
new df uniquely contains: set()
------
For column: collected_by
old df uniquely contains: set()
new df uniquely contains: set()
------
For column: family
old df uniquely contains: {nan, 'Onchocercidae'}
new df uniquely contains: {'Nematoda', 'Wolbachia'}
------
For column: genome_description
old df uniquely contains: set()
new df uniquely contains: {''}
------
For column: group
old df uniquely contains: set()
new df uniquely contains: set()
------
For column: number_of_segments_conventional
old df uniquely contains: set()
new df uniquely contains: {''}
------
For column: poly_group
old df uniquely contains: {nan}
new df uniquely contains: {nan, 5308.0}
------
For column: read_prop
old df uniquely contains: {0.08364174685418213, 0.12414373553409863, 4.277708859135047e-05, 0.25007581731060835, 0.00017110835436540188, 5.347136073918809e-05, 0.0004919365188005304, 8.555417718270094e-05, 0.02230070635721493, 0.011533125216788068, 0

In [21]:
txid = 5654
category_name = 'Trypanosomatidae'
new_level = 'family'
old_level = 'family'

def compare_new_to_old(txid, category_name, new_level = 'family', old_level = 'family'):
    df = pd.concat([ get_desc_contigs(contigs_out,txid).groupby('sample')['read_prop'].sum().rename('new_contigs_by_taxid'),
         contigs_out[(contigs_out[new_level]==category_name)].groupby('sample')['read_prop'].sum().rename('new_contigs_by_category'),
         contigs_out_summary[contigs_out_summary[new_level]==category_name].groupby('sample')['read_prop'].sum().rename('new_summary_by_category'),
         old_df[old_df[old_level]==category_name].groupby('sample')['read_prop'].sum().rename('old_by_category') ],
         axis=1, sort=False)
    return df

In [22]:
compare_new_to_old(5654,'Trypanosomatidae')

Unnamed: 0_level_0,new_contigs_by_taxid,new_contigs_by_category,new_summary_by_category,old_by_category
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CMS001_001_Ra_S1,5.1e-05,5.1e-05,5.1e-05,4.8e-05
CMS001_007_Ra_S12,0.113066,0.113066,0.113066,0.083032
CMS001_011_Ra_S4,4.1e-05,4.1e-05,4.1e-05,3.5e-05
CMS001_018_Ra_S14,0.1344,0.1344,0.1344,0.12412
CMS001_022_Ra_S6,0.382009,0.382009,0.382009,0.340947
CMS001_029_Ra_S18,0.001015,0.001015,0.001015,0.000904
CMS001_032_Ra_S7,0.000619,0.000619,0.000619,0.000524
CMS001_035_Ra_S20,0.20938,0.20938,0.20938,0.197322
CMS001_046_Ra_S3,1e-05,1e-05,1e-05,1e-05
CMS001_047_Ra_S4,0.00014,0.00014,0.00014,0.000135


In [23]:
compare_new_to_old(5794,'Apicomplexa')

Unnamed: 0_level_0,new_contigs_by_taxid,new_contigs_by_category,new_summary_by_category,old_by_category
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CMS001_027_Ra_S16,6.5e-05,6.5e-05,6.5e-05,5e-05
CMS001_035_Ra_S20,1.1e-05,1.1e-05,1.1e-05,1.1e-05
CMS001_037_Ra_S21,4.8e-05,4.8e-05,4.8e-05,4.4e-05
CMS001_040_Ra_S21,0.002026,0.002026,0.002026,0.001901
CMS001_058_Ra_S9,9.8e-05,9.8e-05,9.8e-05,8.6e-05
CMS002_004a_Rb_S117_L004,0.037329,0.037329,0.037329,0.02773
CMS002_017a_Rb_S122_L004,0.000863,0.000863,0.000863,0.000862
CMS002_017b_Rb_S123_L004,0.001441,0.001441,0.001441,0.001438
CMS002_017c_Rb_S124_L004,0.000247,0.000247,0.000247,0.000247


In [24]:
compare_new_to_old(6231,'Nematoda')

Unnamed: 0,new_contigs_by_taxid,new_contigs_by_category,new_summary_by_category,old_by_category
CMS001_029_Ra_S18,0.000226,0.000226,0.000226,
CMS001_032_Ra_S7,0.000518,0.000518,0.000518,
CMS002_042a_Rb_S177_L004,0.025484,0.025484,0.025484,


In [25]:
compare_new_to_old(629,'Onchocercidae')

Unnamed: 0,new_contigs_by_taxid,new_contigs_by_category,new_summary_by_category,old_by_category
CMS001_029_Ra_S18,,,,0.000201
CMS001_032_Ra_S7,,,,0.000289
CMS002_042a_Rb_S177_L004,,,,0.008251


In [26]:
compare_new_to_old(6029,'Microsporidia')

Unnamed: 0_level_0,new_contigs_by_taxid,new_contigs_by_category,new_summary_by_category,old_by_category
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CMS001_036_Ra_S20,0.001286,0.001286,0.001286,0.001198
CMS001_038_Ra_S22,0.006068,0.006068,0.006068,0.00587
CMS001_043_Ra_S24,0.000246,0.000246,0.000246,0.000213
CMS002_029d_Rb_S162_L004,0.018867,0.018867,0.018867,0.017507
CMS002_045g_Rb_S190_L004,3.1e-05,3.1e-05,3.1e-05,3e-05
CMS002_047j_Rb_S3_L004,0.001869,0.001869,0.001869,0.001603
CMS002_053a_Rb_S7_L004,5.7e-05,5.7e-05,5.7e-05,5.2e-05


In [27]:
contigs_out[contigs_out.taxon_group == "Viruses"].groupby("family")["family"].count()

family
Botourmiaviridae      6
Chrysoviridae        12
Chuviridae           11
Dicistroviridae       7
Flaviviridae          8
Iflaviridae          17
Leviviridae           2
Luteoviridae         15
Narnaviridae         87
Orthomyxoviridae    285
Partitiviridae       54
Peribunyaviridae     66
Phasmaviridae       102
Phenuiviridae        23
Reoviridae           49
Rhabdoviridae        28
Solemoviridae       147
Tombusviridae        36
Totiviridae         195
Tymoviridae           1
Virgaviridae         52
Xinmoviridae         20
Name: family, dtype: int64

In [28]:
contigs_out[contigs_out.taxon_group != "Viruses"].groupby("family")["family"].count()

family
Apicomplexa           29
Aves                 285
Carnivora             38
Leporidae              9
Microsporidia         15
Nematoda              23
Other chordates      120
Other eukaryotes     438
Other metazoa        634
Pecora               127
Rodentia              38
Trypanosomatidae      71
Wolbachia           1137
Name: family, dtype: int64

In [29]:
contigs_out[contigs_out.taxon_group != "Viruses"].groupby("group")["group"].count()

group
Chordates            617
Metazoa              657
Other Eukaryotes     553
Wolbachia           1137
Name: group, dtype: int64

In [30]:
contigs_out.family.unique()

array(['Wolbachia', 'Other metazoa', 'Nematoda', 'Pecora',
       'Other chordates', 'Carnivora', 'Rodentia', 'Leporidae', 'Aves',
       'Other eukaryotes', 'Trypanosomatidae', 'Apicomplexa',
       'Microsporidia', 'Tombusviridae', 'Phasmaviridae', 'Reoviridae',
       'Virgaviridae', 'Totiviridae', 'Peribunyaviridae', 'Iflaviridae',
       'Solemoviridae', 'Narnaviridae', 'Flaviviridae',
       'Orthomyxoviridae', 'Xinmoviridae', 'Partitiviridae',
       'Phenuiviridae', 'Rhabdoviridae', 'Leviviridae', 'Chuviridae',
       'Chrysoviridae', 'Luteoviridae', 'Tymoviridae', 'Dicistroviridae',
       'Botourmiaviridae'], dtype=object)