# Quality of contigs

In [None]:
cd /mnt/data
ftp ftp.ncbi.nlm.nih.gov
anonymous
lucy.li@czbiohub.org
cd pub/taxonomy/taxdump_archive/
passive on
get taxdmp_2019-02-01.zip
get taxdmp_2018-12-01.zip
get new_taxdump_2018-12-01.zip
bye
# conda install -c conda-forge nodejs
# npm i zip-to-tar -g
zip2tar taxdmp_2019-02-01.zip
zip2tar taxdmp_2018-12-01.zip
zip2tar new_taxdump_2018-12-01.zip

In [1]:
import pandas as pd
import os
import json
import numpy as np
from ete3 import NCBITaxa
from Bio import Entrez
Entrez.email = "lucy.li@czbiohub.org"
%load_ext rpy2.ipython

In [2]:
ncbi_20181201_new = NCBITaxa(dbfile="/mnt/data/new_taxa_2018-12-01.sqlite", taxdump_file="/mnt/data/new_taxdump_2018-12-01.tar.gz")
ncbi_20181201 = NCBITaxa(dbfile="/mnt/data/taxa_2018-12-01.sqlite", taxdump_file="/mnt/data/taxdmp_2018-12-01.tar.gz")

Loading node names...
2033846 names loaded.
204092 synonyms loaded.
Loading nodes...
2033846 nodes loaded.
Linking nodes...
Tree is loaded.
Updating database: /mnt/data/new_taxa_2018-12-01.sqlite ...
 2033000 generating entries... generating entries... 
Uploading to /mnt/data/new_taxa_2018-12-01.sqlite


Inserting synonyms:      35000 




Inserting taxid merges:  40000  




Inserting taxids:       35000  




Inserting taxids:       2030000 


Loading node names...
2033846 names loaded.
204092 synonyms loaded.
Loading nodes...
2033846 nodes loaded.
Linking nodes...
Tree is loaded.
Updating database: /mnt/data/taxa_2018-12-01.sqlite ...
 2033000 generating entries...  generating entries... 
Uploading to /mnt/data/taxa_2018-12-01.sqlite


Inserting synonyms:      35000 




Inserting taxid merges:  50000  





Inserting taxids:       2030000 




In [3]:
ncbi_20190201 = NCBITaxa(dbfile="/mnt/data/taxa_2019-02-01.sqlite", taxdump_file="/mnt/data/taxdmp_2019-02-01.tar.gz")

Loading node names...
2050856 names loaded.
205687 synonyms loaded.
Loading nodes...
2050856 nodes loaded.
Linking nodes...
Tree is loaded.
Updating database: /mnt/data/taxa_2019-02-01.sqlite ...
 2050000 generating entries... 94000 generating entries...  
Uploading to /mnt/data/taxa_2019-02-01.sqlite


Inserting synonyms:      30000 




Inserting taxid merges:   5000  




Inserting taxids:       35000  




Inserting taxids:       2050000 




In [4]:
ncbi = [ncbi_20181201_new, ncbi_20181201, ncbi_20190201]

In [None]:
%%R
pkg_to_load <- c("dplyr", "magrittr", "ggplot2", "tibble")
lapply(pkg_to_load, function (x) {
    if (!(x %in% rownames(installed.packages()))) {
        install.packages(x)
    }
    library(x, character.only=TRUE)
})

In [5]:
blast_results_df = pd.read_csv("blast_results_df.csv", index_col=False)

In [10]:
contig_len_df = pd.read_csv("contig_len_df.csv", index_col=False)

In [11]:
# contig_coverage = {}
# for path, subdirs, files in os.walk("contigs"):
#     for name in files:
#         fn = os.path.join(path, name)
#         sample_name = os.path.basename(path)
#         if (".json" in name):
#             with open (fn) as json_file:
#                 contig_coverage[sample_name] = json.load(json_file)
#                 for key in contig_coverage[sample_name]:
#                     contig_coverage[sample_name][key]['len'] = len(contig_coverage[sample_name][key]['coverage'])

In [None]:
%%bash
#mkdir contig_quality
#ls contigs | parallel python get_quality_metric.py {}

In [29]:
contig_quality = pd.concat([pd.read_csv("contig_quality/"+x, sep="\t") for x in os.listdir("contig_quality")])

In [30]:
for col_x in ["blast_hits_gsnap_acc", "blast_hits_gsnap_taxid",
              "blast_hits_rapsearch_acc", "blast_hits_rapsearch_taxid"]:
    contig_quality[col_x] = contig_quality[col_x].apply(lambda x: x.strip('][').split(',') if not pd.isnull(x) else [])

In [31]:
contig_quality = contig_quality.assign(blast_gsnap_ntax=contig_quality["blast_hits_gsnap_taxid"].apply(lambda x: len(set(x)) if isinstance(x, (list,)) else 0))
contig_quality = contig_quality.assign(blast_rapsearch_ntax=contig_quality["blast_hits_rapsearch_taxid"].apply(lambda x: len(set(x)) if isinstance(x, (list,)) else 0))


In [32]:
taxa_of_interest = pd.DataFrame({'taxon':["Viruses", "Bacteria", "Archaea", "Eukaryota", "Fungi", "Viridiplantae", "Metazoa", "Chordata", "Arthropoda"]})
taxa_of_interest = taxa_of_interest.assign(taxid=[ncbi_20181201_new.get_name_translator([x])[x][0] for x in taxa_of_interest["taxon"]])
taxa_of_interest = taxa_of_interest.assign(descendants0=[ncbi[0].get_descendant_taxa(x) for x in taxa_of_interest["taxid"]])
taxa_of_interest = taxa_of_interest.assign(descendants1=[ncbi[1].get_descendant_taxa(x) for x in taxa_of_interest["taxid"]])
taxa_of_interest = taxa_of_interest.assign(descendants2=[ncbi[2].get_descendant_taxa(x) for x in taxa_of_interest["taxid"]])
                                   


In [33]:
taxa_of_interest

Unnamed: 0,taxon,taxid,descendants0,descendants1,descendants2
0,Viruses,10239,"[12340, 12347, 12366, 12371, 12374, 12375, 123...","[12340, 12347, 12366, 12371, 12374, 12375, 123...","[12340, 12347, 12366, 12371, 12374, 12375, 123..."
1,Bacteria,2,"[40892, 40893, 40894, 40895, 40896, 40897, 408...","[40892, 40893, 40894, 40895, 40896, 40897, 408...","[40892, 40893, 40894, 40895, 40896, 40897, 408..."
2,Archaea,2157,"[95860, 95861, 97710, 97711, 97712, 97713, 977...","[95860, 95861, 97710, 97711, 97712, 97713, 977...","[95860, 95861, 97710, 97711, 97712, 97713, 977..."
3,Eukaryota,2759,"[2792, 28024, 35688, 77580, 284705, 299626, 57...","[2792, 28024, 35688, 77580, 284705, 299626, 57...","[2792, 28024, 35688, 77580, 284705, 299626, 57..."
4,Fungi,4751,"[42900, 45238, 84418, 84419, 84420, 84421, 844...","[42900, 45238, 84418, 84419, 84420, 84421, 844...","[42900, 45238, 84418, 84419, 84420, 84421, 844..."
5,Viridiplantae,33090,"[1761683, 141714, 284941, 36881, 1498950, 1799...","[1761683, 141714, 284941, 36881, 1498950, 1799...","[3046, 881204, 1478115, 13790, 38272, 140095, ..."
6,Metazoa,33208,"[86045, 155679, 974997, 1162987, 1162988, 1162...","[86045, 155679, 974997, 1162987, 1162988, 1162...","[86045, 155679, 974997, 1162987, 1162988, 1162..."
7,Chordata,7711,"[7723, 581058, 581059, 1917238, 7725, 7726, 30...","[7723, 581058, 581059, 1917238, 7725, 7726, 30...","[7723, 581058, 581059, 1917238, 7725, 7726, 30..."
8,Arthropoda,6656,"[6848, 61293, 6850, 51645, 6852, 6853, 61202, ...","[6848, 61293, 6850, 51645, 6852, 6853, 61202, ...","[6848, 61293, 6850, 51645, 6852, 6853, 61202, ..."


In [34]:
def which_kingdom (taxid, ncbi_db, table):
    if pd.isnull(taxid):
        return (np.nan)
    if taxid==0:
        return (np.nan)
    lineage = ncbi_db.get_lineage(taxid)
    result = [x for x in table["taxid"] if (x in lineage)]
    print (result)
    if (len(result)>0):
        if pd.isnull(result[-1]):
            return (np.nan)
        return (table.loc[table["taxid"]==result[-1], "taxon"].str.cat(sep=''))
    else:
        return (np.nan)

In [35]:
unique_lca = set(list(contig_quality["blast_hits_gsnap_lca"].dropna().unique()) + 
                 list(contig_quality["blast_hits_rapsearch_lca"].dropna().unique()))
unique_lca_kingdom = [{x:which_kingdom(int(x), ncbi[2], taxa_of_interest[["taxon", "taxid"]])} for x in unique_lca]
unique_lca_kingdom = {k:v for el in unique_lca_kingdom for k,v in el.items()}

[]
[2]
[2]
[2]
[2]
[2]
[2]
[2]
[2]
[2]
[2]
[2]
[2759, 4751]
[2]
[2]
[2]
[2]
[2759, 4751]
[2]
[2]
[2]
[2]
[2]
[2]
[2]
[2]
[2]
[2]
[2]
[2]
[2]
[2]
[2759, 33090]
[2]
[2]
[2]
[2]
[2]
[2]
[2]
[2]
[2]
[2]
[2]
[2]
[2]
[2]
[2]
[2]
[2759, 33090]
[2]
[2]
[2]
[2]
[2]
[2759, 4751]
[2]
[2]
[2]
[2]
[2]
[2]
[2]
[2]
[2]
[2]
[2]
[2]
[2]
[2]
[2]
[2]
[2]
[2]
[2]
[2]
[2]
[2]
[2]
[2]
[2]
[2]
[2]
[2]
[2]
[2]
[2]
[2759, 33208]
[2]
[2]
[2]
[2759, 33208]
[2]
[2]
[2759, 4751]
[2]
[2]
[2]
[2]
[2]
[2]
[2]
[2]
[2]
[2]
[2]
[2]
[2]
[10239]
[10239]
[2]
[2]
[2]
[2]
[2]
[2]
[2]
[2]
[2]
[2]
[2]
[2]
[2]
[2]
[2]
[2759, 4751]
[2]
[2759]
[2]
[2]
[2]
[2]
[2759, 33090]
[2]
[2]
[2]
[2]
[2]
[2]
[2]
[2]
[2759, 33090]
[2]
[2]
[2]
[2]
[2]
[2]
[2]
[2]
[2]
[2]
[2]
[2]
[2]
[2]
[2]
[2]
[2759]
[2]
[2]
[2]
[2]
[2]
[2759, 4751]
[2]
[2]
[2759, 4751]
[2759, 4751]
[2]
[2]
[2]
[2]
[2759, 33208]
[2]
[2]
[2759, 33208, 6656]
[2759, 4751]
[2759, 33208]
[2]
[2759, 4751]
[2]
[2759, 4751]
[2]
[2]
[2]
[2]
[2]
[2]
[2]
[2]
[]
[2]
[2]
[2]
[2]
[2]
[2]
[

[2]
[2]
[2]
[2759, 33090]
[2]
[2]
[2759, 33090]
[2759, 33090]
[2759, 33208, 7711]
[2759, 33208, 6656]
[2759, 33208, 6656]
[2759, 33090]
[2]
[2759, 33208, 7711]
[2]
[2759, 33090]
[2759, 4751]
[2]
[2759, 4751]
[2]
[2]
[2]
[2759, 33208, 7711]
[2]
[2759]
[2759, 33208, 6656]
[2]
[2]
[2]
[2759, 4751]
[2]
[2]
[2]
[2759, 4751]
[2]
[2]
[2]
[2]
[2]
[2]
[2759, 33208, 6656]
[2759, 33208, 6656]
[2]
[2]
[10239]
[2]
[2]
[2759, 33090]
[2759, 33090]
[2]
[2759, 33090]
[10239]
[2]
[2]
[2]
[2]
[2]
[2759, 33090]
[2]
[2]
[2]
[2]
[2759, 33090]
[2]
[2759, 33090]
[2]
[10239]
[2759, 33208, 7711]
[2759, 33090]
[2759, 33090]
[2759, 33090]
[2]
[2]
[2]
[2]
[2]
[2759, 33090]
[2759, 33090]
[2]
[2759, 33090]
[2759, 33090]
[2]
[2759, 33090]
[2759, 33208, 6656]
[2759, 33090]
[2759, 33208]
[2759, 4751]
[2]
[2]
[2]
[2]
[2157]
[2759, 33090]
[2]
[2]
[2]
[2]
[2]
[2]
[2]
[2]
[2759, 4751]
[2]
[2]
[2759, 33090]
[2759, 33208, 7711]
[2759, 33090]
[2]
[2759]
[2759]
[2]
[2]
[2]
[10239]
[2]
[2759, 33090]
[2]
[2759, 4751]
[2]
[2759, 

[2759, 33208, 7711]
[2759, 33208, 7711]
[2759, 4751]
[2759, 33090]
[2]
[2759, 33208, 7711]
[2759, 4751]
[2]
[2]
[2]
[2]
[2759, 33208, 7711]
[2]
[2759, 33208, 7711]
[2759, 33208, 7711]
[2]
[2759, 33208, 7711]
[2]
[2759, 33208, 7711]
[2]
[2]
[2759, 33208, 7711]
[2]
[2]
[2]
[2759, 33208, 6656]
[2]
[2759, 33208, 7711]
[2759, 33208, 7711]
[2759, 33208, 7711]
[2759, 33208, 7711]
[10239]
[2]
[2]
[2759, 4751]
[2759, 4751]
[2]
[2759, 33208, 7711]
[2]
[2]
[2759, 33208, 7711]
[2]
[2]
[2759, 33208, 7711]
[2759, 4751]
[2]
[2]
[2759, 4751]
[2759, 33208, 7711]
[2]
[2759, 33090]
[2759, 4751]
[2]
[2]
[2]
[2]
[2759, 33208, 7711]
[2]
[2]
[2759, 4751]
[2]
[2]
[2]
[2759, 33208, 7711]
[2]
[2759, 4751]
[2759, 4751]
[2759, 33208, 7711]
[2759, 4751]
[2]
[2759, 33208, 7711]
[2]
[2]
[2]
[2759, 4751]
[2759, 33208, 6656]
[2759, 33208, 6656]
[10239]
[2759, 33208]
[2]
[2]
[2]
[2]
[2759, 4751]
[2759, 33208, 7711]
[2]
[2]
[2759, 33208, 7711]
[2759, 4751]
[2759]
[2759, 4751]
[2759, 33208, 6656]
[2]
[2759, 33208, 7711]


[2]
[2759, 4751]
[2]
[2759, 4751]
[2]
[2]
[2759, 33208, 7711]
[2759, 33208, 7711]
[2759, 33208, 7711]
[2759, 33208, 7711]
[2759]
[2]
[2759, 33208, 6656]
[2]
[2]
[2]
[2]
[10239]
[2]
[2]
[2]
[2]
[2]
[2]
[2759, 33208, 6656]
[2759, 33090]
[2]
[2]
[2759, 33208, 6656]
[2]
[2]
[2]
[2]
[2]
[2759, 33208, 6656]
[2759, 33208, 6656]
[2]
[2759, 33208, 6656]
[2]
[2759, 33208, 6656]
[2759, 33208, 6656]
[2759, 4751]
[2]
[2]
[2]
[2]
[2]
[2]
[2]
[2]
[2]
[2]
[2]
[2]
[2759, 33208, 7711]
[2759, 4751]
[2]
[2759, 33208, 6656]
[2]
[2]
[2]
[2759]
[2759]
[10239]
[2]
[2759, 33090]
[2759]
[2]
[2759, 4751]
[2759, 33208, 7711]
[10239]
[2]
[10239]
[10239]
[2759, 4751]
[10239]
[2]
[2759, 33208, 6656]
[2]
[2759, 4751]
[10239]
[2759, 4751]
[2]
[2]
[2]
[2]
[2759, 33208, 7711]
[2759]
[2759]
[2]
[2]
[2]
[2759, 4751]
[2759, 4751]
[2]
[2]
[2]
[2759, 33208, 7711]
[2759, 4751]
[10239]
[10239]
[10239]
[10239]
[2]
[2]
[10239]
[10239]
[10239]
[10239]
[2]
[2]
[2759, 4751]
[2]
[10239]
[2]
[2]
[10239]
[10239]
[2]
[2]
[2759, 4751]
[



[2]
[2]
[2]
[2]
[2]
[2759, 33208, 7711]
[2]
[2]
[2759, 33208, 7711]
[2759, 4751]
[2759, 33090]
[2]
[2759, 4751]
[2]
[2759, 33208, 6656]
[2]
[2759, 4751]
[2759, 33090]
[2759]
[2]
[2759, 4751]
[2]
[2759, 33208, 6656]
[2]
[2]
[2]
[2]
[2759, 4751]
[2]
[2759, 4751]
[2759, 33208, 7711]
[2759, 33208, 7711]
[2]
[2]
[2759, 33208, 7711]
[2]
[2]
[2759, 33208, 7711]
[2759]
[2]
[2]
[2]
[2157]
[2]
[2]
[2]
[2759, 33208]
[2]
[2]
[2759, 33208, 6656]
[2]
[2]
[2759, 4751]
[2759, 4751]
[2]
[2]
[2]
[2]
[2]
[2]
[2]
[2]
[2]
[2759, 33208, 7711]
[2759, 33208]
[2759, 33208, 6656]
[2]
[2]
[2]
[2]
[2]
[2]
[2759, 4751]
[10239]
[2]
[2]
[2]
[2759, 33208, 6656]
[2759]
[2]
[2]
[2]
[2]
[2759, 33208, 7711]
[2]
[2]
[2759, 33208, 7711]
[2759]
[2759, 4751]
[2759, 33208, 6656]
[2759, 33090]
[2]
[2]
[10239]
[2759, 33208, 6656]
[2]
[2759, 33208, 6656]
[2759, 4751]
[2759, 4751]
[2]
[2]
[2]
[2]
[2759, 4751]
[2759, 4751]
[10239]
[2759]
[2]
[10239]
[2]
[2]
[2]
[2759, 33208, 6656]
[2]
[2]
[2759, 4751]
[2759, 33208]
[2]
[2]
[2]
[2]

[2]
[2759, 33208, 6656]
[2759, 4751]
[2759, 4751]
[2]
[2]
[10239]
[10239]
[2]
[2]
[2]
[10239]
[2]
[2]
[2]
[2]
[2]
[2759, 4751]
[2]
[2]
[2]
[2]
[2]
[2]
[2]
[2759, 33208, 6656]
[2]
[2]
[2]
[2]
[2]
[2]
[2]
[2759]
[2759, 4751]
[2759, 4751]
[2]
[2759, 33208, 7711]
[2]
[2]
[2]
[2]
[2]
[2]
[2759, 4751]
[2759, 4751]
[2]
[2759, 4751]
[2]
[2]
[2]
[2]
[2]
[2]
[2]
[2]
[2759, 33208, 6656]
[2759, 33208, 7711]
[2]
[2]
[2]
[2]
[2]
[2759, 4751]
[2759, 33090]
[2]
[2759, 33208]
[2]
[2759, 33208, 6656]
[2]
[2759, 4751]
[2759, 4751]
[2759, 4751]
[2759, 4751]
[2759, 4751]
[2759, 4751]
[2759, 4751]
[2]
[2]
[2]
[2]
[2759, 33208, 7711]
[2]
[2759, 33208, 6656]
[2]
[2759]
[2]
[2759, 4751]
[2]
[2]
[2759, 4751]
[2]
[2]
[2759, 4751]
[2]
[2]
[10239]
[2]
[2]
[2]
[2]
[2759]
[2]
[2]
[2759, 33208, 7711]
[2]
[2759, 33208, 7711]
[2759, 33208, 7711]
[2759, 33208, 7711]
[2]
[2]
[2]
[2]
[2]
[2]
[2]
[2]
[2]
[2]
[2]
[2]
[2]
[2]
[2]
[2]
[2157]
[2]
[2]
[2]
[2]
[2759, 33208, 7711]
[2]
[2]
[2]
[2]
[2759, 4751]
[2759, 33208]
[2]
[2

[2]
[2]
[2759, 33208, 6656]
[2]
[2]
[2]
[2]
[2]
[2759, 4751]
[2]
[2759, 33208, 7711]
[2]
[2]
[2]
[2759, 4751]
[2759, 4751]
[2]
[2759]
[2759, 4751]
[2]
[2]
[2]
[2]
[2]
[2]
[2]
[2]
[2759, 4751]
[2]
[2]
[2]
[2]
[10239]
[10239]
[2]
[2]
[2]
[10239]
[2]
[10239]
[10239]
[2]
[2]
[10239]
[10239]
[2]
[2]
[2759, 33208]
[2]
[2]
[2]
[2]
[2]
[2]
[2]
[2]
[2]
[2]
[2]
[2]
[2]
[2]
[2]
[2]
[2759, 4751]
[2]
[2759, 4751]
[2]
[2]
[2]
[2]
[2]
[2759, 33090]
[2759, 33090]
[2]
[2]
[2759, 33090]
[2]
[2]
[2]
[2]
[2]
[2]
[2]
[2]
[2]
[2]
[2]
[2]
[2]
[2]
[2]
[2]
[2]
[2759, 33090]
[2]
[2]
[2]
[2]
[2]
[2]
[2]
[2]
[2]
[2]
[10239]
[2]
[2]
[2]
[2]
[2]
[10239]
[2]
[2]
[2]
[2759, 4751]
[2]
[2157]
[2]
[2]
[2]
[10239]
[2]
[2]
[2759, 33208, 7711]
[2]
[2]
[2]
[2]
[2759, 33208, 7711]
[2]
[2759, 33208]
[2]
[2759]
[2759, 4751]
[2759, 4751]
[2759, 33090]
[2]
[2]
[2]
[2]
[2]
[2]
[2]
[2]
[2]
[2]
[2]
[2]
[2759, 33090]
[2]
[10239]
[10239]
[2]
[2759, 33208, 6656]
[2]
[2]
[2]
[2]
[2]
[2]
[2]
[10239]
[10239]
[10239]
[2]
[2759, 33090]
[2]

In [36]:
contig_quality = contig_quality.assign(kingdom_gsnap=[unique_lca_kingdom[int(x)] if not pd.isnull(x) else x for x in contig_quality["blast_hits_gsnap_lca"]])
contig_quality = contig_quality.assign(kingdom_rapsearch=[unique_lca_kingdom[int(x)] if not pd.isnull(x) else x for x in contig_quality["blast_hits_rapsearch_lca"]])

In [37]:
def choose_kingdom (gsnap, rapsearch, gsnap_quality, rapsearch_quality):
    if (gsnap==rapsearch):
        return (gsnap)
    elif (pd.isnull(gsnap)):
        return (rapsearch)
    elif (pd.isnull(rapsearch)):
        return (gsnap)
    else:
        if (gsnap_quality>rapsearch_quality):
            return (gsnap)
        else:
            return (rapsearch)
    
#contig_quality = contig_quality.assign(kingdom=contig_quality[["kingdom_gsnap", "kingdom_rapsearch", "pmatch_gsnap", "pmatch_rapsearch"]].apply(lambda x: choose_kingdom(x[0], x[1], x[2], x[3]), axis=1))

contig_quality = contig_quality.assign(taxid=contig_quality[["blast_hits_gsnap_lca", "blast_hits_rapsearch_lca", "pmatch_gsnap", "pmatch_rapsearch"]].apply(lambda x: choose_kingdom(x[0], x[1], x[2], x[3]), axis=1))

contig_quality = contig_quality.assign(kingdom=[unique_lca_kingdom[int(x)] if not pd.isnull(x) else x for x in contig_quality["taxid"]])

In [38]:
contig_lca = [x.get_taxid_translator(contig_quality["taxid"].dropna()) for x in ncbi]
contig_lca_commonname = [x.get_common_names(contig_quality.taxid.dropna()) for x in ncbi]
contig_lca_rank = []
for i in range(3):
    contig_lca_rank.append(ncbi[i].get_rank(contig_lca[i]))

In [40]:
def find_lca (x, db):
    if (x==0):
        return (np.nan)
    name = ""
    for i in range(len(ncbi)):
        if (int(x) in db[i]):
            name = db[i][int(x)]
            return (name)
            break
    if (len(name)==0):
        return (np.nan)

contig_quality = contig_quality.assign(scientific_name=[find_lca(x, contig_lca) for x in contig_quality["taxid"]])
contig_quality = contig_quality.assign(rank=[find_lca(x, contig_lca_rank) for x in contig_quality["taxid"]])


In [46]:
contig_quality["scientific_name"] = contig_quality["scientific_name"].str.replace("[", "").str.replace("]", "")

In [47]:
most_recent_taxid = ncbi[2].get_name_translator(contig_quality["scientific_name"].dropna().unique())
contig_lca_info = Entrez.read(Entrez.esummary(db="taxonomy", id=','.join([str(int(x)) for x in [most_recent_taxid[x][0] for x in most_recent_taxid]])))
contig_lca_info = pd.DataFrame(contig_lca_info)
contig_lca_info = contig_lca_info.assign(scientific_name=contig_lca_info["ScientificName"].str.replace("[", "").str.replace("]", ""))


In [50]:
contig_quality["scientific_name"].str.contains("\]").any()

False

In [51]:
contig_quality = pd.merge(contig_quality, contig_lca_info[["scientific_name", "Division", "Genus", "CommonName"]], how="left", on="scientific_name")


In [52]:
missing_common_names_filter = (contig_quality["CommonName"].isnull()) & (contig_quality["rank"]=="subspecies")
new_common_names = [list(ncbi[2].get_common_names([ncbi[2].get_lineage(int(x))[-2]]).values()) for x in contig_quality.loc[missing_common_names_filter, "taxid"]]
new_common_names = [x[0] if (len(x)>0) else "" for x in new_common_names]
contig_quality.loc[missing_common_names_filter, "CommonName"] = new_common_names





In [53]:
#contig_quality = pd.read_csv("contig_quality_df.tsv", index_col=False, sep="\t")

In [54]:
contig_quality.to_csv("contig_quality_df.tsv", index=False, sep="\t")

In [55]:
%%bash
aws s3 cp contig_quality_df.tsv s3://czbiohub-mosquito/contig_quality/

Completed 256.0 KiB/216.8 MiB with 1 file(s) remainingCompleted 512.0 KiB/216.8 MiB with 1 file(s) remainingCompleted 768.0 KiB/216.8 MiB with 1 file(s) remainingCompleted 1.0 MiB/216.8 MiB with 1 file(s) remaining  Completed 1.2 MiB/216.8 MiB with 1 file(s) remaining  Completed 1.5 MiB/216.8 MiB with 1 file(s) remaining  Completed 1.8 MiB/216.8 MiB with 1 file(s) remaining  Completed 2.0 MiB/216.8 MiB with 1 file(s) remaining  Completed 2.2 MiB/216.8 MiB with 1 file(s) remaining  Completed 2.5 MiB/216.8 MiB with 1 file(s) remaining  Completed 2.8 MiB/216.8 MiB with 1 file(s) remaining  Completed 3.0 MiB/216.8 MiB with 1 file(s) remaining  Completed 3.2 MiB/216.8 MiB with 1 file(s) remaining  Completed 3.5 MiB/216.8 MiB with 1 file(s) remaining  Completed 3.8 MiB/216.8 MiB with 1 file(s) remaining  Completed 4.0 MiB/216.8 MiB with 1 file(s) remaining  Completed 4.2 MiB/216.8 MiB with 1 file(s) remaining  Completed 4.5 MiB/216.8 MiB with 1 file(s) remaining  Completed 

## scratch

In [126]:
kingdom_df_check = contig_quality.dropna(subset=["kingdom_gsnap", "kingdom_rapsearch"])

kingdom_df_check_subset = kingdom_df_check[kingdom_df_check.apply(lambda x: x["kingdom_gsnap"]!=x["kingdom_rapsearch"], axis=1)]


kingdom_df_check_subset[kingdom_df_check_subset["kingdom_gsnap"].str.contains("Chordata") | kingdom_df_check_subset["kingdom_rapsearch"].str.contains("Chordata")]



Unnamed: 0,qseqid,sample,pmatch_gsnap,pmatch_rapsearch,qlength,step_change,other_blast_contigs,blast_hits_gsnap_acc,blast_hits_gsnap_taxid,blast_hits_gsnap_lca,...,blast_rapsearch_ntax,kingdom_gsnap,kingdom_rapsearch,kingdom,taxid,scientific_name,rank,Division,Genus,CommonName
3588,NODE_2379_length_265_cov_0.776596,CMS001_007_Ra_S12,0.996226,0.856604,265,False,17.0,['AC026806.5'],[9606],9606.0,...,2,Chordata,Arthropoda,Chordata,9606.0,Homo sapiens,species,primates,Homo,human
13176,NODE_31222_length_269_cov_0.760417,CMS002_021a_Rb_S136_L004,0.735128,0.049567,269,False,14.0,['XM_021845409.1'],[7159],7159.0,...,1,Arthropoda,Chordata,Arthropoda,7159.0,Aedes aegypti,species,mosquitos,Aedes,yellow fever mosquito
28660,NODE_49241_length_229_cov_0.822368,CMS002_021a_Rb_S136_L004,0.995633,0.925764,229,False,50.0,['BX088702.10'],[9606],9606.0,...,1,Chordata,Bacteria,Chordata,9606.0,Homo sapiens,species,primates,Homo,human
28705,NODE_1357_length_1326_cov_3.993595,CMS002_021a_Rb_S136_L004,0.306938,0.829563,1326,False,5.0,['XM_026726124.1'],[8673],8673.0,...,3,Chordata,Arthropoda,Arthropoda,43817.0,Culicinae,subfamily,mosquitos,,
38301,NODE_37662_length_251_cov_1.258621,CMS002_021a_Rb_S136_L004,0.672310,0.988048,251,False,13.0,['XM_008830759.2'],[1026970],1026970.0,...,4,Chordata,Arthropoda,Arthropoda,33392.0,Holometabola,cohort,insects,,
49305,NODE_12369_length_462_cov_1.493506,CMS002_021a_Rb_S136_L004,0.207851,0.993506,462,False,23.0,['XM_026318322.1'],[205130],205130.0,...,3,Chordata,Arthropoda,Arthropoda,53541.0,Stegomyia,subgenus,mosquitos,,
52644,NODE_11698_length_478_cov_1.092269,CMS002_021a_Rb_S136_L004,0.434481,0.707113,478,False,20.0,['AK112980.1'],[7719],7719.0,...,1,Chordata,Arthropoda,Arthropoda,680683.0,Amyelois transitella,species,moths,Amyelois,
55461,NODE_7261_length_229_cov_0.638158,CMS001_058_Ra_S9,0.956813,0.689956,229,False,66.0,['AL590814.5'],[9606],9606.0,...,2,Chordata,Arthropoda,Chordata,9606.0,Homo sapiens,species,primates,Homo,human
55764,NODE_6358_length_238_cov_0.167702,CMS001_058_Ra_S9,0.995798,0.764706,238,False,17.0,"['KX061874.1', 'KX061888.1', 'M30952.1', 'N...","[9593, 9601, 9600, 10090, 9925]",1437010.0,...,3,Chordata,Metazoa,Chordata,1437010.0,Boreoeutheria,no rank,placentals,,
56520,NODE_6425_length_237_cov_0.912500,CMS001_058_Ra_S9,1.000000,0.919831,237,False,98.0,['AC132219.3'],[9606],9606.0,...,1,Chordata,Arthropoda,Chordata,9606.0,Homo sapiens,species,primates,Homo,human


In [101]:
contig_quality["kingdom"].unique()

array(['Arthropoda', 'Bacteria', nan, 'Viruses', 'Metazoa', 'Chordata',
       'Viridiplantae', 'Fungi', 'Eukaryota', 'Archaea'], dtype=object)

In [63]:
contig_quality[(contig_quality["pmatch_gsnap"] < 0.1) & (contig_quality["pmatch_rapsearch"]<0.1) & (contig_quality["qlength"]<500)]





Unnamed: 0,qseqid,sample,pmatch_gsnap,pmatch_rapsearch,qlength,step_change,other_blast_contigs,blast_hits_gsnap_acc,blast_hits_gsnap_taxid,blast_hits_gsnap_lca,...,blast_rapsearch_ntax,kingdom_gsnap,kingdom_rapsearch,kingdom,taxid,scientific_name,rank,Division,Genus,CommonName
29,NODE_336_length_377_cov_1.440000,CMS002_046a_Rb_S191_L004,0.0,0.037206,377,False,1.0,[],[],,...,1,,Arthropoda,Arthropoda,7160.0,Aedes albopictus,species,mosquitos,Aedes,Asian tiger mosquito
93,NODE_238_length_448_cov_1.118598,CMS002_046a_Rb_S191_L004,0.0,0.000000,448,False,,[],[],,...,0,,,,0.0,,,,,
98,NODE_270_length_418_cov_0.856305,CMS002_046a_Rb_S191_L004,0.0,0.000000,418,False,,[],[],,...,0,,,,0.0,,,,,
107,NODE_313_length_392_cov_0.920635,CMS002_046a_Rb_S191_L004,0.0,0.000000,392,False,,[],[],,...,0,,,,0.0,,,,,
109,NODE_272_length_417_cov_4.091176,CMS002_046a_Rb_S191_L004,0.0,0.027406,417,False,15.0,[],[],,...,1,,Arthropoda,Arthropoda,7159.0,Aedes aegypti,species,mosquitos,Aedes,yellow fever mosquito
150,NODE_517_length_295_cov_1.357798,CMS002_046a_Rb_S191_L004,0.0,0.060593,295,False,18.0,[],[],,...,1,,Arthropoda,Arthropoda,43151.0,Anopheles darlingi,species,mosquitos,Anopheles,American malaria mosquito
151,NODE_777_length_252_cov_0.834286,CMS002_046a_Rb_S191_L004,0.0,0.046398,252,False,,[],[],,...,1,,Arthropoda,Arthropoda,180454.0,Anopheles gambiae str. PEST,no rank,mosquitos,Anopheles,
168,NODE_491_length_306_cov_0.956332,CMS002_046a_Rb_S191_L004,0.0,0.029230,306,False,1.0,[],[],,...,1,,Arthropoda,Arthropoda,7160.0,Aedes albopictus,species,mosquitos,Aedes,Asian tiger mosquito
217,NODE_521_length_291_cov_1.023364,CMS002_046a_Rb_S191_L004,0.0,0.000000,291,False,,[],[],,...,0,,,,0.0,,,,,
242,NODE_727_length_257_cov_0.811111,CMS002_046a_Rb_S191_L004,0.0,0.041764,257,False,7.0,[],[],,...,1,,Arthropoda,Arthropoda,7176.0,Culex quinquefasciatus,species,mosquitos,Culex,southern house mosquito


In [105]:
contig_quality.loc[(contig_quality["Division"]=="mosquitos") & (contig_quality["kingdom"]=="Chordata"), ["blast_hits_rapsearch_lca", "kingdom_gsnap", "kingdom_rapsearch", "pmatch_gsnap", "pmatch_rapsearch"]]





Unnamed: 0,blast_hits_rapsearch_lca,kingdom_gsnap,kingdom_rapsearch,pmatch_gsnap,pmatch_rapsearch


Percentage of contigs that we can resolve at the species level:

In [None]:
contig_quality[contig_quality["kingdom"]=="vertebrata"]["scientific_name"].value_counts()

In [None]:
contig_quality[contig_quality["kingdom"]=="vertebrata"][["blast_hits_gsnap_lca", "blast_hits_rapsearch_lca", "scientific_name"]]

In [None]:
np.mean(contig_quality["rank"].str.contains("species"))

In [None]:
contig_quality = pd.read_csv("contig_quality_df.tsv", index_col=False, sep="\t")

In [None]:
contig_quality[contig_quality["p"]]

In [None]:
contig_quality["blast_rapsearch_ntax"][contig_quality["blast_rapsearch_ntax"]>0].value_counts(normalize=True).sort_values(0)

In [None]:
summary_contigs_with_blast = contig_quality.groupby("sample").apply(lambda x: np.mean([num == 0 for num in x["pmatch_gsnap"]+x["pmatch_rapsearch"]]))
summary_contigs_with_blast.to_csv("summary_contigs_with_blast.tsv", sep="\t")

In [None]:
summary_contigs_with_blast.sort_values(ascending=False)

In [None]:
summary_contigs_with_blast[summary_contigs_with_blast==max(summary_contigs_with_blast)]

In [None]:
%%R -i summary_contigs_with_blast -w 5 -h 3 --units in -r 200
data.frame(summary_contigs_with_blast) %>%
    rownames_to_column("sample") %>%
    rename(contigs_with_blast_hits=summary_contigs_with_blast) %>%
    ggplot() %>%
    `+`(theme_bw()) %>%
    `+`(geom_bar(aes(x=sample, y=contigs_with_blast_hits), stat="identity")) %>%
    `+`(xlab("Sample")) %>%
    `+`(ylab("Proportion of contigs with BLAST hits")) %>%
    `+`(theme(text=element_text(family="Helvetica")))

In [None]:
[ncbi.get_taxid_translator(str(x)) for x in contig_quality[contig_quality["sample"]=="CMS001_water3_Qiagen_S26"]["blast_hits_rapsearch_lca"].unique()]

In [None]:
x = contig_len_df["sample"].unique()[20]
sample_df = blast_results_df[blast_results_df["sample"]==x]
y = sample_df.qseqid.unique()[100]
contig_df = sample_df[sample_df.qseqid==y]
metric = {'qseqid':y, 'sample':x, 'pmatch_gsnap':0, 'pmatch_rapsearch':0, 
          'qlength':contig_df.head(1).qlength,
          'step_change':False, 
          'other_blast_contigs':False}
if contig_df.blast_type.str.contains("gsnap").any():
    metric["pmatch_gsnap"] = get_pmatch_per_contig(contig_df[contig_df["blast_type"]=="gsnap"])
if contig_df.blast_type.str.contains("rapsearch2").any():
    metric["pmatch_rapsearch"] = get_pmatch_per_contig(contig_df[contig_df["blast_type"]=="rapsearch2"])

In [None]:
metric

In [None]:
df.iloc[2, :]