In [None]:
%%bash
cd /mnt/data
ftp ftp.ncbi.nlm.nih.gov
anonymous
lucy.li@czbiohub.org
cd pub/taxonomy/accession2taxid/
passive on
get nucl_gb.accession2taxid.gz
get prot.accession2taxid.gz
bye
gunzip -k nucl_gb.accession2taxid.gz 
gunzip -k prot.accession2taxid.gz 
mkdir nucl_gb; mkdir prot
mkdir nucleotide_acc; mkdir protein_acc
mkdir nucleotide_matches; mkdir protein_matches

In [16]:
%%bash
aws s3 sync s3://czbiohub-mosquito/contigs/ contigs
mkdir contig_lineages
mkdir blast_results

In [52]:
import pandas as pd
from ete3 import NCBITaxa
import os
import subprocess
import math
import re
import numpy as np
from Bio import Entrez
import time
import json
import dill
def get_taxid (acc, dir_name):
    fn = dir_name+"/"+acc+".txt"
    if (os.path.isfile(fn)):
        taxid = pd.read_csv(fn, sep=" ", header=None).iloc[0][1]
    else:
        db_name = "nucleotide"
        if ('protein' in dir_name):
            db_name = "protein"
        taxid = int(Entrez.read(Entrez.esummary(id=acc, db=db_name, rettype="gb", retmode="text"))[0]['TaxId'])
        output_string = acc+" "+str(taxid)
        with open (fn, 'w') as f:
            f.write("%s" % output_string)
    return (taxid)

In [19]:
ncbi = NCBITaxa()
Entrez.email = "lucy.li@czbiohub.org"

In [3]:
blast_col_names = ["qseqid", "sseqid", "pident", "length", "mismatch", "gapopen", 
                   "qstart", "qend", "sstart", "send", "evalue", "bitscore"]

In [4]:
blast_results = []
contig_coverage = {}
for path, subdirs, files in os.walk("contigs"):
    for name in files:
        fn = os.path.join(path, name)
        sample_name = os.path.basename(path)
        if ("blast" in name):
            tb = pd.read_csv(fn, sep="\t", header=None, names=blast_col_names)
            tb = tb.assign(blast_type=name.split('.')[0], sample=sample_name)
            blast_results.append(tb)
        if (".json" in name):
            with open (fn) as json_file:
                contig_coverage[sample_name] = json.load(json_file)
                for key in contig_coverage[sample_name]:
                    contig_coverage[sample_name][key]['len'] = int(key.split('_')[3])
                    

In [5]:
contig_len = {}
for sample_key in contig_coverage:
    for node_key in contig_coverage[sample_key]:
        contig_len[str(len(contig_len))] = {"sample":sample_key, "qseqid":node_key, 
                                            "qlength":int(node_key.split('_')[3])}

In [6]:
contig_len_df = pd.DataFrame.from_dict(contig_len, orient='index').reset_index(drop=True)

In [7]:
contig_len_df.to_csv("contig_len_df.csv")

In [8]:
blast_results_df = pd.concat(blast_results)
blast_results_df = pd.merge(blast_results_df, contig_len_df, how='left')
blast_results_df = blast_results_df.assign(qlength=blast_results_df["qseqid"].apply(lambda x: int(x.split('_')[3])))
blast_results_df = blast_results_df.assign(qcov=abs(blast_results_df.qstart-blast_results_df.qend)+(1-blast_results_df.mismatch)*[1 if x=="gsnap" else 3 for x in blast_results_df.blast_type])
blast_results_df = blast_results_df.assign(qcov_prop=blast_results_df.qcov/blast_results_df.qlength)
blast_results_df = blast_results_df.assign(pmatch=blast_results_df.qcov_prop * blast_results_df.pident / 100)


In [9]:
for blast_result_df_sample in blast_results_df["sample"].unique():
    blast_results_df[blast_results_df["sample"]==blast_result_df_sample].to_csv("blast_results/"+blast_result_df_sample+".csv")

    
    

In [10]:
blast_results_df.to_csv("blast_results_df.csv")

## Examples

In [14]:
blast_results_df[blast_results_df.pmatch>0.9].sort_values(by="qlength").head(10)

Unnamed: 0,qseqid,sseqid,pident,length,mismatch,gapopen,qstart,qend,sstart,send,evalue,bitscore,blast_type,sample,qlength,qcov,qcov_prop,pmatch
4673073,NODE_21001_length_78_cov_657.000000,KF687542.1,96.154,78,3,0,1,78,331,408,2.66e-28,128.0,gsnap,CMS001_016_Ra_S6,78,75,0.961538,0.924558
4477887,NODE_10828_length_78_cov_87.000000,AF166259.1,100.0,77,0,0,2,78,935,1011,2.51e-33,143.0,gsnap,CMS002_038a_Rb_S172_L004,78,77,0.987179,0.987179
4477888,NODE_10828_length_78_cov_87.000000,GQ375056.1,100.0,77,0,0,2,78,358,434,2.51e-33,143.0,gsnap,CMS002_038a_Rb_S172_L004,78,77,0.987179,0.987179
4673076,NODE_21004_length_78_cov_506.000000,KF687542.1,96.154,78,3,0,1,78,406,329,2.66e-28,128.0,gsnap,CMS001_016_Ra_S6,78,75,0.961538,0.924558
4673077,NODE_21005_length_78_cov_475.000000,KF687542.1,96.154,78,3,0,1,78,405,328,2.66e-28,128.0,gsnap,CMS001_016_Ra_S6,78,75,0.961538,0.924558
4159688,NODE_6126_length_78_cov_527.000000,JN006838.1,100.0,78,0,0,1,78,29,106,1.09e-33,145.0,gsnap,CMS001_035_Ra_S20,78,78,1.0,1.0
4159687,NODE_6126_length_78_cov_527.000000,JN006844.1,100.0,78,0,0,1,78,27,104,1.09e-33,145.0,gsnap,CMS001_035_Ra_S20,78,78,1.0,1.0
4159686,NODE_6126_length_78_cov_527.000000,AB242275.1,100.0,78,0,0,1,78,21,98,1.09e-33,145.0,gsnap,CMS001_035_Ra_S20,78,78,1.0,1.0
4159685,NODE_6126_length_78_cov_527.000000,HQ107970.1,100.0,78,0,0,1,78,29,106,1.09e-33,145.0,gsnap,CMS001_035_Ra_S20,78,78,1.0,1.0
4159684,NODE_6126_length_78_cov_527.000000,JN006830.1,100.0,78,0,0,1,78,29,106,1.09e-33,145.0,gsnap,CMS001_035_Ra_S20,78,78,1.0,1.0


In [70]:
blast_results_df[blast_results_df.qseqid=="NODE_177_length_1314_cov_2.272433"]

Unnamed: 0,qseqid,sseqid,pident,length,mismatch,gapopen,qstart,qend,sstart,send,evalue,bitscore,blast_type,sample,qlength,qcov,qcov_prop,pmatch
2734863,NODE_177_length_1314_cov_2.272433,WP_063865053.1,96.753,154,5,0,71,532,1,154,2.26e-154,312.0,rapsearch2,CMS002_032a_Rb_S166_L004,1314,449,0.341705,0.33061
2734864,NODE_177_length_1314_cov_2.272433,WP_063865053.1,98.551,138,2,0,516,929,149,286,2.26e-154,251.0,rapsearch2,CMS002_032a_Rb_S166_L004,1314,410,0.312024,0.307503
2734865,NODE_177_length_1314_cov_2.272433,WP_032490026.1,96.104,154,6,0,71,532,1,154,3.82e-154,309.0,rapsearch2,CMS002_032a_Rb_S166_L004,1314,446,0.339422,0.326198
2734866,NODE_177_length_1314_cov_2.272433,WP_032490026.1,99.275,138,1,0,516,929,149,286,3.82e-154,253.0,rapsearch2,CMS002_032a_Rb_S166_L004,1314,413,0.314307,0.312029
2734867,NODE_177_length_1314_cov_2.272433,WP_063864802.1,96.753,154,5,0,71,532,1,154,1.4600000000000002e-153,313.0,rapsearch2,CMS002_032a_Rb_S166_L004,1314,449,0.341705,0.33061
2734868,NODE_177_length_1314_cov_2.272433,WP_063864802.1,97.826,138,3,0,516,929,149,286,1.4600000000000002e-153,248.0,rapsearch2,CMS002_032a_Rb_S166_L004,1314,407,0.309741,0.303007
2734869,NODE_177_length_1314_cov_2.272433,WP_063865132.1,94.805,154,8,0,71,532,1,154,1.03e-151,305.0,rapsearch2,CMS002_032a_Rb_S166_L004,1314,440,0.334855,0.31746
2734870,NODE_177_length_1314_cov_2.272433,WP_063865132.1,98.551,138,2,0,516,929,149,286,1.03e-151,249.0,rapsearch2,CMS002_032a_Rb_S166_L004,1314,410,0.312024,0.307503
2734871,NODE_177_length_1314_cov_2.272433,WP_063864896.1,96.753,154,5,0,71,532,1,154,4.39e-148,312.0,rapsearch2,CMS002_032a_Rb_S166_L004,1314,449,0.341705,0.33061
2734872,NODE_177_length_1314_cov_2.272433,WP_063864896.1,98.551,138,2,0,516,929,149,286,4.39e-148,230.0,rapsearch2,CMS002_032a_Rb_S166_L004,1314,410,0.312024,0.307503


In [None]:
blast_results_df[blast_results_df.qseqid=="NODE_159_length_786_cov_70.668547"]

In [168]:
coverage_of_interest = []
for key_x in contig_coverage.keys():
    for node_x in contig_coverage[key_x].keys():
        cov = contig_coverage[key_x][node_x]['coverage']
        depth_map = [x<3 for x in cov]
        metric = sum(depth_map)/len(depth_map)
        if ((sum(cov)/len(cov)) <= 20):
            next
        if (metric > 0.4):
            coverage_of_interest.append({'sample':key_x, 'qseqid':node_x, 'low_depth_prop':metric})
coverage_of_interest = pd.DataFrame(coverage_of_interest)

In [169]:
coverage_of_interest[coverage_of_interest['sample']=="CMS001_001_Ra_S1"]

Unnamed: 0,low_depth_prop,qseqid,sample
14760,0.586190,NODE_29_length_1985_cov_1.600105,CMS001_001_Ra_S1
14761,0.506568,NODE_37_length_1751_cov_0.948029,CMS001_001_Ra_S1
14762,0.575262,NODE_56_length_1538_cov_0.534565,CMS001_001_Ra_S1
14763,0.532934,NODE_58_length_1532_cov_0.663918,CMS001_001_Ra_S1
14764,0.636059,NODE_60_length_1492_cov_0.653710,CMS001_001_Ra_S1
14765,0.508949,NODE_79_length_1391_cov_0.782344,CMS001_001_Ra_S1
14766,0.421547,NODE_81_length_1383_cov_1.592649,CMS001_001_Ra_S1
14767,0.675207,NODE_88_length_1327_cov_0.739200,CMS001_001_Ra_S1
14768,0.437158,NODE_93_length_1309_cov_1.077110,CMS001_001_Ra_S1
14769,0.547486,NODE_94_length_1303_cov_1.040783,CMS001_001_Ra_S1


In [170]:
coverage_of_interest.to_csv("coverage_of_interest.csv")

## Nucleotide to taxonomy ID conversion

In [12]:
nucleotide_acc = blast_results_df[blast_results_df.blast_type=='gsnap'].sseqid.unique()
protein_acc = blast_results_df[blast_results_df.blast_type!='gsnap'].sseqid.unique()

In [13]:
with open ("nucleotide_acc.txt", "w") as f:
    for x in blast_results_df[blast_results_df.blast_type=='gsnap'].sseqid.unique():
        f.write("%s\n" % x)

with open ("protein_acc.txt", "w") as f:
    for x in blast_results_df[blast_results_df.blast_type!='gsnap'].sseqid.unique():
        f.write("%s\n" % x)

In [54]:
%%bash
split -d -b 100000 nucleotide_acc.txt nucleotide_acc/nucleotide_acc
split -d -b 100000 protein_acc.txt protein_acc/protein_acc
mkdir nucleotide_matches_unique
mkdir protein_matches_unique
find_match () {
    acc_num=$1
    db_file=$2
    output_dir=$3
    if [ -f $output_dir/$acc_num.txt ]
    then
        echo $output_dir/$acc_num.txt already exists
        return
    fi
    match_result=$(grep -m 1 $acc_num $db_file)
    if [ ! -z "$match_result" ]
    then
        echo "$match_result" | awk '{print $2,$3}' > $output_dir/$acc_num.txt
    else
        echo $acc_num not found
        echo $acc_num >> ${db_file/.accession2taxid/_missing_acc}
    fi
}
export -f find_match
find nucleotide_matches_unique -size 0 | parallel rm {}
for acc_file in `find nucleotide_acc -type f`
do
    cat $acc_file | parallel find_match {} nucl_gb.accession2taxid nucleotide_matches_unique
done

find protein_matches_unique -size 0 | parallel rm {}
for acc_file in `find protein_acc -type f`
do
    cat $acc_file | parallel find_match {} prot.accession2taxid protein_matches_unique
done



Process is interrupted.


In [58]:
print (str(len(nucleotide_acc))+" unique blast hits (gsnap).")
print (str(len(os.listdir("nucleotide_matches_unique")))+" nucleotide hits found.")
print (str(len(protein_acc))+" unique blast hits (rapsearch2).")
print (str(len(os.listdir("protein_matches_unique")))+" protein hits found.")

15472 unique blast hits (gsnap).
15413 nucleotide hits found.
133960 unique blast hits (rapsearch2).
133746 protein hits found.


Some accession numbers are out of date.

In [61]:
nucleotide_nomatch = [line.strip() for line in open("nucl_gb_missing_acc", 'r')]
nucleotide_nomatch_taxid = [get_taxid(acc_x, "nucleotide_matches_unique") for acc_x in nucleotide_nomatch]

In [64]:
protein_nomatch = [line.strip() for line in open("prot_missing_acc", 'r')]
protein_nomatch_taxid = [get_taxid(acc_x, "protein_matches_unique") for acc_x in protein_nomatch]

## Nucleotide hits

In [87]:
nucleotide_matches = pd.read_table("nucleotide_matches_unique.txt", sep=" ", header=None, 
                                   names=['sseqid', 'taxid'])
nucleotide_matches = nucleotide_matches.assign(lineage=nucleotide_matches.taxid.apply(lambda x: ncbi.get_lineage(x)))

In [89]:
blast_results_df_gsnap = pd.merge(blast_results_df[blast_results_df.blast_type=="gsnap"], nucleotide_matches, how="left")

In [154]:
gsnap_chordata_indices = []
for x in range(len(blast_results_df_gsnap.index)):
    result = blast_results_df_gsnap.lineage[x]
    if isinstance(result, (list,)):
        gsnap_chordata_indices.append(7711 in result)
    else:
        gsnap_chordata_indices.append(7711 == result)

In [159]:
blast_results_df_gsnap_chordata = blast_results_df_gsnap.loc[blast_results_df_gsnap.index[gsnap_chordata_indices]]

In [161]:
blast_results_df_gsnap_chordata

Unnamed: 0,qseqid,sseqid,pident,length,mismatch,gapopen,qstart,qend,sstart,send,evalue,bitscore,blast_type,sample,qlength,qcov,qcov_prop,pmatch,taxid,lineage
19,NODE_83_length_1606_cov_2.077175,CP011907.1,82.716,81,14,0,509,589,9153776,9153696,1.130000e-10,75.0,gsnap,CMS001_003_Ra_S2,1606.0,80,0.0498132,0.0412035,112262.0,"[1, 131567, 2759, 33154, 33208, 6072, 33213, 3..."
20,NODE_83_length_1606_cov_2.077175,CP027094.1,81.707,82,13,2,509,589,9151364,9151284,1.890000e-08,67.6,gsnap,CMS001_003_Ra_S2,1606.0,82,0.0510585,0.0417184,72004.0,"[1, 131567, 2759, 33154, 33208, 6072, 33213, 3..."
21,NODE_95_length_1539_cov_15.399453,KM612277.1,97.013,1540,45,1,1,1539,2644,1105,0.000000e+00,2588.0,gsnap,CMS001_003_Ra_S2,1539.0,1539,1,0.97013,9874.0,"[1, 131567, 2759, 33154, 33208, 6072, 33213, 3..."
22,NODE_95_length_1539_cov_15.399453,JN632657.1,97.068,1535,44,1,6,1539,2638,1104,0.000000e+00,2584.0,gsnap,CMS001_003_Ra_S2,1539.0,1534,0.996751,0.967526,43334.0,"[1, 131567, 2759, 33154, 33208, 6072, 33213, 3..."
23,NODE_95_length_1539_cov_15.399453,KM612274.1,96.883,1540,47,1,1,1539,2644,1105,0.000000e+00,2577.0,gsnap,CMS001_003_Ra_S2,1539.0,1539,1,0.96883,9874.0,"[1, 131567, 2759, 33154, 33208, 6072, 33213, 3..."
24,NODE_95_length_1539_cov_15.399453,HQ332445.1,96.688,1540,50,1,1,1539,14176,12637,0.000000e+00,2566.0,gsnap,CMS001_003_Ra_S2,1539.0,1539,1,0.96688,9874.0,"[1, 131567, 2759, 33154, 33208, 6072, 33213, 3..."
25,NODE_95_length_1539_cov_15.399453,KM612272.1,96.753,1540,49,1,1,1539,2644,1105,0.000000e+00,2566.0,gsnap,CMS001_003_Ra_S2,1539.0,1539,1,0.96753,9874.0,"[1, 131567, 2759, 33154, 33208, 6072, 33213, 3..."
28,NODE_208_length_1211_cov_2.247795,CP027090.1,96.970,33,0,1,88,119,32484363,32484331,1.110000e-04,54.7,gsnap,CMS001_003_Ra_S2,1211.0,32,0.0264244,0.0256238,72004.0,"[1, 131567, 2759, 33154, 33208, 6072, 33213, 3..."
30,NODE_285_length_1096_cov_0.607458,CP027085.1,89.718,1099,103,9,2,1096,35574611,35575703,0.000000e+00,1395.0,gsnap,CMS001_003_Ra_S2,1086.0,1103,1.01565,0.911224,72004.0,"[1, 131567, 2759, 33154, 33208, 6072, 33213, 3..."
31,NODE_285_length_1096_cov_0.607458,CP027085.1,89.273,1100,111,6,1,1096,44124367,44125463,0.000000e+00,1371.0,gsnap,CMS001_003_Ra_S2,1086.0,1101,1.01381,0.905061,72004.0,"[1, 131567, 2759, 33154, 33208, 6072, 33213, 3..."


In [160]:
blast_results_df_gsnap_chordata.to_csv("blast_results_df_gsnap_chordata.csv")