In this notebook we estimate the precision and recall of SIMBA (as well as Kraken2) for detecting bacteria, fungi and viruses using various truth datasets created in the previous notebook. 


### Loading libraries

In [1]:
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
plt.rcParams['figure.dpi'] = 150
plt.rcParams['savefig.dpi'] = 200
plt.rcParams['pdf.fonttype'] = 42
plt.rcParams['ps.fonttype'] = 42

params = {'legend.fontsize': 'small',
         'axes.labelsize': 'small',
         'axes.titlesize':'small',
         'xtick.labelsize':'small',
         'ytick.labelsize':'small',
         'figure.figsize': (3, 2)}
plt.rcParams.update(params)
import os
import glob
import re
import itertools
import collections
from collections import Counter
import math
import random
from random import randrange
import string
import subprocess
import numpy as np
import pandas as pd
from pandas import ExcelWriter
from pandas import ExcelFile
import seaborn as sns
import phylopandas as ph
cmap = sns.cm.rocket_r
sns.set_style("white")
import anndata
from anndata import read_h5ad
from anndata import AnnData
import phylopandas as ph
import wget


### directories

In [32]:
mainDir = '/oak/stanford/groups/quake/gita/raw/tab3-14_20210420/benchmarking_post_reviews/bacterial_test/'
mainDir1= mainDir + 'micoNT_blastn/'
mainDir2 = '/oak/stanford/groups/quake/gita/raw/tab3-14_20210420/benchmarking_post_reviews/'
dbDir = '/oak/stanford/groups/quake/gita/raw/database/taxonomyNCBI20200125/'
paper = '/oak/stanford/groups/quake/gita/raw/nb/microbe/paper/forGitHub/human_tissue_microbiome_atlas/post_review/'
tables = paper + 'tables/'

### a few useful functions for estimating various metrics like precision

In [3]:
def param_estimate(true_pos, false_pos, false_neg):
    '''estimating precision, recall and f1'''
    precision = true_pos / (true_pos + false_pos)
    recall = true_pos / (true_pos + false_neg)
    f1 = 2*(precision * recall) / (precision + recall)
    return (np.round(precision, 3), np.round(recall, 3), np.round(f1, 3))

In [4]:
def cat(tax, df, df2): #I use df = found, and df2=fin found is a subset of fin that produced a hit result
    tp = df[(df[ tax + '_test']==df[tax +'_truth']) | (df['staxids_test']==df['taxid_truth'])].shape[0]
    fp = df[(df[tax +'_test']!= df[tax + '_truth']) & (df['staxids_test']!=df['taxid_truth'])].shape[0]
    fn= df2[df2['found_test'].isna()].shape[0]
    precision, recall, f1=param_estimate(tp, fp, fn)
    return(precision, recall, f1)

def cat2(tax, df, df2):
    tp = df[(df[ tax + '_test']==df[tax +'_truth']) | (df['staxids_test']==df['taxid_truth'])].shape[0]
    fp = df[(df[tax +'_test']!= df[tax + '_truth']) & (df['staxids_test']!=df['taxid_truth'])].shape[0]
    fn= df2[df2['found_test'].isna()].shape[0]
    return(tp,fp,fn)

### Taxonomic information from ncbi

In [5]:
tax = pd.read_csv(dbDir + 'ncbi_lineages_2021-01-26.csv')
#want to take only the following columns from the lineage dataframe tax 
tax_short=tax[['tax_id','superkingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species']]
tax_short = tax_short.rename(columns={'tax_id':'taxid'})


  interactivity=interactivity, compiler=compiler, result=result)


### Analyzing results for batch of 10 (5000 sequences from gut bacteria, 100 per genome)

reading the output csvs of SIMBA, concatenating them into one csv. They were divided in the first place to run 10 parallel SIMBA jobs. I have a column called found that allows me to look for sequences that produced a hit. 

In [52]:
results=pd.DataFrame({})
for file in glob.glob(mainDir + 'batch_of_ten/micoNT_blastn/*_deduplicated.csv'):
    bl = pd.read_csv(file)
    bl['found'] = ['yes']*bl.shape[0]
    results=pd.concat([results, bl])
results['found'] = ['yes']*results.shape[0]
results['staxids'] = results['staxids'].astype(str)
results['staxids'] = results['staxids'].apply(lambda x: x.split(';')[0]) #sometimes two taxids are provided, taking just one

adding taxonomic lineage info

In [53]:
tax_shortcopy = tax_short.copy()
tax_shortcopy['taxid'] = tax_shortcopy['taxid'].astype('str')
tax_shortcopy = tax_shortcopy.rename(columns={"taxid":"staxids"})
results_df=results.merge(tax_shortcopy, on='staxids', how='left')
results_df.columns = [str(col) + '_test' for col in results_df.columns]
results_df.shape

(1472, 24)

now reading the input csv files which contain the true sequences

In [54]:
input_df=pd.DataFrame({})
for file in glob.glob(mainDir + 'batch_of_ten/fasta_files/*.csv'):
    bl = pd.read_csv(file)
    input_df=pd.concat([input_df, bl])

adding taxonomic lineage info to truth dataset. Also adding truth suffix to the lineage

In [55]:
tax_shortcopy = tax_shortcopy.rename(columns={"staxids":"taxid"})
input_df['taxid'] = input_df['taxid'].astype(str)
cols=['superkingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species']
input_df=input_df.merge(tax_shortcopy, on='taxid', how='left')
input_df = input_df.drop(columns=['Unnamed: 0'])
input_df.columns = [str(col) + '_truth' for col in input_df.columns]


merging the two datasets ( truth, and results) on a shared column

In [56]:
input_df['organism_name_truth'] = input_df['organism_name_truth'].apply(lambda x: "_".join(x.split(' '))) 
input_df['seqName_test'] = input_df['taxid_truth'] + '_' + input_df['id2_truth'] + "_" + input_df['organism_name_truth']

now merging the results dataframe with the truth dataframe to see which sequences produced results. 

In [57]:
fin = input_df.merge(results_df, on='seqName_test', how='outer')


In [75]:
fin.to_csv(mainDir2 + 'results/gut_fin.csv', index=False)

time for measuring some metrics!

first, how many sequences out of 5000 produced a hit? 

In [90]:
a=['superkingdom_test', 'phylum_test','class_test','order_test','family_test','genus_test']
found = fin[~fin[a].isnull().any(axis=1)] #taking output seqs with incomplete lineage
print(str(found.shape[0]) + ' sequences out of 5000 input sequences')



1446 sequences out of 5000 input sequences


the vast majority of false positives at the species level come from sequences that are shared between highly related organisms

In [91]:
cols = ['superkingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species']
ind = ['precision', 'recall', 'F1']
table=pd.DataFrame(columns=cols, index = ind)


for category in cols:
    table[category]= cat(category, df=found, df2=fin)

table = table.rename(columns={'superkingdom': 'domain'}).iloc[:2,:]
   
table

Unnamed: 0,domain,phylum,class,order,family,genus,species
precision,1.0,0.999,0.997,0.996,0.973,0.947,0.699
recall,0.291,0.291,0.29,0.29,0.285,0.28,0.223


In [20]:
cols = ['superkingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species']
ind = ['true positives', 'false positives', 'false negatives']
table2=pd.DataFrame(columns=cols, index = ind)


for category in cols:
    table2[category]= cat2(category, df=found, df2=fin)

    
table2

Unnamed: 0,superkingdom,phylum,class,order,family,genus,species
true positives,1465,1455,1452,1450,1410,1369,1011
false positives,7,17,20,22,62,103,461
false negatives,3528,3528,3528,3528,3528,3528,3528


what are the 7 seqs that are getting misclassified at the domain level? 

In [92]:
tax = 'superkingdom'
df = found
df2 = fin 
tp = df[(df[ tax + '_test']==df[tax +'_truth']) | (df['staxids_test']==df['taxid_truth'])]
fp = df[(df[tax +'_test']!= df[tax + '_truth']) & (df['staxids_test']!=df['taxid_truth'])]
fn= df2[df2['found_test'].isna()].shape[0]



### Testing precision/recall of bacterial sequences from Refseq
RefSeq bacterial truth dataset (100 seqs per genome, 50 genomes, 10 batches for a total of 5000 seqs)


In [102]:
results=pd.DataFrame({})
for file in glob.glob(mainDir + 'refseq/micoNT_blastn/*_deduplicated.csv'):
    bl = pd.read_csv(file)
    bl['found'] = ['yes']*bl.shape[0]
    results=pd.concat([results, bl])

results['found'] = ['yes']*results.shape[0]
results['staxids'] = results['staxids'].astype(str)
results['staxids'] = results['staxids'].apply(lambda x: x.split(';')[0]) #sometimes two taxids are provided, taking just one


In [95]:
tax_shortcopy = tax_short.copy()
tax_shortcopy['taxid'] = tax_shortcopy['taxid'].astype(str)

tax_shortcopy = tax_shortcopy.rename(columns={"taxid":"staxids"})
results_df=results.merge(tax_shortcopy, on='staxids', how='left')
results_df.columns = [str(col) + '_test' for col in results_df.columns]
results_df.shape

(261, 24)

In [96]:
input_df=pd.DataFrame({})
for file in glob.glob(mainDir + 'refseq/fasta_files/*.csv'):
    bl = pd.read_csv(file)
    input_df=pd.concat([input_df, bl])

In [97]:
cols=['superkingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species']

input_df = input_df.drop(columns=['Unnamed: 0'])
input_df.columns = [str(col) + '_truth' for col in input_df.columns]


creating a column to merge the two datasets on called id2

In [98]:
input_df['id2'] = input_df['id2_truth'].copy()
results_df['id2'] =results_df['seqName_test'].apply(lambda x: '_'.join(x.split('_')[1:4]))

In [99]:
fin = input_df.merge(results_df, on='id2', how='outer')

In [146]:
fin.to_csv(mainDir2 + 'results/refseq_fin.csv', index=False)

how many genera does this dataset include

In [100]:
a=['superkingdom_test', 'phylum_test','class_test','order_test','family_test','genus_test']
found = fin[~fin[a].isnull().any(axis=1)] #taking output seqs with incomplete lineage
print(str(found.shape[0]) + ' sequences out of 5000 input sequences')


261 sequences out of 5000 input sequences


In [103]:
cols = ['superkingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species']
ind = ['precision', 'recall', 'F1']
table=pd.DataFrame(columns=cols, index = ind)


for category in cols:
    table[category]= cat(category, df=found, df2=fin)
    
table = table.rename(columns={'superkingdom': 'domain'}).iloc[:2,:]
table

Unnamed: 0,domain,phylum,class,order,family,genus,species
precision,1.0,1.0,1.0,0.996,0.992,0.985,0.897
recall,0.052,0.052,0.052,0.052,0.052,0.051,0.047


In [104]:
cols = ['superkingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species']
ind = ['true positives', 'false positives', 'false negatives']
table2=pd.DataFrame(columns=cols, index = ind)


for category in cols:
    table2[category]= cat2(category, df=found, df2=fin)
    
table2

Unnamed: 0,superkingdom,phylum,class,order,family,genus,species
true positives,261,261,261,260,259,257,234
false positives,0,0,0,1,2,4,27
false negatives,4738,4738,4738,4738,4738,4738,4738


In [30]:
found[(found['species_truth']!=found['species_test'])][['species_truth', 'species_test', 'pident_test', 'length_test']].drop_duplicates()

Unnamed: 0,species_truth,species_test,pident_test,length_test
1976,Clostridium perfringens,Clostridium butyricum,100.0,100.0
2270,Nitrosomonas europaea,uncultured Nitrosomonas sp.,100.0,100.0
2271,Nitrosomonas europaea,Nitrosospira multiformis,100.0,100.0
2272,Nitrosomonas europaea,Pigmentiphaga aceris,100.0,100.0
2600,Sphingomonas taxi,Sphingomonas sp. CL5.1,100.0,100.0
2602,Sphingomonas taxi,Sphingomonas sp. HMP9,100.0,100.0
3711,Xanthomonas cucurbitae,Xanthomonas campestris,100.0,100.0
4100,Yersinia pseudotuberculosis,Yersinia pestis,100.0,100.0
4180,Yersinia pseudotuberculosis,Serratia marcescens,100.0,100.0
4390,Pseudomonas fluorescens,Pseudomonas sp. 43A,100.0,100.0


### HOMD (human oral microbiome database)

In [105]:
results=pd.DataFrame({})
for file in glob.glob(mainDir + 'homd/micoNT_blastn/*_deduplicated.csv'):
    if 'batch_3000' not in file: #this file didn't produce any hits so skipping reading this csv since it won't have the right columns
        bl = pd.read_csv(file)
        bl['found'] = ['yes']*bl.shape[0]
        results=pd.concat([results, bl])

    results['found'] = ['yes']*results.shape[0]
    results['staxids'] = results['staxids'].astype(str)
    results['staxids'] = results['staxids'].apply(lambda x: x.split(';')[0]) #sometimes two taxids are provided, taking just one


In [106]:
tax_shortcopy = tax_short.copy()
tax_shortcopy['taxid'] = tax_shortcopy['taxid'].astype(str)

tax_shortcopy = tax_shortcopy.rename(columns={"taxid":"staxids"})
results_df=results.merge(tax_shortcopy, on='staxids', how='left')
results_df.columns = [str(col) + '_test' for col in results_df.columns]
results_df.shape

(945, 24)

In [107]:
input_df=pd.DataFrame({})
for file in glob.glob(mainDir + 'homd/fasta_files/*.csv'):
    if 'batch_3500' not in file: #this file didn't run yet so not including it 
        bl = pd.read_csv(file)
        input_df=pd.concat([input_df, bl])

In [108]:
cols=['superkingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species']

input_df = input_df.drop(columns=['Unnamed: 0'])
input_df.columns = [str(col) + '_truth' for col in input_df.columns]


input_df['id2'] = input_df['id2_truth'].copy()
results_df['id2'] =results_df['seqName_test'].apply(lambda x: '_'.join(x.split('_')[1:4]))

fin = input_df.merge(results_df, on='id2', how='outer')



In [139]:
fin.to_csv(mainDir2 + 'results/homd_fin.csv', index=False)

In [111]:
cols = ['superkingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species']
ind = ['precision', 'recall', 'F1']
table=pd.DataFrame(columns=cols, index = ind)

a=['superkingdom_test', 'phylum_test','class_test','order_test','family_test','genus_test']
found = fin[~fin[a].isnull().any(axis=1)] #taking output seqs with incomplete lineage

for category in cols:
    table[category]= cat(category, df=found, df2=fin)
    
table = table.rename(columns={'superkingdom': 'domain'}).iloc[:2,:]
table

Unnamed: 0,domain,phylum,class,order,family,genus,species
precision,1.0,1.0,0.996,0.99,0.987,0.974,0.854
recall,0.209,0.209,0.208,0.207,0.207,0.205,0.184


In [45]:
cols = ['superkingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species']
ind = ['true positives', 'false positives', 'false negatives']
table2=pd.DataFrame(columns=cols, index = ind)


for category in cols:
    table2[category]= cat2(category, df=found, df2=fin)
    
table2

Unnamed: 0,superkingdom,phylum,class,order,family,genus,species
true positives,945,941,937,932,927,917,804
false positives,0,4,8,13,18,28,141
false negatives,3552,3552,3552,3552,3552,3552,3552


In [898]:
cols = ['superkingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species']
ind = ['precision', 'recall', 'F1']
table=pd.DataFrame(columns=cols, index = ind)


chuncks=np.array_split(fin, 5)
for chunck in chuncks:
    df=pd.DataFrame(chunck)
    found =df[df['found_test']=='yes']
    for category in cols:
        table[category]= cat2(category)
        
    print(table)

           superkingdom  phylum  class  order  family  genus  species
precision           226     226    226    226     224    225      211
recall                0       0      0      0       2      1       15
F1                 3552    3552   3552   3552    3552   3552     3552
           superkingdom  phylum  class  order  family  genus  species
precision           171     171    171    171     171    171      122
recall                0       0      0      0       0      0       49
F1                 3552    3552   3552   3552    3552   3552     3552
           superkingdom  phylum  class  order  family  genus  species
precision           137     137    137    135     135    132      106
recall                0       0      0      2       2      5       31
F1                 3552    3552   3552   3552    3552   3552     3552
           superkingdom  phylum  class  order  family  genus  species
precision           186     182    182    179     177    176      172
recall              

### human sequences against the viral branch of SIMBA

In [131]:
humdir = '/oak/stanford/groups/quake/gita/raw/tab3-14_20210420/benchmarking_post_reviews/human_test/'

In [132]:
results=pd.DataFrame({})
for file in glob.glob(humdir + 'virNTblastn/*_deduplicated.csv'):
    bl = pd.read_csv(file)
    bl['found'] = ['yes']*bl.shape[0]
    results=pd.concat([results, bl])

results['found'] = ['yes']*results.shape[0]
results['staxids'] = results['staxids'].astype(str)
results['staxids'] = results['staxids'].apply(lambda x: x.split(';')[0]) #sometimes two taxids are provided, taking just one


number of sequences that had been passed onto NT, but were correctly identified as human in origin

In [646]:
results.seqName.nunique()

65

In [394]:
results.pathogen.value_counts()

Homo sapiens isolate hybrid A9-21 NOR distal junction on HSA21 genomic sequence                                 32
Eukaryotic synthetic construct chromosome 19                                                                     8
Homo sapiens isolate hybrid A9-13 NOR distal junction on HSA13 genomic sequence                                  6
Homo sapiens BAC clone RP11-199K19 from chromosome unknown, complete sequence                                    3
Homo sapiens isolate A9-15 NOR far distal junction on HSA15 genomic sequence                                     3
Homo sapiens glucosaminyl (N-acetyl) transferase 2 (I blood group) (GCNT2), RefSeqGene on chromosome 6           2
Homo sapiens 12 BAC RP11-440E12 (Roswell Park Cancer Institute Human BAC Library) complete sequence              2
Eukaryotic synthetic construct chromosome 20                                                                     1
Human DNA sequence from clone RP11-291L22 on chromosome 10, complete sequence   

In [372]:
possible_vir = results[results['pathogen']=='PREDICTED: Pan paniscus endogenous retrovirus group K member 7 Env polyprotein-like (LOC117978169), misc_RNA']

In [387]:
possible_vir

Unnamed: 0,seqName,seq,refName,pathogen,bitscore,pident,evalue,gapopen,qstart,qend,sstart,send,length,mismatch,staxids,duplicates,found
18,13_NW_021160012.1_1289,AACTGTTATAGATTGGGCACCTCGAGGTCAATTCTACCACAATTGC...,gi|1848978676|ref|XR_004669128.1|,PREDICTED: Pan paniscus endogenous retrovirus ...,185,100.0,2.58e-43,0,1,100,593,692,100,0,9597,1,yes


In [380]:
possible_vir['seq'].iloc[0]

'AACTGTTATAGATTGGGCACCTCGAGGTCAATTCTACCACAATTGCTCAGGACAAACTCAGTCATGTCCAAGTGCACAAGTGAGTCCAGCTGTTGATAGC'

### viral sequences against viral branch of SIMBA
Viral (1000 total sequences, 100 genomes, 10 contigs per genome)
test set created from viral refseq database. Downloaded March, 2020

In [112]:
vir = '/oak/stanford/groups/quake/gita/raw/tab3-14_20210420/benchmarking_post_reviews/viral_test/'
results=pd.DataFrame({})
for file in glob.glob(vir + 'virNTblastn/*_deduplicated.csv'):
    bl = pd.read_csv(file)
    bl['found'] = ['yes']*bl.shape[0]
    results=pd.concat([results, bl])

results['found'] = ['yes']*results.shape[0]
results['staxids'] = results['staxids'].astype(str)
results['staxids'] = results['staxids'].apply(lambda x: x.split(';')[0]) #sometimes two taxids are provided, taking just one


In [113]:
tax_shortcopy = tax_short.copy()
tax_shortcopy = tax_shortcopy.rename(columns={"taxid":"staxids"})
tax_shortcopy['staxids'] = tax_shortcopy['staxids'].astype('str')
results_df=results.merge(tax_shortcopy, on='staxids', how='left')

results_df.columns = [str(col) + '_test' for col in results_df.columns]
results_df.shape

(998, 24)

In [114]:
input_df=pd.DataFrame({})
for file in glob.glob(vir + 'fasta_files/*.csv'):
    bl = pd.read_csv(file)
    input_df=pd.concat([input_df, bl])

In [115]:
cols=['superkingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species']

input_df = input_df.drop(columns=['Unnamed: 0'])
input_df.columns = [str(col) + '_truth' for col in input_df.columns]


creatinig a column to merge the two dataframes (results and input)

In [116]:
input_df['id2'] = input_df['id2_truth'].copy()
results_df['id2'] =results_df['seqName_test'].apply(lambda x: '_'.join(x.split('_')[:3]))

In [117]:
fin = input_df.merge(results_df, on='id2', how='outer')

In [118]:
fin['species_test'] = fin['species_test'].fillna('not found')
# fin['species_test'] = fin['species_test'].apply(lambda x: "_".join(x.split(' ')))

going to get the viral taxids from the assembly report for the truth dataset
Will do this by downloading the assembly file from NCBI. 

In [86]:
wget.download('https://ftp.ncbi.nlm.nih.gov/genomes/refseq/viral/assembly_summary.txt')

'assembly_summary.txt'

In [134]:
!mv assembly_summary.txt viral_assembly_summary.txt

I removed the first line containing comment in the assembly report manually 

In [119]:
m1 = '/oak/stanford/groups/quake/gita/raw/nb/microbe/paper/forGitHub/post_review/'

assembly_report=pd.read_csv(m1 + 'viral_assembly_summary.txt', delimiter='	')
header = ['assembly_accession', 'bioproject', 'biosample', 'wgs_master',
       'refseq_category', 'taxid', 'species_taxid', 'organism_name',
       'infraspecific_name', 'isolate', 'version_status', 'assembly_level',
       'release_type', 'genome_rep', 'seq_rel_date', 'asm_name', 'submitter',
       'gbrs_paired_asm', 'paired_asm_comp', 'ftp_path',
       'excluded_from_refseq', 'relation_to_type_material',
       'asm_not_live_date']
assembly_report.columns = header
assembly_report.columns

Index(['assembly_accession', 'bioproject', 'biosample', 'wgs_master',
       'refseq_category', 'taxid', 'species_taxid', 'organism_name',
       'infraspecific_name', 'isolate', 'version_status', 'assembly_level',
       'release_type', 'genome_rep', 'seq_rel_date', 'asm_name', 'submitter',
       'gbrs_paired_asm', 'paired_asm_comp', 'ftp_path',
       'excluded_from_refseq', 'relation_to_type_material',
       'asm_not_live_date'],
      dtype='object')

matching the organism name in the truth dataset to assembly report organism names to be able to get taxids for truth dataset. Using taxids I can better match the truth and test datasets

In [120]:
fin['shortdesc_truth'] = fin['description_truth'].apply(lambda x: ' '.join(x.split('_')[2:]))

In [121]:
a=pd.DataFrame({})
for org in assembly_report['organism_name'].unique().tolist():
    for true_org in fin['shortdesc_truth'].unique().tolist():
        if org in true_org:
            b = assembly_report[assembly_report['organism_name']==org]
            b['shortdesc_truth']= true_org
            a = pd.concat([a, b])
a = a.drop_duplicates('shortdesc_truth')
            
ashort = a[['shortdesc_truth', 'taxid', 'ftp_path', 'assembly_accession']]
ashort['taxid']= ashort['taxid'].astype('str')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()


In [122]:
fin_withtaxid = fin.merge(ashort, on='shortdesc_truth', how='left')

I did not get all the taxids by matching to assembly report, so I manually have to find current names

In [123]:
fin_withtaxid[fin_withtaxid['taxid'].isna()]['shortdesc_truth'].unique().tolist()

['Mycobacterium virus TA17a, complete genome',
 'Cervus elaphus papillomavirus type 2, complete genome',
 'Glyptapanteles indiensis bracovirus segment 15, complete sequence',
 'Salmonella phage 118970 sal1, complete genome',
 'Synechococcus phage S-RIM2 R1 1999, complete genome',
 'Bacteriophage Diva, complete genome',
 'Enterobacteria phage vB EcoM-VR7, complete genome',
 'Chaetoceros tenuissimus RNA virus genes for replicase polyprotein, structural polyprotein, complete cds',
 'Streptococcus prophage 315.4, complete genome',
 'Nse virus strain F24/CI/2004, complete genome',
 'Stx2 converting phage vB EcoP 24B, complete genome',
 'Thunberg fritillary virus, complete genome',
 'Acinetobacter phage vB AbaP D2, complete genome',
 'Campylobacter phage CP220, complete genome',
 'Campoletis sonorensis ichnovirus segment O1, complete genome',
 'Canna yellow mottle-associated virus isolate CaYMAV-Ci, complete genome',
 'Lettuce big-vein associated virus RNA 1, complete genome']

editing some names that don't appear in the assembly report to be able to find them based on their more current names. For a couple of them, I was not able to find the current name so finding their taxid is a bit more challenging. 

In [124]:
edits = pd.DataFrame(columns=['name1', 'name2']) 
edits['name1'] = fin_withtaxid[fin_withtaxid['taxid'].isna()]['shortdesc_truth'].unique().tolist()
edits['name2'] = ['Rosebushvirus TA17a', 'Cervus elaphus papillomavirus 2','Bracoviriform indiense', 'Salmonella phage 118970_sal1',
                 'Synechococcus phage S-RIM2 R1_1999','Paenibacillus phage Diva','Escherichia phage vB_EcoM_VR7',
                 'Chaetoceros tenuissimus RNA virus 01', 'Streptococcus prophage 315.4', 'Nse virus strain F24/CI/2004',
                 'Escherichia phage vB_EcoP_24B', 'Thunberg fritillary mosaic virus', 'Acinetobacter phage vB_AbaP_D2',
                 'Firehammervirus CP220', 'Campoletis sonorensis ichnovirus', 'Canna yellow mottle associated virus',
                 'Lettuce big-vein associated virus RNA 1']  

notaxid = fin_withtaxid[fin_withtaxid['taxid'].isna()]
col = 'shortdesc_truth'

for row_index in range(edits.shape[0]):
    notaxid[col] = notaxid[col].str.replace(edits['name1'].iat[row_index], edits['name2'].iat[row_index])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


this step will get the taxids from assembly report

In [125]:
new_a=pd.DataFrame({})
for org in assembly_report['organism_name'].unique().tolist():
    for true_org in notaxid['shortdesc_truth']:
        if org in true_org:
            new_b = assembly_report[assembly_report['organism_name']==org]
            new_b['shortdesc_truth']= true_org
            new_a = pd.concat([new_a, new_b])
new_a_short = new_a[['shortdesc_truth', 'taxid']].drop_duplicates()
new_a_short['taxid']= new_a_short['taxid'].astype('str')

notaxid = notaxid.merge(new_a_short, on='shortdesc_truth', how='outer')
notaxid['taxid_x'].fillna(notaxid['taxid_y'], inplace=True)
del notaxid['taxid_y']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


now concatenating the notaxid dataframe (which now has taxids!) with the other part of the data which already had taxids

In [126]:
vir_df = pd.concat([fin_withtaxid[fin_withtaxid['taxid'].isna()==False], notaxid])
vir_df['taxid'].fillna(vir_df['taxid_x'], inplace=True)
del vir_df['taxid_x']

adding taxonomic lineage to the final viral dataset containing truth and test calls

In [127]:
tax_shortcopy = tax_short.copy()
tax_shortcopy['taxid'] = tax_shortcopy['taxid'].astype('str')
vir_df=vir_df.merge(tax_shortcopy, on='taxid', how='left')

these are the viruses for which I don't have the truth or test lineage and thus will exclude from analysis. I need to have the full taxonomic lineage to compare to the results

In [128]:
vir_df2=vir_df[~((vir_df['genus'].isna()) | (vir_df['genus_test'].isna()))] #some have only species and superkingdom info, so this line will get rid of those that have missing information in between those two types of classification


what is the number of input sequences we can do analysis on? 

In [129]:
vir_df2.shape

(710, 44)

let save the dataframe for comparison to kraken results

In [130]:
vir_df2.to_csv(mainDir2 + 'results/viral_fin.csv', index=False)

In [130]:
lin = ['taxid', 'staxids_test', 'description_truth', 'pathogen_test', 'superkingdom', 'phylum', 'class','order', 'family', 'genus', 'species', 
       'superkingdom_test', 'phylum_test', 'class_test', 'order_test','family_test', 'genus_test', 'species_test']


how many sequences out of 710 have exactly the same taxids or species names off the bat? (there may be multiple taxids for the same species). These are perfect matches between truth and test dataset

In [131]:
match = vir_df2[(vir_df2['taxid']==vir_df2['staxids_test']) | (vir_df2['species']==vir_df2['species_test'])]
match.shape

(548, 44)

how many are not perfect match? 

In [132]:
vir_df2.shape[0] - match.shape[0]

162

Out of these how many are actually prophages (i.e. not matching because they are being mapped to a prophage sequence of the phage in the truth dataset?)

In [133]:
prophages = vir_df2[vir_df2['superkingdom_test']=='Bacteria']
pro=prophages.groupby(['species','species_test']).count().iloc[:20]
pro = pro.reset_index().rename(columns={'species':'species_truth', 'id_truth':'count'})

prophages 

In [596]:
pro


Unnamed: 0,species_truth,species_test,count,sequence_truth,description_truth,label_truth,uid_truth,genome_length_truth,rand_seq_truth,id2_truth,...,shortdesc_truth,taxid,ftp_path,assembly_accession,superkingdom,phylum,class,order,family,genus
0,Escherichia virus 24B,Escherichia coli,10,10,10,10,10,10,10,10,...,10,10,0,0,10,10,10,10,10,10
1,Escherichia virus TL2011,Escherichia coli,10,10,10,10,10,10,10,10,...,10,10,10,10,10,10,10,10,10,10
2,Salmonella phage SEN5,Escherichia coli,2,2,2,2,2,2,2,2,...,2,2,2,2,2,2,2,2,2,2
3,Salmonella phage SEN5,Salmonella enterica,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
4,Staphylococcus phage StauST398-3,Staphylococcus aureus,10,10,10,10,10,10,10,10,...,10,10,10,10,10,10,10,10,10,10


will exclude these for the analysis eventhough they are technically true positives. 

In [134]:
vir_df3=vir_df2[vir_df2['superkingdom_test']!='Bacteria']
vir_df3['found']=['yes']*vir_df3.shape[0]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [135]:
vir_df3['found']=['yes']*vir_df3.shape[0]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


how many differrent genera are in the truth dataset

In [136]:
vir_df3['genus'].nunique()

62

In [137]:
vir_df3['genus'].unique()

array(['Vitivirus', 'Jenstvirus', 'Pakpunavirus', 'Allexivirus',
       'Obolenskvirus', 'Bruynoghevirus', 'Closterovirus', 'Henipavirus',
       'Torradovirus', 'Gamtrevirus', 'Pipefishvirus', 'Redivirus',
       'Pamexvirus', 'Alphanemrhavirus', 'Nickievirus', 'Bocaparvovirus',
       'Deltaarterivirus', 'Victorivirus', 'Senquatrovirus',
       'Luckytenvirus', 'Flavivirus', 'Pbunavirus', 'Hepacivirus',
       'Orthobunyavirus', 'Limdunavirus', 'Plotvirus', 'Bingvirus',
       'Norovirus', 'Sopolycivirus', 'Gammapapillomavirus', 'Avunavirus',
       'Yuavirus', 'Potyvirus', 'Gordtnkvirus', 'Jeilongvirus',
       'Tangaroavirus', 'Sapelovirus', 'Fromanvirus', 'Alphacoronavirus',
       'Carlavirus', 'Mavirus', 'Aquambidensovirus', 'Gelderlandvirus',
       'Orthophasmavirus', 'Marthavirus', 'Schizotequatrovirus',
       'Ledantevirus', 'Likavirus', 'Kobuvirus', 'Marburgvirus',
       'Lightbulbvirus', 'Foveavirus', 'Rosebushvirus',
       'Xipapillomavirus', 'Bracovirus', 'Chivirus', 

In [138]:
def cat3(tax, df):
    tp = df[(df[tax + '_test']==df[tax]) |  (df['taxid']==df['staxids_test'])].shape[0]
    fp = df[(df[tax +'_test']!=df[tax ]) & (df['taxid']!=df['staxids_test'])].shape[0]
    fn= df[df['found'].isna()].shape[0]
    precision, recall, f1=param_estimate(tp, fp, fn) 
    return(np.round(precision, 3), np.round(recall, 3), np.round(f1, 3))


In [139]:
cols = ['superkingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species']
ind = ['precision', 'recall', 'F1']
table=pd.DataFrame(columns=cols, index = ind)


for category in cols:
    table[category]= cat3(category,vir_df3)
    
table = table.rename(columns={'superkingdom': 'domain'}).iloc[:2,:]
table

Unnamed: 0,domain,phylum,class,order,family,genus,species
precision,1.0,1.0,1.0,1.0,1.0,0.999,0.809
recall,1.0,1.0,1.0,1.0,1.0,1.0,1.0


these are false positives at the species level. you can see the the test and truth seqs are coming from similar species

In [654]:
sim_sp = vir_df3[(vir_df3['species']!=vir_df3['species_test']) & 
        (vir_df3['taxid']!=vir_df3['staxids_test'])]
sim_sp2 = sim_sp.groupby(['species', 'species_test']).count().reset_index().iloc[:,:3].rename(columns={'species':'species_truth', 'id_truth':'counts'})
sim_sp2

Unnamed: 0,species_truth,species_test,counts
0,Acinetobacter virus AbP2,Acinetobacter phage Bphi-R1888,8
1,Acinetobacter virus D2,Acinetobacter phage vB_AbaP_APK44,1
2,Acinetobacter virus D2,Acinetobacter phage vB_AbaP_D2M,4
3,Acinetobacter virus D2,Acinetobacter phage vB_AbaP_PMK34,5
4,Arthrobacter virus Piccoletto,Arthrobacter virus Beans,1
5,Campylobacter virus CP220,Campylobacter phage CP20,2
6,Campylobacter virus CP220,Campylobacter virus CP21,1
7,Campylobacter virus CP220,Campylobacter virus IBB35,1
8,Canna yellow mottle associated virus,Canna yellow mottle virus,4
9,Escherichia virus VR7,Escherichia virus VR25,1


In [665]:
sim_sp[['species', 'species_test','pident_test', 'length_test']].rename(columns={"species":'species_truth'}).drop_duplicates()

Unnamed: 0,species_truth,species_test,pident_test,length_test
30,Pseudomonas phage C11,Pseudomonas phage vB_PaeM_SCUT-S2,100.0,100.0
32,Pseudomonas phage C11,Pseudomonas phage PaZq-1,100.0,100.0
33,Pseudomonas phage C11,Pseudomonas phage PaGz-1,100.0,100.0
34,Pseudomonas phage C11,Pseudomonas phage K5,100.0,100.0
51,Acinetobacter virus AbP2,Acinetobacter phage Bphi-R1888,100.0,100.0
72,Pseudomonas phage DL54,Pseudomonas phage phiPA01_EW,100.0,100.0
75,Pseudomonas phage DL54,Pseudomonas virus Pa222,100.0,100.0
79,Pseudomonas phage DL54,Pseudomonas phage otherone,100.0,100.0
122,Mycobacterium phage Phaedrus,Mycobacterium phage Akoma,100.0,100.0
123,Mycobacterium phage Phaedrus,Mycobacterium phage Compostia,100.0,100.0


### Fungal truth dataset 
note list of accessions came from a study of fungi. https://microbiomejournal.biomedcentral.com/articles/10.1186/s40168-017-0373-4#Sec20


In [38]:
fung = '/oak/stanford/groups/quake/gita/raw/tab3-14_20210420/benchmarking_post_reviews/fungal/'
results=pd.DataFrame({})
for file in glob.glob(fung + 'fungi_NT_blastn/*_deduplicated.csv'):
    bl = pd.read_csv(file)
    bl['found'] = ['yes']*bl.shape[0]
    results=pd.concat([results, bl])

results['found'] = ['yes']*results.shape[0]
results['staxids'] = results['staxids'].astype(str)
results['staxids'] = results['staxids'].apply(lambda x: x.split(';')[0]) #sometimes two taxids are provided, taking just one


In [39]:
tax_shortcopy = tax_short.copy()
tax_shortcopy = tax_shortcopy.rename(columns={"taxid":"staxids"})
tax_shortcopy['staxids'] = tax_shortcopy['staxids'].astype('str')
results_df=results.merge(tax_shortcopy, on='staxids', how='left')

results_df.columns = [str(col) + '_test' for col in results_df.columns]
results_df['id2'] = results_df['seqName_test'].apply(lambda x: '_'.join(x.split('_')[1:3]))
results_df.shape

(923, 25)

In [40]:
input_df=pd.DataFrame({})
for file in glob.glob(fung + 'fasta_files/*.csv'):
    bl = pd.read_csv(file)
    input_df=pd.concat([input_df, bl])
    
cols=['superkingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species']

input_df = input_df.drop(columns=['Unnamed: 0'])
input_df.columns = [str(col) + '_truth' for col in input_df.columns]

input_df['id2'] = input_df['id2_truth'].copy()

fin = input_df.merge(results_df, on='id2', how='outer')

fin['species_test'] = fin['species_test'].fillna('not found')

In [51]:
cols = ['superkingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species']
ind = ['precision', 'recall', 'F1']
table=pd.DataFrame(columns=cols, index = ind)

found=fin[fin['species_test']!='not found']
for category in cols:
    table[category]= cat(category, df=found, df2=fin)
    
table = table.rename(columns={'superkingdom': 'domain'}).iloc[:2,:]
table

Unnamed: 0,domain,phylum,class,order,family,genus,species
precision,1.0,0.99,0.988,0.985,0.961,0.948,0.793
recall,0.207,0.205,0.205,0.204,0.2,0.198,0.171


In [None]:
    tp = df[(df[tax + '_test']==df[tax]) |  (df['taxid']==df['staxids_test'])].shape[0]
    fp = df[(df[tax +'_test']!=df[tax ]) & (df['taxid']!=df['staxids_test'])].shape[0]
    fn= df[df['found'].isna()].shape[0]


### Comparison with Kraken

I combined the fasta files that were input to SIMBA (gut, homd, human, viral and bact refseq) to Kraken to see how well it does. Since I concatenated the input fasta files I need to retrieve which dataset each seq came from by matching it to the csvs, based on the rand_seq column which is the sequence that was input.  

In [140]:
kra = mainDir2 + 'kraken/'
kradf=ph.read_fasta(kra + 'nt/gut_homd_refseq_viral_human_datasets_classified.fasta')
kradf['taxid'] = kradf['description'].apply(lambda x: x.split('kraken:taxid|')[1])
kradf2=ph.read_fasta(kra + 'nt/gut_homd_refseq_viral_human_datasets_unclassified.fasta')

kradf['classified'] = ['yes']*kradf.shape[0]
kradf2['classified'] = ['no']*kradf2.shape[0]

kra_res=pd.concat([kradf, kradf2])[['id', 'sequence', 'taxid', 'classified']]
kra_res['taxid'] = kra_res['taxid'].astype(str)


tax_shortcopy = tax_short.copy()
tax_shortcopy['taxid'] = tax_shortcopy['taxid'].astype(str)
kra_res=kra_res.merge(tax_shortcopy, on='taxid', how='left')
kra_res = kra_res.rename(columns={'sequence': "rand_seq"})
kra_res.columns = [str(col) + '_test' for col in kra_res.columns]
kra_res['rand_seq'] = kra_res['rand_seq_test'].copy()


this is the result of kraken 

In [658]:
kra_res.head(1)

Unnamed: 0,id_test,rand_seq_test,taxid_test,classified_test,superkingdom_test,phylum_test,class_test,order_test,family_test,genus_test,species_test,rand_seq
0,89014_NZ_JAAITI010000001.1_0_Blautia_luti,GGAAGTTCATCGATGACTGGCATGGTTGCCATTGTTTCACCGGTTT...,657314,yes,Bacteria,Firmicutes,Clostridia,Clostridiales,Lachnospiraceae,Blautia,Blautia obeum,GGAAGTTCATCGATGACTGGCATGGTTGCCATTGTTTCACCGGTTT...


reading the input datasets and marking adding a descriptive column called "dataset" so I know where each seq is coming from

In [141]:
vir_dir = mainDir2 + 'viral_test/'
hum_dir = mainDir2 + 'human_test/'
ref_dir = mainDir + 'refseq/'
gut_dir = mainDir + 'batch_of_ten/'
homd_dir = mainDir + 'homd/'

vir=pd.DataFrame({})
for file in glob.glob(vir_dir + 'fasta_files/*.csv'):
    bl = pd.read_csv(file)
    vir=pd.concat([vir, bl])
vir['dataset']=vir.shape[0]*['viral']


hum=pd.DataFrame({})
for file in glob.glob(hum_dir + 'fasta_files/*.csv'):
    bl = pd.read_csv(file)
    hum=pd.concat([hum, bl])
hum['dataset']=hum.shape[0]*['human']


ref=pd.DataFrame({})
for file in glob.glob(ref_dir + 'fasta_files/*.csv'):
    bl = pd.read_csv(file)
    ref=pd.concat([ref, bl])
ref['dataset']=ref.shape[0]*['refseq']

gut=pd.DataFrame({})
for file in glob.glob(gut_dir + 'fasta_files/*.csv'):
    bl = pd.read_csv(file)
    gut=pd.concat([gut, bl])
gut['dataset']=gut.shape[0]*['gut']
    
    
homd=pd.DataFrame({})
for file in glob.glob(homd_dir + 'fasta_files/*.csv'):
    bl = pd.read_csv(file)
    homd=pd.concat([homd, bl])
homd['dataset']=homd.shape[0]*['homd']

In [6]:
def kra_cat(tax, df):
    df_sub = df[df[tax + '_test'].isna()==False]
    tp = df_sub[(df_sub[tax + '_test']==df_sub[tax]) |  (df_sub['taxid_test']==df_sub['taxid'])].shape[0]
    fp = df_sub[(df_sub[tax +'_test']!=df_sub[tax ]) & (df_sub['taxid_test']!=df_sub['taxid'])].shape[0]
    fn= df[df[tax + '_test'].isna()].shape[0]
    precision, recall, f1=param_estimate(tp, fp, fn) 
    return(np.round(precision,3), np.round(recall, 3), np.round(f1, 3))

def kra_cat2(tax, df):
    df_sub = df[df[tax + '_test'].isna()==False]
    tp = df_sub[(df_sub[tax + '_test']==df_sub[tax]) |  (df_sub['taxid_test']==df_sub['taxid'])].shape[0]
    fp = df_sub[(df_sub[tax +'_test']!=df_sub[tax]) & (df_sub['taxid_test']!=df_sub['taxid'])].shape[0]
    fn= df[df[tax + '_test'].isna()].shape[0]
    return(tp, fp, fn)

#### Gut bacterial seqs

analyzing how well kraken picks up gut seqs

In [143]:
gut = gut[['id2', 'rand_seq', 'description','dataset', 'taxid']]
kra_gut=kra_res.merge(gut, how='right', on=['rand_seq'])
kra_gut['taxid'] = kra_gut['taxid'].astype('str')
kra_gut=kra_gut.merge(tax_shortcopy, on='taxid', how='left')
kra_gut = kra_gut.drop_duplicates('rand_seq')
kra_gut.shape

(5000, 23)

In [144]:
kra_gut['found']=np.NaN * kra_gut.shape[0]
kra_gut.loc[kra_gut['species_test'].isna()==False, 'found']='yes'

In [145]:
cols = ['superkingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species']
ind = ['precision', 'recall', 'F1']
table=pd.DataFrame(columns=cols, index = ind)

for category in cols:
    table[category]= kra_cat(category,kra_gut)

    
table = table.rename(columns={'superkingdom': 'domain'}).iloc[:2,:]
table


Unnamed: 0,domain,phylum,class,order,family,genus,species
precision,0.952,0.948,0.944,0.945,0.927,0.929,0.664
recall,0.619,0.606,0.603,0.603,0.586,0.534,0.417


In [23]:
tax='superkingdom'
df_sub = kra_gut[kra_gut['species_test'].isna()==False]
fp = df_sub[(df_sub[tax +'_test']!=df_sub[tax ]) & (df_sub['taxid_test']!=df_sub['taxid'])]
lin = ['species_truth', 'species_test','superkingdom_truth',  tax+'_test']

fp_sub = fp[fp['superkingdom_test']=='Eukaryota'].rename(columns={'species':'species_truth', 'superkingdom':'superkingdom_truth'})
fp_sub[lin].groupby(['species_truth','species_test', 'superkingdom_truth', 'superkingdom_test']).count().head(20)

species_truth,species_test,superkingdom_truth,superkingdom_test
Alistipes putredinis,Lycaena phlaeas,Bacteria,Eukaryota
Alistipes timonensis,Ovis canadensis,Bacteria,Eukaryota
Bacteroides faecis,Deilephila porcellus,Bacteria,Eukaryota
Bacteroides faecis,Lineus longissimus,Bacteria,Eukaryota
Blautia obeum,Bos taurus,Bacteria,Eukaryota
Blautia obeum,Carassius auratus,Bacteria,Eukaryota
Clostridioides difficile,Noctua fimbriata,Bacteria,Eukaryota
Clostridioides difficile,Penaeus monodon,Bacteria,Eukaryota
Clostridium disporicum,Acomys russatus,Bacteria,Eukaryota
Clostridium disporicum,Anas platyrhynchos,Bacteria,Eukaryota


In [24]:
fp_sub.shape

(51, 24)

In [None]:
tax='superkingdom'
df_sub = kra_gut[kra_gut['species_test'].isna()==False]
fp = df_sub[(df_sub[tax +'_test']!=df_sub[tax ]) & (df_sub['taxid_test']!=df_sub['taxid'])]
lin = ['species_truth', 'species_test','superkingdom_truth',  tax+'_test']

fp_sub = fp[fp['superkingdom_test']=='Eukaryota'].rename(columns={'species':'species_truth', 'superkingdom':'superkingdom_truth'})
fp_sub[lin].groupby(['species_truth','species_test', 'superkingdom_truth', 'superkingdom_test']).count()

In [665]:
cols = ['superkingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species']
ind = ['true positives', 'false positives', 'false negatives']
table2=pd.DataFrame(columns=cols, index = ind)

for category in cols:
    table2[category]= kra_cat2(category,kra_gut)
    
table2


Unnamed: 0,superkingdom,phylum,class,order,family,genus,species
true positives,3001,2931,2912,2911,2803,2565,1720
false positives,150,162,173,171,220,196,872
false negatives,1849,1907,1915,1918,1977,2239,2408


#### refseq truth dataset vs kraken

In [146]:
ref = ref[['id2', 'rand_seq', 'name','dataset', 'taxid']]
kra_ref=kra_res.merge(ref, how='right', on=['rand_seq'])
kra_ref['taxid'] = kra_ref['taxid'].astype('str')
kra_ref=kra_ref.merge(tax_shortcopy, on='taxid', how='left')
kra_ref =kra_ref.drop_duplicates('rand_seq')
kra_ref.head(1)

Unnamed: 0,id_test,rand_seq_test,taxid_test,classified_test,superkingdom_test,phylum_test,class_test,order_test,family_test,genus_test,...,name,dataset,taxid,superkingdom,phylum,class,order,family,genus,species
0,81479_NZ_CP019240.1_0_Rhodoferax_antarcticus,AACAGCAGTTTCAGACCTGGATCAAGCCGTTGACAGCCCAGGTGTC...,81479,yes,Bacteria,Proteobacteria,Betaproteobacteria,Burkholderiales,Comamonadaceae,Rhodoferax,...,Rhodoferax antarcticus strain DSMZ24876 chromo...,refseq,81479,Bacteria,Proteobacteria,Betaproteobacteria,Burkholderiales,Comamonadaceae,Rhodoferax,Rhodoferax antarcticus


In [147]:
cols = ['superkingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species']
ind = ['precision', 'recall', 'F1']
table=pd.DataFrame(columns=cols, index = ind)

for category in cols:
    table[category]= kra_cat(category,kra_ref)

    
table = table.rename(columns={'superkingdom': 'domain'}).iloc[:2,:]
table


Unnamed: 0,domain,phylum,class,order,family,genus,species
precision,0.997,0.997,0.997,0.997,0.997,0.997,0.991
recall,0.984,0.983,0.981,0.98,0.979,0.977,0.863


#### homd vs kraken

In [148]:
homd = homd[['id2', 'rand_seq', 'description','dataset', 'taxid']]
kra_homd=kra_res.merge(homd, how='right', on=['rand_seq'])
kra_homd['taxid'] = kra_homd['taxid'].astype('str')
kra_homd=kra_homd.merge(tax_shortcopy, on='taxid', how='left')
kra_homd =kra_homd.drop_duplicates('rand_seq')
kra_homd.head(1)

Unnamed: 0,id_test,rand_seq_test,taxid_test,classified_test,superkingdom_test,phylum_test,class_test,order_test,family_test,genus_test,...,description,dataset,taxid,superkingdom,phylum,class,order,family,genus,species
0,1596_NZ_JVOU01000027.1_3501_Lactobacillus_gasseri,ATCTTATTAACGCCTTCCCAGTCGCTCTTCATTGTATCTTCATCAA...,1596,yes,Bacteria,Firmicutes,Bacilli,Lactobacillales,Lactobacillaceae,Lactobacillus,...,NZ_JVOU01000027.1 Lactobacillus gasseri strain...,homd,1596,Bacteria,Firmicutes,Bacilli,Lactobacillales,Lactobacillaceae,Lactobacillus,Lactobacillus gasseri


In [149]:
cols = ['superkingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species']
ind = ['precision', 'recall', 'F1']
table=pd.DataFrame(columns=cols, index = ind)

for category in cols:
    table[category]= kra_cat(category,kra_homd)

    
table = table.rename(columns={'superkingdom': 'domain'}).iloc[:2,:]
table


Unnamed: 0,domain,phylum,class,order,family,genus,species
precision,0.991,0.99,0.988,0.979,0.973,0.971,0.912
recall,0.767,0.758,0.754,0.752,0.703,0.745,0.635


In [355]:
cols = ['superkingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species']
ind = ['true positives', 'false positives', 'false negatives']
table2=pd.DataFrame(columns=cols, index = ind)

for category in cols:
    table2[category]= kra_cat2(category,kra_homd)
    
table2

Unnamed: 0,superkingdom,phylum,class,order,family,genus,species
true positives,3807,3759,3734,3701,3447,3643,2988
false positives,34,39,47,78,97,109,290
false negatives,1156,1199,1216,1218,1453,1245,1719


#### Human vs Kraken

In general Kraken seems to be doing well against multiple datasets from bacteria. How many false positives does it have for human seqs, and also why did it originally have such high false positives? 

In [150]:
hum['taxid']=['9606']*hum.shape[0] #adding the human taxid to the truth datatset
hum = hum[['id2', 'rand_seq', 'description','dataset', 'taxid']]
kra_hum=kra_res.merge(hum, how='right', on=['rand_seq'])
kra_hum['taxid'] = kra_hum['taxid'].astype('str')
kra_hum=kra_hum.merge(tax_shortcopy, on='taxid', how='left')
kra_hum =kra_hum.drop_duplicates('rand_seq')
kra_hum.head(1)

Unnamed: 0,id_test,rand_seq_test,taxid_test,classified_test,superkingdom_test,phylum_test,class_test,order_test,family_test,genus_test,...,description,dataset,taxid,superkingdom,phylum,class,order,family,genus,species
0,1_NT_187361.1_0,GTAGATGTATCTTATCTTAACTTGAGTCTTTGCTGCCCCTAATGAG...,9606,yes,Eukaryota,Chordata,Mammalia,Primates,Hominidae,Homo,...,NT_187361.1 Homo sapiens chromosome 1 unlocali...,human,9606,Eukaryota,Chordata,Mammalia,Primates,Hominidae,Homo,Homo sapiens


In [151]:
cols = ['superkingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species']
ind = ['precision', 'recall', 'F1']
table=pd.DataFrame(columns=cols, index = ind)

for category in cols:
    table[category]= kra_cat(category,kra_hum)

table = table.rename(columns={'superkingdom': 'domain'}).iloc[:2,:]
table


Unnamed: 0,domain,phylum,class,order,family,genus,species
precision,1.0,0.991,0.987,0.982,0.976,0.969,0.97
recall,0.64,0.633,0.632,0.628,0.621,0.561,0.561


In [93]:
cols = ['superkingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species']
ind = ['true positives', 'false positives', 'false negatives']
table2=pd.DataFrame(columns=cols, index = ind)

for category in cols:
    table2[category]= kra_cat2(category,kra_hum)
    
table2

Unnamed: 0,superkingdom,phylum,class,order,family,genus,species
true positives,1535,1511,1503,1490,1468,1323,1323
false positives,0,13,20,28,36,42,41
false negatives,865,876,877,882,896,1035,1036


In [101]:
kra_hum['phylum_test'].value_counts()

Chordata        1511
Arthropoda         9
Streptophyta       2
Mollusca           1
Apicomplexa        1
Name: phylum_test, dtype: int64

#### A real TS dataset versus Kraken 
coming from TSP2_Bladder_NA_10X_1_1_S5_L001 TS sample which I had previously used to look at Kraken false positives, rechecking my previous work

In [7]:
mainDir10 ='/oak/stanford/groups/quake/gita/raw/tab3-14_20210420/all/nt/'


In [8]:
c = ph.read_fasta(mainDir10 + 'TSP2_Bladder_NA_10X_1_1_S5_L001_classified.fasta')

In [9]:
c['taxid'] = c['description'].apply(lambda x: x.split('kraken:taxid|')[1])
u=ph.read_fasta(mainDir10 + 'TSP2_Bladder_NA_10X_1_1_S5_L001_unclassified.fasta')

c['classified'] = ['yes']*c.shape[0]
u['classified'] = ['no']*u.shape[0]

cu=pd.concat([c, u])[['id', 'sequence', 'taxid', 'classified']]
cu['taxid'] = cu['taxid'].astype(str)

tax_shortcopy = tax_short.copy()
tax_shortcopy['taxid'] = tax_shortcopy['taxid'].astype(str)
cu=cu.merge(tax_shortcopy, on='taxid', how='left')
cu.columns = [str(col) + '_test' for col in cu.columns]


In [10]:
print('how many reads were classified: ' , c.shape[0])
print('how many reads were unclassified: ', u.shape[0])

how many reads were classified:  377872
how many reads were unclassified:  152759


Number of reads that were classified by Kraken2. This does not talk about correct classification, just classification

In [11]:
cols = ['superkingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species']
vals=[]
for cat in cols: 
    unclass=cu[cu[cat + '_test'].isna()==False].shape[0]
    print(cat, np.round(unclass/cu.shape[0],2))
    vals.append(np.round(unclass/cu.shape[0],2))

frac_c = pd.DataFrame({'category':cols, 'classified fraction': vals})

    

superkingdom 0.54
phylum 0.52
class 0.5
order 0.48
family 0.44
genus 0.38
species 0.38


In [12]:
frac_c

Unnamed: 0,category,classified fraction
0,superkingdom,0.54
1,phylum,0.52
2,class,0.5
3,order,0.48
4,family,0.44
5,genus,0.38
6,species,0.38


In [13]:
pd.DataFrame(cu.superkingdom_test.value_counts())

Unnamed: 0,superkingdom_test
Eukaryota,284988
Bacteria,2669
Viruses,1078
Archaea,94


**NNTR:** going to create a subsampled dataset of Krarken-identified euks, bacteria, and viruses to see how many of them are correctly identified. 3 batches, each containing 300 sequences (100 from each category of organisms. Saving the csvs and creating input fasta files for validation against NT. 

In [493]:
c_sub=cu[(cu['superkingdom_test']!='Archaea') & (cu['species_test'].isna()==False) & (cu['superkingdom_test'].isna()==False) 
         &  (cu['phylum_test'].isna()==False)] #want to exclude archaea, also exlcude enteries that don't have full taxonomic information
c_grouped=c_sub.groupby('superkingdom_test', group_keys=False)

#first batch writting the csv and fasta file
c_rand_batch1 = c_grouped.apply(pd.DataFrame.sample, n=100) 
c_rand_batch1.to_csv(mainDir2 + 'kraken/blasted_kraken_subset_of_ts/fasta_files/batch1.csv')

c_rand_batch2 = c_grouped.apply(pd.DataFrame.sample, n=100)
c_rand_batch2.to_csv(mainDir2 + 'kraken/blasted_kraken_subset_of_ts/fasta_files/batch2.csv')

c_rand_batch3 = c_grouped.apply(pd.DataFrame.sample, n=100) 
c_rand_batch3.to_csv(mainDir2 + 'kraken/blasted_kraken_subset_of_ts/fasta_files/batch3.csv')

f= open(mainDir2 + 'kraken/blasted_kraken_subset_of_ts/fasta_files/batch1.fasta', 'w')
for i in list(range(0,c_rand_batch1.shape[0])):
    header = str(c_rand_batch1['id_test'].iat[i]) 
    sequence = c_rand_batch1['sequence_test'].iat[i]
    f.write('>' + header + '\n' + sequence + '\n')
f.close()
#doing the same thing for the other two batches
f= open(mainDir2 + 'kraken/blasted_kraken_subset_of_ts/fasta_files/batch2.fasta', 'w')
for i in list(range(0,c_rand_batch2.shape[0])):
    header = str(c_rand_batch2['id_test'].iat[i]) 
    sequence = c_rand_batch2['sequence_test'].iat[i]
    f.write('>' + header + '\n' + sequence + '\n')
f.close()

f= open(mainDir2 + 'kraken/blasted_kraken_subset_of_ts/fasta_files/batch3.fasta', 'w')
for i in list(range(0,c_rand_batch3.shape[0])):
    header = str(c_rand_batch3['id_test'].iat[i]) 
    sequence = c_rand_batch3['sequence_test'].iat[i]
    f.write('>' + header + '\n' + sequence + '\n')
f.close()



now checking to see what BLAST results against NT say about the hits that Kraken had identified as eukaryotic, bacterial or viral. Going to concatenate all three batches. Note while I had selected 300 random seqs (100 for each domain) per batch, the total is less than 900 because some of those sequences were not unique. The total dataset size for testing is 818 after getting rid of duplicate test seqs.  

In [14]:
batch1_kra = pd.read_csv(mainDir2 + 'kraken/blasted_kraken_subset_of_ts/fasta_files/batch1.csv')
batch1_bla = pd.read_csv(mainDir2 + 'kraken/blasted_kraken_subset_of_ts/nt/batch1_deduplicated.csv')

batch2_kra = pd.read_csv(mainDir2 + 'kraken/blasted_kraken_subset_of_ts/fasta_files/batch2.csv')
batch2_bla = pd.read_csv(mainDir2 + 'kraken/blasted_kraken_subset_of_ts/nt/batch2_deduplicated.csv')

batch3_kra = pd.read_csv(mainDir2 + 'kraken/blasted_kraken_subset_of_ts/fasta_files/batch3.csv')
batch3_bla = pd.read_csv(mainDir2 + 'kraken/blasted_kraken_subset_of_ts/nt/batch3_deduplicated.csv')


will match based on fasta header to see what Kraken predicts versus BLASTn

In [15]:
def merge(kra_df, bla_df):
    kra_df['seqName']=kra_df['id_test'].copy()
    kra_df = kra_df.drop_duplicates('sequence_test')
    print('size of test dataset, number of unique input seqs: ', kra_df.shape[0])
    bla_df['staxids'] = bla_df['staxids'].astype('str')
    bla_df = bla_df.drop_duplicates(['seqName'])
    print('size of output blast dataset, number of unique output seqs: ', bla_df.shape[0])

    mer=kra_df.merge(bla_df, on='seqName', how='outer').drop_duplicates('sequence_test')
    #adding taxonomic lineage to blasted results
    tax_shortcopy = tax_short.copy()
    tax_shortcopy['staxids'] = tax_shortcopy['taxid'].astype(str)
    mer=mer.merge(tax_shortcopy, on='staxids', how='left')
    return(mer)

In [16]:
mer1=merge(batch1_kra, batch1_bla)
mer2=merge(batch2_kra, batch2_bla)
mer3=merge(batch3_kra, batch3_bla)

size of test dataset, number of unique input seqs:  289
size of output blast dataset, number of unique output seqs:  240
size of test dataset, number of unique input seqs:  288
size of output blast dataset, number of unique output seqs:  237
size of test dataset, number of unique input seqs:  283
size of output blast dataset, number of unique output seqs:  235


In [17]:
def krak_blast_compare(df, tax):
    prec = []
    for domain in ['Eukaryota', 'Bacteria', 'Viruses']:        
        df_domain = df[df['superkingdom_test']==domain]
        df_sub = df_domain[df_domain[tax + '_test'].isna()==False]
        tp = df_sub[(df_sub[tax + '_test']==df_sub[tax]) |  (df_sub['taxid_test']==df_sub['taxid'])].shape[0]
        fp = df_sub.shape[0] - tp #anything that isn't true positive is a false positive because it did not produce a significant blastn hit
        precision = tp / (tp + fp)
        prec.append(np.round(precision,3))
    return(prec)

def fp_tp(df, tax, domain): #returns true positive dataframe and false positive dataframe at a given taxonmic level (tax)
    df_domain = df[df['superkingdom_test']==domain]
    df_sub = df_domain[df_domain[tax + '_test'].isna()==False]
    tp = df_sub[(df_sub[tax + '_test']==df_sub[tax]) | (df_sub['taxid_test']==df_sub['taxid'])] 
    fp = df_sub[(df_sub[tax + '_test']!=df_sub[tax]) & (df_sub['taxid_test']!=df_sub['taxid'])]
    return(tp, fp)


BATCH 1

In [18]:
cols = ['superkingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species']
ind = ['Eukaryotic', 'Bacterial', 'Viral']
table1=pd.DataFrame(columns=cols, index = ind)

for category in cols:
    table1[category]= krak_blast_compare(mer1, category)

    

table1 = table1.rename(columns={'superkingdom': 'domain'})
table1


Unnamed: 0,domain,phylum,class,order,family,genus,species
Eukaryotic,0.74,0.71,0.61,0.596,0.57,0.53,0.53
Bacterial,0.061,0.061,0.062,0.061,0.065,0.071,0.051
Viral,0.033,0.033,0.022,0.022,0.025,0.047,0.022


BATCH 2

In [19]:
table2=pd.DataFrame(columns=cols, index = ind)

for category in cols:
    table2[category]= krak_blast_compare(mer2, category)

table2 = table2.rename(columns={'superkingdom': 'domain'})
table2


Unnamed: 0,domain,phylum,class,order,family,genus,species
Eukaryotic,0.75,0.65,0.54,0.52,0.5,0.47,0.47
Bacterial,0.082,0.071,0.075,0.072,0.072,0.074,0.061
Viral,0.011,0.011,0.011,0.011,0.012,0.0,0.011


BATCH 3

In [20]:
table3=pd.DataFrame(columns=cols, index = ind)

for category in cols:
    table3[category]= krak_blast_compare(mer3, category)

table3 = table3.rename(columns={'superkingdom': 'domain'})
table3

Unnamed: 0,domain,phylum,class,order,family,genus,species
Eukaryotic,0.82,0.81,0.69,0.67,0.65,0.63,0.63
Bacterial,0.051,0.051,0.052,0.042,0.042,0.044,0.041
Viral,0.047,0.047,0.047,0.047,0.05,0.065,0.047


now combining the results of these three batches to present them as mean +/- standard deviation. Note these values represent precision of kraken2. This is based on counting anything that did not produce a significant BLASTn hit as a false positive. We can also see what the false positives are for just the enteries where BLAST also produced a hit. 

In [21]:
cols = ['superkingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species']
ind = ['Eukaryotic', 'Bacterial', 'Viral']
newtable = pd.DataFrame(columns=cols, index = ind)

for row in range(table1.shape[0]):
    for col in range(table1.shape[1]):
        val1 = table1.iloc[row, col]
        val2 = table2.iloc[row, col]
        val3 = table3.iloc[row, col]
        avg = np.round(np.average([val1,val2,val3]),2)
        sdev = np.round(np.std([val1,val2,val3]),2)
        newtable.iloc[row,col] = str(avg) + u" \u00B1 " + str(sdev)

newtable = newtable.rename(columns={'superkingdom': 'domain'})
newtable

Unnamed: 0,domain,phylum,class,order,family,genus,species
Eukaryotic,0.77 ± 0.04,0.72 ± 0.07,0.61 ± 0.06,0.6 ± 0.06,0.57 ± 0.06,0.54 ± 0.07,0.54 ± 0.07
Bacterial,0.06 ± 0.01,0.06 ± 0.01,0.06 ± 0.01,0.06 ± 0.01,0.06 ± 0.01,0.06 ± 0.01,0.05 ± 0.01
Viral,0.03 ± 0.01,0.03 ± 0.01,0.03 ± 0.02,0.03 ± 0.02,0.03 ± 0.02,0.04 ± 0.03,0.03 ± 0.02


lets explore some of these false positives

In [25]:
tp, fp =fp_tp(mer1, 'species', 'Viruses')

shows repeated instances of misclassification by Kraken2, assigning human and carp reads as viral

In [26]:
fp.groupby(['species_test', 'species', 'order_test', 'order', 'superkingdom_test', 'superkingdom']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 0,id_test,sequence_test,taxid_test,classified_test,phylum_test,class_test,family_test,genus_test,seqName,...,send,length,mismatch,staxids,duplicates,taxid,phylum,class,family,genus
species_test,species,order_test,order,superkingdom_test,superkingdom,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1
Alces alces faeces associated microvirus MP11 5517,Homo sapiens,Petitvirales,Primates,Viruses,Eukaryota,1,1,1,1,1,1,1,1,0,1,...,1,1,1,1,1,1,1,1,1,1
Carnation latent virus,Cyprinus carpio,Tymovirales,Cypriniformes,Viruses,Eukaryota,7,7,7,7,7,7,7,7,7,7,...,7,7,7,7,7,7,7,7,7,7
Carnation latent virus,Homo sapiens,Tymovirales,Primates,Viruses,Eukaryota,5,5,5,5,5,5,5,5,5,5,...,5,5,5,5,5,5,5,5,5,5
Carnation latent virus,Hylobates moloch,Tymovirales,Primates,Viruses,Eukaryota,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
Carnation latent virus,Macaca mulatta,Tymovirales,Primates,Viruses,Eukaryota,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
Carnation latent virus,Pan paniscus,Tymovirales,Primates,Viruses,Eukaryota,2,2,2,2,2,2,2,2,2,2,...,2,2,2,2,2,2,2,2,2,2
Carnation latent virus,Streptomyces sp. AgN23,Tymovirales,Streptomycetales,Viruses,Bacteria,2,2,2,2,2,2,2,2,2,2,...,2,2,2,2,2,2,2,2,2,2
Carnation latent virus,Tyto alba,Tymovirales,Strigiformes,Viruses,Eukaryota,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
Caudovirales sp.,Cyprinus carpio,Caudovirales,Cypriniformes,Viruses,Eukaryota,8,8,8,8,8,8,8,0,0,8,...,8,8,8,8,8,8,8,8,8,8
Caudovirales sp.,Streptomyces sp. AgN23,Caudovirales,Streptomycetales,Viruses,Bacteria,1,1,1,1,1,1,1,0,0,1,...,1,1,1,1,1,1,1,1,1,1


In [31]:
fp_short = fp[['species_test', 'species', 'order_test', 'order', 'superkingdom_test', 'superkingdom']]
fp_prim = fp_short[fp_short['order']=='Primates']
fp_prim = fp_prim.rename(columns={'species_test':'kraken2_species',
                       'order_test':'kraken2_order',
                       'superkingdom_test':'kraken2_superkingdom'}).reset_index(drop=['index'])
fp_prim


Unnamed: 0,kraken2_species,species,kraken2_order,order,kraken2_superkingdom,superkingdom
0,Carnation latent virus,Macaca mulatta,Tymovirales,Primates,Viruses,Eukaryota
1,Carnation latent virus,Homo sapiens,Tymovirales,Primates,Viruses,Eukaryota
2,Alces alces faeces associated microvirus MP11 ...,Homo sapiens,Petitvirales,Primates,Viruses,Eukaryota
3,Carnation latent virus,Homo sapiens,Tymovirales,Primates,Viruses,Eukaryota
4,Porcine bastrovirus,Homo sapiens,Stellavirales,Primates,Viruses,Eukaryota
5,Carnation latent virus,Homo sapiens,Tymovirales,Primates,Viruses,Eukaryota
6,Carnation latent virus,Pan paniscus,Tymovirales,Primates,Viruses,Eukaryota
7,Carnation latent virus,Homo sapiens,Tymovirales,Primates,Viruses,Eukaryota
8,Pestivirus A,Homo sapiens,Amarillovirales,Primates,Viruses,Eukaryota
9,Carnation latent virus,Hylobates moloch,Tymovirales,Primates,Viruses,Eukaryota


In [33]:
fp_prim.to_csv(tables + 'kraken2_false_positive_examples.csv', index=False)

In [90]:
tp.groupby(['species_test', 'species', 'order_test', 'order', 'superkingdom_test', 'superkingdom']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 0,id_test,sequence_test,taxid_test,classified_test,phylum_test,class_test,family_test,genus_test,seqName,...,send,length,mismatch,staxids,duplicates,taxid,phylum,class,family,genus
species_test,species,order_test,order,superkingdom_test,superkingdom,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1
Escherichia virus phiX174,Escherichia virus phiX174,Petitvirales,Petitvirales,Viruses,Viruses,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
Ungulate erythroparvovirus 1,Ungulate erythroparvovirus 1,Piccovirales,Piccovirales,Viruses,Viruses,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1


In [34]:
tp, fp =fp_tp(mer1, 'species', 'Bacteria')

In [35]:
fp.groupby(['species_test', 'species', 'order_test', 'order', 'superkingdom_test', 'superkingdom']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 0,id_test,sequence_test,taxid_test,classified_test,phylum_test,class_test,family_test,genus_test,seqName,...,send,length,mismatch,staxids,duplicates,taxid,phylum,class,family,genus
species_test,species,order_test,order,superkingdom_test,superkingdom,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1
Acholeplasma hippikon,Cyprinus carpio,Acholeplasmatales,Cypriniformes,Bacteria,Eukaryota,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
Actinomyces succiniciruminis,Tyto alba,Actinomycetales,Strigiformes,Bacteria,Eukaryota,6,6,6,6,6,6,6,6,6,6,...,6,6,6,6,6,6,6,6,6,6
Aeromonas encheleia,Pan paniscus,Aeromonadales,Primates,Bacteria,Eukaryota,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
Altererythrobacter sp. TH136,Homo sapiens,Sphingomonadales,Primates,Bacteria,Eukaryota,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
Alteromonas australica,Cyprinus carpio,Alteromonadales,Cypriniformes,Bacteria,Eukaryota,2,2,2,2,2,2,2,2,2,2,...,2,2,2,2,2,2,2,2,2,2
Calothrix sp. NIES-4101,Cyprinus carpio,Nostocales,Cypriniformes,Bacteria,Eukaryota,1,1,1,1,1,1,0,1,1,1,...,1,1,1,1,1,1,1,1,1,1
Celeribacter marinus,Ailuropoda melanoleuca,Rhodobacterales,Carnivora,Bacteria,Eukaryota,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
Citrobacter werkmanii,Homo sapiens,Enterobacterales,Primates,Bacteria,Eukaryota,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
Clostridiales bacterium,Pan paniscus,Clostridiales,Primates,Bacteria,Eukaryota,1,1,1,1,1,1,1,0,0,1,...,1,1,1,1,1,1,1,1,1,1
Coprothermobacter proteolyticus,Pan paniscus,Coprothermobacterales,Primates,Bacteria,Eukaryota,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1


In [36]:
fp_short = fp[['species_test', 'species', 'order_test', 'order', 'superkingdom_test', 'superkingdom']]
fp_prim = fp_short[fp_short['order']=='Primates']
fp_prim = fp_prim.rename(columns={'species_test':'kraken2_species',
                       'order_test':'kraken2_order',
                       'superkingdom_test':'kraken2_superkingdom'}).reset_index(drop=['index'])
fp_prim


Unnamed: 0,kraken2_species,species,kraken2_order,order,kraken2_superkingdom,superkingdom
0,Pasteurella multocida,Pan paniscus,Pasteurellales,Primates,Bacteria,Eukaryota
1,Paenibacillus kribbensis,Pan paniscus,Bacillales,Primates,Bacteria,Eukaryota
2,Kitasatospora sp. MMS16-BH015,Homo sapiens,Streptomycetales,Primates,Bacteria,Eukaryota
3,Coprothermobacter proteolyticus,Pan paniscus,Coprothermobacterales,Primates,Bacteria,Eukaryota
4,Desulfovibrio marinus,Homo sapiens,Desulfovibrionales,Primates,Bacteria,Eukaryota
5,Streptomyces autolyticus,Pan troglodytes,Streptomycetales,Primates,Bacteria,Eukaryota
6,[Clostridium] cellulosi,Homo sapiens,Clostridiales,Primates,Bacteria,Eukaryota
7,Citrobacter werkmanii,Homo sapiens,Enterobacterales,Primates,Bacteria,Eukaryota
8,Deinococcus ficus,Homo sapiens,Deinococcales,Primates,Bacteria,Eukaryota
9,Methylobacterium durans,Gorilla gorilla,Rhizobiales,Primates,Bacteria,Eukaryota
