# Evaluating truth vs sourmash gather



In [73]:
import csv
import sys
sys.path.insert(0, '../2018-ncbi-lineages/')
import ncbi_taxdump_utils

In [21]:
taxfoo = ncbi_taxdump_utils.NCBI_TaxonomyFoo()

taxfoo.load_nodes_dmp('../2018-ncbi-lineages/genbank/nodes.dmp.gz')
taxfoo.load_names_dmp('../2018-ncbi-lineages/genbank/names.dmp.gz')

want_taxonomy = ['superkingdom', 'phylum', 'order', 'class', 'family', 'genus', 'species']

In [48]:
from sourmash_lib.lca import lca_utils, command_index

def format_lineage(lineage_tup):
    return ";".join(lca_utils.zip_lineage(lineage_tup))

## Sourmash gather: mapping from accession -> lineages

This is important for dealing with the CSV output of `sourmash gather`; `sourmash lca gather`
comes with the taxonomy already in the CSV output.

In [43]:
acc_to_lineage, num_rows = command_index.load_taxonomy_assignments('genbank-lineages.csv', start_column=3)
print('loaded {} rows'.format(num_rows))

examining spreadsheet headers...
** assuming column 'accession' is identifiers in spreadsheet


loaded 84331 rows


In [44]:
list(acc_to_lineage.items())[:1]

[('AAAC01000001',
  (LineagePair(rank='superkingdom', name='Bacteria'),
   LineagePair(rank='phylum', name='Firmicutes'),
   LineagePair(rank='class', name='Bacilli'),
   LineagePair(rank='order', name='Bacillales'),
   LineagePair(rank='family', name='Bacillaceae'),
   LineagePair(rank='genus', name='Bacillus'),
   LineagePair(rank='species', name='Bacillus anthracis')))]

In [79]:
def get_lineage_by_acc(acc):
    acc = acc.split(' ')[0].split('.')[0]
    print(acc)
    return acc_to_lineage.get(acc, None)

# example for 'get_lineage_by_acc'
lineage = get_lineage_by_acc('AAAC01000001')
print(lineage)
print(format_lineage(lineage))

AAAC01000001
(LineagePair(rank='superkingdom', name='Bacteria'), LineagePair(rank='phylum', name='Firmicutes'), LineagePair(rank='class', name='Bacilli'), LineagePair(rank='order', name='Bacillales'), LineagePair(rank='family', name='Bacillaceae'), LineagePair(rank='genus', name='Bacillus'), LineagePair(rank='species', name='Bacillus anthracis'))
Bacteria;Firmicutes;Bacilli;Bacillales;Bacillaceae;Bacillus;Bacillus anthracis


In [77]:
def load_sourmash_gather_csv(filename):
    with open(filename, 'rt') as fp:
        r = csv.DictReader(fp)
        rows = list(r)
    return rows

load_sourmash_gather_csv('output/Huttenhower_HC1.fasta.gz.scaled10k.k51.gather.matches.csv')[:1]

[OrderedDict([('intersect_bp', '330000'),
              ('f_orig_query', '0.008465879938429965'),
              ('f_match', '0.03466386554621849'),
              ('f_unique_to_query', '0.008465879938429965'),
              ('name',
               'NC_013132.1 Chitinophaga pinensis DSM 2588, complete genome'),
              ('filename', 'inputs/databases/genbank-k51.sbt.json'),
              ('md5', 'd06a405d39c8ebed579540db2994afe8')])]

In [83]:
def make_gather_lineages(filename):
    rows = load_sourmash_gather_csv(filename)
    rows2 = []
    for d in rows:
        name = d['name']
        lineage = get_lineage_by_acc(name)
        d2 = dict(d)
        d2['lineage'] = lineage
        rows2.append(d2)
        
    return rows2

make_gather_lineages('output/Huttenhower_HC1.fasta.gz.scaled10k.k51.gather.matches.csv')[:1]

NC_013132
NZ_ABFV01000241
NC_012856
NC_009832
NC_015138
NC_017309
NC_013729
NC_010551
NC_009511
NC_010002
NZ_JYNG01000001
NC_004459
NC_009441
NC_014171
NC_009664
NC_012032
NZ_FPKM01000170
NC_007954
NC_003552
NC_007298
NZ_CM002545
NC_011566
NC_022269
NZ_JPGN01000541
NZ_CP013315
NC_014562
NC_013850
NC_011144
NC_015177
NC_013956
NC_014364
NC_013315
NC_007493
NZ_LXBO01000001
NZ_AEFW01000030
NC_010172
NC_011831
NZ_CP009968
NC_002967
NC_009831
NC_012108
NC_005125
NC_014033
NC_010602
NC_007643
NC_007512
NC_004193
NZ_LNBT01000155
NZ_CP018190
NC_007406
NZ_LYUI01000001
NC_014392
NC_010556
NC_008555
NZ_CABC01000134
NC_007349
NC_006624
NC_013162
NC_014925
NZ_LYPH01000001
NC_008025
NC_010080
NC_014554
NC_007644
NC_009437
NZ_FNNX01000090
NC_002977
NC_013523
NC_000907
NC_014924
NC_013156
NC_008346
NC_008023
NC_002935
NC_011999
NC_013949
NC_009337
NC_011653
FN869568
NZ_CP013939
NC_002950
NC_008701
NC_006138
NC_009138
NC_009881
NC_005362
NC_006156
NC_014658
NC_014804
NC_012804
NC_015185
NC_006461
NC_00

[{'f_match': '0.03466386554621849',
  'f_orig_query': '0.008465879938429965',
  'f_unique_to_query': '0.008465879938429965',
  'filename': 'inputs/databases/genbank-k51.sbt.json',
  'intersect_bp': '330000',
  'lineage': None,
  'md5': 'd06a405d39c8ebed579540db2994afe8',
  'name': 'NC_013132.1 Chitinophaga pinensis DSM 2588, complete genome'}]

## Truth files: loading in & fleshing out NCBI taxonomy

Also needed :).

In [57]:
def load_truth_file(filename):
    with open(filename, 'rt') as fp:
        lines = fp.readlines()
        
    lines = [ x.strip() for x in lines ]
    lines = [ x.split('\t') for x in lines ]
    
    rows = []
    for x in lines:
        taxid, a, b, rank, name = x
        taxid = int(taxid)
        rows.append((taxid, a, b, rank, name))
    return rows
    
print(load_truth_file('truth_sets/species/Huttenhower_HC1_TRUTH.txt')[:5])

[(84980, '1.00000', '0.01000', 'species', 'Desulfotalea psychrophila'), (1428, '1.00000', '0.01000', 'species', 'Bacillus thuringiensis'), (1496, '1.00000', '0.01000', 'species', 'Peptoclostridium difficile'), (1525, '1.00000', '0.01000', 'species', 'Moorella thermoacetica'), (1423, '1.00000', '0.01000', 'species', 'Bacillus subtilis')]


In [62]:
def make_lineage_from_taxid(taxid):
    lineage_d = taxfoo.get_lineage_as_dict(taxid, want_taxonomy)
    lineage = [ lca_utils.LineagePair(a, b) for (a, b) in lineage_d.items() ]
    return tuple(lineage)

make_lineage_from_taxid(84980)

(LineagePair(rank='species', name='Desulfotalea psychrophila'),
 LineagePair(rank='genus', name='Desulfotalea'),
 LineagePair(rank='family', name='Desulfobulbaceae'),
 LineagePair(rank='order', name='Desulfobacterales'),
 LineagePair(rank='class', name='Deltaproteobacteria'),
 LineagePair(rank='phylum', name='Proteobacteria'),
 LineagePair(rank='superkingdom', name='Bacteria'))

In [69]:
def make_truth_lineages(filename):
    rows = load_truth_file(filename)
    
    rows2 = []
    for (taxid, a, b, rank, name) in rows:
        lineage = make_lineage_from_taxid(taxid)
        rows2.append((taxid, a, b, rank, name, lineage))
        for lintup in lineage:
            if lintup.rank == rank:
                if lintup.name != name:
                    print('DISAGREE: ncbi={}, truthfile={}'.format(lintup.name, name))
    return rows2

truth_lineages = make_truth_lineages('truth_sets/species/Huttenhower_HC1_TRUTH.txt')

DISAGREE: ncbi=Clostridioides difficile, truthfile=Peptoclostridium difficile
DISAGREE: ncbi=Sediminispirochaeta smaragdinae, truthfile=Spirochaeta smaragdinae
