# Evaluating truth vs sourmash gather



In [15]:
import csv
import sys
sys.path.insert(0, '../2018-ncbi-lineages/')
import ncbi_taxdump_utils

In [16]:
# Load the NCBI taxonomy dump files (nodes and names) into a single lookup object.
taxfoo = ncbi_taxdump_utils.NCBI_TaxonomyFoo()

taxfoo.load_nodes_dmp('../2018-ncbi-lineages/genbank/nodes.dmp.gz')
taxfoo.load_names_dmp('../2018-ncbi-lineages/genbank/names.dmp.gz')

# Ranks to keep when extracting a lineage, listed from most general to most
# specific. 'class' ranks above 'order', which matches the rank order of the
# lineages loaded from gather-lineages.csv.
want_taxonomy = ['superkingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species']

In [17]:
from sourmash_lib.lca import lca_utils, command_index

def format_lineage(lineage_tup):
    """Render a lineage tuple as one semicolon-separated string.

    The lineage is handed to `lca_utils.zip_lineage`, and the sequence of
    strings it returns is joined with ';'.
    """
    rank_names = lca_utils.zip_lineage(lineage_tup)
    separator = ";"
    return separator.join(rank_names)

## Sourmash gather: mapping from accession -> lineages

The plain `sourmash gather` CSV output identifies each match only by name, so we need this mapping
to attach a taxonomic lineage to every match; `sourmash lca gather` already includes the taxonomy in its CSV output.

In [18]:
# Load the accession -> lineage mapping from the taxonomy spreadsheet; the loader
# also hands back a row count, which is reported below.
# NOTE(review): start_column=3 presumably marks where the lineage columns begin
# in the CSV -- confirm against load_taxonomy_assignments.
acc_to_lineage, num_rows = command_index.load_taxonomy_assignments('gather-lineages.csv', start_column=3)
print('loaded {} rows'.format(num_rows))

loaded 797 rows


examining spreadsheet headers...
** assuming column 'accession' is identifiers in spreadsheet


In [19]:
# Peek at a single (accession, lineage) entry to check the shape of the mapping.
list(acc_to_lineage.items())[:1]

[('NC_000917',
  (LineagePair(rank='superkingdom', name='Archaea'),
   LineagePair(rank='phylum', name='Euryarchaeota'),
   LineagePair(rank='class', name='Archaeoglobi'),
   LineagePair(rank='order', name='Archaeoglobales'),
   LineagePair(rank='family', name='Archaeoglobaceae'),
   LineagePair(rank='genus', name='Archaeoglobus'),
   LineagePair(rank='species', name='Archaeoglobus fulgidus')))]

In [20]:
def get_lineage_by_acc(acc):
    """Return the lineage stored in `acc_to_lineage` for an accession, or None.

    `acc` may be a bare accession or a longer sequence name: everything from
    the first space onward is dropped, then everything from the first '.'
    onward, and the remainder is used as the lookup key.
    """
    first_word, _, _ = acc.partition(' ')
    bare_accession, _, _ = first_word.partition('.')
    return acc_to_lineage.get(bare_accession)

# Example for 'get_lineage_by_acc': look up one accession, then show both the
# raw lineage tuple and its semicolon-joined form.
lineage = get_lineage_by_acc('NC_000917')
print(lineage)
print(format_lineage(lineage))

(LineagePair(rank='superkingdom', name='Archaea'), LineagePair(rank='phylum', name='Euryarchaeota'), LineagePair(rank='class', name='Archaeoglobi'), LineagePair(rank='order', name='Archaeoglobales'), LineagePair(rank='family', name='Archaeoglobaceae'), LineagePair(rank='genus', name='Archaeoglobus'), LineagePair(rank='species', name='Archaeoglobus fulgidus'))
Archaea;Euryarchaeota;Archaeoglobi;Archaeoglobales;Archaeoglobaceae;Archaeoglobus;Archaeoglobus fulgidus


In [21]:
def load_sourmash_gather_csv(filename):
    """Read a `sourmash gather` CSV file into a list of row mappings.

    Parameters
    ----------
    filename : str
        Path to the CSV file; its first line is used as the header row.

    Returns
    -------
    list
        One mapping per data row, keyed by column name. Values are left as
        the strings that `csv.DictReader` yields; nothing is converted.
    """
    # newline='' leaves newline handling to the csv module, as its
    # documentation requires, so line breaks embedded in quoted fields are
    # kept exactly as written and not rewritten by universal-newline mode.
    with open(filename, 'rt', newline='') as fp:
        return list(csv.DictReader(fp))

# Show the first parsed row to see which columns the gather CSV provides.
load_sourmash_gather_csv('output/Huttenhower_HC1.fasta.gz.scaled10k.k51.gather.matches.csv')[:1]

[OrderedDict([('intersect_bp', '330000'),
              ('f_orig_query', '0.008465879938429965'),
              ('f_match', '0.03466386554621849'),
              ('f_unique_to_query', '0.008465879938429965'),
              ('name',
               'NC_013132.1 Chitinophaga pinensis DSM 2588, complete genome'),
              ('filename', 'inputs/databases/genbank-k51.sbt.json'),
              ('md5', 'd06a405d39c8ebed579540db2994afe8')])]

In [22]:
def make_gather_lineages(filename):
    """Load a gather CSV and attach a 'lineage' entry to every row.

    Each row read by `load_sourmash_gather_csv` is copied into a fresh dict,
    and the result of `get_lineage_by_acc` on the row's 'name' column is
    stored under the key 'lineage'. That value is None when the accession is
    not present in the lineage mapping. The loaded rows are not modified.
    """
    return [
        dict(match, lineage=get_lineage_by_acc(match['name']))
        for match in load_sourmash_gather_csv(filename)
    ]

# Show the first gather row together with the lineage that was attached to it.
make_gather_lineages('output/Huttenhower_HC1.fasta.gz.scaled10k.k51.gather.matches.csv')[:1]

[{'f_match': '0.03466386554621849',
  'f_orig_query': '0.008465879938429965',
  'f_unique_to_query': '0.008465879938429965',
  'filename': 'inputs/databases/genbank-k51.sbt.json',
  'intersect_bp': '330000',
  'lineage': (LineagePair(rank='superkingdom', name='Bacteria'),
   LineagePair(rank='phylum', name='Bacteroidetes'),
   LineagePair(rank='class', name='Chitinophagia'),
   LineagePair(rank='order', name='Chitinophagales'),
   LineagePair(rank='family', name='Chitinophagaceae'),
   LineagePair(rank='genus', name='Chitinophaga'),
   LineagePair(rank='species', name='Chitinophaga pinensis')),
  'md5': 'd06a405d39c8ebed579540db2994afe8',
  'name': 'NC_013132.1 Chitinophaga pinensis DSM 2588, complete genome'}]

## Truth files: loading in & fleshing out NCBI taxonomy

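Each row of a truth file gives only an NCBI taxid, a rank, and a name, so we also need to expand every taxid into a full lineage using the NCBI taxonomy loaded above :).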

In [23]:
def load_truth_file(filename):
    """Parse a tab-separated truth file into a list of row tuples.

    Every non-blank line must hold exactly five tab-separated fields:
    taxid, two further values that are passed through untouched, rank, and
    name. Blank or whitespace-only lines (such as a trailing empty line) are
    skipped.

    Parameters
    ----------
    filename : str
        Path to the truth file.

    Returns
    -------
    list of tuple
        `(taxid, a, b, rank, name)` per line, with `taxid` converted to int
        and every other field kept as a string.

    Raises
    ------
    ValueError
        If a non-blank line does not have exactly five fields, or its taxid
        is not an integer.
    """
    rows = []
    with open(filename, 'rt') as fp:
        # Stream the file line by line; the line number is only used to make
        # the error message for a malformed row actionable.
        for lineno, raw_line in enumerate(fp, start=1):
            line = raw_line.strip()
            if not line:
                continue

            fields = line.split('\t')
            if len(fields) != 5:
                raise ValueError(
                    '{}:{}: expected 5 tab-separated fields, got {}'.format(
                        filename, lineno, len(fields)))

            taxid, a, b, rank, name = fields
            rows.append((int(taxid), a, b, rank, name))
    return rows
    
# Show the first five parsed truth rows as (taxid, a, b, rank, name) tuples.
print(load_truth_file('truth_sets/species/Huttenhower_HC1_TRUTH.txt')[:5])

[(84980, '1.00000', '0.01000', 'species', 'Desulfotalea psychrophila'), (1428, '1.00000', '0.01000', 'species', 'Bacillus thuringiensis'), (1496, '1.00000', '0.01000', 'species', 'Peptoclostridium difficile'), (1525, '1.00000', '0.01000', 'species', 'Moorella thermoacetica'), (1423, '1.00000', '0.01000', 'species', 'Bacillus subtilis')]


In [24]:
def make_lineage_from_taxid(taxid):
    """Build a tuple of `LineagePair(rank, name)` entries for an NCBI taxid.

    The rank -> name mapping comes from `taxfoo.get_lineage_as_dict`, called
    with the `want_taxonomy` rank list. The pairs are emitted in whatever
    order that mapping iterates.

    NOTE(review): the recorded sample output in this notebook lists species
    first, while the lineages loaded for gather results start at
    superkingdom. Confirm that downstream comparisons do not rely on both
    kinds of tuple sharing the same order.
    """
    rank_to_name = taxfoo.get_lineage_as_dict(taxid, want_taxonomy)
    return tuple(
        lca_utils.LineagePair(rank, name)
        for rank, name in rank_to_name.items()
    )

# Example: expand the taxid of the first truth-file row shown above into a lineage.
make_lineage_from_taxid(84980)

(LineagePair(rank='species', name='Desulfotalea psychrophila'),
 LineagePair(rank='genus', name='Desulfotalea'),
 LineagePair(rank='family', name='Desulfobulbaceae'),
 LineagePair(rank='order', name='Desulfobacterales'),
 LineagePair(rank='class', name='Deltaproteobacteria'),
 LineagePair(rank='phylum', name='Proteobacteria'),
 LineagePair(rank='superkingdom', name='Bacteria'))

In [25]:
def make_truth_lineages(filename):
    """Load a truth file and append a lineage to every row.

    Each `(taxid, a, b, rank, name)` row from `load_truth_file` becomes
    `(taxid, a, b, rank, name, lineage)`, where `lineage` is built by
    `make_lineage_from_taxid`. As a sanity check, whenever the lineage has an
    entry at the row's own rank whose name differs from the truth-file name,
    a 'DISAGREE' line is printed; the row is kept either way.
    """
    annotated = []
    for taxid, a, b, rank, name in load_truth_file(filename):
        lineage = make_lineage_from_taxid(taxid)
        annotated.append((taxid, a, b, rank, name, lineage))

        # Compare only the lineage entries that sit at the truth row's rank.
        names_at_rank = [pair.name for pair in lineage if pair.rank == rank]
        for ncbi_name in names_at_rank:
            if ncbi_name == name:
                continue
            print('DISAGREE: ncbi={}, truthfile={}'.format(ncbi_name, name))
    return annotated

# Build the lineage-annotated truth rows for this sample. Any 'DISAGREE' lines
# printed here flag names that differ between NCBI and the truth file.
truth_lineages = make_truth_lineages('truth_sets/species/Huttenhower_HC1_TRUTH.txt')

DISAGREE: ncbi=Clostridioides difficile, truthfile=Peptoclostridium difficile
DISAGREE: ncbi=Sediminispirochaeta smaragdinae, truthfile=Spirochaeta smaragdinae
