Skip to content

Commit

Permalink
Fixes #469 NCBI db update crashes due to nocase duplicate synonyms
Browse files Browse the repository at this point in the history
  • Loading branch information
jhcepas committed Aug 29, 2020
1 parent 7597af3 commit 7a1f0c5
Showing 1 changed file with 30 additions and 19 deletions.
49 changes: 30 additions & 19 deletions ete3/ncbi_taxonomy/ncbiquery.py
Original file line number Diff line number Diff line change
Expand Up @@ -219,7 +219,7 @@ def get_lineage_translator(self, taxids):

return id2lineages


def get_lineage(self, taxid):
"""Given a valid taxid number, return its corresponding lineage track as a
hierarchically sorted list of parent taxids.
Expand All @@ -241,7 +241,7 @@ def get_lineage(self, taxid):
raise ValueError("%s taxid not found" %taxid)
else:
warnings.warn("taxid %s was translated into %s" %(taxid, merged_conversion[taxid]))

track = list(map(int, raw_track[0].split(",")))
return list(reversed(track))

Expand Down Expand Up @@ -326,7 +326,7 @@ def translate_to_names(self, taxids):
for sp in taxids:
names.append(id2name.get(sp, sp))
return names


def get_descendant_taxa(self, parent, intermediate_nodes=False, rank_limit=None, collapse_subspecies=False, return_tree=False):
"""
Expand All @@ -342,11 +342,11 @@ def get_descendant_taxa(self, parent, intermediate_nodes=False, rank_limit=None,
except KeyError:
raise ValueError('%s not found!' %parent)

# checks if taxid is a deprecated one, and converts into the right one.
# checks if taxid is a deprecated one, and converts into the right one.
_, conversion = self._translate_merged([taxid]) #try to find taxid in synonyms table
if conversion:
if conversion:
taxid = conversion[taxid]

with open(self.dbfile+".traverse.pkl", "rb") as CACHED_TRAVERSE:
prepostorder = pickle.load(CACHED_TRAVERSE)
descendants = {}
Expand All @@ -358,12 +358,12 @@ def get_descendant_taxa(self, parent, intermediate_nodes=False, rank_limit=None,
descendants[tid] = descendants.get(tid, 0) + 1
elif found == 2:
break

if not found:
raise ValueError("taxid not found:%s" %taxid)
elif found == 1:
return [taxid]
return [taxid]

if rank_limit or collapse_subspecies or return_tree:
tree = self.get_topology(list(descendants.keys()), intermediate_nodes=intermediate_nodes, collapse_subspecies=collapse_subspecies, rank_limit=rank_limit)
if return_tree:
Expand All @@ -372,7 +372,7 @@ def get_descendant_taxa(self, parent, intermediate_nodes=False, rank_limit=None,
return list(map(int, [n.name for n in tree.get_descendants()]))
else:
return map(int, [n.name for n in tree])

elif intermediate_nodes:
return [tid for tid, count in six.iteritems(descendants)]
else:
Expand All @@ -398,7 +398,7 @@ def get_topology(self, taxids, intermediate_nodes=False, rank_limit=None, collap
"""
from .. import PhyloTree
taxids, merged_conversion = self._translate_merged(taxids)
taxids, merged_conversion = self._translate_merged(taxids)
if len(taxids) == 1:
root_taxid = int(list(taxids)[0])
with open(self.dbfile+".traverse.pkl", "rb") as CACHED_TRAVERSE:
Expand All @@ -407,7 +407,7 @@ def get_topology(self, taxids, intermediate_nodes=False, rank_limit=None, collap
found = 0
nodes = {}
hit = 0
visited = set()
visited = set()
start = prepostorder.index(root_taxid)
try:
end = prepostorder.index(root_taxid, start+1)
Expand Down Expand Up @@ -435,7 +435,7 @@ def get_topology(self, taxids, intermediate_nodes=False, rank_limit=None, collap
id2lineage = self.get_lineage_translator(taxids)
all_taxids = set()
for lineage in id2lineage.values():
all_taxids.update(lineage)
all_taxids.update(lineage)
id2rank = self.get_rank(all_taxids)
for sp in taxids:
track = []
Expand Down Expand Up @@ -493,7 +493,7 @@ def annotate_tree(self, t, taxid_attr="name", tax2name=None, tax2track=None, tax
:param t: a Tree (or Tree derived) instance.
:param name taxid_attr: Allows to set a custom node attribute
:param name taxid_attr: Allows to set a custom node attribute
containing the taxid number associated to each node (i.e.
species in PhyloTree instances).
Expand All @@ -513,7 +513,7 @@ def annotate_tree(self, t, taxid_attr="name", tax2name=None, tax2track=None, tax
merged_conversion = {}

taxids, merged_conversion = self._translate_merged(taxids)

if not tax2name or taxids - set(map(int, list(tax2name.keys()))):
tax2name = self.get_taxid_translator(taxids)
if not tax2track or taxids - set(map(int, list(tax2track.keys()))):
Expand Down Expand Up @@ -543,7 +543,7 @@ def annotate_tree(self, t, taxid_attr="name", tax2name=None, tax2track=None, tax
n.add_features(sci_name = tax2name.get(node_taxid, getattr(n, taxid_attr, '')),
common_name = tax2common_name.get(node_taxid, ''),
lineage = tax2track.get(node_taxid, []),
rank = tax2rank.get(node_taxid, 'Unknown'),
rank = tax2rank.get(node_taxid, 'Unknown'),
named_lineage = [tax2name.get(tax, str(tax)) for tax in tax2track.get(node_taxid, [])])
elif n.is_leaf():
n.add_features(sci_name = getattr(n, taxid_attr, 'NA'),
Expand Down Expand Up @@ -674,19 +674,30 @@ def load_ncbi_tree_from_dump(tar):
name2rank = {}
node2common = {}
print("Loading node names...")
unique_nocase_synonyms = set()
for line in tar.extractfile("names.dmp"):
line = str(line.decode())
fields = [_f.strip() for _f in line.split("|")]
nodename = fields[0]
name_type = fields[3].lower()
taxname = fields[1]

# Clean up tax names so we make sure the don't include quotes. See https://github.com/etetoolkit/ete/issues/469
taxname = taxname.rstrip('"').lstrip('"')

if name_type == "scientific name":
node2taxname[nodename] = taxname
if name_type == "genbank common name":
node2common[nodename] = taxname
elif name_type in set(["synonym", "equivalent name", "genbank equivalent name",
"anamorph", "genbank synonym", "genbank anamorph", "teleomorph"]):
synonyms.add( (nodename, taxname) )

# Keep track synonyms, but ignore duplicate case-insensitive names. See https://github.com/etetoolkit/ete/issues/469
synonym_key = (nodename, taxname.lower())
if synonym_key not in unique_nocase_synonyms:
unique_nocase_synonyms.add(synonym_key)
synonyms.add((nodename, taxname))

print(len(node2taxname), "names loaded.")
print(len(synonyms), "synonyms loaded.")

Expand Down Expand Up @@ -749,7 +760,7 @@ def update_db(dbfile, targz_file=None):
md5_check = md5_file.readline().split()[0]
targz_file = "taxdump.tar.gz"
do_download = False

if os.path.exists("taxdump.tar.gz"):
local_md5 = md5(open("taxdump.tar.gz", "rb").read()).hexdigest()
if local_md5 != md5_check:
Expand Down Expand Up @@ -786,7 +797,7 @@ def update_db(dbfile, targz_file=None):
raise
else:
os.system("rm syn.tab merged.tab taxa.tab")
# remove only downloaded taxdump file
# remove only downloaded taxdump file
if not targz_file:
os.system("rm taxdump.tar.gz")

Expand Down

0 comments on commit 7a1f0c5

Please sign in to comment.