## Upgrading the local database

In [1]:
from gtdbquery import GTDBTaxa

In [2]:
# Path to the GTDB taxonomy dump archive (the file name indicates release 202).
dumpfile = "./gtdbdump/gtdbr202dump.tar.gz"

# Create the handle once and run the update through that same instance, passing
# the path via `dumpfile` so it is defined in exactly one place.
gtdb = GTDBTaxa()
gtdb.update_taxonomy_database(taxdump_file=dumpfile)

GTDB database not present yet (first time used?)


/home/deng/Projects/ete4/GTDB_demo/ete_gtdb/gtdbdump/gtdbr202dump.tar.gz
Loading node names...
324355 names loaded.
1 synonyms loaded.
Loading nodes...
324355 nodes loaded.
Linking nodes...
Tree is loaded.
Updating database: /home/deng/Projects/ete4/GTDB_demo/ete_gtdb/.gtdb/gtdbtaxa.sqlite ...
 319000 generating entries... 

Inserting taxids:       20000 

 324000 generating entries... 
Uploading to /home/deng/Projects/ete4/GTDB_demo/ete_gtdb/.gtdb/gtdbtaxa.sqlite



Inserting taxids:       320000  




In [15]:
# Run the database update again from the same dump, this time through a
# throwaway GTDBTaxa instance.
GTDBTaxa().update_taxonomy_database(
    taxdump_file="./gtdbdump/gtdbr202dump.tar.gz",
)

./gtdbdump/gtdbr202dump.tar.gz
Loading node names...
324355 names loaded.
1 synonyms loaded.
Loading nodes...
324355 nodes loaded.
Linking nodes...
Tree is loaded.
Updating database: /home/deng/Projects/ete4/GTDB_demo/ete_gtdb/.gtdb/gtdbtaxa.sqlite ...
 324000 generating entries... 
Uploading to /home/deng/Projects/ete4/GTDB_demo/ete_gtdb/.gtdb/gtdbtaxa.sqlite


Inserting taxids:       30000 




Inserting taxids:       320000 




### Customizing the database file (`dbfile`)

In [14]:
# Open an explicitly chosen SQLite taxonomy file instead of calling GTDBTaxa()
# with no arguments.
gtdb = GTDBTaxa(
    dbfile="./.gtdb/gtdbtaxa.sqlite",
)

## Getting taxa information

Note that the taxIDs are arbitrarily assigned and do not correspond to NCBI taxonomy IDs! Running `gtdb_to_taxdump` on a different list of taxonomies (e.g., a different GTDB release) will produce different taxIDs.

In [1]:
# Uncomment to list the attributes and methods exposed by the `gtdb` object:
#dir(gtdb)

In [10]:
# Look up the locally assigned taxids for two phylum names; the result maps
# each name to a list of taxids.
phylum_names = ['p__Bacteroidota', 'p__Firmicutes_F']
name2taxid = gtdb.get_name_translator(phylum_names)
print(name2taxid)

{'p__Bacteroidota': [139581], 'p__Firmicutes_F': [175734]}


In [11]:
# The same lookup for a single genus-level name; the bare expression lets the
# notebook display the returned mapping.
gtdb.get_name_translator(
    ['g__Methanodesulfokores'],
)

{'g__Methanodesulfokores': [2561]}

In [12]:
# Use the first taxid registered for the phylum and fetch its lineage
# (printed from the root down to the phylum itself).
bacteroidota_taxid = name2taxid['p__Bacteroidota'][0]
lineage = gtdb.get_lineage(bacteroidota_taxid)
print(lineage)

# Translate every taxid in the lineage back to its name, keeping lineage order.
names = gtdb.get_taxid_translator(lineage)
lineage_names = [names[node_id] for node_id in lineage]
print(lineage_names)

[1, 8034, 139581]
['root', 'd__Bacteria', 'p__Bacteroidota']


## Getting descendant taxa

In [17]:
from gtdbquery import GTDBTaxa

gtdb = GTDBTaxa()
family = 'f__Korarchaeaceae'

# Default call: reports only the genome accessions found under the family.
descendants = gtdb.get_descendant_taxa(family)
print(descendants)

# Variant kept for reference (not executed): intermediate nodes with subspecies
# collapsed, returned as a tree rather than a list.
# descendants = gtdb.get_descendant_taxa(family, intermediate_nodes=True, collapse_subspecies=True, return_tree=True)
# print(descendants)

# Also report the intermediate genus and species nodes alongside the accessions.
descendants = gtdb.get_descendant_taxa(
    family,
    intermediate_nodes=True,
    collapse_subspecies=False,
    return_tree=False,
)
print(descendants)

# Collapse subspecies so that only taxa labelled as "species" are reported.
descendants = gtdb.get_descendant_taxa(
    family,
    intermediate_nodes=False,
    collapse_subspecies=True,
    return_tree=False,
)
print(descendants)

# The collapsed result can also be returned as an annotated tree.
tree = gtdb.get_descendant_taxa(family, collapse_subspecies=True, return_tree=True)
tree_ascii = tree.get_ascii(attributes=['sci_name', 'taxid', 'rank'])
print(tree_ascii)

['GB_GCA_011358815.1', 'RS_GCF_000019605.1', 'RS_GCF_003948265.1', 'GB_GCA_003344655.1', 'GB_GCA_011056255.1', 'RS_GCF_003947435.1', 'GB_GCA_004347975.1', 'GB_GCA_004212055.1', 'GB_GCA_011042755.1']
['g__Korarchaeum', 's__Korarchaeum cryptofilum', 'GB_GCA_011358815.1', 'RS_GCF_000019605.1', 'RS_GCF_003948265.1', 's__Korarchaeum sp003344655', 'GB_GCA_003344655.1', 's__Korarchaeum sp011056255', 'GB_GCA_011056255.1', 'g__Methanodesulfokores', 's__Methanodesulfokores washburnensis', 'RS_GCF_003947435.1', 'GB_GCA_004347975.1', 'GB_GCA_004212055.1', 'g__DRBY01', 's__DRBY01 sp011042755', 'GB_GCA_011042755.1']
['s__Korarchaeum cryptofilum', 's__Korarchaeum sp003344655', 's__Korarchaeum sp011056255', 's__Methanodesulfokores washburnensis', 's__DRBY01 sp011042755']

                                                                                 /-s__Korarchaeum cryptofilum, s__Korarchaeum cryptofilum, species
                                                                                |
    

## Getting GTDB species tree topology

In [18]:
from gtdbquery import GTDBTaxa

gtdb = GTDBTaxa()

# Taxa of different ranks can be mixed in a single topology request.
# NOTE(review): "s__Korarchaeum" carries a species prefix but looks like a genus
# name, and it does not appear in the printed tree - confirm whether
# "g__Korarchaeum" was intended.
query_taxa = [
    "p__Huberarchaeota",
    "o__Peptococcales",
    "f__Korarchaeaceae",
    "s__Korarchaeum",
]
tree = gtdb.get_topology(
    query_taxa,
    intermediate_nodes=True,
    collapse_subspecies=True,
)
tree_ascii = tree.get_ascii(attributes=["taxid", "sci_name", "rank"])
print(tree_ascii)


                                                       /-p__Huberarchaeota, p__Huberarchaeota, phylum
                   /d__Archaea, d__Archaea, superkingdom
-root, root, no rank                                   \p__Thermoproteota, p__Thermoproteota, phylumc__Korarchaeia, c__Korarchaeia, classo__Korarchaeales, o__Korarchaeales, order-f__Korarchaeaceae, f__Korarchaeaceae, family
                  |
                   \d__Bacteria, d__Bacteria, superkingdomp__Firmicutes_B, p__Firmicutes_B, phylumc__Peptococcia, c__Peptococcia, class-o__Peptococcales, o__Peptococcales, order


In [7]:
from gtdbquery import GTDBTaxa

gtdb = GTDBTaxa()

# With default options the family's topology keeps the genome-level leaves,
# which the output labels with the rank "subspecies".
tree = gtdb.get_topology(['f__Korarchaeaceae'])
tree_ascii = tree.get_ascii(attributes=["taxid", "rank"])
print(tree_ascii)


                                                                                 /-GB_GCA_011358815.1, subspecies
                                                                                |
                                              /s__Korarchaeum cryptofilum, species-RS_GCF_000019605.1, subspecies
                                             |                                  |
                                             |                                   \-RS_GCF_003948265.1, subspecies
                         /g__Korarchaeum, genus
                        |                    |--GB_GCA_003344655.1, subspecies
                        |                    |
                        |                     \-GB_GCA_011056255.1, subspecies
                        |
-f__Korarchaeaceae, family                                            /-RS_GCF_003947435.1, subspecies
                        |                                            |
                        |-s__Methanodesu

In [12]:
from gtdbquery import GTDBTaxa

gtdb = GTDBTaxa()

# intermediate_nodes=True keeps the ranks between the root and each requested
# taxon (see the d__Bacteria ... f__UBA4738 path in the output).
query_taxa = ['d__Archaea', 'f__UBA4738']
tree = gtdb.get_topology(query_taxa, intermediate_nodes=True)
tree_ascii = tree.get_ascii(attributes=["taxid", "sci_name"])
print(tree_ascii)


          /-d__Archaea, d__Archaea
-root, root
          \d__Bacteria, d__Bacteriap__Actinobacteriota, p__Actinobacteriotac__UBA4738, c__UBA4738o__UBA4738, o__UBA4738-f__UBA4738, f__UBA4738


In [9]:
# GTDB demo: annotate an existing tree whose leaves are named with GTDB taxa.
from ete3 import PhyloTree
from gtdbquery import GTDBTaxa

gtdb = GTDBTaxa()

# Alternative input using numeric taxids as leaf names (not executed):
# tree = PhyloTree('((161839, 163565), 1795);', sp_naming_function=lambda name: name)
newick = '((c__Thorarchaeia, c__Lokiarchaeia_A), s__Caballeronia udeis);'
tree = PhyloTree(newick)
# print(tree.get_ascii(attributes=["name", "sci_name", "taxid"]))

# taxid_attr="name" presumably makes annotate_tree read each node's identifier
# from its `name` attribute - confirm against the gtdbquery implementation.
tax2name, tax2track, tax2rank = gtdb.annotate_tree(tree, taxid_attr="name")
annotated_ascii = tree.get_ascii(attributes=["taxid", "name", "sci_name", "rank"])
print(annotated_ascii)


                                                                     /-c__Thorarchaeia, c__Thorarchaeia, c__Thorarchaeia, class
                     /p__Asgardarchaeota, , p__Asgardarchaeota, phylum
-root, , root, no rank                                               \-c__Lokiarchaeia_A, c__Lokiarchaeia_A, c__Lokiarchaeia_A, class
                    |
                     \-s__Caballeronia udeis, s__Caballeronia udeis, s__Caballeronia udeis, species
