In [1]:
from cloudpathlib import CloudPath, AnyPath
import treeViz as tv
import pandas as pd
import logging

# Configure the root logger at INFO level so INFO records are shown, then emit
# a marker record confirming that logging is active.
logging.basicConfig(level=logging.INFO)
logging.getLogger().info("Starting")

INFO:root:Starting


In [2]:
# Run identifier: used for the local output names and embedded in the names of
# the GTDB-Tk result objects on S3.
# NOTE(review): the date segment in gtdb_tk_s3_basepath matches the leading
# date of `prefix`; update both together when pointing at another run.
prefix = "20230502_MITI_Genomes_Tree"

# Destination for the pruned trees and summary outputs written by this notebook.
output_dir = AnyPath("../data/generated/prune_tree/")
output_dir.mkdir(parents=True, exist_ok=True)

# CSV read later for its node_name and display_name columns.
formatted_names_file = AnyPath("../data/imported/display_names.csv")

# Local directory that receives the files downloaded from S3.
processed_genomes = AnyPath("../data/generated/process_genomes")
processed_genomes.mkdir(parents=True, exist_ok=True)

gtdb_tk_s3_basepath = CloudPath(
    "s3://genomics-workflow-core/Results/GTDB-tk-trees/MITI/20230502/classify"
)
# Derive the object names from `prefix` instead of repeating the literal, so
# the remote inputs and the local outputs cannot drift apart.
gtdb_tk_s3_summary_file = (
    gtdb_tk_s3_basepath / f"gtdb.{prefix}.bac120.summary.tsv"
)
gtdb_tk_s3_tree_file = (
    gtdb_tk_s3_basepath / f"gtdb.{prefix}.bac120.classify.tree"
)

# Local copies keep the same file names as the S3 objects.
summary_file = processed_genomes / gtdb_tk_s3_summary_file.name
tree_file = processed_genomes / gtdb_tk_s3_tree_file.name

INFO:botocore.credentials:Found credentials in shared credentials file: ~/.aws/credentials


In [3]:
# Download the GTDB-Tk summary table and classify tree from S3 to their local
# paths under processed_genomes. There is no existence check, so both
# downloads are requested on every run of this cell.
gtdb_tk_s3_summary_file.download_to(summary_file)
# Kept as the cell's last expression: its return value (the local tree path)
# is what the cell displays.
gtdb_tk_s3_tree_file.download_to(tree_file)

PosixPath('../data/generated/process_genomes/gtdb.20230502_MITI_Genomes_Tree.bac120.classify.tree')

In [4]:
# The treeViz calls below receive these three values as POSIX-style strings.
# Each string is rebuilt from the configuration names rather than by rebinding
# the existing variable in place, so the cell can be re-run safely: calling
# .as_posix() on an already-converted str would raise AttributeError.
summary_prefix = (output_dir / prefix).as_posix()
summary_file = (processed_genomes / gtdb_tk_s3_summary_file.name).as_posix()
tree_file = (processed_genomes / gtdb_tk_s3_tree_file.name).as_posix()
# formatted_names_file is intentionally left as a path object; it is passed
# straight to pd.read_csv, which accepts path-like inputs.

In [5]:
# Parse the downloaded GTDB-Tk summary at phylum rank with colouring enabled.
# The call yields three values that the tree-drawing cells reuse.
parsed_summary = tv.parse_summary_file(
    summary_file,
    out_prefix=summary_prefix,
    color=True,
    taxa_rank="phylum",
)
genomes, color_dict, common_ancestor = parsed_summary

INFO:root:Final tree will contain 164 genomes.
INFO:root:Found 8 levels at phylum rank.
INFO:root:{'p__Actinobacteriota': '#a1c9f4', 'p__Bacteroidota': '#ffb482', 'p__Desulfobacterota_I': '#8de5a1', 'p__Firmicutes': '#ff9f9b', 'p__Firmicutes_A': '#d0bbff', 'p__Firmicutes_C': '#debb9b', 'p__Proteobacteria': '#fab0e4', 'p__Verrucomicrobiota': '#cfcfcf'}


In [6]:
# Read only the two columns needed for the node_name -> display_name lookup
# that is later passed to tv.generate_tree as formatted_name_dict.
name_columns = ["node_name", "display_name"]
formatted_name_df = pd.read_csv(formatted_names_file, header=0, usecols=name_columns)

# Index by node_name and export display_name as a plain dict; as with
# dict(zip(...)), a repeated node_name keeps its last display_name.
formatted_name_dict = formatted_name_df.set_index("node_name")["display_name"].to_dict()

### Circular tree with phylum-colored background

In [8]:
# Circular layout, coloured with the phylum palette from the summary parse.
tree_prefix = output_dir / f"{prefix}.circular_w_bgcolor"
circular_bgcolor_options = dict(
    out_prefix=tree_prefix,
    circular=True,
    color_dict=color_dict,
    common_ancestor=common_ancestor,
    formatted_name_dict=formatted_name_dict,
)
tv.generate_tree(tree_file, genomes, **circular_bgcolor_options)

INFO:root:Pruning tree to 164 organisms ...
INFO:root:Saving pruned tree in newick format ...
INFO:root:Drawing pruned tree ...
INFO:root:Adding some color to the tree of 164 organisms ...
INFO:root:Saving pruned tree image ...


### Rectangular tree with phylum-colored background

In [10]:
# Rectangular (non-circular) layout, coloured with the phylum palette from the
# summary parse.
tree_prefix = output_dir / f"{prefix}.rect_w_bgcolor"
rect_bgcolor_options = dict(
    out_prefix=tree_prefix,
    circular=False,
    color_dict=color_dict,
    common_ancestor=common_ancestor,
    formatted_name_dict=formatted_name_dict,
)
tv.generate_tree(tree_file, genomes, **rect_bgcolor_options)

INFO:root:Pruning tree to 164 organisms ...
INFO:root:Saving pruned tree in newick format ...
INFO:root:Drawing pruned tree ...
INFO:root:Adding some color to the tree of 164 organisms ...
INFO:root:Saving pruned tree image ...


### Rectangular tree without color

In [11]:
# Rectangular (non-circular) layout with colouring switched off: both
# color_dict and common_ancestor are passed as None.
tree_prefix = output_dir / f"{prefix}.rect_no_color"
rect_plain_options = dict(
    out_prefix=tree_prefix,
    circular=False,
    color_dict=None,
    common_ancestor=None,
    formatted_name_dict=formatted_name_dict,
)
tv.generate_tree(tree_file, genomes, **rect_plain_options)

INFO:root:Pruning tree to 164 organisms ...
INFO:root:Saving pruned tree in newick format ...
INFO:root:Drawing pruned tree ...
INFO:root:Saving pruned tree image ...
