# Visualize glycoprotein phylogeny

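This notebook creates the final tree figures (codon and protein phylogenies) using `ete3`, along with an alignment figure of the isolates chosen for validation and a percent amino-acid identity bar that accompanies the codon tree.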

In [None]:
# Imports
import os
import subprocess
import warnings

# Need this line to render ete3 trees
# https://github.com/etetoolkit/ete/issues/296
os.environ["QT_QPA_PLATFORM"]= "offscreen"

import ete3
import matplotlib.colors
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import yaml
from Bio import SeqIO, AlignIO
from pymsaviz import MsaViz 

# Create color palette
def color_gradient_hex(start, end, n):
    """
    Return `n` hex color strings forming a linear gradient from `start` to `end`.

    Color function from polyclonal.
    """
    gradient = matplotlib.colors.LinearSegmentedColormap.from_list(
        name="_",
        colors=[start, end],
        N=n,
    )
    # Integer inputs index the colormap's lookup table directly,
    # giving one RGBA tuple per gradient step
    hex_codes = []
    for rgba in gradient(list(range(n))):
        hex_codes.append(matplotlib.colors.rgb2hex(rgba))
    return hex_codes

# White to black color scale with 11 steps
# (index 0 is white, index 10 is black)
gray_scale = color_gradient_hex("white", "#000000", n=11)

# Color palette, rearranged to make the tree look nicer.
# Index 0 (black) is used for tip-label text in the tree functions;
# the remaining entries are assigned to countries in a later cell.
# NOTE(review): the name suggests this is derived from Paul Tol's
# "muted" palette -- confirm.
tol_muted_adjusted = [
    "#000000",
    "#CC6677", 
    "#1f78b4", 
    "#88CCEE",
    "#882255",
    "#117733", 
    "#DDCC77",
    "#44AA99", 
    "#999933", 
    "#AA4499", 
    "#EE7733",
    "#CC3311",
    "#DDDDDD",
]

# Seaborn style settings: 300 dpi for displayed and saved figures,
# "ticks" style, and the palette above as the default color cycle.
# Order matters: sns.set() is called first, then style and palette.
sns.set(rc={"figure.dpi":300, "savefig.dpi":300})
sns.set_style("ticks")
sns.set_palette(tol_muted_adjusted)

# Suppress all warnings for the rest of the notebook
warnings.simplefilter("ignore")

In [None]:
# Open config file (relative path, so it is resolved against the
# directory the notebook is executed from)
config_path = "Configure/config.yml"
with open(config_path) as f:
    config = yaml.safe_load(f)

# Set paths
# Validation-isolate alignment: input protein FASTA, the FASTA written
# for the chosen isolates, the mafft alignment, the list of chosen
# isolate names, and the output alignment figure
protein_path = config["Protein_sequences"]
isolate_sequences = config["Protein_validation_sequences"]
isolate_alignment = config["GPC_protein_validation_alignment"]
chosen_validations = config["Validation_sequence_names"]
alignment_figure = config["GPC_protein_validation_alignment_figure"]
# Tree files: the config stores a prefix and ".treefile" is appended here
codon_tree_file_path = config["GPC_codon_reduced_tree_prefix"] + ".treefile"
protein_tree_file_path = config["GPC_protein_reduced_tree_prefix"] + ".treefile"
# Alignments and metadata read by later cells
alignment_file_path = config["GPC_codon_reduced_alignment"]
protein_alignment_file_path = config["GPC_protein_reduced_alignment"]
metadata_file_path = config["Metadata"]
# Output directory and output figure paths
figure_dir = config["Figures_dir"]
codon_tree_figure_file_path = config["GPC_codon_reduced_tree_figure"]
percent_identity_to_josiah_figure_file_path = config["Amino_acid_identity_to_josiah_figure"]
protein_tree_figure_file_path = config["GPC_protein_reduced_tree_figure"]

In [None]:
# # Uncomment to run interactively
# # Open config file
# config_path = "../Configure/config.yml"
# with open(config_path) as f:
#     config = yaml.safe_load(f)

# # Set paths
# protein_path = "../" + config["Protein_sequences"]
# isolate_sequences = "../" + config["Protein_validation_sequences"]
# isolate_alignment = "../" + config["GPC_protein_validation_alignment"]
# chosen_validations = config["Validation_sequence_names"]
# alignment_figure = "../" + config["GPC_protein_validation_alignment_figure"]
# codon_tree_file_path = "../" + config["GPC_codon_reduced_tree_prefix"] + ".treefile"
# protein_tree_file_path = "../" + config["GPC_protein_reduced_tree_prefix"] + ".treefile"
# alignment_file_path = "../" + config["GPC_codon_reduced_alignment"]
# protein_alignment_file_path = "../" + config["GPC_protein_reduced_alignment"]
# metadata_file_path = "../" + config["Metadata"]
# figure_dir = "../" + config["Figures_dir"]
# codon_tree_figure_file_path = "../" + config["GPC_codon_reduced_tree_figure"]
# percent_identity_to_josiah_figure_file_path = "../" + config["Amino_acid_identity_to_josiah_figure"]
# protein_tree_figure_file_path = "../" + config["GPC_protein_reduced_tree_figure"]

In [None]:
# Load metadata as dataframe
all_metadata = pd.read_csv(metadata_file_path, sep="\t")

# Load list of sequence names from alignment
strains = [
    str(record.id) for record in SeqIO.parse(alignment_file_path, "fasta")
]

# Filter metadata down to the strains present in the alignment
GPC_reduced_metadata = (
    all_metadata.loc[all_metadata["strain"].isin(strains)].copy()
)

# Add Sierra Leone for Josiah strain.
# Select the row by strain name rather than by index label 0: if label 0
# is not in the filtered frame, `.at[0, ...]` would silently append a new,
# otherwise-empty row instead of updating Josiah.
josiah_strain_name = "Josiah_NC_004296_reverse_complement_2018-08-13"
is_josiah = GPC_reduced_metadata["strain"] == josiah_strain_name
assert is_josiah.any(), (
    f"Strain {josiah_strain_name!r} not found in the filtered metadata"
)
GPC_reduced_metadata.loc[is_josiah, "country"] = "Sierra Leone"

# Create more succinct name conversion dict:
# "<first '_'-separated field of strain>_<accession>"
GPC_reduced_metadata["succint_name"] = (
    GPC_reduced_metadata["strain"].str.split("_").str[0]
    + "_"
    + GPC_reduced_metadata["accession"].astype(str)
)
name_conversion = (
    dict(zip(
        GPC_reduced_metadata["strain"].tolist(),
        GPC_reduced_metadata["succint_name"].tolist()
    ))
)

In [None]:
# Map each country to a palette color.
# Palette index 0 (black) is skipped because it is used for tip-label text.
countries = GPC_reduced_metadata["country"].unique().tolist()
country_palette = tol_muted_adjusted[1:]

# zip() would silently drop countries beyond the palette length, which
# only surfaces later as a KeyError while styling the tree -- fail early.
if len(countries) > len(country_palette):
    raise ValueError(
        f"{len(countries)} countries but only {len(country_palette)} "
        "colors available in tol_muted_adjusted[1:]"
    )

country_to_color = dict(zip(countries, country_palette))

# Map name to country
name_to_country = GPC_reduced_metadata.set_index("strain")["country"].to_dict()

In [None]:
# Isolates to highlight in tree (names come from the config file)
to_annotate = config["Validation_sequence_names"]

# Keep only the requested isolates that are present in the filtered metadata
annotate_mask = GPC_reduced_metadata["strain"].isin(to_annotate)
names_to_annotate = GPC_reduced_metadata.loc[annotate_mask, "strain"].tolist()

## Create alignment of isolates chosen for validation

In [None]:
# Sequences identified in config file
isolates = chosen_validations

# Extract chosen validation isolates and write them out under their
# succinct names. The `with` block closes the output file on exit, so no
# explicit close() is needed.
with open(isolate_sequences, "w") as isolate_seqs:
    for curr_fasta in SeqIO.parse(protein_path, "fasta"):
        # Drop the last two characters of the record ID to get the strain
        # name (presumably a protein suffix such as "_1" -- TODO confirm)
        curr_id = str(curr_fasta.id).split(" ")[0][:-2]
        if curr_id in isolates:
            # Use the same key for the membership test and the lookup
            curr_fasta.id = name_conversion[curr_id]
            curr_fasta.description = ""
            SeqIO.write(curr_fasta, isolate_seqs, "fasta")

In [None]:
# Align chosen validation isolates.
# Run mafft without a shell (paths are passed as arguments, not spliced
# into a command string) and raise if it exits with a non-zero status
# instead of silently leaving an empty alignment file behind.
with open(isolate_alignment, "w") as alignment_handle:
    subprocess.run(
        ["mafft", "--auto", "--quiet", isolate_sequences],
        stdout=alignment_handle,
        check=True,
    )

# Visualize alignment
mv = MsaViz(
    isolate_alignment, 
    color_scheme="Identity", 
    consensus_color="lightgrey",
    wrap_length=100, 
    show_grid=True, 
    show_consensus=True
)

mv.set_plot_params(
    grid_color="black", 
    identity_color="lightgrey"
)

# Add text annotations marking alignment ranges for each GPC region
mv.add_text_annotation((1, 58), "Stable Signal Peptide", text_color="#AA4499", range_color="#AA4499")
mv.add_text_annotation((59, 259), "Glycoprotein 1", text_color="#88CCEE", range_color="#88CCEE")
mv.add_text_annotation((260, 491), "Glycoprotein 2", text_color="#EE7733", range_color="#EE7733")

fig = mv.plotfig()

# Make output dir if doesn't exist (also creates missing parent directories)
os.makedirs(figure_dir, exist_ok=True)

fig.savefig(alignment_figure, dpi=300)

## Create a codon tree

In [None]:
# Function to create and style tree
def get_pretty_codon_tree(
    treefile,
    country_to_color,
    name_to_country,
    names_to_annotate,
    name_conversion,
    reference_name="Josiah_NC_004296_reverse_complement_2018-08-13",
    scale=50,
    scale_length=0.5,
    legend_fsize=8,
):
    """
    Build a styled ete3 tree and tree style for the codon phylogeny.

    This function is modified from Equia et al analysis.

    Parameters
    ----------
    treefile : str
        Newick tree passed to `ete3.Tree` with `format=1`.
    country_to_color : dict
        Maps country -> color. Used for tip colors and for the legend.
    name_to_country : dict
        Maps tip name -> country. Every leaf name in the tree must be a
        key, and every country must be a key of `country_to_color`.
    names_to_annotate : collection of str
        Tip names drawn as squares and given a text label.
    name_conversion : dict
        Maps tip name -> label text shown next to annotated tips.
    reference_name : str, optional
        Annotated tip that is drawn with a larger marker and label than
        the other annotated tips. Has no effect unless the name is also
        in `names_to_annotate`.
    scale : int, optional
        Value assigned to `TreeStyle.scale`.
    scale_length : float, optional
        Value assigned to `TreeStyle.scale_length` (scale bar length).
    legend_fsize : int, optional
        Font size of the country names in the legend.

    Returns
    -------
    tuple
        `(t, ts)`: the `ete3.Tree` and the `ete3.TreeStyle` to pass to
        `t.render`.

    Notes
    -----
    Tip-label text color is read from the notebook-level
    `tol_muted_adjusted[0]`.
    """
    # Create ete3 tree object
    t = ete3.Tree(treefile, format=1)

    # Calculate the midpoint node
    # and set it as tree outgroup
    R = t.get_midpoint_outgroup()
    t.set_outgroup(R)
    # Ladderize tree
    t.ladderize()

    ts = ete3.TreeStyle()
    ts.show_leaf_name = False  # add tip names manually
    ts.scale = scale
    ts.branch_vertical_margin = 2

    name_to_color = {name: country_to_color[country] for name, country in name_to_country.items()}

    # Style every node; annotate selected leaves
    for n in t.traverse():
        nstyle = ete3.NodeStyle()
        nstyle["hz_line_width"] = 1
        nstyle["vt_line_width"] = 1
        nstyle["hz_line_color"] = "black"
        nstyle["vt_line_color"] = "black"
        if n.is_leaf():
            nstyle["fgcolor"] = name_to_color[n.name]
            if n.name in names_to_annotate:
                # The reference tip gets a larger marker and label
                is_reference = n.name == reference_name
                nstyle["size"] = 8 if is_reference else 6
                nstyle["shape"] = "square"
                n.add_face(
                    ete3.TextFace(
                        f" {name_conversion[n.name]}",
                        ftype="Arial",
                        tight_text=True,
                        fsize=8 if is_reference else 6,
                        fgcolor=tol_muted_adjusted[0],
                    ),
                    column=0,
                    position="branch-right",
                )
            else:
                nstyle["size"] = 4
        else:
            # Hide markers on internal nodes
            nstyle["size"] = 0
        n.set_style(nstyle)

    # Add legend: one colored circle plus country name per row
    for country, color in country_to_color.items():
        # Marker
        marker = ete3.CircleFace(5, color=color)
        marker.margin_bottom = 1
        marker.margin_right = 8
        ts.legend.add_face(marker, column=0)
        # Text
        text = ete3.TextFace(country, ftype="Arial", fsize=legend_fsize)
        text.margin_bottom = 1
        ts.legend.add_face(text, column=1)
    ts.legend_position = 2

    # Add scale bar but can't edit text
    # scale bar: https://github.com/etetoolkit/ete/issues/266
    ts.show_scale = True
    ts.scale_length = scale_length

    return t, ts

In [None]:
# Create codon tree
t, ts = get_pretty_codon_tree(codon_tree_file_path, country_to_color, name_to_country, names_to_annotate, name_conversion)

# Create dictionary to re-order dataframe for percent IDs later.
# Leaves are numbered in reverse of `get_leaves()` order: the first leaf
# gets the highest index and the last leaf gets 0.
codon_tree_leaves = t.get_leaves()
tree_order_dict = {
    leaf.name: len(codon_tree_leaves) - 1 - position
    for position, leaf in enumerate(codon_tree_leaves)
}

# # Uncomment to display in notebook
# display(
#     t.render(
#         "%%inline", 
#         tree_style=ts,
#         # units="in",
#         # h=6.5,
#         # w=3.5,
#         dpi=300,
#     )
# )

# Make output dir if doesn't exist (also creates missing parent directories)
os.makedirs(figure_dir, exist_ok=True)

# Render image
t.render(
    codon_tree_figure_file_path, 
    tree_style=ts,
    # units="in",
    # h=6.5,
    # w=3.5,
    dpi=300,
)

## Create a percent amino-acid identity bar to accompany codon tree

In [None]:
def percent_ids(seq1, seq2):
    """
    Calculate the percent identity of `seq2` to `seq1`.

    Identity is measured over the non-gap positions of `seq1`: a position
    counts as a match when both sequences have the same character there
    and that character is not a gap ("-").

    Parameters
    ----------
    seq1 : str
        Aligned reference sequence; its non-gap length is the denominator.
    seq2 : str
        Aligned sequence to compare, same length as `seq1`.

    Returns
    -------
    float
        Percent identity in the range 0-100. Returns 0.0 when `seq1` has
        no non-gap positions (empty or all gaps) rather than dividing by
        zero.

    Raises
    ------
    AssertionError
        If the two sequences do not have the same length.
    """
    # Make sure aligned sequences have same length
    assert len(seq1) == len(seq2), "Aligned sequences do not have same length!"

    # Number of non-gap positions in the reference
    num_bases = len(seq1) - seq1.count("-")
    if num_bases == 0:
        # Nothing to compare against; avoid ZeroDivisionError
        return 0.0

    # Count positions that match and are not gaps
    matching_bases = sum(
        1 for ref_char, other_char in zip(seq1, seq2)
        if ref_char == other_char and ref_char != "-"
    )

    # Return percent identity between the two sequences
    return (matching_bases / num_bases) * 100

In [None]:
# Load alignment info: one (strain, sequence) row per aligned record.
# Build the frame in one go instead of appending row by row.
protein_alignment_df = pd.DataFrame(
    [
        (str(record.id), str(record.seq))
        for record in AlignIO.read(protein_alignment_file_path, "fasta")
    ],
    columns=["strain", "sequence"],
)

# Reference sequence for the identity calculation.
# Select Josiah by record ID rather than assuming it is the first record
# in the alignment file.
josiah_strain_name = "Josiah_NC_004296_reverse_complement_2018-08-13"
josiah_sequences = protein_alignment_df.loc[
    protein_alignment_df["strain"] == josiah_strain_name, "sequence"
]
assert len(josiah_sequences) == 1, (
    f"Expected exactly one {josiah_strain_name!r} record in the protein "
    f"alignment, found {len(josiah_sequences)}"
)
josiah_strain = josiah_sequences.iloc[0]

# Percent identity of every sequence to Josiah, plus an integer-rounded copy
protein_alignment_df["percent identity"] = (
    protein_alignment_df["sequence"].map(lambda seq: percent_ids(josiah_strain, seq))
)
protein_alignment_df["percent identity rounded"] = (
    protein_alignment_df["percent identity"].round().astype(int)
)

# Map rounded identities 90..100 onto the 11-step gray scale
# (90 -> first color, 100 -> last color). Values outside 90-100 have no
# entry and map to NaN in the "color" column.
percent_to_color_dict = dict(zip(
    list(range(90, 101)), 
    gray_scale
))

protein_alignment_df["color"] = protein_alignment_df["percent identity rounded"].map(percent_to_color_dict)
# Order rows to follow the codon tree leaf order computed earlier
protein_alignment_df["tree order"] = protein_alignment_df["strain"].map(tree_order_dict)
protein_alignment_df = (
    protein_alignment_df
    .sort_values(by="tree order")
    .reset_index(drop=True)
)

In [None]:
# Create amino acid similarity bar for phylogenetic tree
identity_fig, identity_ax = plt.subplots(figsize=(6, 1))

# Single-row heatmap: one cell per sequence, in tree order
identity_row = np.asarray(protein_alignment_df["percent identity rounded"])
percent_AA_plot = sns.heatmap(
    data=[identity_row],
    ax=identity_ax,
    cmap=gray_scale,
    vmin=90,
    vmax=100,
    linewidths=0.5,
    linecolor="black",
    square=True,
    yticklabels=False,
    xticklabels=False,
)
percent_AA_plot.tick_params(axis="both", length=1, width=0.5)

# Style the colorbar: thin black outline, ticks only at the two extremes
cbar = percent_AA_plot.collections[0].colorbar
cbar.outline.set_color("black")
cbar.outline.set_linewidth(0.5)
cbar.set_ticks(ticks=[100, 90], labels=["100", "90"])
cbar.ax.tick_params(axis="both", length=1, width=0.5)

identity_fig.tight_layout()

identity_fig.savefig(percent_identity_to_josiah_figure_file_path)

## Create a protein tree

In [None]:
# Function to create and style tree
def get_pretty_protein_tree(
    treefile,
    country_to_color,
    name_to_country,
    names_to_annotate,
    name_conversion,
    reference_name="Josiah_NC_004296_reverse_complement_2018-08-13",
    scale=250,
    scale_length=0.05,
    legend_fsize=10,
):
    """
    Build a styled ete3 tree and tree style for the protein phylogeny.

    This function is modified from Equia et al analysis.

    Parameters
    ----------
    treefile : str
        Newick tree passed to `ete3.Tree` with `format=1`.
    country_to_color : dict
        Maps country -> color. Used for tip colors and for the legend.
    name_to_country : dict
        Maps tip name -> country. Every leaf name in the tree must be a
        key, and every country must be a key of `country_to_color`.
    names_to_annotate : collection of str
        Tip names drawn as squares and given a text label.
    name_conversion : dict
        Maps tip name -> label text shown next to annotated tips.
    reference_name : str, optional
        Annotated tip that is drawn with a larger marker and label than
        the other annotated tips. Has no effect unless the name is also
        in `names_to_annotate`.
    scale : int, optional
        Value assigned to `TreeStyle.scale`.
    scale_length : float, optional
        Value assigned to `TreeStyle.scale_length` (scale bar length).
    legend_fsize : int, optional
        Font size of the country names in the legend.

    Returns
    -------
    tuple
        `(t, ts)`: the `ete3.Tree` and the `ete3.TreeStyle` to pass to
        `t.render`.

    Notes
    -----
    Tip-label text color is read from the notebook-level
    `tol_muted_adjusted[0]`.
    """
    # Create ete3 tree object
    t = ete3.Tree(treefile, format=1)

    # Calculate the midpoint node
    # and set it as tree outgroup
    R = t.get_midpoint_outgroup()
    t.set_outgroup(R)
    # Ladderize tree
    t.ladderize()

    ts = ete3.TreeStyle()
    ts.show_leaf_name = False  # add tip names manually
    ts.scale = scale
    ts.branch_vertical_margin = 2

    name_to_color = {name: country_to_color[country] for name, country in name_to_country.items()}

    # Style every node; annotate selected leaves
    for n in t.traverse():
        nstyle = ete3.NodeStyle()
        nstyle["hz_line_width"] = 1
        nstyle["vt_line_width"] = 1
        nstyle["hz_line_color"] = "black"
        nstyle["vt_line_color"] = "black"
        if n.is_leaf():
            nstyle["fgcolor"] = name_to_color[n.name]
            if n.name in names_to_annotate:
                # The reference tip gets a larger marker and label
                is_reference = n.name == reference_name
                nstyle["size"] = 8 if is_reference else 6
                nstyle["shape"] = "square"
                n.add_face(
                    ete3.TextFace(
                        f" {name_conversion[n.name]}",
                        ftype="Arial",
                        tight_text=True,
                        fsize=8 if is_reference else 6,
                        fgcolor=tol_muted_adjusted[0],
                    ),
                    column=0,
                    position="branch-right",
                )
            else:
                nstyle["size"] = 4
        else:
            # Hide markers on internal nodes
            nstyle["size"] = 0
        n.set_style(nstyle)

    # Add legend: one colored circle plus country name per row
    for country, color in country_to_color.items():
        # Marker
        marker = ete3.CircleFace(5, color=color)
        marker.margin_bottom = 1
        marker.margin_right = 8
        ts.legend.add_face(marker, column=0)
        # Text
        text = ete3.TextFace(country, ftype="Arial", fsize=legend_fsize)
        text.margin_bottom = 1
        ts.legend.add_face(text, column=1)
    ts.legend_position = 2

    # Add scale bar but can't edit text
    # scale bar: https://github.com/etetoolkit/ete/issues/266
    ts.show_scale = True
    ts.scale_length = scale_length

    return t, ts

In [None]:
# Create protein tree
t, ts = get_pretty_protein_tree(protein_tree_file_path, country_to_color, name_to_country, names_to_annotate, name_conversion)

# # Uncomment to display in notebook
# display(
#     t.render(
#         "%%inline", 
#         tree_style=ts,
#         # units="in",
#         # h=5,
#         # w=3.5,
#         dpi=300,
#     )
# )

# Make output dir if doesn't exist (also creates missing parent directories)
os.makedirs(figure_dir, exist_ok=True)

# Render image
t.render(
    protein_tree_figure_file_path, 
    tree_style=ts,
    # units="in",
    # h=5,
    # w=3.5,
    dpi=300,
)