# make_nipah_phylogeny_baltic
Script to read in a Newick tree file and make a pretty phylogeny, with tips colored by country of origin, using the baltic package (https://github.com/evogytis/baltic).

* Written by Brendan Larsen

In [None]:
# this cell is tagged as parameters for `papermill` parameterization
# Both names default to None here as placeholders for values injected by papermill.
input_phylo = None  # tree file handed to bt.loadNewick further down
output_img = None  # destination handed to plt.savefig inside make_tree

In [None]:
from Bio import Entrez
from Bio import SeqIO
from Bio import AlignIO
from Bio.Align import MultipleSeqAlignment
from Bio.Align.Applications import MafftCommandline
from Bio.Seq import Seq

import matplotlib.pyplot as plt
import matplotlib as mpl
import matplotlib.patheffects as path_effects
import matplotlib.patches as mpatches
from matplotlib import gridspec,patheffects
import pandas as pd
import os
from matplotlib.patches import Rectangle,ConnectionPatch
import baltic as bt

Set up working directory

In [None]:
# Run from the project root so the relative `data/...` paths used later resolve.
project_dir = '/fh/fast/bloom_j/computational_notebooks/blarsen/2023/Nipah_Malaysia_RBP_DMS/'

# os.getcwd() never ends in a slash, so a literal comparison against
# `project_dir` (which does) could never succeed; compare resolved paths instead.
if os.path.realpath(os.getcwd()) == os.path.realpath(project_dir):
    print("Already in correct directory")
else:
    os.chdir(project_dir)
    print("Setup in correct directory")

Load in newick tree

In [None]:
# Load the tree from the papermill-supplied path.
# The pattern is a raw string so that `\-` reaches the regex engine unchanged
# instead of being parsed as an (invalid) Python string escape.
tree = bt.loadNewick(input_phylo, tip_regex=r'-([0-9\-]+)$', absoluteTime=False)

Function to plot tree (defined further down, under "Now make the tree")

Function to get country of origin information from genbank files

In [None]:
# Always provide your email address
# Set once at module level, before any Entrez.efetch request below is made.
Entrez.email = "blarsen@fredhutch.org"

def _country_from_record(record):
    """Return the country recorded on a record's "source" features, or None.

    The legacy "country" qualifier is searched first across all source
    features; only if none carries it is "geo_loc_name" (the qualifier name
    used by newer GenBank flat files) consulted.
    """
    for qualifier in ("country", "geo_loc_name"):
        for feature in record.features:
            if feature.type != "source":
                continue
            values = feature.qualifiers.get(qualifier)
            if values:
                return values[0]
    return None


def get_country_from_accession_id(accession_ids):
    """Look up the country of origin for each GenBank accession ID.

    One ``Entrez.efetch`` request (nucleotide database, GenBank text format)
    is made per accession ID, so this needs network access.

    Args:
        accession_ids: iterable of accession ID strings.

    Returns:
        dict mapping accession ID -> country string, or None when the record
        has no country qualifier. IDs whose record could not be parsed are
        reported with ``print`` and left out of the dict.

    Raises:
        Whatever ``Entrez.efetch`` raises; fetch errors are not caught here.
    """
    country_info = {}

    for accession_id in accession_ids:
        # Fetch the record from GenBank
        handle = Entrez.efetch(db="nucleotide", id=accession_id, rettype="gb", retmode="text")

        # Parse the GenBank record; `finally` closes the handle on every path
        try:
            record = SeqIO.read(handle, "genbank")
        except Exception as e:
            print(f"Error reading record for {accession_id}: {e}")
            continue
        finally:
            handle.close()

        country_info[accession_id] = _country_from_record(record)

    return country_info

def read_accession_ids_from_file(file_path):
    """Read accession IDs from a text file, one ID per line.

    Surrounding whitespace is stripped from every line. Blank or
    whitespace-only lines are skipped so that no empty ID is passed on to the
    Entrez lookup.

    Args:
        file_path: path of the text file to read.

    Returns:
        list of accession ID strings, in file order.
    """
    with open(file_path, 'r') as file:
        stripped_lines = (line.strip() for line in file)
        return [accession_id for accession_id in stripped_lines if accession_id]


# Text file listing the GenBank accession IDs, one per line
# (relative path: resolved against the working directory set earlier).
file_path = 'data/custom_analyses_data/alignments/phylo/nipah_whole_genome_genbank_accession_ids.txt'
accession_ids = read_accession_ids_from_file(file_path)

# Fetch country information for each accession ID
# (one Entrez.efetch request per ID, so this step needs network access)
country_info = get_country_from_accession_id(accession_ids)
print(country_info)

Convert country information to dataframe

In [None]:
def convert_to_dataframe(country_info):
    """Turn an accession -> country mapping into a two-column DataFrame.

    Args:
        country_info: dict mapping accession ID to country (or None).

    Returns:
        DataFrame with one row per dict entry, in dict order, and the
        columns 'Accession ID' and 'Country'.
    """
    rows = [(accession, country) for accession, country in country_info.items()]
    return pd.DataFrame(rows, columns=['Accession ID', 'Country'])
    
# Convert country information to pandas DataFrame
country_df = convert_to_dataframe(country_info)
display(country_df)

# Load a country table stored on disk. This table, not `country_df`, is what
# is passed to get_accessions_by_country further down.
# NOTE(review): the names suggest this CSV is a hand-corrected copy of country_df — confirm.
fixed_country_df = pd.read_csv('data/custom_analyses_data/alignments/phylo/countrydf.csv')
display(fixed_country_df)

In [None]:
def get_accessions_by_country(country_df):
    """Group accession IDs by country.

    Args:
        country_df: DataFrame with 'Accession ID' and 'Country' columns.

    Returns:
        dict mapping each distinct 'Country' value, in order of first
        appearance, to the list of accession IDs in that country. Rows with a
        missing country are grouped together under the missing value itself
        (NaN/None) rather than being dropped.
    """
    countries = country_df['Country']
    country_dict = {}
    for country in countries.unique():
        # A missing value never compares equal to itself, so `==` would match
        # no rows; select missing-country rows with isna() instead.
        if pd.isna(country):
            mask = countries.isna()
        else:
            mask = countries == country
        country_dict[country] = list(country_df.loc[mask, 'Accession ID'])
    return country_dict

# Group accession IDs by country using the CSV-loaded table (fixed_country_df),
# not the freshly fetched country_df.
country_dict = get_accessions_by_country(fixed_country_df)

Now make the tree

In [None]:
def _tip_colour_tables(country_dict, palette):
    """Build the country -> colour and tip name -> colour lookup tables."""
    country_colours = {
        country: palette[i % len(palette)]
        for i, country in enumerate(country_dict)
    }
    tip_colours = {}
    for country, names in country_dict.items():
        for name in names:
            # setdefault: a name listed under several countries keeps the
            # colour of the first country that lists it.
            tip_colours.setdefault(name, country_colours[country])
    return country_colours, tip_colours


def make_tree(tree, country_dict, show_tip_labels=False):
    """Draw the phylogeny with tips coloured by country and save it.

    Side effects: updates global matplotlib rc settings, writes the figure to
    the module-level ``output_img`` path at 300 dpi, and shows the figure.

    Args:
        tree: baltic tree object, as returned by ``bt.loadNewick``.
        country_dict: dict mapping country -> collection of tip names. Colours
            are assigned to countries in dict order, cycling through a
            five-colour palette.
        show_tip_labels: when True, write a text label next to each tip.

    Raises:
        KeyError: if a plotted tip's name is not listed under any country.
    """
    mpl.rc('font', family='sans-serif')
    # NOTE(review): this sets the *serif* font list although the family chosen
    # above is sans-serif; left unchanged to keep the rendered figure the same.
    mpl.rc('font', serif='Helvetica')
    mpl.rc('text', usetex=False)
    mpl.rcParams.update({'font.size': 6})
    mpl.rcParams['font.weight'] = 'light'

    fig = plt.figure(figsize=(7.5, 10), facecolor='w')
    gs = gridspec.GridSpec(1, 1, wspace=0.0)
    ax = plt.subplot(gs[0], facecolor='w')

    ### MAP COLORS
    colors = ['#5778a4', '#e49444', '#d1615d', '#a87c9f', '#e7ca60']
    # Precompute the lookups once instead of rescanning every country list per tip.
    color_mapping, tip_colours = _tip_colour_tables(country_dict, colors)

    def color_func(k):
        try:
            return tip_colours[k.name]
        except KeyError:
            raise KeyError(
                f"Tip {k.name!r} is not assigned to any country in country_dict"
            ) from None

    ### DRAW TREE
    tree.drawTree()
    tree.plotTree(ax, colour='black', width=1, connection_type='baltic')
    tree.plotPoints(ax, colour=color_func, size=40, zorder=4)

    ### ADD LEAF TEXT (optional)
    if show_tip_labels:
        # Nudge labels slightly to the right of each tip.
        tree.addText(ax, x_attr=lambda k: k.x + 0.001, va='center', ha='left')

    ### SET MATPLOTLIB INFO
    ax.set_xticks([])
    ax.set_yticks([])
    ax.set_yticklabels([])
    for side in ('top', 'right', 'bottom', 'left'):
        ax.spines[side].set_visible(False)

    ### MAKE COUNTRY COLOR LEGEND
    legend_patches = [
        mpatches.Patch(color=color, label=country)
        for country, color in color_mapping.items()
    ]
    # Anchor the legend just outside the upper-right corner of the axes.
    ax.legend(handles=legend_patches, loc='upper left', bbox_to_anchor=(1, 1), frameon=False, fontsize=6)
    plt.tight_layout()
    plt.savefig(output_img, dpi=300)
    plt.show()
    
# Draw the tree with tips coloured by country; make_tree also saves the figure to `output_img`.
make_tree(tree,country_dict)