# Draft of clean phydiv code

In [2]:
import numpy as np
import pandas as pd

import itertools
import random
import toytree
import toyplot
from src.simpd.simpd import Simpd

ImportError: Error importing numpy: you should not try to import numpy from
        its source directory; please exit the numpy source tree, and relaunch
        your python interpreter from there.

## Built-in example data

In [10]:
# Build the mock metacommunity that the functions below fall back on when
# they are called without a tree and matrix.
# Intended setup: species pool = 50, each community has 30 sp, no phylogenetic structure, 20 communities
mock = Simpd(ntips = 50)

# the mock metacommunity tree
mock_tree = mock.sp_tree

# One-off generation code, kept for reference (presumably how the tree file
# and "mock_matrix.csv" were originally produced -- TODO confirm):
#newick_str = mock_tree.write()
#with open("mock_tree.nwk", "w") as f:
#    f.write(newick_str)
#mock_matrix = mock.simmat(sr = 30, pa = 0, nsites = 20, df = True, csv = "mock_matrix")

# community matrix, read back from the CSV on disk
# NOTE(review): the tree is rebuilt by Simpd on every run while the matrix is
# loaded from disk -- confirm the two stay in sync (same tip names/topology).
mock_matrix = pd.read_csv("mock_matrix.csv")

In [114]:
# show the repr of the mock species tree object (same object as mock_tree)
mock.sp_tree

<toytree.ToyTree at 0x16aae15b0>

## Plotting

Plot pruned trees by community

In [81]:
def plot_prune(tree = None, matrix = None, community = None):
    """
    Plot pruned trees for sample communities

    Parameters:
    ---
    tree: toytree tree (metacommunity phylogeny). If both tree and matrix are
        omitted, the notebook-level mock_tree / mock_matrix are used.
    matrix: site by species matrix in pandas format (one row per community,
        one column per species, 1 = species present)
    community: which communities to plot, given as matrix row positions.
        An int plots a single pruned tree; a list (or tuple/range) plots one
        pruned tree per entry; None or an empty list plots every community.

    Return:
    ---
    None. Draws the pruned tree(s) as a side effect.

    Raises:
    ---
    ValueError: only one of tree / matrix was supplied
    TypeError: community is not an int, a list-like of ints, or None
    """

    # default values: fall back to the mock data only when NOTHING was given;
    # supplying just one of the two inputs is an error
    if tree is None and matrix is None:
        tree = mock_tree #default if no tree given
        matrix = mock_matrix #default if no matrix given
    elif tree is None or matrix is None:
        raise ValueError("Input both metacommunity tree and site x species community matrix")

    # normalise `community` to a list of row positions
    # (bool is rejected explicitly because it is a subclass of int)
    single = isinstance(community, (int, np.integer)) and not isinstance(community, bool)
    if single:
        selected = [community]
    elif community is None:
        selected = list(range(len(matrix))) #all communities if none specified
    elif isinstance(community, (list, tuple, range)):
        selected = list(community) or list(range(len(matrix))) #empty list -> all
    else:
        raise TypeError("Communities for plotting should be given as a list of row indices")

    # prune the metacommunity tree down to the species present in each
    # requested community (only the requested rows are pruned)
    comm_trees = []
    for c in selected:
        row = matrix.iloc[c] #positional lookup, independent of the row labels
        present = row.index[row == 1].tolist()
        comm_trees.append(toytree.mod.prune(tree, *present))

    # plotting only specified communities (default: plot all)
    if single:
        comm_trees[0].draw()
    else:
        mtree = toytree.mtree(comm_trees)
        mtree.draw()
        # TODO: label each subplot with its community index

In [67]:
# NOTE(review): plot_comms is defined but not passed to the call below,
# which plots community 0 only.
plot_comms = [1, 2]
plot_prune(mock_tree, mock_matrix, 0)

Plot metacommunity tree with community tips highlighted

In [82]:
def plot_highlight(tree = None, matrix = None, community = None):
    """
    Plot metacommunity phylogeny with tips highlighted for species in a specified community

    Parameters:
    ---
    tree: toytree tree (metacommunity phylogeny). If both tree and matrix are
        omitted, the notebook-level mock_tree / mock_matrix are used.
    matrix: site by species matrix in pandas format (one row per community,
        one column per species, 1 = species present)
    community: the community to highlight, given as a single matrix row
        position (int). Required.

    Return:
    ---
    None. Draws the metacommunity phylogeny with highlighted tips as a side effect.

    Raises:
    ---
    ValueError: only one of tree / matrix was supplied
    TypeError: community is not an int
    """
    # default values: fall back to the mock data only when NOTHING was given;
    # supplying just one of the two inputs is an error
    if tree is None and matrix is None:
        tree = mock_tree #default if no tree given
        matrix = mock_matrix #default if no matrix given
    elif tree is None or matrix is None:
        raise ValueError("Input both metacommunity tree and site x species community matrix")

    # exactly one community is supported (bool is a subclass of int, so reject it explicitly)
    if isinstance(community, bool) or not isinstance(community, (int, np.integer)):
        raise TypeError("Specify community by matrix row index")

    # select only species present in the requested community
    row = matrix.iloc[community] #positional lookup, independent of the row labels
    present = row.index[row == 1].tolist()

    # create a mask for species in the community only
    comm_mask = tree.get_node_mask(*present)

    # plotting community
    tree.draw(node_mask=comm_mask, node_sizes=12)
    

In [83]:
# highlight the species of community 2 on the default mock tree
plot_highlight(community = 2)

Plot all community data on tree as heat map

In [118]:
def plot_all(tree = None, matrix = None):
    """
    Plot metacommunity phylogeny with heatmap for species presence across all communities

    Parameters:
    ---
    tree: toytree tree (metacommunity phylogeny). If both tree and matrix are
        omitted, the notebook-level mock_tree / mock_matrix are used.
    matrix: site by species matrix in pandas format (one row per community,
        one column per species)

    Return:
    ---
    None. Draws the metacommunity phylogeny next to a heatmap of species for
    all communities, colored by presence/absence or abundance.

    Raises:
    ---
    ValueError: only one of tree / matrix was supplied
    """

    # default values: fall back to the mock data only when NOTHING was given;
    # supplying just one of the two inputs is an error
    if tree is None and matrix is None:
        tree = mock_tree #default if no tree given
        matrix = mock_matrix #default if no matrix given
    elif tree is None or matrix is None:
        raise ValueError("Input both metacommunity tree and site x species community matrix")

    # make species matrix a numpy array, transposed so that rows are species
    # (to sit beside the tips of the vertical tree) and columns are communities
    # NOTE(review): heatmap rows follow the matrix column order; confirm this
    # matches the order in which the tree's tips are drawn.
    tmatrix = np.transpose(matrix.to_numpy())
    trows, tcolumns = tmatrix.shape
    domain_max = np.max(tmatrix) #upper end of the color scale, computed once

    # create a canvas
    canvas = toyplot.Canvas(width=540, height=900)

    # add tree
    axes = canvas.cartesian(bounds=(50, 150, 50, 850)) # xmin, xmax, ymin, ymax
    tree.draw(axes=axes, tip_labels=True, tip_labels_align=True)

    # add matrix
    table = canvas.table(
        rows= trows, #n species
        columns= tcolumns,  #n communities
        margin=1,
        bounds=(160, 490, 50, 850),
    )

    colormap = toyplot.color.brewer.map("RedPurple") #TODO: need to reverse color!

    # apply a color to each cell in the table
    for ridx in range(trows): #row index
        for cidx in range(tcolumns): #column index
            cell = table.cells.cell[ridx, cidx]
            cell.style = {
                "fill": colormap.colors(tmatrix[ridx, cidx], 0, domain_max),
            }

    # style the gaps between cells
    table.body.gaps.columns[:] = 3
    table.body.gaps.rows[:] = 3

    # hide axes coordinates
    axes.show = False

In [119]:
# heatmap of every community against the default mock tree
plot_all()

## Metrics

Faith's PD

In [108]:
def metric_fpd(tree = None, matrix = None, csv = None):
    """
    Calculate Faith's phylogenetic diversity (FPD or PD) for each community
    Here, abbreviated as FPD to reduce confusion with pandas as pd

    Parameters:
    ---
    tree: toytree tree (metacommunity phylogeny). If both tree and matrix are
        omitted, the notebook-level mock_tree / mock_matrix are used.
    matrix: site by species matrix in pandas format (one row per community,
        one column per species, 1 = species present)
    csv: optional file name stem; when a string is given the values are
        written to "<csv>.csv" instead of being returned

    Return:
    ---
    List of FPD values, one per community (matrix row order), or None when
    the values were written to csv.

    Raises:
    ---
    ValueError: only one of tree / matrix was supplied
    """

    # default values: fall back to the mock data only when NOTHING was given;
    # supplying just one of the two inputs is an error
    if tree is None and matrix is None:
        tree = mock_tree #default if no tree given
        matrix = mock_matrix #default if no matrix given
    elif tree is None or matrix is None:
        raise ValueError("Input both metacommunity tree and site x species community matrix")

    # select only species in communities (iterating rows directly keeps this
    # independent of the matrix's row labels)
    spp = [row.index[row == 1].tolist() for _, row in matrix.iterrows()]

    # prune the metacommunity tree to each community and sum its "dist" node data
    tree_fpd = []
    for present in spp:
        comm_tree = toytree.mod.prune(tree, *present)
        tree_fpd.append(comm_tree.get_node_data("dist").sum())

    # Option to write csv or return the values
    if type(csv) is str:
        pd.DataFrame(tree_fpd).to_csv(f"{csv}.csv", index = False) #write the csv if specified
    else:
        return tree_fpd

In [109]:
# Faith's PD for every community of the default mock data
metric_fpd()

[13.333333333333332,
 13.666666666666664,
 13.833333333333332,
 13.666666666666664,
 13.666666666666664,
 14.0,
 13.666666666666666,
 14.0,
 13.833333333333332,
 13.833333333333332,
 13.833333333333332,
 13.666666666666664,
 13.999999999999998,
 13.833333333333332,
 13.666666666666666,
 13.833333333333332,
 13.666666666666666,
 13.333333333333332,
 13.999999999999998,
 14.333333333333332]

Mean pairwise distance (MPD)

In [110]:
def metric_mpd(tree = None, matrix = None, csv = None):
    """
    Calculate mean phylogenetic distance (MPD) for each community

    MPD is the mean of the tree distances over every pair of species
    present in the community.

    Parameters:
    ---
    tree: toytree tree (metacommunity phylogeny). If both tree and matrix are
        omitted, the notebook-level mock_tree / mock_matrix are used.
    matrix: site by species matrix in pandas format (one row per community,
        one column per species, 1 = species present)
    csv: optional file name stem; when a string is given the values are
        written to "<csv>.csv" instead of being returned

    Return:
    ---
    List of MPD values, one per community (matrix row order), or None when
    the values were written to csv. Communities with fewer than two species
    have no pairs and get NaN.

    Raises:
    ---
    ValueError: only one of tree / matrix was supplied
    """

    # default values: fall back to the mock data only when NOTHING was given;
    # supplying just one of the two inputs is an error
    if tree is None and matrix is None:
        tree = mock_tree #default if no tree given
        matrix = mock_matrix #default if no matrix given
    elif tree is None or matrix is None:
        raise ValueError("Input both metacommunity tree and site x species community matrix")

    # select only species in communities (iterating rows directly keeps this
    # independent of the matrix's row labels)
    spp = [row.index[row == 1].tolist() for _, row in matrix.iterrows()]

    # calculate MPD for each community
    tree_mpd = []
    for present in spp:
        pair_dists = [
            tree.distance.get_node_distance(*pair)
            for pair in itertools.combinations(present, 2)
        ]
        if pair_dists:
            tree_mpd.append(sum(pair_dists) / len(pair_dists))
        else:
            tree_mpd.append(float("nan")) #undefined for < 2 species

    # Option to write csv or return the values
    if type(csv) is str:
        pd.DataFrame(tree_mpd).to_csv(f"{csv}.csv", index = False) #write the csv if specified
    else:
        return tree_mpd

In [111]:
# MPD for every community of the default mock data
metric_mpd()

[1.704214559386973,
 1.7057471264367816,
 1.6980842911877394,
 1.703448275862069,
 1.7065134099616857,
 1.7065134099616857,
 1.7049808429118773,
 1.7011494252873562,
 1.7049808429118773,
 1.6988505747126437,
 1.6842911877394635,
 1.6996168582375477,
 1.710344827586207,
 1.6988505747126437,
 1.7118773946360153,
 1.711111111111111,
 1.6973180076628351,
 1.6950191570881223,
 1.7026819923371648,
 1.7088122605363982]

Mean nearest taxon distance (MNTD)

In [112]:
def metric_mntd(tree = None, matrix = None, csv = None):
    """
    Calculate mean nearest taxon distance (MNTD) for each community

    For every species in a community the distance to its closest co-occurring
    species is taken; MNTD is the mean of those minima.

    Parameters:
    ---
    tree: toytree tree (metacommunity phylogeny). If both tree and matrix are
        omitted, the notebook-level mock_tree / mock_matrix are used.
    matrix: site by species matrix in pandas format (one row per community,
        one column per species, 1 = species present)
    csv: optional file name stem; when a string is given the values are
        written to "<csv>.csv" instead of being returned

    Return:
    ---
    List of MNTD values, one per community (matrix row order), or None when
    the values were written to csv. Communities with fewer than two species
    have no nearest taxon and get NaN.

    Raises:
    ---
    ValueError: only one of tree / matrix was supplied
    """

    # default values: fall back to the mock data only when NOTHING was given;
    # supplying just one of the two inputs is an error
    if tree is None and matrix is None:
        tree = mock_tree #default if no tree given
        matrix = mock_matrix #default if no matrix given
    elif tree is None or matrix is None:
        raise ValueError("Input both metacommunity tree and site x species community matrix")

    # select only species in communities (iterating rows directly keeps this
    # independent of the matrix's row labels)
    spp = [row.index[row == 1].tolist() for _, row in matrix.iterrows()]

    # get distance matrix of metacommunity tree
    meta_dm = tree.distance.get_tip_distance_matrix(df = True)

    # calculate MNTD for each community
    tree_mntd = []
    for present in spp:
        if len(present) < 2:
            tree_mntd.append(float("nan")) #undefined for < 2 species
            continue

        # remove absent species from the distance matrix
        comm_dm = meta_dm.loc[present, present]

        nearest = [] #nearest-taxon distance for each species
        for pos in range(comm_dm.shape[0]): #for each species (row)
            dists = comm_dm.iloc[pos].tolist() #select species row
            del dists[pos] #exclude same-species distance (the diagonal)
            nearest.append(min(dists))
        tree_mntd.append(sum(nearest) / len(nearest))

    # Option to write csv or return the values
    if type(csv) is str:
        pd.DataFrame(tree_mntd).to_csv(f"{csv}.csv", index = False) #write the csv if specified
    else:
        return tree_mntd

In [113]:
# MNTD for every community of the default mock data
metric_mntd()

[0.5222222222222223,
 0.5666666666666667,
 0.5888888888888888,
 0.5777777777777777,
 0.5555555555555555,
 0.5888888888888888,
 0.5444444444444444,
 0.611111111111111,
 0.6,
 0.5888888888888888,
 0.5888888888888888,
 0.5666666666666667,
 0.6,
 0.5666666666666667,
 0.5333333333333333,
 0.5777777777777777,
 0.5666666666666667,
 0.5222222222222223,
 0.611111111111111,
 0.6333333333333333]

Wrapper function that computes all metrics at once

In [None]:
def metric_all(tree = None, matrix = None, csv = None):
    """
    Calculate all phylogenetic diversity metrics for each community (Faith's PD, MPD, and MNTD)

    Parameters:
    ---
    tree: toytree tree (metacommunity phylogeny). If both tree and matrix are
        omitted, the notebook-level mock_tree / mock_matrix are used.
    matrix: site by species matrix in pandas format (one row per community,
        one column per species, 1 = species present)
    csv: optional file name stem; when a string is given the table is
        written to "<csv>.csv" instead of being returned

    Return:
    ---
    Pandas dataframe with one row per community (matrix row order) and the
    columns "fpd", "mpd" and "mntd", or None when the table was written to csv.

    Raises:
    ---
    ValueError: only one of tree / matrix was supplied
    """
    # default values: fall back to the mock data only when NOTHING was given;
    # supplying just one of the two inputs is an error
    if tree is None and matrix is None:
        tree = mock_tree #default if no tree given
        matrix = mock_matrix #default if no matrix given
    elif tree is None or matrix is None:
        raise ValueError("Input both metacommunity tree and site x species community matrix")

    # calculate all metrics and combine them column-wise into one dataframe
    metrics = pd.DataFrame({
        "fpd": metric_fpd(tree = tree, matrix = matrix),
        "mpd": metric_mpd(tree = tree, matrix = matrix),
        "mntd": metric_mntd(tree = tree, matrix = matrix),
    })

    # Option to write csv or return the table
    if type(csv) is str:
        metrics.to_csv(f"{csv}.csv", index = False) #write the csv if specified
    else:
        return metrics