# Anomaly zone MSC simulations

See notebook 1 for a step-by-step demonstration of the functions used below. This notebook focuses on running these functions over a range of parameter values and visualizing the results.

In [1]:
# conda install ipcoal toytree -c conda-forge
# conda install astral3 -c conda-forge -c eaton-lab
# conda install raxml-ng -c conda-forge -c bioconda

In [4]:
import shutil
import sys
from concurrent.futures import ProcessPoolExecutor
from pathlib import Path

import numpy as np
import ipcoal
import toytree

In [5]:
# report the versions of the two core libraries used in this notebook
for module in (toytree, ipcoal):
    print(module.__name__, module.__version__)

toytree 3.0.dev1
ipcoal 0.4.dev1


### Global simulation variables
These variables are used throughout the notebook. All relevant variables are set here.

In [4]:
# Full paths to the conda-installed phylogenetic inference programs.
# The original absolute paths are used whenever they exist, so an existing
# setup behaves exactly as before. On other machines the copies found in the
# active environment (<sys.prefix>/bin) or on PATH are used instead.
_ASTRAL_DEFAULT = Path("/home/deren/miniconda3/envs/ipyrad/bin/astral.5.7.1.jar")
_RAXML_NG_DEFAULT = Path("/home/deren/miniconda3/envs/ipyrad/bin/raxml-ng")
_ASTRAL_IN_ENV = Path(sys.prefix) / "bin" / _ASTRAL_DEFAULT.name

# the ASTRAL jar is not an executable, so it cannot be located with shutil.which
if _ASTRAL_DEFAULT.is_file() or not _ASTRAL_IN_ENV.is_file():
    ASTRAL = str(_ASTRAL_DEFAULT)
else:
    ASTRAL = str(_ASTRAL_IN_ENV)

# raxml-ng is a regular executable: fall back to whatever is on PATH
if _RAXML_NG_DEFAULT.is_file():
    RAXML_NG = str(_RAXML_NG_DEFAULT)
else:
    RAXML_NG = shutil.which("raxml-ng") or str(_RAXML_NG_DEFAULT)

In [28]:
# Output directory (relative to the current directory) for simulation results.
# It is created if missing, and CSV files left over from a previous run are removed.
WORKDIR = Path("./anomaly_data")
WORKDIR.mkdir(exist_ok=True)
for stale_csv in sorted(WORKDIR.glob("*.csv")):
    stale_csv.unlink()

In [6]:
# Number of worker processes handed to ProcessPoolExecutor(max_workers=...)
# by the parallel runner functions; scale this to the cores on your system.
NCORES = 50

In [7]:
# Coalescent simulation parameters shared by every analysis below.
NREPS = 100                                                            # replicate simulations per parameter combination
SEED = 123                                                             # seed for the RNG that draws per-replicate seeds
POPSIZE = 1e4                                                          # constant Ne across the species tree
NSITES = 2000                                                          # locus length (sites per locus)
NLOCI = np.array([100, 250, 500, 1000, 2500, 5000, 10000, 20000])      # numbers of independent loci to test
GTIMES = np.array([50, 100, 200, 300, 400, 500, 750, 1000]) * 1000     # species tree root heights (generations) to test
# NOTE(review): times are divided by 4*Ne here; confirm this is the intended
# coalescent-unit scaling (generations / 2*Ne is also commonly used for diploids).
CTIMES = GTIMES / (4 * POPSIZE)                                        # GTIMES rescaled by 4 * POPSIZE
MUT = 5e-8                                                             # mutation rate (/site/gen)
RECOMB = 5e-9                                                          # recombination rate (/site/gen)
IMBTREE = toytree.rtree.imbtree(ntips=5)                               # imbalanced 5-tip species tree topology
# same topology with the "height" feature of nodes 5-8 set to relative values;
# it is rescaled to an absolute root height by `get_scaled_sptree`
RELTREE = IMBTREE.set_node_data("height", {5: 0.050, 6: 0.055, 7: 0.060, 8: 1})

In [8]:
# population-scaled parameter settings, printed one per line as "label = value"
scaled_params = {
    "theta": 4 * POPSIZE * MUT,
    "rho": 4 * POPSIZE * RECOMB,
    "short edge len in coal units": list(CTIMES * 0.05),
}
for label, value in scaled_params.items():
    print(f"{label} = {value}")

theta = 0.002
rho = 0.0002
short edge len in coal units = [0.0625, 0.125, 0.25, 0.375, 0.5, 0.625, 0.9375, 1.25]


In [9]:
def get_scaled_sptree(tree_height: int) -> toytree.ToyTree:
    """Return the relative-height species tree rescaled to a new root height.

    The global `RELTREE` is passed through `mod.edges_scale_to_root_height`
    with `tree_height` as the target, and the resulting tree is returned.
    """
    return RELTREE.mod.edges_scale_to_root_height(tree_height)

In [10]:
def get_astral_ge_accuracy(tree_height: int, nloci: int) -> float:
    """Return the proportion of replicates in which ASTRAL recovers the species tree.

    This function analyses true simulated genealogies (not inferred gene trees).
    The Ne parameter is held constant using the global POPSIZE, while the edge
    lengths in units of generations depend on `tree_height`, so the ratio of
    edge length (generations) / Ne is varied by changing `tree_height`.

    NREPS replicates are run. Each replicate's seed is drawn from a generator
    seeded with the global SEED, so every call reuses the same sequence of
    replicate seeds.

    Parameters
    ----------
    tree_height: int
        Root height (generations) that the species tree is scaled to.
    nloci: int
        Number of independent genealogies simulated per replicate.

    Returns
    -------
    float
        Fraction of the NREPS replicates whose ASTRAL tree has a normalized
        RF distance of 0 to the global IMBTREE.

    Side effects
    ------------
    The `ipcoal.Model.df` DataFrame of every replicate is written to WORKDIR
    as `th{height}-nloci{nloci}-rep{idx}-ast{0|1}-sim_trees.csv`.
    """
    rng = np.random.default_rng(SEED)

    # species tree with relative edge lengths rescaled to the requested height
    tmp_sptree = get_scaled_sptree(tree_height)

    # iterate over replicates
    correct = np.zeros(NREPS, dtype=bool)
    for idx in range(NREPS):
        rseed = rng.integers(1e12)
        model = ipcoal.Model(tmp_sptree, Ne=POPSIZE, seed_trees=rseed)
        model.sim_trees(nloci=nloci, nsites=1)

        # load trees into a MultiTree and pass to ipcoal astral tool
        mtree = toytree.mtree(model.df.genealogy)
        ast_tree = ipcoal.phylo.infer_astral_tree(mtree, binary_path=ASTRAL)

        # a replicate is correct when the inferred topology matches IMBTREE
        correct[idx] = ast_tree.distance.get_treedist_rf(IMBTREE, normalize=True) == 0

        # Save the distribution of trees to a file. The replicate index is part
        # of the name so that replicates do not overwrite one another.
        name = f"th{int(tree_height)}-nloci{int(nloci)}-rep{idx}-ast{int(correct[idx])}"
        model.df.to_csv(WORKDIR / f"{name}-sim_trees.csv")
    return correct.mean()

In [11]:
def run_astral_ge_accuracy_parallel() -> np.ndarray:
    """Run `get_astral_ge_accuracy` over the full parameter grid in parallel.

    One job is submitted for every combination of the globals `GTIMES` (rows)
    and `NLOCI` (columns), using up to `NCORES` worker processes. Each job
    also writes its simulated DataFrames to the `WORKDIR` directory. This
    will likely take a long time to run.

    Returns
    -------
    np.ndarray
        Array of shape (GTIMES.size, NLOCI.size) holding the accuracy
        returned by each job.
    """
    # submit one job per (tree height, number of loci) cell of the grid
    jobs = {}
    with ProcessPoolExecutor(max_workers=NCORES) as executor:
        for row, col in np.ndindex(GTIMES.size, NLOCI.size):
            jobs[(row, col)] = executor.submit(
                get_astral_ge_accuracy, GTIMES[row], NLOCI[col]
            )

    # the executor has finished all jobs on exit; collect results into a grid
    accuracy = np.zeros(shape=(GTIMES.size, NLOCI.size))
    for (row, col), job in jobs.items():
        accuracy[row, col] = job.result()
    return accuracy

In [18]:
def get_astral_gt_accuracy(tree_height: int, nloci: int, locus_len: int, recomb: float) -> tuple[float, float]:
    """Return the accuracy of ASTRAL and of concatenation on simulated loci.

    The Ne parameter is held constant using the global POPSIZE, while the edge
    lengths in units of generations depend on `tree_height`, so the ratio of
    edge length / Ne can be varied here to see its effect.

    NREPS replicates are run. Each replicate's seed is drawn from a generator
    seeded with the global SEED, so every call reuses the same sequence of
    replicate seeds.

    Parameters
    ----------
    tree_height: int
        Root height (generations) that the species tree is scaled to.
    nloci: int
        Number of loci simulated per replicate.
    locus_len: int
        Number of sites per simulated locus.
    recomb: float
        Recombination rate passed to `ipcoal.Model`.

    Returns
    -------
    tuple[float, float]
        (ASTRAL accuracy, concatenation accuracy): the fraction of replicates
        whose inferred tree has a normalized RF distance of 0 to IMBTREE.

    Side effects
    ------------
    For every replicate, `model.df` and the RAxML-NG gene tree table are
    written to WORKDIR as CSV files.
    """
    rng = np.random.default_rng(SEED)

    # species tree with relative edge lengths rescaled to the requested height
    tmp_sptree = get_scaled_sptree(tree_height)

    # iterate over replicates
    correct = np.zeros(NREPS, dtype=bool)
    correct_concat = np.zeros(NREPS, dtype=bool)
    for idx in range(NREPS):
        rseed = rng.integers(1e12)
        model = ipcoal.Model(tmp_sptree, Ne=POPSIZE, seed_trees=rseed, mut=MUT, recomb=recomb)
        model.sim_loci(nloci=nloci, nsites=locus_len)

        # infer gene trees, load them into a MultiTree, and pass to the astral tool
        rax_data = ipcoal.phylo.infer_raxml_ng_trees(model, ncores=1, nthreads=1)
        mtree = toytree.mtree(rax_data.gene_tree)
        ast_tree = ipcoal.phylo.infer_astral_tree(mtree, binary_path=ASTRAL)

        # does the ASTRAL tree match the true topology?
        correct[idx] = ast_tree.distance.get_treedist_rf(IMBTREE, normalize=True) == 0

        # infer concatenation tree
        concat_tree = ipcoal.phylo.infer_raxml_ng_tree(model, nthreads=1)

        # does the concatenation tree match the true topology?
        correct_concat[idx] = concat_tree.distance.get_treedist_rf(IMBTREE, normalize=True) == 0

        # Save genealogies and gene trees. The recombination rate is written
        # with the general format: int(recomb) would truncate small rates
        # such as 5e-9 to 0, making runs with and without recombination
        # overwrite each other's files.
        name = (
            f"th{int(tree_height)}-nloci{int(nloci)}-recomb{recomb:g}-rep{idx}"
            f"-ast{int(correct[idx])}-concat{int(correct_concat[idx])}"
        )
        model.df.to_csv(WORKDIR / f"{name}-sim_loci.csv")
        rax_data.to_csv(WORKDIR / f"{name}-raxml_trees.csv")
    return correct.mean(), correct_concat.mean()

In [19]:
def run_astral_gt_accuracy_parallel(
    locus_len: int,
    recomb: float,
    return_concat: bool = False,
) -> "np.ndarray | tuple[np.ndarray, np.ndarray]":
    """Run `get_astral_gt_accuracy` over the full parameter grid in parallel.

    One job is submitted for every combination of the globals `GTIMES` (rows)
    and `NLOCI` (columns), using up to `NCORES` worker processes.

    Parameters
    ----------
    locus_len: int
        Number of sites per simulated locus, forwarded to every job.
    recomb: float
        Recombination rate, forwarded to every job.
    return_concat: bool
        If True, also return the grid of concatenation accuracies.

    Returns
    -------
    np.ndarray
        Array of shape (GTIMES.size, NLOCI.size) with the ASTRAL accuracy of
        each job. If `return_concat` is True, a tuple of (ASTRAL accuracies,
        concatenation accuracies) is returned instead.
    """
    rasyncs = {}
    with ProcessPoolExecutor(max_workers=NCORES) as pool:
        for tidx, time in enumerate(GTIMES):
            for nidx, nloci in enumerate(NLOCI):
                rasyncs[(tidx, nidx)] = pool.submit(
                    get_astral_gt_accuracy, time, nloci, locus_len, recomb
                )

    # Fill the proportion correct into two 2-D grids. The concatenation result
    # goes into its own array so it does not overwrite the ASTRAL result.
    arr = np.zeros(shape=(GTIMES.size, NLOCI.size))
    arr_concat = np.zeros(shape=(GTIMES.size, NLOCI.size))
    for (tidx, nidx), future in rasyncs.items():
        acc, acc_concat = future.result()
        arr[tidx, nidx] = acc
        arr_concat[tidx, nidx] = acc_concat
    if return_concat:
        return arr, arr_concat
    return arr

### Run analyses

In [21]:
# accuracy grid (GTIMES x NLOCI) for analyses of true simulated genealogies
arr_ge = run_astral_ge_accuracy_parallel()

In [22]:
# accuracy grid for gene-tree-based analyses of NSITES-long loci, no recombination
arr_gt = run_astral_gt_accuracy_parallel(locus_len=NSITES, recomb=0)

In [23]:
# accuracy grid for gene-tree-based analyses of NSITES-long loci with recombination
arr_gtr = run_astral_gt_accuracy_parallel(locus_len=NSITES, recomb=RECOMB)

### Visualize results

In [24]:
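# TODO: `plot_accuracy`, `gt_500`, and `gt_500r` are not defined in the cells above;
# the results computed above are `arr_ge`, `arr_gt`, and `arr_gtr` (the latter two use 2000 bp loci).
# plot_accuracy(CTIMES, gt_500, label=f"ASTRAL accuracy on 1000 inferred gene trees (500bp)");
# plot_accuracy(CTIMES, gt_500r, label=f"ASTRAL accuracy on 1000 inferred gene trees (500bp) w/ recomb");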