### Distribution of waiting times in SMC under a species tree model

In [37]:
import toytree
import toyplot
import ipcoal
import pandas as pd
import numpy as np

In [38]:
from ipcoal.smc.smc3 import (
    get_gene_tree_coal_intervals, 
    get_prob_gene_tree_is_unchanged_by_recomb_event,
    get_prob_gene_tree_is_unchanged_by_recomb_on_edge,
    get_prob_gene_tree_is_unchanged,
    get_expected_dist_until_gene_tree_changes,
)

### Define a species tree and sample a single genealogy

In [39]:
# species tree with 6 tips and tree height 1e6, generated with a fixed seed
SPTREE = toytree.rtree.unittree(ntips=6, treeheight=1e6, seed=123)
# "Ne" is set to 5e4 on nodes 0, 1 and 8, and to 1e5 on every other node
SPTREE = SPTREE.set_node_data("Ne", {i: 5e4 for i in (0, 1, 8)}, default=1e5)
# alternative kept for reference: one constant Ne on all nodes
#SPTREE = SPTREE.set_node_data("Ne", default=2.5e5)
# seeded simulation model built on the species tree
MODEL = ipcoal.Model(SPTREE, seed_trees=123)
# NOTE(review): presumably simulates 1 locus of 1 site — confirm against ipcoal
MODEL.sim_trees(1, 1)
# parse the first genealogy in the results table into a tree object
GTREE = toytree.tree(MODEL.df.genealogy[0])

In [40]:
def draw_trees(species_tree, gene_tree):
    """Return a side-by-side drawing of a species tree and a gene tree.

    Both trees are drawn on shared axes with a scale bar. Dashed
    horizontal lines are added to each panel at every unique node height
    of the species tree, marking the interval boundaries of interest.

    Parameters
    ----------
    species_tree:
        Tree drawn in the panel labeled "SPTREE". Its tip labels fix the
        tip order of both panels and its node heights position the lines.
    gene_tree:
        Tree drawn in the panel labeled "GTREE".

    Returns
    -------
    tuple
        The three values returned by the multitree ``draw`` call
        (canvas, axes, and a third value returned unchanged), after the
        axes have been labeled and annotated.
    """
    # draw both trees; tip order comes from the species tree that was
    # passed in, not from a notebook-level global, so the function works
    # for any pair of trees
    mtre = toytree.mtree([species_tree, gene_tree])
    canvas, axes, marks = mtre.draw(
        ts='p',
        shared_axes=True,
        scale_bar=True,
        fixed_order=species_tree.get_tip_labels(),
        node_labels="idx",
        node_labels_style={"baseline-shift": "10px", "font-size": "11px"},
        node_sizes=6,
        height=325, width=500,
    )

    axes[0].label.text = "SPTREE"
    axes[1].label.text = "GTREE"

    # the same species-tree node heights are marked on every panel
    for ax in axes:
        ax.hlines(
            species_tree.get_node_data("height").unique(),
            style={
                "stroke": toytree.COLORS1[1],
                "stroke-width": 2,
                "stroke-dasharray": "2,4"},
        )
    return canvas, axes, marks

### Get the coalescent intervals along a specified gene tree edge (by node idx)

In [45]:
-np.exp(709)

-8.218407461554972e+307

In [47]:
0.1/ 1e6

1.0000000000000001e-07

In [41]:
get_gene_tree_coal_intervals(SPTREE, GTREE, 2)

Unnamed: 0,start,stop,neff,edges,st_node,event,dist
0,0.0,250000.0,100000.0,1,2->6,ILS,250000.0
1,250000.0,500000.0,100000.0,2,6->7,ILS,250000.0
2,500000.0,696076.852632,100000.0,3,7->7,COAL-g7,196076.852632


In [28]:
%%timeit
SPTREE.set_node_data("ncoals", default=0)

282 µs ± 5.38 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [30]:
%%timeit
gene_tree = GTREE.set_node_data(
    feature="age",
    mapping=GTREE.get_node_data("height").to_dict()
)

723 µs ± 13.8 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [32]:
%%timeit
ncoals = {}
for node in SPTREE.treenode.traverse("postorder"):
    if not node.is_leaf():
        pass#st_tips = node.get_leaf_names()
        

9.38 µs ± 23.8 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


In [51]:
-np.exp(-1e7)

-0.0

In [16]:
draw_trees(SPTREE, GTREE);

### Get the prob. the gene tree does *not* change given recomb. at a specific (edge, time)

In [17]:
get_prob_gene_tree_is_unchanged_by_recomb_event(SPTREE, GTREE, (3, 500_000))

0.9572377923299176

In [18]:
def plot_edge_recomb_probs(species_tree, gene_tree, idx, **kwargs):
    """Plot the prob. the gene tree is unchanged against recomb. time on an edge.

    The edge above gene tree node ``idx`` is sampled at 50 evenly spaced
    times from the node's height to its parent's height, and the
    probability that a recombination event at each time leaves the gene
    tree unchanged is plotted against that time.

    Parameters
    ----------
    species_tree, gene_tree:
        Trees forwarded to the probability function.
    idx:
        Index of the gene tree node below the edge of interest.
    **kwargs:
        Plot options that override or extend the defaults set below.

    Returns
    -------
    The value returned by ``toyplot.plot``.
    """
    # evenly spaced recombination times spanning the selected edge
    edge = gene_tree.idx_dict[idx]
    times = np.linspace(edge.height, edge.up.height, 50)

    # prob. that the gene tree is unchanged for each sampled time
    unchanged = [
        get_prob_gene_tree_is_unchanged_by_recomb_event(
            species_tree, gene_tree, (idx, when)
        )
        for when in times
    ]

    # default styling; caller-supplied kwargs take precedence
    defaults = {
        'width': 350,
        'height': 300,
        'xlabel': "position of recomb. on edge",
        'ylabel': "prob. gene tree is unchanged.",
        'style': {"stroke-width": 3},
        'ymin': 0,
    }
    options = {**defaults, **kwargs}

    return toyplot.plot(times, unchanged, **options)

In [19]:
plot_edge_recomb_probs(SPTREE, GTREE, 0);

In [60]:
from decimal import Decimal

In [69]:
a = Decimal(700.000)
b = Decimal(800.322)

In [70]:
np.exp(b) - np.exp(a)

Decimal('3.762082765615092907558784684498222459674255591664278458601422654430489014422676468461548300523140101E+347')

In [71]:
import decimal
decimal.getcontext().prec = 100

In [75]:
np.exp(np.array([900], dtype=np.float128))

array([7.32881422e+390], dtype=float128)

In [79]:
np.exp(np.float128(9000))

4.4703047331654426408e+3908

In [None]:
WARNING | smc_pat.py   | **** 0-1
first_term=500000.0
second_term_before=30793.83379790111
first_expr=2.4903429574618414
second_expr=-83808.08494617493
branch_sum=291289.1258759295
--------------

WARNING | smc_pat.py   | **** 1-1
first_term=72909.02135234904
second_term_before=0
first_expr=176.7816208899733
second_expr=-202.6625475142001
branch_sum=37082.00770909751
--------------

### Get prob gene tree is unchanged given recomb on edge

In [10]:
# integration over the unif prob of recomb anywhere on edge
get_prob_gene_tree_is_unchanged_by_recomb_on_edge(SPTREE, GTREE, 3)

0.7394815308146538

### Get prob gene tree is unchanged given recomb on *any edge*

In [11]:
get_prob_gene_tree_is_unchanged(SPTREE, GTREE)

0.6350486982672543

### Get expected (spatial) waiting time until gene tree changes 

In [12]:
get_expected_dist_until_gene_tree_changes(SPTREE, GTREE, 1e-8)

49.912035634775854

### Compare expected distance to simulations

In [34]:
# model uses the variable Ne values assigned to SPTREE nodes
model = ipcoal.Model(SPTREE, recomb=1e-9, seed_trees=123)
model.sim_loci(nloci=1, nsites=1e7)

In [35]:
# length spanned by each simulated genealogy, taken from the 'nbps' column
observed = model.df.nbps

In [36]:
# distribute computation of 'expected' in parallel: one task per row of model.df
from concurrent.futures import ProcessPoolExecutor
with ProcessPoolExecutor(max_workers=6) as pool:
    # maps each row index of model.df to the future computing its value
    rasyncs = {}
    for idx in model.df.index:
        gtree = toytree.tree(model.df.genealogy[idx])
        # NOTE(review): 1e-9 equals the recomb= value used to build `model`;
        # presumably the third argument is the recombination rate — confirm
        args = (SPTREE, gtree, 1e-9)
        rasyncs[idx] = pool.submit(get_expected_dist_until_gene_tree_changes, *args)

# collect results in submission order (dicts preserve insertion order);
# leaving the `with` block above has already waited for all tasks to finish
expected_waiting_times = np.array([rasyncs[idx].result() for idx in rasyncs])

In [53]:
# simulation-based expectations
observed.mean(), observed.std()

NameError: name 'observed' is not defined

In [81]:
import scipy.stats

In [89]:
scipy.stats.expon(657)

<scipy.stats._distn_infrastructure.rv_frozen at 0x7f1899c710d0>

In [93]:
# draw one random length per expected waiting time, using each expected
# value as the scale (mean) of an exponential distribution
# NOTE(review): the draw is unseeded, so the summary below varies between runs
expected_len_dists = np.random.exponential(expected_waiting_times)
expected_len_dists.mean(), expected_len_dists.std()

NameError: name 'expected_waiting_times' is not defined

In [60]:
# draw one random length per expected waiting time, using each expected
# value as the scale (mean) of an exponential distribution
# NOTE(review): the draw is unseeded, so the summary below varies between runs
expected_len_dists = np.random.exponential(expected_waiting_times)
expected_len_dists.mean(), expected_len_dists.std()

(525.697274589227, 537.201712529203)

In [122]:
sorted(SPTREE.get_node_data("height")[SPTREE.ntips:] / (2 * 1e5))

[1.25, 2.5, 3.75, 3.75, 5.0]

In [121]:
toytree.rtree.bdtree(10).get_node_data("height").sort_values()

9     0.000000e+00
8     0.000000e+00
6     0.000000e+00
5     0.000000e+00
4     0.000000e+00
0     4.440892e-16
2     4.440892e-16
1     4.440892e-16
3     4.440892e-16
7     8.881784e-16
10    1.350746e-02
11    3.906704e-02
12    6.948640e-01
13    1.145003e+00
14    1.526557e+00
17    1.651152e+00
15    1.836662e+00
16    2.067886e+00
18    2.458722e+00
Name: height, dtype: float64

In [123]:
from math import comb

In [125]:
comb(4, 2)

6

In [106]:
%%timeit
comb(10, 3)

13 µs ± 1.14 µs per loop (mean ± std. dev. of 7 runs, 100000 loops each)


In [103]:
from scipy.special import comb

In [105]:
%%timeit
comb(10, 3)

12.2 µs ± 27.2 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


In [95]:
# two side-by-side panels with the same x-axis maximum (3000) so the
# simulated and expected length distributions can be compared directly
canvas = toyplot.Canvas(width=600, height=300)
ax0 = canvas.cartesian(grid=(1, 2, 0), xmax=3000, label="simulations")
ax1 = canvas.cartesian(grid=(1, 2, 1), xmax=3000, label="expected")
# both histograms use the same 20 evenly spaced bin edges from 0 to 3000
ax0.bars(np.histogram(observed, bins=np.linspace(0, 3000, 20)));
ax1.bars(np.histogram(expected_len_dists, bins=np.linspace(0, 3000, 20)));

NameError: name 'observed' is not defined