## Generation-time branch attraction

This notebook focuses on Ne effects

In [1]:
import toytree
import toyplot, toyplot.svg
import ipcoal
import numpy as np
import ipyrad.analysis as ipa

### Simulation scenario with deep divergences (crown 24 Ma)

In [3]:
# get a 12-tip tree from toytree's `baltree` generator (balanced, not
# imbalanced) with the root height set to 24e6
tree = toytree.rtree.baltree(12, treeheight=24e6)
tree.draw(ts='p');

In [4]:
# store the tip-label order of the tree; topo_dist() uses this list to give
# every tip a fixed row/column position in its distance matrix.
# NOTE(review): the order is whatever get_tip_labels() returns -- confirm it
# is the alphanumeric order intended.
NAMEORDER = tree.get_tip_labels()

### Species tree in units of generations
To set up a simulation on this tree we need branch lengths to be in units of generations. If we assume that 1 generation = 1 year then nothing has to be done. Here we assume that generation times are 1 year for half of the taxa on the tree, but 1 generation = 10 years for several other taxa. The tree below shows what this looks like in terms of the transformed branch lengths. 

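Here the edge lengths of the species tree, expressed in coalescent units, are the same whether the variation is modeled as a difference in Ne (first cell below) or as a difference in generation time (second cell below).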

In [10]:
# set Ne on nodes: 100e6 on the listed node indices, 10e6 on all others
ntree = tree.set_node_values(
    feature="Ne", 
    values={i: 100e6 for i in (0,1,2,12,16, 9,10,11,15,19)},
    default=10e6,
)

# show tc (coalescent time units) for all node dists: dist / (2 * Ne)
print("tc:", ntree.get_node_values("dist", 1, 1) / (2 * ntree.get_node_values("Ne", 1, 1)))

# draw the tree with node markers and node labels hidden
ntree.draw(ts='p', width=400, node_sizes=0, node_labels=False, edge_type='p');

tc: [0.3  0.3  0.3  0.03 0.3  0.3  0.03 0.03 0.3  0.3  0.03 0.06 0.03 0.03
 0.6  0.3  0.3  0.6  0.3  0.3  0.06 0.03 0.03]


In [14]:
# set generation time (g) on nodes: 10 on the listed node indices, 1 elsewhere
gtree = tree.set_node_values(
    feature="g", 
    values={i: 10 for i in (0,1,2,12,16, 9,10,11,15,19)},
    default=1,
)

# rescale every node's dist by its own g (dist / g), i.e. convert the edge
# lengths from time units into generations
gtree = gtree.set_node_values(
    feature="dist", 
    values={i: j.dist / j.g for (i, j) in gtree.idx_dict.items()}
)

# show tc (coalescent time units) for all rescaled dists at a constant Ne=10e6
print("tc:", gtree.get_node_values("dist", 1, 1) / (2 * 10e6))

# draw the tree with the rescaled edge lengths (node markers/labels hidden)
gtree.draw(ts='p', width=400, node_sizes=0, node_labels=False, edge_type='p');

tc: [0.3  0.3  0.3  0.03 0.3  0.3  0.03 0.03 0.3  0.3  0.03 0.06 0.03 0.03
 0.6  0.3  0.3  0.6  0.3  0.3  0.06 0.03 0.03]


### Example sequential genealogies

In [16]:
# simulate 5000 unlinked genealogies on the generation-scaled tree (gtree)
# with a constant Ne of 10e6 and a fixed seed
u_model = ipcoal.Model(gtree, Ne=10e6, seed=12345)
u_model.sim_trees(5000)
# load the genealogy column of the model's dataframe as a multitree
u_trees = toytree.mtree(u_model.df.genealogy)
u_trees.draw(ts='n', layout='d', height=225, width=900);
print(len(u_trees))

5000


In [17]:
# simulate linked genealogies along a single locus of 50000 sites, using the
# same tree, Ne and seed as the unlinked model
l_model = ipcoal.Model(gtree, Ne=10e6, seed=12345)
l_model.sim_trees(nloci=1, nsites=50000)
# load the genealogy column of the model's dataframe as a multitree
l_trees = toytree.mtree(l_model.df.genealogy)
l_trees.draw(ts='n', layout='d', height=225, width=900);
print(len(l_trees))

17714


### Are branches attracted by generation times on true genealogies?

- current approach is to measure topological pairwise dist between all nodes. 

- alternative could be to measure avg pairwise dist from each node to all other nodes...

In [18]:
def topo_dist(tree, labels=None):
    """
    Return a square array of topological distances between every pair
    of tips on the unrooted version of `tree`.

    Rows and columns are positioned by each tip name's index in
    `labels` rather than by the tree's own tip indices, so arrays from
    different trees with the same tip names are directly comparable.
    Diagonal cells are left at 0.

    Parameters
    ----------
    tree:
        A tree object providing `.unroot()`. The unrooted tree must
        support `len()`, `.idx_dict` and `.treenode.get_distance()`.
        Assumes (as the indexing here requires) that `len()` is the
        number of tips and that idx_dict keys 0..len-1 are the tips.
    labels: list of str, optional
        Tip names in the desired row/column order. Defaults to the
        module-level NAMEORDER.

    Returns
    -------
    numpy.ndarray of float, shape (len(tree), len(tree)).
    """
    if labels is None:
        labels = NAMEORDER

    utree = tree.unroot()
    ntips = len(utree)
    arr = np.zeros((ntips, ntips), dtype=float)

    # look up each tip node and its output position once, instead of
    # repeating both lookups for every pair inside the double loop
    tips = [utree.idx_dict[idx] for idx in range(ntips)]
    order = [labels.index(tip.name) for tip in tips]

    # fill every ordered pair of distinct tips
    for idx1, node1 in enumerate(tips):
        for idx2, node2 in enumerate(tips):
            if idx1 != idx2:
                arr[order[idx1], order[idx2]] = utree.treenode.get_distance(
                    node1, node2, topology_only=True,
                )
    return arr

In [19]:
def sequential_dist(trees):
    """
    Return the absolute change in tip-to-tip topological distances
    between each pair of consecutive trees.

    Parameters
    ----------
    trees: iterable of tree objects accepted by topo_dist(), in the
        order in which they should be compared.

    Returns
    -------
    numpy.ndarray stacking one |topo_dist(prev) - topo_dist(curr)|
    matrix per consecutive pair (one fewer than the number of trees;
    an empty array when fewer than two trees are given).
    """
    diffs = []
    prev = None
    for tree in trees:
        # compute the matrix once per tree and carry it forward as the
        # "previous" matrix, rather than calling topo_dist twice
        curr = topo_dist(tree)
        if prev is not None:
            diffs.append(abs(prev - curr))
        prev = curr
    return np.array(diffs)

In [20]:
def distance_dist(df, dist=100, nsamples=1000):
    """
    Return the absolute change in tip-to-tip topological distances
    between pairs of genealogies located `dist` positions apart.

    Parameters
    ----------
    df: dataframe with `start`, `end` and `genealogy` columns, one row
        per genealogy interval. Rows are presumably in positional
        order, since the first row with `end >= position` is taken as
        the genealogy at that position.
    dist: offset between the two sampled positions.
    nsamples: number of random position pairs to draw.

    Returns
    -------
    numpy.ndarray stacking one |topo_dist(t1) - topo_dist(t2)| matrix
    per sampled pair.

    Notes
    -----
    Positions are drawn from numpy's global random state.
    """
    # loop-invariant lookups, hoisted out of the sampling loop
    ends = df.end
    last_end = ends.max()

    # draw `nsamples` positions uniformly between the first and last
    # interval start
    positions = np.random.uniform(df.start.min(), df.start.max(), nsamples)

    arrs = []
    for pos in positions:

        # genealogy of the first interval ending at or after `pos`
        t1 = df.loc[ends >= pos, "genealogy"].iloc[0]

        # partner position: `dist` further along, or `dist` back when
        # stepping forward would reach the largest interval end
        pos2 = pos + dist if pos + dist < last_end else pos - dist
        t2 = df.loc[ends >= pos2, "genealogy"].iloc[0]

        # difference in topological distances between the two trees
        diff = abs(topo_dist(toytree.tree(t1)) - topo_dist(toytree.tree(t2)))
        arrs.append(diff)
    return np.array(arrs)

In [21]:
def plot_matrix(dat):
    """
    Draw a matrix as a grid of colored cells with a colorbar.

    Off-diagonal cells are filled from a linear colormap whose domain
    is the min and max of the off-diagonal values (NaN ignored), so the
    diagonal never influences the color scale. Diagonal cells are
    drawn grey. The input array is not modified.

    Parameters
    ----------
    dat: 2-D numpy.ndarray of values to plot.

    Returns
    -------
    (canvas, table): the toyplot Canvas and the table axes drawn on it.
    """
    nrows, ncols = dat.shape

    # take the colormap domain from the off-diagonal cells only
    offdiag = dat[~np.eye(nrows, ncols, dtype=bool)]
    cmap = toyplot.color.LinearMap(
        domain_min=np.nanmin(offdiag),
        domain_max=np.nanmax(offdiag),
    )
    canvas = toyplot.Canvas(320, 300)
    table = canvas.table(rows=nrows, columns=ncols, bounds=(50, 250, 50, 250))

    # apply colors to cells; the diagonal gets a fixed grey fill
    for ridx in range(nrows):
        for cidx in range(ncols):
            if ridx == cidx:
                fill = 'grey'
            else:
                fill = cmap.color(dat[ridx, cidx])
            table.cells.cell[ridx, cidx].style = {'fill': fill, 'stroke': 'none'}

    # style spacing between grid cells
    table.body.gaps.columns[...] = 1
    table.body.gaps.rows[...] = 1

    # add a colorbar to the canvas, to the right of the grid
    numberline = canvas.numberline(260, 250, 260, 50)
    numberline.colormap(cmap, style={"stroke-width":5}, offset=-10)
    numberline.axis.ticks.locator = toyplot.locator.Extended(only_inside=True)
    return canvas, table

In [22]:
# plot the per-cell variance, across consecutive pairs of UNLINKED trees,
# of the change in tip-to-tip topological distance
unlinked_var = sequential_dist(u_trees.treelist).var(axis=0)
plot_matrix(unlinked_var);

In [23]:
# plot the variance in differences between LINKED trees sampled 50 positions
# apart (10000 random pairs)
vdist50 = np.var(distance_dist(l_model.df, 50, 10000), axis=0)
plot_matrix(vdist50);

In [24]:
# plot the variance in differences between LINKED trees sampled 20 positions
# apart (10000 random pairs)
vdist20 = np.var(distance_dist(l_model.df, 20, 10000), axis=0)
plot_matrix(vdist20);

In [None]:
# plot the variance in differences between consecutive LINKED trees
# (l_trees are the genealogies simulated along the single linked locus)
linked_var = sequential_dist(l_trees.treelist).var(axis=0)
plot_matrix(linked_var);

In [None]:
# plot the proportional reduction in variance of linked trees (20 positions
# apart) relative to unlinked trees: 1 - (linked / unlinked).
# NOTE(review): topo_dist leaves the diagonal at 0, so both variance matrices
# presumably have a zero diagonal and the ratio is 0/0 (NaN) there; those
# cells are drawn grey by plot_matrix.
# `t` is first bound to the table returned by plot_matrix and then rebound
# to a tree below.
c, t = plot_matrix(1 - (vdist20 / unlinked_var));

# add the tree to the left of the matrix
# NOTE(review): a 12-tip bifurcating tree should have fewer than 36 nodes --
# confirm the idx keys (39, 36) actually match edges on this tree.
a = c.cartesian(bounds=(5, 45, 50, 250))
t = tree.ladderize(1)
t.draw(
    axes=a, 
    tip_labels=False,
    edge_colors=t.get_edge_values_mapped({39: 'red', 36: 'red'}, False),
);
a.show = False

# add the tree above the matrix, drawn with layout 'd'
# NOTE(review): same concern for idx keys (32, 35).
a = c.cartesian(bounds=(50, 250, 5, 45))
t = tree.ladderize(0)
t.draw(
    axes=a, 
    tip_labels=False,
    layout='d',
    edge_colors=t.get_edge_values_mapped({32: 'red', 35: 'red'}, False),
);
a.show = False

# add tip labels below the matrix, with the first character of each label
# stripped (i[1:])
a = c.cartesian(bounds=(50, 251, 260, 260))
a.text(
    np.arange(tree.ntips),
    np.repeat(0, tree.ntips),
    [i[1:] for i in tree.get_tip_labels()],
    style={"font-size": "8px", "fill": "black"}
);
a.show = False

# save plot as SVG, then leave the canvas as the cell's last expression
toyplot.svg.render(c, "../figures/bigvar-Ne.svg")
c

### Multi-species coalescent is not affected with True genealogies
With unlinked genealogies the greater variance in position of taxa with long generation times is no different than the greater variance caused by differences in Ne. In other words, generation time can be ignored, since the Ne parameter can effectively absorb any variance explained by differences in generation time.

In [177]:
# # infer a species tree
# ast = ipa.astral([i.write() for i in trees])
# ast.run()

In [178]:
# assume julia & snaq are already installed.
# needs to parse the returned network into major tree + edge tuples
# snaq = ipa.snaq()

In [179]:
# toytree.tree(ast.tree).root(regex='r[0-8]').draw(layout='d');

### Simulate sequence data on genealogies under a GTR model

In [None]:
# simulate 10 loci that are each 3000bp in length on the generation-scaled
# tree (constant Ne=10e6, fixed seed)
# NOTE(review): no substitution model is passed to Model here -- confirm the
# default matches the GTR model named in the section heading.
# NOTE(review): nsites is given as a float (3e3) -- confirm sim_loci accepts
# a non-integer value.
model = ipcoal.Model(gtree, Ne=10e6, seed=12345)
model.sim_loci(nloci=10, nsites=3e3)
# write the simulated loci to an HDF5 file named "bigvar-g" in the "db" dir
model.write_loci_to_hdf5(name="bigvar-g", outdir="db")

In [None]:
# NOTE(review): bare attribute reference -- nothing is called here; this
# looks like an unfinished cell.
model.infer