## Generation-time branch attraction

This notebook focuses on Ne effects

In [1]:
import toytree
import toyplot, toyplot.svg
import ipcoal
import numpy as np
import ipyrad.analysis as ipa

### Simulation scenario with deep divergences

In [2]:
# get an ultrametric imbalanced tree
tree = toytree.rtree.imbtree(8, treeheight=20e6)
tree.draw(ts='p');

In [3]:
# set gentime on nodes
ntree = tree.set_node_values(
    feature="Ne", 
    values={i: 100e6 for i in (3,4,5,10,11,12)},
    default=10e6,
)

# show tc (coalescent time units) for all node dists
ntree = ntree.set_node_values(
    feature="tc",
    values={
        i: node.dist / (2 * node.Ne) 
        for i, node in ntree.idx_dict.items()
    },
)

# draw tree showing Ne and g dists
ntree.draw(
    ts='p', 
    width=400, 
    node_sizes=0, 
    node_labels=ntree.get_node_values('tc', 0, 0),
    edge_type='c',
);

### Concatenation: Confirm scenario is in the anomaly zone 

Here we aim to select a scenario that will lie in the 'anomaly zone', where concatenation will yield incorrect results but a proper MSC method should infer a correct result.

In [4]:
model = ipcoal.Model(ntree)
model.sim_loci(1, 1e6)

In [18]:
# the total number of genealogies
print('ngenealogies:', model.df.tidx.max())
print('mean len of gtree: {:.2f} bp'.format(model.df.nbps.mean()))

ngenealogies: 166948
mean len of gtree: 5.99 bp


In [5]:
model.infer_gene_trees(inference_args={"T": '20'})

wrote concat locus (8 x 1000000bp) to /tmp/21065.phy


In [6]:
etree = toytree.tree(model.df.inferred_tree[0]).root('r7')
etree.draw(ts='o');

### Confirm that species tree is not an anomaly
Astral gets the correct tree when given > ~500 trees as input.

In [68]:
# load trees and convert edge lengths to E(mut/site)
mtre = toytree.mtree(model.df.genealogy[::300])
print(len(mtre))
for tre in mtre.treelist:
    for node in tre.idx_dict.values():
        node.dist = node.dist * 1e-8

# infer sptree
ast = ipa.astral([i.write() for i in mtre.treelist])
ast.run()
toytree.tree(ast.tree).root("r7").draw(ts='o');

557
[astral.5.7.1.jar]
inferred tree written to (/home/deren/gentime-attraction/notebooks/analysis-astral/test.tre)


### Store the true tip order for storing ordered results

In [4]:
# store the alphanumeric order of names on the tree
NAMEORDER = tree.get_tip_labels()

### Species tree in units of generations
To setup a simulation on this tree we need branch lengths to be in units of generations. If we assume that 1 generation = 1 year then nothing has to be done. Here we assume that generations times are 1 for half of the taxa on the tree, but 1 generation = 10 years for several other taxa. The tree below shows that this looks like in terms of the transformed branch lengths. 

Here the edge lengths of the species tree are all the same in coalescent units.

In [5]:
# set gentime on nodes
ntree = tree.set_node_values(
    feature="Ne", 
    values={i: 100e6 for i in (3,4,5,10,11,12)},
    default=10e6,
)

# show tc (coalescent time units) for all node dists
print("tc:", ntree.get_node_values("dist", 1, 1) / (2 * ntree.get_node_values("Ne", 1, 1)))

# draw tree showing Ne and g dists
ntree.draw(ts='p', width=400, node_sizes=0, node_labels=False, edge_type='c');

tc: [0.14285714 0.14285714 0.01428571 0.01428571 0.01428571 0.14285714
 0.14285714 1.         0.85714286 0.07142857 0.05714286 0.04285714
 0.28571429 0.14285714 0.14285714]


In [6]:
# set gentime on nodes
gtree = tree.set_node_values(
    feature="g", 
    values={i: 10 for i in (3,4,5,10,11,12)},
    default=1,
)

# set gentime on nodes
gtree = gtree.set_node_values(
    feature="dist", 
    values={i: j.dist / j.g for (i, j) in gtree.idx_dict.items()}
)

# show tc (coalescent time units) for all node dists
print("tc:", gtree.get_node_values("dist", 1, 1) / (2 * 10e6))

# draw tree showing Ne and g dists
gtree.draw(ts='p', width=400, node_sizes=0, node_labels=False, edge_type='p');

tc: [0.14285714 0.14285714 0.01428571 0.01428571 0.01428571 0.14285714
 0.14285714 1.         0.85714286 0.07142857 0.05714286 0.04285714
 0.28571429 0.14285714 0.14285714]


In [7]:
# set gentime on nodes
ftree = tree.set_node_values(
    feature="g", 
    values={i: 10 for i in (3,4,5,10,11,12)},
    default=1,
)

ftree = ftree.set_node_values(
    feature="Ne", 
    values={i: 1e6 for i in (3,4,5,10,11,12)},
    default=10e6,
)

# set gentime on nodes
ftree = ftree.set_node_values(
    feature="dist", 
    values={i: j.dist / j.g for (i, j) in ftree.idx_dict.items()}
)

# show tc (coalescent time units) for all node dists
print("tc:", ftree.get_node_values("dist", 1, 1) / (2 * ftree.get_node_values("Ne", 1, 1)))

# draw tree showing Ne and g dists
ftree.draw(ts='p', width=400, node_sizes=0, node_labels=False, edge_type='c');

tc: [0.14285714 0.14285714 0.14285714 0.14285714 0.14285714 0.14285714
 0.14285714 1.         0.85714286 0.71428571 0.57142857 0.42857143
 0.28571429 0.14285714 0.14285714]


### Example sequential genealogies

In [8]:
# simulate K unlinked genealogies
gu_model = ipcoal.Model(gtree, Ne=10e6, seed=12345)
gu_model.sim_trees(10000)
gu_trees = toytree.mtree(gu_model.df.genealogy)
gu_trees.draw(ts='n', layout='d', height=225, width=900);
print(len(gu_trees))

10000


In [9]:
# simulate K linked genealogies
gl_model = ipcoal.Model(gtree, Ne=10e6, seed=12345)
gl_model.sim_trees(nloci=1, nsites=50000)
gl_trees = toytree.mtree(gl_model.df.genealogy)
gl_trees.draw(ts='n', layout='d', height=225, width=900);
print(len(gl_trees))

5875


### Are branches attracted by generation times on true genealogies?

- current approach is to measure topological pairwise dist between all nodes. 

- alternative could be to measure avg pairwise dist on each node too all other nodes...

In [10]:
def topo_dist(tree):
    """
    Return alphanumeric name ordered array of topo dist
    between all nodes on a tree.
    """
    t1 = tree.unroot()
    
    # array of tips in alphanumeric ordr
    arr = np.zeros((len(t1), len(t1)), dtype=float)
    
    # iterate to compare all tips to tips
    labels = NAMEORDER
    for idx1 in range(len(t1)):
        for idx2 in range(len(t1)):
            if idx1 != idx2:
                
                # get distance between tips on every tree
                node1 = t1.idx_dict[idx1]
                node2 = t1.idx_dict[idx2]
                dist = t1.treenode.get_distance(
                    node1, node2, topology_only=True,
                )
                nidx1 = labels.index(node1.name)
                nidx2 = labels.index(node2.name)
                arr[nidx1, nidx2] = dist
    return arr

In [11]:
def sequential_dist(trees):
    """
    Return array of topo dists between nodes on sequential trees
    in ordered input.
    """
    arrs = []
    dists = None
    for tree in trees:
        if dists is not None:
            diff = abs(dists - topo_dist(tree))
            arrs.append(diff)
        dists = topo_dist(tree)
    return np.array(arrs)

In [12]:
def distance_dist(df, dist=100, nsamples=10000):
    """
    Return array of topo dists between nodes on a chrom separated
    by a set distance in bp.
    """
    arrs = []
    
    # randomly sample 1000 trees
    tidxs = np.random.uniform(df.start.min(), df.start.max(), nsamples)
    
    for tidx in tidxs:
        
        # sample random tree
        t1 = df.loc[df.end >= tidx, "genealogy"].iloc[0]
        
        # sample another tree dist away
        if tidx + dist < df.end.max():
            t2 = df.loc[df.end >= tidx + dist, "genealogy"].iloc[0]
        else:
            t2 = df.loc[df.end >= tidx - dist, "genealogy"].iloc[0]

        # get topo distance between the two trees
        tree1 = toytree.tree(t1)
        tree2 = toytree.tree(t2)
        diff = abs(topo_dist(tree1) - topo_dist(tree2))
        
        # store result
        arrs.append(diff)
    return np.array(arrs)

In [13]:
def plot_matrix(dat, domain_min=None, domain_max=None):
    
    # get min and max of the off-diagonal elements
    dat = dat.copy()
    dat[np.diag_indices_from(dat)] = np.nanmean(dat)
    dmin = (domain_min if domain_min is not None else dat.min())
    dmax = (domain_max if domain_max is not None else dat.max())
    cmap = toyplot.color.LinearMap(domain_min=dmin, domain_max=dmax)
    canvas = toyplot.Canvas(320, 300)
    ax1 = canvas.table(rows=dat.shape[0], columns=dat.shape[1], bounds=(50, 250, 50, 250))

    # apply colors to cells but not diagonals
    for ridx in range(dat.shape[0]):
        for cidx in range(dat.shape[1]):
            
            if ridx == cidx:
                ax1.cells.cell[ridx, cidx].style = {'fill': 'grey', 'stroke': 'none'}
            else:
                col = cmap.color(dat[ridx, cidx])
                ax1.cells.cell[ridx, cidx].style = {'fill': col, 'stroke': 'none'}

    # style spacing between grid cells            
    ax1.body.gaps.columns[...] = 1
    ax1.body.gaps.rows[...] = 1

    # add a colorbar to canvas
    numberline = canvas.numberline(260, 250, 260, 50)
    numberline.colormap(cmap, style={"stroke-width":5}, offset=-10)
    numberline.axis.ticks.locator = toyplot.locator.Extended(only_inside=True)
    return canvas, ax1

### GTREE

In [14]:
gunlinked_var = sequential_dist(gu_trees.treelist).var(axis=0)

In [15]:
gl50 = distance_dist(gl_model.df, 50)
gl100 = distance_dist(gl_model.df, 100)
gl200 = distance_dist(gl_model.df, 200)

In [28]:
# plot the variance in differences between UNLINKED trees
c0, t0 = plot_matrix(gunlinked_var, 0.5, 2);

# plot the variance in differences between UNLINKED trees
c1, t1 = plot_matrix(gl100.mean(0), 0.5, 2);

# plot the variance in differences between UNLINKED trees
c2, t2 = plot_matrix(1 - (gl100.mean(0) - gunlinked_var), 0.5, 2);

In [29]:
toyplot.svg.render(c0, "../figures/8tips-imb-g-unlinked.svg")
toyplot.svg.render(c1, "../figures/8tips-imb-g-linked100.svg")
toyplot.svg.render(c2, "../figures/8tips-imb-g-difference.svg")

In [17]:
# # plot the difference in variance between linked and unlinked
# c, t = plot_matrix(1 - (gl100.mean(0) - gunlinked_var), 0.5, 2);
# #toyplot.svg.render(c, "../figures/8tips-imb-g.svg")

In [18]:
# a = c.cartesian(bounds=(5, 45, 55, 245))
# t = tree.ladderize(1)
# t.draw(
#     axes=a, 
#     tip_labels=False,
#     #edge_colors=t.get_edge_values_mapped({16: 'red', 19: 'red'}, False),
# );
# a.show = False

# a = c.cartesian(bounds=(55, 245, 5, 45))
# t = tree.ladderize(0)
# t.draw(
#     axes=a, 
#     tip_labels=False,
#     layout='d',
#     #edge_colors=t.get_edge_values_mapped({16: 'red', 19: 'red'}, False),
# );
# a.show = False

# a = c.cartesian(bounds=(55, 247, 260, 260))
# a.text(
#     np.arange(tree.ntips),
#     np.repeat(0, tree.ntips),
#     [i[1:] for i in tree.get_tip_labels()],
#     style={"font-size": "8px", "fill": "black", "text-anchor": "middle"}
# );
# a.show = False

# # save plot
# toyplot.svg.render(c, "../figures/8tips-imb-g.svg")
# c

### N tree model

In [19]:
# simulate K unlinked genealogies
nu_model = ipcoal.Model(ntree, seed=12345)
nu_model.sim_trees(10000)
nu_trees = toytree.mtree(nu_model.df.genealogy)
nu_trees.draw(ts='n', layout='d', height=225, width=900);
print(len(nu_trees))

# simulate K linked genealogies
nl_model = ipcoal.Model(ntree, seed=12345)
nl_model.sim_trees(nloci=1, nsites=50000)
nl_trees = toytree.mtree(nl_model.df.genealogy)
nl_trees.draw(ts='n', layout='d', height=225, width=900);
print(len(nl_trees))

10000
8379


In [20]:
nunlinked_var = sequential_dist(nu_trees.treelist).var(axis=0)

In [21]:
nl50 = distance_dist(nl_model.df, 50)
nl100 = distance_dist(nl_model.df, 100)
nl200 = distance_dist(nl_model.df, 200)

In [30]:
# plot the variance in differences between UNLINKED trees
c0, t0 = plot_matrix(nunlinked_var, 0.5, 2);

# plot the variance in differences between UNLINKED trees
c1, t1 = plot_matrix(nl100.mean(0), 0.5, 2);

# plot the variance in differences between UNLINKED trees
c2, t2 = plot_matrix(1 - (nl100.mean(0) - nunlinked_var), 0.5, 2);

In [31]:
toyplot.svg.render(c0, "../figures/8tips-imb-n-unlinked.svg")
toyplot.svg.render(c1, "../figures/8tips-imb-n-linked100.svg")
toyplot.svg.render(c2, "../figures/8tips-imb-n-difference.svg")

In [23]:
# # plot the difference in variance between linked and unlinked
# c, t = plot_matrix(1 - (nl100.mean(0) - nunlinked_var), 0.5, 2);

# a = c.cartesian(bounds=(5, 45, 60, 240))
# t = tree.ladderize(1)
# t.draw(
#     axes=a, 
#     tip_labels=False,
#     #edge_colors=t.get_edge_values_mapped({16: 'red', 19: 'red'}, False),
# );
# a.show = False

# a = c.cartesian(bounds=(60, 240, 5, 45))
# t = tree.ladderize(0)
# t.draw(
#     axes=a, 
#     tip_labels=False,
#     layout='d',
#     #edge_colors=t.get_edge_values_mapped({16: 'red', 19: 'red'}, False),
# );
# a.show = False

# a = c.cartesian(bounds=(60, 240, 260, 260))
# a.text(
#     np.arange(tree.ntips),
#     np.repeat(0, tree.ntips),
#     [i[1:] for i in tree.get_tip_labels()],
#     style={"font-size": "8px", "fill": "black", "text-anchor": "middle"}
# );
# a.show = False

# # save plot
# # toyplot.svg.render(c, "../figures/8tips-Ne.svg")
# c

### Ftree model has equal tc on all internal edges

In [24]:
# simulate K unlinked genealogies
fu_model = ipcoal.Model(ftree, seed=12345)
fu_model.sim_trees(10000)
fu_trees = toytree.mtree(fu_model.df.genealogy)
fu_trees.draw(ts='n', layout='d', height=225, width=900);
print(len(fu_trees))

# simulate K linked genealogies
fl_model = ipcoal.Model(ftree, seed=12345)
fl_model.sim_trees(nloci=1, nsites=50000)
fl_trees = toytree.mtree(fl_model.df.genealogy)
fl_trees.draw(ts='n', layout='d', height=225, width=900);
print(len(fl_trees))

10000
5324


In [25]:
funlinked_var = sequential_dist(fu_trees.treelist).var(axis=0)
fl50 = distance_dist(fl_model.df, 50)
fl100 = distance_dist(fl_model.df, 100)
fl200 = distance_dist(fl_model.df, 200)

In [32]:
# plot the variance in differences between UNLINKED trees
c0, t0 = plot_matrix(funlinked_var, 0.5, 2);

# plot the variance in differences between UNLINKED trees
c1, t1 = plot_matrix(fl100.mean(0), 0.5, 2);

# plot the variance in differences between UNLINKED trees
c2, t2 = plot_matrix(1 - (fl100.mean(0) - funlinked_var), 0.5, 2);

In [33]:
toyplot.svg.render(c0, "../figures/8tips-imb-f-unlinked.svg")
toyplot.svg.render(c1, "../figures/8tips-imb-f-linked100.svg")
toyplot.svg.render(c2, "../figures/8tips-imb-f-difference.svg")

In [27]:
# # plot the difference in variance between linked and unlinked
# c, t = plot_matrix(1 - (fl100.mean(0) - funlinked_var), 0.5, 2);

# a = c.cartesian(bounds=(5, 45, 60, 240))
# t = tree.ladderize(1)
# t.draw(
#     axes=a, 
#     tip_labels=False,
#     #edge_colors=t.get_edge_values_mapped({16: 'red', 19: 'red'}, False),
# );
# a.show = False

# a = c.cartesian(bounds=(60, 240, 5, 45))
# t = tree.ladderize(0)
# t.draw(
#     axes=a, 
#     tip_labels=False,
#     layout='d',
#     #edge_colors=t.get_edge_values_mapped({16: 'red', 19: 'red'}, False),
# );
# a.show = False

# a = c.cartesian(bounds=(60, 240, 260, 260))
# a.text(
#     np.arange(tree.ntips),
#     np.repeat(0, tree.ntips),
#     [i[1:] for i in tree.get_tip_labels()],
#     style={"font-size": "8px", "fill": "black", "text-anchor": "middle"}
# );
# a.show = False

# # save plot
# # toyplot.svg.render(c, "../figures/8tips-Ne.svg")
# c

### Simulate sequence data on genealogies under a GTR model
Here we should generate a pretty large number of loci and sites. 2-3K sites should be high enough to observe gene tree estimation error within loci due to concatelescence. Also, 5K loci should be enough to provide enough unlinked SNPs to provide power to methods like SVDquartets (tetrad). 

In [None]:
# simulate loci that are each 3000bp in length
model = ipcoal.Model(gtree, Ne=10e6, seed=12345)
model.sim_loci(nloci=10000, nsites=2e3)
model.write_loci_to_hdf5(name="8tips-imb-10K-g", outdir="db")
model.write_snps_to_hdf5(name="8tips-imb-10K-g", outdir="db")

In [None]:
# simulate loci that are each 3000bp in length
model = ipcoal.Model(ntree, seed=12345)
model.sim_loci(nloci=10000, nsites=2e3)
model.write_loci_to_hdf5(name="8tips-imb-10K-n", outdir="db")
model.write_snps_to_hdf5(name="8tips-imb-10K-n", outdir="db")

In [None]:
# simulate loci that are each 3000bp in length
model = ipcoal.Model(ftree, seed=12345)
model.sim_loci(nloci=10000, nsites=2e3)
model.write_loci_to_hdf5(name="8tips-imb-10K-f", outdir="db")
model.write_snps_to_hdf5(name="8tips-imb-10K-f", outdir="db")

### Infer tetrad tree on data

In [None]:
import ipyrad.analysis as ipa

In [175]:
tet = ipa.tetrad(
    name="8tips-imb-5K-n", 
    data="./db/8tips-imb-5K-n.snps.hdf5", 
    #nboots=100,
)
#tet.run(force=True, auto=True)

loading snps array [8 taxa x 8806178 snps]
max unlinked SNPs per quartet [nloci]: 5000
quartet sampler [full]: 70 / 70


In [177]:
tet._refresh()
tet._init_odb()
tet._init_idb_quartets(True)

In [181]:
from tetrad.worker import *

In [225]:
with h5py.File(tet.files.idb, 'r') as io5:
    seqview = io5["bootsarr"][:]
    maparr = io5["bootsmap"][:]
    smps = io5["quartets"][:]

In [226]:
TIDXS = np.array([
    [0, 1, 2, 3], 
    [0, 2, 1, 3], 
    [0, 3, 1, 2]], dtype=np.uint8,
)
TESTS = np.array([0, 1, 2])
rquartets = np.zeros((smps.shape[0], 4), dtype=np.uint16)
rinvariants = np.zeros((smps.shape[0], 16, 16), dtype=np.uint16)
rnsnps = np.zeros(smps.shape[0])

In [227]:
# iterate over quartet sets
for idx in range(smps.shape[0]):

    # get quartet
    sidx = smps[idx]
    
    # get seqs of this quartet
    seqs = seqview[sidx]

    # mask sites with missing
    sums = np.sum(seqs, axis=0)
    nmask = sums > 70

    # mask invariant sites
    nmask += np.sum(seqs == seqs[0], axis=0) == 4    

    # count SNPs into 3x16x16 arrays
    cmats = full_chunk_to_matrices(seqs, maparr[:, 0], nmask)

    # skip if seqs is empty
    nsnps = cmats[0].sum()
    if not nsnps:
        qorder = TIDXS[np.random.randint(3)]        
    else:
        # empty arrs to fill
        svds = np.zeros((3, 16), dtype=np.float64)
        scor = np.zeros(3, dtype=np.float64)
        rank = np.zeros(3, dtype=np.float64)

        # svd and rank.
        for test in TESTS:
            svds[test] = np.linalg.svd(cmats[test].astype(np.float64))[1]
            rank[test] = np.linalg.matrix_rank(cmats[test].astype(np.float64))

        # get minrank, or 11 (TODO: can apply seq model here)
        minrank = int(min(10, rank.min()))
        for test in TESTS:
            scor[test] = np.sqrt(np.sum(svds[test, minrank:]**2))

        # sort to find the best qorder
        qorder = TIDXS[np.argmin(scor)]

    # store results
    rquartets[idx] = sidx[qorder]
    rinvariants[idx] = cmats[0]
    rnsnps[idx] = nsnps

In [234]:
with open("/tmp/test.q", 'w') as out:
    for i in rquartets:
        q = "{},{}|{},{}\n".format(*i)
        out.write(q)

In [240]:
toytree.tree("/tmp/test.out").root('7').draw();

In [202]:
np.linalg.svd(cmats[0].astype(np.float64))[1]

array([4.87913864e+05, 1.68145081e+05, 1.60587813e+05, 1.60359717e+05,
       1.59858391e+05, 1.17939971e+05, 1.17595906e+05, 1.17097151e+05,
       1.51898307e+04, 1.51026245e+04, 4.52932130e+02, 3.84413796e+02,
       2.92602736e+02, 2.23311836e+02, 1.95448720e+02, 3.77032092e+01])

In [213]:
a = np.sqrt(np.sum(np.linalg.svd(cmats[0].astype(np.float64))[1])**2)
b=np.sqrt(np.sum(np.linalg.svd(cmats[1].astype(np.float64))[1])**2)
c=np.sqrt(np.sum(np.linalg.svd(cmats[2].astype(np.float64))[1])**2)
a,b,c

(1521376.762634818, 1774250.8045312897, 1773970.3492976888)

In [173]:
toytree.tree(tet.trees.tree).root("r7").draw(node_labels="support");

In [174]:
tet = ipa.tetrad(
    name="8tips-imb-5K-n", 
    data="./db/8tips-imb-5K-n.snps.hdf5", 
    #nboots=100,
)
tet.run(force=True, auto=True, quiet=True)
toytree.tree(tet.trees.tree).root("r7").draw(node_labels="support");

loading snps array [8 taxa x 8806178 snps]
max unlinked SNPs per quartet [nloci]: 5000
quartet sampler [full]: 70 / 70
[####################] 100% 0:00:12 | full tree * | mean SNPs/qrt: 5000 

In [159]:
tet = ipa.tetrad(
    name="8tips-imb-5K-g", 
    data="./db/8tips-imb-5K-g.snps.hdf5", 
    #nboots=100,
)
tet.run(auto=True, quiet=True)
toytree.tree(tet.trees.tree).root("r7").draw(node_labels="support");

loading snps array [8 taxa x 2343903 snps]
max unlinked SNPs per quartet [nloci]: 1000
quartet sampler [full]: 70 / 70


In [164]:
tet = ipa.tetrad('test', data="./db/8tips-imb-n.snps.hdf5")
tet.run(force=True, auto=True)

loading snps array [8 taxa x 2642207 snps]
max unlinked SNPs per quartet [nloci]: 1000
quartet sampler [full]: 70 / 70
Parallel connection | pinky: 80 cores
initializing quartet sets database
[####################] 100% 0:00:04 | full tree * | mean SNPs/qrt: 1000 


In [165]:
toytree.tree(tet.trees.tree).root("r7").draw();

In [None]:
cat /tmp/

In [52]:
model.infer_gene_trees()

wrote concat locus (12 x 2000bp) to /tmp/28533.phy


In [57]:
model.df['dist'] = 0
for idx in model.df.index:
    t1 = toytree.tree(model.df.genealogy[idx]).unroot()
    t2 = toytree.tree(model.df.inferred_tree[idx]).unroot()
    dists = t1.treenode.robinson_foulds(t2.treenode, unrooted_trees=True)
    #     print(dists[0], dists[1])
    model.df.loc[idx, "dist"] = dists[0] / dists[1]

In [72]:
loc0 = model.df[model.df.locus == 0]

c, a, m = toyplot.fill(
    loc0.start,
    loc0.dist,
    width=500,
    height=200,
    ymin=0,
);
a.hlines(loc0.dist.mean())

<toyplot.mark.AxisLines at 0x7f4953be2710>

In [75]:
toytree.tree(model.df.inferred_tree[0]).draw();

(<toyplot.canvas.Canvas at 0x7f4946e1d150>,
 <toyplot.coordinates.Cartesian at 0x7f4946fca910>,
 <toytree.Render.ToytreeMark at 0x7f4946f9a450>)

In [None]:
# simulate loci that are each 3000bp in length
model = ipcoal.Model(ntree, seed=12345)
model.sim_loci(nloci=3000, nsites=2e3)
model.write_loci_to_hdf5(name="12tips-n", outdir="db")