## Generation-time branch attraction

This notebook focuses on the effects of variation in generation time (g) among lineages.

In [None]:
# conda install ipcoal ipyrad -c conda-forge -c bioconda 

In [1]:
import os

import numpy as np
import pandas as pd
import toytree
import toyplot, toyplot.svg
import ipcoal
import ipyrad.analysis as ipa

### Treestyle

In [26]:
# shared toytree drawing style, unpacked into the tree .draw(**ts) calls
# in this notebook
ts = {
    'ts': 'p',
    'layout': 'r',
    'height': 300,
    'node_sizes': 7,
    'node_style': {"stroke": "black", 'stroke-width': 2},
    'node_labels': False,
    'edge_type': 'c',
    'tip_labels_align': True,
    # 'scalebar' used to be listed twice (False, then True); only the last
    # entry of a dict literal takes effect, so the single effective value is kept
    'scalebar': True,
}

### Starting simulation scenario with deep divergences

In [3]:
# build an ultrametric imbalanced species tree with 8 tips
NTIPS = 8
TREE_HEIGHT = 20e6  # root height; presumably in absolute time units -- TODO confirm
tree = toytree.rtree.imbtree(NTIPS, treeheight=TREE_HEIGHT)
tree.draw(**ts);

### Modify species tree params to make "variable g" tree

In [5]:
# assign a generation time (g) to every node: 10 for the listed node
# indices and 1 for all remaining nodes
long_g_nodes = (3, 4, 5, 10, 11, 12)
gtree = tree.set_node_values(
    feature="g",
    values=dict.fromkeys(long_g_nodes, 10),
    default=1,
)

# rescale every edge length by dividing it by that node's g value
# (presumably converting time units into generations -- TODO confirm)
scaled_dists = {
    idx: node.dist / node.g for idx, node in gtree.idx_dict.items()
}
gtree = gtree.set_node_values(feature="dist", values=scaled_dists)

gtree.draw(**ts);

#### Save to file

In [7]:
# make sure the output directory exists so this cell also works on a fresh
# checkout, then save the variable-g species tree
os.makedirs("./trees", exist_ok=True)
gtree.write("./trees/tree-g.nwk")

### Confirm concatenation is in the anomaly zone 

Here we aim to select a scenario that will lie in the 'anomaly zone', where concatenation will yield incorrect results but a proper MSC method should infer a correct result.

In [8]:
# simulate a single long chromosome (1 locus of 1e6 sites) on the variable-g
# tree and write the sequences to an HDF5 database in ./db
model = ipcoal.Model(gtree, Ne=10e6, seed=123)
model.sim_loci(nloci=1, nsites=1e6)
model.write_loci_to_hdf5(name="g-concat", outdir="db")

In [10]:
# summarize the genealogies table of the simulated chromosome
genealogy_table = model.df

# NOTE(review): this reports the largest tidx value; if tidx is 0-based the
# true count is one higher -- confirm
print('ngenealogies:', genealogy_table.tidx.max())
print('mean len of gtree: {:.2f} bp'.format(genealogy_table.nbps.mean()))

# show part of the locus (pretty high variation); the arguments are
# presumably (locus index, start, end) -- TODO confirm
model.draw_seqview(0, 0, 50);

ngenealogies: 116401
mean len of gtree: 8.59 bp


In [24]:
# init raxml inference on the concatenated chromosome. The database path is
# relative to the notebook (the simulation cell writes it with outdir="db")
# instead of an absolute home-directory path, so the notebook runs on any machine.
tool = ipa.treeslider(
    data="./db/g-concat.seqs.hdf5",
    name="g-concat",
    scaffold_idxs=0,
    # options forwarded to the raxml command line (see the printed command)
    inference_args={
        "f": "d",
        "N": 10,
        "p": 12345,
        "x": None,
        "T": 20,
    }
)
tool.ipcluster['threads'] = 20
tool.show_inference_command()

# infer concat tree
tool.run(auto=True, force=True)

/home/deren/miniconda3/envs/ipy/bin/raxmlHPC-PTHREADS-AVX2 -f d -T 20 -m GTRGAMMA -n ... -w ... -s ... -p 12345 -N 10
building database: nwindows=1; minsnps=1
[####################] 100% 0:00:22 | inferring trees 
tree_table written to /home/deren/gentime-attraction/notebooks/analysis-treeslider/g-concat.tree_table.csv


In [35]:
# re-load the treeslider results table, take the first inferred tree,
# root it on "r7", and draw it
tw = pd.read_csv("./analysis-treeslider/g-concat.tree_table.csv")
concat_newick = tw.tree[0]
etree = toytree.tree(concat_newick).root("r7")
etree.draw(**ts);

#### Save to file

In [36]:
# save the rooted concatenation tree alongside the other result trees
etree.write("./trees/tree-g-concat.nwk")

### Confirm ASTRAL-genealogy is not an anomaly
ASTRAL recovers the correct tree when given more than ~500 trees as input.

In [38]:
# simulate 10000 unlinked genealogies (a single site per locus)
model = ipcoal.Model(gtree, Ne=10e6, seed=123)
model.sim_trees(nloci=10000, nsites=1)

# draw the first ten simulated genealogies
first_ten = model.df.genealogy[:10]
toytree.mtree(first_ten).draw();

In [45]:
# load every simulated genealogy into a multitree
mtre = toytree.mtree(model.df.genealogy)

# rescale each edge length by 1e-8 (convert bl units to E(subst.))
for genealogy in mtre.treelist:
    for node in genealogy.idx_dict.values():
        node.dist *= 1e-8

# infer MSC sptree with ASTRAL from the rescaled trees
rescaled_newicks = [genealogy.write() for genealogy in mtre.treelist]
ast = ipa.astral(data=rescaled_newicks, name="n-astral-genealogy")
ast.run()

# root the inferred sptree on "r7" and draw it
atree = toytree.tree(ast.tree).root("r7")
atree.draw(**ts);

[astral.5.7.1.jar]
inferred tree written to (/home/deren/gentime-attraction/notebooks/analysis-astral/n-astral-genealogy.tre)


#### save to file

In [46]:
# save the rooted ASTRAL tree; this file is later passed to SNaQ as `netin`
atree.write("./trees/tree-g-astral-true.nwk")

### Confirm SNAQ-genealogy is not an anomaly

In [23]:
# simulate 10000 unlinked genealogies (same settings and seed as for ASTRAL)
model = ipcoal.Model(gtree, Ne=10e6, seed=123)
model.sim_trees(nloci=10000, nsites=1)

# draw the first ten simulated genealogies
first_ten = model.df.genealogy[:10]
toytree.mtree(first_ten).draw();

In [52]:
# (re-)load the simulated genealogies into a multitree
mtre = toytree.mtree(model.df.genealogy)

# rescale each edge length by 1e-8 (convert bl units to E(subst.))
for genealogy in mtre.treelist:
    for node in genealogy.idx_dict.values():
        node.dist *= 1e-8

# write re-formatted trees to a tmp file
mtre.write("/tmp/trees.nwk")

# arguments shared by both SNaQ runs; only `name` and `nedges` differ
snaq_shared = dict(
    gtrees="/tmp/trees.nwk",
    netin="./trees/tree-g-astral-true.nwk",
    nruns=10,
    nproc=10,
    seed=123,
)

# run SNaQ with nedges=0
snaq0 = ipa.snaq(name="n-snaq-genealogy-net0", nedges=0, **snaq_shared)
snaq0.run()

# run SNaQ with nedges=1
snaq1 = ipa.snaq(name="n-snaq-genealogy-net1", nedges=1, **snaq_shared)
snaq1.run()

using existing CF table: /home/deren/gentime-attraction/notebooks/analysis-snaq/n-snaq-genealogy-net0.CFs.csv
[SNAQ v.x.y]
[nproc = 10]
julia /home/deren/gentime-attraction/notebooks/analysis-snaq/n-snaq-genealogy-net0.jl
inferred network written to (/home/deren/gentime-attraction/notebooks/analysis-snaq/n-snaq-genealogy-net0.snaq)
[SNAQ v.x.y]
[nproc = 10]
julia /home/deren/gentime-attraction/notebooks/analysis-snaq/n-snaq-genealogy-net1.jl
inferred network written to (/home/deren/gentime-attraction/notebooks/analysis-snaq/n-snaq-genealogy-net1.snaq)


In [60]:
# show the admixture edge info stored on the nedges=1 SNaQ result
snaq1.admix

{'H9': (['r6'], ['r2'], 0.5, {}, '0.017')}

In [62]:
# root the nedges=1 SNaQ result on "r7" and draw it with its admixture edges
stree = toytree.tree(snaq1.tree).root("r7")
admix_edges = snaq1.admix.values()
stree.draw(admixture_edges=admix_edges, **ts);

#### Save result to file

In [68]:
# write the rooted tree built from the nedges=1 SNaQ result to file
# NOTE(review): only `stree` is written here; the admixture edges live in
# snaq1.admix -- confirm whether they should be saved as well
stree.write("trees/tree-g-snaq-true.nwk")

### Estimate gene trees

Simulate 10K loci and test a range of locus lengths to allow examining the effect of locus length.

In [None]:
# simulate 10000 loci that are each 2000bp in length, then write both the
# sequences and the SNPs to HDF5 databases in ./sim-loci
SIM_NAME = "n-10000l-2000s"
model = ipcoal.Model(gtree, Ne=10e6, seed=123)
model.sim_loci(nloci=10000, nsites=2e3)
model.write_loci_to_hdf5(name=SIM_NAME, outdir="sim-loci")
model.write_snps_to_hdf5(name=SIM_NAME, outdir="sim-loci")

In [None]:
# preview part of the simulated data; the arguments are presumably
# (locus index, start, end) -- TODO confirm
model.draw_seqview(0, 0, 50);

In [None]:
# init tree slider inference tool. It is bound to `slider` rather than `ts`
# so that the tree-style dict `ts`, which later drawing cells unpack with
# `**ts`, is not overwritten by this object.
slider = ipa.treeslider(
    data="./sim-loci/n-10000l-2000s.seqs.hdf5",
    name="n-10000l-2000s",
    # one gene tree per simulated locus
    scaffold_idxs=range(10000),
    inference_args={
        "N": 10,
        "f": "d",
        "p": 12345,
        "x": None,
        "T": 2,
    }
)

# infer all gene trees
slider.ipcluster['cores'] = 20
slider.ipcluster['threads'] = 2
slider.run(auto=True, force=True)

In [72]:
# load tree slider (gene tree inference) results
tw = pd.read_csv(
    "./analysis-treeslider/n-10000l-2000s.tree_table.csv",
    index_col=0,
)

# load the gene trees into a multitree and root each one on "r7"
mtre = toytree.mtree(tw.tree)
rooted_trees = []
for genetree in mtre.treelist:
    rooted_trees.append(genetree.root("r7"))
mtre.treelist = rooted_trees
mtre.draw(ts='o', use_edge_lengths=False);

# write list of trees as a tmp file
mtre.write("/tmp/trees-g.nwk")

In [88]:
# SNaQ on the inferred gene trees.
# - Each run gets its own `name`; both previously shared one name (a pasted
#   file name), so the nedges=1 outputs overwrote the nedges=0 outputs.
# - `netin` is the ASTRAL tree that this notebook saves to ./trees, the same
#   starting topology used for the genealogy-based SNaQ runs.
# - A fixed seed is set, as in the genealogy-based runs.

# run with nedges=0
snaq0 = ipa.snaq(
    gtrees="/tmp/trees-g.nwk",
    netin="./trees/tree-g-astral-true.nwk",
    name="n-snaq-genetree-net0",
    nedges=0,
    nruns=10,
    nproc=10,
    seed=123,
)
snaq0.run()

# run with nedges=1
snaq1 = ipa.snaq(
    gtrees="/tmp/trees-g.nwk",
    netin="./trees/tree-g-astral-true.nwk",
    name="n-snaq-genetree-net1",
    nedges=1,
    nruns=10,
    nproc=10,
    seed=123,
)
snaq1.run()

In [None]:
# show the admixture edge info from the nedges=1 run on the inferred gene trees
snaq1.admix

In [None]:
# root the nedges=1 gene-tree SNaQ result on "r7" and draw it with its
# admixture edges
# NOTE(review): `**ts` requires `ts` to be the tree-style dict here, not a
# treeslider object -- confirm nothing has rebound it before this cell
stree = toytree.tree(snaq1.tree).root("r7")
admix_edges = snaq1.admix.values()
stree.draw(admixture_edges=admix_edges, **ts);

In [36]:
# draw the rooted nedges=1 SNaQ result with the built-in 's' tree style
snaq_net = toytree.tree(snaq1.tree).root("r7")
snaq_net.draw(ts='s', admixture_edges=snaq1.admix.values());