## Generation-time branch attraction

This notebook focuses on (g) effects

In [1]:
# conda install ipcoal ipyrad -c conda-forge -c bioconda 

In [2]:
import numpy as np
import pandas as pd
import toytree
import toyplot, toyplot.svg
import ipcoal
import ipyrad.analysis as ipa

### Treestyle

In [35]:
ts = {
    'ts': 'p', 
    'layout': 'r',
    'height': 300,
    'node_sizes': 7,
    'node_style': {"stroke": "black", 'stroke-width': 2},
    'node_labels': False,
    'edge_type': 'c',
    'scalebar': False,
    'tip_labels_align': True,
    'scalebar': True,
}

### Starting simulation scenario with deep divergences

In [26]:
# get an ultrametric imbalanced tree
tree = toytree.rtree.imbtree(8, treeheight=20e6)
tree.draw(**ts);

### Modify species tree params to make "variable N" tree

In [27]:
# set gentime on nodes
ntree = tree.set_node_values(
    feature="Ne", 
    values={i: 100e6 for i in (3,4,5,10,11,12)},
    default=10e6,
)

# draw the tree
ntree.draw(**ts);

# save to file
ntree.write("./trees/n-tree.nwk")

### Simulate a concatenated sequence (chromosome)

In [15]:
# simulate a long chrom
model = ipcoal.Model(ntree, seed=123)
model.sim_loci(1, 1e6)

# write to db
model.write_loci_to_hdf5(name="n-concat", outdir="db")

# show the total number of genealogies
print('ngenealogies:', model.df.tidx.max())
print('mean len of gtree: {:.2f} bp'.format(model.df.nbps.mean()))

# show the locus (pretty high variation)
model.draw_seqview(0, 0, 50);

wrote 1 loci to /home/deren/gentime-attraction/notebooks/db/n-concat-2.seqs.hdf5
ngenealogies: 151455
mean len of gtree: 6.60 bp


### Confirm concatenation is in the anomaly zone 

Here we aim to select a scenario that will lie in the 'anomaly zone', where concatenation will yield incorrect results but a proper MSC method should infer a correct result.

In [16]:
# init raxml inference
tool = ipa.treeslider(
    data="./db/n-concat.seqs.hdf5",
    name="n-concat",
    scaffold_idxs=0,
    inference_args={
        "f": "d",
        "N": 10,
        "p": 12345,
        "x": None,
        "T": 20,
    }
)
tool.ipcluster['threads'] = 20
tool.show_inference_command()

# infer concat tree
tool.run(auto=True, force=True)

/home/deren/miniconda3/envs/ipy/bin/raxmlHPC-PTHREADS-AVX2 -f d -T 20 -m GTRGAMMA -n ... -w ... -s ... -p 12345 -N 10
building database: nwindows=1; minsnps=1
[####################] 100% 0:00:28 | inferring trees 
tree_table written to /home/deren/gentime-attraction/notebooks/analysis-treeslider/n-concat.tree_table.csv


In [47]:
# re-load the newick from treeslider output
tw = pd.read_csv("./analysis-treeslider/n-concat.tree_table.csv")
etree = toytree.tree(tw.tree[0]).root("r7")

# draw the concat tree
etree.draw(**ts);

# save tree file
etree.write("./trees/n-concat.nwk")

### Simulate unlinked genealogies

In [23]:
# simulate many unlinked genealogies (same as above)
model = ipcoal.Model(ntree, seed=123)
model.sim_trees(nloci=10000, nsites=1)

# load into a tree object and draw a few
mtre = toytree.mtree(model.df.genealogy)
mtre.draw();

# convert bl units to E(subst.)
for tre in mtre.treelist:
    for node in tre.idx_dict.values():
        node.dist = node.dist * 1e-8      
        
# write re-formatted trees to a tmp file
mtre.write("./trees/n-genealogies.nwk")

### Confirm ASTRAL-genealogy is not an anomaly


In [25]:
# infer MSC sptree with ASTRAL
ast = ipa.astral(
    data=[i.write() for i in toytree.mtree("./trees/n-genealogies.nwk")],
    name="n-astral-genealogy",
)
ast.run()

# draw sptree
atree = toytree.tree(ast.tree).root("r7")
atree.draw(**ts);

# save tree file
atree.write("./trees/n-astral-genealogies.nwk")

[astral.5.7.1.jar]
inferred tree written to (/home/deren/gentime-attraction/notebooks/analysis-astral/n-astral-genealogy.tre)


### Confirm SNAQ-genealogies is not an anomaly

In [29]:
# infer sptree
snaq0 = ipa.snaq(
    gtrees="./trees/n-genealogies.nwk",
    netin="./trees/n-astral-genealogies.nwk",
    name="n-snaq-genealogies-net0",
    nedges=0,
    nruns=10,
    nproc=10,
    seed=123,
)
snaq0.run()

# infer sptree
snaq1 = ipa.snaq(
    gtrees="./trees/n-genealogies.nwk",
    netin="./trees/n-astral-genealogies.nwk",
    name="n-snaq-genealogies-net1",
    nedges=1,
    nruns=10,
    nproc=10,
    seed=123,
)
snaq1.run()

[SNAQ v.x.y]
[nproc = 10]
julia /home/deren/gentime-attraction/notebooks/analysis-snaq/n-snaq-genealogies-net0.jl
inferred network written to (/home/deren/gentime-attraction/notebooks/analysis-snaq/n-snaq-genealogies-net0.snaq)
[SNAQ v.x.y]
[nproc = 10]
julia /home/deren/gentime-attraction/notebooks/analysis-snaq/n-snaq-genealogies-net1.jl
inferred network written to (/home/deren/gentime-attraction/notebooks/analysis-snaq/n-snaq-genealogies-net1.snaq)


In [30]:
snaq1.admix

{'H9': (['r1'], ['r4'], 0.5, {}, '0.01')}

In [31]:
# draw tree
stree = toytree.tree(snaq1.tree).root("r7")
stree.draw(**ts, admixture_edges=snaq1.admix.values());

# write network
stree.write("trees/n-snaq-genealogies.nwk")

### Simulate loci
Do 10K loci and test a few range of locus lengths to allow examining length effect.

In [12]:
# simulate loci that are each 3000bp in length
model = ipcoal.Model(ntree, seed=123)
model.sim_loci(nloci=10000, nsites=2e3)
model.write_loci_to_hdf5(name="n-10000l-2000s", outdir="db")
model.write_snps_to_hdf5(name="n-10000l-2000s", outdir="db")

# draw seqview
model.draw_seqview(0, 0, 50);

wrote 10000 loci to /home/deren/gentime-attraction/notebooks/db/n-10000l-2000s.seqs.hdf5
wrote 17611826 SNPs to /home/deren/gentime-attraction/notebooks/db/n-10000l-2000s.snps.hdf5


### Infer gene trees from loci


In [32]:
# init tree slider inference tool
tool = ipa.treeslider(
    data="./db/n-10000l-2000s.seqs.hdf5",
    name="n-10000l-2000s",
    scaffold_idxs=range(10000),
    inference_args={
        "N": 10,
        "f": "d",
        "p": 12345,
        "x": None,
        "T": 2,
    }
)

# infer all gene trees
tool.ipcluster['cores'] = 20
tool.ipcluster['threads'] = 2
tool.run(auto=True, force=True)

building database: nwindows=10000; minsnps=1
[####################] 100% 1:52:00 | inferring trees 
tree_table written to /home/deren/gentime-attraction/notebooks/analysis-treeslider/n-10000l-2000s.tree_table.csv


In [43]:
# load tree slider (gene tree inference) results
tw = pd.read_csv("./analysis-treeslider/n-10000l-2000s.tree_table.csv", index_col=0)

# load gene trees into toytree multitree object
mtre = toytree.mtree(tw.tree)
mtre.treelist = [i.root("r7") for i in mtre.treelist]
mtre.draw(ts='o', use_edge_lengths=False);

# write list of trees as a tmp file
print(len(mtre), "gene trees")
mtre.write("./trees/n-genetrees.nwk")

10000 gene trees


### Infer SNAQ network from inferred gene trees

In [46]:
snaq0 = ipa.snaq(
    gtrees="./trees/n-genetrees.nwk",
    netin="./trees/n-astral-genealogies.nwk",
    name="n-genetrees-net0",
    nedges=0,
    nruns=10,
    nproc=10,
)
snaq0.run()

snaq1 = ipa.snaq(
    gtrees="./trees/n-genetrees.nwk",
    netin="./trees/n-astral-genealogies.nwk",
    name="n-genetrees-net1",
    nedges=1,
    nruns=10,
    nproc=10,
)
snaq1.run()

using existing CF table: /home/deren/gentime-attraction/notebooks/analysis-snaq/n-genetrees-net1.CFs.csv
[SNAQ v.x.y]
[nproc = 10]
julia /home/deren/gentime-attraction/notebooks/analysis-snaq/n-genetrees-net1.jl
inferred network written to (/home/deren/gentime-attraction/notebooks/analysis-snaq/n-genetrees-net1.snaq)


In [48]:
snaq1.admix

{'H9': (['r2'], ['r1'], 0.5, {}, '0.277')}

In [51]:
# draw tree
stree = toytree.tree(snaq1.tree).root("r7")
stree.draw(**ts, admixture_edges=snaq1.admix.values());

# save tree
stree.write("./trees/n-snaq-genetrees.nwk")

### Infer ASTRAL tree from inferred gene trees

In [52]:
# infer MSC sptree with ASTRAL
ast = ipa.astral(
    data=[i.write() for i in toytree.mtree("./trees/n-genetrees.nwk")],
    name="n-astral-genetrees",
)
ast.run()

# draw sptree
atree = toytree.tree(ast.tree).root("r7")
atree.draw(**ts);

# save tree file
atree.write("./trees/n-astral-genetrees.nwk")

[astral.5.7.1.jar]
inferred tree written to (/home/deren/gentime-attraction/notebooks/analysis-astral/n-astral-genetrees.tre)
