# Database generation
This notebook demonstrates how to use `simcat.Database` objects to simulate a database of invariant matrices representing admixture over all edges of an input topology, and under a range of demographic scenarios. 

In [1]:
import itertools

import h5py
import numpy as np
import simcat
import toytree

### The two tree shapes we will compare

In [2]:
# build one balanced and one imbalanced tree that share size and height
shape_args = {"ntips": 8, "treeheight": 1.0}
t0 = toytree.rtree.baltree(**shape_args)
t1 = toytree.rtree.imbtree(**shape_args)

# draw both trees in a single grid; the trailing ';' hides the return repr
tree_pair = toytree.mtree([t0, t1])
tree_pair.draw_tree_grid(width=400, height=250, x=1, y=2);

### Example simulations

In [3]:
# settings for one stand-alone simulation on the balanced tree `t0`
single_sim_kwargs = dict(
    tree=t0,
    admixture_edges=(1, 2, 0.5, 0.1),
    admixture_type=0,
    theta=1.0,
    nsnps=1000,
    ntests=1,
    nreps=1,
    seed=None,
    debug=False,
    run=True,
)

# build the model (run=True is passed to the constructor) and show the
# shape of the resulting counts array
model = simcat.Model(**single_sim_kwargs)
model.counts.shape

(1, 70, 16, 16)

In [4]:
# settings for a database of many simulations on the balanced tree `t0`
db_settings = {
    "name": "test",
    "workdir": "../database",
    "tree": t0,
    "admix_edge_min": 0.5,
    "admix_edge_max": 0.5,
    "admix_prop_min": 0.05,
    "admix_prop_max": 0.5,
    "nedges": 1,
    "ntests": 1,
    "nreps": 1,
    "nsnps": 1000,
    "theta": 1.0,
    "seed": 123,
    "force": True,
}
db = simcat.Database(**db_settings)

# fill the database
# NOTE(review): auto=True presumably starts/attaches to a parallel cluster
# (the recorded output shows a parallel connection being established) -- confirm
db.run(auto=True)

350 sims: /home/deren/Documents/simcat-eaton-lab/database/test.labels.h5


Box(children=(HTML(value="<span style='font-size:14px; font-family:monospace'>Establishing parallel connection…

Box(children=(HTML(value="<span style='font-size:14px; font-family:monospace'>Parallelization: <i>oud</i>: 4 c…


Keyboard Interrupt by user



### Simulate data for test 1

Here we compare the ability of our methods to detect admixture edges on these trees under scenarios that vary the amount of information and the complexity of the model.

In [None]:
# Sampling settings shared by every database in this sweep. They are also
# written into each database name, so the "t<ntests>-r<nreps>" tag always
# matches what is actually simulated.
NTESTS = 10
NREPS = 10

# map each tree-shape label to the toytree generator that builds it
TREE_BUILDERS = {
    "imb": toytree.rtree.imbtree,
    "bal": toytree.rtree.baltree,
}

# every combination of tree shape, tree size, SNP count and theta; product()
# varies the last factor fastest, i.e. the same order as four nested loops
scenarios = itertools.product(
    ["imb", "bal"],      # treeshape
    [8, 16],             # ntips
    [10000, 20000],      # nsnps
    [1.0, 0.1],          # theta
)

for treeshape, ntips, nsnps, theta in scenarios:

    # set name, e.g. "imb-tr8-t10-r10-s10000-th1.0"
    name = "{}-tr{}-t{}-r{}-s{}-th{}".format(
        treeshape, ntips, NTESTS, NREPS, nsnps, theta,
    )

    # get tree
    tree = TREE_BUILDERS[treeshape](ntips=ntips, treeheight=3.0)

    # init database (construction only; simulations are started below)
    # NOTE(review): no `run` flag is passed here, matching the executed
    # Database example earlier in this notebook -- confirm against the
    # simcat.Database signature.
    db = simcat.Database(
        name=name,
        workdir="../database",
        tree=tree,
        nedges=1,
        ntests=NTESTS,
        nreps=NREPS,
        nsnps=nsnps,
        theta=theta,
        seed=123,
        force=True,
    )

    # run on parallel client, using the same call as the executed example
    # earlier in this notebook
    db.run(auto=True)

### An example dataset

In [53]:
# print the labels stored for simulation 0, followed by two file-level
# attributes of the labels database
# Open read-only: nothing is written here, and h5py releases before 3.0
# default to append mode (read/write, create if missing) when no mode is given.
with h5py.File(db.labels, "r") as io5:

    # first entry of each per-simulation label dataset
    for key in (
        "thetas",
        "admix_sources",
        "admix_targets",
        "admix_times",
        "admix_props",
    ):
        print(io5[key][0])

    # attributes stored on the file itself
    for attr in ("tree", "nsnps"):
        print(io5.attrs[attr])

0.39901473699216816
[4]
[7]
[2.625]
[0.49896081]
(4:3,(3:2.25,(2:1.5,(1:0.75,0:0.75)1:0.75)1:0.75)1:0.75);
1000
