In [1]:
import itertools
import pandas as pd
import numpy as np
import toytree
import toyplot
import arviz as az
import pymc3 as pm
from pymc3.distributions.dist_math import normal_lccdf, normal_lcdf
import sproc

In [2]:
# Load the calibrated tree, then prune it step by step: individually named
# tips first, then every tip matching an outgroup / accession wildcard, and
# finally a second list of individually named tips. The tree used by the rest
# of the notebook is `gtree`.
tree = toytree.tree("/home/henry/oaks-thesis/moto/newick-cal-crown-lambda0_basic_scaffold05.tree")

# first batch of tips removed by exact name
first_drop_names = [
    'Quercus|Quercus|Leucomexicana|Q.laeta',
    'Quercus|Lobatae|Erythromexicana|Q.conzattii',
    'Quercus|Quercus|Leucomexicana|Q.arizonica',
    'SRR5984321',
    'SRR5632417',
    'SRR5632562',
]
atree = tree.drop_tips(names=first_drop_names)

# wildcard-based removals, one intermediate tree per pattern
btree = atree.drop_tips(wildcard="Chrysolepis")
ctree = btree.drop_tips(wildcard="Notholithocarpus")
dtree = ctree.drop_tips(wildcard="Lithocarpus")
etree = dtree.drop_tips(wildcard="SRR")
ftree = etree.drop_tips(wildcard="reference")

# second batch of tips removed by exact name
final_drop_names = [
    'Quercus|Quercus|Leucomexicana|Q.species',
    'Quercus|Quercus|Roburoids|Q.vulcanica',
    'Quercus|Quercus|Roburoids|Q.imeretina',
    'Quercus|Virentes|nan|Q.sagraeana',
    'Quercus|Lobatae|Erythromexicana|Q.lowilliamsii',
    'Quercus|Lobatae|Agrifoliae|Q.oxyadenia',
    'Quercus|Quercus|Roburoids|Q.kotschyana',
    'Quercus|Quercus|Roburoids|Q.cedrorum',
    'Quercus|Quercus|Dumosae|Q.pacifica',
    'Quercus|Lobatae|Erythromexicana|Q.sartorii',
    'Quercus|Lobatae|Agrifoliae|Q.tamalpaiensis',
    'Quercus|Lobatae|Agrifoliae|Q.shrevei',
    'Cerris|Cyclobalanopsis|Semiserrata|Q.litoralis',
    'Cerris|Cyclobalanopsis|Acuta|Q.ciliaris',
    'Cerris|Cyclobalanopsis|Acuta|Q.stewardiana',
    'Cerris|Cyclobalanopsis|Semiserrata|Q.patelliformis',
    'Cerris|Cyclobalanopsis|Glauca|Q.multinervis',
    'Cerris|Ilex|Himalayansubalpine|Q.sp.nov.',
]
gtree = ftree.drop_tips(names=final_drop_names)

In [390]:
# True (data-generating) parameter values, given as (mean, std) pairs.
𝛼_mean, 𝛼_std = 0.05, 0.005   # intercept
𝛽_mean, 𝛽_std = 1.5, 0.05     # base slope
𝜓_mean, 𝜓_std = 0.0, 0.33     # species-level effect on the slope
𝜎_std = 0.05                  # std of the zero-centred observation error

In [391]:
# Clade-level effects on the rate of RI, one (mean, std) pair for each of the
# eight clades (used to simulate the partial-pooling data).
𝜓_Quercus_mean, 𝜓_Quercus_std = 0.8, 0.15
𝜓_Virentes_mean, 𝜓_Virentes_std = -1.0, 0.1
𝜓_Ponticae_mean, 𝜓_Ponticae_std = -1.0, 0.1
𝜓_Protobalanus_mean, 𝜓_Protobalanus_std = -0.5, 0.05
𝜓_Lobatae_mean, 𝜓_Lobatae_std = 1.0, 0.2
𝜓_Cyclobalanopsis_mean, 𝜓_Cyclobalanopsis_std = 1.0, 0.2
𝜓_Ilex_mean, 𝜓_Ilex_std = -0.2, 0.05
𝜓_Cerris_mean, 𝜓_Cerris_std = -0.2, 0.05

In [5]:
# Crown (MRCA) node index of each of the eight clades, looked up by a
# tip-label wildcard and kept in the same order as the wildcards below.
crown_wildcards = (
    "Quercus|Quercus",
    "Quercus|Virentes",
    "Quercus|Ponticae",
    "Quercus|Protobalanus",
    "Quercus|Lobatae",
    "Cerris|Cyclobalanopsis",
    "Cerris|Ilex",
    "Cerris|Cerris",
)
crowns = [
    gtree.get_mrca_idx_from_tip_labels(wildcard=pattern)
    for pattern in crown_wildcards
]
crowns

[393, 394, 405, 417, 425, 426, 422, 423]

In [6]:
# Map each clade name to its integer index (0-7), in listing order.
clades = {
    clade_name: clade_index
    for clade_index, clade_name in enumerate([
        "Quercus",
        "Virentes",
        "Ponticae",
        "Protobalanus",
        "Lobatae",
        "Cyclobalanopsis",
        "Ilex",
        "Cerris",
    ])
}

In [392]:
# Total number of tips in the pruned tree.
tips = len(gtree.get_tip_labels())

# Number of tips below each clade's crown node. The crown node index is read
# from `crowns` (via the clade -> position map `clades`) instead of being
# hard-coded, so the counts stay correct if the tree or the pruning changes.
Cerris_tips = len(gtree.get_tip_labels(crowns[clades["Cerris"]]))
Ilex_tips = len(gtree.get_tip_labels(crowns[clades["Ilex"]]))
Cyclobalanopsis_tips = len(gtree.get_tip_labels(crowns[clades["Cyclobalanopsis"]]))
Lobatae_tips = len(gtree.get_tip_labels(crowns[clades["Lobatae"]]))
Protobalanus_tips = len(gtree.get_tip_labels(crowns[clades["Protobalanus"]]))
Ponticae_tips = len(gtree.get_tip_labels(crowns[clades["Ponticae"]]))
Virentes_tips = len(gtree.get_tip_labels(crowns[clades["Virentes"]]))
Quercus_tips = len(gtree.get_tip_labels(crowns[clades["Quercus"]]))

In [393]:
# Dedicated, seeded generator so the simulated "true" species parameters are
# identical after every kernel restart (the legacy global np.random state used
# previously was never seeded).
SPECIES_RNG = np.random.default_rng(42)

# (mean, std, number of tips) of the clade-level effect, listed in clade-index
# order 0..7; the position in this list is the clade's `gidx`.
clade_effects = [
    (𝜓_Quercus_mean, 𝜓_Quercus_std, Quercus_tips),
    (𝜓_Virentes_mean, 𝜓_Virentes_std, Virentes_tips),
    (𝜓_Ponticae_mean, 𝜓_Ponticae_std, Ponticae_tips),
    (𝜓_Protobalanus_mean, 𝜓_Protobalanus_std, Protobalanus_tips),
    (𝜓_Lobatae_mean, 𝜓_Lobatae_std, Lobatae_tips),
    (𝜓_Cyclobalanopsis_mean, 𝜓_Cyclobalanopsis_std, Cyclobalanopsis_tips),
    (𝜓_Ilex_mean, 𝜓_Ilex_std, Ilex_tips),
    (𝜓_Cerris_mean, 𝜓_Cerris_std, Cerris_tips),
]

# One row per tip; row k describes tree node idx k.
SPECIES_DATA = pd.DataFrame({
    # "Quercus <epithet>", where the epithet is the text after the last "."
    # of the last "|"-separated field of the tip name
    "Species": [
        "Quercus " + gtree.idx_dict[idx].name.split("|")[-1].split(".")[-1]
        for idx in range(tips)
    ],
    # species-level base slope
    "𝛽": SPECIES_RNG.normal(𝛽_mean, 𝛽_std, tips),
    # species effect drawn from one shared distribution
    "𝜓": SPECIES_RNG.normal(𝜓_mean, 𝜓_std, tips),
    # species effect drawn from the distribution of the species' clade;
    # blocks are concatenated in clade-index order, so this assumes the tips
    # are numbered clade by clade in that same order
    "𝜓_x": np.concatenate([
        SPECIES_RNG.normal(effect_mean, effect_std, n_clade_tips)
        for effect_mean, effect_std, n_clade_tips in clade_effects
    ]),
    # clade index of every species, laid out like "𝜓_x"
    "gidx": np.concatenate([
        np.repeat(clade_index, n_clade_tips)
        for clade_index, (_, _, n_clade_tips) in enumerate(clade_effects)
    ]),
})

# species RI velocity is the base velocity plus the species-specific effect
SPECIES_DATA["velo"] = SPECIES_DATA["𝛽"] + SPECIES_DATA["𝜓"]
SPECIES_DATA["velo_x"] = SPECIES_DATA["𝛽"] + SPECIES_DATA["𝜓_x"]
SPECIES_DATA.head()

Unnamed: 0,Species,𝛽,𝜓,𝜓_x,gidx,velo,velo_x
0,Quercus aj...,1.519878,0.414604,0.779744,0,1.934482,2.299622
1,Quercus tu...,1.587244,-0.024202,0.538587,0,1.563042,2.125831
2,Quercus to...,1.520985,-0.057836,0.80329,0,1.463149,2.324275
3,Quercus ob...,1.49118,0.218928,0.833209,0,1.710108,2.324389
4,Quercus ch...,1.508965,0.196346,0.763123,0,1.705311,2.272088


In [17]:
# Build one Sproc object per species (per the log below this fetches occurrence
# records, drops outliers and writes a JSON file into workdir).
ranges = [sproc.Sproc(species=spp, workdir="/tmp", scalar=2.5) for spp in SPECIES_DATA['Species']]

# Attach the objects to the species table. Later cells read
# SPECIES_DATA['sproc'] to test range overlap, but no cell assigned that
# column, so a fresh "Restart & Run All" failed there.
SPECIES_DATA["sproc"] = ranges

12:54 | INFO    | [1m[35m__init__       [0m[1m[0m | [1mfetched 39 occurrence records[0m
12:54 | INFO    | [1m[35m_mark_outliers [0m[1m[0m | [1mdropped outliers: 0[0m
12:54 | INFO    | [1m[35mwrite          [0m[1m[0m | [1mwrote data to /tmp/Quercus_ajoensis.json[0m
12:54 | INFO    | [1m[35m__init__       [0m[1m[0m | [1mfetched 1060 occurrence records[0m
12:54 | INFO    | [1m[35m_mark_outliers [0m[1m[0m | [1mdropped outliers: 37[0m
12:54 | INFO    | [1m[35mwrite          [0m[1m[0m | [1mwrote data to /tmp/Quercus_turbinella.json[0m
12:54 | INFO    | [1m[35m__init__       [0m[1m[0m | [1mfetched 177 occurrence records[0m
12:54 | INFO    | [1m[35m_mark_outliers [0m[1m[0m | [1mdropped outliers: 2[0m
12:54 | INFO    | [1m[35mwrite          [0m[1m[0m | [1mwrote data to /tmp/Quercus_toumeyi.json[0m
12:54 | INFO    | [1m[35m__init__       [0m[1m[0m | [1mfetched 302 occurrence records[0m
12:54 | INFO    | [1m[35m_mark_outliers

12:56 | INFO    | [1m[35m_mark_outliers [0m[1m[0m | [1mdropped outliers: 4[0m
12:56 | INFO    | [1m[35mwrite          [0m[1m[0m | [1mwrote data to /tmp/Quercus_germana.json[0m
12:56 | INFO    | [1m[35m__init__       [0m[1m[0m | [1mfetched 249 occurrence records[0m
12:56 | INFO    | [1m[35m_mark_outliers [0m[1m[0m | [1mdropped outliers: 1[0m
12:56 | INFO    | [1m[35mwrite          [0m[1m[0m | [1mwrote data to /tmp/Quercus_martinezii.json[0m
12:56 | INFO    | [1m[35m__init__       [0m[1m[0m | [1mfetched 914 occurrence records[0m
12:56 | INFO    | [1m[35m_mark_outliers [0m[1m[0m | [1mdropped outliers: 341[0m
12:56 | INFO    | [1m[35mwrite          [0m[1m[0m | [1mwrote data to /tmp/Quercus_stellata.json[0m
12:56 | INFO    | [1m[35m__init__       [0m[1m[0m | [1mfetched 7 occurrence records[0m
12:56 | INFO    | [1m[35m_mark_outliers [0m[1m[0m | [1mdropped outliers: 0[0m
12:56 | INFO    | [1m[35mwrite          [0m[1m[0

12:59 | INFO    | [1m[35m__init__       [0m[1m[0m | [1mfetched 3513 occurrence records[0m
12:59 | INFO    | [1m[35m_mark_outliers [0m[1m[0m | [1mdropped outliers: 3279[0m
12:59 | INFO    | [1m[35mwrite          [0m[1m[0m | [1mwrote data to /tmp/Quercus_petraea.json[0m
12:59 | INFO    | [1m[35m__init__       [0m[1m[0m | [1mfetched 916 occurrence records[0m
12:59 | INFO    | [1m[35m_mark_outliers [0m[1m[0m | [1mdropped outliers: 269[0m
12:59 | INFO    | [1m[35mwrite          [0m[1m[0m | [1mwrote data to /tmp/Quercus_fabri.json[0m
12:59 | INFO    | [1m[35m__init__       [0m[1m[0m | [1mfetched 1376 occurrence records[0m
12:59 | INFO    | [1m[35m_mark_outliers [0m[1m[0m | [1mdropped outliers: 1169[0m
12:59 | INFO    | [1m[35mwrite          [0m[1m[0m | [1mwrote data to /tmp/Quercus_aliena.json[0m
12:59 | INFO    | [1m[35m__init__       [0m[1m[0m | [1mfetched 172 occurrence records[0m
12:59 | INFO    | [1m[35m_mark_outlie

01:01 | INFO    | [1m[35mwrite          [0m[1m[0m | [1mwrote data to /tmp/Quercus_sadleriana.json[0m
01:02 | INFO    | [1m[35m__init__       [0m[1m[0m | [1mfetched 2142 occurrence records[0m
01:02 | INFO    | [1m[35m_mark_outliers [0m[1m[0m | [1mdropped outliers: 15[0m
01:02 | INFO    | [1m[35mwrite          [0m[1m[0m | [1mwrote data to /tmp/Quercus_chrysolepis.json[0m
01:02 | INFO    | [1m[35m__init__       [0m[1m[0m | [1mfetched 403 occurrence records[0m
01:02 | INFO    | [1m[35m_mark_outliers [0m[1m[0m | [1mdropped outliers: 10[0m
01:02 | INFO    | [1m[35mwrite          [0m[1m[0m | [1mwrote data to /tmp/Quercus_tomentella.json[0m
01:02 | INFO    | [1m[35m__init__       [0m[1m[0m | [1mfetched 545 occurrence records[0m
01:02 | INFO    | [1m[35m_mark_outliers [0m[1m[0m | [1mdropped outliers: 2[0m
01:02 | INFO    | [1m[35mwrite          [0m[1m[0m | [1mwrote data to /tmp/Quercus_vacciniifolia.json[0m
01:02 | INFO    | 

01:03 | INFO    | [1m[35m__init__       [0m[1m[0m | [1mfetched 216 occurrence records[0m
01:03 | INFO    | [1m[35m_mark_outliers [0m[1m[0m | [1mdropped outliers: 6[0m
01:03 | INFO    | [1m[35mwrite          [0m[1m[0m | [1mwrote data to /tmp/Quercus_durifolia.json[0m
01:03 | INFO    | [1m[35m__init__       [0m[1m[0m | [1mfetched 554 occurrence records[0m
01:03 | INFO    | [1m[35m_mark_outliers [0m[1m[0m | [1mdropped outliers: 24[0m
01:03 | INFO    | [1m[35mwrite          [0m[1m[0m | [1mwrote data to /tmp/Quercus_scytophylla.json[0m
01:03 | INFO    | [1m[35m__init__       [0m[1m[0m | [1mfetched 1121 occurrence records[0m
01:03 | INFO    | [1m[35m_mark_outliers [0m[1m[0m | [1mdropped outliers: 22[0m
01:03 | INFO    | [1m[35mwrite          [0m[1m[0m | [1mwrote data to /tmp/Quercus_acutifolia.json[0m
01:04 | INFO    | [1m[35m__init__       [0m[1m[0m | [1mfetched 887 occurrence records[0m
01:04 | INFO    | [1m[35m_mark_o

01:05 | INFO    | [1m[35mwrite          [0m[1m[0m | [1mwrote data to /tmp/Quercus_pagoda.json[0m
01:05 | INFO    | [1m[35m__init__       [0m[1m[0m | [1mfetched 38 occurrence records[0m
01:05 | INFO    | [1m[35m_mark_outliers [0m[1m[0m | [1mdropped outliers: 3[0m
01:05 | INFO    | [1m[35mwrite          [0m[1m[0m | [1mwrote data to /tmp/Quercus_buckleyi.json[0m
01:05 | INFO    | [1m[35m__init__       [0m[1m[0m | [1mfetched 697 occurrence records[0m
01:05 | INFO    | [1m[35m_mark_outliers [0m[1m[0m | [1mdropped outliers: 187[0m
01:05 | INFO    | [1m[35mwrite          [0m[1m[0m | [1mwrote data to /tmp/Quercus_coccinea.json[0m
01:06 | INFO    | [1m[35m__init__       [0m[1m[0m | [1mfetched 2816 occurrence records[0m
01:06 | INFO    | [1m[35m_mark_outliers [0m[1m[0m | [1mdropped outliers: 790[0m
01:06 | INFO    | [1m[35mwrite          [0m[1m[0m | [1mwrote data to /tmp/Quercus_rubra.json[0m
01:06 | INFO    | [1m[35m__init_

01:07 | INFO    | [1m[35m_mark_outliers [0m[1m[0m | [1mdropped outliers: 476[0m
01:07 | INFO    | [1m[35mwrite          [0m[1m[0m | [1mwrote data to /tmp/Quercus_myrsinifolia.json[0m
01:07 | INFO    | [1m[35m__init__       [0m[1m[0m | [1mfetched 15 occurrence records[0m
01:07 | INFO    | [1m[35m_mark_outliers [0m[1m[0m | [1mdropped outliers: 0[0m
01:07 | INFO    | [1m[35mwrite          [0m[1m[0m | [1mwrote data to /tmp/Quercus_kouangsiensis.json[0m
01:07 | INFO    | [1m[35m__init__       [0m[1m[0m | [1mfetched 9 occurrence records[0m
01:07 | INFO    | [1m[35m_mark_outliers [0m[1m[0m | [1mdropped outliers: 0[0m
01:07 | INFO    | [1m[35mwrite          [0m[1m[0m | [1mwrote data to /tmp/Quercus_chrysocalyx.json[0m
01:07 | INFO    | [1m[35m__init__       [0m[1m[0m | [1mfetched 29 occurrence records[0m
01:07 | INFO    | [1m[35m_mark_outliers [0m[1m[0m | [1mdropped outliers: 16[0m
01:07 | INFO    | [1m[35mwrite          

01:09 | INFO    | [1m[35m__init__       [0m[1m[0m | [1mfetched 126 occurrence records[0m
01:09 | INFO    | [1m[35m_mark_outliers [0m[1m[0m | [1mdropped outliers: 0[0m
01:09 | INFO    | [1m[35mwrite          [0m[1m[0m | [1mwrote data to /tmp/Quercus_baronii.json[0m
01:09 | INFO    | [1m[35m__init__       [0m[1m[0m | [1mfetched 138 occurrence records[0m
01:09 | INFO    | [1m[35m_mark_outliers [0m[1m[0m | [1mdropped outliers: 5[0m
01:09 | INFO    | [1m[35mwrite          [0m[1m[0m | [1mwrote data to /tmp/Quercus_phillyreoides.json[0m
01:09 | INFO    | [1m[35m__init__       [0m[1m[0m | [1mfetched 138 occurrence records[0m
01:09 | INFO    | [1m[35m_mark_outliers [0m[1m[0m | [1mdropped outliers: 4[0m
01:09 | INFO    | [1m[35mwrite          [0m[1m[0m | [1mwrote data to /tmp/Quercus_franchetii.json[0m
01:09 | INFO    | [1m[35m__init__       [0m[1m[0m | [1mfetched 57 occurrence records[0m
01:09 | INFO    | [1m[35m_mark_outli

In [399]:
# Inspect the species table (pandas truncates the display to the first/last rows).
SPECIES_DATA

Unnamed: 0,Species,𝛽,𝜓,𝜓_x,gidx,velo,velo_x,sproc
0,Quercus aj...,1.519878,0.414604,0.779744,0,1.934482,2.299622,<Sproc spp...
1,Quercus tu...,1.587244,-0.024202,0.538587,0,1.563042,2.125831,<Sproc spp...
2,Quercus to...,1.520985,-0.057836,0.803290,0,1.463149,2.324275,<Sproc spp...
3,Quercus ob...,1.491180,0.218928,0.833209,0,1.710108,2.324389,<Sproc spp...
4,Quercus ch...,1.508965,0.196346,0.763123,0,1.705311,2.272088,<Sproc spp...
...,...,...,...,...,...,...,...,...
211,Quercus ma...,1.585490,0.316306,-0.260218,7,1.901796,1.325273,<Sproc spp...
212,Quercus br...,1.488416,0.105489,-0.213626,7,1.593905,1.274790,<Sproc spp...
213,Quercus it...,1.480673,0.529441,-0.199932,7,2.010115,1.280742,<Sproc spp...
214,Quercus ac...,1.481948,0.109845,-0.195077,7,1.591793,1.286871,<Sproc spp...


In [400]:
# Number of simulated crosses, and the seeded generator used to draw them
# (and the simulated intercepts/errors) reproducibly.
NSAMPLES = 5_000
RNG = np.random.default_rng(seed=123)

In [401]:
# sample random cross pairs (tip labels), one pair per row
crosses = RNG.choice(gtree.get_tip_labels(), size=(NSAMPLES, 2))

# translate names to indices
name2nidx = gtree.get_feature_dict("name", "idx")

# node idx of both parents of every cross; also the row label of each species
# in SPECIES_DATA
cross_idx0 = [name2nidx[name] for name in crosses[:, 0]]
cross_idx1 = [name2nidx[name] for name in crosses[:, 1]]

# Do the geographic ranges of the two parents intersect? This is computed
# before the frame is built: the previous version indexed into `data` from
# inside the `data` constructor, which only worked when a stale `data` was
# still alive in the kernel.
geo_overlap = [
    SPECIES_DATA.loc[idx0, "sproc"].georange.intersects(
        SPECIES_DATA.loc[idx1, "sproc"].georange
    )
    for idx0, idx1 in zip(cross_idx0, cross_idx1)
]

# one row per cross; genetic_dist is a placeholder filled in by a later cell
data = pd.DataFrame({
    "sp0": crosses[:, 0],
    "sp1": crosses[:, 1],
    "sidx0": cross_idx0,
    "sidx1": cross_idx1,
    "genetic_dist": 0.,
    "geo_overlap": geo_overlap,
})

In [338]:
def get_dist(tree, idx0, idx1):
    """
    Return the distance between two nodes of `tree`.

    Both nodes are looked up by their idx in `tree.idx_dict` and passed to
    `tree.treenode.get_distance`.
    """
    node_a = tree.idx_dict[idx0]
    node_b = tree.idx_dict[idx1]
    return tree.treenode.get_distance(node_a, node_b)

In [402]:
# calc genetic distances (half the node-to-node distance of each cross) and
# min-max normalize them to the range [0, 1]
data['genetic_dist'] = [
    get_dist(gtree, idx0, idx1) / 2
    for idx0, idx1 in zip(data['sidx0'], data['sidx1'])
]
# shift so the smallest distance is 0 (the previous code *added* the minimum,
# which does not map the values onto 0..1 unless the minimum is already 0)
data['genetic_dist'] -= data['genetic_dist'].min()
data['genetic_dist'] /= data['genetic_dist'].max()

In [403]:
# Per-cross intercept and observation error, shared by all three simulated
# response columns.
intercept = RNG.normal(𝛼_mean, 𝛼_std, data.shape[0])
error = RNG.normal(0.0, 𝜎_std, data.shape[0])
offset = intercept + error

# RI = genetic distance * slope of the first parent (sidx0) + intercept + error.
# Each response column uses a different slope column of SPECIES_DATA.
# NOTE(review): only sidx0's slope enters the simulation, whereas the models
# fitted further down add an effect for both sidx0 and sidx1 - confirm intent.
for response_col, slope_col in (
    ('RI_pooled', "𝛽"),
    ('RI_unpooled', "velo"),
    ('RI_groups', "velo_x"),
):
    slopes = SPECIES_DATA.loc[data.sidx0, slope_col].values
    data[response_col] = data.genetic_dist * slopes + offset

In [404]:
# censor every simulated response to the range 0-1: values below 0 become 0,
# values above 1 become 1
for response_col in ('RI_pooled', 'RI_unpooled', 'RI_groups'):
    data[response_col] = data[response_col].clip(lower=0, upper=1)

In [176]:
# store the overlap flag as a 64-bit integer column instead of booleans
data['geo_overlap'] = data['geo_overlap'].astype("int64")

In [405]:
# Inspect the simulated crosses (pandas truncates the display to the first/last rows).
data

Unnamed: 0,sp0,sp1,sidx0,sidx1,genetic_dist,geo_overlap,RI_pooled,RI_unpooled,RI_groups
0,Quercus|Qu...,Quercus|Lo...,3,147,0.927411,False,1.0,1.000000,1.0
1,Quercus|Lo...,Quercus|Qu...,128,11,0.927411,False,1.0,1.000000,1.0
2,Cerris|Ile...,Quercus|Qu...,196,47,1.000000,False,1.0,1.000000,1.0
3,Quercus|Qu...,Quercus|Qu...,55,39,0.803571,False,1.0,1.000000,1.0
4,Quercus|Qu...,Quercus|Qu...,72,37,0.803571,False,1.0,1.000000,1.0
...,...,...,...,...,...,...,...,...,...
4995,Cerris|Cyc...,Quercus|Qu...,182,33,1.000000,False,1.0,1.000000,1.0
4996,Quercus|Lo...,Quercus|Qu...,134,39,0.927411,True,1.0,1.000000,1.0
4997,Quercus|Qu...,Quercus|Lo...,25,121,0.927411,False,1.0,1.000000,1.0
4998,Quercus|Lo...,Quercus|Qu...,129,60,0.927411,True,1.0,0.951425,1.0


In [84]:
def toytrace(trace, var_names, titles=None):
    """
    Plot posterior histograms with toyplot, one stacked panel per parameter.

    Parameters
    ----------
    trace : object with a `get_values(name)` method returning the posterior
        draws of a variable, draws along the first axis.
    var_names : sequence of str
        Names of the variables to plot, one panel each.
    titles : sequence of str, optional
        Label shown under each panel; defaults to `var_names`.

    Returns
    -------
    (canvas, axes) : the toyplot canvas and the list of cartesian axes.
    """
    if titles is None:
        titles = var_names
    nvars = len(var_names)

    # setup canvas: 200px of height per parameter
    canvas = toyplot.Canvas(width=500, height=200 * nvars)

    # store axes
    axes = []

    # iter over params
    for pidx, param in enumerate(var_names):

        # get param posterior; scalar variables come back 1-D, so give them a
        # single column so the per-column loop below also works for them
        posterior = np.asarray(trace.get_values(param))
        if posterior.ndim == 1:
            posterior = posterior[:, np.newaxis]

        # setup axes
        ax = canvas.cartesian(grid=(nvars, 1, pidx))
        ax.y.show = False
        ax.x.spine.style = {"stroke-width": 1.5}
        ax.x.ticks.labels.style = {"font-size": "12px"}
        ax.x.ticks.show = True
        ax.x.label.text = f"param='{titles[pidx]}'"

        # one histogram outline per element of the parameter, drawn at the
        # right-hand edge of each of the 100 bins
        for idx in range(posterior.shape[1]):
            mags, bins = np.histogram(posterior[:, idx], bins=100)
            ax.plot(bins[1:], mags, stroke_width=2, opacity=0.6)
        axes.append(ax)
    return canvas, axes

In [85]:
def censored_pooled_regression(x, y, **kwargs):
    """
    Fit a fully pooled linear regression of y on x with censoring at 0 and 1.

    Observations with 0 < y < 1 enter through a Normal likelihood; observations
    with y <= 0 or y >= 1 contribute the log-CDF / log-complementary-CDF of the
    same Normal, evaluated at their stored value.

    Parameters
    ----------
    x, y : pandas Series sharing the same index (predictor and response).
    **kwargs : forwarded to `pm.sample`. The result is sliced, so the sampler
        must return a sliceable trace (the notebook passes
        return_inferencedata=False).

    Returns
    -------
    dict with keys 'model' (the pm.Model), 'trace' (posterior draws after
    dropping the first 1000) and 'stats' (`pm.summary` of that trace).
    """
    # data pre-processing: split the observations by censoring status
    lower_censored = y[y <= 0].index
    _x_lc = x[lower_censored].values
    _y_lc = y[lower_censored].values

    upper_censored = y[y >= 1].index
    _x_uc = x[upper_censored].values
    _y_uc = y[upper_censored].values

    uncensored = (y > 0) & (y < 1)
    _x = x[uncensored].values
    _y = y[uncensored].values

    # define model
    with pm.Model() as model:

        # parameters and error
        𝛼 = pm.Normal('𝛼', mu=0., sigma=10., shape=1)
        𝛽 = pm.Normal('𝛽', mu=0., sigma=10., shape=1)
        𝜎 = pm.HalfNormal('𝜎', 5.0, shape=1)

        # linear model prediction
        ri = 𝛼 + 𝛽 * _x

        # data likelihood (normal distributed errors)
        pm.Normal("y", mu=ri, sigma=𝜎, observed=_y)

        # density of censored data. Test the NUMBER of censored rows: the
        # previous `sum(index)` added up the row labels, which is 0 (falsy)
        # when the only censored row carries label 0.
        if len(lower_censored):
            pm.Potential(
                "lower_censored",
                normal_lcdf(𝛼 + 𝛽 * _x_lc, 𝜎, _y_lc),
            )
        if len(upper_censored):
            pm.Potential(
                "upper_censored",
                normal_lccdf(𝛼 + 𝛽 * _x_uc, 𝜎, _y_uc),
            )

        # sample posterior, then drop the first 1000 returned draws as burn-in
        trace = pm.sample(**kwargs)[1000:]

        # summary table
        stats = pm.summary(trace)

    # organize results
    result_dict = {
        'model': model,
        'trace': trace,
        'stats': stats,
    }
    return result_dict

In [365]:
def censored_unpooled_noncentered_regression(x, y, idx0, idx1, **kwargs):
    """
    Fit a censored regression whose slope has one additive effect per species.

    The slope of an observation is 𝛽 + 𝜓[idx0] + 𝜓[idx1], with the
    non-centered parameterization 𝜓 = 𝜓_mean + 𝜓_std * 𝜓_offset. 𝜓 has one
    entry per tip; its length is taken from the module-level `tips`.
    Observations with y <= 0 or y >= 1 are treated as censored and contribute
    a log-CDF / log-complementary-CDF term instead of a Normal density.

    Parameters
    ----------
    x, y : pandas Series sharing the same index (predictor and response).
    idx0, idx1 : pandas Series of species indices for the two parents of each
        observation. They are indexed positionally with y's row labels, so
        this assumes a default 0..n-1 index - TODO confirm for other inputs.
    **kwargs : forwarded to `pm.sample`. The result is sliced, so the sampler
        must return a sliceable trace (the notebook passes
        return_inferencedata=False).

    Returns
    -------
    dict with keys 'model', 'trace' (posterior draws after dropping the first
    1000) and 'stats' (`pm.summary` of that trace).
    """
    # data pre-processing: split the observations by censoring status
    lower_censored = y[y <= 0].index
    _x_lc = x[lower_censored].values
    _y_lc = y[lower_censored].values

    upper_censored = y[y >= 1].index
    _x_uc = x[upper_censored].values
    _y_uc = y[upper_censored].values

    uncensored = (y > 0) & (y < 1)
    _x = x[uncensored].values
    _y = y[uncensored].values

    # define model
    with pm.Model() as model:

        # species indexers for the uncensored / upper- / lower-censored rows
        sidx0 = pm.Data("spp_idx0", idx0.values[uncensored])
        sidx1 = pm.Data("spp_idx1", idx1.values[uncensored])
        sidx0_u = pm.Data("sidx0_u", idx0.values[upper_censored])
        sidx1_u = pm.Data("sidx1_u", idx1.values[upper_censored])
        sidx0_l = pm.Data("sidx0_l", idx0.values[lower_censored])
        sidx1_l = pm.Data("sidx1_l", idx1.values[lower_censored])

        # parameters and error
        𝜓_mean = pm.Normal('𝜓_mean', mu=0., sigma=5., shape=1)
        𝜓_std = pm.HalfNormal('𝜓_std', 5., shape=1)
        𝜓_offset = pm.Normal('𝜓_offset', mu=0, sigma=1., shape=tips)
        𝜓 = pm.Deterministic('𝜓', 𝜓_mean + 𝜓_std * 𝜓_offset)
        𝛽 = pm.Normal('𝛽', mu=0., sigma=10., shape=1)
        𝛼 = pm.Normal('𝛼', mu=0., sigma=10., shape=1)
        𝜎 = pm.HalfNormal('𝜎', 5., shape=1)

        # linear model prediction
        ri = 𝛼 + (𝛽 + 𝜓[sidx0] + 𝜓[sidx1]) * _x

        # data likelihood (normal distributed errors)
        pm.Normal("y", mu=ri, sigma=𝜎, observed=_y)

        # density of censored data. Test the NUMBER of censored rows: the
        # previous `sum(index)` added up the row labels, which is 0 (falsy)
        # when the only censored row carries label 0.
        if len(lower_censored):
            pm.Potential(
                "lower_censored",
                normal_lcdf(𝛼 + (𝛽 + 𝜓[sidx0_l] + 𝜓[sidx1_l]) * _x_lc, 𝜎, _y_lc),
            )

        if len(upper_censored):
            pm.Potential(
                "upper_censored",
                normal_lccdf(𝛼 + (𝛽 + 𝜓[sidx0_u] + 𝜓[sidx1_u]) * _x_uc, 𝜎, _y_uc),
            )

        # sample posterior, then drop the first 1000 returned draws as burn-in
        trace = pm.sample(**kwargs)[1000:]

        # summary table
        stats = pm.summary(trace)

    # organize results
    result_dict = {
        'model': model,
        'trace': trace,
        'stats': stats,
    }
    return result_dict

In [406]:
def censored_partpooled_noncentered_regression(x, y, idx0, idx1, gidx, **kwargs):
    """
    Fit a censored regression with species effects partially pooled by clade.

    The slope of an observation is 𝛽 + 𝜓[idx0] + 𝜓[idx1]. Each species effect
    is drawn around the mean/std of its clade (non-centered):
    𝜓 = 𝜓_mean[gidx] + 𝜓_std[gidx] * 𝜓_offset, with 8 clades and one offset
    per tip (length taken from the module-level `tips`). Observations with
    y <= 0 or y >= 1 are treated as censored and contribute a log-CDF /
    log-complementary-CDF term instead of a Normal density.

    Parameters
    ----------
    x, y : pandas Series sharing the same index (predictor and response).
    idx0, idx1 : pandas Series of species indices for the two parents of each
        observation. They are indexed positionally with y's row labels, so
        this assumes a default 0..n-1 index - TODO confirm for other inputs.
    gidx : clade index (0-7) of every species, one entry per tip.
    **kwargs : forwarded to `pm.sample`. The result is sliced, so the sampler
        must return a sliceable trace (the notebook passes
        return_inferencedata=False).

    Returns
    -------
    dict with keys 'model', 'trace' (posterior draws after dropping the first
    1000) and 'stats' (`pm.summary` of that trace).
    """
    # data pre-processing: split the observations by censoring status
    lower_censored = y[y <= 0].index
    _x_lc = x[lower_censored].values
    _y_lc = y[lower_censored].values

    upper_censored = y[y >= 1].index
    _x_uc = x[upper_censored].values
    _y_uc = y[upper_censored].values

    uncensored = (y > 0) & (y < 1)
    _x = x[uncensored].values
    _y = y[uncensored].values

    # define model
    with pm.Model() as model:

        # species indexers for the uncensored / upper- / lower-censored rows
        sidx0 = pm.Data("spp_idx0", idx0.values[uncensored])
        sidx1 = pm.Data("spp_idx1", idx1.values[uncensored])
        sidx0_u = pm.Data("sidx0_u", idx0.values[upper_censored])
        sidx1_u = pm.Data("sidx1_u", idx1.values[upper_censored])
        sidx0_l = pm.Data("sidx0_l", idx0.values[lower_censored])
        sidx1_l = pm.Data("sidx1_l", idx1.values[lower_censored])
        # clade of every species (kept under its own name so the `gidx`
        # argument is not shadowed)
        group_idx = pm.Data("gidx", gidx)

        # parameters and error
        𝜓_mean = pm.Normal('𝜓_mean', mu=0., sigma=5., shape=8)
        𝜓_std = pm.HalfNormal('𝜓_std', 5., shape=8)
        𝜓_offset = pm.Normal('𝜓_offset', mu=0, sigma=1., shape=tips)
        𝜓 = pm.Deterministic('𝜓', 𝜓_mean[group_idx] + 𝜓_std[group_idx] * 𝜓_offset)
        𝛽 = pm.Normal('𝛽', mu=0., sigma=10., shape=1)
        𝛼 = pm.Normal('𝛼', mu=0., sigma=10., shape=1)
        𝜎 = pm.HalfNormal('𝜎', 5., shape=1)

        # linear model prediction
        ri = 𝛼 + (𝛽 + 𝜓[sidx0] + 𝜓[sidx1]) * _x

        # data likelihood (normal distributed errors)
        pm.Normal("y", mu=ri, sigma=𝜎, observed=_y)

        # density of censored data. Test the NUMBER of censored rows: the
        # previous `sum(index)` added up the row labels, which is 0 (falsy)
        # when the only censored row carries label 0.
        if len(lower_censored):
            pm.Potential(
                "lower_censored",
                normal_lcdf(𝛼 + (𝛽 + 𝜓[sidx0_l] + 𝜓[sidx1_l]) * _x_lc, 𝜎, _y_lc),
            )

        if len(upper_censored):
            pm.Potential(
                "upper_censored",
                normal_lccdf(𝛼 + (𝛽 + 𝜓[sidx0_u] + 𝜓[sidx1_u]) * _x_uc, 𝜎, _y_uc),
            )

        # sample posterior, then drop the first 1000 returned draws as burn-in
        trace = pm.sample(**kwargs)[1000:]

        # summary table
        stats = pm.summary(trace)

    # organize results
    result_dict = {
        'model': model,
        'trace': trace,
        'stats': stats,
    }
    return result_dict

In [89]:
# Keyword arguments passed to pm.sample by every model fit below.
sample_kwargs = {
    "tune": 4000,
    "draws": 4000,
    "target_accept": 0.95,
    # keep the sliceable trace object: the model functions slice the result
    "return_inferencedata": False,
    "progressbar": True,
}

In [90]:
# model input: predictor, response simulated without species effects, the two
# species indexers, and the clade index of every species
model_args = [data[column] for column in ("genetic_dist", "RI_pooled", "sidx0", "sidx1")]
model_args.append(SPECIES_DATA.gidx)

# fit pooled model to pooled data (it only takes the predictor and response)
pooled = censored_pooled_regression(model_args[0], model_args[1], **sample_kwargs)

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [𝜎, 𝛽, 𝛼]


Sampling 4 chains for 4_000 tune and 4_000 draw iterations (16_000 + 16_000 draws total) took 188 seconds.


In [271]:
# Posterior summary table of the pooled fit.
pooled['stats']

Unnamed: 0,mean,sd,hdi_3%,hdi_97%,mcse_mean,mcse_sd,ess_mean,ess_sd,ess_bulk,ess_tail,r_hat
𝛼[0],0.048,0.003,0.043,0.053,0.0,0.0,5199.0,5182.0,5193.0,6352.0,1.0
𝛽[0],1.498,0.007,1.485,1.512,0.0,0.0,5329.0,5329.0,5330.0,6187.0,1.0
𝜎[0],0.051,0.001,0.049,0.054,0.0,0.0,7449.0,7437.0,7462.0,6953.0,1.0


In [366]:
# model input: predictor, response simulated with per-species effects, the two
# species indexers, and the clade index of every species
model_args = [data[column] for column in ("genetic_dist", "RI_unpooled", "sidx0", "sidx1")]
model_args.append(SPECIES_DATA.gidx)

# run models (the species-effect model takes everything except the clade index)
predictor, response, parent0, parent1 = model_args[:4]
unpooled = censored_unpooled_noncentered_regression(
    predictor, response, parent0, parent1, **sample_kwargs
)

INFO (theano.gof.compilelock): Refreshing lock /home/henry/.theano/compiledir_Linux-4.15--generic-x86_64-with-debian-buster-sid-x86_64-3.7.9-64/lock_dir/lock
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [𝜎, 𝛼, 𝛽, 𝜓_offset, 𝜓_std, 𝜓_mean]


IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

Sampling 4 chains for 4_000 tune and 4_000 draw iterations (16_000 + 16_000 draws total) took 21557 seconds.
INFO (theano.gof.compilelock): Refreshing lock /home/henry/.theano/compiledir_Linux-4.15--generic-x86_64-with-debian-buster-sid-x86_64-3.7.9-64/lock_dir/lock
The chain reached the maximum tree depth. Increase max_treedepth, increase target_accept or reparameterize.
The chain reached the maximum tree depth. Increase max_treedepth, increase target_accept or reparameterize.
The chain reached the maximum tree depth. Increase max_treedepth, increase target_accept or reparameterize.
The chain reached the maximum tree depth. Increase max_treedepth, increase target_ac

In [368]:
# Posterior-mean slope of every species: average of (𝛽 + 𝜓) over the draws.
# 𝜓 is recorded in the trace as a deterministic, so it is used directly;
# rebuilding it as 𝜓_mean + 𝜓_std * mean(𝜓_offset) replaces the mean of a
# product with a product of means and is only an approximation.
RI_ESTIMATE = (
    unpooled['trace']['𝛽'] +
    unpooled['trace']['𝜓']
).mean(axis=0)

# show plot of TRUE vs. ESTIMATED rates
toyplot.scatterplot(
    SPECIES_DATA.velo,
    RI_ESTIMATE,
    width=400,
    height=250,
    xlabel="TRUE species velocity",
    ylabel="ESTIMATED species velocity"
);

In [373]:
# model input: predictor, response simulated with clade-level effects, the two
# species indexers, and the clade index of every species
model_args = [data[column] for column in ("genetic_dist", "RI_groups", "sidx0", "sidx1")]
model_args.append(SPECIES_DATA.gidx)

# fit the partially pooled model.
# NOTE(review): no later cell visible here reads `partpooled`; the same fit is
# repeated further down as `partpooled2`.
predictor, response, parent0, parent1, clade_of_species = model_args
partpooled = censored_partpooled_noncentered_regression(
    predictor, response, parent0, parent1, clade_of_species, **sample_kwargs
)

INFO (theano.gof.compilelock): Refreshing lock /home/henry/.theano/compiledir_Linux-4.15--generic-x86_64-with-debian-buster-sid-x86_64-3.7.9-64/lock_dir/lock
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [𝜎, 𝛼, 𝛽, 𝜓_offset, 𝜓_std, 𝜓_mean]


Sampling 4 chains for 4_000 tune and 4_000 draw iterations (16_000 + 16_000 draws total) took 22879 seconds.
INFO (theano.gof.compilelock): Refreshing lock /home/henry/.theano/compiledir_Linux-4.15--generic-x86_64-with-debian-buster-sid-x86_64-3.7.9-64/lock_dir/lock
There were 19 divergences after tuning. Increase `target_accept` or reparameterize.
There were 19 divergences after tuning. Increase `target_accept` or reparameterize.
The chain reached the maximum tree depth. Increase max_treedepth, increase target_accept or reparameterize.
There were 13 divergences after tuning. Increase `target_accept` or reparameterize.
The chain reached the maximum tree depth. Increase max_treedepth, increase target_accept or reparameterize.
There were 12 divergences after tuning. Increase `target_accept` or reparameterize.
The chain reached the maximum tree depth. Increase max_treedepth, increase target_accept or reparameterize.
The number of effective samples is smaller than 10% for some parameters.


In [407]:
# model input: predictor, response simulated with clade-level effects, the two
# species indexers, and the clade index of every species
model_args = [data[column] for column in ("genetic_dist", "RI_groups", "sidx0", "sidx1")]
model_args.append(SPECIES_DATA.gidx)

# fit the partially pooled model; this is the fit the cells below read
predictor, response, parent0, parent1, clade_of_species = model_args
partpooled2 = censored_partpooled_noncentered_regression(
    predictor, response, parent0, parent1, clade_of_species, **sample_kwargs
)

INFO (theano.gof.compilelock): Refreshing lock /home/henry/.theano/compiledir_Linux-4.15--generic-x86_64-with-debian-buster-sid-x86_64-3.7.9-64/lock_dir/lock
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [𝜎, 𝛼, 𝛽, 𝜓_offset, 𝜓_std, 𝜓_mean]


Sampling 4 chains for 4_000 tune and 4_000 draw iterations (16_000 + 16_000 draws total) took 14660 seconds.
INFO (theano.gof.compilelock): Refreshing lock /home/henry/.theano/compiledir_Linux-4.15--generic-x86_64-with-debian-buster-sid-x86_64-3.7.9-64/lock_dir/lock
There were 3 divergences after tuning. Increase `target_accept` or reparameterize.
The chain reached the maximum tree depth. Increase max_treedepth, increase target_accept or reparameterize.
There were 2 divergences after tuning. Increase `target_accept` or reparameterize.
The chain reached the maximum tree depth. Increase max_treedepth, increase target_accept or reparameterize.
There were 3 divergences after tuning. Increase `target_accept` or reparameterize.
The chain reached the maximum tree depth. Increase max_treedepth, increase target_accept or reparameterize.
There were 2 divergences after tuning. Increase `target_accept` or reparameterize.
The chain reached the maximum tree depth. Increase max_treedepth, increase ta

In [408]:
# Posterior-mean slope of every species: average of (𝛽 + 𝜓) over the draws.
# 𝜓 (= 𝜓_mean[gidx] + 𝜓_std[gidx] * 𝜓_offset) is recorded in the trace as a
# deterministic, so it is used directly; rebuilding it from mean(𝜓_offset)
# replaces the mean of a product with a product of means and is only an
# approximation.
RI_ESTIMATE = (
    partpooled2['trace']['𝛽'] +
    partpooled2['trace']['𝜓']
).mean(axis=0)

In [409]:
# show plot of TRUE vs. ESTIMATED rates, one colour per clade
palette = toyplot.color.Palette()
species_colors = [palette[clade_index] for clade_index in SPECIES_DATA.gidx]

toyplot.scatterplot(
    SPECIES_DATA.velo_x,
    RI_ESTIMATE,
    width=400,
    height=250,
    xlabel="TRUE species velocity",
    ylabel="ESTIMATED species velocity",
    color=species_colors,
);

In [414]:
# get canvas size
canvas = toyplot.Canvas(width=600, height=600);

# colormap for values between 0-1
cmap = toyplot.color.LinearMap(domain_min=0, domain_max=1.0)

# add tree to canvas, edges coloured by the clade below each crown node
ax0 = canvas.cartesian(bounds=(50, 210, 50, 550), show=False)
gtree.draw(
    axes=ax0, 
    layout='r', 
    tip_labels=False,
    edge_colors=gtree.get_edge_values_mapped({
        423: toytree.colors[0],
        422: toytree.colors[1],
        426: toytree.colors[2],
        425: toytree.colors[3],
        417: toytree.colors[4],
        405: toytree.colors[5],
        394: toytree.colors[6],
        393: toytree.colors[7],
    }),
);

# half of the node-to-node distance for every ordered pair of tips, computed
# once and reused for both the maximum and the heatmap cells
half_dists = {
    (i, j): get_dist(gtree, i, j) / 2.
    for (i, j) in itertools.product(range(tips), range(tips))
}
maxdist = max(half_dists.values())

# add heatmap: one cell per ordered pair of tips
ax1 = canvas.table(rows=tips, columns=tips, bounds=(220, 525, 50, 550), margin=20)
tip_labels = gtree.get_tip_labels()
for t0 in tip_labels:
    ridx = name2nidx[t0]
    for t1 in tip_labels:
        cidx = name2nidx[t1]

        # distance scaled by the largest pairwise distance
        dist = half_dists[(ridx, cidx)] / maxdist

        # spp_effects slope RI here already has Beta included.
        # NOTE(review): each RI_ESTIMATE entry contains 𝛽, so this sum
        # contains 𝛽 twice, while the fitted models use 𝛽 + 𝜓[i] + 𝜓[j] -
        # confirm whether one 𝛽 should be subtracted. The intercept is the
        # true 𝛼_mean, not a posterior estimate.
        spp_effects = RI_ESTIMATE[ridx] + RI_ESTIMATE[cidx]
        ri = 𝛼_mean + (spp_effects) * dist

        # normalize -- cannot be >1 or <0
        ri = np.clip(ri, 0., 1.)
        col = cmap.color(ri)

        # rows are filled bottom-up (row = tips - ridx - 1)
        ax1.cells.cell[(tips - ridx - 1), cidx].style = {
            "fill": col, "stroke": "none"
        }

# dividers
ax1.body.gaps.columns[...] = 0.5
ax1.body.gaps.rows[...] = 0.5

# add a colorbar
numberline = canvas.numberline(550, 550, 550, 50)
numberline.colormap(cmap, style={"stroke-width":5})