### Preparing the Julia environment

In [1]:
# Switch to the directory `@__DIR__` expands to, so that the relative paths used
# in this notebook ("." here, "../data/..." further down) resolve against it.
cd(@__DIR__)
using Pkg
# Activate the project in the current directory and install its dependencies.
Pkg.activate(".")
Pkg.instantiate()

[32m[1m  Activating[22m[39m project at `~/projects/research/phylogeneticCausalAnalysis/code`


In [95]:
using CSV
using DataFrames
using Plots
using StatsPlots
using MCPhylo
using MCPhyloTree
plotlyjs()
using Distributions
using LinearAlgebra
using Pipe
using Statistics
using GLM
using MLBase

In [3]:
import MCPhylo: SamplerTune, SymDistributionType, Sampler, ElementOrVector
import MCPhylo: sample!

include("prior_sampler.jl")

StatsBase.sample!

In [4]:
# Number of MCMC chains: sizes the list of initial values and is passed to
# `mcmc` as `chains`.
nChains = 2

2

### Loading the data

In [5]:
# Read the cue data into a DataFrame and print it with every column shown.
d = DataFrame(CSV.File("../data/data_cues.txt"))
show(d; allcols=true)

30×7 DataFrame
 Row │ Glottocode  Language    Genus              Case_Marking  Tight_Semantics  Rigid_Order  Verb_Middle
     │ String15    String15    String31           Float64       Float64          Float64      Float64
─────┼────────────────────────────────────────────────────────────────────────────────────────────────────
   1 │ stan1318    Arabic      Semitic                   0.036            0.218        0.655        0.391
   2 │ bulg1262    Bulgarian   Slavic                    0.028            0.144        0.782        0.966
   3 │ croa1245    Croatian    Slavic                    0.415            0.147        0.414        0.9
   4 │ czec1258    Czech       Slavic                    0.525            0.172        0.24         0.818
   5 │ dani1285    Danish      Germanic                  0.0              0.208        0.926       

### Normalizing the data



In [6]:
# Names of the numeric measurement columns. The first three columns of `d`
# (Glottocode, Language, Genus) are string identifiers and are skipped by position.
dataColumns = names(d)[4:end]


4-element Vector{String}:
 "Case_Marking"
 "Tight_Semantics"
 "Rigid_Order"
 "Verb_Middle"

In [7]:
# Standardise `x`: subtract its mean and divide by its sample standard
# deviation, element-wise. A collection with zero spread yields NaN entries.
function normalize(x)
    centre = mean(x)
    spread = std(x)
    return @. (x - centre) / spread
end

normalize (generic function with 1 method)

In [8]:
# Take the measurement columns of `d` as a plain array and standardise each
# column separately (`dims=1` hands one column at a time to `normalize`).
dArray = mapslices(normalize, Array(select(d, dataColumns)), dims=1);

In [9]:
# Correlation plot of the standardised measurement columns.
corrplot(dArray)

In [10]:
# Pairwise correlation matrix of the standardised columns (same order as `dataColumns`).
cor(dArray)

4×4 Matrix{Float64}:
  1.0        0.34742   -0.77153   -0.191945
  0.34742    1.0       -0.18423   -0.369734
 -0.77153   -0.18423    1.0        0.177435
 -0.191945  -0.369734   0.177435   1.0

### Loading the trees

In [11]:
# Parse the Newick file into a vector of trees (the trailing `;` hides the value).
trees = MCPhylo.ParseNewick("../data/posterior.tree");
# Number of trees; used further down to draw initial `idx` values and as the
# upper bound of the `idx` prior.
nTrees = length(trees)
summary(trees)

"1000-element Vector{GeneralNode}"

### Extracting covariance matrices from the trees

In [12]:
# One covariance matrix per tree, in the same order as `trees`.
sigmas = map(MCPhyloTree.to_covariance, trees);

# The model

- There are 30 data points
- $X$ is a $30\times (n+1)$ matrix whose first column is all ones ($n$ is the number of independent variables).
- $Y$ is a length-30 vector.
- $\beta$, the regression coefficient vector, is a vector of length $n+1$.
- $\rho$, the rate, is a positive real number.
- $\Sigma$ is a variance-covariance matrix (taken from a phylogenetic tree)


$$
\begin{aligned}
    Y &\sim \mathcal N(X\beta, \rho\Sigma)\\
    \beta_i &\sim \mathcal N(0, 10) &\forall i\\
    \rho &\sim \mathrm{Exponential}(1)
\end{aligned}
$$

In [371]:
# Column indices into `dArray` (ordered like `dataColumns`):
# `iv` holds the predictor column(s) — 2 is Tight_Semantics;
# `dv` is the response column — 3 is Rigid_Order.
iv = [2]
dv = 3

3

In [372]:
# Scatter plot with the response column as first (horizontal) argument and the
# predictor column(s) as second; `iv` is already an index vector, so it is
# used directly.
scatter(dArray[:, dv], dArray[:, iv], legend=false)

In [373]:
# Column names of the raw table, for reference.
names(d)

7-element Vector{String}:
 "Glottocode"
 "Language"
 "Genus"
 "Case_Marking"
 "Tight_Semantics"
 "Rigid_Order"
 "Verb_Middle"

In [375]:
# Ordinary linear-regression baseline fitted on the raw table `d` (not on the
# standardised `dArray`). The column names are spelled out here and correspond
# to dv = 3 (Rigid_Order) and iv = [2] (Tight_Semantics).
lm(@formula(Rigid_Order ~ Tight_Semantics), d)


StatsModels.TableRegressionModel{LinearModel{GLM.LmResp{Vector{Float64}}, GLM.DensePredChol{Float64, CholeskyPivoted{Float64, Matrix{Float64}}}}, Matrix{Float64}}

Rigid_Order ~ 1 + Tight_Semantics

Coefficients:
─────────────────────────────────────────────────────────────────────────────
                     Coef.  Std. Error      t  Pr(>|t|)  Lower 95%  Upper 95%
─────────────────────────────────────────────────────────────────────────────
(Intercept)       0.787781    0.169079   4.66    <1e-04   0.441437   1.13412
Tight_Semantics  -0.927782    0.935424  -0.99    0.3298  -2.84391    0.988348
─────────────────────────────────────────────────────────────────────────────

In [376]:
# Inputs handed to the sampler: predictor and response as flat vectors, plus
# the per-tree covariance matrices.
# `vec` flattens the column selection; with more than one index in `iv` the
# selected columns would be concatenated into a single vector.
data = Dict{Symbol, Any}(
    :x => vec(dArray[:, iv]),
    :y => vec(dArray[:, dv]),
    :sigmas => sigmas,
);

In [377]:
# Size of `data[:x]` along its second dimension (1 here, because `data[:x]` is
# a vector).
nCoefficients = size(data[:x],2)
# One dictionary of starting values per chain: the data entries are copied in
# unchanged, and every parameter gets an independent random start.
inits = [
    Dict{Symbol, Any}(
        :idx => rand(1:nTrees),
        :x => data[:x],
        :y => data[:y],
        :sigmas => data[:sigmas],
        :α => rand(Normal(0, 10)),
        :β => rand(Normal(0, 10)),
        # Sized from the data instead of a hard-coded 30, so it follows the
        # number of observations.
        # NOTE(review): no node named ϵ is declared in the models defined in
        # this part of the notebook — confirm whether this entry is still needed.
        :ϵ => rand(length(data[:y])),
        :ρ => rand(Exponential()),
    )
    for _ in 1:nChains
];

In [378]:
# Regression model for `y` given `x`, with an additional tree-index node `idx`.
# The lambda argument names coincide with the data key `:x` and with the nodes
# α, β and ρ declared below.
# NOTE(review): the boolean passed last to each `Stochastic` is `true` exactly
# for the nodes that appear in the posterior summaries (α, β, ρ) — presumably a
# monitor flag; confirm against the MCPhylo documentation.
model = Model(
    # Likelihood: multivariate normal with mean α .+ β .* x and the scalar ρ as
    # its second argument.
    # NOTE(review): neither `idx` nor `sigmas` is an argument of this lambda, so
    # as written the tree covariance matrices do not enter the likelihood, even
    # though the model description states Y ~ N(Xβ, ρΣ). Confirm whether this
    # is intended.
    y = Stochastic(1,
        (x, α, β, ρ) -> MvNormal(α .+ β .* x, ρ),
        false,
    ),
    # Intercept prior.
    α = Stochastic(
        () -> Normal(0.0,30),
        true,
    ),
    # Slope prior.
    β = Stochastic(
        () -> Normal(0.0,10),
        true,
    ),
    # Prior for the positive parameter ρ.
    ρ = Stochastic(
        () -> Exponential(1),
        true,
    ),
    # Tree index with a continuous uniform prior on (0, nTrees); the initial
    # values for `idx` are drawn as integers from 1:nTrees.
    idx=Stochastic(
        () -> Uniform(0, nTrees),
        false,
    ),
);

In [379]:
# Sampling scheme: NUTS for α, β and ρ; `Prior` for idx.
# NOTE(review): `Prior` is presumably the sampler defined in prior_sampler.jl — confirm.
scheme = [NUTS([:α, :β, :ρ]), Prior([:idx])];


In [380]:
# Attach the sampling scheme to `model`.
setsamplers!(model, scheme);

In [381]:
# Run the sampler for `model`: 10000 iterations per chain, the first 5000
# discarded as burn-in, every remaining draw kept (thin=1).
sim = mcmc(
    model,
    data,
    inits,
    10000,
    burnin=5000,
    thin=1,
    chains=nChains,
    trees=false,
    verbose=true,
);


MCMC Simulation of 10000 Iterations x 2 Chains...


Chain 1: 100%|████████████████████████████| Time: 0:00:00 (94.68 μs/it)
Chain 2: 100%|████████████████████████████| Time: 0:00:01 ( 0.18 ms/it)





In [382]:
# Convergence diagnostic across chains (reports PSRF per monitored quantity).
gelmandiag(sim)

            PSRF 97.5%
         β 1.000 1.000
         ρ 1.001 1.002
         α 1.000 1.000
likelihood   NaN   NaN



In [383]:
# Plot the simulation output; prompts for ENTER between plots.
plot(sim)

Press ENTER to draw next plot
stdin> 


2-element Vector{Plots.Plot}:
 Plot{Plots.PlotlyJSBackend() n=6}
 Plot{Plots.PlotlyJSBackend() n=6}

In [384]:
# Posterior summary statistics and quantiles for the monitored quantities.
describe(sim)

Iterations = 5001:10000
Thinning interval = 1
Chains = 1,2
Samples per chain = 5000

Empirical Posterior Estimates:
                Mean         SD       Naive SE       MCSE      ESS
         β -0.1862455215 0.19294965 0.0019294965 0.0021580090 5000
         ρ  1.0098993630 0.13827326 0.0013827326 0.0016577651 5000
         α  0.0018337354 0.18599984 0.0018599984 0.0024171963 5000
likelihood            -∞        NaN          NaN          NaN  NaN

Quantiles:
               2.5%       25.0%         50.0%         75.0%       97.5%  
         β -0.56673712 -0.31543322 -0.18371816834 -0.058307988 0.19339318
         ρ  0.78820985  0.91011053  0.99443686195  1.092049614 1.32700971
         α -0.36257862 -0.12468938  0.00034768222  0.124506929 0.36642935
likelihood          -∞          -∞             -∞           -∞         -∞



In [385]:
# 95% interval bounds (lower/upper) for the monitored quantities.
hpd(sim)

            95% Lower   95% Upper
         β -0.57993705 0.17469099
         ρ  0.78191076 1.30603863
         α -0.37565124 0.34856347
likelihood          -∞         -∞



### Null model


In [386]:
# Model labelled "Null model" in this notebook: same structure as `model`
# without the `idx` node.
# NOTE(review): the likelihood below still contains the slope term β .* x and
# is the same expression as in `model` (only the lambda argument order
# differs), so the two models differ only in the α prior (Normal(0,50) here,
# Normal(0.0,30) in `model`) and the absence of `idx`. Confirm whether a model
# without β was intended.
model0 = Model(
    # Likelihood: multivariate normal with mean α .+ β .* x and scalar ρ.
    y = Stochastic(1,
        (α, β, ρ, x) -> MvNormal(α .+ β .* x, ρ),
        false,
    ),
    # Intercept prior.
    α = Stochastic(
        () -> Normal(0,50),
        true,
    ),
    # Slope prior.
    β = Stochastic(
        () -> Normal(0,10),
        true,
    ),
    # Prior for the positive parameter ρ.
    ρ = Stochastic(
        () -> Exponential(1),
        true,
    ),
);

In [387]:
# Rebinds `scheme` to a NUTS-only scheme; model0 declares no idx node.
scheme = [NUTS([:α, :β, :ρ])];


In [388]:
# Attach the NUTS-only scheme to `model0`.
setsamplers!(model0, scheme);

In [389]:
# Run the sampler for `model0`: 200000 iterations per chain, the first 100000
# discarded as burn-in, every 100th remaining draw kept.
# NOTE(review): this rebinds `sim`, so the results of the previous run are no
# longer reachable under that name; it also reuses `inits`, which carries an
# `:idx` entry although model0 declares no idx node — confirm both are intended.
sim = mcmc(
    model0,
    data,
    inits,
    200000,
    burnin=100000,
    thin=100,
    chains=nChains,
    trees=false,
    verbose=true,
);


MCMC Simulation of 200000 Iterations x 2 Chains...


Chain 1: 100%|████████████████████████████| Time: 0:00:11 (55.32 μs/it)
Chain 2: 100%|████████████████████████████| Time: 0:00:22 ( 0.11 ms/it)





In [390]:
# Convergence diagnostic across chains for the model0 run.
gelmandiag(sim)

           PSRF 97.5%
         β    1 1.000
         ρ    1 1.002
         α    1 1.001
likelihood  NaN   NaN



In [391]:
# Plot the model0 simulation output; prompts for ENTER between plots.
plot(sim)

Press ENTER to draw next plot
stdin> 


2-element Vector{Plots.Plot}:
 Plot{Plots.PlotlyJSBackend() n=6}
 Plot{Plots.PlotlyJSBackend() n=6}

In [392]:
# Posterior summary statistics and quantiles for the model0 run.
describe(sim)

Iterations = 100100:200000
Thinning interval = 100
Chains = 1,2
Samples per chain = 1000

Empirical Posterior Estimates:
                Mean          SD       Naive SE       MCSE      ESS
         β -0.18166010409 0.18753700 0.0041934547 0.0049837075 1000
         ρ  1.00749373591 0.13707084 0.0030649972 0.0027523645 1000
         α -0.00066762502 0.18370781 0.0041078314 0.0032491866 1000
likelihood             -∞        NaN          NaN          NaN  NaN

Quantiles:
               2.5%        25.0%        50.0%         75.0%       97.5%  
         β -0.53908201 -0.310509731 -0.1831245646 -0.057261526 0.19471475
         ρ  0.78537676  0.912049420  0.9923097265  1.086441908 1.33169308
         α -0.34264308 -0.123805575 -0.0060405546  0.125259207 0.35243993
likelihood          -∞           -∞            -∞           -∞         -∞



In [393]:
# 95% interval bounds (lower/upper) for the model0 run.
hpd(sim)

            95% Lower   95% Upper
         β -0.53830010 0.19701669
         ρ  0.75071624 1.27518264
         α -0.34070052 0.35480521
likelihood          -∞         -∞

