## Demographic inference of the divergence history of *Erythrura trichroa* and *Erythrura papuana*

This notebook performs a demographic inference of the divergence history of *Erythrura trichroa* and *Erythrura papuana* using `dadi2`. For now I run a limited number of replicates, to be boosted in the future. We start by importing the relevant libraries: 

In [1]:
import os
import numpy as np
import dadi
import nlopt
from dadi import Numerics
from scipy.optimize import minimize

Next we provide inputs for the analysis: our `.vcf`, population IDs and names, projections and points, replicates, and seeds:

In [2]:
vcf = "/home/k14m234/erythrura_assembly/results/GCF_005870125.1/QC/erythrura.pruned.vcf.gz"
popfile = "/home/k14m234/erythrura/config/pixy_pop.txt"
pops = ['trichroa', 'papuana']
projections = [20, 20]
n_reps = 10
maxiter = 10          # set 3 to mimic the moments example; 20-100 is more realistic
seed = 1
rng = np.random.default_rng(seed)

Directories for output: 

In [3]:
outdir = "/home/k14m234/erythrura/results/dadi"
os.makedirs(outdir, exist_ok=True)

Scaling constraints: 

In [4]:
mu = 2.3e-9            # per-site per-generation mutation rate
L  = 1010218420        # from awk '{sum += $3-$2} END{print sum}' erythrura_callable_sites.bed 
gen_time = 1.0         # years per generation (optional; set to 1 if unknown)

A folded SFS: 

In [5]:
dd = dadi.Misc.make_data_dict_vcf(vcf, popfile)
fs = dadi.Spectrum.from_data_dict(dd, pops, projections=projections, polarized=False)
print(fs.S())

73881.42115007679


Sample sizes and grid sizes:  

In [6]:
ns = fs.sample_sizes
pts_l = [max(ns)+20, max(ns)+30, max(ns)+40]

Bootstrapped datasets:

In [7]:
# --- ADD 1: bootstrap spectra ONCE (outside loop) ---
Nboot = 100
chunk_size = int(5e6)
chunks = dadi.Misc.fragment_data_dict(dd, chunk_size)
boots = dadi.Misc.bootstraps_from_dd_chunks(chunks, Nboot, pops, ns)
boots = [b.fold() for b in boots]
for b in boots:
    b.mask = fs.mask

Define models:

In [8]:
# strict isolation
def SI(params, ns, pts):
    nu1, nu2, T = params
    xx = dadi.Numerics.default_grid(pts)
    phi = dadi.PhiManip.phi_1D(xx)
    phi = dadi.PhiManip.phi_1D_to_2D(xx, phi)
    phi = dadi.Integration.two_pops(phi, xx, T, nu1=nu1, nu2=nu2, m12=0, m21=0)
    return dadi.Spectrum.from_phi(phi, ns, (xx, xx))
si_ex = dadi.Numerics.make_extrap_log_func(SI)

# isolation-with-migration
def IM(params, ns, pts):
    nu1, nu2, T, m12, m21 = params
    xx = dadi.Numerics.default_grid(pts)
    phi = dadi.PhiManip.phi_1D(xx)
    phi = dadi.PhiManip.phi_1D_to_2D(xx, phi)
    phi = dadi.Integration.two_pops(phi, xx, T, nu1=nu1, nu2=nu2, m12=m12, m21=m21)
    fs_model = dadi.Spectrum.from_phi(phi, ns, (xx, xx))
    return fs_model
im_ex = dadi.Numerics.make_extrap_log_func(IM)

# secondary contact
def SC(params, ns, pts):
    nu1, nu2, T1, T2, m12, m21 = params
    xx = dadi.Numerics.default_grid(pts)
    phi = dadi.PhiManip.phi_1D(xx)
    phi = dadi.PhiManip.phi_1D_to_2D(xx, phi)

    # Phase 1: strict isolation
    phi = dadi.Integration.two_pops(phi, xx, T1, nu1=nu1, nu2=nu2, m12=0,   m21=0)

    # Phase 2: secondary contact
    phi = dadi.Integration.two_pops(phi, xx, T2, nu1=nu1, nu2=nu2, m12=m12, m21=m21)

    fs_model = dadi.Spectrum.from_phi(phi, ns, (xx, xx))
    return fs_model
sc_ex = dadi.Numerics.make_extrap_log_func(SC)

We now run replicate model fitting, starting with strict isolation (SI): 

In [12]:
# define starting param
SI_params = [1, 1, 0.01]
SI_param_names = ["nu1","nu2","T"]
SI_lower = [1e-3, 1e-3, 1e-4]
SI_upper = [10,   10,   1]

# checkoutput dir
os.makedirs("dadi", exist_ok=True)

# 20 replicates
for i in range(50):

    # random staring perturbation
    p0 = dadi.Misc.perturb_params(
        SI_params, fold=3,
        upper_bound=SI_upper,
        lower_bound=SI_lower
    )

    # generate model
    popt, ll_model = dadi.Inference.opt(
        p0, fs, si_ex, pts_l,
        lower_bound=SI_lower,
        upper_bound=SI_upper,
        algorithm=nlopt.LN_BOBYQA,
        log_opt=True, 
        maxeval=400,
        verbose=100
    )

    # get model fs
    model_fs = si_ex(popt, ns, pts_l)

    # theta + params
    theta0 = dadi.Inference.optimal_sfs_scaling(model_fs, fs)
    Nref   = theta0 / (4 * mu * L)
    nTri = popt[0] * Nref
    nPap = popt[1] * Nref
    t1  = popt[2] * 2 * Nref

    # write real params
    out1 = "\t".join(map(str, [i+1, ll_model, nTri, nPap, t1, theta0])) + "\n"
    with open("dadi/si_real.tsv", "a") as f:
        f.write(out1)

    # write model params
    out2 = "\t".join(map(str, [i+1, ll_model] + list(popt) + [theta0])) + "\n"
    with open("dadi/si_model.tsv", "a") as f:
        f.write(out2)

    for eps in [1e-2, 5e-3, 1e-3]:
        try:
            uncert = dadi.Godambe.GIM_uncert(si_ex, pts_l, boots, popt, fs, eps=eps)
        except np.linalg.LinAlgError:
            print(f"rep {i+1}: Godambe failed (singular); skipping CI")
            continue   # ← must be inside except block
    
        # Only runs if no exception
        if len(uncert) == len(popt) + 1:
            se = uncert[:-1]
        else:
            se = uncert
    
        ci_low  = np.array(popt) - 1.96 * np.array(se)
        ci_high = np.array(popt) + 1.96 * np.array(se)

    with open("dadi/si_ci.tsv", "a") as f:
        for name, est, se_i, lo, hi in zip(SI_param_names, popt, se, ci_low, ci_high):
            f.write("\t".join(map(str, [i+1, ll_model, name, est, se_i, lo, hi])) + "\n")

    print(i+1, ll_model, popt)

3000    , -2260.35    , array([ 4.49613    ,  2.63701    ,  0.0663418  ])
1 -2242.0955908728392 [4.68146167 3.11296947 0.00506944]
3100    , -4641.5     , array([ 1.19075    ,  0.432194   ,  0.0115314  ])
2 -2242.095590397311 [2.09187379 0.56679293 0.00497308]
3200    , -2242.1     , array([ 5.07819    ,  2.2861     ,  0.0618193  ])
3 -2242.0955904677285 [0.33199165 1.04077775 0.07333642]
3300    , -2242.14    , array([ 5.07585    ,  2.27039    ,  0.0615191  ])
4 -2242.095590426463 [1.20767709e+00 4.48316962e+00 4.25730462e-03]
3400    , -2242.18    , array([ 5.05145    ,  2.26485    ,  0.0617129  ])
5 -2242.0955906952354 [0.12894932 0.13962357 0.00268666]
3500    , -2242.15    , array([ 5.08982    ,  2.26378    ,  0.0617032  ])
6 -2242.095590586398 [1.58793331 6.51112802 0.01027972]
3600    , -2246.51    , array([ 4.57234    ,  2.45083    ,  0.0614911  ])
7 -2242.09559064135 [2.21255386e+00 3.35687868e-01 1.59988108e-03]
3700    , -2242.1     , array([ 5.06454    ,  2.28553    ,  0.06

Next up is secondary contact (SC): 

In [None]:
# define starting param
SC_params = [1, 1, 0.01, 0.01, 0.001, 0.001]
SC_param_names = ["nu1","nu2","T1","T2","m12","m21"]
SC_lower = [1e-3, 1e-3, 1e-4, 1e-4, 1e-6, 1e-6]
SC_upper = [10,   10,   1,   1,   0.5,   0.5]

# checkoutput dir
os.makedirs("dadi", exist_ok=True)

# pick stepsize and write path for cis
eps = 1e-3
ci_path = "dadi/sc_ci.tsv"
if not os.path.exists(ci_path):
    with open(ci_path, "w") as f:
        f.write("rep\tll\tparam\test\tse\tci_low\tci_high\n")

# 20 replicates
for i in range(50):

    # random staring perturbation
    p0 = dadi.Misc.perturb_params(
        SC_params, fold=3,
        upper_bound=SC_upper,
        lower_bound=SC_lower
    )

    # generate model
    popt, ll_model = dadi.Inference.opt(
        p0, fs, sc_ex, pts_l,
        lower_bound=SC_lower,
        upper_bound=SC_upper,
        algorithm=nlopt.LN_BOBYQA,
        log_opt=True, 
        maxeval=400,
        verbose=100
    )

    # get model fs
    model_fs = sc_ex(popt, ns, pts_l)

    # theta + real params
    theta0 = dadi.Inference.optimal_sfs_scaling(model_fs, fs)
    Nref   = theta0 / (4 * mu * L)
    Nref=theta0/(4*mu*L)
    nTri=popt[0]*Nref
    nPap=popt[1]*Nref
    t1=popt[2]*2*Nref
    t2=popt[3]*2*Nref
    m12=popt[4]/(2*Nref)
    m21=popt[5]/(2*Nref)
    
    # write real params
    out1 = "\t".join(map(str, [i+1, ll_model, nTri, nPap, t1, t2, m12, m21, theta0])) + "\n"
    with open("dadi/sc_real.tsv", "a") as f:
        f.write(out1)

    # write model params
    out2 = "\t".join(map(str, [i+1, ll_model] + list(popt) + [theta0])) + "\n"
    with open("dadi/sc_model.tsv", "a") as f:
        f.write(out2)

    for eps in [1e-2, 5e-3, 1e-3]:
        try:
            uncert = dadi.Godambe.GIM_uncert(sc_ex, pts_l, boots, popt, fs, eps=eps)
        except np.linalg.LinAlgError:
            print(f"rep {i+1}: Godambe failed (singular); skipping CI")
            continue   # ← must be inside except block
    
        # Only runs if no exception
        if len(uncert) == len(popt) + 1:
            se = uncert[:-1]
        else:
            se = uncert
    
        ci_low  = np.array(popt) - 1.96 * np.array(se)
        ci_high = np.array(popt) + 1.96 * np.array(se)

    with open("dadi/sc_ci.tsv", "a") as f:
        for name, est, se_i, lo, hi in zip(SC_param_names, popt, se, ci_low, ci_high):
            f.write("\t".join(map(str, [i+1, ll_model, name, est, se_i, lo, hi])) + "\n")

    print(i+1, ll_model, popt)

Lastly, we'll run IM:

In [None]:
# define starting parameters
IM_params = [1, 1, 0.01, 0.001, 0.001]
IM_param_names = ["nu1","nu2","T","m12","m21"]
IM_lower = [1e-3, 1e-3, 1e-4, 1e-6, 1e-6]
IM_upper = [10,   10,   1,   0.5,   0.5]

# 20 replicates
for i in range(50):

    # random staring perturbation
    p0 = dadi.Misc.perturb_params(
        IM_params, fold=10,
        upper_bound=IM_upper,
        lower_bound=IM_lower
    )

    # generate model
    popt, ll_model = dadi.Inference.opt(
        p0, fs, im_ex, pts_l,
        lower_bound=IM_lower,
        upper_bound=IM_upper,
        algorithm=nlopt.GN_CRS2_LM,
        maxeval=400,
        verbose=100
    )

    # get model fs
    model_fs = im_ex(popt, ns, pts_l)

    # theta + real params
    theta0 = dadi.Inference.optimal_sfs_scaling(model_fs, fs)
    Nref   = theta0 / (4 * mu * L)
    Nref=theta0/(4*mu*L)
    nTri=popt[0]*Nref
    nPap=popt[1]*Nref
    t1=popt[2]*2*Nref
    m12=popt[3]/(2*Nref)
    m21=popt[4]/(2*Nref)
    
    # write real params
    out1 = "\t".join(map(str, [i+1, ll_model, nTri, nPap, t1, m12, m21, theta0])) + "\n"
    with open("dadi/im_real.tsv", "a") as f:
        f.write(out1)

    # write model params
    out2 = "\t".join(map(str, [i+1, ll_model] + list(popt) + [theta0])) + "\n"
    with open("dadi/im_model.tsv", "a") as f:
        f.write(out2)

    for eps in [1e-2, 5e-3, 1e-3]:
        try:
            uncert = dadi.Godambe.GIM_uncert(im_ex, pts_l, boots, popt, fs, eps=eps)
        except np.linalg.LinAlgError:
            print(f"rep {i+1}: Godambe failed (singular); skipping CI")
            continue   # ← must be inside except block
    
        # Only runs if no exception
        if len(uncert) == len(popt) + 1:
            se = uncert[:-1]
        else:
            se = uncert
    
        ci_low  = np.array(popt) - 1.96 * np.array(se)
        ci_high = np.array(popt) + 1.96 * np.array(se)

    with open("dadi/im_ci.tsv", "a") as f:
        for name, est, se_i, lo, hi in zip(IM_param_names, popt, se, ci_low, ci_high):
            f.write("\t".join(map(str, [i+1, ll_model, name, est, se_i, lo, hi])) + "\n")
    
    print(i+1, ll_model, popt)

Write parameters with CIs: