In [33]:
#############
# Libraries #
#############

using CSV, DataFrames, DataFramesMeta, Missings, CategoricalArrays
using StatsBase, Statistics, HypothesisTests, MatrixLM
using Random, Distributions, StatsModels
using LinearAlgebra
using FreqTables, Plots, StatsPlots
using ColorSchemes, RecipesBase
using SpecialFunctions

In [34]:
#######################
# Plotting attributes #
#######################
# Shared plotting defaults; neither name is referenced in the visible cells
# (presumably consumed by later plotting cells — confirm).
myfont = "Helvetica"     # font family name
mytitlefontsize = 12     # title font size

12

In [4]:
######################
# External functions #
######################
# Pull in the project's helper scripts from ../src (relative to this notebook's
# directory). The files are included in the same order as listed.
for srcfile in ("wrangle_utils.jl", "utils.jl", "utils_copd_spiro.jl",
                "demog.jl", "mLinearModel.jl", "myPlots.jl")
    include(joinpath(@__DIR__, "..", "src", srcfile))
end
	

In [5]:
# Load the COPDGene cohort. The cells below read its fields dfInd (per-individual
# covariates), mY (response matrix) and dfRef (per-metabolite annotations).
copd =  get_data("COPDGene")

CohortData([1m784×14 DataFrame[0m
[1m Row [0m│[1m SampleName [0m[1m SampleID   [0m[1m Site    [0m[1m FinalGold [0m[1m Sex     [0m[1m Age     [0m[1m BMI     [0m[1m [0m ⋯
     │[90m String7    [0m[90m String15   [0m[90m String3 [0m[90m Int64     [0m[90m String7 [0m[90m Float64 [0m[90m Float64 [0m[90m [0m ⋯
─────┼──────────────────────────────────────────────────────────────────────────
   1 │ 10010J      NJHC-00611  NJC              2  Female      73.5    27.51   ⋯
   2 │ 10031R      NJHC-00004  NJC              2  Male        66.3    22.77
   3 │ 10032T      NJHC-00006  NJC              2  Female      66.3    31.78
   4 │ 10052Z      NJHC-00708  NJC              4  Male        55.1    26.08
   5 │ 10055F      NJHC-01099  NJC              0  Male        75.8    26.95   ⋯
   6 │ 10056H      NJHC-00003  NJC              4  Female      54.1    37.43
   7 │ 10057J      NJHC-00002  NJC              3  Male        62.2    24.53
   8 │ 10060Y      NJHC-00074 

In [10]:
# Path of the clinical data dictionary, relative to this notebook's directory.
fileClinicalDict = joinpath(@__DIR__,"..","data","processed","COPDGene",
                            "ClinicalDataDictionary.csv");
# Read it into a DataFrame (not used again in the visible cells).
dfClinicalDict = CSV.read(fileClinicalDict, DataFrame);

In [11]:
############################################################
# Choose predictors for X matrix
############################################################
xCovariates = ["Intercept", "Sex", "Age", "BMI",
                "SmokingPackYears", "PercentEmphysema",
                "COPD", "NHW", "CurrentSmoker"]

# Formula spelling of the predictors: the intercept is written "1".
vPredictorNames = replace(xCovariates, "Intercept" => "1")

# Right-hand side as text, without and with the Site adjustment.
frml = join(vPredictorNames, " + ")
frml_site_adjusted = string(frml, " + Site")

# Turn the text into terms: formulaX == @formula(0 ~ <frml_site_adjusted>).rhs
# The string is assembled only from the hard-coded names above, so parsing and
# eval-ing it does not touch external input.
formulaX = eval(Meta.parse("@formula(0 ~ $(frml_site_adjusted)).rhs"))

############################################################
# Contrasts for categorical variables
############################################################
# Effects coding for every categorical predictor. The base level is the k-th
# entry of the sorted unique values of the column (k = 2 for Sex, 1 otherwise).
contrasts_copd = Dict(
    var => EffectsCoding(base = sort(unique(copd.dfInd[!, var]))[k])
    for (var, k) in (:Sex => 2, :NHW => 1, :Site => 1,
                     :CurrentSmoker => 1, :COPD => 1)
)

#################
# Build X matrix
#################
mXcopd  = modelmatrix(formulaX,  copd.dfInd;  hints = contrasts_copd)

784×10 Matrix{Float64}:
 1.0   1.0  73.5  27.51  30.7   2.42326    1.0   1.0  -1.0  -1.0
 1.0  -1.0  66.3  22.77  46.9  34.7749     1.0   1.0  -1.0  -1.0
 1.0   1.0  66.3  31.78  40.0   6.45937    1.0   1.0  -1.0  -1.0
 1.0  -1.0  55.1  26.08  39.5   4.27442    1.0   1.0   1.0  -1.0
 1.0  -1.0  75.8  26.95  20.0   1.51622   -1.0  -1.0  -1.0  -1.0
 1.0   1.0  54.1  37.43  72.0  21.8802     1.0   1.0  -1.0  -1.0
 1.0  -1.0  62.2  24.53  40.3  11.1913     1.0   1.0   1.0  -1.0
 1.0  -1.0  78.5  28.9   81.0  27.7891     1.0   1.0  -1.0  -1.0
 1.0   1.0  75.3  27.62  96.0   1.67065    1.0   1.0  -1.0  -1.0
 1.0   1.0  67.8  23.72  72.0   5.78305    1.0   1.0  -1.0  -1.0
 1.0  -1.0  56.2  37.06  68.4   1.89047    1.0   1.0   1.0  -1.0
 1.0  -1.0  54.5  39.41  31.8   3.56222   -1.0   1.0   1.0  -1.0
 1.0  -1.0  76.8  28.01  62.0   4.9056     1.0   1.0   1.0  -1.0
 ⋮                              ⋮                           
 1.0  -1.0  54.6  20.97  52.1   1.14824   -1.0  -1.0   1.0  -1.0
 1.0 

In [12]:
############################
# Coefficient-name vectors
############################
# Full coefficient names, obtained by applying the data schema to the formula terms
sch_copd = schema(formulaX, copd.dfInd, contrasts_copd)
vFrmlNames_copd = coefnames(apply_schema(formulaX, sch_copd))

# "pseudo" names: parentheses dropped, then ": " -> "_", then " & " -> "Ξ"
function fix_covar_name(s::String)
    without_parens = replace(s, "(" => "", ")" => "")
    underscored = replace(without_parens, ": " => "_")
    return replace(underscored, " & " => "Ξ")
end
vPseudoFrmlNames_copd = fix_covar_name.(vFrmlNames_copd)

# Positions of the covariate columns, i.e. names that do not mention "Site"
idx_covar_copd = findall(name -> !occursin("Site", name), vFrmlNames_copd)


9-element Vector{Int64}:
 1
 2
 3
 4
 5
 6
 7
 8
 9

In [13]:
# Z is the identity of order size(copd.mY, 2): one column per response, no grouping
mZ_id_copd = let nresp = size(copd.mY, 2)
    Matrix{Float64}(I, nresp, nresp)
end

999×999 Matrix{Float64}:
 1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  …  0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  1.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  1.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  1.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  1.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  1.0  0.0  0.0  …  0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  1.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  1.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  …  0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0

In [14]:
# Center COPD Y: subtract from every column its mean over the rows (dims=1).
# NOTE(review): the column means are computed on all rows, before the train/test
# split performed further down — confirm that this is intended.
Y_copd_centered = copd.mY .- mean(copd.mY; dims=1)

784×999 Matrix{Float64}:
  0.728162   -1.06166      0.521195   …  -0.216604   -0.351333    0.15616
  1.2142     -0.329079     0.17308        1.06625    -0.269919   -0.274296
  0.439633    0.817412    -0.531919      -0.244868   -0.203986   -0.488162
 -1.32416    -0.859182    -1.05515       -0.534012    0.73783     1.16527
  0.879904   -0.747748     2.10626        1.67352     2.16807     0.621652
  1.57041     0.013876     3.62777    …   4.57287    -0.0811238  -0.123824
 -1.18335     1.82516     -0.586831      -0.732048   -0.0968187   0.598783
 -1.03611     1.15481     -1.09194       -0.594452   -0.47619     0.125594
 -1.3051     -0.164123    -1.11712       -0.810295   -0.276888    1.45895
 -1.33534    -0.0770829    1.04624       -0.849367    0.463885    0.787756
 -1.12772    -0.829702    -1.14987    …  -0.521503    0.607445    1.29849
 -0.218956    2.3374      -0.162038       0.681653    1.17479     1.37088
 -0.233852    0.0919344   -0.691715      -0.302834   -1.58415    -1.03996
  ⋮   

In [15]:
"""
Split rows of Y (n×m) and X (n×p) into train/test.

Returns:
  (Y_tr, X_tr, Y_te, X_te, idx_tr, idx_te)
"""
function train_test_split_rows(Y, X; train_frac=0.70, seed=1234, shuffle=true)
    # Keyword arguments:
    #   train_frac  fraction of rows assigned to the training part, in [0, 1];
    #               the training part has floor(train_frac * n) rows
    #   seed        seed of the private MersenneTwister used for shuffling
    #   shuffle     when false, the first rows go to train and the rest to test
    # Throws ArgumentError when train_frac is outside [0, 1] (including NaN).
    n = size(Y, 1)
    @assert size(X, 1) == n "X and Y must have the same number of rows (individuals)."

    # Without this guard an out-of-range fraction only surfaced later as a
    # BoundsError (or InexactError for NaN) from the index arithmetic below.
    0 <= train_frac <= 1 ||
        throw(ArgumentError("train_frac must be between 0 and 1, got $train_frac"))

    # Row order: 1:n, permuted with a generator local to this call so the split
    # is reproducible and the global RNG is left untouched.
    rng = MersenneTwister(seed)
    idx = collect(1:n)
    if shuffle
        Random.shuffle!(rng, idx)
    end

    n_tr = floor(Int, train_frac * n)
    idx_tr = idx[1:n_tr]
    idx_te = idx[n_tr+1:end]

    # Views, not copies: the four returned blocks alias Y and X.
    Y_tr = @view Y[idx_tr, :]
    X_tr = @view X[idx_tr, :]
    Y_te = @view Y[idx_te, :]
    X_te = @view X[idx_te, :]

    return (Y_tr, X_tr, Y_te, X_te, idx_tr, idx_te)
end

# ---- run split COPD Data ----
# 70/30 row split of the centered responses and the design matrix (seed 2026).
# Y_tr/X_tr/Y_te/X_te are views into Y_copd_centered and mXcopd; idx_tr/idx_te
# hold the corresponding row indices.
Y_tr, X_tr, Y_te, X_te, idx_tr, idx_te =
    train_test_split_rows(Y_copd_centered, mXcopd; train_frac=0.70, seed=2026)

# Print the shapes of the four blocks and the two index vectors as a sanity check.
@show size(Y_tr) size(X_tr) size(Y_te) size(X_te)
@show length(idx_tr) length(idx_te)

size(Y_tr) = (548, 999)
size(X_tr) = (548, 10)
size(Y_te) = (236, 999)
size(X_te) = (236, 10)
length(idx_tr) = 548
length(idx_te) = 236


236

In [16]:
# ---------------------------------------------------------
# Coefficients via MatrixLM for training data
# ---------------------------------------------------------
# No intercepts are added by mlm: the X formula already contains the "1" term,
# and Z is the identity matrix built above.
copdZsp_tr = mlm(RawData(Response(Y_tr), Predictors(X_tr, mZ_id_copd)), addXIntercept=false, addZIntercept=false)

Mlm([-1.0024645631586762 0.519137526779583 … 1.8417637941995348 -0.19369749688326215; 0.18054789277205185 -0.030992874265464397 … 0.2562918720462106 -0.04304195694806943; … ; 0.043386118973545294 -0.11001247164523582 … 0.08878738186748097 0.0028458368662201547; 0.032566885931571476 0.06510298417529563 … -0.005776371892123309 0.05989605825761284], [0.19094558275933604 0.18505059964896664 … 0.17650457311105425 0.20200605618807446; 0.0018114254935384033 0.0017555021119351735 … 0.0016744293260894723 0.0019163518461143248; … ; 0.0031902331912025917 0.0030917424562585823 … 0.002948959276254314 0.003375026622571197; 0.0020790479136373373 0.002014862336995913 … 0.001921811749562686 0.0021994762255865205], [0.9293261985271388 -0.16727809162865537 … 0.05737084039894526 -0.3158970106544454; -0.16727809162865537 0.9006354995061193 … -0.15221706386196515 -0.009400393411382399; … ; 0.05737084039894526 -0.15221706386196515 … 0.8590422547699991 0.11373772644062527; -0.3158970106544454 -0.0094003934113

In [17]:
# Coefficient matrix of the training fit: one row per X column, one column per response.
copdZsp_coef_tr = MatrixLM.coef(copdZsp_tr)

10×999 Matrix{Float64}:
 -1.00246      0.519138     0.11624      …   1.84176     -0.193697
  0.180548    -0.0309929    0.175911         0.256292    -0.043042
  0.005388    -0.00479892  -0.00502083      -0.0210259   -0.00940137
  0.0173752    0.00216972   0.00892039      -0.0182736    0.0264112
 -2.67085e-5  -0.00431652  -0.000949421      0.00225464   0.000896327
  0.0151943   -0.00716418   0.0152468    …   0.00612566  -0.00175379
 -0.0712256    0.147453    -0.0105228       -0.158521    -0.0471518
  0.0600663   -0.0425      -0.069813        -0.00677606   0.0527509
  0.0433861   -0.110012     0.0101168        0.0887874    0.00284584
  0.0325669    0.065103     0.127045        -0.00577637   0.0598961

In [18]:
# t-statistics of the training fit, same shape as the coefficient matrix.
copdZsp_tstat_tr = MatrixLM.t_stat(copdZsp_tr)

10×999 Matrix{Float64}:
 -2.29411     1.20681    0.261016  …   0.845393   4.38385    -0.430965
  4.24211    -0.739709   4.05557       2.80265    6.26327    -0.983228
  0.960586   -0.869083  -0.878318     -1.16814   -3.89888    -1.62957
  2.48531     0.315256   1.25199       0.512381  -2.71863     3.67291
 -0.0141263  -2.31911   -0.492725      0.933591   1.24031     0.460911
  2.92981    -1.40325    2.88473   …   0.394548   1.22854    -0.328782
 -1.29961     2.73299   -0.188397      0.513772  -3.00843    -0.836462
  0.738126   -0.530516  -0.841791     -1.84979   -0.0866071   0.630234
  0.768139   -1.97852    0.175752     -0.639241   1.635       0.0489859
  0.71424     1.45037    2.73397      -0.112213  -0.131765    1.27714

In [19]:
# Standard errors backed out element-wise as |coef / t|
# (assumes MatrixLM.t_stat returns coef ./ se — confirm).
# NOTE(review): an entry is NaN wherever coef and t are both exactly zero.
copdZsp_se_tr = abs.(copdZsp_coef_tr ./ copdZsp_tstat_tr)

10×999 Matrix{Float64}:
 0.436973    0.430175    0.445335    …  0.477211    0.420124    0.449451
 0.0425608   0.0418987   0.0433753      0.04648     0.0409198   0.0437762
 0.00560908  0.00552182  0.00571642     0.00612558  0.00539281  0.00576924
 0.00699118  0.00688241  0.00712496     0.00763495  0.00672161  0.00719081
 0.0018907   0.00186128  0.00192688     0.0020648   0.0018178   0.00194469
 0.0051861   0.00510542  0.00528534  …  0.00566365  0.00498613  0.00533419
 0.0548055   0.0539529   0.0558543      0.0598522   0.0526923   0.0563705
 0.0813767   0.0801107   0.0829339      0.0888702   0.078239    0.0837004
 0.0564821   0.0556034   0.057563       0.0616832   0.0543043   0.058095
 0.0455966   0.0448872   0.0464691      0.0497953   0.0438385   0.0468986

In [20]:
# Factorize superclasses and subclasses
# convert to categorical to get integer codes
# (the two dfRef columns are overwritten in place with their categorical versions)
copd.dfRef.SuperClassID = categorical(copd.dfRef.SuperClassID)
copd.dfRef.SubClassID   = categorical(copd.dfRef.SubClassID)

# integer codes (1-based): position of each row's class within levels(...)
# NOTE(review): used below as per-metabolite labels, which assumes dfRef rows
# are in the same order as the columns of copd.mY — confirm.
superclass_of_met = levelcode.(copd.dfRef.SuperClassID)
subclass_of_met   = levelcode.(copd.dfRef.SubClassID)

G = length(levels(copd.dfRef.SuperClassID))  # number of superclasses
H = length(levels(copd.dfRef.SubClassID))    # number of subclasses

106

In [21]:
# Map subclasses to their superclasses
# Entry h is the superclass code of the first metabolite whose subclass equals
# the h-th subclass level.
super_of_sub = Int[
    superclass_of_met[findfirst(==(sublev), copd.dfRef.SubClassID)]
    for sublev in levels(copd.dfRef.SubClassID)
];

In [22]:
# Helper functions for Gibbs sampler

# Half-Cauchy(0, s): a Cauchy centred at zero, truncated to [0, Inf)
function half_cauchy(s)
    return truncated(Cauchy(0.0, s), 0.0, Inf)
end

# Conjugate normal update written in terms of precisions.
# Returns (posterior mean, posterior variance).
@inline function normal_update_from_prec(lik_prec, lik_sum, prior_prec, prior_mean)
    total_prec = lik_prec + prior_prec
    return (lik_sum + prior_prec * prior_mean) / total_prec, 1.0 / total_prec
end

# One draw from InverseGamma(shape, scale)
@inline function ig_sample(shape, scale)
    return rand(InverseGamma(shape, scale))
end

ig_sample (generic function with 1 method)

In [24]:
# Gibbs Sampler for one level hierarchy 

"""
    gibbs_meta_hier_traces_db(b_obs, se_obs, subclass_of_met, H; kwargs...)

Gibbs sampler for a one-level hierarchy of per-metabolite effects.

The updates below correspond to the model

    b_obs[j] ~ Normal(theta[j], se_obs[j]^2)
    theta[j] ~ Normal(beta[subclass_of_met[j]], tau_w2)
    beta[h]  ~ Normal(theta0, tau_v2)
    theta0   ~ Normal(mu0, s0^2)

`tau_w2` and `tau_v2` are refreshed through inverse-gamma draws with the
auxiliary variables `lambda_w` and `lambda_v` (inverse-gamma mixture form of a
half-Cauchy prior with scale `halfcauchy_scale`).

# Arguments
- `b_obs`: observed effects, length m.
- `se_obs`: their standard errors, length m; must be finite and > 0.
- `subclass_of_met`: class label of every metabolite, values in `1:H`.
- `H`: number of classes. Classes without any member are allowed; their
  `beta[h]` is then informed only by `theta0` and `tau_v2`.

# Keywords
- `mu0`, `s0`: mean and standard deviation of the prior on `theta0`.
- `halfcauchy_scale`: scale used in the `lambda_w` / `lambda_v` updates.
- `n_iter`, `burnin`, `thin`: chain length, discarded prefix, thinning step.
  `(n_iter - burnin) ÷ thin` draws are kept.
- `seed`: passed to `Random.seed!`, i.e. the global RNG is reseeded.

# Returns
A named tuple `(; draws_scalar, beta_draws, theta_draws, last_state)`:
- `draws_scalar`: DataFrame with columns `theta0`, `tau_w2`, `tau_v2`.
- `beta_draws`: H × n_keep matrix, one column per kept draw.
- `theta_draws`: m × n_keep matrix, one column per kept draw.
- `last_state`: final values of `theta0`, `tau_w2`, `tau_v2`, `beta`, `theta`.

# Throws
- `AssertionError` for mismatched lengths, labels outside `1:H`,
  `burnin >= n_iter` or `thin < 1`.
- `ArgumentError` for empty input, non-finite `b_obs`, non-positive or
  non-finite `se_obs`, non-positive `s0` / `halfcauchy_scale`, or `burnin < 0`.
"""
function gibbs_meta_hier_traces_db(
    b_obs::Vector{Float64},           # length m
    se_obs::Vector{Float64},          # length m
    subclass_of_met::Vector{Int},     # length m, values in 1..H (Total DB class)
    H::Int;                           # number of DB categories
    mu0::Float64 = 0.0,
    s0::Float64 = 1.0,
    halfcauchy_scale::Float64 = 1.0,
    n_iter::Int = 2000,
    burnin::Int = 500,
    thin::Int = 1,
    seed::Int = 1234,
)
    @assert length(b_obs) == length(se_obs) "b_obs and se_obs must have same length"
    @assert length(subclass_of_met) == length(b_obs) "subclass_of_met must have length m"
    # Check the whole range, not only the maximum: a label below 1 would index
    # out of bounds further down.
    @assert all(h -> 1 <= h <= H, subclass_of_met) "subclass_of_met must be in 1..H"
    @assert burnin < n_iter "burnin must be < n_iter"
    @assert thin ≥ 1

    # Inputs that would otherwise turn the whole chain into NaN, or leave
    # uninitialised storage in the returned arrays.
    isempty(b_obs) && throw(ArgumentError("b_obs must not be empty"))
    all(isfinite, b_obs) || throw(ArgumentError("b_obs must contain finite values"))
    all(se -> isfinite(se) && se > 0, se_obs) ||
        throw(ArgumentError("se_obs must contain finite, strictly positive values"))
    s0 > 0 || throw(ArgumentError("s0 must be > 0, got $s0"))
    halfcauchy_scale > 0 ||
        throw(ArgumentError("halfcauchy_scale must be > 0, got $halfcauchy_scale"))
    burnin >= 0 || throw(ArgumentError("burnin must be >= 0, got $burnin"))

    Random.seed!(seed)
    m = length(b_obs)
    lam_j = 1.0 ./ (se_obs .^ 2)  # observation precisions

    # --- precompute group indices ---
    idx_by_sub = [findall(==(h), subclass_of_met) for h in 1:H]

    # --- initialize state ---
    theta = copy(b_obs)                                     # length m (metabolite-level effects)
    # Class means; a class without members starts at the overall mean, because
    # the mean of an empty vector is NaN and would propagate into theta0.
    theta_mean = mean(theta)
    beta  = [isempty(J) ? theta_mean : mean(theta[J]) for J in idx_by_sub]
    theta0 = mean(beta)                                     # global mean over classes

    tau_w2 = 1.0   # within-class variance for theta_j | beta_h
    tau_v2 = 1.0   # between-class variance for beta_h | theta0

    lambda_w = 1.0    # IG mixture auxiliaries for half-Cauchy
    lambda_v = 1.0

    # --- storage sizes ---
    n_keep = (n_iter - burnin) ÷ thin

    # scalars per draw (for quick monitoring)
    draws_scalar = DataFrame(
        theta0 = Vector{Float64}(undef, n_keep),
        tau_w2 = Vector{Float64}(undef, n_keep),
        tau_v2 = Vector{Float64}(undef, n_keep),
    )

    # vectors per draw
    beta_draws  = Array{Float64}(undef, H, n_keep)   # columns = kept iters
    theta_draws = Array{Float64}(undef, m, n_keep)

    keep_idx = 0

    for it in 1:n_iter
        #######################
        # 1) update theta_j   #
        #######################
        # Precision-weighted combination of the observation b_obs[j] and the
        # current mean of the metabolite's class.
        for j in 1:m
            h = subclass_of_met[j]
            lik_prec   = lam_j[j]
            lik_sum    = lam_j[j] * b_obs[j]
            prior_prec = 1.0 / tau_w2
            prior_mean = beta[h]
            mean_th = (lik_sum + prior_prec * prior_mean) / (lik_prec + prior_prec)
            var_th  = 1.0 / (lik_prec + prior_prec)
            theta[j] = rand(Normal(mean_th, sqrt(var_th)))
        end

        #######################
        # 2) update beta_h    #
        #######################
        for h in 1:H
            J = idx_by_sub[h]
            mh = length(J)

            # likelihood: theta_j | beta_h ~ N(beta_h, tau_w2)
            # (for an empty class both terms are zero, leaving only the prior)
            lik_prec   = mh / tau_w2
            lik_sum    = (1.0 / tau_w2) * sum(theta[J])

            # prior: beta_h | theta0 ~ N(theta0, tau_v2)
            prior_prec = 1.0 / tau_v2
            prior_mean = theta0

            mean_b = (lik_sum + prior_prec * prior_mean) / (lik_prec + prior_prec)
            var_b  = 1.0 / (lik_prec + prior_prec)
            beta[h] = rand(Normal(mean_b, sqrt(var_b)))
        end

        #######################
        # 3) update theta0    #
        #######################
        # beta_h | theta0 ~ N(theta0, tau_v2)
        lik_prec   = H / tau_v2
        lik_sum    = (1.0 / tau_v2) * sum(beta)

        # prior: theta0 ~ N(mu0, s0^2)
        prior_prec = 1.0 / (s0^2)
        prior_mean = mu0

        mean_t0 = (lik_sum + prior_prec * prior_mean) / (lik_prec + prior_prec)
        var_t0  = 1.0 / (lik_prec + prior_prec)
        theta0  = rand(Normal(mean_t0, sqrt(var_t0)))

        ############################################
        # 4) update tau_w2 and tau_v2 via IG mix   #
        ############################################

        # tau_w2: within-class spread of theta_j around beta_{class(j)}
        ssw = sum((theta .- beta[subclass_of_met]).^2)
        tau_w2   = rand(InverseGamma((m + 1.0) / 2.0, 0.5 * ssw + 1.0 / lambda_w))
        lambda_w = rand(InverseGamma(1.0, 1.0 / (halfcauchy_scale^2) + 1.0 / tau_w2))

        # tau_v2: between-class spread of beta_h around theta0
        ssv = sum((beta .- theta0).^2)
        tau_v2   = rand(InverseGamma((H + 1.0) / 2.0, 0.5 * ssv + 1.0 / lambda_v))
        lambda_v = rand(InverseGamma(1.0, 1.0 / (halfcauchy_scale^2) + 1.0 / tau_v2))

        ###################################
        # 5) store post-burn (with thin) #
        ###################################
        if it > burnin && ((it - burnin) % thin == 0)
            keep_idx += 1
            draws_scalar.theta0[keep_idx] = theta0
            draws_scalar.tau_w2[keep_idx] = tau_w2
            draws_scalar.tau_v2[keep_idx] = tau_v2

            beta_draws[:, keep_idx]  .= beta
            theta_draws[:, keep_idx] .= theta
        end
    end

    return (; draws_scalar,
            beta_draws,
            theta_draws,
            last_state = (; theta0, tau_w2, tau_v2,
                          beta = copy(beta), theta = copy(theta)))
end

gibbs_meta_hier_traces_db (generic function with 1 method)

In [25]:
"""
Given theta_draws (m × n_keep), return:
  mean_theta :: Vector{Float64} length m
  sd_theta   :: Vector{Float64} length m
"""
function summarize_theta_draws(theta_draws::AbstractMatrix{<:Real})
    # Reduce across the draws (second dimension): one mean and one
    # sample standard deviation (n - 1 denominator) per row.
    row_means = mean(theta_draws; dims=2)
    row_sds   = std(theta_draws; corrected=true, dims=2)

    # Both reductions come back as m×1 matrices; hand them out as plain vectors.
    return dropdims(row_means; dims=2), dropdims(row_sds; dims=2)
end

summarize_theta_draws

In [27]:
"""
Run one level Gibbs meta-hierarchy for each covariate (row of B_obs/SE_obs).

Returns:
  B_bayes  :: Matrix{Float64}  (p×m) posterior means
  SE_bayes :: Matrix{Float64}  (p×m) posterior SDs   (NOT frequentist SEs)
  res_list :: Vector           results per covariate (optional to keep)
"""
function fit_bayes_all_covariates_db(
    B_obs::AbstractMatrix{<:Real},          # p×m
    SE_obs::AbstractMatrix{<:Real},         # p×m
    subclass_of_met::Vector{Int},
    H::Int;
    mu0::Float64 = 0.0,
    s0::Float64 = 1.0,
    halfcauchy_scale::Float64 = 1.0,
    n_iter::Int = 5000,
    burnin::Int = 1000,
    thin::Int = 1,
    seed0::Int = 42,
    keep_results::Bool = false
)
    # Keyword arguments mu0, s0, halfcauchy_scale, n_iter, burnin and thin are
    # forwarded unchanged to gibbs_meta_hier_traces_db; covariate k is run with
    # seed seed0 + k. res_list stays an empty Any[] unless keep_results is true.
    p, m = size(B_obs)
    @assert size(SE_obs) == (p, m) "SE_obs must have same shape as B_obs"

    B_bayes  = Array{Float64}(undef, p, m)
    SE_bayes = Array{Float64}(undef, p, m)

    res_list = keep_results ? Vector{Any}(undef, p) : Any[]

    for k in 1:p
        # Row k as a dense Float64 vector (the sampler requires Vector{Float64});
        # collecting from a view copies the row only once.
        b_vec  = collect(Float64, view(B_obs, k, :))
        se_vec = collect(Float64, view(SE_obs, k, :))

        res_k = gibbs_meta_hier_traces_db(
            b_vec, se_vec, subclass_of_met, H;
            mu0 = mu0, s0 = s0,
            halfcauchy_scale = halfcauchy_scale,
            n_iter = n_iter, burnin = burnin, thin = thin,
            seed = seed0 + k
        )

        # Posterior mean and SD of every metabolite-level effect for covariate k
        mean_theta, sd_theta = summarize_theta_draws(res_k.theta_draws)

        # store into row k
        B_bayes[k, :]  .= mean_theta
        SE_bayes[k, :] .= sd_theta

        if keep_results
            res_list[k] = res_k
        end

        println("Done covariate k=$k / $p")
    end

    return B_bayes, SE_bayes, res_list
end

fit_bayes_all_covariates_db

In [29]:
# Fit the one-level Bayesian hierarchy to every row (covariate) of the training
# coefficient matrix, grouping metabolites by subclass.
# res_by_cov comes back empty because keep_results = false.
B_bayes_tr, SE_bayes_tr, res_by_cov =
    fit_bayes_all_covariates_db(
        copdZsp_coef_tr, copdZsp_se_tr,
        subclass_of_met, 
        #super_of_sub, 
        #G, 
        H;
        mu0 = 0.0,      # prior mean of theta0 (an empirical mean of the row would be an alternative)
        s0 = 1.0,
        n_iter = 5000, burnin = 1000, thin = 1,
        seed0 = 1000,
        keep_results = false
    )

# Both outputs have one row per covariate and one column per metabolite.
@show size(B_bayes_tr) size(SE_bayes_tr)

Done covariate k=1 / 10
Done covariate k=2 / 10
Done covariate k=3 / 10
Done covariate k=4 / 10
Done covariate k=5 / 10
Done covariate k=6 / 10
Done covariate k=7 / 10
Done covariate k=8 / 10
Done covariate k=9 / 10
Done covariate k=10 / 10
size(B_bayes_tr) = (10, 999)
size(SE_bayes_tr) = (10, 999)


(10, 999)

In [30]:
"""
Predict Yhat and compute MSE summaries.

Inputs
  Y_te :: (n_te × m) matrix
  X_te :: (n_te × p) matrix
  B    :: (p × m) coefficient matrix

Returns
  mse_all :: Float64
  mse_met :: Vector{Float64} length m   (MSE per metabolite)
  mse_ind :: Vector{Float64} length n_te (MSE per individual)
"""
function test_mse(Y_te::AbstractMatrix{<:Real},
                  X_te::AbstractMatrix{<:Real},
                  B::AbstractMatrix{<:Real})

    # Shapes must chain: (n_obs × n_pred) * (n_pred × n_resp) against (n_obs × n_resp)
    n_obs, n_resp = size(Y_te)
    n_pred = size(X_te, 2)
    @assert size(X_te, 1) == n_obs "X_te and Y_te must have same number of rows"
    @assert size(B) == (n_pred, n_resp) "B must be p×m with p=size(X_te,2), m=size(Y_te,2)"

    # Residuals of the linear prediction X_te * B
    resid = Y_te - X_te * B

    # Mean squared residual per column (metabolite) and per row (individual)
    per_metabolite = mean(abs2, resid; dims=1)
    per_individual = mean(abs2, resid; dims=2)

    # Overall MSE first, then the two marginal summaries flattened to vectors
    return mean(abs2, resid), vec(per_metabolite), vec(per_individual)
end



test_mse

In [35]:
# Ensure Float64 inputs
# (Y_te and X_te are views from the split; broadcasting Float64 over them
# produces new dense matrices)
Y_te_f = Float64.(Y_te)
X_te_f = Float64.(X_te)

# Training-set coefficient matrices of the two methods
B_mlm  = Float64.(copdZsp_coef_tr)
B_bys  = Float64.(B_bayes_tr)

# Held-out MSE: overall, per metabolite, per individual
mse_mlm, mse_mlm_met, mse_mlm_ind = test_mse(Y_te_f, X_te_f, B_mlm)
mse_bys, mse_bys_met, mse_bys_ind = test_mse(Y_te_f, X_te_f, B_bys)

println("Test MSE (MatrixLM): ", mse_mlm)
println("Test MSE (Bayes):    ", mse_bys)
# Fraction by which the Bayes MSE is below the MatrixLM MSE
println("Relative improvement (positive is better): ",
        (mse_mlm - mse_bys) / mse_mlm)

Test MSE (MatrixLM): 0.9723541740310366
Test MSE (Bayes):    0.9719145182946919
Relative improvement (positive is better): 0.0004521559613634289


In [36]:
# Print the full per-individual MSE vectors (one entry per test row) for both methods.
println("Individual MSE (MatrixLM): ", mse_mlm_ind)
println("Individual MSE (Bayes):    ", mse_bys_ind)

Individual MSE (MatrixLM): [0.7972851096303735, 0.7189728724899078, 1.8628344777367793, 0.8387912412366247, 0.7788584590226225, 1.5121830144630362, 1.064221842835165, 0.7316916398586963, 0.7208818892735704, 0.7938268239130176, 0.8837423973743314, 0.6706816357452142, 1.060861894529557, 0.7717897550289345, 0.755450810471406, 1.1414557134640058, 1.8851572803080952, 1.1339842345663889, 0.7654168883374015, 0.7762999753675847, 1.3040621385499365, 0.586309689280009, 1.1641711650731303, 1.0166410556078869, 0.6158551697632408, 1.5089870113810218, 0.8600937640135744, 0.9406045037210407, 0.7171887862765493, 0.8212428698157554, 0.9147072327804345, 1.13131225235871, 0.5639908591926286, 0.6049847308918864, 0.954626286280691, 1.6307457698743553, 0.9596482655214832, 1.540209683849023, 0.7362379372123021, 0.8599882167214867, 0.5123525605544922, 2.055032405119944, 1.047041754135167, 1.312152040319443, 1.1768482209484834, 0.7170709238321389, 0.9505500773960377, 0.9263699429583193, 0.7012572673347371, 2.7