In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle

from sklearn.preprocessing import StandardScaler

import cmdstanpy
from cmdstanpy import CmdStanModel

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the modelling table from a CSV file located in the working directory.
df = pd.read_csv(filepath_or_buffer="df_reduced.csv")

In [3]:
# Display the dimensions of the loaded frame as a (rows, columns) tuple.
df.shape

(1021, 19)

In [4]:
# Stan program: multivariate (K-outcome) linear mixed model with a
# donor-level random intercept vector and correlated residuals.
#
# Compared with a per-row likelihood loop, the linear predictor is formed with
# one matrix product and the likelihood is a single vectorized
# multi_normal_cholesky statement, so shared terms are not recomputed for each
# observation. Parameter and generated-quantity names, and their declaration
# order, are kept so downstream summaries line up.
stan_code = r"""
data {
  int<lower=1> N;                       // total observations
  int<lower=1> I;                       // number of donors
  int<lower=1> K;                       // number of target variables (K = 5)
  int<lower=1> P;                       // number of covariates

  array[N] int<lower=1, upper=I> id;     // donor index for each observation
  matrix[N, K] Y;                       // log target variables (preferably centered)
  matrix[N, P] X;                       // covariate matrix
}

transformed data {
  // Row-wise copy of Y, built once, so the likelihood can be written as a
  // single vectorized statement over an array of row vectors.
  array[N] row_vector[K] Y_rows;
  for (n in 1:N) {
    Y_rows[n] = Y[n];
  }
}

parameters {
  // Fixed effects
  matrix[P, K] beta;

  // Random effects
  // standard normal latent variables (non-centered parameterization)
  matrix[I, K] z_b;

  // random-effect standard deviations
  vector<lower=0>[K] tau_b;

  // Cholesky factor of the random-effect correlation matrix
  cholesky_factor_corr[K] L_Omega_b;

  // Residual covariance
  // residual standard deviations
  vector<lower=0>[K] tau_eps;

  // Cholesky factor of the residual correlation matrix
  cholesky_factor_corr[K] L_Omega_eps;
}

transformed parameters {
  // Cholesky of covariance for random effects
  matrix[K, K] L_Sigma_b = diag_pre_multiply(tau_b, L_Omega_b);

  // Donor effects (I x K): row i equals (L_Sigma_b * z_b[i]')'
  matrix[I, K] b = z_b * L_Sigma_b';

  // Cholesky of residual covariance
  matrix[K, K] L_Sigma = diag_pre_multiply(tau_eps, L_Omega_eps);
}

model {
  // Prior for fixed effects
  to_vector(beta) ~ normal(0, 2);

  // Priors for random effects
  tau_b ~ student_t(3, 0, 1);
  L_Omega_b ~ lkj_corr_cholesky(2);
  to_vector(z_b) ~ normal(0, 1);

  // Prior for residual covariance
  tau_eps ~ normal(0, 0.5) T[0, ];
  L_Omega_eps ~ lkj_corr_cholesky(4);

  // Likelihood: one matrix product for all means, one vectorized density call
  {
    matrix[N, K] mu_all = X * beta + b[id, :];
    array[N] row_vector[K] mu_rows;
    for (n in 1:N) {
      mu_rows[n] = mu_all[n];
    }
    Y_rows ~ multi_normal_cholesky(mu_rows, L_Sigma);
  }
}

generated quantities {
  // Pointwise log-likelihood (one value per observation)
  vector[N] log_lik;

  // Matrix to store posterior predictions (N x K)
  // Essential for posterior predictive checks (PPC)
  matrix[N, K] Y_rep;

  // Mean prediction for each observation (N x K)
  matrix[N, K] mu = X * beta + b[id, :];

  // Full covariance matrices (K x K) from Cholesky factors
  cov_matrix[K] Sigma_b = tcrossprod(L_Sigma_b);
  cov_matrix[K] Sigma_eps = tcrossprod(L_Sigma);

  // Correlation matrices (K x K) from Cholesky factors
  corr_matrix[K] Omega_b = tcrossprod(L_Omega_b);
  corr_matrix[K] Omega_eps = tcrossprod(L_Omega_eps);

  for (n in 1:N) {
    // Column-vector form of the mean, as used by the lpdf/rng calls below
    vector[K] mu_n_vec = mu[n]';

    // Pointwise Log-likelihood
    log_lik[n] = multi_normal_cholesky_lpdf(Y[n]' | mu_n_vec, L_Sigma);

    // Posterior Prediction
    Y_rep[n] = multi_normal_cholesky_rng(mu_n_vec, L_Sigma)';
  }
}
"""
# Persist the Stan program next to the notebook; CmdStan compiles from a file.
stan_file = "Model_6.stan"
with open(stan_file, mode="w") as stan_handle:
    stan_handle.write(stan_code)

print("Stan model written to:", stan_file)

# Instantiate the model from the written file and report once that returns.
model = CmdStanModel(stan_file=stan_file)
print("Model compiled.")

12:55:26 - cmdstanpy - INFO - compiling stan file /Users/eli/Desktop/BS_project/Model_6.stan to exe file /Users/eli/Desktop/BS_project/Model_6


Stan model written to: Model_6.stan


12:55:39 - cmdstanpy - INFO - compiled model executable: /Users/eli/Desktop/BS_project/Model_6


Model compiled.


In [5]:
# --- Configuration -----------------------------------------------------------
ID_COL = "CAI"            # column holding the donor identifier
ADD_INTERCEPT = False     # when True, a leading column of ones is added to X
target_list = ['PMAX', 'Glucosio', 'Trigliceridi', 'Colesterolo_Hdl', 'BMI']

# Every column that is neither a target nor the donor id is used as a covariate.
covariate_cols = df.columns.drop(list(target_list) + [ID_COL])

# --- Complete-case modelling frame --------------------------------------------
cols_needed = [ID_COL] + target_list + list(covariate_cols)
df_model = df[cols_needed].dropna().copy()

# --- Outcome and design matrices ----------------------------------------------
Y_mat = df_model[target_list].to_numpy(dtype=float)
N, K = Y_mat.shape
X_mat = df_model[covariate_cols].to_numpy(dtype=float)
if ADD_INTERCEPT:
    # Honour the flag: prepend the intercept column so it becomes beta[1, :].
    X_mat = np.column_stack([np.ones(N), X_mat])
_, P = X_mat.shape

# Fail early with a readable message instead of an opaque Stan error later on.
if N == 0:
    raise ValueError("No complete rows left after dropna(); cannot build Stan data.")
if not (np.isfinite(Y_mat).all() and np.isfinite(X_mat).all()):
    raise ValueError("Y or X contains non-finite values (inf); clean the data before fitting.")

# --- Donor index ----------------------------------------------------------------
# np.unique(..., return_inverse=True) yields 0-based positions into donor_ids;
# the Stan program declares `id` with lower=1, hence the +1 shift.
donor_ids, id_index = np.unique(df_model[ID_COL].to_numpy(), return_inverse=True)
I = len(donor_ids)
id_stan = id_index + 1

# --- Data dictionary; keys match the names in the Stan `data` block -------------
stan_data = {
    "N": int(N),
    "I": int(I),
    "K": int(K),
    "P": int(P),
    "Y": Y_mat,
    "X": X_mat,
    "id": id_stan
}

In [6]:
# Fixed RNG seed so that re-running the notebook reproduces the same draws;
# without it every execution of this cell yields a different posterior sample.
SAMPLER_SEED = 20240521

# Run the sampler: 4 chains in parallel, 1500 warmup and 1000 sampling
# iterations per chain, with a raised adapt_delta and max_treedepth.
fit = model.sample(
    data=stan_data,
    chains=4,
    parallel_chains=4,
    iter_warmup=1500,
    iter_sampling=1000,
    adapt_delta=0.9,
    max_treedepth=12,
    seed=SAMPLER_SEED,
    show_progress=True
)
# Print the sampler diagnostics report for the finished run.
print(fit.diagnose())

12:55:40 - cmdstanpy - INFO - CmdStan start processing
chain 1:   0%|[33m                               [0m| 0/2500 [00:00<?, ?it/s, (Warmup)][0m
chain 2:   0%|[33m                               [0m| 0/2500 [00:00<?, ?it/s, (Warmup)][0m[A

chain 3:   0%|[33m                               [0m| 0/2500 [00:00<?, ?it/s, (Warmup)][0m[A[A


chain 4:   0%|[33m                               [0m| 0/2500 [00:00<?, ?it/s, (Warmup)][0m[A[A[A


chain 4:   4%|[33m▊                  [0m| 100/2500 [02:43<1:05:29,  1.64s/it, (Warmup)][0m[A[A[A

chain 1:   4%|[33m▊                  [0m| 100/2500 [03:02<1:12:58,  1.82s/it, (Warmup)][0m[A[A
chain 2:   4%|[33m▊                  [0m| 100/2500 [03:09<1:15:51,  1.90s/it, (Warmup)][0m[A


chain 4:   8%|[33m█▌                 [0m| 200/2500 [06:05<1:11:25,  1.86s/it, (Warmup)][0m[A[A[A
chain 1:   8%|[33m█▌                 [0m| 200/2500 [06:39<1:17:46,  2.03s/it, (Warmup)][0m[A

chain 3:   8%|[33m█▌                 [0m

                                                                                                                                                                                                                                                                                                                                


14:23:07 - cmdstanpy - INFO - CmdStan done processing.
Exception: lkj_corr_cholesky_lpdf: Random variable[2] is 0, but must be positive! (in 'Model_6.stan', line 54, column 2 to column 35)
	Exception: lkj_corr_cholesky_lpdf: Random variable[2] is 0, but must be positive! (in 'Model_6.stan', line 54, column 2 to column 35)
	Exception: lkj_corr_cholesky_lpdf: Random variable[2] is 0, but must be positive! (in 'Model_6.stan', line 54, column 2 to column 35)
	Exception: lkj_corr_cholesky_lpdf: Random variable[2] is 0, but must be positive! (in 'Model_6.stan', line 54, column 2 to column 35)
	Exception: lkj_corr_cholesky_lpdf: Random variable[2] is 0, but must be positive! (in 'Model_6.stan', line 54, column 2 to column 35)
	Exception: lkj_corr_cholesky_lpdf: Random variable[2] is 0, but must be positive! (in 'Model_6.stan', line 54, column 2 to column 35)
	Exception: lkj_corr_cholesky_lpdf: Random variable[2] is 0, but must be positive! (in 'Model_6.stan', line 54, column 2 to column 35)





	Chain 1 had 86 iterations at max treedepth (8.6%)
	Chain 2 had 72 iterations at max treedepth (7.2%)
	Chain 3 had 74 iterations at max treedepth (7.4%)
	Chain 4 had 60 iterations at max treedepth (6.0%)
	Use the "diagnose()" method on the CmdStanMCMC object to see further information.


Checking sampler transitions treedepth.
292 of 4000 (7.30%) transitions hit the maximum treedepth limit of 12, or 2^12 leapfrog steps.
Trajectories that are prematurely terminated due to this limit will result in slow exploration.
For optimal performance, increase this limit.

Checking sampler transitions for divergences.
No divergent transitions found.

Checking E-BFMI - sampler transitions HMC potential energy.
E-BFMI satisfactory.

Rank-normalized split effective sample size satisfactory for all parameters.

Rank-normalized split R-hat values satisfactory for all parameters.

Processing complete.



In [7]:
# Show the sampler diagnostics report, then build the per-parameter summary
# table and display its first 50 rows.
print(fit.diagnose())
summary_df = fit.summary()
summary_df.iloc[:50]

Checking sampler transitions treedepth.
292 of 4000 (7.30%) transitions hit the maximum treedepth limit of 12, or 2^12 leapfrog steps.
Trajectories that are prematurely terminated due to this limit will result in slow exploration.
For optimal performance, increase this limit.

Checking sampler transitions for divergences.
No divergent transitions found.

Checking E-BFMI - sampler transitions HMC potential energy.
E-BFMI satisfactory.

Rank-normalized split effective sample size satisfactory for all parameters.

Rank-normalized split R-hat values satisfactory for all parameters.

Processing complete.



Unnamed: 0,Mean,MCSE,StdDev,MAD,5%,50%,95%,ESS_bulk,ESS_tail,ESS_bulk/s,R_hat
lp__,10150.7,0.531891,15.1299,14.8141,10125.4,10150.7,10175.3,822.767,1885.94,0.099422,1.00121
"beta[1,1]",-0.001806,3.6e-05,0.002621,0.002576,-0.006121,-0.001828,0.00234,5273.83,3017.99,0.637281,1.00252
"beta[1,2]",0.006568,5.1e-05,0.003582,0.00365,0.000715,0.006618,0.012426,4921.98,3389.83,0.594765,0.999886
"beta[1,3]",0.025132,0.000157,0.011932,0.012049,0.005467,0.024945,0.044682,5860.01,3382.31,0.708115,1.00138
"beta[1,4]",0.000861,5.3e-05,0.004218,0.004084,-0.00594,0.000839,0.007828,6372.03,3546.14,0.769986,0.999708
"beta[1,5]",0.003537,2.2e-05,0.001621,0.001656,0.000902,0.003541,0.006143,5703.91,3216.86,0.689251,0.99991
"beta[2,1]",-0.00346,4.1e-05,0.00305,0.003046,-0.008498,-0.003511,0.001503,5536.37,3528.29,0.669006,1.00017
"beta[2,2]",0.00274,5.6e-05,0.003906,0.003953,-0.003541,0.002766,0.00908,4987.15,3248.05,0.60264,1.00137
"beta[2,3]",0.098953,0.0002,0.013142,0.013038,0.077452,0.098904,0.120324,4392.52,3059.67,0.530786,1.00027
"beta[2,4]",0.017702,6.6e-05,0.004779,0.004771,0.009884,0.017734,0.025619,5333.03,3548.43,0.644435,1.00061


In [8]:
# Serialize the fit object to disk so it can be reloaded in a later session.
with open("Model_6.pkl", mode="wb") as pickle_handle:
    pickle_handle.write(pickle.dumps(fit))