In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle

from sklearn.preprocessing import StandardScaler

import cmdstanpy
from cmdstanpy import CmdStanModel

In [7]:
# Read the reduced dataset from the working directory and report its
# (rows, columns) shape as a quick sanity check.
csv_path = "df_reduced.csv"
df = pd.read_csv(csv_path)
print(df.shape)

(1021, 19)


In [8]:
# Display the column labels of the loaded frame; the modelling cell below
# selects the ID column, the targets and the covariates from these names.
df.columns

Index(['CAI', 'Trigliceridi', 'Colesterolo_Hdl', 'Glucosio', 'PMAX', 'BMI',
       'Alanina_aminotransferasi_alt', 'Colesterolo_totale',
       'Distribuzione_di_volume', 'Ematocrito_hct', 'Eosinofili_perc',
       'Leucociti_wbc', 'Linfociti_perc', 'Monociti_perc', 'Piastrine',
       'Polso', 'Proteine_totali', 'Volume_medio', 'Eta'],
      dtype='object')

In [9]:
# Stan program: multivariate (K-outcome) linear mixed model with a
# donor-level random intercept vector and correlated residuals.
#   Y[n] ~ MVN(X[n] * beta + b[id[n]], Sigma_eps),   b_i ~ MVN(0, Sigma_b)
# Both covariance matrices are parameterised as diag(tau) * L_Omega
# (Cholesky factor of a correlation matrix with an LKJ prior), and the
# random effects use a non-centred parameterisation.
stan_code = r"""
data {
  int<lower=1> N;                       // total observations
  int<lower=1> I;                       // number of donors
  int<lower=1> K;                       // number of target variables (K = 5)
  int<lower=1> P;                       // number of covariates

  array[N] int<lower=1, upper=I> id;     // donor index for each observation
  matrix[N, K] Y;                       // log target variables (preferably centered)
  matrix[N, P] X;                       // covariate matrix
}

transformed data {
  // Row-wise copy of Y so the likelihood can be written as a single
  // vectorized multi_normal_cholesky statement.
  array[N] row_vector[K] Y_rows;
  for (n in 1:N) {
    Y_rows[n] = Y[n];
  }
}

parameters {
  // Fixed effects
  matrix[P, K] beta;

  // Random effects
  // standard normal latent variables
  matrix[I, K] z_b;

  // deviations (sd) of random effects
  vector<lower=0>[K] tau_b;

  // correlation between components of random effects
  cholesky_factor_corr[K] L_Omega_b;

  // Residual covariance
  // residual standard deviations
  vector<lower=0>[K] tau_eps;

  // residual correlation
  cholesky_factor_corr[K] L_Omega_eps;
}

transformed parameters {
  // Cholesky of covariance for random effects
  matrix[K, K] L_Sigma_b = diag_pre_multiply(tau_b, L_Omega_b);

  // random effects non-centered: b_i = L_Sigma_b * z_b_i,
  // i.e. row-wise b = z_b * L_Sigma_b'   (I x K)
  matrix[I, K] b = z_b * L_Sigma_b';

  // Cholesky of residual covariance
  matrix[K, K] L_Sigma = diag_pre_multiply(tau_eps, L_Omega_eps);
}

model {
  // Prior for fixed effects
  to_vector(beta) ~ normal(0, 2);

  // Priors for random effects hyperparameters.
  // The <lower=0> constraint already makes these half-normal; an explicit
  // T[0, ] truncation would only add a constant to the log density.
  tau_b ~ normal(0, 0.5);              // sd of random effects
  L_Omega_b ~ lkj_corr_cholesky(4);    // moderate prior on correlation
  to_vector(z_b) ~ normal(0, 1);      // non-centered

  // Priors for residual covariance
  tau_eps ~ normal(0, 0.5);            // sd of residuals
  L_Omega_eps ~ lkj_corr_cholesky(4);

  // Likelihood (vectorized over observations)
  {
    matrix[N, K] eta = X * beta + b[id];   // linear predictor, N x K
    array[N] row_vector[K] eta_rows;
    for (n in 1:N) {
      eta_rows[n] = eta[n];
    }
    Y_rows ~ multi_normal_cholesky(eta_rows, L_Sigma);
  }
}

generated quantities {
  vector[N] log_lik;

  matrix[N, K] Y_rep;

  // Mean prediction for each observation (N x K)
  matrix[N, K] mu = X * beta + b[id];

  // Full covariance matrices (K x K) from Cholesky factors
  cov_matrix[K] Sigma_b = tcrossprod(L_Sigma_b);
  cov_matrix[K] Sigma_eps = tcrossprod(L_Sigma);

  // Correlation matrices (K x K) from Cholesky factors
  corr_matrix[K] Omega_b = tcrossprod(L_Omega_b);
  corr_matrix[K] Omega_eps = tcrossprod(L_Omega_eps);

  for (n in 1:N) {
    vector[K] mu_n_vec = mu[n]';

    // Pointwise Log-likelihood
    log_lik[n] = multi_normal_cholesky_lpdf(Y[n]' | mu_n_vec, L_Sigma);

    // Posterior Prediction
    Y_rep[n] = multi_normal_cholesky_rng(mu_n_vec, L_Sigma)';
  }
}
"""
stan_file = "Model_5.stan"

# Only touch the file when the program text actually changed: rewriting it
# unconditionally bumps its modification time, which presumably makes
# CmdStanModel treat the existing executable as stale and recompile on
# every run of this cell (verify against the installed cmdstanpy version).
try:
    with open(stan_file, "r", encoding="utf-8") as f:
        stan_file_is_current = f.read() == stan_code
except FileNotFoundError:
    stan_file_is_current = False

if stan_file_is_current:
    print("Stan model unchanged:", stan_file)
else:
    with open(stan_file, "w", encoding="utf-8") as f:
        f.write(stan_code)
    print("Stan model written to:", stan_file)

model = CmdStanModel(stan_file=stan_file)
print("Model compiled.")

11:45:10 - cmdstanpy - INFO - compiling stan file /Users/eli/Desktop/BS_project/Model_5.stan to exe file /Users/eli/Desktop/BS_project/Model_5


Stan model written to: Model_5.stan


11:45:23 - cmdstanpy - INFO - compiled model executable: /Users/eli/Desktop/BS_project/Model_5


Model compiled.


In [10]:
# Assemble the data dictionary for the Stan model from `df`:
#   Y  (N x K) target columns, X (N x P) all remaining columns,
#   id (N)     1-based donor index derived from ID_COL.
ID_COL = "CAI"
ADD_INTERCEPT = False  # True -> prepend a column of ones to X (beta row 1 becomes the intercept)
target_list = ['PMAX', 'Glucosio', 'Trigliceridi', 'Colesterolo_Hdl', 'BMI']
# Every column that is neither a target nor the donor ID is a covariate.
covariate_cols = df.columns.drop(list(target_list) + [ID_COL])

# Keep only complete rows so Y, X and id stay aligned.
cols_needed = [ID_COL] + target_list + list(covariate_cols)
df_model = df[cols_needed].dropna().copy()

Y_mat = df_model[target_list].to_numpy(dtype=float)
N, K = Y_mat.shape
if N == 0:
    raise ValueError("No complete rows left after dropna(); cannot build Stan data.")

X_mat = df_model[covariate_cols].to_numpy(dtype=float)
if ADD_INTERCEPT:
    X_mat = np.column_stack([np.ones(N), X_mat])
_, P = X_mat.shape

# dropna() removes NaN but not +/-inf, which would break the likelihood.
if not (np.isfinite(Y_mat).all() and np.isfinite(X_mat).all()):
    raise ValueError("Y or X contains non-finite values.")

# Map donor labels to contiguous indices; Stan indexing is 1-based.
donor_ids, id_index = np.unique(df_model[ID_COL].to_numpy(), return_inverse=True)
I = len(donor_ids)
id_stan = id_index + 1

stan_data = {
    "N": int(N),
    "I": int(I),
    "K": int(K),
    "P": int(P),
    "Y": Y_mat,
    "X": X_mat,
    "id": id_stan
}

In [11]:
# Run NUTS on the compiled model: 4 chains in parallel, 1500 warmup and
# 1000 sampling iterations each, with a raised adapt_delta / max_treedepth.
# A fixed seed makes this long-running fit reproducible across kernel restarts.
SAMPLER_SEED = 12345
fit = model.sample(
    data=stan_data,
    chains=4,
    parallel_chains=4,
    iter_warmup=1500,
    iter_sampling=1000,
    adapt_delta=0.9,
    max_treedepth=12,
    seed=SAMPLER_SEED,
    show_progress=True
)
# Print CmdStan's diagnostic report (treedepth, divergences, E-BFMI, ESS, R-hat).
print(fit.diagnose())

11:45:25 - cmdstanpy - INFO - CmdStan start processing
chain 1:   0%|[33m                               [0m| 0/2500 [00:00<?, ?it/s, (Warmup)][0m
chain 2:   0%|[33m                               [0m| 0/2500 [00:00<?, ?it/s, (Warmup)][0m[A

chain 3:   0%|[33m                               [0m| 0/2500 [00:00<?, ?it/s, (Warmup)][0m[A[A


chain 4:   0%|[33m                               [0m| 0/2500 [00:00<?, ?it/s, (Warmup)][0m[A[A[A


chain 1:   4%|[33m▊                  [0m| 100/2500 [02:44<1:05:36,  1.64s/it, (Warmup)][0m[A[A[A

chain 3:   4%|[33m▊                  [0m| 100/2500 [02:45<1:06:16,  1.66s/it, (Warmup)][0m[A[A
chain 2:   4%|[33m▊                  [0m| 100/2500 [02:54<1:09:59,  1.75s/it, (Warmup)][0m[A


chain 1:   8%|[33m█▌                 [0m| 200/2500 [05:36<1:04:44,  1.69s/it, (Warmup)][0m[A[A[A
chain 2:   8%|[33m█▌                 [0m| 200/2500 [05:50<1:07:09,  1.75s/it, (Warmup)][0m[A

chain 3:   8%|[33m█▌                 [0m

                                                                                                                                                                                                                                                                                                                                


12:44:01 - cmdstanpy - INFO - CmdStan done processing.
Exception: lkj_corr_cholesky_lpdf: Random variable[2] is 0, but must be positive! (in 'Model_5.stan', line 57, column 2 to column 35)
	Exception: lkj_corr_cholesky_lpdf: Random variable[2] is 0, but must be positive! (in 'Model_5.stan', line 57, column 2 to column 35)
	Exception: lkj_corr_cholesky_lpdf: Random variable[2] is 0, but must be positive! (in 'Model_5.stan', line 57, column 2 to column 35)
	Exception: lkj_corr_cholesky_lpdf: Random variable[2] is 0, but must be positive! (in 'Model_5.stan', line 57, column 2 to column 35)
	Exception: lkj_corr_cholesky_lpdf: Random variable[2] is 0, but must be positive! (in 'Model_5.stan', line 57, column 2 to column 35)
	Exception: lkj_corr_cholesky_lpdf: Random variable[2] is 0, but must be positive! (in 'Model_5.stan', line 57, column 2 to column 35)
	Exception: lkj_corr_cholesky_lpdf: Random variable[3] is 0, but must be positive! (in 'Model_5.stan', line 57, column 2 to column 35)





	Chain 3 had 1 iterations at max treedepth (0.1%)
	Use the "diagnose()" method on the CmdStanMCMC object to see further information.


Checking sampler transitions treedepth.
1 of 4000 (0.03%) transitions hit the maximum treedepth limit of 12, or 2^12 leapfrog steps.
Trajectories that are prematurely terminated due to this limit will result in slow exploration.
For optimal performance, increase this limit.

Checking sampler transitions for divergences.
No divergent transitions found.

Checking E-BFMI - sampler transitions HMC potential energy.
E-BFMI satisfactory.

Rank-normalized split effective sample size satisfactory for all parameters.

Rank-normalized split R-hat values satisfactory for all parameters.

Processing complete.



In [12]:
# Show the sampler diagnostics report, then build the posterior summary
# table and display its first 50 rows.
diagnostics_report = fit.diagnose()
print(diagnostics_report)

summary_df = fit.summary()
summary_df.head(50)

Checking sampler transitions treedepth.
1 of 4000 (0.03%) transitions hit the maximum treedepth limit of 12, or 2^12 leapfrog steps.
Trajectories that are prematurely terminated due to this limit will result in slow exploration.
For optimal performance, increase this limit.

Checking sampler transitions for divergences.
No divergent transitions found.

Checking E-BFMI - sampler transitions HMC potential energy.
E-BFMI satisfactory.

Rank-normalized split effective sample size satisfactory for all parameters.

Rank-normalized split R-hat values satisfactory for all parameters.

Processing complete.



Unnamed: 0,Mean,MCSE,StdDev,MAD,5%,50%,95%,ESS_bulk,ESS_tail,ESS_bulk/s,R_hat
lp__,10062.7,0.534746,15.0455,15.3612,10037.4,10063.1,10086.9,798.369,1541.43,0.150732,1.00721
"beta[1,1]",-0.001793,3.7e-05,0.002743,0.002728,-0.006244,-0.001783,0.002851,5636.28,3249.28,1.06413,0.999794
"beta[1,2]",0.006693,4.7e-05,0.003614,0.003561,0.000675,0.006646,0.012737,5971.17,3425.31,1.12735,1.00057
"beta[1,3]",0.025227,0.000154,0.012139,0.012059,0.005184,0.025535,0.045247,6327.09,3022.06,1.19455,1.00025
"beta[1,4]",0.000735,5.5e-05,0.004242,0.004199,-0.006267,0.000744,0.00774,6082.87,3466.93,1.14844,1.00038
"beta[1,5]",0.003557,2.3e-05,0.001633,0.001644,0.000864,0.003576,0.006236,5283.79,3049.81,0.997577,1.00121
"beta[2,1]",-0.003393,4.3e-05,0.003096,0.003125,-0.008544,-0.003419,0.00174,5377.61,2693.08,1.01529,1.00167
"beta[2,2]",0.002746,5e-05,0.003934,0.00395,-0.003604,0.002646,0.009215,6064.54,3090.19,1.14498,1.00104
"beta[2,3]",0.099826,0.000187,0.013215,0.013429,0.078192,0.0998,0.121755,5026.23,3613.73,0.94895,1.00014
"beta[2,4]",0.017572,6.8e-05,0.004742,0.004669,0.009576,0.017664,0.025257,4876.33,3116.6,0.92065,1.00011


In [13]:
# Persist the fitted object to disk with pickle.
# NOTE(review): the sampler was run without an explicit output directory, so
# the underlying CmdStan CSV files presumably live in a temporary location;
# confirm that everything needed later survives in the pickle.
# Only unpickle this file from a trusted source.
pickle_path = "Model_5.pkl"
with open(pickle_path, "wb") as out_file:
    pickle.dump(fit, out_file)