In [None]:
import os
import numpy as np
import pandas as pd
import json
import seaborn as sns
import matplotlib.pyplot as plt
import altair as alt

In [None]:
from cmdstanpy import cmdstan_path, set_cmdstan_path

# Set the path to the CmdStan installation.
# An explicit CMDSTAN environment variable takes precedence, so the notebook can
# run on machines other than the author's; if it is unset we fall back to the
# original hard-coded location under the system drive.
system_drive = os.environ.get("SystemDrive", "C:")
print(system_drive)
default_cmdstan_dir = os.path.join(system_drive + os.sep, "Users", "fonta", "cmdstan")
path_to_cmdstan = os.environ.get("CMDSTAN", default_cmdstan_dir)
print(path_to_cmdstan)
set_cmdstan_path(path_to_cmdstan)

# show the path cmdstanpy will actually use
cmdstan_path()

In [None]:
from cmdstanpy import CmdStanModel

In [None]:
# for reproducibility: seed NumPy's global RNG, which the np.random.* calls
# in this notebook draw from
__NP_SEED__ = 23456789
np.random.seed(__NP_SEED__)

# Code-along Chapter 16

## Code 1: Main Blocks of a Stan code and exploring cmdstanpy outputs

First example, from p. 371.

In [None]:
# Stan program for Code 1; it is looked up in the current working directory
MODEL_filename = "code1_pp371.stan"

In [None]:
# Sampler settings reused by several model.sample() calls below
MCMC_chains = 4  # passed as `chains`
MCMC_warm_up_iters = 100  # passed as `iter_warmup`
MCMC_sampling_iters = 200  # passed as `iter_sampling`

In [None]:
# Simulate the Code 1 observations: i.i.d. normal draws with known parameters
true_mu, true_sigma = 1.5, 0.2
num_samples = 100
y_data = np.random.normal(true_mu, true_sigma, num_samples)

# data block handed to Stan: sample size N and observations Y
data = dict(N=num_samples, Y=y_data)

In [None]:
# Build the CmdStanModel for Code 1 from the .stan file in the working directory
stan_model_parent_dir = os.getcwd()
print(stan_model_parent_dir)
stan_file_path = os.path.join(stan_model_parent_dir, MODEL_filename)
model = CmdStanModel(stan_file=stan_file_path)
# show the model description and the executable's build information
print(model)
print(model.exe_info())

In [None]:
# inspect the data dictionary that is passed to model.sample()
data

In [None]:
# Draw posterior samples for the Code 1 model.
# The seed is passed explicitly: np.random.seed() only seeds NumPy, not the
# sampler run by CmdStan, so without it the draws change on every run.
fit = model.sample(
    data=data,
    chains=MCMC_chains,
    iter_warmup=MCMC_warm_up_iters,
    iter_sampling=MCMC_sampling_iters,
    seed=__NP_SEED__,
    show_console=True
)

In [None]:
# display the object returned by model.sample()
fit

In [None]:
# Print each metadata section under its own heading line
metadata_sections = (
    ("Cmdstan Config", fit.metadata.cmdstan_config),
    ("Stan Vars", fit.metadata.stan_vars),
    ("Method Vars", fit.metadata.method_vars),
)
for section_title, section_value in metadata_sections:
    print(section_title, section_value, sep="\n")

In [None]:
# Pull a few run settings out of the CmdStan config for a quick side-by-side look.
# The chain count is read from fit.chains rather than from the config dict.
run_config = fit.metadata.cmdstan_config
(
    run_config['num_warmup'],
    run_config['num_samples'],
    fit.chains,
    run_config['draws_warmup'],
    run_config['draws_sampling'],
    run_config['algorithm'],
    run_config['engine'],
)

In [None]:
# summary table for the Code 1 fit with the 5/25/50/75/95 percentiles
fit.summary(percentiles=[5, 25, 50, 75, 95], sig_figs=2)

In [None]:
# print the report returned by fit.diagnose()
print(fit.diagnose())

In [None]:
# names of the Stan variables available in the fit
fit.stan_variables().keys()

In [None]:
# Compare the shape of the "mu" draws with chains x sampling iterations
mu_draws_shape = fit.stan_variable("mu").shape
expected_mu_shape = (MCMC_chains * MCMC_sampling_iters, )
(mu_draws_shape, mu_draws_shape == expected_mu_shape)

In [None]:
# draws of the Code 1 fit as a pandas DataFrame (used by the histogram cells below)
df_draws = fit.draws_pd()
df_draws

In [None]:
# Histogram (with KDE) of one parameter, coloured by chain.
# Set var_name to "mu" to inspect the other parameter instead.
var_name = "sigma"

chain_palette = sns.color_palette("Set1", n_colors=MCMC_chains)
hist_ax = sns.histplot(
    data=df_draws,
    x=var_name,
    hue='chain__',
    bins=50,
    kde=True,
    palette=chain_palette,
)
hist_ax.set_title(f"Histogram of {var_name}")

In [None]:
# Histogram (with KDE) of one parameter, pooling the draws of all chains.
# Set var_name to "sigma" to inspect the other parameter instead.
var_name = "mu"

pooled_ax = sns.histplot(data=df_draws, x=var_name, bins=50, kde=True)
pooled_ax.set_title(f"Histogram of {var_name}")

## Code 2: Functions

Introducing user-defined functions (p. 381).

In [None]:
# Stan program for Code 2; it is looked up in the current working directory
MODEL_filename_2 = "code2_pp381.stan"

In [None]:
# Simulate the Code 2 data set: y = beta * log(x) + Gaussian noise
num_samples_2 = 100

# weights
true_mu_x = 60
true_sigma_x = 10
x_data = np.random.normal(loc=true_mu_x, scale=true_sigma_x, size=num_samples_2)

# heights model as function of weights
true_beta = 0.3
true_sigma_2 = 0.3
y_data = true_beta * np.log(x_data) + np.random.normal(loc=0, scale=true_sigma_2, size=num_samples_2)

# N must be this cell's own sample size (num_samples_2), not Code 1's
# num_samples; otherwise changing one of them makes N disagree with len(X)/len(Y).
data_2 = {'N': num_samples_2, 'Y': y_data, 'X': x_data}

In [None]:
# Build the CmdStanModel for Code 2 from the .stan file in the working directory
stan_model_parent_dir = os.getcwd()
print(stan_model_parent_dir)
stan_file_path_2 = os.path.join(stan_model_parent_dir, MODEL_filename_2)
model_2 = CmdStanModel(stan_file=stan_file_path_2)
# show the model description and the executable's build information
print(model_2)
print(model_2.exe_info())

In [None]:
# Draw posterior samples for the Code 2 model.
# The seed is passed explicitly: np.random.seed() only seeds NumPy, not the
# sampler run by CmdStan, so without it the draws change on every run.
fit2 = model_2.sample(
    data=data_2,
    chains=MCMC_chains,
    iter_warmup=MCMC_warm_up_iters,
    iter_sampling=MCMC_sampling_iters,
    seed=__NP_SEED__,
    show_console=True
)

In [None]:
# summary table for the Code 2 fit with the 5/25/50/75/95 percentiles
fit2.summary(percentiles=[5, 25, 50, 75, 95], sig_figs=2)

## Code 3: Independent Sampling Example

Generating independent samples from a distribution.

In [None]:
# negative binomial 2 distribution
# Stan program for Code 3; it is looked up in the current working directory
MODEL_filename_3 = "code_3_pp389.stan"

In [None]:
# Build the CmdStanModel for Code 3 from the .stan file in the working directory
stan_model_parent_dir = os.getcwd()
print(stan_model_parent_dir)
stan_file_path_3 = os.path.join(stan_model_parent_dir, MODEL_filename_3)
model_3 = CmdStanModel(stan_file=stan_file_path_3)
# show the model description and the executable's build information
print(model_3)
print(model_3.exe_info())

In [None]:
# Generate 4000 draws from the Code 3 model in fixed-parameter mode, with
# mu and kappa supplied as data.
# The seed is passed explicitly: np.random.seed() only seeds NumPy, not the
# sampler run by CmdStan, so without it the draws change on every run.
fit3 = model_3.sample(
    data={
        'mu': 10,
        'kappa': 5
        },
    fixed_param=True,
    chains=1,
    iter_warmup=1,
    iter_sampling=4000,
    seed=__NP_SEED__,
    show_console=True
)

In [None]:
# summary table for the Code 3 draws with the 5/25/50/75/95 percentiles
fit3.summary(percentiles=[5, 25, 50, 75, 95], sig_figs=2)

In [None]:
# draws of the Code 3 fit as a pandas DataFrame (used by the histogram cell below)
df3_draws = fit3.draws_pd()
df3_draws

In [None]:
# Histogram (with KDE) of the generated quantity Y from the Code 3 draws
var_name = "Y"

y_ax = sns.histplot(data=df3_draws, x=var_name, bins=20, kde=True)
y_ax.set_title(f"Histogram of {var_name}")

## Code 4: Discrete Parameters

HMC cannot sample discrete parameters directly. The workaround is to marginalize the discrete parameters out of the log density, and then recover estimates of them from the posterior draws.

In [None]:
# Stan program for Code 4 (discrete parameters); it is looked up in the
# current working directory
MODEL_filename_4 = "code_4_pp403.stan"

In [None]:
# this was a clumpsy way to generate the data in R
# it can definitely be done in a more organized way with Python and numpy

nStudy = 20
N = 10  # number of trials
Z = np.zeros((N, nStudy), dtype=int)  # matrix of successes
theta = np.array([0.1, 0.9])  # probability of success for each study
state = np.zeros(nStudy)  # state of the study

for i in range(nStudy):
    if i < int(nStudy / 2):
        state[i] = 1
        Z[:, i] = np.random.binomial(n=1, p=theta[0], size=N)
    else:
        state[i] = 0
        Z[:, i] = np.random.binomial(n=1, p=theta[1], size=N)
        
X = np.sum(Z, axis=0)  # sum of successes for each study (axis=0 means summing over rows, thus keeping the shape of the columns)
print(Z.shape, X.shape, X.dtype)
print('Z for first half of columns\n', pd.DataFrame(Z[:, :int(nStudy / 2)]))
print('Z for second half of columns\n', pd.DataFrame(Z[:, int(nStudy / 2):]))
data_4 = {'N': N, 'X': X, 'nStudy': nStudy}
data_4

In [None]:
# Build the CmdStanModel for Code 4 from the .stan file in the working directory
stan_model_parent_dir = os.getcwd()
print(stan_model_parent_dir)
stan_file_path_4 = os.path.join(stan_model_parent_dir, MODEL_filename_4)
model_4 = CmdStanModel(stan_file=stan_file_path_4)
# show the model description and the executable's build information
print(model_4)
print(model_4.exe_info())

In [None]:
# Draw posterior samples for the Code 4 model.
# The seed is passed explicitly: np.random.seed() only seeds NumPy, not the
# sampler run by CmdStan, so without it the draws change on every run.
fit4 = model_4.sample(
    data=data_4,
    chains=MCMC_chains,
    iter_warmup=MCMC_warm_up_iters,
    iter_sampling=MCMC_sampling_iters,
    seed=__NP_SEED__,
    show_console=True
)

In [None]:
# summary table for the Code 4 fit
# NOTE(review): percentiles here are 20/50/75, unlike the 5/25/50/75/95 used
# for the other fits — confirm this is intended
fit4.summary(percentiles=[20, 50, 75], sig_figs=2)

In [None]:
# shape of the array of "theta" draws
fit4.stan_variable("theta").shape

In [None]:
# shape of the array of "pstate" draws
fit4.stan_variable("pstate").shape

In [None]:
# Keep only the draw-identifier columns plus every column whose name mentions
# theta, pstate or alpha, then show the table rounded to one decimal.
all_draws_4 = fit4.draws_pd()
id_columns = ['chain__', 'iter__', 'draw__']
wanted_fragments = ('theta', 'pstate', 'alpha')
param_columns = [
    name for name in all_draws_4.columns
    if any(fragment in name for fragment in wanted_fragments)
]
df4_draws = all_draws_4[id_columns + param_columns]
df4_draws.round(1)

In [None]:
# Check the inferred states against the simulated ones: threshold the mean of
# the "pstate" draws at 0.5 and subtract the true state (0 means a match).
pstate_mean = fit4.stan_variable("pstate").mean(axis=0)
inferred_state = (pstate_mean > 0.5).astype(int)
inferred_state - state