In [74]:
import arviz as az
import numpy as np
import pymc3 as pm

In [75]:
data_array = np.loadtxt('babies.csv', delimiter=',', skiprows=1)
mage = data_array[:, 0]             # mother's age
death = data_array[:, 1]            # firstborn death or not
delta_time = data_array[:, 2]       # time between births

# Standardizing the data
mage_st = (mage - np.mean(mage)) / np.std(mage)
delta_time_st = (delta_time - np.mean(delta_time)) / np.std(delta_time)


In [77]:
with pm.Model() as model:
    # Data
    mage_st_data = pm.Data('mage_data', mage_st)
    death_data = pm.Data('death_data', death)
    delta_time_st_data = pm.Data('delta_time_data', delta_time_st)

    # Priors
    intercept = pm.Normal('intercept', mu=0, sigma=10)
    beta_1 = pm.Normal('beta_1', mu=0, sigma=10)
    beta_2 = pm.Normal('beta_2', mu=0, sigma=10)
    tau = pm.Gamma('tau', alpha=0.001, beta=0.001)

    # Linear regression equation
    mu = intercept + beta_1 * mage_st_data + beta_2 * death_data

    # Likelihood (sampling distribution) of observations
    likelihood = pm.Normal('likelihood', mu=mu, tau=tau, observed=delta_time_st_data)

    # Inference
    trace = pm.sample(3000, target_accept=0.95)

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [tau, beta_2, beta_1, intercept]


  return _boost._beta_ppf(q, a, b)
  return _boost._beta_ppf(q, a, b)
  return _boost._beta_ppf(q, a, b)
  return _boost._beta_ppf(q, a, b)
Sampling 4 chains for 1_000 tune and 3_000 draw iterations (4_000 + 12_000 draws total) took 12 seconds.


In [24]:
az.summary(trace)

Got error No model on context stack. trying to find log_likelihood in translation.


Unnamed: 0,mean,sd,hdi_3%,hdi_97%,mcse_mean,mcse_sd,ess_bulk,ess_tail,r_hat
intercept,0.009,0.007,-0.005,0.023,0.0,0.0,14019.0,8955.0,1.0
beta_1,-0.285,0.007,-0.299,-0.271,0.0,0.0,15135.0,8150.0,1.0
beta_2,-0.973,0.077,-1.121,-0.835,0.001,0.0,14800.0,9051.0,1.0
tau,1.097,0.012,1.075,1.12,0.0,0.0,13380.0,7941.0,1.0


# Prediction

In [70]:
# Standardize Sofie's age
sofie_mage_std = (24 - np.mean(mage)) / np.std(mage)

sofie = {"mage_data": np.array([sofie_mage_std]), "death_data": np.array([0])}

pm.set_data(sofie, model=model)
ppc = pm.sample_posterior_predictive(trace, model=model)
summary = az.summary(ppc, hdi_prob=0.95, kind="stats").mean()
summary


In [71]:
# Calculate the mean and standard deviation of the original delta_time data
original_mean = np.mean(delta_time)
original_std = np.std(delta_time)

# Unstandardize the mean
unstandardized_mean = (summary['mean'] * original_std) + original_mean

# Unstandardize the HDI
unstandardized_hdi_low = (summary['hdi_2.5%'] * original_std) + original_mean
unstandardized_hdi_high = (summary['hdi_97.5%'] * original_std) + original_mean

print("Unstandardized Mean:", unstandardized_mean)
print("95% CI for predicted time (low):", unstandardized_hdi_low)
print("95% CI for predicted time (high):", unstandardized_hdi_high)

Unstandardized Mean: 956.3636408657428
95% CI for predicted time (low): 155.6046814797537
95% CI for predicted time (high): 1758.4072402935276


[A]: the 95% credible set for beta_2 is [-1.126,-0.834]. It's signifcant as the range is relatively narrow, and the min and max are both well below 0.

[B]: The derivative in of the regression function with respect to mage is beta_1. The value of beta_1 i -0.285, with high confidence. This coefficient value is signifantly smaller than that of the death covariate (over 3 times smaller), and therefore accounts for significantly less influence on the time.

[C]: 956.329718983197 days.

[D]: 

In [72]:
# Standardize Sofie's age
ingerid_mage_std = (28 - np.mean(mage)) / np.std(mage)

ingerid = {"mage_data": np.array([ingerid_mage_std]), "death_data": np.array([1])}

pm.set_data(ingerid, model=model)
ppc = pm.sample_posterior_predictive(trace, model=model)
summary = az.summary(ppc, hdi_prob=0.95, kind="stats").mean()
summary

In [73]:
# Calculate the mean and standard deviation of the original delta_time data
original_mean = np.mean(delta_time)
original_std = np.std(delta_time)

# Unstandardize the mean
unstandardized_mean = (summary['mean'] * original_std) + original_mean

# Unstandardize the HDI
unstandardized_hdi_low = (summary['hdi_2.5%'] * original_std) + original_mean
unstandardized_hdi_high = (summary['hdi_97.5%'] * original_std) + original_mean

print("Unstandardized Mean:", unstandardized_mean)
print("95% CI for predicted time (low):", unstandardized_hdi_low)
print("95% CI for predicted time (high):", unstandardized_hdi_high)

Unstandardized Mean: 317.8975400932777
95% CI for predicted time (low): -485.43069937630287
95% CI for predicted time (high): 1122.938632951919


# Answers

[A]: the 95% credible set for beta_2 is [-1.126,-0.834]. It's signifcant as the range is relatively narrow, and the min and max are both well below 0.

[B]: The derivative in of the regression function with respect to mage is beta_1. The value of beta_1 i -0.285, with high confidence. This coefficient value is signifantly smaller than that of the death covariate (over 3 times smaller), and therefore accounts for significantly less influence on the time.

[C]: 956.329718983197 days.

[D]: Mean: 317.89 days; Credible set: [-485.43069937630287,1122.938632951919].