# Data Extraction

In [1]:
import arviz as az
import numpy as np
import pymc3 as pm

# Hours elapsed since injection for each of the ten readings.
# np.nan marks the reading whose time was not recorded.
time_after_injection = np.array(
    [24, 32, 48, 56, np.nan, 70, 72, 75, 80, 96]
)

# Temperature (degrees F) for the same ten readings, in the same order.
# np.nan marks the reading whose temperature was not recorded.
temperature = np.array(
    [102.8, 104.5, 106.5, 107.0, 107.2, 105.1, 103.9, np.nan, 103.2, 102.1]
)

# Model

In [4]:
# Fixed seed so that re-running the notebook reproduces the same MCMC and
# posterior-predictive draws.
RANDOM_SEED = 42

with pm.Model() as model:
    # Vague, zero-centred Normal priors on the regression coefficients.
    alpha = pm.Normal('alpha', mu=0, sigma=100)
    beta1 = pm.Normal('beta1', mu=0, sigma=100)
    beta2 = pm.Normal('beta2', mu=0, sigma=100)
    # Prior on the precision (not the standard deviation) of the likelihood;
    # it is passed to the likelihood below through its `tau=` argument.
    tau = pm.Gamma("tau", alpha=0.001, beta=0.001)

    # Impute missing 'x' data: the NaN entry in the observed array becomes a
    # free variable, with a Normal prior centred on the mean of the recorded
    # times and scaled by their (population, ddof=0) standard deviation.
    time_imputed = pm.Normal(
        'time_imputed',
        mu=np.nanmean(time_after_injection),
        sigma=np.nanstd(time_after_injection),
        observed=time_after_injection,
    )

    # Quadratic regression equation
    mu = alpha + beta1 * time_imputed + beta2 * time_imputed**2

    # Likelihood (sampling distribution) of observations; the NaN entry in
    # `temperature` is likewise treated as a missing 'y' value to be sampled.
    likelihood = pm.Normal('likelihood', mu=mu, tau=tau, observed=temperature)

    # Inference. `return_inferencedata=False` keeps `trace` a MultiTrace, as
    # the conversion below expects, instead of relying on the implicit default.
    trace = pm.sample(
        3000,
        target_accept=0.95,
        random_seed=RANDOM_SEED,
        return_inferencedata=False,
    )
    ppc = pm.sample_posterior_predictive(trace, random_seed=RANDOM_SEED)
    # Convert while the model is still on the context stack.
    inference_data = az.from_pymc3(trace=trace, posterior_predictive=ppc)

  return wrapped_(*args_, **kwargs_)
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [likelihood_missing, time_imputed_missing, tau, beta2, beta1, alpha]


  return _boost._beta_ppf(q, a, b)
  return _boost._beta_ppf(q, a, b)
  return _boost._beta_ppf(q, a, b)
  return _boost._beta_ppf(q, a, b)
Sampling 4 chains for 1_000 tune and 3_000 draw iterations (4_000 + 12_000 draws total) took 31 seconds.


# Show Statistics

In [5]:
# Summarise the InferenceData that was built inside the model context.
# Passing the raw `trace` here makes ArviZ convert it again with no model on
# the context stack, which is what produces the "No model on context stack"
# message.
az.summary(inference_data)

Got error No model on context stack. trying to find log_likelihood in translation.


Unnamed: 0,mean,sd,hdi_3%,hdi_97%,mcse_mean,mcse_sd,ess_bulk,ess_tail,r_hat
alpha,97.29,3.09,91.582,103.358,0.054,0.039,3417.0,3699.0,1.0
beta1,0.323,0.112,0.104,0.529,0.002,0.001,3266.0,3476.0,1.0
beta2,-0.003,0.001,-0.005,-0.001,0.0,0.0,3344.0,3564.0,1.0
time_imputed_missing[0],56.264,11.372,35.317,77.258,0.143,0.105,6593.0,5728.0,1.0
likelihood_missing[0],104.975,1.5,102.015,107.63,0.021,0.015,5734.0,4970.0,1.0
tau,0.826,0.496,0.095,1.73,0.008,0.006,3252.0,4703.0,1.0


# Calculate R²

In [6]:
# Positions of the readings whose temperature was actually recorded.
is_observed = ~np.isnan(temperature)
non_missing_indices = np.flatnonzero(is_observed)
observed_temps = temperature[non_missing_indices]

# Collapse the (chain, draw) dimensions of the posterior predictive into a
# single "sample" dimension, then transpose the resulting array.
stacked_ppc = inference_data.posterior_predictive.stack(sample=("chain", "draw"))
y_pred = stacked_ppc["likelihood"].values.T

# Force one column per reading, then keep only the columns that have an
# observed temperature to compare against.
y_pred_reshaped = y_pred.reshape(-1, temperature.shape[0])
y_pred_filtered = y_pred_reshaped[:, non_missing_indices]

# Bayesian R-squared of the predictive draws against the observed readings.
r2_score = az.r2_score(observed_temps, y_pred_filtered)
print(f'Bayesian R-squared:\n{r2_score}')


Bayesian R-squared:
r2        0.365722
r2_std    0.119161
dtype: float64


# Conclusion and Answers

### 1. Bayesian R²
    The quadratic regression has a Bayesian R² of 0.3657, slightly higher than that of a purely linear regression.

### 2. Missing Data Estimators
    Missing time estimator (mean): 56.264 hours
    Missing temperature estimator (mean): 104.975 degrees F

### 3. Slope Credible Set Implications
    The 94% credible set (the HDI reported by `az.summary`, from `hdi_3%` to `hdi_97%`) is [0.104, 0.529] for beta1 and [-0.005, -0.001] for beta2. Neither interval contains zero, which indicates that both the linear and quadratic terms contribute to the model's predictions. Together with the slightly higher R² value, this indicates a better model fit than a purely linear regression.