### Imports

In [12]:
import os
import numpy as np
from pathlib import Path
import pandas as pd
import geopandas as gpd
import scipy
from dotenv import load_dotenv
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
import xgboost as xgb
import pymc as pm
import pytensor
import pytensor.tensor as pt
import pytensor.sparse as sparse
import patsy
import arviz as az
import boto3

# --- Local paths -----------------------------------------------------------
# Directory holding the downloaded inputs and the saved sampler trace.
DATA_DIR = Path("data")
TRACE_PATH = DATA_DIR / "trace-revise.nc"

# --- Data layout -----------------------------------------------------------
# Column of `final_gdf` whose values label the `obs_id` model dimension.
INDEX_COL = "huc12"

# --- S3 locations ----------------------------------------------------------
bucket_name = "duke-research"
bucket_prefix = ""  # NOTE(review): not referenced by any cell in this notebook
file_name_inputs = "inputs.npz"
file_name_gpkg = "final.gpkg"

# Read a local .env file into the process environment
# (presumably the AWS credentials boto3 needs -- confirm).
load_dotenv()

True

### Load data

#### Retrieve from S3

In [13]:
# Fetch every input file that is not already cached locally in DATA_DIR.
# The directory is created first so a fresh checkout does not fail on the
# first download.
DATA_DIR.mkdir(parents=True, exist_ok=True)

s3 = None  # built lazily: no client is created when everything is cached
for file_name in [file_name_inputs, file_name_gpkg]:
    local_path = DATA_DIR / file_name
    if local_path.exists():
        print(f"✓ {file_name} already exists")
        continue

    print(f"Downloading {file_name} from S3...")
    if s3 is None:
        s3 = boto3.client('s3')
    # boto3 documents `Filename` as a str, so convert the Path explicitly
    # (the upload cell at the end of the notebook does the same).
    s3.download_file(bucket_name, file_name, str(local_path))
    print(f"✓ Successfully downloaded {file_name}")


✓ inputs.npz already exists
✓ final.gpkg already exists


In [14]:
# Geo layer; later cells read its huc2/huc4/huc8 columns and INDEX_COL.
final_gdf = gpd.read_file(DATA_DIR / file_name_gpkg)

# Model inputs saved as a NumPy .npz archive.
loaded_data = np.load(DATA_DIR / file_name_inputs)

# The adjacency matrix is stored as its three CSR component arrays.
W = scipy.sparse.csr_matrix(
    (loaded_data['W_data'], loaded_data['W_indices'], loaded_data['W_indptr'])
)
X, y, coords = (loaded_data[key] for key in ('X', 'y', 'coords'))
pretty_predictor_cols = loaded_data['pretty_predictor_cols']

# Symmetrised 0/1 adjacency: an entry is 1.0 wherever W + Wᵀ is positive.
W_sym = ((W + W.T) > 0).astype(float)

## Baseline modeling

### Linear model

In [15]:
# Hold out 10% of the rows; the same split is reused by the XGBoost cell.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=827
)

# Ordinary least squares baseline.
lr_model = LinearRegression().fit(X_train, y_train)

y_pred_train = lr_model.predict(X_train)
y_pred_test = lr_model.predict(X_test)

# Summarise the fitted coefficients: the three extremes at either end and
# the width of the central 90% interval.
coefs = lr_model.coef_
coef_order = np.argsort(coefs)
print("Smallest coefficients:")
print(coefs[coef_order[:3]])
print("Largest coefficients:")
print(coefs[coef_order[-3:]])
print("90% range of coefficients:")
print(np.percentile(coefs, 95) - np.percentile(coefs, 5))

r2_train = r2_score(y_train, y_pred_train)
r2_test = r2_score(y_test, y_pred_test)

print(f"Linear Regression R² on training set: {r2_train:.4f}")
print(f"Linear Regression R² on test set: {r2_test:.4f}")


Smallest coefficients:
[-8.94796198 -3.34667055 -2.0773022 ]
Largest coefficients:
[2.69161593 4.70096494 5.28841285]
90% range of coefficients:
2.5990572982445936
Linear Regression R² on training set: 0.3272
Linear Regression R² on test set: 0.3266


In [16]:


# Gradient-boosted baseline on the same train/test split as the linear model.
feature_names = pretty_predictor_cols.tolist()
dtrain = xgb.DMatrix(X_train, label=y_train, feature_names=feature_names)
dtest = xgb.DMatrix(X_test, label=y_test, feature_names=feature_names)

params = dict(
    objective='reg:squarederror',
    max_depth=6,
    eta=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    seed=827,
)

num_boost_round = 100
xgb_model = xgb.train(params, dtrain, num_boost_round)

# Score both splits so the train/test gap is visible.
y_pred_xgb_train = xgb_model.predict(dtrain)
y_pred_xgb_test = xgb_model.predict(dtest)

r2_xgb_train = r2_score(y_train, y_pred_xgb_train)
r2_xgb_test = r2_score(y_test, y_pred_xgb_test)

print(f"XGBoost R² on training set: {r2_xgb_train:.4f}")
print(f"XGBoost R² on test set: {r2_xgb_test:.4f}")


XGBoost R² on training set: 0.7104
XGBoost R² on test set: 0.6374


### Declare model

In [None]:
# ---------------------------------------------------------------------------
# Model options
# ---------------------------------------------------------------------------
USE_GAM          = False   # build per-predictor B-spline bases (X_splines)
RANDOM_SUBSAMPLE = True    # fit on a random subset of the rows
USE_GP           = False   # add an HSGP spatial effect on `coords`
USE_DAG          = False   # Gaussian DAG likelihood instead of iid Normal
TRACK_MU         = True    # store `mu` in the trace (memory heavy)
MULTILEVEL_BETA  = True    # nested global/huc2/huc4/huc8 coefficients
ADD_INTERCEPT    = True    # prepend a column of ones when none is present
FLOAT_FORMAT = "float32"
SUBSAMPLE_FRACTION = 0.2   # expected share of rows kept by RANDOM_SUBSAMPLE
SUBSAMPLE_SEED = 827       # same seed as the baseline train/test split

n = len(y)
p = X.shape[1]

pytensor.config.floatX = FLOAT_FORMAT

y = y.astype(FLOAT_FORMAT)
X = X.astype(FLOAT_FORMAT)
W = W.astype(FLOAT_FORMAT)
coords = coords.astype(FLOAT_FORMAT)

if ADD_INTERCEPT:

    # Only add the intercept when X has no constant, non-zero column yet.
    # Testing for an actual constant column (rather than "all column means
    # are ~0") keeps this correct for predictors that are not mean-centred,
    # and makes re-running the cell idempotent.
    col_is_constant = X.min(axis=0) == X.max(axis=0)
    has_intercept_col = bool(np.any(col_is_constant & (X.max(axis=0) != 0)))
    if not has_intercept_col:
        X = np.concatenate([np.ones((X.shape[0], 1), dtype=FLOAT_FORMAT), X], axis=1)
        p = X.shape[1]
        pretty_predictor_cols = np.concatenate([["intercept"], pretty_predictor_cols], axis=0)

print(f"Preparing to run model inference with {X.shape[0]} samples and {X.shape[1]} predictors")
print(f"Range of response values is {y.min()} to {y.max()}")
print(f"Range of predictor values is {X.min()} to {X.max()}")

if RANDOM_SUBSAMPLE:
    # Seeded generator so the same rows are selected on every run.
    rng = np.random.default_rng(SUBSAMPLE_SEED)
    is_used = rng.random(X.shape[0]) < SUBSAMPLE_FRACTION
    n = int(is_used.sum())
else:
    is_used = np.ones(X.shape[0], dtype=bool)
    n = len(y)

dims = {'predictor': pretty_predictor_cols, "obs_id": final_gdf.loc[is_used, INDEX_COL].values}

if USE_GAM:
    spline_df = 3
    X_splines = np.stack([patsy.bs(x, df=spline_df) for x in X.T], axis=-1)
    dims['spline_degree']=np.arange(spline_df)


# If the multilevel beta option is used, every observation's coefficient
# vector is the sum of a global effect and the effects of the huc2, huc4 and
# huc8 units the observation belongs to. Each covariate gets its own scale
# parameter at each level, with the prior on that scale getting progressively
# tighter from the global level down to huc8. Integer group codes are derived
# below from the huc2/huc4/huc8 columns of the (subsampled) frame.
final_gdf_subset = final_gdf.loc[is_used].copy()
with pm.Model(coords=dims) as sparse_graph_model:

    X_data = pm.Data('X_data', X[is_used], dims=['obs_id', 'predictor'])

    if MULTILEVEL_BETA:
        huc_level_cols = ['huc2', 'huc4', 'huc8']
        unique_per_level = [1] + [len(final_gdf_subset[col].unique()) for col in huc_level_cols]

        n_unique_combined = sum(unique_per_level)
        n_levels = len(unique_per_level)

        cumulative_unique = np.cumsum(unique_per_level)

        # Convention on levels is that 0 is for global mean, 1 is for huc2, 2 is for huc4, and 3 is for huc8.
        # Row r of `z` belongs to level level_as_int[r].
        level_as_int = np.repeat(np.arange(n_levels), unique_per_level)

        # Make sure exactly one row is marked as `0` (the global effect)
        assert np.sum(level_as_int == 0) == 1

        # We use a decreasing sequence of scale parameters
        # to encourage greater and greater shrinkage as we go down the levels
        beta_scales = pm.HalfNormal('beta_scales', sigma=np.asarray([5.0, 1.0, 0.2, 0.04]), shape = [p, n_levels])
        print(f"Found {n_unique_combined} unique huc2, huc4, huc8 values. Breakdown: {unique_per_level}")

        # Create random variables with N(0,1) prior. These will be rescaled and shifted
        # to create the final coefficients later on
        z = pm.Normal('z', mu=0, sigma=1, shape = [n_unique_combined, p])

        # Rescale the z by picking off the scale parameters for each level
        #[n_unique_combined, p] * [n_unique_combined, p]
        z_scaled = z * beta_scales[:, level_as_int].T

        # Accumulate the per-observation coefficients level by level: start
        # from the global effect, then add each observation's huc2, huc4 and
        # huc8 effect. The final result is a [n, p] array of coefficients.
        beta = pm.math.zeros((n, p))

        # Add global effect (row 0 of z_scaled); the [0:1, :] slice keeps it
        # 2D so it broadcasts over the observation axis.
        beta += z_scaled[0:1, :]

        # Level k occupies rows cumulative_unique[k-1]:cumulative_unique[k]
        # of z_scaled; the category codes index into that block.
        for level, col in enumerate(huc_level_cols, start=1):
            start = cumulative_unique[level - 1]
            end = cumulative_unique[level]
            z_scaled_block = z_scaled[start:end]
            codes = final_gdf_subset[col].astype('category').cat.codes.values
            beta += z_scaled_block[codes]

        beta = pm.Deterministic('beta', beta, dims=['obs_id', 'predictor'])

        mu = pm.math.sum(beta * X_data, axis=1)

    else:
        # NOTE(review): when ADD_INTERCEPT has prepended a column of ones,
        # this branch carries two intercept terms (this scalar and the
        # coefficient of that column) -- confirm this is intended.
        intercept = pm.Normal('intercept', mu=y.mean(), sigma=y.std() * 2)
        beta_sd = pm.HalfNormal('beta_sd', sigma=5)
        beta = pm.Normal('beta', mu=0, sigma=beta_sd, dims='predictor')
        # Use the registered data container, as the multilevel branch does.
        mu = intercept + pm.math.dot(X_data, beta)

    # Spatial random effect using geographic coordinates
    if USE_GP:
        ell = pm.Beta('ell', alpha=2, beta=2)
        eta = pm.HalfNormal('eta', sigma=0.05) # Push it to be closer to zero
        cov_func = eta**2 * pm.gp.cov.Matern52(input_dim=2, ls=ell)
        gp = pm.gp.HSGP(m=[50, 50], c=1.5, parametrization= "centered", cov_func=cov_func)
        f = gp.prior("f", X=coords[is_used], hsgp_coeffs_dims="basis_coeffs", gp_dims="obs_id")
        mu += f


    # Storing `mu` can take a lot of memory; this controls
    # whether or not it is stored in the trace
    if TRACK_MU:
        pm.Deterministic('mu', mu)


    # This part creates a likelihood for y ~ N(mu, Q⁻¹) with precision
    # Q = (I-γG)ᵀΩ(I-γG), where G is the adjacency matrix restricted to the
    # rows in use and Ω = ωI.
    # This is a Gaussian DAG model with a sparse precision matrix
    if USE_DAG:
        # Residuals must use the same rows as `mu`.
        ε = pm.math.constant(y[is_used]) - mu
        ω = pm.HalfNormal('ω', 5) # controls the diagonal entries of the precision matrix
        γ = pm.HalfNormal("γ", 5) # controls the off-diagonal entries of the precision matrix
        G_pt = sparse.as_sparse_or_tensor_variable(W[is_used][:, is_used])

        # Make ε be shape (n,1) for the sparse product, then flatten again
        ε_col = ε[:, None]
        resid1 = (ε_col - sparse.structured_dot(G_pt, γ * ε_col)).ravel()

        # Quadratic form εᵀ(I-γG)ᵀΩ(I-γG)ε = ω * ‖(I-γG)ε‖²
        quadratic_form = ω * pt.sum(resid1 ** 2)

        # NOTE(review): this treats det(I-γG) as 1, which presumably relies
        # on G being strictly triangular under some ordering -- confirm.
        logdet = n * pt.log(ω)
        logp =  -0.5 * (n * pt.log(2 * np.pi) + quadratic_form) + 0.5 * logdet

        # Register the hand-written log-density; without this the model
        # would have no likelihood term at all.
        pm.Potential('dag_logp', logp)

    else:
        sigma = pm.HalfCauchy('sigma', beta=1)
        likelihood = pm.Normal('likelihood', mu=mu, sigma=sigma, observed=y[is_used])


Preparing to run model inference with 64220 samples and 169 predictors
Range of response values is -9.25863265991211 to 8.884825706481934
Range of predictor values is 0.0 to 1.0


### Logp profiling

In [None]:
# Opt-in profiling of the model's log-probability graph; off by default.
RUN_PROFILING = False
if RUN_PROFILING:
    logp_graph = sparse_graph_model.logp()
    profile_stats = sparse_graph_model.profile(logp_graph)
    profile_stats.summary()



Function profiling
  Message: /mnt/m2ssd/data/Dropbox/research/motives/motives-wq-modeling/.venv/lib/python3.10/site-packages/pymc/pytensorf.py:947
  Time in 1000 calls to Function.__call__: 1.118333e+01s
  Time in Function.vm.__call__: 11.131971331778914s (99.541%)
  Time in thunks: 11.10888934135437s (99.334%)
  Total compilation time: 2.106003e-01s
    Number of Apply nodes: 23
    PyTensor rewrite time: 1.858543e-01s
       PyTensor validate time: 1.829937e-03s
    PyTensor Linker time (includes C, CUDA code generation/compiling): 0.02292465406935662s
       C-cache preloading 1.470507e-02s
       Import time 0.000000e+00s
       Node make_thunk time 7.874939e-03s
           Node ExpandDims{axis=0}(Composite{...}.0) time 5.694540e-04s
           Node Composite{...}(beta_scales_log__, [[5.   1. ... .2  0.04]], [[ 1.60943 ... 21887582]]) time 5.598069e-04s
           Node Composite{switch(i4, ((-0.9189385 + (-0.5 * sqr(((i0 - i1) / i2)))) - i3), -inf)}(likelihood{[ 0.526272 ... .7284

### Run sampler

In [None]:
# Fixed seed so the draws can be reproduced on a re-run.
SAMPLER_SEED = 827

with sparse_graph_model:
    # `return_inferencedata` is left at its default so `trace` is an
    # InferenceData object, which is what `az.to_netcdf` serialises.
    # A single chain is kept as in the original run; note that this makes
    # some convergence checks impossible.
    trace = pm.sample(
        chains=1,
        tune=1000,
        draws=1000,
        cores=1,
        nuts_sampler="numpyro",
        progressbar=True,
        random_seed=SAMPLER_SEED,
    )
    az.to_netcdf(trace, TRACE_PATH)



  return lax_numpy.astype(self, dtype, copy=copy, device=device)
sample: 100%|██████████| 2000/2000 [7:30:35<00:00, 13.52s/it, 1023 steps of size 8.93e-05. acc. prob=0.80]  
  return lax_numpy.astype(self, dtype, copy=copy, device=device)
Only one chain was sampled, this makes it impossible to run some convergence checks


### Upload to S3

In [None]:
# Upload the saved trace to S3 under its own file name. The location comes
# from TRACE_PATH (not a repeated literal), and the config constant is not
# rebound, so this cell can be re-run safely.
trace_file = Path(TRACE_PATH)
if trace_file.exists():
    print("Uploading trace file to S3...")
    s3 = boto3.client('s3')
    # boto3 documents `Filename` as a str, so convert the Path explicitly.
    s3.upload_file(str(trace_file), bucket_name, trace_file.name)
    print("✓ Successfully uploaded trace file to S3")
else:
    print("Trace file does not exist, nothing to upload")