<a href="https://colab.research.google.com/github/flyaflya/persuasive/blob/main/demoNotebooks/Achen2005.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# When A Simple Linear Model Fails

Inspired by:

Achen, C. H. (2005). Let's put garbage-can regressions and garbage-can probits where they belong. Conflict Management and Peace Science, 22(4), 327-339.

---



In [None]:
#! pip install matplotlib seaborn daft arviz --upgrade

In [None]:
#!pip install -q numpyro@git+https://github.com/pyro-ppl/numpyro --upgrade

In [None]:
import xarray as xr
import numpy as np
import numpyro
import numpyro.distributions as dist
from jax import random
from numpyro.infer import MCMC, NUTS
import arviz as az
import pandas as pd
import seaborn as sns
# Raw CSV of the demo data set, hosted in the course's GitHub repository.
url = (
    "https://raw.githubusercontent.com/"
    "flyaflya/persuasive/main/achenGarbage.csv"
)
# Rows are indexed by the file's "observationID" column.
dataDF = pd.read_csv(url, index_col="observationID")

In [None]:
# Look at the data: display the frame loaded above (indexed by observationID)
dataDF

In [None]:
# Quick look at all pairwise relationships between the columns (seaborn)
sns.pairplot(data=dataDF)

In [None]:
#@title The True (but hidden) Data Generating Process
from functools import partial, partialmethod
import daft   ### %pip install -U git+https://github.com/daft-dev/daft.git

class dag(daft.PGM):
    """``daft.PGM`` with pre-styled shortcuts for ``add_node``.

    Every ``*Node`` attribute is ``daft.PGM.add_node`` with a set of default
    keyword arguments (size, font size, fill colour, ...) bound through
    ``functools.partialmethod``. Keywords supplied at the call site take
    precedence over these defaults.
    """

    def __init__(self, *args, **kwargs):
        # Plain pass-through to the daft.PGM constructor.
        super().__init__(*args, **kwargs)

    # cadetblue fill (presumably "observed" nodes)
    obsNode = partialmethod(
        daft.PGM.add_node,
        scale=1.3,
        aspect=2.4,
        fontsize=10,
        plot_params={"facecolor": "cadetblue"},
    )
    # thistle-filled rectangle (presumably "decision" nodes)
    decNode = partialmethod(
        daft.PGM.add_node,
        aspect=2.2,
        fontsize=10,
        shape="rectangle",
        plot_params={"facecolor": "thistle"},
    )
    # aliceblue fill drawn in daft's alternate style (presumably "deterministic" nodes)
    detNode = partialmethod(
        daft.PGM.add_node,
        aspect=5.4,
        fontsize=9.25,
        alternate=True,
        plot_params={"facecolor": "aliceblue"},
    )
    # aliceblue fill (presumably "latent" nodes)
    latNode = partialmethod(
        daft.PGM.add_node,
        scale=1.3,
        aspect=2.4,
        fontsize=10,
        plot_params={"facecolor": "aliceblue"},
    )
    # larger variants of detNode / latNode
    detNodeBig = partialmethod(
        daft.PGM.add_node,
        scale=1.6,
        aspect=2.25,
        fontsize=10,
        alternate=True,
        plot_params={"facecolor": "aliceblue"},
    )
    latNodeBig = partialmethod(
        daft.PGM.add_node,
        scale=1.6,
        aspect=2.2,
        fontsize=10,
        plot_params={"facecolor": "aliceblue"},
    )
    
# Figure: the true (hidden) data generating process.
# z feeds both its measurement x1 and the outcome y; x2 also feeds y.
pgm = dag(dpi=300, alternate_style="outer")

# Nodes (all share aspect 2.4 / scale 1.8, overriding the class defaults).
pgm.obsNode(
    "z", "Indep. Explanatory Var\n$z$",
    -2, 2.4, aspect=2.4, scale=1.8,
)
pgm.obsNode(
    "x1", "Measured Z for Var z\n$x_1$",
    -2, 1.2, aspect=2.4, scale=1.8,
)
pgm.obsNode(
    "x2", "Indep. Explanatory Var\n$x_2$",
    0.5, 2.4, aspect=2.4, scale=1.8,
)
pgm.latNode(
    "y", "Dependent Variable\n$y = z + 0.1*x_2$",
    0.5, 1.2, aspect=2.4, scale=1.8,
)

# Edges.
pgm.add_edge("z", "x1")
pgm.add_edge("z", "y")
pgm.add_edge("x2", "y")

# Disabled plate, kept for reference:
#pgm.add_plate([-0.5, 0.0, 3.0, 4.2], label = "Observation:\n" + r"$i = 0, 1, 2, \ldots, 136$", 
 #             label_offset = (2,2), rect_params = dict({"fill": False, "linestyle": "dashed", "edgecolor": "black"}))
pgm.show(dpi=150)

In [None]:
#@title Recovering Coefficients Using Bayesian Data Analysis
from functools import partial, partialmethod
import daft   ### %pip install -U git+https://github.com/daft-dev/daft.git

class dag(daft.PGM):
    """``daft.PGM`` with pre-styled shortcuts for ``add_node``.

    Every ``*Node`` attribute is ``daft.PGM.add_node`` with a set of default
    keyword arguments (size, font size, fill colour, ...) bound through
    ``functools.partialmethod``. Keywords supplied at the call site take
    precedence over these defaults.
    """

    def __init__(self, *args, **kwargs):
        # Plain pass-through to the daft.PGM constructor.
        super().__init__(*args, **kwargs)

    # cadetblue fill (presumably "observed" nodes)
    obsNode = partialmethod(
        daft.PGM.add_node,
        scale=1.3,
        aspect=2.4,
        fontsize=10,
        plot_params={"facecolor": "cadetblue"},
    )
    # thistle-filled rectangle (presumably "decision" nodes)
    decNode = partialmethod(
        daft.PGM.add_node,
        aspect=2.2,
        fontsize=10,
        shape="rectangle",
        plot_params={"facecolor": "thistle"},
    )
    # aliceblue fill drawn in daft's alternate style (presumably "deterministic" nodes)
    detNode = partialmethod(
        daft.PGM.add_node,
        aspect=5.4,
        fontsize=9.25,
        alternate=True,
        plot_params={"facecolor": "aliceblue"},
    )
    # aliceblue fill (presumably "latent" nodes)
    latNode = partialmethod(
        daft.PGM.add_node,
        scale=1.3,
        aspect=2.4,
        fontsize=10,
        plot_params={"facecolor": "aliceblue"},
    )
    # larger variants of detNode / latNode
    detNodeBig = partialmethod(
        daft.PGM.add_node,
        scale=1.6,
        aspect=2.25,
        fontsize=10,
        alternate=True,
        plot_params={"facecolor": "aliceblue"},
    )
    latNodeBig = partialmethod(
        daft.PGM.add_node,
        scale=1.6,
        aspect=2.2,
        fontsize=10,
        plot_params={"facecolor": "aliceblue"},
    )
    
# Figure: generative DAG of the Bayesian regression fitted below.
# The priors written on the nodes mirror the ones coded in regressionModel
# (alpha, beta1, beta2 ~ Normal(0, 0.5); sigma ~ HalfNormal(0.01)).
pgm = dag(dpi = 300, alternate_style="outer")

# Inputs
pgm.obsNode("z","Actual Z Expl. Var\n"+r"$z$",-2,1.2, aspect = 2.4, scale = 1.8)
pgm.obsNode("x2","Indep. Explanatory Var\n"+r"$x_2$",-2,2.4, aspect = 2.4, scale = 1.8)

# Linear predictor and its parameters
pgm.latNode("mu","Linear Predictor\n"+r"$\mu = \alpha + \beta_1 z + \beta_2 x_2$",0.5,1.2, aspect = 2.4, scale = 1.8)
# Label fixed to Normal(0,0.5): that is the intercept prior regressionModel samples from.
pgm.latNode("alpha","Intercept\n"+r"$\alpha \sim Normal(0,0.5)$",0.5,3.6, aspect = 2.4, scale = 1.8)
pgm.latNode("beta1","Slope Coeff Var 1\n"+r"$\beta_1 \sim Normal(0,0.5)$",3,2.4, aspect = 2.4, scale = 1.8)
pgm.latNode("beta2","Slope Coeff Var 2\n"+r"$\beta_2 \sim Normal(0,0.5)$",5.5,2.4, aspect = 2.4, scale = 1.8)

# Likelihood
pgm.obsNode("y","Dependent Variable\n"+r"$y \sim Normal(\mu,\sigma)$",0.5,0, aspect = 2.4, scale = 1.8)
pgm.latNode("sigma","StdDev-Observations\n"+r"$\sigma \sim Normal^+(0,0.01)$",3.5,1.2, aspect = 2.4, scale = 1.8)

# Edges
pgm.add_edge("z","mu")
pgm.add_edge("x2","mu")
pgm.add_edge("alpha","mu")
pgm.add_edge("beta1","mu")
pgm.add_edge("beta2","mu")
pgm.add_edge("mu","y")
pgm.add_edge("sigma","y")

# Plate around the per-observation nodes; the index sequence previously read "1, 1, 2, ...".
pgm.add_plate([-3.3, -0.5, 5.0, 3.5], label = "Observation:\n" + r"$i = 1, 2, \ldots, 15$", 
               label_offset = (2,2), rect_params = {"fill": False, "linestyle": "dashed", "edgecolor": "black"})
pgm.show(dpi=100)

In [None]:
## inputs for the numpyro model of the DAG above:
## pull each modelling column out of dataDF as a NumPy array
z, x1, x2, y = (
    dataDF[column].to_numpy() for column in ("z", "x_1", "x_2", "y")
)

## the generative DAG written as a numpyro model function
def regressionModel(var1, var2, yVal):
    """Two-regressor linear regression expressed as a numpyro model.

    Sample sites:
        alpha, beta1, beta2 ~ Normal(0, 0.5)   intercept and slopes
        sigma ~ HalfNormal(0.01)               observation std-dev
        mu = alpha + beta1*var1 + beta2*var2   (deterministic site)
        y ~ Normal(mu, sigma)                  observed as ``yVal``

    Args:
        var1: values of the first regressor (assumed one per observation).
        var2: values of the second regressor (assumed one per observation).
        yVal: observed outcomes; ``len(yVal)`` sets the plate size.

    Returns:
        None. The function only registers numpyro sample sites.
    """
    intercept = numpyro.sample("alpha", dist.Normal(0, 0.5))
    slope1 = numpyro.sample("beta1", dist.Normal(0, 0.5))
    slope2 = numpyro.sample("beta2", dist.Normal(0, 0.5))
    noiseSd = numpyro.sample("sigma", dist.HalfNormal(0.01))

    nObs = len(yVal)
    with numpyro.plate("observation", nObs):
        linearPredictor = intercept + slope1 * var1 + slope2 * var2
        # Registered as a deterministic site so mu shows up in the posterior draws.
        mu = numpyro.deterministic("mu", linearPredictor)
        numpyro.sample("y", dist.Normal(mu, noiseSd), obs=yVal)

In [None]:
# An okay way to visualize the model - a good check that we faithfully
# reproduced the DAG
numpyro.render_model(
    regressionModel,
    model_args=(z, x2, y),
    render_distributions=True,
)

In [None]:
# Computationally get the posterior distribution, using the true z as var1.
rng_key = random.PRNGKey(111)  ## fixed seed so you and I get same results
mcmc = MCMC(
    NUTS(regressionModel),
    num_warmup=1000,
    num_samples=4000,
)
mcmc.run(rng_key, var1=z, var2=x2, yVal=y)  # get posterior
# Keep only the posterior group, as an xarray Dataset of draws.
drawsDS = az.from_numpyro(mcmc).posterior

In [None]:
# Posterior summary table for the scalar parameters.
# (Not plotted: the author notes there is not enough variation in the
# posterior estimates.)
az.summary(
    drawsDS,
    var_names=["alpha", "beta1", "beta2", "sigma"],
)

In [None]:
#@title Using A Proxy Measurement for True Explanatory Variable
from functools import partial, partialmethod
import daft   ### %pip install -U git+https://github.com/daft-dev/daft.git

class dag(daft.PGM):
    """``daft.PGM`` with pre-styled shortcuts for ``add_node``.

    Every ``*Node`` attribute is ``daft.PGM.add_node`` with a set of default
    keyword arguments (size, font size, fill colour, ...) bound through
    ``functools.partialmethod``. Keywords supplied at the call site take
    precedence over these defaults.
    """

    def __init__(self, *args, **kwargs):
        # Plain pass-through to the daft.PGM constructor.
        super().__init__(*args, **kwargs)

    # cadetblue fill (presumably "observed" nodes)
    obsNode = partialmethod(
        daft.PGM.add_node,
        scale=1.3,
        aspect=2.4,
        fontsize=10,
        plot_params={"facecolor": "cadetblue"},
    )
    # thistle-filled rectangle (presumably "decision" nodes)
    decNode = partialmethod(
        daft.PGM.add_node,
        aspect=2.2,
        fontsize=10,
        shape="rectangle",
        plot_params={"facecolor": "thistle"},
    )
    # aliceblue fill drawn in daft's alternate style (presumably "deterministic" nodes)
    detNode = partialmethod(
        daft.PGM.add_node,
        aspect=5.4,
        fontsize=9.25,
        alternate=True,
        plot_params={"facecolor": "aliceblue"},
    )
    # aliceblue fill (presumably "latent" nodes)
    latNode = partialmethod(
        daft.PGM.add_node,
        scale=1.3,
        aspect=2.4,
        fontsize=10,
        plot_params={"facecolor": "aliceblue"},
    )
    # larger variants of detNode / latNode
    detNodeBig = partialmethod(
        daft.PGM.add_node,
        scale=1.6,
        aspect=2.25,
        fontsize=10,
        alternate=True,
        plot_params={"facecolor": "aliceblue"},
    )
    latNodeBig = partialmethod(
        daft.PGM.add_node,
        scale=1.6,
        aspect=2.2,
        fontsize=10,
        plot_params={"facecolor": "aliceblue"},
    )
    
# Figure: the same data generating process, now highlighting that the proxy
# x1 (not the true z) is what enters the posterior calculation.
pgm = dag(dpi=300, alternate_style="outer")

# Nodes (all share aspect 2.4 / scale 1.8, overriding the class defaults).
pgm.obsNode(
    "z", "Indep. Explanatory Var\n$z$",
    -2, 2.4, aspect=2.4, scale=1.8,
)
pgm.obsNode(
    "x1", "Measured Z for Var z\n$x_1$",
    -2, 1.2, aspect=2.4, scale=1.8,
)
pgm.obsNode(
    "x2", "Indep. Explanatory Var\n$x_2$",
    0.5, 2.4, aspect=2.4, scale=1.8,
)
pgm.latNode(
    "y", "Dependent Variable\n$y = z + 0.1*x_2$",
    0.5, 1.2, aspect=2.4, scale=1.8,
)

# Light-grey edges: part of the true process but not used for the posterior calc.
pgm.add_edge("z", "x1", plot_params={"ec": "lightgrey", "fc": "lightgrey"})
pgm.add_edge("z", "y", plot_params={"ec": "lightgrey", "fc": "lightgrey"})
pgm.add_edge("x2", "y")
# Purple edge: NEWLY USED for the posterior calc.
pgm.add_edge("x1", "y", plot_params={"ec": "purple", "fc": "purple"})

# Disabled plate, kept for reference:
#pgm.add_plate([-0.5, 0.0, 3.0, 4.2], label = "Observation:\n" + r"$i = 0, 1, 2, \ldots, 136$", 
 #             label_offset = (2,2), rect_params = dict({"fill": False, "linestyle": "dashed", "edgecolor": "black"}))
pgm.show(dpi=150)

## Posterior Distribution for Proxy Model

In [None]:
# Computationally get the posterior distribution, this time feeding the
# proxy x1 (instead of the true z) in as var1.
rng_key = random.PRNGKey(111)  ## fixed seed so you and I get same results
mcmc = MCMC(
    NUTS(regressionModel),
    num_warmup=1000,
    num_samples=4000,
)
#mcmc.run(rng_key, var1 = z, var2 = x2, yVal = y) # OLD WAY to get posterior
mcmc.run(rng_key, var1=x1, var2=x2, yVal=y)  # NEW way to get posterior
# Keep only the posterior group, as an xarray Dataset of draws.
drawsDS = az.from_numpyro(mcmc).posterior

In [None]:
# Inspect the posterior draws (xarray) produced by the proxy-variable fit
drawsDS

In [None]:
# Posterior summary table for the scalar parameters of the proxy model.
# (Plotting was not possible - possibly not enough variation in the
# posterior estimates.)
az.summary(
    drawsDS,
    var_names=["alpha", "beta1", "beta2", "sigma"],
)

In [None]:
# Show Basic Linear Regression Performance
# Ordinary least squares of y on the proxy x1 and on x2 - the same two
# regressors the proxy Bayesian model above was given.
from sklearn.linear_model import LinearRegression

# create a dataframe with x1, x2, and y
df = pd.DataFrame({'x1': x1, 'x2': x2, 'y': y})

# create a matrix X with x1 and x2 as columns
# (X used to be overwritten twice right after this line, so the fit silently
#  used x2 alone; to study a single regressor, select e.g. df[['x1']] here)
X = df[['x1', 'x2']].values

# create a vector y with the target variable
y = df['y'].values

# create a linear regression object and fit it to the data
reg = LinearRegression()
reg.fit(X, y)

# calculate the R-squared value on the training data
r_squared = reg.score(X, y)

# print the coefficients (ordered as the columns of X: x1, x2) and R-squared value
print('Coefficients:', reg.coef_)
print('Intercept:', reg.intercept_)
print('R-squared:', r_squared)


In [None]:
# Display the x1 / x2 / y frame built for the linear regression
df