# Simulations

In [19]:
import numpy as np
import pandas as pd
import pymc3 as pm
import arviz as az
import statsmodels.api as sm


## Simulating spurius association

In [2]:
n = 100

In [3]:
x_real = np.random.normal(size=n)

In [4]:
x_spurius  = np.random.normal(loc=x_real, size=n)

In [5]:
y = np.random.normal(loc=x_real, size=n)

In [6]:
df = pd.DataFrame({'x_real':x_real, 'x_spurius':x_spurius, 'y':y})

In [7]:
df.corr()

Unnamed: 0,x_real,x_spurius,y
x_real,1.0,0.810459,0.756883
x_spurius,0.810459,1.0,0.62505
y,0.756883,0.62505,1.0


In [8]:
with pm.Model() as m1:
    alpha = pm.Normal('alpha', 0, 0.2)
    beta_real = pm.Normal('beta_real', 0, 0.5)
    beta_spurius = pm.Normal('beta_spurius', 0, 0.5)
    sigma = pm.Exponential('sigma', 1)
    mu = pm.Deterministic('mu', alpha + beta_real * df['x_real'] + beta_spurius * df['x_spurius'])
    y = pm.Normal('y', mu, sd=sigma, observed=df['y'])
    m1_trace = pm.sample(return_inferencedata=True)
    m1_prior_pred = pm.sample_prior_predictive()

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [sigma, beta_spurius, beta_real, alpha]


Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 17 seconds.
The acceptance probability does not match the target. It is 0.7064795603773494, but should be close to 0.8. Try to increase the number of tuning steps.


In [9]:
az.summary(m1_trace, var_names=['beta_real', 'beta_spurius'])

Unnamed: 0,mean,sd,hdi_3%,hdi_97%,mcse_mean,mcse_sd,ess_bulk,ess_tail,r_hat
beta_real,0.935,0.151,0.664,1.22,0.003,0.002,2155.0,2162.0,1.0
beta_spurius,0.085,0.109,-0.114,0.302,0.002,0.002,2127.0,2008.0,1.0


In [21]:
model = sm.OLS(df['y'], df[['x_real','x_spurius']])
results = model.fit()
print(results.summary())
model = sm.OLS(df['y'], df[['x_real']])
results = model.fit()
print(results.summary())
model = sm.OLS(df['y'], df[['x_spurius']])
results = model.fit()
print(results.summary())

                                 OLS Regression Results                                
Dep. Variable:                      y   R-squared (uncentered):                   0.569
Model:                            OLS   Adj. R-squared (uncentered):              0.560
Method:                 Least Squares   F-statistic:                              64.76
Date:                Thu, 31 Mar 2022   Prob (F-statistic):                    1.19e-18
Time:                        12:44:14   Log-Likelihood:                         -137.49
No. Observations:                 100   AIC:                                      279.0
Df Residuals:                      98   BIC:                                      284.2
Df Model:                           2                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

# Simulating a masking relationship

Imagine M->K<-A

In [10]:
n=100
x1 = np.random.normal(size=n)

In [11]:
x2 = np.random.normal(size=n, loc=x1)

In [12]:
y = np.random.normal(size=n, loc=x2-x1)

In [13]:
df2 = pd.DataFrame({'y':y, 'x1':x1, 'x2':x2})

In [14]:
df2.corr()

Unnamed: 0,y,x1,x2
y,1.0,-0.080504,0.521799
x1,-0.080504,1.0,0.611973
x2,0.521799,0.611973,1.0


In [15]:
with pm.Model() as m2:
    alpha = pm.Normal('alpha', 0, 0.2)
    beta_x1 = pm.Normal('beta_x1', 0, 0.5)
    beta_x2 = pm.Normal('beta_x2', 0, 0.5)
    sigma = pm.Exponential('sigma', 1)
    mu = pm.Deterministic('mu', alpha + beta_x1 * df2['x1'] + beta_x2 * df2['x2'])
    y = pm.Normal('y', mu, sd=sigma, observed=df2['y'])
    m2_trace = pm.sample(return_inferencedata=True)
    m2_prior_pred = pm.sample_prior_predictive()

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [sigma, beta_x2, beta_x1, alpha]


Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 16 seconds.


In [16]:
az.summary(m2_trace, var_names=['beta_x1', 'beta_x2'])

Unnamed: 0,mean,sd,hdi_3%,hdi_97%,mcse_mean,mcse_sd,ess_bulk,ess_tail,r_hat
beta_x1,-0.958,0.149,-1.238,-0.683,0.003,0.002,2975.0,3036.0,1.0
beta_x2,0.993,0.105,0.795,1.19,0.002,0.001,3020.0,3148.0,1.0


In [23]:
model = sm.OLS(df2['y'], df2[['x1','x2']])
results = model.fit()
print(results.summary())
model = sm.OLS(df2['y'], df2[['x1']])
results = model.fit()
print(results.summary())
model = sm.OLS(df2['y'], df2[['x2']])
results = model.fit()
print(results.summary())

                                 OLS Regression Results                                
Dep. Variable:                      y   R-squared (uncentered):                   0.527
Model:                            OLS   Adj. R-squared (uncentered):              0.517
Method:                 Least Squares   F-statistic:                              54.61
Date:                Thu, 31 Mar 2022   Prob (F-statistic):                    1.16e-16
Time:                        12:45:39   Log-Likelihood:                         -141.75
No. Observations:                 100   AIC:                                      287.5
Df Residuals:                      98   BIC:                                      292.7
Df Model:                           2                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------