## A Crash Course in Good and Bad Controls

### Group_2

In [14]:
import pandas as pd
import numpy as np
import random
import statsmodels.api as sm
import matplotlib.pyplot as plt
from matplotlib import colors
import math

# for OLS

import statsmodels.formula.api as smf
from statsmodels.iolib.summary2 import summary_col
!pip install causalgraphicalmodels

Z will represent the variable whose inclusion in the regression equation is to be decided, with
“good control” standing for bias reduction, “bad control” standing for bias increase, and
“neutral control” when the addition of Z neither increases nor decreases the asymptotic
bias.

In [18]:
import pandas as pd
import numpy as np
import random
import math
import sklearn
import scipy as sp
import networkx
import matplotlib
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.iolib.summary2 import summary_col
import sympy
# import dowhy as dw
from causalgraphicalmodels import CausalGraphicalModel

### Model 1

In [6]:
# Simulate Model 1: Z -> X, Z -> Y, X -> Y.
# Every structural coefficient is 1 and every noise term is standard normal.

np.random.seed(123456)   # fixed seed so the simulated sample is reproducible

n = 1000     # sample size; every draw below is sized from n
Z = np.random.normal(0, 1, (n, 1))
X = Z + np.random.normal(0, 1, (n, 1))
Y = X + Z + np.random.normal(0, 1, (n, 1))

# DataFrame: one column per simulated variable

D = np.hstack((Z, X, Y))
data = pd.DataFrame(D, columns = ["Z", "X", "Y"])


In [7]:
# Fit Y on X alone, then Y on X plus the candidate control Z.
no_control = smf.ols("Y ~ X", data=data).fit()      # Z omitted
control = smf.ols("Y ~ X + Z", data=data).fit()     # Z adjusted for

# Side-by-side coefficient table with significance stars
model_1 = summary_col([no_control, control], stars=True)
print(model_1)


                  Y I       Y II  
----------------------------------
Intercept      -0.0252   0.0046   
               (0.0396)  (0.0320) 
R-squared      0.7555    0.8406   
R-squared Adj. 0.7552    0.8402   
X              1.5070*** 0.9697***
               (0.0271)  (0.0320) 
Z                        1.0314***
                         (0.0447) 
Standard errors in parentheses.
* p<.1, ** p<.05, ***p<.01


In [None]:
# DAG for Model 1: Z points into both X and Y, and X points into Y.
sprinkler1 = CausalGraphicalModel(
    nodes=["Z", "Y", "X"],
    edges=[("Z", "Y"), ("Z", "X"), ("X", "Y")],
)
sprinkler1.draw()

In Model 1, Z stands for a common cause of both X and Y . Once we control for Z,
we block the back-door path from X to Y , producing an unbiased estimate of the ACE.

### Model 2

In [8]:
# Simulate Model 2: U -> Z, Z -> X, and Y built from X, Z and U.
# Every structural coefficient is 1 and every noise term is standard normal.
#
# NOTE(review): Y below includes a direct Z term, whereas the DAG drawn for this
# model has no Z -> Y edge. The equation is left as-is so the stored regression
# output stays valid; drop "+ Z" and re-run if the simulation is meant to match
# the DAG -- TODO confirm.

np.random.seed(123456)   # fixed seed so the simulated sample is reproducible

n = 1000     # sample size; every draw below is sized from n
U = np.random.normal(0, 1, (n, 1))
Z = U + np.random.normal(0, 1, (n, 1))
X = Z + np.random.normal(0, 1, (n, 1))
Y = X + Z + U + np.random.normal(0, 1, (n, 1))

# DataFrame: one column per simulated variable (U is kept but never used as a regressor)

D = np.hstack((U, Z, X, Y))
data = pd.DataFrame(D, columns = ["U", "Z", "X", "Y"])

In [9]:
# Fit Y on X alone, then Y on X plus the candidate control Z.
no_control = smf.ols("Y ~ X", data=data).fit()      # Z omitted
control = smf.ols("Y ~ X + Z", data=data).fit()     # Z adjusted for

# Side-by-side coefficient table with significance stars
model_2 = summary_col([no_control, control], stars=True)
print(model_2)


                  Y I       Y II  
----------------------------------
Intercept      -0.0197   0.0001   
               (0.0553)  (0.0389) 
R-squared      0.8054    0.9039   
R-squared Adj. 0.8052    0.9037   
X              2.0205*** 1.0143***
               (0.0314)  (0.0385) 
Z                        1.4814***
                         (0.0463) 
Standard errors in parentheses.
* p<.1, ** p<.05, ***p<.01


Z is not a common cause of both X and Y, and therefore not a traditional
“confounder” as in Model 1. Nevertheless, controlling for Z blocks the back-door path from
X to Y due to the unobserved confounder U. Controlling for Z produces an unbiased estimate of
the ACE.

In [None]:
# DAG for Model 2: U points into Z and Y, Z points into X, X points into Y.
sprinkler2 = CausalGraphicalModel(
    nodes=["U", "Z", "Y", "X"],
    edges=[("U", "Z"), ("Z", "X"), ("U", "Y"), ("X", "Y")],
)
sprinkler2.draw()

### Model_3

In [10]:
# Simulate Model 3: U -> Z, U -> X, Z -> Y, X -> Y.
# Every structural coefficient is 1 and every noise term is standard normal.

np.random.seed(123456)   # fixed seed so the simulated sample is reproducible

n = 1000     # sample size; every draw below is sized from n
U = np.random.normal(0, 1, (n, 1))
Z = U + np.random.normal(0, 1, (n, 1))
X = U + np.random.normal(0, 1, (n, 1))
Y = X + Z + np.random.normal(0, 1, (n, 1))

# DataFrame: one column per simulated variable (U is kept but never used as a regressor)

D = np.hstack((U, Z, X, Y))
data = pd.DataFrame(D, columns = ["U", "Z", "X", "Y"])

In [11]:
# Fit Y on X alone, then Y on X plus the candidate control Z.
no_control = smf.ols("Y ~ X", data=data).fit()      # Z omitted
control = smf.ols("Y ~ X + Z", data=data).fit()     # Z adjusted for

# Side-by-side coefficient table with significance stars
model_3 = summary_col([no_control, control], stars=True)
print(model_3)


                  Y I       Y II  
----------------------------------
Intercept      0.0174    0.0288   
               (0.0505)  (0.0321) 
R-squared      0.6480    0.8578   
R-squared Adj. 0.6477    0.8575   
X              1.4894*** 0.9883***
               (0.0347)  (0.0257) 
Z                        0.9806***
                         (0.0256) 
Standard errors in parentheses.
* p<.1, ** p<.05, ***p<.01


Model_3 is analogous to Model_2. Z is not a common cause of both X and Y. Controlling for Z blocks the back-door path X ← U → Z → Y.
Controlling for Z produces an unbiased estimate of the ACE.

In [None]:
# DAG for Model 3: U points into Z and X, and both Z and X point into Y.
sprinkler3 = CausalGraphicalModel(
    nodes=["U", "Z", "Y", "X"],
    edges=[("U", "Z"), ("Z", "Y"), ("U", "X"), ("X", "Y")],
)
sprinkler3.draw()

### Model_4

In [12]:
# Simulate Model 4: Z -> X, Z -> M, X -> M, M -> Y.
# Every structural coefficient is 1 and every noise term is standard normal.

np.random.seed(123456)   # fixed seed so the simulated sample is reproducible

n = 1000     # sample size; every draw below is sized from n
Z = np.random.normal(0, 1, (n, 1))
X = Z + np.random.normal(0, 1, (n, 1))
M = Z + X + np.random.normal(0, 1, (n, 1))
Y = M + np.random.normal(0, 1, (n, 1))

# DataFrame: one column per simulated variable

D = np.hstack((M, Z, X, Y))
data = pd.DataFrame(D, columns = ["M", "Z", "X", "Y"])

In [13]:
# Fit Y on X alone, then Y on X plus the candidate control Z.
no_control = smf.ols("Y ~ X", data=data).fit()      # Z omitted
control = smf.ols("Y ~ X + Z", data=data).fit()     # Z adjusted for

# Side-by-side coefficient table with significance stars
model_4 = summary_col([no_control, control], stars=True)
print(model_4)


                  Y I       Y II  
----------------------------------
Intercept      0.0038    0.0327   
               (0.0506)  (0.0453) 
R-squared      0.6464    0.7170   
R-squared Adj. 0.6461    0.7164   
X              1.4816*** 0.9617***
               (0.0347)  (0.0453) 
Z                        0.9983***
                         (0.0633) 
Standard errors in parentheses.
* p<.1, ** p<.05, ***p<.01


Common causes of X and any mediator M (between X and Y) also confound the effect of X on Y. Controlling for Z blocks the 
back-door path from X to Y and produces an unbiased estimate of the ACE.

In [None]:
# DAG for Model 4: Z points into X and into the mediator M; X points into M; M points into Y.
# The X -> M edge mirrors the simulation for this model, where M = Z + X + noise.
sprinkler4 = CausalGraphicalModel(nodes=["M","Z","Y","X"],
                                 edges=[("Z","X"),
                                        ("Z","M"),
                                        ("X","M"),
                                        ("M","Y")])
sprinkler4.draw()

### Model_5

In [14]:
# Simulate Model 5: U -> Z, Z -> X, U -> M, X -> M, M -> Y.
# Every structural coefficient is 1 and every noise term is standard normal.

np.random.seed(123456)   # fixed seed so the simulated sample is reproducible

n = 1000     # sample size; every draw below is sized from n
U = np.random.normal(0, 1, (n, 1))
Z = U + np.random.normal(0, 1, (n, 1))
X = Z + np.random.normal(0, 1, (n, 1))
M = U + X + np.random.normal(0, 1, (n, 1))
Y = M + np.random.normal(0, 1, (n, 1))

# DataFrame: one column per simulated variable

D = np.hstack((M, U, Z, X, Y))
data = pd.DataFrame(D, columns = ["M", "U", "Z", "X", "Y"])

In [15]:
# Fit Y on X alone, then Y on X plus the candidate control Z.
no_control = smf.ols("Y ~ X", data=data).fit()      # Z omitted
control = smf.ols("Y ~ X + Z", data=data).fit()     # Z adjusted for

# Side-by-side coefficient table with significance stars
model_5 = summary_col([no_control, control], stars=True)
print(model_5)


                  Y I       Y II  
----------------------------------
Intercept      -0.0298   -0.0234  
               (0.0503)  (0.0487) 
R-squared      0.6874    0.7074   
R-squared Adj. 0.6871    0.7068   
X              1.3392*** 1.0137***
               (0.0286)  (0.0481) 
Z                        0.4792***
                         (0.0580) 
Standard errors in parentheses.
* p<.1, ** p<.05, ***p<.01


Z is not a common cause of X and Y, nor of X and the mediator M. Nevertheless, controlling for Z blocks the back-door path X ← Z ← U → M → Y and produces an unbiased estimate of the ACE.

In [None]:
# DAG for Model 5: U points into Z and M, Z points into X, X points into M, M points into Y.
sprinkler5 = CausalGraphicalModel(
    nodes=["M", "U", "Z", "Y", "X"],
    edges=[("U", "Z"), ("U", "M"), ("Z", "X"), ("X", "M"), ("M", "Y")],
)
sprinkler5.draw()

### Model_6

In [16]:
# Simulate Model 6: U -> X, U -> Z, Z -> M, X -> M, M -> Y.
# Every structural coefficient is 1 and every noise term is standard normal.

np.random.seed(123456)   # fixed seed so the simulated sample is reproducible

n = 1000     # sample size; every draw below is sized from n
U = np.random.normal(0, 1, (n, 1))
X = U + np.random.normal(0, 1, (n, 1))
Z = U + np.random.normal(0, 1, (n, 1))
M = Z + X + np.random.normal(0, 1, (n, 1))
Y = M + np.random.normal(0, 1, (n, 1))

# DataFrame: one column per simulated variable

D = np.hstack((M, U, Z, X, Y))
data = pd.DataFrame(D, columns = ["M", "U", "Z", "X", "Y"])

In [17]:
# Fit Y on X alone, then Y on X plus the candidate control Z.
no_control = smf.ols("Y ~ X", data=data).fit()      # Z omitted
control = smf.ols("Y ~ X + Z", data=data).fit()     # Z adjusted for

# Side-by-side coefficient table with significance stars
model_6 = summary_col([no_control, control], stars=True)
print(model_6)


                  Y I       Y II  
----------------------------------
Intercept      -0.0197   0.0050   
               (0.0584)  (0.0438) 
R-squared      0.5773    0.7635   
R-squared Adj. 0.5769    0.7630   
X              1.4788*** 0.9823***
               (0.0401)  (0.0348) 
Z                        0.9794***
                         (0.0350) 
Standard errors in parentheses.
* p<.1, ** p<.05, ***p<.01


Analogous to Model_5. Z is not a common cause of both X and Y. Controlling for Z produces an unbiased estimate of the ACE.

In [None]:
# DAG for Model 6: U points into Z and X, Z and X point into M, M points into Y.
sprinkler6 = CausalGraphicalModel(
    nodes=["M", "U", "Z", "Y", "X"],
    edges=[("U", "Z"), ("Z", "M"), ("U", "X"), ("X", "M"), ("M", "Y")],
)
sprinkler6.draw()

### Model_7

In [18]:
# Simulate Model 7: U1 -> Z, U2 -> Z, U1 -> X, U2 -> Y, X -> Y.
# Every structural coefficient is 1 and every noise term is standard normal.

np.random.seed(123456)   # fixed seed so the simulated sample is reproducible

n = 1000     # sample size; every draw below is sized from n
U1 = np.random.normal(0, 1, (n, 1))
U2 = np.random.normal(0, 1, (n, 1))
Z = U1 + U2 + np.random.normal(0, 1, (n, 1))
X = U1 + np.random.normal(0, 1, (n, 1))
Y = U2 + X + np.random.normal(0, 1, (n, 1))

# DataFrame: one column per simulated variable

D = np.hstack((U1, U2, Z, X, Y))
data = pd.DataFrame(D, columns = ["U1", "U2", "Z", "X", "Y"])

In [19]:
# Fit Y on X alone, then Y on X plus the candidate control Z.
no_control = smf.ols("Y ~ X", data=data).fit()      # Z omitted
control = smf.ols("Y ~ X + Z", data=data).fit()     # Z adjusted for

# Side-by-side coefficient table with significance stars
model_7 = summary_col([no_control, control], stars=True)
print(model_7)


                  Y I       Y II  
----------------------------------
Intercept      -0.0107   -0.0026  
               (0.0446)  (0.0399) 
R-squared      0.4936    0.5956   
R-squared Adj. 0.4931    0.5948   
X              0.9764*** 0.7702***
               (0.0313)  (0.0309) 
Z                        0.3965***
                         (0.0250) 
Standard errors in parentheses.
* p<.1, ** p<.05, ***p<.01


In this case, controlling for Z produces biased results. Here Z is correlated with the treatment and
the outcome and it is also a “pre-treatment” variable. This structure is known as "M-bias": controlling for Z induces bias by opening the back-door path X ← U1 → Z ← U2 → Y, thus spoiling a previously unbiased estimate of the ACE.

In [None]:
# DAG for Model 7: U1 and U2 both point into Z; U1 points into X, U2 into Y, and X into Y.
sprinkler7 = CausalGraphicalModel(
    nodes=["U1", "U2", "Z", "Y", "X"],
    edges=[("U1", "Z"), ("U2", "Z"), ("U1", "X"), ("U2", "Y"), ("X", "Y")],
)
sprinkler7.draw()

### Model_7 variation

In [20]:
# Simulate the Model 7 variation: same as Model 7 plus a direct Z -> Y term
# (U1 -> Z, U2 -> Z, U1 -> X, U2 -> Y, X -> Y, Z -> Y).
# Every structural coefficient is 1 and every noise term is standard normal.

np.random.seed(123456)   # fixed seed so the simulated sample is reproducible

n = 1000     # sample size; every draw below is sized from n
U1 = np.random.normal(0, 1, (n, 1))
U2 = np.random.normal(0, 1, (n, 1))
Z = U1 + U2 + np.random.normal(0, 1, (n, 1))
X = U1 + np.random.normal(0, 1, (n, 1))
Y = U2 + X + Z + np.random.normal(0, 1, (n, 1))

# DataFrame: one column per simulated variable

D = np.hstack((U1, U2, Z, X, Y))
data = pd.DataFrame(D, columns = ["U1", "U2", "Z", "X", "Y"])

In [21]:
# Fit Y on X alone, then Y on X plus the candidate control Z.
no_control = smf.ols("Y ~ X", data=data).fit()      # Z omitted
control = smf.ols("Y ~ X + Z", data=data).fit()     # Z adjusted for

# Side-by-side coefficient table with significance stars
model_7b = summary_col([no_control, control], stars=True)
print(model_7b)


                  Y I       Y II  
----------------------------------
Intercept      -0.0312   -0.0026  
               (0.0810)  (0.0399) 
R-squared      0.4098    0.8570   
R-squared Adj. 0.4092    0.8568   
X              1.4964*** 0.7702***
               (0.0568)  (0.0309) 
Z                        1.3965***
                         (0.0250) 
Standard errors in parentheses.
* p<.1, ** p<.05, ***p<.01


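In this variation Z also causes Y directly, which creates the back-door path X ← U1 → Z → Y. While adjusting for Z closes this back-door path, it also opens the back-door path X ← U1 → Z ← U2 → Y, as in the previous example (Model_7). With U1 and U2 unobserved, neither regression recovers the ACE: the causal effect is not identifiable.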

In [None]:
# DAG for the Model 7 variation: the Model 7 edges plus a direct Z -> Y edge.
sprinkler7b = CausalGraphicalModel(
    nodes=["U1", "U2", "Z", "Y", "X"],
    edges=[
        ("U1", "Z"), ("U2", "Z"),
        ("U1", "X"), ("U2", "Y"),
        ("X", "Y"), ("Z", "Y"),
    ],
)
sprinkler7b.draw()

### Model_8

In [22]:
# Simulate Model 8: Z and X are drawn independently and both enter Y (Z -> Y, X -> Y).
# Every structural coefficient is 1 and every noise term is standard normal.

np.random.seed(123456)   # fixed seed so the simulated sample is reproducible

n = 1000     # sample size; every draw below is sized from n
Z = np.random.normal(0, 1, (n, 1))
X = np.random.normal(0, 1, (n, 1))
Y = X + Z + np.random.normal(0, 1, (n, 1))

# DataFrame: one column per simulated variable

D = np.hstack((Z, X, Y))
data = pd.DataFrame(D, columns = ["Z", "X", "Y"])

In [23]:
# Fit Y on X alone, then Y on X plus the candidate control Z.
no_control = smf.ols("Y ~ X", data=data).fit()      # Z omitted
control = smf.ols("Y ~ X + Z", data=data).fit()     # Z adjusted for

# Side-by-side coefficient table with significance stars
model_8 = summary_col([no_control, control], stars=True)
print(model_8)


                  Y I       Y II  
----------------------------------
Intercept      -0.0418   0.0046   
               (0.0460)  (0.0320) 
R-squared      0.3168    0.6699   
R-squared Adj. 0.3161    0.6693   
X              0.9892*** 0.9697***
               (0.0460)  (0.0320) 
Z                        1.0011***
                         (0.0307) 
Standard errors in parentheses.
* p<.1, ** p<.05, ***p<.01


Z is not a confounder nor does it block any back-door paths. Controlling for Z does not open any back-door paths from X to Y.
Z is a “neutral control.” Analysis shows, however, that controlling for Z reduces the variation of the outcome variable Y , and helps to improve the precision of the ACE estimate in finite samples.

In [None]:
# DAG for Model 8: Z and X are separate causes of Y, with no edge between Z and X.
# This mirrors the simulation for this model, where Z and X are drawn independently
# and Y = X + Z + noise.
sprinkler8 = CausalGraphicalModel(nodes=["Z","Y","X"],
                                 edges=[("Z","Y"),
                                        ("X","Y")])
sprinkler8.draw()

### Model_9

In [24]:
# Simulate Model 9: Z -> X, X -> Y; Z does not enter Y directly.
# Every structural coefficient is 1 and every noise term is standard normal.

np.random.seed(123456)   # fixed seed so the simulated sample is reproducible

n = 1000     # sample size; every draw below is sized from n
Z = np.random.normal(0, 1, (n, 1))
X = Z + np.random.normal(0, 1, (n, 1))
Y = X + np.random.normal(0, 1, (n, 1))

# DataFrame: one column per simulated variable

D = np.hstack((Z, X, Y))
data = pd.DataFrame(D, columns = ["Z", "X", "Y"])

In [25]:
# Fit Y on X alone, then Y on X plus the candidate control Z.
no_control = smf.ols("Y ~ X", data=data).fit()      # Z omitted
control = smf.ols("Y ~ X + Z", data=data).fit()     # Z adjusted for

# Side-by-side coefficient table with significance stars
model_9 = summary_col([no_control, control], stars=True)
print(model_9)


                  Y I       Y II  
----------------------------------
Intercept      0.0036    0.0046   
               (0.0320)  (0.0320) 
R-squared      0.6698    0.6699   
R-squared Adj. 0.6694    0.6693   
X              0.9861*** 0.9697***
               (0.0219)  (0.0320) 
Z                        0.0314   
                         (0.0447) 
Standard errors in parentheses.
* p<.1, ** p<.05, ***p<.01


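As in Model_8, Z is neutral in terms of bias. Unlike Model_8, however, Z is a cause of X only, so controlling for it removes variation in the treatment and may hurt the precision of the ACE estimate in finite samples (the standard error of X rises from 0.0219 to 0.0320 in the table above).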

In [None]:
# DAG for Model 9: a chain Z -> X -> Y.
sprinkler9 = CausalGraphicalModel(
    nodes=["Z", "Y", "X"],
    edges=[("Z", "X"), ("X", "Y")],
)
sprinkler9.draw()

### Model_10

In [26]:
# Simulate Model 10: Z -> X, U -> X, U -> Y, X -> Y; Z and U are drawn independently.
# Every structural coefficient is 1 and every noise term is standard normal.

np.random.seed(123456)   # fixed seed so the simulated sample is reproducible

n = 1000     # sample size; every draw below is sized from n
U = np.random.normal(0, 1, (n, 1))
Z = np.random.normal(0, 1, (n, 1))
X = Z + U + np.random.normal(0, 1, (n, 1))
Y = X + U + np.random.normal(0, 1, (n, 1))

# DataFrame: one column per simulated variable (U is kept but never used as a regressor)

D = np.hstack((U, Z, X, Y))
data = pd.DataFrame(D, columns = ["U", "Z", "X", "Y"])

In [27]:
# Fit Y on X alone, then Y on X plus the candidate control Z.
no_control = smf.ols("Y ~ X", data=data).fit()      # Z omitted
control = smf.ols("Y ~ X + Z", data=data).fit()     # Z adjusted for

# Side-by-side coefficient table with significance stars
model_10 = summary_col([no_control, control], stars=True)
print(model_10)


                  Y I       Y II   
-----------------------------------
Intercept      -0.0063   0.0044    
               (0.0409)  (0.0390)  
R-squared      0.7692    0.7908    
R-squared Adj. 0.7689    0.7904    
X              1.3413*** 1.4946*** 
               (0.0233)  (0.0268)  
Z                        -0.4787***
                         (0.0471)  
Standard errors in parentheses.
* p<.1, ** p<.05, ***p<.01


Z only causes X. Naive control for Z in this model will not only fail to deconfound the effect of X on Y , but, in linear models, will amplify any existing bias. This is called "bias amplification".

In [None]:
# DAG for Model 10: Z and U point into X, U and X point into Y.
sprinkler10 = CausalGraphicalModel(
    nodes=["U", "Z", "Y", "X"],
    edges=[("Z", "X"), ("X", "Y"), ("U", "X"), ("U", "Y")],
)
sprinkler10.draw()

### Model_11

In [28]:
# Simulate Model 11: a chain X -> Z -> Y; X does not enter Y directly.
# Every structural coefficient is 1 and every noise term is standard normal.

np.random.seed(123456)   # fixed seed so the simulated sample is reproducible

n = 1000     # sample size; every draw below is sized from n
X = np.random.normal(0, 1, (n, 1))
Z = X + np.random.normal(0, 1, (n, 1))
Y = Z + np.random.normal(0, 1, (n, 1))

# DataFrame: one column per simulated variable

D = np.hstack((Z, X, Y))
data = pd.DataFrame(D, columns = ["Z", "X", "Y"])

In [29]:
# Fit Y on X alone, then Y on X plus the candidate control Z.
no_control = smf.ols("Y ~ X", data=data).fit()      # Z omitted
control = smf.ols("Y ~ X + Z", data=data).fit()     # Z adjusted for

# Side-by-side coefficient table with significance stars
model_11 = summary_col([no_control, control], stars=True)
print(model_11)


                  Y I       Y II  
----------------------------------
Intercept      0.0180    0.0046   
               (0.0444)  (0.0320) 
R-squared      0.3656    0.6699   
R-squared Adj. 0.3650    0.6693   
X              1.0185*** 0.0314   
               (0.0425)  (0.0447) 
Z                        0.9697***
                         (0.0320) 
Standard errors in parentheses.
* p<.1, ** p<.05, ***p<.01


In Model 11, Z is a mediator of the causal effect of X on Y. Controlling for Z will block the very effect we want to estimate.

In [None]:
# DAG for Model 11: a chain X -> Z -> Y.
sprinkler11 = CausalGraphicalModel(
    nodes=["Z", "Y", "X"],
    edges=[("X", "Z"), ("Z", "Y")],
)
sprinkler11.draw()

### Model_12

In [30]:
# Simulate Model 12: X -> M, M -> Z, M -> Y; Z is built from M and does not enter Y.
# Every structural coefficient is 1 and every noise term is standard normal.

np.random.seed(123456)   # fixed seed so the simulated sample is reproducible

n = 1000     # sample size; every draw below is sized from n
X = np.random.normal(0, 1, (n, 1))
M = X + np.random.normal(0, 1, (n, 1))
Z = M + np.random.normal(0, 1, (n, 1))
Y = M + np.random.normal(0, 1, (n, 1))

# DataFrame: one column per simulated variable

D = np.hstack((M, Z, X, Y))
data = pd.DataFrame(D, columns = ["M", "Z", "X", "Y"])

In [31]:
# Fit Y on X alone, then Y on X plus the candidate control Z.
no_control = smf.ols("Y ~ X", data=data).fit()      # Z omitted
control = smf.ols("Y ~ X + Z", data=data).fit()     # Z adjusted for

# Side-by-side coefficient table with significance stars
model_12 = summary_col([no_control, control], stars=True)
print(model_12)


                  Y I       Y II  
----------------------------------
Intercept      0.0419    0.0330   
               (0.0449)  (0.0393) 
R-squared      0.3405    0.4950   
R-squared Adj. 0.3399    0.4940   
X              0.9765*** 0.4773***
               (0.0430)  (0.0473) 
Z                        0.4902***
                         (0.0281) 
Standard errors in parentheses.
* p<.1, ** p<.05, ***p<.01


In Model 12, although Z is not itself a mediator of the causal effect of X on Y , controlling for
Z is equivalent to partially controlling for the mediator M, and will thus bias our estimates.

In [None]:
# DAG for Model 12: X points into the mediator M, and M points into both Y and Z.
# The M -> Z direction mirrors the simulation for this model, where Z = M + noise,
# i.e. Z is an effect of the mediator rather than a cause of it.
sprinkler12 = CausalGraphicalModel(nodes=["M","Z","Y","X"],
                                 edges=[("X","M"),
                                        ("M","Y"),
                                        ("M","Z")])
sprinkler12.draw()

### Model_13

In [32]:
# Simulate Model 13: X -> M, Z -> M, M -> Y; X and Z are drawn independently.
# Every structural coefficient is 1 and every noise term is standard normal.

np.random.seed(123456)   # fixed seed so the simulated sample is reproducible

n = 1000     # sample size; every draw below is sized from n
X = np.random.normal(0, 1, (n, 1))
Z = np.random.normal(0, 1, (n, 1))
M = X + Z + np.random.normal(0, 1, (n, 1))
Y = M + np.random.normal(0, 1, (n, 1))

# DataFrame: one column per simulated variable

D = np.hstack((M, Z, X, Y))
data = pd.DataFrame(D, columns = ["M", "Z", "X", "Y"])

In [33]:
# Fit Y on X alone, then Y on X plus the candidate control Z.
no_control = smf.ols("Y ~ X", data=data).fit()      # Z omitted
control = smf.ols("Y ~ X + Z", data=data).fit()     # Z adjusted for

# Side-by-side coefficient table with significance stars
model_13 = summary_col([no_control, control], stars=True)
print(model_13)


                  Y I       Y II  
----------------------------------
Intercept      0.0460    0.0327   
               (0.0546)  (0.0453) 
R-squared      0.2593    0.4899   
R-squared Adj. 0.2586    0.4889   
X              0.9771*** 0.9600***
               (0.0523)  (0.0434) 
Z                        0.9617***
                         (0.0453) 
Standard errors in parentheses.
* p<.1, ** p<.05, ***p<.01


Neutral effect. Z is a cause, not an effect, of the mediator (and, consequently, also a cause of Y). Thus, Model 13 is analogous to Model 8, and so controlling for Z will be neutral in terms of bias and may increase the precision of the ACE estimate.

### Model_14

In [34]:
# Simulate Model 14: X -> Z and X -> Y; Z does not enter Y.
# Every structural coefficient is 1 and every noise term is standard normal.

np.random.seed(123456)   # fixed seed so the simulated sample is reproducible

n = 1000     # sample size; every draw below is sized from n
X = np.random.normal(0, 1, (n, 1))
Z = X + np.random.normal(0, 1, (n, 1))
Y = X + np.random.normal(0, 1, (n, 1))

# DataFrame: one column per simulated variable

D = np.hstack((Z, X, Y))
data = pd.DataFrame(D, columns = ["Z", "X", "Y"])

In [35]:
# Fit Y on X alone, then Y on X plus the candidate control Z.
no_control = smf.ols("Y ~ X", data=data).fit()      # Z omitted
control = smf.ols("Y ~ X + Z", data=data).fit()     # Z adjusted for

# Side-by-side coefficient table with significance stars
model_14 = summary_col([no_control, control], stars=True)
print(model_14)


                  Y I       Y II  
----------------------------------
Intercept      0.0041    0.0046   
               (0.0320)  (0.0320) 
R-squared      0.5165    0.5169   
R-squared Adj. 0.5160    0.5159   
X              1.0006*** 1.0314***
               (0.0306)  (0.0447) 
Z                        -0.0303  
                         (0.0320) 
Standard errors in parentheses.
* p<.1, ** p<.05, ***p<.01


In Models 14 and 15 controlling for Z does not open any confounding paths between X and
Y . Thus, Z is neutral in terms of bias. However, controlling for Z does reduce the variation
of the treatment variable X and so may hurt the precision of the ACE estimate in finite
samples.

### Model_15

In [36]:
# Simulate Model 15: X -> Z, Z -> W, U -> W, U -> Y, X -> Y.
# Every structural coefficient is 1 and every noise term is standard normal.

np.random.seed(123456)   # fixed seed so the simulated sample is reproducible

n = 1000     # sample size; every draw below is sized from n
X = np.random.normal(0, 1, (n, 1))
U = np.random.normal(0, 1, (n, 1))
Z = X + np.random.normal(0, 1, (n, 1))
W = Z + U + np.random.normal(0, 1, (n, 1))   # common effect of Z and U
Y = X + U + np.random.normal(0, 1, (n, 1))

# DataFrame: one column per simulated variable

D = np.hstack((W, U, Z, X, Y))
data = pd.DataFrame(D, columns = ["W", "U", "Z", "X", "Y"])

In [37]:
# Fit Y on X alone, then Y on X plus the candidate control Z.
no_control = smf.ols("Y ~ X", data=data).fit()      # Z omitted
control = smf.ols("Y ~ X + Z", data=data).fit()     # Z adjusted for

# Side-by-side coefficient table with significance stars
model_15 = summary_col([no_control, control], stars=True)
print(model_15)


                  Y I       Y II  
----------------------------------
Intercept      -0.0102   -0.0101  
               (0.0446)  (0.0447) 
R-squared      0.3556    0.3559   
R-squared Adj. 0.3549    0.3546   
X              1.0028*** 1.0330***
               (0.0427)  (0.0615) 
Z                        -0.0302  
                         (0.0442) 
Standard errors in parentheses.
* p<.1, ** p<.05, ***p<.01


In Model 15, suppose one has only samples with W = w recorded (a case of selection bias). In this case, controlling for Z can help to obtain the W-specific effect of X on Y by blocking the colliding path due to W. In linear models, controlling for Z actually fully recovers the ACE.

### Model_16

In [38]:
# Simulate Model 16: X -> Z, U -> Z, U -> Y, X -> Y; X and U are drawn independently.
# Every structural coefficient is 1 and every noise term is standard normal.

np.random.seed(123456)   # fixed seed so the simulated sample is reproducible

n = 1000     # sample size; every draw below is sized from n
X = np.random.normal(0, 1, (n, 1))
U = np.random.normal(0, 1, (n, 1))
Z = X + U + np.random.normal(0, 1, (n, 1))   # common effect of X and U
Y = X + U + np.random.normal(0, 1, (n, 1))

# DataFrame: one column per simulated variable (U is kept but never used as a regressor)

D = np.hstack((U, Z, X, Y))
data = pd.DataFrame(D, columns = ["U", "Z", "X", "Y"])

In [39]:
# Fit Y on X alone, then Y on X plus the candidate control Z.
no_control = smf.ols("Y ~ X", data=data).fit()      # Z omitted
control = smf.ols("Y ~ X + Z", data=data).fit()     # Z adjusted for

# Side-by-side coefficient table with significance stars
model_16 = summary_col([no_control, control], stars=True)
print(model_16)


                  Y I       Y II  
----------------------------------
Intercept      0.0419    0.0330   
               (0.0449)  (0.0393) 
R-squared      0.3405    0.4950   
R-squared Adj. 0.3399    0.4940   
X              0.9765*** 0.4773***
               (0.0430)  (0.0473) 
Z                        0.4902***
                         (0.0281) 
Standard errors in parentheses.
* p<.1, ** p<.05, ***p<.01


Z is a bad control.  Adjusting for Z in Model_16 opens the colliding path X → Z ← U → Y and so biases the ACE.

### Model_17

In [40]:
# Simulate Model 17: X -> Y, X -> Z, Y -> Z; Z is built from both treatment and outcome.
# Every structural coefficient is 1 and every noise term is standard normal.

np.random.seed(123456)   # fixed seed so the simulated sample is reproducible

n = 1000     # sample size; every draw below is sized from n
X = np.random.normal(0, 1, (n, 1))
Y = X + np.random.normal(0, 1, (n, 1))
Z = X + Y + np.random.normal(0, 1, (n, 1))   # common effect of X and Y

# DataFrame: one column per simulated variable

D = np.hstack((Z, X, Y))
data = pd.DataFrame(D, columns = ["Z", "X", "Y"])

In [41]:
# Fit Y on X alone, then Y on X plus the candidate control Z.
no_control = smf.ols("Y ~ X", data=data).fit()      # Z omitted
control = smf.ols("Y ~ X + Z", data=data).fit()     # Z adjusted for

# Side-by-side coefficient table with significance stars
model_17 = summary_col([no_control, control], stars=True)
print(model_17)


                  Y I       Y II  
----------------------------------
Intercept      0.0139    0.0050   
               (0.0317)  (0.0229) 
R-squared      0.5302    0.7555   
R-squared Adj. 0.5297    0.7551   
X              1.0179*** 0.0194   
               (0.0303)  (0.0395) 
Z                        0.4947***
                         (0.0163) 
Standard errors in parentheses.
* p<.1, ** p<.05, ***p<.01


In Model_17, adjusting for Z not only opens the path X → Z ← Y, but also the colliding
path due to the latent parents of Y, thus biasing the ACE.

### Model_18

In [42]:
# Simulate Model 18: a chain X -> Y -> Z; Z is built from the outcome only.
# Every structural coefficient is 1 and every noise term is standard normal.

np.random.seed(123456)   # fixed seed so the simulated sample is reproducible

n = 1000     # sample size; every draw below is sized from n
X = np.random.normal(0, 1, (n, 1))
Y = X + np.random.normal(0, 1, (n, 1))
Z = Y + np.random.normal(0, 1, (n, 1))

# DataFrame: one column per simulated variable

D = np.hstack((Z, X, Y))
data = pd.DataFrame(D, columns = ["Z", "X", "Y"])

In [43]:
# Fit Y on X alone, then Y on X plus the candidate control Z.
no_control = smf.ols("Y ~ X", data=data).fit()      # Z omitted
control = smf.ols("Y ~ X + Z", data=data).fit()     # Z adjusted for

# Side-by-side coefficient table with significance stars
model_18 = summary_col([no_control, control], stars=True)
print(model_18)


                  Y I       Y II  
----------------------------------
Intercept      0.0139    0.0050   
               (0.0317)  (0.0229) 
R-squared      0.5302    0.7555   
R-squared Adj. 0.5297    0.7551   
X              1.0179*** 0.5141***
               (0.0303)  (0.0275) 
Z                        0.4947***
                         (0.0163) 
Standard errors in parentheses.
* p<.1, ** p<.05, ***p<.01


Z is a bad control. Z is not in the causal pathway from X to Y , Z is not a direct cause
of X, and Z is connected to Y . However, controlling for the effects of the outcome Y will induce bias in the estimate of the ACE, even without the direct arrow X → Z, thus making Z a “bad control.” This happens
because Z is in fact a descendant of a collider: the outcome Y itself. 