# PD7 - Directed Acyclic Graphs

In [1]:
import numpy as np
import statsmodels.api as sms

In [2]:
# Seed the legacy global NumPy RNG once, up front. Every later cell draws through
# `np.random.*`, so this makes the whole notebook reproducible under
# Restart Kernel -> Run All. Stored outputs captured before the seed was added
# will differ from a fresh run.
SEED = 42
np.random.seed(SEED)

# Size of the simulated population and the per-unit probability of being sampled.
POP_SIZE = 1_000_000
SAMPLE_RATE = 0.001

# Boolean (POP_SIZE, 1) mask: a unit is selected when its uniform draw on [0, 1)
# falls below SAMPLE_RATE.
sample = np.random.uniform(0, 1, (POP_SIZE, 1)) < SAMPLE_RATE
sample.sum()  # number of selected units

np.int64(1046)

In [3]:
# Row positions of the selected units: take the row coordinates of the True
# entries in the 2-D boolean mask and discard the column coordinates.
row_positions, _col_positions = np.nonzero(sample)
sample_indices = row_positions

## 1.

![](assets/dags1.png)

In [4]:
# Z enters both X and Y, and X enters Y; each noise term is a separate N(0, 1) draw.
pop_shape = (1_000_000, 1)
pop_Z = np.random.normal(loc=0, scale=1, size=pop_shape)
noise_X = np.random.normal(loc=0, scale=1, size=pop_shape)
pop_X = pop_Z + noise_X
noise_Y = np.random.normal(loc=0, scale=1, size=pop_shape)
pop_Y = pop_Z + pop_X + noise_Y

In [5]:
# OLS of Y on X alone, using only the sampled rows; the regressor matrix passed
# in holds just the X column.
y_obs = pop_Y[sample_indices]
x_obs = pop_X[sample_indices]
fit_x = sms.OLS(y_obs, x_obs).fit()
print(fit_x.summary())

                                 OLS Regression Results                                
Dep. Variable:                      y   R-squared (uncentered):                   0.779
Model:                            OLS   Adj. R-squared (uncentered):              0.779
Method:                 Least Squares   F-statistic:                              3688.
Date:                Thu, 03 Oct 2024   Prob (F-statistic):                        0.00
Time:                        12:12:15   Log-Likelihood:                         -1678.1
No. Observations:                1046   AIC:                                      3358.
Df Residuals:                    1045   BIC:                                      3363.
Df Model:                           1                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

In [6]:
# OLS of Y on X and Z together, using only the sampled rows; the regressor
# columns are ordered X first, then Z.
y_obs = pop_Y[sample_indices]
xz_obs = np.hstack((pop_X[sample_indices], pop_Z[sample_indices]))
fit_xz = sms.OLS(y_obs, xz_obs).fit()
print(fit_xz.summary())

                                 OLS Regression Results                                
Dep. Variable:                      y   R-squared (uncentered):                   0.850
Model:                            OLS   Adj. R-squared (uncentered):              0.849
Method:                 Least Squares   F-statistic:                              2949.
Date:                Thu, 03 Oct 2024   Prob (F-statistic):                        0.00
Time:                        12:12:15   Log-Likelihood:                         -1477.3
No. Observations:                1046   AIC:                                      2959.
Df Residuals:                    1044   BIC:                                      2969.
Df Model:                           2                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

1. Good controls: controlling for a confounder

In [7]:
# U enters Z and Y, Z enters X, and X enters Y; each noise term is a separate N(0, 1) draw.
pop_shape = (1_000_000, 1)
pop_U = np.random.normal(loc=0, scale=1, size=pop_shape)
noise_Z = np.random.normal(loc=0, scale=1, size=pop_shape)
pop_Z = pop_U + noise_Z
noise_X = np.random.normal(loc=0, scale=1, size=pop_shape)
pop_X = pop_Z + noise_X
noise_Y = np.random.normal(loc=0, scale=1, size=pop_shape)
pop_Y = pop_U + pop_X + noise_Y

In [8]:
# OLS of Y on X alone, using only the sampled rows; the regressor matrix passed
# in holds just the X column.
y_obs = pop_Y[sample_indices]
x_obs = pop_X[sample_indices]
fit_x = sms.OLS(y_obs, x_obs).fit()
print(fit_x.summary())

                                 OLS Regression Results                                
Dep. Variable:                      y   R-squared (uncentered):                   0.767
Model:                            OLS   Adj. R-squared (uncentered):              0.766
Method:                 Least Squares   F-statistic:                              3434.
Date:                Thu, 03 Oct 2024   Prob (F-statistic):                        0.00
Time:                        12:12:15   Log-Likelihood:                         -1709.9
No. Observations:                1046   AIC:                                      3422.
Df Residuals:                    1045   BIC:                                      3427.
Df Model:                           1                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

In [9]:
# OLS of Y on X and Z together, using only the sampled rows; the regressor
# columns are ordered X first, then Z.
y_obs = pop_Y[sample_indices]
xz_obs = np.hstack((pop_X[sample_indices], pop_Z[sample_indices]))
fit_xz = sms.OLS(y_obs, xz_obs).fit()
print(fit_xz.summary())

                                 OLS Regression Results                                
Dep. Variable:                      y   R-squared (uncentered):                   0.791
Model:                            OLS   Adj. R-squared (uncentered):              0.790
Method:                 Least Squares   F-statistic:                              1972.
Date:                Thu, 03 Oct 2024   Prob (F-statistic):                        0.00
Time:                        12:12:15   Log-Likelihood:                         -1653.1
No. Observations:                1046   AIC:                                      3310.
Df Residuals:                    1044   BIC:                                      3320.
Df Model:                           2                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

## 2. 

![](assets/dags2.png)

In [10]:
# U enters both Z and X, Z enters M, and Y is built from X and M;
# each noise term is a separate N(0, 1) draw.
pop_shape = (1_000_000, 1)
pop_U = np.random.normal(loc=0, scale=1, size=pop_shape)
noise_Z = np.random.normal(loc=0, scale=1, size=pop_shape)
pop_Z = pop_U + noise_Z
noise_X = np.random.normal(loc=0, scale=1, size=pop_shape)
pop_X = pop_U + noise_X
noise_M = np.random.normal(loc=0, scale=1, size=pop_shape)
pop_M = pop_Z + noise_M
noise_Y = np.random.normal(loc=0, scale=1, size=pop_shape)
pop_Y = pop_X + pop_M + noise_Y

In [11]:
# OLS of Y on X alone, using only the sampled rows; the regressor matrix passed
# in holds just the X column.
y_obs = pop_Y[sample_indices]
x_obs = pop_X[sample_indices]
fit_x = sms.OLS(y_obs, x_obs).fit()
print(fit_x.summary())

                                 OLS Regression Results                                
Dep. Variable:                      y   R-squared (uncentered):                   0.577
Model:                            OLS   Adj. R-squared (uncentered):              0.577
Method:                 Least Squares   F-statistic:                              1427.
Date:                Thu, 03 Oct 2024   Prob (F-statistic):                   1.22e-197
Time:                        12:12:15   Log-Likelihood:                         -2125.6
No. Observations:                1046   AIC:                                      4253.
Df Residuals:                    1045   BIC:                                      4258.
Df Model:                           1                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

In [12]:
# OLS of Y on X and Z together, using only the sampled rows; the regressor
# columns are ordered X first, then Z.
y_obs = pop_Y[sample_indices]
xz_obs = np.hstack((pop_X[sample_indices], pop_Z[sample_indices]))
fit_xz = sms.OLS(y_obs, xz_obs).fit()
print(fit_xz.summary())

                                 OLS Regression Results                                
Dep. Variable:                      y   R-squared (uncentered):                   0.762
Model:                            OLS   Adj. R-squared (uncentered):              0.762
Method:                 Least Squares   F-statistic:                              1673.
Date:                Thu, 03 Oct 2024   Prob (F-statistic):                        0.00
Time:                        12:12:15   Log-Likelihood:                         -1824.8
No. Observations:                1046   AIC:                                      3654.
Df Residuals:                    1044   BIC:                                      3664.
Df Model:                           2                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

2. Good controls: controlling for a confounder

## 3.

![](assets/dags3.png)

In [13]:
# U1 enters Z and X, U2 enters Z and Y, and X enters Y; Z is built from both U1 and U2.
# Each noise term is a separate N(0, 1) draw.
pop_shape = (1_000_000, 1)
pop_U1 = np.random.normal(loc=0, scale=1, size=pop_shape)
pop_U2 = np.random.normal(loc=0, scale=1, size=pop_shape)
noise_Z = np.random.normal(loc=0, scale=1, size=pop_shape)
pop_Z = pop_U1 + pop_U2 + noise_Z
noise_X = np.random.normal(loc=0, scale=1, size=pop_shape)
pop_X = pop_U1 + noise_X
noise_Y = np.random.normal(loc=0, scale=1, size=pop_shape)
pop_Y = pop_X + pop_U2 + noise_Y

In [14]:
# OLS of Y on X alone, using only the sampled rows; the regressor matrix passed
# in holds just the X column.
y_obs = pop_Y[sample_indices]
x_obs = pop_X[sample_indices]
fit_x = sms.OLS(y_obs, x_obs).fit()
print(fit_x.summary())

                                 OLS Regression Results                                
Dep. Variable:                      y   R-squared (uncentered):                   0.531
Model:                            OLS   Adj. R-squared (uncentered):              0.530
Method:                 Least Squares   F-statistic:                              1183.
Date:                Thu, 03 Oct 2024   Prob (F-statistic):                   5.87e-174
Time:                        12:12:15   Log-Likelihood:                         -1811.8
No. Observations:                1046   AIC:                                      3626.
Df Residuals:                    1045   BIC:                                      3631.
Df Model:                           1                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

In [15]:
# OLS of Y on X and Z together, using only the sampled rows; the regressor
# columns are ordered X first, then Z.
y_obs = pop_Y[sample_indices]
xz_obs = np.hstack((pop_X[sample_indices], pop_Z[sample_indices]))
fit_xz = sms.OLS(y_obs, xz_obs).fit()
print(fit_xz.summary())

                                 OLS Regression Results                                
Dep. Variable:                      y   R-squared (uncentered):                   0.625
Model:                            OLS   Adj. R-squared (uncentered):              0.625
Method:                 Least Squares   F-statistic:                              871.7
Date:                Thu, 03 Oct 2024   Prob (F-statistic):                   2.33e-223
Time:                        12:12:15   Log-Likelihood:                         -1694.1
No. Observations:                1046   AIC:                                      3392.
Df Residuals:                    1044   BIC:                                      3402.
Df Model:                           2                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

3. Bad control: controlling for a collider (M-graph)

## 4. Damned if you do, damned if you don't

![](assets/dags4.png)

In this situation, it is not clear whether Z is a good or a bad control. Other tools, such as sensitivity analysis, may be needed to assess the effect of using it as a control.

## 5.

![](assets/dags5.png)

In [16]:
# Z and X are drawn separately and both enter Y; each noise term is a separate N(0, 1) draw.
pop_shape = (1_000_000, 1)
pop_Z = np.random.normal(loc=0, scale=1, size=pop_shape)
pop_X = np.random.normal(loc=0, scale=1, size=pop_shape)
noise_Y = np.random.normal(loc=0, scale=1, size=pop_shape)
pop_Y = pop_X + pop_Z + noise_Y

In [17]:
# OLS of Y on X alone, using only the sampled rows; the regressor matrix passed
# in holds just the X column.
y_obs = pop_Y[sample_indices]
x_obs = pop_X[sample_indices]
fit_x = sms.OLS(y_obs, x_obs).fit()
print(fit_x.summary())

                                 OLS Regression Results                                
Dep. Variable:                      y   R-squared (uncentered):                   0.334
Model:                            OLS   Adj. R-squared (uncentered):              0.334
Method:                 Least Squares   F-statistic:                              525.0
Date:                Thu, 03 Oct 2024   Prob (F-statistic):                    1.79e-94
Time:                        12:12:16   Log-Likelihood:                         -1869.1
No. Observations:                1046   AIC:                                      3740.
Df Residuals:                    1045   BIC:                                      3745.
Df Model:                           1                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

In [18]:
# OLS of Y on X and Z together, using only the sampled rows; the regressor
# columns are ordered X first, then Z.
y_obs = pop_Y[sample_indices]
xz_obs = np.hstack((pop_X[sample_indices], pop_Z[sample_indices]))
fit_xz = sms.OLS(y_obs, xz_obs).fit()
print(fit_xz.summary())

                                 OLS Regression Results                                
Dep. Variable:                      y   R-squared (uncentered):                   0.653
Model:                            OLS   Adj. R-squared (uncentered):              0.653
Method:                 Least Squares   F-statistic:                              983.2
Date:                Thu, 03 Oct 2024   Prob (F-statistic):                   8.20e-241
Time:                        12:12:16   Log-Likelihood:                         -1528.2
No. Observations:                1046   AIC:                                      3060.
Df Residuals:                    1044   BIC:                                      3070.
Df Model:                           2                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

5. Neutral control (but raises precision)

## 6.

![](assets/dags6.png)

In [19]:
# Z enters X only, and X enters Y; each noise term is a separate N(0, 1) draw.
pop_shape = (1_000_000, 1)
pop_Z = np.random.normal(loc=0, scale=1, size=pop_shape)
noise_X = np.random.normal(loc=0, scale=1, size=pop_shape)
pop_X = pop_Z + noise_X
noise_Y = np.random.normal(loc=0, scale=1, size=pop_shape)
pop_Y = pop_X + noise_Y

In [20]:
# OLS of Y on X alone, using only the sampled rows; the regressor matrix passed
# in holds just the X column.
y_obs = pop_Y[sample_indices]
x_obs = pop_X[sample_indices]
fit_x = sms.OLS(y_obs, x_obs).fit()
print(fit_x.summary())

                                 OLS Regression Results                                
Dep. Variable:                      y   R-squared (uncentered):                   0.638
Model:                            OLS   Adj. R-squared (uncentered):              0.637
Method:                 Least Squares   F-statistic:                              1838.
Date:                Thu, 03 Oct 2024   Prob (F-statistic):                   1.62e-232
Time:                        12:12:16   Log-Likelihood:                         -1502.9
No. Observations:                1046   AIC:                                      3008.
Df Residuals:                    1045   BIC:                                      3013.
Df Model:                           1                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

In [21]:
# OLS of Y on X and Z together, using only the sampled rows; the regressor
# columns are ordered X first, then Z.
y_obs = pop_Y[sample_indices]
xz_obs = np.hstack((pop_X[sample_indices], pop_Z[sample_indices]))
fit_xz = sms.OLS(y_obs, xz_obs).fit()
print(fit_xz.summary())

                                 OLS Regression Results                                
Dep. Variable:                      y   R-squared (uncentered):                   0.638
Model:                            OLS   Adj. R-squared (uncentered):              0.637
Method:                 Least Squares   F-statistic:                              918.2
Date:                Thu, 03 Oct 2024   Prob (F-statistic):                   8.36e-231
Time:                        12:12:16   Log-Likelihood:                         -1502.9
No. Observations:                1046   AIC:                                      3010.
Df Residuals:                    1044   BIC:                                      3020.
Df Model:                           2                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

6. Neutral control (but lowers precision)

## 7.

![](assets/dags7.png)

In [22]:
# X enters M, and M enters both Z and Y; each noise term is a separate N(0, 1) draw.
pop_shape = (1_000_000, 1)
pop_X = np.random.normal(loc=0, scale=1, size=pop_shape)
noise_M = np.random.normal(loc=0, scale=1, size=pop_shape)
pop_M = pop_X + noise_M
noise_Z = np.random.normal(loc=0, scale=1, size=pop_shape)
pop_Z = pop_M + noise_Z
noise_Y = np.random.normal(loc=0, scale=1, size=pop_shape)
pop_Y = pop_M + noise_Y

In [23]:
# OLS of Y on X alone, using only the sampled rows; the regressor matrix passed
# in holds just the X column.
y_obs = pop_Y[sample_indices]
x_obs = pop_X[sample_indices]
fit_x = sms.OLS(y_obs, x_obs).fit()
print(fit_x.summary())

                                 OLS Regression Results                                
Dep. Variable:                      y   R-squared (uncentered):                   0.307
Model:                            OLS   Adj. R-squared (uncentered):              0.307
Method:                 Least Squares   F-statistic:                              463.3
Date:                Thu, 03 Oct 2024   Prob (F-statistic):                    2.36e-85
Time:                        12:12:16   Log-Likelihood:                         -1840.6
No. Observations:                1046   AIC:                                      3683.
Df Residuals:                    1045   BIC:                                      3688.
Df Model:                           1                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

In [24]:
# OLS of Y on X and Z together, using only the sampled rows; the regressor
# columns are ordered X first, then Z.
y_obs = pop_Y[sample_indices]
xz_obs = np.hstack((pop_X[sample_indices], pop_Z[sample_indices]))
fit_xz = sms.OLS(y_obs, xz_obs).fit()
print(fit_xz.summary())

                                 OLS Regression Results                                
Dep. Variable:                      y   R-squared (uncentered):                   0.490
Model:                            OLS   Adj. R-squared (uncentered):              0.489
Method:                 Least Squares   F-statistic:                              500.7
Date:                Thu, 03 Oct 2024   Prob (F-statistic):                   3.47e-153
Time:                        12:12:16   Log-Likelihood:                         -1680.8
No. Observations:                1046   AIC:                                      3366.
Df Residuals:                    1044   BIC:                                      3376.
Df Model:                           2                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

7. Bad control: introduces overcontrol bias

## 8.

![](assets/dags8.png)

In [25]:
# X and Z are drawn separately and both enter M; M alone enters Y.
# Each noise term is a separate N(0, 1) draw.
pop_shape = (1_000_000, 1)
pop_X = np.random.normal(loc=0, scale=1, size=pop_shape)
pop_Z = np.random.normal(loc=0, scale=1, size=pop_shape)
noise_M = np.random.normal(loc=0, scale=1, size=pop_shape)
pop_M = pop_X + pop_Z + noise_M
noise_Y = np.random.normal(loc=0, scale=1, size=pop_shape)
pop_Y = pop_M + noise_Y

In [26]:
# OLS of Y on X alone, using only the sampled rows; the regressor matrix passed
# in holds just the X column.
y_obs = pop_Y[sample_indices]
x_obs = pop_X[sample_indices]
fit_x = sms.OLS(y_obs, x_obs).fit()
print(fit_x.summary())

                                 OLS Regression Results                                
Dep. Variable:                      y   R-squared (uncentered):                   0.259
Model:                            OLS   Adj. R-squared (uncentered):              0.259
Method:                 Least Squares   F-statistic:                              365.7
Date:                Thu, 03 Oct 2024   Prob (F-statistic):                    3.86e-70
Time:                        12:12:16   Log-Likelihood:                         -2065.8
No. Observations:                1046   AIC:                                      4134.
Df Residuals:                    1045   BIC:                                      4138.
Df Model:                           1                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

In [27]:
# OLS of Y on X and Z together, using only the sampled rows; the regressor
# columns are ordered X first, then Z.
y_obs = pop_Y[sample_indices]
xz_obs = np.hstack((pop_X[sample_indices], pop_Z[sample_indices]))
fit_xz = sms.OLS(y_obs, xz_obs).fit()
print(fit_xz.summary())

                                 OLS Regression Results                                
Dep. Variable:                      y   R-squared (uncentered):                   0.520
Model:                            OLS   Adj. R-squared (uncentered):              0.519
Method:                 Least Squares   F-statistic:                              565.7
Date:                Thu, 03 Oct 2024   Prob (F-statistic):                   3.73e-167
Time:                        12:12:16   Log-Likelihood:                         -1838.8
No. Observations:                1046   AIC:                                      3682.
Df Residuals:                    1044   BIC:                                      3691.
Df Model:                           2                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

8. Neutral control (but raises precision)

## 9.

![](assets/dags9.png)

In [28]:
# X and U are drawn separately; Y and Z are each built from X and U with their
# own N(0, 1) noise draw (Y's noise is drawn before Z's).
pop_shape = (1_000_000, 1)
pop_X = np.random.normal(loc=0, scale=1, size=pop_shape)
pop_U = np.random.normal(loc=0, scale=1, size=pop_shape)
noise_Y = np.random.normal(loc=0, scale=1, size=pop_shape)
pop_Y = pop_X + pop_U + noise_Y
noise_Z = np.random.normal(loc=0, scale=1, size=pop_shape)
pop_Z = pop_X + pop_U + noise_Z

In [29]:
# OLS of Y on X alone, using only the sampled rows; the regressor matrix passed
# in holds just the X column.
y_obs = pop_Y[sample_indices]
x_obs = pop_X[sample_indices]
fit_x = sms.OLS(y_obs, x_obs).fit()
print(fit_x.summary())

                                 OLS Regression Results                                
Dep. Variable:                      y   R-squared (uncentered):                   0.338
Model:                            OLS   Adj. R-squared (uncentered):              0.338
Method:                 Least Squares   F-statistic:                              534.2
Date:                Thu, 03 Oct 2024   Prob (F-statistic):                    8.53e-96
Time:                        12:12:16   Log-Likelihood:                         -1843.9
No. Observations:                1046   AIC:                                      3690.
Df Residuals:                    1045   BIC:                                      3695.
Df Model:                           1                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

In [30]:
# OLS of Y on X and Z together, using only the sampled rows; the regressor
# columns are ordered X first, then Z.
y_obs = pop_Y[sample_indices]
xz_obs = np.hstack((pop_X[sample_indices], pop_Z[sample_indices]))
fit_xz = sms.OLS(y_obs, xz_obs).fit()
print(fit_xz.summary())

                                 OLS Regression Results                                
Dep. Variable:                      y   R-squared (uncentered):                   0.503
Model:                            OLS   Adj. R-squared (uncentered):              0.502
Method:                 Least Squares   F-statistic:                              528.4
Date:                Thu, 03 Oct 2024   Prob (F-statistic):                   3.03e-159
Time:                        12:12:16   Log-Likelihood:                         -1694.2
No. Observations:                1046   AIC:                                      3392.
Df Residuals:                    1044   BIC:                                      3402.
Df Model:                           2                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

9. Bad control: introduces selection bias

## 10.

![](assets/dags10.png)

In [31]:
# X enters Y, and Y in turn enters Z; each noise term is a separate N(0, 1) draw.
pop_shape = (1_000_000, 1)
pop_X = np.random.normal(loc=0, scale=1, size=pop_shape)
noise_Y = np.random.normal(loc=0, scale=1, size=pop_shape)
pop_Y = pop_X + noise_Y
noise_Z = np.random.normal(loc=0, scale=1, size=pop_shape)
pop_Z = pop_Y + noise_Z

In [32]:
# OLS of Y on X alone, using only the sampled rows; the regressor matrix passed
# in holds just the X column.
y_obs = pop_Y[sample_indices]
x_obs = pop_X[sample_indices]
fit_x = sms.OLS(y_obs, x_obs).fit()
print(fit_x.summary())

                                 OLS Regression Results                                
Dep. Variable:                      y   R-squared (uncentered):                   0.508
Model:                            OLS   Adj. R-squared (uncentered):              0.508
Method:                 Least Squares   F-statistic:                              1079.
Date:                Thu, 03 Oct 2024   Prob (F-statistic):                   3.60e-163
Time:                        12:12:16   Log-Likelihood:                         -1509.5
No. Observations:                1046   AIC:                                      3021.
Df Residuals:                    1045   BIC:                                      3026.
Df Model:                           1                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

In [33]:
# OLS of Y on X and Z together, using only the sampled rows; the regressor
# columns are ordered X first, then Z.
y_obs = pop_Y[sample_indices]
xz_obs = np.hstack((pop_X[sample_indices], pop_Z[sample_indices]))
fit_xz = sms.OLS(y_obs, xz_obs).fit()
print(fit_xz.summary())

                                 OLS Regression Results                                
Dep. Variable:                      y   R-squared (uncentered):                   0.756
Model:                            OLS   Adj. R-squared (uncentered):              0.756
Method:                 Least Squares   F-statistic:                              1618.
Date:                Thu, 03 Oct 2024   Prob (F-statistic):                   1.58e-320
Time:                        12:12:16   Log-Likelihood:                         -1142.8
No. Observations:                1046   AIC:                                      2290.
Df Residuals:                    1044   BIC:                                      2299.
Df Model:                           2                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

10. Bad control: this is called case-control bias