# PD7 - Directed Acyclic Graphs

In [1]:
import numpy as np
import statsmodels.api as sms

In [2]:
# Seed NumPy's legacy global RNG. Every np.random.* call in this notebook
# draws from that global state, so seeding it once here makes the sample and
# all simulated populations reproducible on Restart & Run All.
# NOTE: outputs stored before the seed was added must be refreshed by re-running.
SEED = 42
np.random.seed(SEED)

# Independent inclusion mask over the 1,000,000 population rows: each row is
# kept with probability 0.001, so the realised sample size is itself random
# (about 1,000 rows in expectation).
sample = np.random.uniform(0, 1, (1_000_000, 1)) < 0.001
sample.sum()

np.int64(963)

In [3]:
# Row positions of the sampled units: first element of the per-axis index
# tuple for the True entries of the mask.
sample_indices = np.nonzero(sample)[0]

## 1.

![](assets/dags1.png)

In [4]:
# Structural equations (unit coefficients, independent N(0, 1) noise):
#   Z -> X,  Z -> Y,  X -> Y
pop_Z = np.random.normal(loc=0, scale=1, size=(1_000_000, 1))
pop_X = pop_Z + np.random.normal(loc=0, scale=1, size=(1_000_000, 1))
pop_Y = pop_Z + pop_X + np.random.normal(loc=0, scale=1, size=(1_000_000, 1))

In [5]:
# Regress Y on X alone (no intercept term) using only the sampled rows.
print(
    sms.OLS(
        endog=pop_Y[sample_indices],
        exog=pop_X[sample_indices],
    )
    .fit()
    .summary()
)

                                 OLS Regression Results                                
Dep. Variable:                      y   R-squared (uncentered):                   0.761
Model:                            OLS   Adj. R-squared (uncentered):              0.760
Method:                 Least Squares   F-statistic:                              3057.
Date:                Wed, 06 Aug 2025   Prob (F-statistic):                   6.14e-301
Time:                        12:35:03   Log-Likelihood:                         -1564.2
No. Observations:                 963   AIC:                                      3130.
Df Residuals:                     962   BIC:                                      3135.
Df Model:                           1                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

In [6]:
# Regress Y on X and Z jointly (no intercept term) using only the sampled rows.
# Columns of the design matrix are [X, Z].
print(
    sms.OLS(
        endog=pop_Y[sample_indices],
        exog=np.hstack((pop_X[sample_indices], pop_Z[sample_indices])),
    )
    .fit()
    .summary()
)

                                 OLS Regression Results                                
Dep. Variable:                      y   R-squared (uncentered):                   0.848
Model:                            OLS   Adj. R-squared (uncentered):              0.848
Method:                 Least Squares   F-statistic:                              2683.
Date:                Wed, 06 Aug 2025   Prob (F-statistic):                        0.00
Time:                        12:35:07   Log-Likelihood:                         -1345.2
No. Observations:                 963   AIC:                                      2694.
Df Residuals:                     961   BIC:                                      2704.
Df Model:                           2                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

1. Good controls: controlling for a confounder

In [7]:
# Structural equations (unit coefficients, independent N(0, 1) noise):
#   U -> Z,  Z -> X,  U -> Y,  X -> Y
# U is generated here but never passed to the regressions below.
pop_U = np.random.normal(loc=0, scale=1, size=(1_000_000, 1))
pop_Z = pop_U + np.random.normal(loc=0, scale=1, size=(1_000_000, 1))
pop_X = pop_Z + np.random.normal(loc=0, scale=1, size=(1_000_000, 1))
pop_Y = pop_U + pop_X + np.random.normal(loc=0, scale=1, size=(1_000_000, 1))

In [8]:
# Regress Y on X alone (no intercept term) using only the sampled rows.
print(
    sms.OLS(
        endog=pop_Y[sample_indices],
        exog=pop_X[sample_indices],
    )
    .fit()
    .summary()
)

                                 OLS Regression Results                                
Dep. Variable:                      y   R-squared (uncentered):                   0.757
Model:                            OLS   Adj. R-squared (uncentered):              0.756
Method:                 Least Squares   F-statistic:                              2989.
Date:                Wed, 06 Aug 2025   Prob (F-statistic):                   2.23e-297
Time:                        12:35:13   Log-Likelihood:                         -1592.3
No. Observations:                 963   AIC:                                      3187.
Df Residuals:                     962   BIC:                                      3191.
Df Model:                           1                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

In [9]:
# Regress Y on X and Z jointly (no intercept term) using only the sampled rows.
# Columns of the design matrix are [X, Z].
print(
    sms.OLS(
        endog=pop_Y[sample_indices],
        exog=np.hstack((pop_X[sample_indices], pop_Z[sample_indices])),
    )
    .fit()
    .summary()
)

                                 OLS Regression Results                                
Dep. Variable:                      y   R-squared (uncentered):                   0.777
Model:                            OLS   Adj. R-squared (uncentered):              0.777
Method:                 Least Squares   F-statistic:                              1675.
Date:                Wed, 06 Aug 2025   Prob (F-statistic):                   6.74e-314
Time:                        12:35:17   Log-Likelihood:                         -1549.9
No. Observations:                 963   AIC:                                      3104.
Df Residuals:                     961   BIC:                                      3114.
Df Model:                           2                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

## 2. 

![](assets/dags2.png)

In [10]:
# Structural equations (unit coefficients, independent N(0, 1) noise):
#   U -> Z,  U -> X,  Z -> M,  X -> Y,  M -> Y
# U and M are generated here but never passed to the regressions below.
pop_U = np.random.normal(loc=0, scale=1, size=(1_000_000, 1))
pop_Z = pop_U + np.random.normal(loc=0, scale=1, size=(1_000_000, 1))
pop_X = pop_U + np.random.normal(loc=0, scale=1, size=(1_000_000, 1))
pop_M = pop_Z + np.random.normal(loc=0, scale=1, size=(1_000_000, 1))
pop_Y = pop_X + pop_M + np.random.normal(loc=0, scale=1, size=(1_000_000, 1))

In [11]:
# Regress Y on X alone (no intercept term) using only the sampled rows.
print(
    sms.OLS(
        endog=pop_Y[sample_indices],
        exog=pop_X[sample_indices],
    )
    .fit()
    .summary()
)

                                 OLS Regression Results                                
Dep. Variable:                      y   R-squared (uncentered):                   0.578
Model:                            OLS   Adj. R-squared (uncentered):              0.577
Method:                 Least Squares   F-statistic:                              1316.
Date:                Wed, 06 Aug 2025   Prob (F-statistic):                   2.62e-182
Time:                        12:35:21   Log-Likelihood:                         -1941.0
No. Observations:                 963   AIC:                                      3884.
Df Residuals:                     962   BIC:                                      3889.
Df Model:                           1                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

In [12]:
# Regress Y on X and Z jointly (no intercept term) using only the sampled rows.
# Columns of the design matrix are [X, Z].
print(
    sms.OLS(
        endog=pop_Y[sample_indices],
        exog=np.hstack((pop_X[sample_indices], pop_Z[sample_indices])),
    )
    .fit()
    .summary()
)

                                 OLS Regression Results                                
Dep. Variable:                      y   R-squared (uncentered):                   0.760
Model:                            OLS   Adj. R-squared (uncentered):              0.759
Method:                 Least Squares   F-statistic:                              1519.
Date:                Wed, 06 Aug 2025   Prob (F-statistic):                   3.06e-298
Time:                        12:35:23   Log-Likelihood:                         -1669.6
No. Observations:                 963   AIC:                                      3343.
Df Residuals:                     961   BIC:                                      3353.
Df Model:                           2                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

2. Good controls: controlling for a confounder

## 3.

![](assets/dags3.png)

In [19]:
# Structural equations (unit coefficients, independent N(0, 1) noise):
#   U1 -> Z <- U2,  U1 -> X,  U2 -> Y,  X -> Y
# Z is a common effect of U1 and U2; it has no arrow into X or Y.
pop_U1 = np.random.normal(loc=0, scale=1, size=(1_000_000, 1))
pop_U2 = np.random.normal(loc=0, scale=1, size=(1_000_000, 1))
pop_Z = pop_U1 + pop_U2 + np.random.normal(loc=0, scale=1, size=(1_000_000, 1))
pop_X = pop_U1 + np.random.normal(loc=0, scale=1, size=(1_000_000, 1))
pop_Y = pop_X + pop_U2 + np.random.normal(loc=0, scale=1, size=(1_000_000, 1))

In [20]:
# Regress Y on X alone (no intercept term) using only the sampled rows.
print(
    sms.OLS(
        endog=pop_Y[sample_indices],
        exog=pop_X[sample_indices],
    )
    .fit()
    .summary()
)

                                 OLS Regression Results                                
Dep. Variable:                      y   R-squared (uncentered):                   0.493
Model:                            OLS   Adj. R-squared (uncentered):              0.492
Method:                 Least Squares   F-statistic:                              934.7
Date:                Wed, 06 Aug 2025   Prob (F-statistic):                   5.69e-144
Time:                        12:35:41   Log-Likelihood:                         -1710.1
No. Observations:                 963   AIC:                                      3422.
Df Residuals:                     962   BIC:                                      3427.
Df Model:                           1                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

In [21]:
# Regress Y on X and Z jointly (no intercept term) using only the sampled rows.
# Columns of the design matrix are [X, Z].
print(
    sms.OLS(
        endog=pop_Y[sample_indices],
        exog=np.hstack((pop_X[sample_indices], pop_Z[sample_indices])),
    )
    .fit()
    .summary()
)

                                 OLS Regression Results                                
Dep. Variable:                      y   R-squared (uncentered):                   0.602
Model:                            OLS   Adj. R-squared (uncentered):              0.601
Method:                 Least Squares   F-statistic:                              727.7
Date:                Wed, 06 Aug 2025   Prob (F-statistic):                   3.92e-193
Time:                        12:35:42   Log-Likelihood:                         -1593.0
No. Observations:                 963   AIC:                                      3190.
Df Residuals:                     961   BIC:                                      3200.
Df Model:                           2                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

3. Bad control: controlling for the collider Z opens the non-causal path X ← U1 → Z ← U2 → Y (M-bias)

## 4. Damned if you do, damned if you don't

![](assets/dags4.png)

In this situation, it is not clear whether Z will be a good or a bad control. Other tools, such as sensitivity analysis, may be needed to assess the effect of using it as a control.

## 5.

![](assets/dags5.png)

In [22]:
# Structural equations (unit coefficients, independent N(0, 1) noise):
#   X -> Y,  Z -> Y   (X and Z are drawn independently of each other)
pop_Z = np.random.normal(loc=0, scale=1, size=(1_000_000, 1))
pop_X = np.random.normal(loc=0, scale=1, size=(1_000_000, 1))
pop_Y = pop_X + pop_Z + np.random.normal(loc=0, scale=1, size=(1_000_000, 1))

In [23]:
# Regress Y on X alone (no intercept term) using only the sampled rows.
print(
    sms.OLS(
        endog=pop_Y[sample_indices],
        exog=pop_X[sample_indices],
    )
    .fit()
    .summary()
)

                                 OLS Regression Results                                
Dep. Variable:                      y   R-squared (uncentered):                   0.317
Model:                            OLS   Adj. R-squared (uncentered):              0.316
Method:                 Least Squares   F-statistic:                              446.8
Date:                Wed, 06 Aug 2025   Prob (F-statistic):                    9.18e-82
Time:                        12:35:54   Log-Likelihood:                         -1722.1
No. Observations:                 963   AIC:                                      3446.
Df Residuals:                     962   BIC:                                      3451.
Df Model:                           1                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

In [24]:
# Regress Y on X and Z jointly (no intercept term) using only the sampled rows.
# Columns of the design matrix are [X, Z].
print(
    sms.OLS(
        endog=pop_Y[sample_indices],
        exog=np.hstack((pop_X[sample_indices], pop_Z[sample_indices])),
    )
    .fit()
    .summary()
)

                                 OLS Regression Results                                
Dep. Variable:                      y   R-squared (uncentered):                   0.650
Model:                            OLS   Adj. R-squared (uncentered):              0.649
Method:                 Least Squares   F-statistic:                              892.0
Date:                Wed, 06 Aug 2025   Prob (F-statistic):                   9.50e-220
Time:                        12:35:56   Log-Likelihood:                         -1400.4
No. Observations:                 963   AIC:                                      2805.
Df Residuals:                     961   BIC:                                      2815.
Df Model:                           2                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

5. Neutral control (but raises precision)

## 6.

![](assets/dags6.png)

In [25]:
# Structural equations (unit coefficients, independent N(0, 1) noise):
#   Z -> X -> Y   (Z has no arrow into Y other than through X)
pop_Z = np.random.normal(loc=0, scale=1, size=(1_000_000, 1))
pop_X = pop_Z + np.random.normal(loc=0, scale=1, size=(1_000_000, 1))
pop_Y = pop_X + np.random.normal(loc=0, scale=1, size=(1_000_000, 1))

In [26]:
# Regress Y on X alone (no intercept term) using only the sampled rows.
print(
    sms.OLS(
        endog=pop_Y[sample_indices],
        exog=pop_X[sample_indices],
    )
    .fit()
    .summary()
)

                                 OLS Regression Results                                
Dep. Variable:                      y   R-squared (uncentered):                   0.651
Model:                            OLS   Adj. R-squared (uncentered):              0.651
Method:                 Least Squares   F-statistic:                              1794.
Date:                Wed, 06 Aug 2025   Prob (F-statistic):                   4.50e-222
Time:                        12:36:02   Log-Likelihood:                         -1376.4
No. Observations:                 963   AIC:                                      2755.
Df Residuals:                     962   BIC:                                      2760.
Df Model:                           1                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

In [27]:
# Regress Y on X and Z jointly (no intercept term) using only the sampled rows.
# Columns of the design matrix are [X, Z].
print(
    sms.OLS(
        endog=pop_Y[sample_indices],
        exog=np.hstack((pop_X[sample_indices], pop_Z[sample_indices])),
    )
    .fit()
    .summary()
)

                                 OLS Regression Results                                
Dep. Variable:                      y   R-squared (uncentered):                   0.652
Model:                            OLS   Adj. R-squared (uncentered):              0.651
Method:                 Least Squares   F-statistic:                              898.8
Date:                Wed, 06 Aug 2025   Prob (F-statistic):                   8.88e-221
Time:                        12:36:05   Log-Likelihood:                         -1375.4
No. Observations:                 963   AIC:                                      2755.
Df Residuals:                     961   BIC:                                      2765.
Df Model:                           2                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

6. Neutral control (but lowers precision)

## 7.

![](assets/dags7.png)

In [28]:
# Structural equations (unit coefficients, independent N(0, 1) noise):
#   X -> M -> Y,  M -> Z
# X affects Y only through M; Z is a child of M.
pop_X = np.random.normal(loc=0, scale=1, size=(1_000_000, 1))
pop_M = pop_X + np.random.normal(loc=0, scale=1, size=(1_000_000, 1))
pop_Z = pop_M + np.random.normal(loc=0, scale=1, size=(1_000_000, 1))
pop_Y = pop_M + np.random.normal(loc=0, scale=1, size=(1_000_000, 1))

In [29]:
# Regress Y on X alone (no intercept term) using only the sampled rows.
print(
    sms.OLS(
        endog=pop_Y[sample_indices],
        exog=pop_X[sample_indices],
    )
    .fit()
    .summary()
)

                                 OLS Regression Results                                
Dep. Variable:                      y   R-squared (uncentered):                   0.346
Model:                            OLS   Adj. R-squared (uncentered):              0.346
Method:                 Least Squares   F-statistic:                              509.6
Date:                Wed, 06 Aug 2025   Prob (F-statistic):                    6.81e-91
Time:                        12:36:14   Log-Likelihood:                         -1702.9
No. Observations:                 963   AIC:                                      3408.
Df Residuals:                     962   BIC:                                      3413.
Df Model:                           1                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

In [30]:
# Regress Y on X and Z jointly (no intercept term) using only the sampled rows.
# Columns of the design matrix are [X, Z].
print(
    sms.OLS(
        endog=pop_Y[sample_indices],
        exog=np.hstack((pop_X[sample_indices], pop_Z[sample_indices])),
    )
    .fit()
    .summary()
)

                                 OLS Regression Results                                
Dep. Variable:                      y   R-squared (uncentered):                   0.524
Model:                            OLS   Adj. R-squared (uncentered):              0.523
Method:                 Least Squares   F-statistic:                              529.4
Date:                Wed, 06 Aug 2025   Prob (F-statistic):                   1.01e-155
Time:                        12:36:14   Log-Likelihood:                         -1550.0
No. Observations:                 963   AIC:                                      3104.
Df Residuals:                     961   BIC:                                      3114.
Df Model:                           2                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

7. Bad control: introduces overcontrol bias

## 8.

![](assets/dags8.png)

In [31]:
# Structural equations (unit coefficients, independent N(0, 1) noise):
#   X -> M <- Z,  M -> Y   (X and Z are drawn independently of each other)
pop_X = np.random.normal(loc=0, scale=1, size=(1_000_000, 1))
pop_Z = np.random.normal(loc=0, scale=1, size=(1_000_000, 1))
pop_M = pop_X + pop_Z + np.random.normal(loc=0, scale=1, size=(1_000_000, 1))
pop_Y = pop_M + np.random.normal(loc=0, scale=1, size=(1_000_000, 1))

In [32]:
# Regress Y on X alone (no intercept term) using only the sampled rows.
print(
    sms.OLS(
        endog=pop_Y[sample_indices],
        exog=pop_X[sample_indices],
    )
    .fit()
    .summary()
)

                                 OLS Regression Results                                
Dep. Variable:                      y   R-squared (uncentered):                   0.299
Model:                            OLS   Adj. R-squared (uncentered):              0.298
Method:                 Least Squares   F-statistic:                              410.0
Date:                Wed, 06 Aug 2025   Prob (F-statistic):                    3.20e-76
Time:                        12:36:18   Log-Likelihood:                         -1868.6
No. Observations:                 963   AIC:                                      3739.
Df Residuals:                     962   BIC:                                      3744.
Df Model:                           1                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

In [33]:
# Regress Y on X and Z jointly (no intercept term) using only the sampled rows.
# Columns of the design matrix are [X, Z].
print(
    sms.OLS(
        endog=pop_Y[sample_indices],
        exog=np.hstack((pop_X[sample_indices], pop_Z[sample_indices])),
    )
    .fit()
    .summary()
)

                                 OLS Regression Results                                
Dep. Variable:                      y   R-squared (uncentered):                   0.520
Model:                            OLS   Adj. R-squared (uncentered):              0.519
Method:                 Least Squares   F-statistic:                              520.2
Date:                Wed, 06 Aug 2025   Prob (F-statistic):                   8.05e-154
Time:                        12:36:19   Log-Likelihood:                         -1686.3
No. Observations:                 963   AIC:                                      3377.
Df Residuals:                     961   BIC:                                      3386.
Df Model:                           2                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

8. Neutral control (but raises precision)

## 9.

![](assets/dags9.png)

In [34]:
# Structural equations (unit coefficients, independent N(0, 1) noise):
#   X -> Y <- U,  X -> Z <- U
# Z is a common effect of X and U; U is never passed to the regressions below.
pop_X = np.random.normal(loc=0, scale=1, size=(1_000_000, 1))
pop_U = np.random.normal(loc=0, scale=1, size=(1_000_000, 1))
pop_Y = pop_X + pop_U + np.random.normal(loc=0, scale=1, size=(1_000_000, 1))
pop_Z = pop_X + pop_U + np.random.normal(loc=0, scale=1, size=(1_000_000, 1))

In [35]:
# Regress Y on X alone (no intercept term) using only the sampled rows.
print(
    sms.OLS(
        endog=pop_Y[sample_indices],
        exog=pop_X[sample_indices],
    )
    .fit()
    .summary()
)

                                 OLS Regression Results                                
Dep. Variable:                      y   R-squared (uncentered):                   0.364
Model:                            OLS   Adj. R-squared (uncentered):              0.363
Method:                 Least Squares   F-statistic:                              549.5
Date:                Wed, 06 Aug 2025   Prob (F-statistic):                    1.72e-96
Time:                        12:36:22   Log-Likelihood:                         -1634.0
No. Observations:                 963   AIC:                                      3270.
Df Residuals:                     962   BIC:                                      3275.
Df Model:                           1                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

In [36]:
# Regress Y on X and Z jointly (no intercept term) using only the sampled rows.
# Columns of the design matrix are [X, Z].
print(
    sms.OLS(
        endog=pop_Y[sample_indices],
        exog=np.hstack((pop_X[sample_indices], pop_Z[sample_indices])),
    )
    .fit()
    .summary()
)

                                 OLS Regression Results                                
Dep. Variable:                      y   R-squared (uncentered):                   0.514
Model:                            OLS   Adj. R-squared (uncentered):              0.513
Method:                 Least Squares   F-statistic:                              508.3
Date:                Wed, 06 Aug 2025   Prob (F-statistic):                   2.55e-151
Time:                        12:36:22   Log-Likelihood:                         -1504.1
No. Observations:                 963   AIC:                                      3012.
Df Residuals:                     961   BIC:                                      3022.
Df Model:                           2                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

9. Bad control: introduces selection bias

## 10.

![](assets/dags10.png)

In [37]:
# Structural equations (unit coefficients, independent N(0, 1) noise):
#   X -> Y -> Z   (Z is a child of the outcome Y)
pop_X = np.random.normal(loc=0, scale=1, size=(1_000_000, 1))
pop_Y = pop_X + np.random.normal(loc=0, scale=1, size=(1_000_000, 1))
pop_Z = pop_Y + np.random.normal(loc=0, scale=1, size=(1_000_000, 1))

In [38]:
# Regress Y on X alone (no intercept term) using only the sampled rows.
print(
    sms.OLS(
        endog=pop_Y[sample_indices],
        exog=pop_X[sample_indices],
    )
    .fit()
    .summary()
)

                                 OLS Regression Results                                
Dep. Variable:                      y   R-squared (uncentered):                   0.516
Model:                            OLS   Adj. R-squared (uncentered):              0.515
Method:                 Least Squares   F-statistic:                              1024.
Date:                Wed, 06 Aug 2025   Prob (F-statistic):                   1.33e-153
Time:                        12:36:25   Log-Likelihood:                         -1355.2
No. Observations:                 963   AIC:                                      2712.
Df Residuals:                     962   BIC:                                      2717.
Df Model:                           1                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

In [39]:
# Regress Y on X and Z jointly (no intercept term) using only the sampled rows.
# Columns of the design matrix are [X, Z].
print(
    sms.OLS(
        endog=pop_Y[sample_indices],
        exog=np.hstack((pop_X[sample_indices], pop_Z[sample_indices])),
    )
    .fit()
    .summary()
)

                                 OLS Regression Results                                
Dep. Variable:                      y   R-squared (uncentered):                   0.755
Model:                            OLS   Adj. R-squared (uncentered):              0.754
Method:                 Least Squares   F-statistic:                              1481.
Date:                Wed, 06 Aug 2025   Prob (F-statistic):                   3.24e-294
Time:                        12:36:25   Log-Likelihood:                         -1027.0
No. Observations:                 963   AIC:                                      2058.
Df Residuals:                     961   BIC:                                      2068.
Df Model:                           2                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

10. Bad control: this is called case-control bias