In [81]:
import hdmpy
import numpy as np
import random
import statsmodels.api as sm
import matplotlib.pyplot as plt
import numpy as np
from matplotlib import colors

In [82]:
# Import relevant packages for splitting data
import numpy as np
import random
import math
import pandas as pd

# Import packages for OLS regression
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.iolib.summary2 import summary_col


# Model 1

In [84]:
# Set Seed
# to make the results replicable (generating random numbers)
np.random.seed(12345676)     # set MC seed

n = 1000                # sample size
Z = np.random.normal(0, 1, 1000).reshape((1000, 1))  # generate Z
X = 0.5*Z + np.random.normal(0, 1, 1000).reshape((1000, 1))
Y = 3*X + 2*Z + np.random.normal(0, 1, 1000).reshape((1000, 1))

# Create dataframe
D = np.hstack((Z, X, Y))
data = pd.DataFrame(D, columns = ["Z", "X", "Y"])

In [85]:
# Regressions
no_control = "Y ~ X"          
using_control = "Y ~ X + Z"      #classical

no_control = smf.ols(no_control , data=data).fit()
using_control = smf.ols(using_control , data=data).fit()

# Summary results
dfoutput = summary_col([no_control,using_control],stars=True)
print(dfoutput)


                  Y I       Y II  
----------------------------------
Intercept      0.1143*   0.0001   
               (0.0644)  (0.0327) 
R-squared      0.8227    0.9544   
R-squared Adj. 0.8225    0.9543   
X              3.8564*** 3.0192***
               (0.0567)  (0.0327) 
Z                        2.0062***
                         (0.0374) 
Standard errors in parentheses.
* p<.1, ** p<.05, ***p<.01


# Model 2

In [86]:
# Set Seed
# to make the results replicable (generating random numbers)
np.random.seed(12345676)     # set MC seed

n = 1000                # sample size
U = np.random.normal(0, 1, 1000).reshape((1000, 1))
Z = 2*U + np.random.normal(0, 1, 1000).reshape((1000, 1)) # generate Z
X = 0.5*Z + np.random.normal(0, 1, 1000).reshape((1000, 1))
Y = 3*X + 2*U + np.random.normal(0, 1, 1000).reshape((1000, 1))

# Create dataframe
D = np.hstack((U, Z, X, Y))
data = pd.DataFrame(D, columns = ["U", "Z", "X", "Y"])

In [87]:
# Regressions
no_control = "Y ~ X"          
using_control = "Y ~ X + Z"      #classical

no_control = smf.ols(no_control , data=data).fit()
using_control = smf.ols(using_control , data=data).fit()

# Summary results
dfoutput = summary_col([no_control,using_control],stars=True)
print(dfoutput)


                  Y I       Y II  
----------------------------------
Intercept      0.0284    -0.0190  
               (0.0571)  (0.0427) 
R-squared      0.9167    0.9535   
R-squared Adj. 0.9166    0.9534   
X              3.8628*** 2.9952***
               (0.0369)  (0.0414) 
Z                        0.7950***
                         (0.0283) 
Standard errors in parentheses.
* p<.1, ** p<.05, ***p<.01


# Model 7 - Bad Control (M-bias)

In [88]:
# Set Seed
# to make the results replicable (generating random numbers)
np.random.seed(12345676)     # set MC seed

n = 1000                # sample size
U_1 = np.random.normal(0, 1, 1000).reshape((1000, 1))
U_2 = np.random.normal(0, 1, 1000).reshape((1000, 1))

Z = 0.5*U_1 + 0.5*U_2 + np.random.normal(0, 1, 1000).reshape((1000, 1)) # generate Z
X = 2*U_1 + np.random.normal(0, 1, 1000).reshape((1000, 1))
Y = 3*X + 2*U_2 + np.random.normal(0, 1, 1000).reshape((1000, 1))

# Create dataframe
D = np.hstack((U_1,U_2, Z, X, Y))
data = pd.DataFrame(D, columns = ["U_1", "U_2", "Z", "X", "Y"])

In [89]:
# Regressions
no_control = "Y ~ X"          
using_control = "Y ~ X + Z"      #classical

no_control = smf.ols(no_control , data=data).fit()
using_control = smf.ols(using_control , data=data).fit()

# Summary results
dfoutput = summary_col([no_control,using_control],stars=True)
print(dfoutput)


                  Y I       Y II  
----------------------------------
Intercept      0.0238    0.0124   
               (0.0705)  (0.0645) 
R-squared      0.9033    0.9192   
R-squared Adj. 0.9032    0.9190   
X              3.0315*** 2.8690***
               (0.0314)  (0.0310) 
Z                        0.7654***
                         (0.0546) 
Standard errors in parentheses.
* p<.1, ** p<.05, ***p<.01


# Model 8 - ( Neutral Control (possibly good for precision)

In [90]:
# Set Seed
# to make the results replicable (generating random numbers)
np.random.seed(12345676)     # set MC seed

n = 1000                # sample size

Z = np.random.normal(0, 1, 1000).reshape((1000, 1))
X = np.random.normal(0, 1, 1000).reshape((1000, 1))
Y = 3*X + 2*Z + np.random.normal(0, 1, 1000).reshape((1000, 1))

# Create dataframe
D = np.hstack((Z, X, Y))
data = pd.DataFrame(D, columns = ["Z", "X", "Y"])

In [91]:
# Regressions
no_control = "Y ~ X"          
using_control = "Y ~ X + Z"      #classical

no_control = smf.ols(no_control , data=data).fit()
using_control = smf.ols(using_control , data=data).fit()

# Summary results
dfoutput = summary_col([no_control,using_control],stars=True)
print(dfoutput)


                  Y I       Y II  
----------------------------------
Intercept      0.1404**  0.0001   
               (0.0713)  (0.0327) 
R-squared      0.6556    0.9278   
R-squared Adj. 0.6553    0.9277   
X              3.1088*** 3.0192***
               (0.0713)  (0.0327) 
Z                        2.0158***
                         (0.0329) 
Standard errors in parentheses.
* p<.1, ** p<.05, ***p<.01


# Model 10 - Bad Control (bias amplification)

In [93]:
# Set Seed
# to make the results replicable (generating random numbers)
np.random.seed(12345676)     # set MC seed

n = 1000                # sample size
U = np.random.normal(0, 1, 1000).reshape((1000, 1))
Z = np.random.normal(0, 1, 1000).reshape((1000, 1)) # generate Z
X = 0.5*Z + 2*U + np.random.normal(0, 1, 1000).reshape((1000, 1))
Y = 3*X + 3*U + np.random.normal(0, 1, 1000).reshape((1000, 1))

# Create dataframe
D = np.hstack((U, Z, X, Y))
data = pd.DataFrame(D, columns = ["U", "Z", "X", "Y"])

# Regressions
no_control = "Y ~ X"          
using_control = "Y ~ X + Z"      #classical

no_control = smf.ols(no_control , data=data).fit()
using_control = smf.ols(using_control , data=data).fit()

# Summary results
dfoutput = summary_col([no_control,using_control],stars=True)
print(dfoutput)


                  Y I       Y II   
-----------------------------------
Intercept      0.0028    -0.0095   
               (0.0569)  (0.0538)  
R-squared      0.9663    0.9699    
R-squared Adj. 0.9662    0.9699    
X              4.1183*** 4.1866*** 
               (0.0244)  (0.0238)  
Z                        -0.6111***
                         (0.0556)  
Standard errors in parentheses.
* p<.1, ** p<.05, ***p<.01


# Models 17 - Bad Controls (selection bias)

In [80]:
# Set Seed
# to make the results replicable (generating random numbers)
np.random.seed(12345676)     # set MC seed

n = 1000                # sample size
X = 0.5*Z + np.random.normal(0, 1, 1000).reshape((1000, 1))
Y = 3*X + np.random.normal(0, 1, 1000).reshape((1000, 1))
Z = 2*Y + np.random.normal(0, 1, 1000).reshape((1000, 1))  # generate Z

# Create dataframe
D = np.hstack((Z, X, Y))
data = pd.DataFrame(D, columns = ["Z", "X", "Y"])

# Regressions
no_control = "Y ~ X"          
using_control = "Y ~ X + Z"      #classical

no_control = smf.ols(no_control , data=data).fit()
using_control = smf.ols(using_control , data=data).fit()

# Summary results
dfoutput = summary_col([no_control,using_control],stars=True)
print(dfoutput)


                  Y I       Y II  
----------------------------------
Intercept      -0.0332   -0.0085  
               (0.0278)  (0.0140) 
R-squared      0.9514    0.9876   
R-squared Adj. 0.9513    0.9876   
X              3.4241*** 0.8793***
               (0.0245)  (0.0487) 
Z                        0.3705***
                         (0.0069) 
Standard errors in parentheses.
* p<.1, ** p<.05, ***p<.01
