In [5]:
import numpy as np
from numpy.random import randn
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LinearRegression

# We need a lot of samples to plot the conditional distribution:

def simulate(s, alpha, beta, gamma, delta, n_samples):   
    np.random.default_rng(seed=s)
    I = randn(n_samples)
    W = randn(n_samples)
    X = alpha * I + gamma * W + randn(n_samples)
    Y = beta * X + delta * W + randn(n_samples)
    return pd.DataFrame({ "I": I, "W": W, "X": X,"Y":Y})

def linear_coeff(X,Y):
    linear_regressor = LinearRegression() 
    linear_regressor.fit(X, Y)
    return linear_regressor.coef_


For given values of alpha, beta, gamma and delta, we can see that the standard regression is always wrong in the prediction of beta, while the Instrumental variable method and the Two Stage Least Squares are close to the true value of beta. Try changing the alpha, beta, gamma and delta parameters and see how the linear regression coefficient varies. What happens if gamma = 0?

In [34]:
alpha=5
beta=2
gamma=3
delta=4


df = simulate(1, alpha, beta, gamma, delta, 1000)
Y = df["Y"].values.reshape(-1, 1)
X = df["X"].values.reshape(-1, 1)
I = df["I"].values.reshape(-1, 1)

# Linear coefficient vs estimated beta^{OLS} vs real beta
print ("Estimated beta from linear regression: ", linear_coeff(X,Y)[:,0][0])
beta_ols = beta + delta*gamma/(alpha**2+gamma**2+1)
print("Theoretical beta from linear regression: ", beta_ols)
beta_IV = np.cov(I.T, Y.T)[1,0]/np.cov(I.T, X.T)[1,0]
print("Beta from Instrumental variable method: ", beta_IV)
print("True beta: ", beta)

Estimated beta from linear regression:  2.3552317006216557
Theoretical beta from linear regression:  2.342857142857143
Beta from Instrumental variable method:  1.9903336922932124
True beta:  2


In [35]:
# Simplified 2SLS

# Step 1: regress X on I 
alpha_hat = linear_coeff(I, X)
print("Alpha_hat", alpha_hat)
print("True alpha", alpha)
# Step 2: create X_hat
X_hat = alpha_hat * I
# Step 3: regress Y on X_hat
beta_2SLS = linear_coeff(X_hat, Y)
print("Beta from Two Stage Least Squares: ", beta_2SLS)

Alpha_hat [[4.95393781]]
True alpha 5
Beta from Two Stage Least Squares:  [[1.99033369]]


We can now see that even if the estimated beta with Instrumental Variables and Two Stage Least Squares is unbiased, for small values of alpha, it's variance is usually larger than the variance of the OLS estimator. This is true even in settings with a lot of data (e.g. 10k). Try changing alpha to a larger number (e.g. 100) and see how the variance changes.

In [27]:
beta_ols_list = []
beta_IV_list = []
beta_2SLS_list = []

alpha=5
beta=2
gamma=3
delta=4


def run_comparison(nsamples):
    for i in range(1,100):
        df = simulate(1, alpha, beta, gamma, delta, nsamples)
        Y = df["Y"].values.reshape(-1, 1)
        X = df["X"].values.reshape(-1, 1)
        I = df["I"].values.reshape(-1, 1)
        beta_ols_list.append(linear_coeff(X,Y)[:,0][0])
        beta_IV_list.append(np.cov(I.T, Y.T)[1,0]/np.cov(I.T, X.T)[1,0])
        alpha_hat = linear_coeff(I, X)
        beta_2SLS_list.append(linear_coeff(alpha_hat * I, Y))
    print("OLS ", np.mean(beta_ols_list), np.var(beta_ols_list))
    print("IV ",np.mean(beta_IV_list), np.var(beta_IV_list))
    print("2SLS ", np.mean(beta_2SLS_list), np.var(beta_2SLS_list))
    print("True beta: ", beta)

In [28]:
run_comparison(10)

OLS  2.2968682254725588 0.07056795652917784
IV  1.8458217635482395 0.19444626705554077
2SLS  1.8458217635482395 0.1944462670555408
True beta:  2


In [30]:
run_comparison(100)

OLS  2.32626060303418 0.03319268679382803
IV  1.9290764830731102 0.09450922225457216
2SLS  1.9290764830731095 0.09450922225457212
True beta:  2


In [31]:
run_comparison(1000)

OLS  2.331579998425101 0.02318344440404382
IV  1.9509344131076605 0.06673372264687838
2SLS  1.9509344131076605 0.06673372264687837
True beta:  2


In [32]:
run_comparison(10000)

OLS  2.3341847821291246 0.01776403055127781
IV  1.9624414410920126 0.051492632347677605
2SLS  1.9624414410920124 0.0514926323476776
True beta:  2


In [33]:
run_comparison(100000)

OLS  2.3358716505262422 0.01439484557261064
IV  1.9696545316935061 0.04191174809949637
2SLS  1.9696545316935057 0.04191174809949634
True beta:  2
