# Instrumental Variables

In [15]:
import numpy as np
import math 
import matplotlib.pyplot as plt
import ipywidgets as widgets
from ipywidgets import interact, interactive, fixed, interact_manual
import scipy.stats as stats
from sklearn import datasets, linear_model

def _fit_and_plot(x, y, grid, xlabel, ylabel, title):
    """Fit y ~ x by ordinary least squares and show the scatter with the fitted line.

    Args:
        x: 1-D array of regressor values.
        y: 1-D array of response values, same length as ``x``.
        grid: 1-D array of x positions at which the fitted line is drawn.
        xlabel: Label for the x axis.
        ylabel: Label for the y axis.
        title: Plot title.

    Returns:
        Tuple ``(slope, fitted)``: the estimated slope and the in-sample
        predictions for ``x``.
    """
    model = linear_model.LinearRegression()
    features = x.reshape(-1, 1)  # scikit-learn expects a 2-D feature matrix
    model.fit(features, y)
    slope = model.coef_[0]
    fitted = model.predict(features)

    plt.scatter(x, y)
    plt.plot(grid, model.intercept_ + slope * grid, 'C1',
             label='slope: ' + str(round(slope, 3)))
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.title(title)
    plt.legend()
    plt.show()
    return slope, fitted


def IV(covXZ=0.7):
    """Demonstrate instrumental-variable estimation on simulated data.

    Draws N=500 samples of (X, Z, eps) from a zero-mean, unit-variance
    multivariate normal in which X is correlated with the noise eps (0.7),
    Z is correlated with X (``covXZ``) and Z is uncorrelated with eps.
    Y is generated as ``alpha + beta * X + eps`` with alpha=0.5, beta=2.0.

    The function then prints explanations and shows four plots:
      1. stage 1: X regressed on Z (gives Xhat),
      2. stage 2: Y regressed on Xhat (two-stage least squares),
      3. stage 3: Y regressed on Z, whose slope divided by the stage-1
         slope is the ratio form of the IV estimate,
      4. the raw (X, Y) cloud, the (Xhat, Yhat) points and the true line.

    Args:
        covXZ: Covariance (equal to the correlation, since variances are 1)
            between X and the instrument Z. The covariance matrix has
            determinant ``0.51 - covXZ**2``, so it is only positive
            semi-definite for ``covXZ**2 <= 0.51`` (about |covXZ| <= 0.714);
            outside that range the sampled data do not follow the stated
            covariance structure.

    Returns:
        None. Output is produced through ``print`` and matplotlib figures.
    """
    N = 500
    alpha = 0.5
    beta = 2.0
    mean = [0, 0, 0]
    cov = [[1, covXZ, 0.7], [covXZ, 1, 0], [0.7, 0, 1]]  # X, Z, eps
    # Call np.random.seed(<int>) before sampling for reproducible runs.
    X, Z, eps = np.random.multivariate_normal(mean, cov, N).T

    # Structural equation; X is endogenous because it is correlated with eps.
    Y = alpha + beta * X + eps

    grid = np.linspace(-3, 3, 300)

    print ("One method of estimating the true beta with the Instrumental Variable Z is to first fit X~Z to get Xhat, " + \
           "and then fit Y~Xhat to get Yhat. The resulting slope would correctly estimate the true beta=2")

    ZXbeta, Xhat = _fit_and_plot(Z, X, grid, "Z", "X", "stage 1: fit X against Z")

    _, Yhat = _fit_and_plot(Xhat, Y, grid, "Xhat", "Y",
                            "stage 2: fit Y against Xhat from stage 1")
    print ("Observe that the resulting fitted line from stage 2 closely estimates the true beta=2.\n")
    print ("Another method is to fit X~Z to get Xhat, fit Y~Z to get Yhat, and to calculate the ratio " + \
           "of betahat from Y~Z / betahat from X~Z.")

    # The line drawn here is the Y~Z fit itself, not the stage-2 line.
    ZYbeta, _ = _fit_and_plot(Z, Y, grid, "Z", "Y", "stage 3: fit Y against Z")
    print ("Note that the slope from stage 3/stage 1: " + str(ZYbeta) + " / " + str(ZXbeta) + " = " + str(ZYbeta / ZXbeta) +\
          " closely estimates the true beta=2 as well.\n")

    print ("The final plot below shows that taking the Instrumental Variable Z into account correctly estimates " + \
           "the true beta, while simply fitting Y~X does not.")

    plt.plot(X, Y, 'bo', label="without Z")
    plt.plot(Xhat, Yhat, 'ro', label="with Z")
    plt.plot(grid, alpha + beta * grid, 'C1', label='line with true beta')
    plt.title("Fitting Y~X with/without the Instrumental Variable Z")
    plt.legend()
    plt.show()
    
# The covariance matrix built in IV has determinant 0.51 - covXZ**2, so it is
# only a valid (positive semi-definite) covariance for |covXZ| <= ~0.714.
# Keep the slider inside that range so every selectable value yields a
# well-defined simulation.
interact_manual(IV, covXZ=(-0.7, 0.7))

<function __main__.IV>