In [1]:
import numpy as np
from scipy.optimize import minimize
import statsmodels.api as sm
from scipy.stats import norm

In [2]:
def gen_data(nobs=1000, a=2, b=1, c=2, d=1):
    #true value
    y = np.random.normal(loc=0, scale=1, size=(nobs,1))
    
    #measurement error terms
    e1 = np.random.normal(loc=0, scale=1, size=(nobs,1))
    e2 = np.random.normal(loc=0, scale=1, size=(nobs,1))
    
    #procedure
    y1 =b+ a*y + e1
    y2 =d+ c*y + e2
    return y1,y2,y,nobs

y1n,y2n,y,n = gen_data()


print(y1n.mean())
print(y2n.mean())
print(y1n.var())
print(y2n.var())

1.0216069743632954
1.0490929997261356
4.985844796724721
5.070743871632283


In [3]:
def estimate(y1n,y2n):
    b = y1n.mean()
    d = y2n.mean()
    a = np.sqrt(y1n.var() - 1)
    c = np.sqrt(y2n.var() - 1)
    sign = ((y1n*y2n).mean() - b*d) > 0
    if not sign:
        c = -1*c
    return a,b,c,d

for i in [(3,1,2,4),(2,4,-5,1),(2,4,5,1)]:
    y1n,y2n,y,n = gen_data(a=i[0],b=i[1],c=i[2],d=i[3])
    print(estimate(y1n,y2n))

(2.959226063641094, 0.987905279907063, 2.0155242024008815, 4.112721500121178)
(2.0830499329880037, 3.9651710172767194, -5.099327161763274, 1.0656742941338258)
(1.9829306091374255, 3.9782599475927016, 4.877202324856432, 1.082979041684761)


In [4]:
#impute value of y*
##compare just averaging y1+y2 with this.....

def conditional_exp1(x,y,a,b,c,d):
    """numerically integrate things"""
    grid_size = 1000
    z_range = 100
    z_grid = np.linspace(-z_range/2,
                z_range/2, grid_size)

    method1 = (z_range/grid_size)*z_grid*norm.pdf(y-(a*z_grid+b))*norm.pdf(x-(c*z_grid+d))
    method1 = method1.sum()
    return method1


def conditional_exp2(x,y,a,b,c,d):
    """i got this from wolfram"""
    method2 = np.exp(-(a*(x-d) - c*(y-b))**2/(2*(a**2+c**2)))*(a*(y-b) + c*(x-d))/(np.sqrt(2*np.pi)*(a**2+c**2)**(3/2))
    return  method2
    
    
for j in ([0,0],[0,1],[1,0],[1,1],[2,3],[5,7]):
    a,b,c,d = 5,1,4,1
    print(conditional_exp1(j[0],j[1],a,b,c,d),conditional_exp2(j[0],j[1],a,b,c,d))

-0.013497272132567866 -0.013510782915483343
-0.004476647577573008 -0.004481128706279284
-0.006244956054242419 -0.006251207261503918
-3.2526065174565133e-19 0.0
0.019044138757173486 0.019063201959132615
0.057453595699030253 0.05751110680583605


In [8]:
def estimate_predict(y1n,y2n):
    a,b,c,d = estimate(y1n,y2n)
    pred = conditional_exp2(y1n,y2n,a,b,c,d)
    return pred 

def mse(x,y):
    return ((x-y)**2).mean()

y1n,y2n,y,n = gen_data()
y_pred = estimate_predict(y1n,y2n)
result = np.concatenate( (y,(y1n+y2n)/2,  estimate_predict(y1n,y2n) ) ,axis=1)
print(result[0:10,:])
print(mse(y,(y1n+y2n)/2),mse(y,estimate_predict(y1n,y2n)))

[[-5.01799276e-01  4.03171040e-01 -3.99035230e-02]
 [-9.52986678e-01  1.00351083e-01 -6.67711653e-02]
 [ 3.55763540e-01  9.10646201e-01 -8.17787474e-03]
 [ 8.08161367e-02  9.97538905e-01 -1.29937647e-03]
 [-6.35764919e-01 -2.43286895e-01 -4.39480994e-02]
 [ 6.75017704e-01  2.97211971e+00  7.17646143e-02]
 [-1.38450814e+00 -1.69060615e+00 -1.69648551e-01]
 [-3.24740877e-01  5.18356894e-01 -3.04788713e-02]
 [-4.15933546e-01 -1.11361021e+00 -6.75378094e-02]
 [ 5.33670695e-01  1.54283488e+00  3.19400256e-02]]
2.5275528370991003 0.7756529348860458
